From 6ff3185b5281aa074a6d66757c6b81bae8569816 Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Mon, 5 Oct 2015 02:28:32 -0700
Subject: [PATCH 1/7] Added criteria based exemption to db.ignore.external

---
 conf/db-ignore-external-exemptions.txt             |  17 ++
 conf/nutch-default.xml                             |  22 ++-
 .../apache/nutch/exempt/ExemptionUrlFilter.java    | 176 +++++++++++++++++++++
 src/java/org/apache/nutch/exempt/package-info.java |   6 +
 .../org/apache/nutch/parse/ParseOutputFormat.java  |  10 +-
 5 files changed, 229 insertions(+), 2 deletions(-)
 create mode 100644 conf/db-ignore-external-exemptions.txt
 create mode 100644 src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java
 create mode 100644 src/java/org/apache/nutch/exempt/package-info.java

diff --git a/conf/db-ignore-external-exemptions.txt b/conf/db-ignore-external-exemptions.txt
new file mode 100644
index 0000000..8f6b7b2
--- /dev/null
+++ b/conf/db-ignore-external-exemptions.txt
@@ -0,0 +1,17 @@
+# Exceptions to db ignore
+#
+# Format :
+#--------
+# MimeType=UrlRegex
+# The first occurrence of '=' separates the comma-separated mime-types from the url regex.
+# When the url matches the regex and its content type is in the specified list, it is treated as an exempted URL.
+
+
+# Example :
+#----------
+# To ingore except urls ending with .jpg or .png and has content type image/jpeg or image/png
+image/jpeg,image/png=.*\.jpg$|.*\.png$
+
+# To accept all urls ending with gif, without looking for mimetypes.
+#Note : Mimes are empty => accept any mimetype
+=.*\.gif$
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index bf1189a..bdc16d1 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -554,7 +554,27 @@
   </description>
 </property>
 
- <property>
+<property>
+  <name>db.ignore.external.exemptions</name>
+  <value>false</value>
+  <description>
+    If true, external resources that match the rules specified in
+    'db.ignore.external.exemptions.file' will not be ignored.
+    This setting overrides 'db.ignore.external.links' for
+    the resources that match the exemption rules.
+  </description>
+</property>
+
+<property>
+  <name>db.ignore.external.exemptions.file</name>
+  <value>db-ignore-external-exemptions.txt</value>
+  <description>
+    This file contains the exemption rules for 'db.ignore.external.links'.
+    To activate them, 'db.ignore.external.exemptions' must be set to true.
+  </description>
+</property>
+
+<property>
   <name>db.injector.overwrite</name>
   <value>false</value>
   <description>Whether existing records in the CrawlDB will be overwritten
diff --git a/src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java b/src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java
new file mode 100644
index 0000000..9683d19
--- /dev/null
+++ b/src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java
@@ -0,0 +1,176 @@
+package org.apache.nutch.exempt;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.http.Header;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpHead;
+import org.apache.http.entity.ContentType;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.nutch.crawl.LinkDb;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.*;
+import java.util.regex.Pattern;
+
+/**
+ * This implementation of {@link URLFilter} checks if URL is eligible for db ignore exemption.
+ * When this filter is enabled, urls will be checked against configured set of regex and mimetype rules.
+ * @author Thamme Gowda
+ * @since October 5, 2015
+ * @version 1
+ * @see org.apache.nutch.net.URLFilter
+ */
+public class ExemptionUrlFilter implements URLFilter {
+
+  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE = "db.ignore.external.exemptions.file";
+  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS = "db.ignore.external.exemptions";
+  private static final Logger LOG = LoggerFactory.getLogger(ExemptionUrlFilter.class);
+  private static ExemptionUrlFilter INSTANCE;
+
+  private LinkedHashMap<Pattern, Set<String>> exemptions; //preserves  insertion order
+  private Configuration conf;
+  private HttpClientBuilder clientBuilder;
+  private boolean enabled;
+
+  public static ExemptionUrlFilter getInstance() {
+    if(INSTANCE == null) {
+      synchronized (ExemptionUrlFilter.class) {
+        if (INSTANCE == null) {
+          INSTANCE = new ExemptionUrlFilter();
+          INSTANCE.setConf(NutchConfiguration.create());
+        }
+      }
+    }
+    return INSTANCE;
+  }
+
+  public boolean isEnabled() {
+    return enabled;
+  }
+
+  public LinkedHashMap<Pattern, Set<String>> getExemptions() {
+    return exemptions;
+  }
+
+  private String getContentType(String urlString){
+    //FIXME : Do this in nutch way, use Fetcher Queues and Protocols
+    CloseableHttpClient client = null;
+    CloseableHttpResponse response = null;
+    try {
+      client = clientBuilder.build();
+      HttpHead httpHead = new HttpHead(urlString);
+      response = client.execute(httpHead);
+      Header cTypeHeader = response.getFirstHeader(HttpHeaders.CONTENT_TYPE);
+      if (cTypeHeader != null) {
+        ContentType contentType = ContentType.parse(cTypeHeader.getValue());
+        LOG.debug("{} MimeType={}", urlString, contentType.getMimeType());
+        return contentType.getMimeType();
+      }
+    } catch (Exception e) {
+      LOG.debug("{} while trying to HTTP HEAD on {}", e.getMessage(), urlString);
+    } finally {
+        IOUtils.closeQuietly(response);
+        IOUtils.closeQuietly(client);
+    }
+    // couldn't get mime type
+    return null;
+  }
+
+  @Override
+  public String filter(String urlString) {
+    if (exemptions != null) {
+      for (Pattern pattern : exemptions.keySet()) {
+        if (pattern.matcher(urlString).matches()) {
+          Set<String> mimes = exemptions.get(pattern);
+          if (mimes.isEmpty() || mimes.contains(getContentType(urlString))) {
+            //when mimes are empty, it means don't care
+            //when mimes are specified, perform match
+            return urlString;
+          }
+        }
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    boolean ignoreExternal = conf.getBoolean(LinkDb.IGNORE_EXTERNAL_LINKS, false);
+    boolean ignoreExternalExemptions = conf.getBoolean(DB_IGNORE_EXTERNAL_EXEMPTIONS, false);
+    this.enabled =  ignoreExternal && ignoreExternalExemptions;
+    if (!enabled) {
+      LOG.info("DB Ignore Exemptions are not Enabled. To enable, set '{}' and '{}' to 'true'",
+          LinkDb.IGNORE_EXTERNAL_LINKS, DB_IGNORE_EXTERNAL_EXEMPTIONS);
+    } else{
+      LOG.info("Ignore exceptions enabled");
+      RequestConfig requestConfig = RequestConfig.custom()
+          .setConnectTimeout(conf.getInt("http.timeout", 10 * 1000))
+          .build();
+      this.clientBuilder = HttpClientBuilder.create()
+          .setDefaultRequestConfig(requestConfig)
+          .setUserAgent(conf.get("http.agent.name"));
+
+      String fileName = this.conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
+      InputStream stream = this.conf.getConfResourceAsInputStream(fileName);
+      if (stream == null) {
+        throw new RuntimeException("Couldn't find config file :" + fileName);
+      }
+      try {
+        this.exemptions = new LinkedHashMap<Pattern, Set<String>>();
+        List<String> lines = IOUtils.readLines(stream);
+        for (String line : lines) {
+          line = line.trim();
+          if (line.startsWith("#") || line.isEmpty()) {
+            continue; //Skip : comment line or empty line
+          }
+          int firstIndex = line.indexOf('=');
+          if (firstIndex == -1) {
+            // No Split! Invalid
+            LOG.error("{} : Invalid Config  :: {}", fileName, line);
+            continue;
+          }
+          String mimeString = line.substring(0, firstIndex).trim();
+          String regex = line.substring(firstIndex + 1, line.length()).trim();
+          if (regex.isEmpty()) {
+            LOG.error("{} : Invalid Config  :: {}", fileName, line);
+            continue;
+          }
+          //NOTE:empty mime string means don't care => */*
+
+          HashSet<String> mimes = new HashSet<String>(Arrays.asList(mimeString.split(",")));
+          Pattern compiled = Pattern.compile(regex);
+          LOG.info("URL rule :: {} <=> {}", regex, mimes);
+          exemptions.put(compiled, mimes);
+        }
+        LOG.info("Read {} rules from {}", exemptions.size(), fileName);
+      } catch (IOException e) {
+        throw new IllegalStateException(e);
+      }
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      System.out.println("Error: Invalid Args");
+      System.out.println("Usage:" + ExemptionUrlFilter.class.getName() + " <url>");
+      return;
+    }
+    String url = args[0];
+    System.out.println(ExemptionUrlFilter.getInstance().filter(url) != null);
+  }
+}
diff --git a/src/java/org/apache/nutch/exempt/package-info.java b/src/java/org/apache/nutch/exempt/package-info.java
new file mode 100644
index 0000000..bba5f3c
--- /dev/null
+++ b/src/java/org/apache/nutch/exempt/package-info.java
@@ -0,0 +1,6 @@
+/**
+ *
+ * FIXME: Move this package to plugin.
+ * This package contains exemption filter
+ */
+package org.apache.nutch.exempt;
\ No newline at end of file
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index ccaf8b5..21c7bcc 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -18,6 +18,7 @@
 package org.apache.nutch.parse;
 
 // Commons Logging imports
+import org.apache.nutch.exempt.ExemptionUrlFilter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.io.*;
@@ -331,7 +332,14 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
         toHost = null;
       }
       if (toHost == null || !toHost.equals(fromHost)) { // external links
-        return null; // skip it
+        if (ExemptionUrlFilter.getInstance().isEnabled()
+            && ExemptionUrlFilter.getInstance().filter(toUrl) != null) {
+          // This url is exempted.
+          LOG.info("External Link is exempted from ignore :: {}",  toUrl);
+        } else {
+          LOG.info("External Link will be Skipped :: {}", toUrl);
+          return null; // skip it
+        }
       }
     }
     try {
-- 
2.1.4


From 16218fecf4435f9a4f09c8830b770a5710080298 Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Fri, 9 Oct 2015 18:31:27 -0700
Subject: [PATCH 2/7] External Link Ignore Exemption Filter plugin added

1. Exemption check extension point is defined
2. Plugin source code and config added
3. Build configs updated to include newly added plugin
4. Extension point is integrated with existing pipeline
5. Nutch config updated to include plugin's config properties
---
 build.xml                                          |   4 +
 conf/db-ignore-external-exemptions.txt             |   2 +-
 .../apache/nutch/exempt/ExemptionUrlFilter.java    | 176 ---------------
 src/java/org/apache/nutch/exempt/package-info.java |   6 -
 .../org/apache/nutch/fetcher/FetcherThread.java    |   3 +
 .../org/apache/nutch/net/URLExemptionCheckers.java |  42 ++++
 .../org/apache/nutch/net/URLExemptionFilter.java   |  43 ++++
 .../org/apache/nutch/parse/ParseOutputFormat.java  |  21 +-
 .../org/apache/nutch/plugin/ExtensionPoint.java    |   2 +-
 src/plugin/build.xml                               |   3 +
 src/plugin/urlfilter-ignoreexempt/build.xml        |  28 +++
 src/plugin/urlfilter-ignoreexempt/data/hosts.txt   |   5 +
 src/plugin/urlfilter-ignoreexempt/ivy.xml          |  41 ++++
 src/plugin/urlfilter-ignoreexempt/plugin.xml       |  43 ++++
 .../urlfilter/ignoreexempt/ExemptionUrlFilter.java | 244 +++++++++++++++++++++
 .../nutch/urlfilter/ignoreexempt/package-info.java |  24 ++
 .../urlfilter/domain/TestDomainURLFilter.java      |  48 ++++
 17 files changed, 541 insertions(+), 194 deletions(-)
 delete mode 100644 src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java
 delete mode 100644 src/java/org/apache/nutch/exempt/package-info.java
 create mode 100644 src/java/org/apache/nutch/net/URLExemptionCheckers.java
 create mode 100644 src/java/org/apache/nutch/net/URLExemptionFilter.java
 create mode 100644 src/plugin/urlfilter-ignoreexempt/build.xml
 create mode 100644 src/plugin/urlfilter-ignoreexempt/data/hosts.txt
 create mode 100644 src/plugin/urlfilter-ignoreexempt/ivy.xml
 create mode 100644 src/plugin/urlfilter-ignoreexempt/plugin.xml
 create mode 100644 src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
 create mode 100644 src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
 create mode 100644 src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java

diff --git a/build.xml b/build.xml
index 713e2b5..b79f79c 100644
--- a/build.xml
+++ b/build.xml
@@ -213,6 +213,7 @@
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/urlmeta/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/>
@@ -623,6 +624,7 @@
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-suffix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-validator/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/urlmeta/src/java"/>
       <packageset dir="${plugins.dir}/urlnormalizer-basic/src/java"/>
@@ -1040,6 +1042,8 @@
         <source path="${plugins.dir}/urlfilter-suffix/src/test/" />
         <source path="${plugins.dir}/urlfilter-validator/src/java/" />
         <source path="${plugins.dir}/urlfilter-validator/src/test/" />
+        <source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" />
+        <source path="${plugins.dir}/urlfilter-ignoreexempt/src/test/" />
         <source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
         <source path="${plugins.dir}/urlmeta/src/java/" />
         <source path="${plugins.dir}/urlnormalizer-basic/src/java/" />
diff --git a/conf/db-ignore-external-exemptions.txt b/conf/db-ignore-external-exemptions.txt
index 8f6b7b2..fe22b9c 100644
--- a/conf/db-ignore-external-exemptions.txt
+++ b/conf/db-ignore-external-exemptions.txt
@@ -10,7 +10,7 @@
 # Example :
 #----------
 # To ingore except urls ending with .jpg or .png and has content type image/jpeg or image/png
-image/jpeg,image/png=.*\.jpg$|.*\.png$
+image/jpeg,image/png=.*\.jpg$|.*\.JPG$|.*\.png$|.*\.PNG$
 
 # To accept all urls ending with gif, without looking for mimetypes.
 #Note : Mimes are empty => accept any mimetype
diff --git a/src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java b/src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java
deleted file mode 100644
index 9683d19..0000000
--- a/src/java/org/apache/nutch/exempt/ExemptionUrlFilter.java
+++ /dev/null
@@ -1,176 +0,0 @@
-package org.apache.nutch.exempt;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.http.Header;
-import org.apache.http.client.config.RequestConfig;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpHead;
-import org.apache.http.entity.ContentType;
-import org.apache.http.impl.client.CloseableHttpClient;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.nutch.crawl.LinkDb;
-import org.apache.nutch.metadata.HttpHeaders;
-import org.apache.nutch.net.URLFilter;
-import org.apache.nutch.util.NutchConfiguration;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.*;
-import java.util.regex.Pattern;
-
-/**
- * This implementation of {@link URLFilter} checks if URL is eligible for db ignore exemption.
- * When this filter is enabled, urls will be checked against configured set of regex and mimetype rules.
- * @author Thamme Gowda
- * @since October 5, 2015
- * @version 1
- * @see org.apache.nutch.net.URLFilter
- */
-public class ExemptionUrlFilter implements URLFilter {
-
-  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE = "db.ignore.external.exemptions.file";
-  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS = "db.ignore.external.exemptions";
-  private static final Logger LOG = LoggerFactory.getLogger(ExemptionUrlFilter.class);
-  private static ExemptionUrlFilter INSTANCE;
-
-  private LinkedHashMap<Pattern, Set<String>> exemptions; //preserves  insertion order
-  private Configuration conf;
-  private HttpClientBuilder clientBuilder;
-  private boolean enabled;
-
-  public static ExemptionUrlFilter getInstance() {
-    if(INSTANCE == null) {
-      synchronized (ExemptionUrlFilter.class) {
-        if (INSTANCE == null) {
-          INSTANCE = new ExemptionUrlFilter();
-          INSTANCE.setConf(NutchConfiguration.create());
-        }
-      }
-    }
-    return INSTANCE;
-  }
-
-  public boolean isEnabled() {
-    return enabled;
-  }
-
-  public LinkedHashMap<Pattern, Set<String>> getExemptions() {
-    return exemptions;
-  }
-
-  private String getContentType(String urlString){
-    //FIXME : Do this in nutch way, use Fetcher Queues and Protocols
-    CloseableHttpClient client = null;
-    CloseableHttpResponse response = null;
-    try {
-      client = clientBuilder.build();
-      HttpHead httpHead = new HttpHead(urlString);
-      response = client.execute(httpHead);
-      Header cTypeHeader = response.getFirstHeader(HttpHeaders.CONTENT_TYPE);
-      if (cTypeHeader != null) {
-        ContentType contentType = ContentType.parse(cTypeHeader.getValue());
-        LOG.debug("{} MimeType={}", urlString, contentType.getMimeType());
-        return contentType.getMimeType();
-      }
-    } catch (Exception e) {
-      LOG.debug("{} while trying to HTTP HEAD on {}", e.getMessage(), urlString);
-    } finally {
-        IOUtils.closeQuietly(response);
-        IOUtils.closeQuietly(client);
-    }
-    // couldn't get mime type
-    return null;
-  }
-
-  @Override
-  public String filter(String urlString) {
-    if (exemptions != null) {
-      for (Pattern pattern : exemptions.keySet()) {
-        if (pattern.matcher(urlString).matches()) {
-          Set<String> mimes = exemptions.get(pattern);
-          if (mimes.isEmpty() || mimes.contains(getContentType(urlString))) {
-            //when mimes are empty, it means don't care
-            //when mimes are specified, perform match
-            return urlString;
-          }
-        }
-      }
-    }
-    return null;
-  }
-
-  @Override
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    boolean ignoreExternal = conf.getBoolean(LinkDb.IGNORE_EXTERNAL_LINKS, false);
-    boolean ignoreExternalExemptions = conf.getBoolean(DB_IGNORE_EXTERNAL_EXEMPTIONS, false);
-    this.enabled =  ignoreExternal && ignoreExternalExemptions;
-    if (!enabled) {
-      LOG.info("DB Ignore Exemptions are not Enabled. To enable, set '{}' and '{}' to 'true'",
-          LinkDb.IGNORE_EXTERNAL_LINKS, DB_IGNORE_EXTERNAL_EXEMPTIONS);
-    } else{
-      LOG.info("Ignore exceptions enabled");
-      RequestConfig requestConfig = RequestConfig.custom()
-          .setConnectTimeout(conf.getInt("http.timeout", 10 * 1000))
-          .build();
-      this.clientBuilder = HttpClientBuilder.create()
-          .setDefaultRequestConfig(requestConfig)
-          .setUserAgent(conf.get("http.agent.name"));
-
-      String fileName = this.conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
-      InputStream stream = this.conf.getConfResourceAsInputStream(fileName);
-      if (stream == null) {
-        throw new RuntimeException("Couldn't find config file :" + fileName);
-      }
-      try {
-        this.exemptions = new LinkedHashMap<Pattern, Set<String>>();
-        List<String> lines = IOUtils.readLines(stream);
-        for (String line : lines) {
-          line = line.trim();
-          if (line.startsWith("#") || line.isEmpty()) {
-            continue; //Skip : comment line or empty line
-          }
-          int firstIndex = line.indexOf('=');
-          if (firstIndex == -1) {
-            // No Split! Invalid
-            LOG.error("{} : Invalid Config  :: {}", fileName, line);
-            continue;
-          }
-          String mimeString = line.substring(0, firstIndex).trim();
-          String regex = line.substring(firstIndex + 1, line.length()).trim();
-          if (regex.isEmpty()) {
-            LOG.error("{} : Invalid Config  :: {}", fileName, line);
-            continue;
-          }
-          //NOTE:empty mime string means don't care => */*
-
-          HashSet<String> mimes = new HashSet<String>(Arrays.asList(mimeString.split(",")));
-          Pattern compiled = Pattern.compile(regex);
-          LOG.info("URL rule :: {} <=> {}", regex, mimes);
-          exemptions.put(compiled, mimes);
-        }
-        LOG.info("Read {} rules from {}", exemptions.size(), fileName);
-      } catch (IOException e) {
-        throw new IllegalStateException(e);
-      }
-    }
-  }
-
-  @Override
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  public static void main(String[] args) {
-    if (args.length != 1) {
-      System.out.println("Error: Invalid Args");
-      System.out.println("Usage:" + ExemptionUrlFilter.class.getName() + " <url>");
-      return;
-    }
-    String url = args[0];
-    System.out.println(ExemptionUrlFilter.getInstance().filter(url) != null);
-  }
-}
diff --git a/src/java/org/apache/nutch/exempt/package-info.java b/src/java/org/apache/nutch/exempt/package-info.java
deleted file mode 100644
index bba5f3c..0000000
--- a/src/java/org/apache/nutch/exempt/package-info.java
+++ /dev/null
@@ -1,6 +0,0 @@
-/**
- *
- * FIXME: Move this package to plugin.
- * This package contains exemption filter
- */
-package org.apache.nutch.exempt;
\ No newline at end of file
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 9a482b9..1b83b56 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -37,6 +37,7 @@ import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLExemptionCheckers;
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
@@ -74,6 +75,7 @@ public class FetcherThread extends Thread {
 
   private Configuration conf;
   private URLFilters urlFilters;
+  private URLExemptionCheckers urlExemptionCheckers;
   private ScoringFilters scfilters;
   private ParseUtil parseUtil;
   private URLNormalizers normalizers;
@@ -137,6 +139,7 @@ public class FetcherThread extends Thread {
     this.setName("FetcherThread"); // use an informative name
     this.conf = conf;
     this.urlFilters = new URLFilters(conf);
+    this.urlExemptionCheckers = new URLExemptionCheckers(conf);
     this.scfilters = new ScoringFilters(conf);
     this.parseUtil = new ParseUtil(conf);
     this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
diff --git a/src/java/org/apache/nutch/net/URLExemptionCheckers.java b/src/java/org/apache/nutch/net/URLExemptionCheckers.java
new file mode 100644
index 0000000..f0593f9
--- /dev/null
+++ b/src/java/org/apache/nutch/net/URLExemptionCheckers.java
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.PluginRepository;
+
+/** Loads the {@link URLExemptionFilter} plugins and asks them whether an external outlink is exempted. */
+public class URLExemptionCheckers {
+  /** Configuration key handed to getOrderedPlugins to order the loaded filters. */
+  public static final String URLFILTER_ORDER = "urlfilter.order";
+  private final URLExemptionFilter[] filters;
+
+  public URLExemptionCheckers(Configuration conf) {
+    this.filters = (URLExemptionFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+        URLExemptionFilter.class, URLExemptionFilter.X_POINT_ID, URLFILTER_ORDER);
+  }
+
+  /** Returns true only when at least one filter is loaded and every filter exempts toUrl (logical AND). */
+  public boolean isExempted(String fromUrl, String toUrl) {
+    // Without any filter nothing may be exempted, otherwise every external link would pass; null URLs never are.
+    boolean exempted = this.filters.length > 0 && fromUrl != null && toUrl != null;
+    for (int i = 0; i < this.filters.length && exempted; i++) {
+      exempted = this.filters[i].filter(fromUrl, toUrl);
+    }
+    return exempted;
+  }
+}
diff --git a/src/java/org/apache/nutch/net/URLExemptionFilter.java b/src/java/org/apache/nutch/net/URLExemptionFilter.java
new file mode 100644
index 0000000..639f2ad
--- /dev/null
+++ b/src/java/org/apache/nutch/net/URLExemptionFilter.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+//Hadoop
+import org.apache.hadoop.conf.Configurable;
+// Nutch
+import org.apache.nutch.plugin.Pluggable;
+
+/**
+ * Extension point for plugins that exempt selected external-domain URLs from db.ignore.external.links.
+ * Implementations are invoked by URLExemptionCheckers, which ParseOutputFormat consults for external outlinks.
+ */
+
+public interface URLExemptionFilter extends Pluggable, Configurable{
+
+  /** The name of the extension point. */
+  public final static String X_POINT_ID = URLExemptionFilter.class.getName();
+
+  /**
+   * Checks whether toUrl is exempted while ignoring of external links is enabled.
+   * @param fromUrl the source url which generated the outlink
+   * @param toUrl the destination url which needs to be checked for exemption
+   * @return true when toUrl is exempted from db.ignore.external.links
+   */
+  public boolean filter(String fromUrl, String toUrl);
+
+}
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index 21c7bcc..cf97579 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -18,7 +18,6 @@
 package org.apache.nutch.parse;
 
 // Commons Logging imports
-import org.apache.nutch.exempt.ExemptionUrlFilter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.io.*;
@@ -52,6 +51,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
       .getLogger(ParseOutputFormat.class);
 
   private URLFilters filters;
+  private URLExemptionCheckers exemptionFilters;
   private URLNormalizers normalizers;
   private ScoringFilters scfilters;
 
@@ -95,6 +95,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
 
     if (job.getBoolean("parse.filter.urls", true)) {
       filters = new URLFilters(job);
+      exemptionFilters = new URLExemptionCheckers(job);
     }
 
     if (job.getBoolean("parse.normalize.urls", true)) {
@@ -202,8 +203,8 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
           String newUrl = pstatus.getMessage();
           int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
           newUrl = filterNormalize(fromUrl, newUrl, fromHost,
-              ignoreExternalLinks, filters, normalizers,
-              URLNormalizers.SCOPE_FETCHER);
+              ignoreExternalLinks, filters, exemptionFilters,
+              normalizers, URLNormalizers.SCOPE_FETCHER);
 
           if (newUrl != null) {
             String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl,
@@ -233,7 +234,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
           // Only normalize and filter if fetcher.parse = false
           if (!isParsing) {
             toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
-                ignoreExternalLinks, filters, normalizers);
+                ignoreExternalLinks, filters, exemptionFilters, normalizers);
             if (toUrl == null) {
               continue;
             }
@@ -312,14 +313,15 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
 
   public static String filterNormalize(String fromUrl, String toUrl,
       String fromHost, boolean ignoreExternalLinks, URLFilters filters,
-      URLNormalizers normalizers) {
+      URLExemptionCheckers exemptionCheckers, URLNormalizers normalizers) {
     return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
-        filters, normalizers, URLNormalizers.SCOPE_OUTLINK);
+        filters, exemptionCheckers, normalizers, URLNormalizers.SCOPE_OUTLINK);
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
       String fromHost, boolean ignoreExternalLinks, URLFilters filters,
-      URLNormalizers normalizers, String urlNormalizerScope) {
+      URLExemptionCheckers exemptionCheckers, URLNormalizers normalizers,
+                                       String urlNormalizerScope) {
     // ignore links to self (or anchors within the page)
     if (fromUrl.equals(toUrl)) {
       return null;
@@ -332,12 +334,11 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
         toHost = null;
       }
       if (toHost == null || !toHost.equals(fromHost)) { // external links
-        if (ExemptionUrlFilter.getInstance().isEnabled()
-            && ExemptionUrlFilter.getInstance().filter(toUrl) != null) {
+        if ( exemptionCheckers != null
+            && exemptionCheckers.isExempted(fromUrl, toUrl)) {
           // This url is exempted.
           LOG.info("External Link is exempted from ignore :: {}",  toUrl);
         } else {
-          LOG.info("External Link will be Skipped :: {}", toUrl);
           return null; // skip it
         }
       }
diff --git a/src/java/org/apache/nutch/plugin/ExtensionPoint.java b/src/java/org/apache/nutch/plugin/ExtensionPoint.java
index 1200e4f..178c5a2 100644
--- a/src/java/org/apache/nutch/plugin/ExtensionPoint.java
+++ b/src/java/org/apache/nutch/plugin/ExtensionPoint.java
@@ -36,7 +36,7 @@ public class ExtensionPoint {
    * @param pId
    *          unique extension point Id
    * @param pName
-   *          name of the extension poin
+   *          name of the extension point
    * @param pSchema
    *          xml schema of the extension point
    */
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 056e817..6d5bd91 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -75,6 +75,7 @@
      <ant dir="urlfilter-regex" target="deploy"/>
      <ant dir="urlfilter-suffix" target="deploy"/>
      <ant dir="urlfilter-validator" target="deploy"/>
+     <ant dir="urlfilter-ignoreexempt" target="deploy"/>
      <ant dir="parsefilter-naivebayes" target="deploy"/>
      <ant dir="urlmeta" target="deploy"/>
      <ant dir="urlnormalizer-ajax" target="deploy"/>
@@ -119,6 +120,7 @@
      <ant dir="urlfilter-regex" target="test"/>
      <ant dir="urlfilter-suffix" target="test"/>
      <ant dir="urlfilter-validator" target="test"/>
+     <ant dir="urlfilter-ignoreexempt" target="test"/>
      <ant dir="urlnormalizer-ajax" target="test"/>
      <ant dir="urlnormalizer-basic" target="test"/>
      <ant dir="urlnormalizer-host" target="test"/>
@@ -184,6 +186,7 @@
     <ant dir="urlfilter-regex" target="clean"/>
     <ant dir="urlfilter-suffix" target="clean"/>
     <ant dir="urlfilter-validator" target="clean"/>
+    <ant dir="urlfilter-ignoreexempt" target="clean"/>
     <ant dir="parsefilter-naivebayes" target="clean" />
     <ant dir="urlmeta" target="clean"/>
     <ant dir="urlnormalizer-ajax" target="clean"/>
diff --git a/src/plugin/urlfilter-ignoreexempt/build.xml b/src/plugin/urlfilter-ignoreexempt/build.xml
new file mode 100644
index 0000000..e9d3c81
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-ignoreexempt" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>
diff --git a/src/plugin/urlfilter-ignoreexempt/data/hosts.txt b/src/plugin/urlfilter-ignoreexempt/data/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/data/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file
diff --git a/src/plugin/urlfilter-ignoreexempt/ivy.xml b/src/plugin/urlfilter-ignoreexempt/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/urlfilter-ignoreexempt/plugin.xml b/src/plugin/urlfilter-ignoreexempt/plugin.xml
new file mode 100644
index 0000000..816a9b9
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-ignoreexempt"
+   name="External Domain Ignore Exemption"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-ignoreexempt.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.ignoreexempt"
+              name="Ignore Exemption Url Filter"
+              point="org.apache.nutch.net.URLExemptionFilter">
+      <implementation id="ExemptionUrlFilter"
+        class="org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter">
+        <parameter name="file" value="db-ignore-external-exemptions.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>
diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
new file mode 100644
index 0000000..cdd5113
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.http.Header;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.methods.CloseableHttpResponse;
+import org.apache.http.client.methods.HttpHead;
+import org.apache.http.entity.ContentType;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.nutch.crawl.LinkDb;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.*;
+import java.util.regex.Pattern;
+
+
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. Only a url that matches one of the suffixes, domains, or hosts
+ * present in the file is allowed.
+ * </p>
+ *
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ *
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ *
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overridding the more specific.
+ * </p>
+ *
+ * The domain file defaults to domain-urlfilter.txt in the classpath but can be
+ * overridden using the:
+ *
+ * <ul>
+ * <ol>
+ * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * attribute "file" in plugin.xml of this plugin
+ * </ol>
+ * </ul>
+ *
+ * the attribute "file" has higher precedence if defined.
+ */
+/**
+ * This implementation of {@link URLFilter} checks if URL is eligible for db ignore exemption.
+ * When this filter is enabled, urls will be checked against configured set of regex and mimetype rules.
+ *
+ * This plugin needs to be enabled by setting 'db.ignore.external.exemptions' to <code>true</code>
+ *
+ * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
+ * overridden using the:
+ *
+ * <ul>
+ * <ol>
+ * property "db.ignore.external.exemptions.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * @author Thamme Gowda
+ * @since October 5, 2015
+ * @version 1
+ * @see URLFilter
+ */
+public class ExemptionUrlFilter implements URLFilter {
+
+  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE = "db.ignore.external.exemptions.file";
+  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS = "db.ignore.external.exemptions";
+  private static final Logger LOG = LoggerFactory.getLogger(ExemptionUrlFilter.class);
+  private static ExemptionUrlFilter INSTANCE;
+
+  private LinkedHashMap<Pattern, Set<String>> exemptions; //preserves  insertion order
+  private Configuration conf;
+  private HttpClientBuilder clientBuilder;
+  private boolean enabled;
+
+  public static ExemptionUrlFilter getInstance() {
+    if(INSTANCE == null) {
+      synchronized (ExemptionUrlFilter.class) {
+        if (INSTANCE == null) {
+          INSTANCE = new ExemptionUrlFilter();
+          INSTANCE.setConf(NutchConfiguration.create());
+        }
+      }
+    }
+    return INSTANCE;
+  }
+
+  public boolean isEnabled() {
+    return enabled;
+  }
+
+  public LinkedHashMap<Pattern, Set<String>> getExemptions() {
+    return exemptions;
+  }
+
+  private String getContentType(String urlString){
+    //FIXME : Do this in nutch way, use Fetcher Queues and Protocols
+    CloseableHttpClient client = null;
+    CloseableHttpResponse response = null;
+    try {
+      client = clientBuilder.build();
+      HttpHead httpHead = new HttpHead(urlString);
+      response = client.execute(httpHead);
+      Header cTypeHeader = response.getFirstHeader(HttpHeaders.CONTENT_TYPE);
+      if (cTypeHeader != null) {
+        ContentType contentType = ContentType.parse(cTypeHeader.getValue());
+        LOG.debug("{} MimeType={}", urlString, contentType.getMimeType());
+        return contentType.getMimeType();
+      }
+    } catch (Exception e) {
+      LOG.debug("{} while trying to HTTP HEAD on {}", e.getMessage(), urlString);
+    } finally {
+        IOUtils.closeQuietly(response);
+        IOUtils.closeQuietly(client);
+    }
+    // couldn't get mime type
+    return null;
+  }
+
+  @Override
+  public String filter(String urlString) {
+    if (exemptions != null) {
+      for (Pattern pattern : exemptions.keySet()) {
+        if (pattern.matcher(urlString).matches()) {
+          Set<String> mimes = exemptions.get(pattern);
+          if (mimes.isEmpty() || mimes.contains(getContentType(urlString))) {
+            //when mimes are empty, it means don't care
+            //when mimes are specified, perform match
+            return urlString;
+          }
+        }
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    boolean ignoreExternal = conf.getBoolean(LinkDb.IGNORE_EXTERNAL_LINKS, false);
+    boolean ignoreExternalExemptions = conf.getBoolean(DB_IGNORE_EXTERNAL_EXEMPTIONS, false);
+    this.enabled =  ignoreExternal && ignoreExternalExemptions;
+    if (!enabled) {
+      LOG.info("DB Ignore Exemptions are not Enabled. To enable, set '{}' and '{}' to 'true'",
+          LinkDb.IGNORE_EXTERNAL_LINKS, DB_IGNORE_EXTERNAL_EXEMPTIONS);
+    } else{
+      LOG.info("Ignore exceptions enabled");
+      RequestConfig requestConfig = RequestConfig.custom()
+          .setConnectTimeout(conf.getInt("http.timeout", 10 * 1000))
+          .build();
+      this.clientBuilder = HttpClientBuilder.create()
+          .setDefaultRequestConfig(requestConfig)
+          .setUserAgent(conf.get("http.agent.name"));
+
+      String fileName = this.conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
+      InputStream stream = this.conf.getConfResourceAsInputStream(fileName);
+      if (stream == null) {
+        throw new RuntimeException("Couldn't find config file :" + fileName);
+      }
+      try {
+        this.exemptions = new LinkedHashMap<Pattern, Set<String>>();
+        List<String> lines = IOUtils.readLines(stream);
+        for (String line : lines) {
+          line = line.trim();
+          if (line.startsWith("#") || line.isEmpty()) {
+            continue; //Skip : comment line or empty line
+          }
+          int firstIndex = line.indexOf('=');
+          if (firstIndex == -1) {
+            // No Split! Invalid
+            LOG.error("{} : Invalid Config  :: {}", fileName, line);
+            continue;
+          }
+          String mimeString = line.substring(0, firstIndex).trim();
+          String regex = line.substring(firstIndex + 1, line.length()).trim();
+          if (regex.isEmpty()) {
+            LOG.error("{} : Invalid Config  :: {}", fileName, line);
+            continue;
+          }
+          //NOTE:empty mime string means don't care => */*
+
+          HashSet<String> mimes = new HashSet<String>(Arrays.asList(mimeString.split(",")));
+          Pattern compiled = Pattern.compile(regex);
+          LOG.info("URL rule :: {} <=> {}", regex, mimes);
+          exemptions.put(compiled, mimes);
+        }
+        LOG.info("Read {} rules from {}", exemptions.size(), fileName);
+      } catch (IOException e) {
+        throw new IllegalStateException(e);
+      }
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      System.out.println("Error: Invalid Args");
+      System.out.println("Usage:" + ExemptionUrlFilter.class.getName() + " <url>");
+      return;
+    }
+    String url = args[0];
+    System.out.println(ExemptionUrlFilter.getInstance().filter(url) != null);
+  }
+}
diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
new file mode 100644
index 0000000..ee949c5
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin which identifies exemptions to external urls
+ * when external urls are set to ignore.
+ *
+ */
+package org.apache.nutch.urlfilter.ignoreexempt;
+
diff --git a/src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
new file mode 100644
index 0000000..466fd78
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestDomainURLFilter {
+
+  private final static String SEPARATOR = System.getProperty("file.separator");
+  private final static String SAMPLES = System.getProperty("test.data", ".");
+
+  @Test
+  public void testFilter() throws Exception {
+
+    String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
+    Configuration conf = NutchConfiguration.create();
+    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+    domainFilter.setConf(conf);
+    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
+    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
+    Assert.assertNull(domainFilter.filter("http://www.google.com"));
+    Assert.assertNull(domainFilter.filter("http://mail.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
+    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
+    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
+    Assert.assertNull(domainFilter.filter("http://www.adobe.com"));
+  }
+
+}
-- 
2.1.4


From 02b69f12ac34de86119f1237a6e480f8f07848ea Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Fri, 9 Oct 2015 23:40:59 -0700
Subject: [PATCH 3/7] Code refactoring:

+ Docs improved
+ Class name is made proper
---
 .../org/apache/nutch/fetcher/FetcherThread.java    |  8 +--
 .../org/apache/nutch/net/URLExemptionChecker.java  | 41 ++++++++++++
 .../org/apache/nutch/net/URLExemptionCheckers.java | 42 ------------
 .../org/apache/nutch/net/URLExemptionFilter.java   |  4 +-
 .../org/apache/nutch/parse/ParseOutputFormat.java  | 16 ++---
 src/plugin/nutch-extensionpoints/plugin.xml        |  4 ++
 .../urlfilter/ignoreexempt/ExemptionUrlFilter.java | 77 +++++++++-------------
 7 files changed, 90 insertions(+), 102 deletions(-)
 create mode 100644 src/java/org/apache/nutch/net/URLExemptionChecker.java
 delete mode 100644 src/java/org/apache/nutch/net/URLExemptionCheckers.java

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 1b83b56..3fb468d 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -37,7 +37,7 @@ import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.net.URLExemptionCheckers;
+import org.apache.nutch.net.URLExemptionChecker;
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
@@ -75,7 +75,7 @@ public class FetcherThread extends Thread {
 
   private Configuration conf;
   private URLFilters urlFilters;
-  private URLExemptionCheckers urlExemptionCheckers;
+  private URLExemptionChecker urlExemptionChecker;
   private ScoringFilters scfilters;
   private ParseUtil parseUtil;
   private URLNormalizers normalizers;
@@ -139,7 +139,7 @@ public class FetcherThread extends Thread {
     this.setName("FetcherThread"); // use an informative name
     this.conf = conf;
     this.urlFilters = new URLFilters(conf);
-    this.urlExemptionCheckers = new URLExemptionCheckers(conf);
+    this.urlExemptionChecker = new URLExemptionChecker(conf);
     this.scfilters = new ScoringFilters(conf);
     this.parseUtil = new ParseUtil(conf);
     this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
@@ -648,7 +648,7 @@ public class FetcherThread extends Thread {
             String toUrl = links[i].getToUrl();
 
             toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl,
-                fromHost, ignoreExternalLinks, urlFilters, normalizers);
+                fromHost, ignoreExternalLinks, urlFilters, urlExemptionChecker, normalizers);
             if (toUrl == null) {
               continue;
             }
diff --git a/src/java/org/apache/nutch/net/URLExemptionChecker.java b/src/java/org/apache/nutch/net/URLExemptionChecker.java
new file mode 100644
index 0000000..2b2a1f6
--- /dev/null
+++ b/src/java/org/apache/nutch/net/URLExemptionChecker.java
@@ -0,0 +1,41 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.PluginRepository;
+
+/** Creates and caches {@link URLExemptionFilter} implementing plugins. */
+public class URLExemptionChecker {
+
+  private URLExemptionFilter[] filters;
+
+  public URLExemptionChecker(Configuration conf) {
+    this.filters = (URLExemptionFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+        URLExemptionFilter.class, URLExemptionFilter.X_POINT_ID, URLFilters.URLFILTER_ORDER);
+  }
+
+  /** Run all defined filters. Assume logical AND. */
+  public boolean isExempted(String fromUrl, String toUrl) {
+    boolean exempted = fromUrl != null && toUrl != null; //Initially assume the url is exempted
+    for (int i = 0; i < this.filters.length && exempted; i++) {
+      exempted = this.filters[i].filter(fromUrl, toUrl);
+    }
+    return exempted;
+  }
+}
diff --git a/src/java/org/apache/nutch/net/URLExemptionCheckers.java b/src/java/org/apache/nutch/net/URLExemptionCheckers.java
deleted file mode 100644
index f0593f9..0000000
--- a/src/java/org/apache/nutch/net/URLExemptionCheckers.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.net;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.plugin.PluginRepository;
-
-/** Creates and caches {@link URLExemptionFilter} implementing plugins. */
-public class URLExemptionCheckers {
-
-  public static final String URLFILTER_ORDER = "urlfilter.order";
-  private URLExemptionFilter[] filters;
-
-  public URLExemptionCheckers(Configuration conf) {
-    this.filters = (URLExemptionFilter[]) PluginRepository.get(conf).getOrderedPlugins(
-        URLExemptionFilter.class, URLExemptionFilter.X_POINT_ID, URLFILTER_ORDER);
-  }
-
-  /** Run all defined filters. Assume logical AND. */
-  public boolean isExempted(String fromUrl, String toUrl) {
-    boolean exempted = fromUrl != null && toUrl != null; //Initially assume the url is exempted
-    for (int i = 0; i < this.filters.length && exempted; i++) {
-      exempted = this.filters[i].filter(fromUrl, toUrl);
-    }
-    return exempted;
-  }
-}
diff --git a/src/java/org/apache/nutch/net/URLExemptionFilter.java b/src/java/org/apache/nutch/net/URLExemptionFilter.java
index 639f2ad..533141f 100644
--- a/src/java/org/apache/nutch/net/URLExemptionFilter.java
+++ b/src/java/org/apache/nutch/net/URLExemptionFilter.java
@@ -23,8 +23,8 @@ import org.apache.hadoop.conf.Configurable;
 import org.apache.nutch.plugin.Pluggable;
 
 /**
- * Interface used to allow exemptions to external domain resources by overriding db.ignore.external.links.
- * Used by the ParseOutput Generator
+ * Interface used to allow exemptions to external domain resources by overriding <code>db.ignore.external.links</code>.
+ * This is useful when the crawl is focussed to a domain but resources like images are hosted on CDN.
  */
 
 public interface URLExemptionFilter extends Pluggable, Configurable{
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index cf97579..1a7d891 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -51,7 +51,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
       .getLogger(ParseOutputFormat.class);
 
   private URLFilters filters;
-  private URLExemptionCheckers exemptionFilters;
+  private URLExemptionChecker exemptionChecker;
   private URLNormalizers normalizers;
   private ScoringFilters scfilters;
 
@@ -95,7 +95,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
 
     if (job.getBoolean("parse.filter.urls", true)) {
       filters = new URLFilters(job);
-      exemptionFilters = new URLExemptionCheckers(job);
+      exemptionChecker = new URLExemptionChecker(job);
     }
 
     if (job.getBoolean("parse.normalize.urls", true)) {
@@ -203,7 +203,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
           String newUrl = pstatus.getMessage();
           int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
           newUrl = filterNormalize(fromUrl, newUrl, fromHost,
-              ignoreExternalLinks, filters, exemptionFilters,
+              ignoreExternalLinks, filters, exemptionChecker,
               normalizers, URLNormalizers.SCOPE_FETCHER);
 
           if (newUrl != null) {
@@ -234,7 +234,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
           // Only normalize and filter if fetcher.parse = false
           if (!isParsing) {
             toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost,
-                ignoreExternalLinks, filters, exemptionFilters, normalizers);
+                ignoreExternalLinks, filters, exemptionChecker, normalizers);
             if (toUrl == null) {
               continue;
             }
@@ -313,14 +313,14 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
 
   public static String filterNormalize(String fromUrl, String toUrl,
       String fromHost, boolean ignoreExternalLinks, URLFilters filters,
-      URLExemptionCheckers exemptionCheckers, URLNormalizers normalizers) {
+      URLExemptionChecker exemptionCheckers, URLNormalizers normalizers) {
     return filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
         filters, exemptionCheckers, normalizers, URLNormalizers.SCOPE_OUTLINK);
   }
 
   public static String filterNormalize(String fromUrl, String toUrl,
       String fromHost, boolean ignoreExternalLinks, URLFilters filters,
-      URLExemptionCheckers exemptionCheckers, URLNormalizers normalizers,
+      URLExemptionChecker exemptionChecker, URLNormalizers normalizers,
                                        String urlNormalizerScope) {
     // ignore links to self (or anchors within the page)
     if (fromUrl.equals(toUrl)) {
@@ -334,8 +334,8 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
         toHost = null;
       }
       if (toHost == null || !toHost.equals(fromHost)) { // external links
-        if ( exemptionCheckers != null
-            && exemptionCheckers.isExempted(fromUrl, toUrl)) {
+        if ( exemptionChecker != null
+            && exemptionChecker.isExempted(fromUrl, toUrl)) {
           // This url is exempted.
           LOG.info("External Link is exempted from ignore :: {}",  toUrl);
         } else {
diff --git a/src/plugin/nutch-extensionpoints/plugin.xml b/src/plugin/nutch-extensionpoints/plugin.xml
index e095c1c..8cf7a23 100644
--- a/src/plugin/nutch-extensionpoints/plugin.xml
+++ b/src/plugin/nutch-extensionpoints/plugin.xml
@@ -49,6 +49,10 @@
       name="Nutch URL Filter"/>
 
 <extension-point
+        id="org.apache.nutch.net.URLExemptionFilter"
+        name="Nutch URL Ignore Exemption Filter"/>
+
+<extension-point
       id="org.apache.nutch.net.URLNormalizer"
       name="Nutch URL Normalizer"/>
 
diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
index cdd5113..fad0461 100644
--- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
+++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -38,60 +38,45 @@ import java.util.*;
 import java.util.regex.Pattern;
 
 
-
 /**
- * <p>
- * Filters URLs based on a file containing domain suffixes, domain names, and
- * hostnames. Only a url that matches one of the suffixes, domains, or hosts
- * present in the file is allowed.
- * </p>
- *
- * <p>
- * Urls are checked in order of domain suffix, domain name, and hostname against
- * entries in the domain file. The domain file would be setup as follows with
- * one entry per line:
- *
- * <pre>
- * com apache.org www.apache.org
- * </pre>
- *
- * <p>
- * The first line is an example of a filter that would allow all .com domains.
- * The second line allows all urls from apache.org and all of its subdomains
- * such as lucene.apache.org and hadoop.apache.org. The third line would allow
- * only urls from www.apache.org. There is no specific ordering to entries. The
- * entries are from more general to more specific with the more general
- * overridding the more specific.
- * </p>
- *
- * The domain file defaults to domain-urlfilter.txt in the classpath but can be
- * overridden using the:
- *
- * <ul>
- * <ol>
- * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
- * </ol>
- * <ol>
- * attribute "file" in plugin.xml of this plugin
- * </ol>
- * </ul>
- *
- * the attribute "file" has higher precedence if defined.
- */
-/**
- * This implementation of {@link URLFilter} checks if URL is eligible for db ignore exemption.
+ * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} checks uses regex and mime-type configuration
+ * to check if URL is eligible for exemption from 'db.ignore.external'.
  * When this filter is enabled, urls will be checked against configured set of regex and mimetype rules.
  *
  * This plugin needs to be enabled by setting 'db.ignore.external.exemptions' to <code>true</code>
  *
  * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
- * overridden using the:
+ * overridden using the <code>
+ * property "db.ignore.external.exemptions.file" in ./conf/nutch-*.xml, and
+ * </code>
+ *
  *
- * <ul>
+ * The exemption rules are specified in plain text file where each line is a rule.
+ * <br/><code>MimeType1,MimeType2,MimeTypeN=UrlRegex</code><br/>
+ * The first occurance of '=' from left divides comma separated mime-types with the url regex.
+ * When the url matches regex and content type is present in the specified list,
+ * then it is considered as exceptional URL.<br/>
+ * <h3>Examples:</h3>
  * <ol>
- * property "db.ignore.external.exemptions.file" in ./conf/nutch-*.xml, and
+ *   <li>
+ *     <b>Exempt urls ending with .jpg or .png and have content type image/jpeg or image/png </b>
+ *      <br/><code>image/jpeg,image/png=.*\.jpg$|.*\.JPG$|.*\.png$|.*\.PNG$</code><br/><br/>
+ *   </li>
+ *   <li>
+ *  <b> Exempt all urls ending with gif, without looking for mimetypes.</b>
+ *    <br/><code>=.*\.gif$</code><br/>
+ *    <i>Note : Mimes are empty => accept any mimetype</i>
+ *    <br/>
+ *   </li>
+ *   <li>
+ *      <b>Exempt all urls having mimetype image/jpeg or image/png.</b><br/>
+ *      <br/><code>image/jpeg,image/png=.*</code><br/>
+ *       <i>Note : .* regex matches all urls</i>
+ *       <br/>
+ *   </li>
  * </ol>
- * <ol>
+ </pre>
+*
  * @author Thamme Gowda
  * @since October 5, 2015
  * @version 1
@@ -180,7 +165,7 @@ public class ExemptionUrlFilter implements URLFilter {
       LOG.info("DB Ignore Exemptions are not Enabled. To enable, set '{}' and '{}' to 'true'",
           LinkDb.IGNORE_EXTERNAL_LINKS, DB_IGNORE_EXTERNAL_EXEMPTIONS);
     } else{
-      LOG.info("Ignore exceptions enabled");
+      LOG.info("Ignore exemptions enabled");
       RequestConfig requestConfig = RequestConfig.custom()
           .setConnectTimeout(conf.getInt("http.timeout", 10 * 1000))
           .build();
-- 
2.1.4


From 5fa3ed6c109138081fb0d75ef29d769dac24c7a2 Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Sat, 10 Oct 2015 02:30:24 -0700
Subject: [PATCH 4/7] Auto activate urlfilter-ignoreexempt plugin.

+ Plugin initialization issue fixed
+ unwanted 'db.ignore.external.exemptions' config removed
+ README added
---
 conf/nutch-default.xml                             |  14 +-
 src/java/org/apache/nutch/ExemptionUrlFilter.java  |   0
 .../org/apache/nutch/net/URLExemptionChecker.java  |  31 ++++-
 src/plugin/urlfilter-ignoreexempt/README.md        |  34 +++++
 .../urlfilter/ignoreexempt/ExemptionUrlFilter.java | 150 ++++++++++-----------
 5 files changed, 137 insertions(+), 92 deletions(-)
 create mode 100644 src/java/org/apache/nutch/ExemptionUrlFilter.java
 create mode 100644 src/plugin/urlfilter-ignoreexempt/README.md

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index bdc16d1..75020ea 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -555,22 +555,10 @@
 </property>
 
 <property>
-  <name>db.ignore.external.exemptions</name>
-  <value>false</value>
-  <description>
-    If true, external resources obeying to rules specified in
-    'db.ignore.external.exemptions.file' will not be ignored.
-    This config overrides 'db.ignore.external.links' setting to
-    the resources which match to exemption rules.
-  </description>
-</property>
-
-<property>
   <name>db.ignore.external.exemptions.file</name>
   <value>db-ignore-external-exemptions.txt</value>
   <description>
-    This file contains exemption rules to 'db.ignore.external.links'.
-    To activate this, 'db.ignore.external.exemptions' must be set to true
+    This file contains exemption rules used by 'urlfilter-ignoreexempt' plugin
   </description>
 </property>
 
diff --git a/src/java/org/apache/nutch/ExemptionUrlFilter.java b/src/java/org/apache/nutch/ExemptionUrlFilter.java
new file mode 100644
index 0000000..e69de29
diff --git a/src/java/org/apache/nutch/net/URLExemptionChecker.java b/src/java/org/apache/nutch/net/URLExemptionChecker.java
index 2b2a1f6..1ae2741 100644
--- a/src/java/org/apache/nutch/net/URLExemptionChecker.java
+++ b/src/java/org/apache/nutch/net/URLExemptionChecker.java
@@ -18,24 +18,47 @@
 package org.apache.nutch.net;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /** Creates and caches {@link URLExemptionFilter} implementing plugins. */
 public class URLExemptionChecker {
 
+  private static final Logger LOG = LoggerFactory.getLogger(URLExemptionChecker.class);
+
   private URLExemptionFilter[] filters;
 
   public URLExemptionChecker(Configuration conf) {
-    this.filters = (URLExemptionFilter[]) PluginRepository.get(conf).getOrderedPlugins(
-        URLExemptionFilter.class, URLExemptionFilter.X_POINT_ID, URLFilters.URLFILTER_ORDER);
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLExemptionFilter.X_POINT_ID).getExtensions();
+    filters = new URLExemptionFilter[extensions.length];
+    for (int i = 0; i < extensions.length; i++) {
+      try {
+        filters[i] = (URLExemptionFilter) extensions[i].getExtensionInstance();
+      } catch (PluginRuntimeException e) {
+        throw new IllegalStateException(e);
+      }
+    }
+    LOG.info("Found {} extensions at point:'{}'", filters.length,
+        URLExemptionFilter.X_POINT_ID);
   }
 
+
   /** Run all defined filters. Assume logical AND. */
   public boolean isExempted(String fromUrl, String toUrl) {
-    boolean exempted = fromUrl != null && toUrl != null; //Initially assume the url is exempted
+    if (filters.length < 1) {
+      //at least one filter should be on
+      return false;
+    }
+    //validate from, to and filters
+    boolean exempted = fromUrl != null && toUrl != null;
+    //An URL is exempted when all the filters accept it to pass through
     for (int i = 0; i < this.filters.length && exempted; i++) {
       exempted = this.filters[i].filter(fromUrl, toUrl);
     }
     return exempted;
   }
-}
+}
\ No newline at end of file
diff --git a/src/plugin/urlfilter-ignoreexempt/README.md b/src/plugin/urlfilter-ignoreexempt/README.md
new file mode 100644
index 0000000..de1c912
--- /dev/null
+++ b/src/plugin/urlfilter-ignoreexempt/README.md
@@ -0,0 +1,34 @@
+urlfilter-ignoreexempt
+======================
+  This plugin allows certain urls to be exempted when the external links are configured to be ignored. This is useful when a focused crawl is set up but some resources, such as static files, are linked from CDNs (external domains).
+
+How to enable ?
+==============
+Add `urlfilter-ignoreexempt` value to `plugin.includes` property
+```xml
+<property>
+  <name>plugin.includes</name>
+  <value>protocol-http|urlfilter-(regex|ignoreexempt)...</value>
+</property>
+```
+
+How to configure ?
+================
+
+open `conf/db-ignore-external-exemptions.txt` and add rules
+
+## Format :
+
+`MimeType1,MimeType2=UrlRegex`
+
+The first occurrence of '=' from left divides comma separated mime-types with the url regex.
+When the url matches regex and content type is present in the specified list, it is considered as exempted URL.
+
+## Example :
+
+__Exempt urls ending with .jpg or .png and has content type image/jpeg or image/png__  
+```image/jpeg,image/png=.*\.jpg$|.*\.JPG$|.*\.png$|.*\.PNG$```
+
+__Exempt urls ending with gif, without looking for mimetypes.__  
+_Note : Mimes are empty => accept any mimetype_  
+`=.*\.gif$`
diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
index fad0461..b665ebb 100644
--- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
+++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -25,37 +25,33 @@ import org.apache.http.client.methods.HttpHead;
 import org.apache.http.entity.ContentType;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.nutch.crawl.LinkDb;
 import org.apache.nutch.metadata.HttpHeaders;
-import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.net.URLExemptionFilter;
 import org.apache.nutch.util.NutchConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.*;
+import java.util.Arrays;
 import java.util.regex.Pattern;
+import java.util.*;
 
 
 /**
  * This implementation of {@link org.apache.nutch.net.URLExemptionFilter} checks uses regex and mime-type configuration
  * to check if URL is eligible for exemption from 'db.ignore.external'.
- * When this filter is enabled, urls will be checked against configured set of regex and mimetype rules.
- *
- * This plugin needs to be enabled by setting 'db.ignore.external.exemptions' to <code>true</code>
- *
+ * When this filter is enabled, urls will be checked against configured sequence of regex and mimetype rules.
+ *<p>
  * The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
- * overridden using the <code>
- * property "db.ignore.external.exemptions.file" in ./conf/nutch-*.xml, and
- * </code>
- *
+ * overridden using the property  <code>"db.ignore.external.exemptions.file" in ./conf/nutch-*.xml</code>
+ *</p>
  *
  * The exemption rules are specified in plain text file where each line is a rule.
  * <br/><code>MimeType1,MimeType2,MimeTypeN=UrlRegex</code><br/>
  * The first occurance of '=' from left divides comma separated mime-types with the url regex.
  * When the url matches regex and content type is present in the specified list,
- * then it is considered as exceptional URL.<br/>
+ * then url is exempted from 'db.ignore.external...'<br/>
  * <h3>Examples:</h3>
  * <ol>
  *   <li>
@@ -76,16 +72,15 @@ import java.util.regex.Pattern;
  *   </li>
  * </ol>
  </pre>
-*
+ *
  * @author Thamme Gowda
  * @since October 5, 2015
  * @version 1
- * @see URLFilter
+ * @see URLExemptionFilter
  */
-public class ExemptionUrlFilter implements URLFilter {
+public class ExemptionUrlFilter implements URLExemptionFilter {
 
   public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE = "db.ignore.external.exemptions.file";
-  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS = "db.ignore.external.exemptions";
   private static final Logger LOG = LoggerFactory.getLogger(ExemptionUrlFilter.class);
   private static ExemptionUrlFilter INSTANCE;
 
@@ -131,84 +126,89 @@ public class ExemptionUrlFilter implements URLFilter {
     } catch (Exception e) {
       LOG.debug("{} while trying to HTTP HEAD on {}", e.getMessage(), urlString);
     } finally {
-        IOUtils.closeQuietly(response);
-        IOUtils.closeQuietly(client);
+      IOUtils.closeQuietly(response);
+      IOUtils.closeQuietly(client);
     }
     // couldn't get mime type
     return null;
   }
 
   @Override
-  public String filter(String urlString) {
+  public boolean filter(String fromUrl, String toUrl) {
+    //this implementation doesnt do anything with fromUrl
     if (exemptions != null) {
+      String mimeType = null;
       for (Pattern pattern : exemptions.keySet()) {
-        if (pattern.matcher(urlString).matches()) {
+        //condition 1: regex should match
+        if (pattern.matcher(toUrl).matches()) {
           Set<String> mimes = exemptions.get(pattern);
-          if (mimes.isEmpty() || mimes.contains(getContentType(urlString))) {
+          //condition 2a) Dont care mimes
+          if (mimes.isEmpty()) {
             //when mimes are empty, it means don't care
-            //when mimes are specified, perform match
-            return urlString;
+            return true; //exempted
+          }
+          if (mimeType == null) {
+            //get it lazily, only once in this loop
+            mimeType = getContentType(toUrl);
+          }
+          //condition 2b) mime type also matches
+          if (mimes.contains(mimeType)) {
+            // exempted
+            return true;
           }
         }
       }
     }
-    return null;
+    //not exempted
+    return false;
   }
 
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
-    boolean ignoreExternal = conf.getBoolean(LinkDb.IGNORE_EXTERNAL_LINKS, false);
-    boolean ignoreExternalExemptions = conf.getBoolean(DB_IGNORE_EXTERNAL_EXEMPTIONS, false);
-    this.enabled =  ignoreExternal && ignoreExternalExemptions;
-    if (!enabled) {
-      LOG.info("DB Ignore Exemptions are not Enabled. To enable, set '{}' and '{}' to 'true'",
-          LinkDb.IGNORE_EXTERNAL_LINKS, DB_IGNORE_EXTERNAL_EXEMPTIONS);
-    } else{
-      LOG.info("Ignore exemptions enabled");
-      RequestConfig requestConfig = RequestConfig.custom()
-          .setConnectTimeout(conf.getInt("http.timeout", 10 * 1000))
-          .build();
-      this.clientBuilder = HttpClientBuilder.create()
-          .setDefaultRequestConfig(requestConfig)
-          .setUserAgent(conf.get("http.agent.name"));
-
-      String fileName = this.conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
-      InputStream stream = this.conf.getConfResourceAsInputStream(fileName);
-      if (stream == null) {
-        throw new RuntimeException("Couldn't find config file :" + fileName);
-      }
-      try {
-        this.exemptions = new LinkedHashMap<Pattern, Set<String>>();
-        List<String> lines = IOUtils.readLines(stream);
-        for (String line : lines) {
-          line = line.trim();
-          if (line.startsWith("#") || line.isEmpty()) {
-            continue; //Skip : comment line or empty line
-          }
-          int firstIndex = line.indexOf('=');
-          if (firstIndex == -1) {
-            // No Split! Invalid
-            LOG.error("{} : Invalid Config  :: {}", fileName, line);
-            continue;
-          }
-          String mimeString = line.substring(0, firstIndex).trim();
-          String regex = line.substring(firstIndex + 1, line.length()).trim();
-          if (regex.isEmpty()) {
-            LOG.error("{} : Invalid Config  :: {}", fileName, line);
-            continue;
-          }
-          //NOTE:empty mime string means don't care => */*
-
-          HashSet<String> mimes = new HashSet<String>(Arrays.asList(mimeString.split(",")));
-          Pattern compiled = Pattern.compile(regex);
-          LOG.info("URL rule :: {} <=> {}", regex, mimes);
-          exemptions.put(compiled, mimes);
+    LOG.info("Ignore exemptions enabled");
+    RequestConfig requestConfig = RequestConfig.custom()
+        .setConnectTimeout(conf.getInt("http.timeout", 10 * 1000))
+        .build();
+    this.clientBuilder = HttpClientBuilder.create()
+        .setDefaultRequestConfig(requestConfig)
+        .setUserAgent(conf.get("http.agent.name"));
+
+    String fileName = this.conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE);
+    InputStream stream = this.conf.getConfResourceAsInputStream(fileName);
+    if (stream == null) {
+      throw new RuntimeException("Couldn't find config file :" + fileName);
+    }
+    try {
+      this.exemptions = new LinkedHashMap<Pattern, Set<String>>();
+      List<String> lines = IOUtils.readLines(stream);
+      for (String line : lines) {
+        line = line.trim();
+        if (line.startsWith("#") || line.isEmpty()) {
+          continue; //Skip : comment line or empty line
+        }
+        int firstIndex = line.indexOf('=');
+        if (firstIndex == -1) {
+          // No Split! Invalid
+          LOG.error("{} : Invalid Config  :: {}", fileName, line);
+          continue;
         }
-        LOG.info("Read {} rules from {}", exemptions.size(), fileName);
-      } catch (IOException e) {
-        throw new IllegalStateException(e);
+        String mimeString = line.substring(0, firstIndex).trim();
+        String regex = line.substring(firstIndex + 1, line.length()).trim();
+        if (regex.isEmpty()) {
+          LOG.error("{} : Invalid Config  :: {}", fileName, line);
+          continue;
+        }
+        //NOTE:empty mime string means don't care => */*
+
+        HashSet<String> mimes = new HashSet<String>(Arrays.asList(mimeString.split(",")));
+        Pattern compiled = Pattern.compile(regex);
+        LOG.info("URL rule :: {} <=> {}", regex, mimes);
+        exemptions.put(compiled, mimes);
       }
+      LOG.info("Read {} rules from {}", exemptions.size(), fileName);
+    } catch (IOException e) {
+      throw new IllegalStateException(e);
     }
   }
 
@@ -220,10 +220,10 @@ public class ExemptionUrlFilter implements URLFilter {
   public static void main(String[] args) {
     if (args.length != 1) {
       System.out.println("Error: Invalid Args");
-      System.out.println("Usage:" + ExemptionUrlFilter.class.getName() + " <url>");
+      System.out.println("Usage:" + ExemptionUrlFilter.class.getName() + " <url>");
       return;
     }
     String url = args[0];
-    System.out.println(ExemptionUrlFilter.getInstance().filter(url) != null);
+    System.out.println(ExemptionUrlFilter.getInstance().filter(null, url));
   }
 }
-- 
2.1.4


From 6f6ea997bfc16d2fe401c0b584b9e48b6089df6d Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Mon, 12 Oct 2015 14:01:25 -0700
Subject: [PATCH 5/7] fix: ignore empty string in mimetype conf

---
 .../org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
index b665ebb..6729b2b 100644
--- a/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
+++ b/src/plugin/urlfilter-ignoreexempt/src/java/org/apache/nutch/urlfilter/ignoreexempt/ExemptionUrlFilter.java
@@ -200,8 +200,10 @@ public class ExemptionUrlFilter implements URLExemptionFilter {
           continue;
         }
         //NOTE:empty mime string means don't care => */*
-
-        HashSet<String> mimes = new HashSet<String>(Arrays.asList(mimeString.split(",")));
+        HashSet<String> mimes = new HashSet<String>();
+        if (!mimeString.isEmpty()) { // "".split(",") yields [""], which must not become a mime entry
+          mimes.addAll(Arrays.asList(mimeString.split(",")));
+        }
         Pattern compiled = Pattern.compile(regex);
         LOG.info("URL rule :: {} <=> {}", regex, mimes);
         exemptions.put(compiled, mimes);
-- 
2.1.4


From b5b31e630a7653df194d8f8ca806cd526a8d8744 Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Sun, 18 Oct 2015 21:49:13 -0700
Subject: [PATCH 6/7] Delete unused files

---
 src/plugin/urlfilter-ignoreexempt/data/hosts.txt   |  5 ---
 .../urlfilter/domain/TestDomainURLFilter.java      | 48 ----------------------
 2 files changed, 53 deletions(-)
 delete mode 100644 src/plugin/urlfilter-ignoreexempt/data/hosts.txt
 delete mode 100644 src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java

diff --git a/src/plugin/urlfilter-ignoreexempt/data/hosts.txt b/src/plugin/urlfilter-ignoreexempt/data/hosts.txt
deleted file mode 100644
index 2b88c3b..0000000
--- a/src/plugin/urlfilter-ignoreexempt/data/hosts.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-# comments start with the pound sign
-net
-apache.org
-be
-www.yahoo.com
\ No newline at end of file
diff --git a/src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
deleted file mode 100644
index 466fd78..0000000
--- a/src/plugin/urlfilter-ignoreexempt/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.urlfilter.domain;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class TestDomainURLFilter {
-
-  private final static String SEPARATOR = System.getProperty("file.separator");
-  private final static String SAMPLES = System.getProperty("test.data", ".");
-
-  @Test
-  public void testFilter() throws Exception {
-
-    String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
-    Configuration conf = NutchConfiguration.create();
-    DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
-    domainFilter.setConf(conf);
-    Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
-    Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
-    Assert.assertNotNull(domainFilter.filter("http://www.apache.org"));
-    Assert.assertNull(domainFilter.filter("http://www.google.com"));
-    Assert.assertNull(domainFilter.filter("http://mail.yahoo.com"));
-    Assert.assertNotNull(domainFilter.filter("http://www.foobar.net"));
-    Assert.assertNotNull(domainFilter.filter("http://www.foobas.net"));
-    Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com"));
-    Assert.assertNotNull(domainFilter.filter("http://www.foobar.be"));
-    Assert.assertNull(domainFilter.filter("http://www.adobe.com"));
-  }
-
-}
-- 
2.1.4


From 8a3ea9e9853909bd53567e39beceaf1c6a508687 Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Sun, 18 Oct 2015 22:41:04 -0700
Subject: [PATCH 7/7] Improve documentation

---
 conf/db-ignore-external-exemptions.txt      | 29 ++++++++++++++++++++---------
 src/plugin/urlfilter-ignoreexempt/README.md | 29 +++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/conf/db-ignore-external-exemptions.txt b/conf/db-ignore-external-exemptions.txt
index fe22b9c..4baf877 100644
--- a/conf/db-ignore-external-exemptions.txt
+++ b/conf/db-ignore-external-exemptions.txt
@@ -1,17 +1,28 @@
-# Exceptions to db ignore
+# Exemption rules to db.ignore.external.links
 #
 # Format :
 #--------
-# MimeType=UrlRegex
+# MimeType1,MimeType2=UrlRegex
 # The first occurance of '=' divides comma separated mime-types with the url regex.
-# When the url matches regex and content type is present in the specified list,it is considered as exceptional URL.
+# When the url matches regex and content type is present in the specified list, then that
+# url will not be ignored. Mimetypes are optional.
 
 
-# Example :
+# Example 1:
 #----------
-# To ingore except urls ending with .jpg or .png and has content type image/jpeg or image/png
-image/jpeg,image/png=.*\.jpg$|.*\.JPG$|.*\.png$|.*\.PNG$
+# To exempt urls ending with an image extension and having content type image/jpeg, image/png or image/gif
 
-# To accept all urls ending with gif, without looking for mimetypes.
-#Note : Mimes are empty => accept any mimetype
-=.*\.gif$
+image/jpeg,image/png,image/gif=.*\.(jpg|JPG|png$|PNG|gif|GIF)$
+
+# NOTE: Mime Type detection requires HTTP HEAD call, which is a costly task. Use example 2 when feasible
+
+
+
+# Example 2:
+#----------
+# To exempt urls ending with image extensions
+
+=.*\.(jpg|JPG|png$|PNG|gif|GIF)$
+
+# NOTE: mime types are empty. This case accepts all urls matching to regex without probing the mimetypes,
+# No HTTP HEAD call will be made. 
diff --git a/src/plugin/urlfilter-ignoreexempt/README.md b/src/plugin/urlfilter-ignoreexempt/README.md
index de1c912..2d53809 100644
--- a/src/plugin/urlfilter-ignoreexempt/README.md
+++ b/src/plugin/urlfilter-ignoreexempt/README.md
@@ -22,13 +22,30 @@ open `conf/db-ignore-external-exemptions.txt` and add rules
 `MimeType1,MimeType2=UrlRegex`
 
 The first occurrence of '=' from left divides comma separated mime-types with the url regex.
-When the url matches regex and content type is present in the specified list, it is considered as exempted URL.
+When the url matches regex and content type is present in the specified list, the url will be exempted from db.ignore.external.
 
 ## Example :
 
-__Exempt urls ending with .jpg or .png and has content type image/jpeg or image/png__  
-```image/jpeg,image/png=.*\.jpg$|.*\.JPG$|.*\.png$|.*\.PNG$```
+1. Exempt URLS ending with image file extension
 
-__Exempt urls ending with gif, without looking for mimetypes.__  
-_Note : Mimes are empty => accept any mimetype_  
-`=.*\.gif$`
+    `=.*\.(jpg|JPG|png|PNG|gif|GIF)$`
+
+2. Exempt URLs ending with an image file extension and having an image mime-type
+
+    `image/jpeg,image/png,image/gif=.*\.(jpg|JPG|png|PNG|gif|GIF)$`
+> NOTE: this costs an HTTP HEAD call for every url matched by regex
+
+
+# Testing
+
+   After enabling the plugin and adding your rules to `conf/db-ignore-external-exemptions.txt`, run:
+   
+   `bin/nutch plugin urlfilter-ignoreexempt  org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://yoururl.here`
+   
+## Example :
+
+
+    bin/nutch plugin urlfilter-ignoreexempt  org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://nutch.apache.org/
+    bin/nutch plugin urlfilter-ignoreexempt  org.apache.nutch.urlfilter.ignoreexempt.ExemptionUrlFilter http://nutch.apache.org/assets/img/nutch_logo_tm.png
+
+This should print `true` for urls which are accepted by configured rules.
\ No newline at end of file
-- 
2.1.4


From 43a206c7074fdb43673204d6b8d8d237516c6fde Mon Sep 17 00:00:00 2001
From: Thamme Gowda <tgowdan@gmail.com>
Date: Mon, 19 Oct 2015 00:21:06 -0700
Subject: [PATCH 8/8] delete unused file

---
 src/java/org/apache/nutch/ExemptionUrlFilter.java   | 0
 src/plugin/urlfilter-ignoreexempt/data/.donotdelete | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 src/java/org/apache/nutch/ExemptionUrlFilter.java
 create mode 100644 src/plugin/urlfilter-ignoreexempt/data/.donotdelete

diff --git a/src/java/org/apache/nutch/ExemptionUrlFilter.java b/src/java/org/apache/nutch/ExemptionUrlFilter.java
deleted file mode 100644
index e69de29..0000000
diff --git a/src/plugin/urlfilter-ignoreexempt/data/.donotdelete b/src/plugin/urlfilter-ignoreexempt/data/.donotdelete
new file mode 100644
index 0000000..e69de29
-- 
2.1.4

