Index: conf/bad-words.txt.template
===================================================================
--- conf/bad-words.txt.template	(revision 0)
+++ conf/bad-words.txt.template	(revision 0)
@@ -0,0 +1 @@
+# bad words one per line
\ No newline at end of file
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 951473)
+++ conf/nutch-default.xml	(working copy)
@@ -1105,6 +1105,20 @@
   </description>
 </property>
 
+<!-- fetch filter properties -->
+
+<property>
+  <name>fetchfilter.order</name>
+  <value></value>
+  <description>The order by which fetch filters are applied.
+  If empty, all available fetch filters (as dictated by the properties
+  plugin.includes and plugin.excludes above) are loaded and applied in a
+  system-defined order. If not empty, only the named filters are loaded and
+  applied in the given order.  This is similar to how urlfilters operate.
+  </description>
+</property>
+
+
 <!-- scoring filters properties -->
 
 <property>
Index: src/java/org/apache/nutch/fetcher/FetchFilter.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetchFilter.java	(revision 0)
+++ src/java/org/apache/nutch/fetcher/FetchFilter.java	(revision 0)
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Extension point for plugins that filter fetched content.  Usually this is
+ * done after fetching has completed but before data is updated back into the
+ * CrawlDb.
+ *
+ * @author dennis
+ */
+public interface FetchFilter extends Pluggable, Configurable {
+
+  /** Name of the fetch filter extension point. */
+  String X_POINT_ID = FetchFilter.class.getName();
+
+  /**
+   * Decides whether a fetched page should be kept.
+   *
+   * @param url The url to filter.
+   * @param content The content to filter.
+   * @param parseResult The parse result containing parseText and parseData.
+   *
+   * @return True if the record passes and should be included, false if it
+   * does not pass and should be dropped.
+   */
+  boolean filter(String url, Content content, ParseResult parseResult);
+
+}
Index: src/java/org/apache/nutch/fetcher/FetchFilterException.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetchFilterException.java	(revision 0)
+++ src/java/org/apache/nutch/fetcher/FetchFilterException.java	(revision 0)
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+/**
+ * Checked exception that reports an error while applying fetch filters.
+ */
+public class FetchFilterException
+  extends Exception {
+
+  private static final long serialVersionUID = 1L;
+
+  /** Creates an exception without a message or cause. */
+  public FetchFilterException() {
+    super();
+  }
+
+  /** Creates an exception with the given message. */
+  public FetchFilterException(String message) {
+    super(message);
+  }
+
+  /** Creates an exception with the given message and underlying cause. */
+  public FetchFilterException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /** Creates an exception wrapping the given underlying cause. */
+  public FetchFilterException(Throwable cause) {
+    super(cause);
+  }
+
+}
Index: src/java/org/apache/nutch/fetcher/FetchFilters.java
===================================================================
--- src/java/org/apache/nutch/fetcher/FetchFilters.java	(revision 0)
+++ src/java/org/apache/nutch/fetcher/FetchFilters.java	(revision 0)
@@ -0,0 +1,131 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.ObjectCache;
+
+/**
+ * Loads the {@link FetchFilter} plugins and applies them as a chain. The
+ * loaded filters are cached in the {@link ObjectCache} of the configuration.
+ */
+public class FetchFilters {
+
+  public static final String FETCH_FILTER_ORDER = "fetchfilter.order";
+  private FetchFilter[] filters;
+
+  /**
+   * Creates the filter chain, reusing the filters cached for the
+   * configuration if there are any and loading them otherwise.
+   *
+   * @param conf The configuration to read the filter order and plugins from.
+   *
+   * @throws RuntimeException If the extension point is not found or a filter
+   * cannot be instantiated.
+   */
+  public FetchFilters(Configuration conf) {
+
+    ObjectCache objectCache = ObjectCache.get(conf);
+    String cacheKey = FetchFilter.class.getName();
+    FetchFilter[] cached = (FetchFilter[])objectCache.getObject(cacheKey);
+    if (cached == null) {
+      cached = loadFilters(conf);
+      objectCache.setObject(cacheKey, cached);
+    }
+    this.filters = cached;
+  }
+
+  /**
+   * Instantiates the available fetch filters. If "fetchfilter.order" names
+   * filter classes, only those are returned, in the given order; names that
+   * match no available filter are skipped. Otherwise all filters are returned.
+   */
+  private static FetchFilter[] loadFilters(Configuration conf) {
+
+    // one instance per filter class, keyed by class name
+    Map<String, FetchFilter> filterMap = new HashMap<String, FetchFilter>();
+    try {
+      ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        FetchFilter.X_POINT_ID);
+      if (point == null) {
+        throw new RuntimeException(FetchFilter.X_POINT_ID + " not found.");
+      }
+      for (Extension extension : point.getExtensions()) {
+        FetchFilter filter = (FetchFilter)extension.getExtensionInstance();
+        String className = filter.getClass().getName();
+        if (!filterMap.containsKey(className)) {
+          filterMap.put(className, filter);
+        }
+      }
+    }
+    catch (PluginRuntimeException e) {
+      throw new RuntimeException(e);
+    }
+
+    String order = conf.get(FETCH_FILTER_ORDER);
+    if (order == null || order.trim().equals("")) {
+      return filterMap.values().toArray(new FetchFilter[0]);
+    }
+
+    List<FetchFilter> ordered = new ArrayList<FetchFilter>();
+    for (String name : order.trim().split("\\s+")) {
+      FetchFilter filter = filterMap.get(name);
+      if (filter != null) {
+        ordered.add(filter);
+      }
+    }
+    return ordered.toArray(new FetchFilter[ordered.size()]);
+  }
+
+  /**
+   * Runs the filters in order until one of them rejects the page.
+   *
+   * @param url The url of the page.
+   * @param content The fetched content.
+   * @param parseResult The parse result of the content.
+   *
+   * @return False if a filter rejected the page; true if all filters passed
+   * or if content or parseResult is null, in which case no filter is run.
+   *
+   * @throws FetchFilterException Declared so callers can handle filter
+   * errors; nothing in this method throws it itself.
+   */
+  public boolean filter(String url, Content content, ParseResult parseResult)
+    throws FetchFilterException {
+
+    if (content == null || parseResult == null) {
+      return true;
+    }
+    for (FetchFilter filter : this.filters) {
+      if (!filter.filter(url, content, parseResult)) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java	(revision 951473)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java	(working copy)
@@ -128,6 +128,7 @@
 
   private boolean storingContent;
   private boolean parsing;
+  private boolean filtering;
   FetchItemQueues fetchQueues;
   QueueFeeder feeder;
   
@@ -540,12 +541,14 @@
     private boolean redirecting;
     private int redirectCount;
     private boolean ignoreExternalLinks;
+    private FetchFilters fetchFilters;
 
     public FetcherThread(Configuration conf) {
       this.setDaemon(true);                       // don't hang JVM on exit
       this.setName("FetcherThread");              // use an informative name
       this.conf = conf;
       this.urlFilters = new URLFilters(conf);
+      this.fetchFilters = new FetchFilters(conf);
       this.scfilters = new ScoringFilters(conf);
       this.parseUtil = new ParseUtil(conf);
       this.protocolFactory = new ProtocolFactory(conf);
@@ -852,7 +855,7 @@
 
     private ParseStatus output(Text key, CrawlDatum datum,
                         Content content, ProtocolStatus pstatus, int status) {
-
+      
       datum.setStatus(status);
       datum.setFetchTime(System.currentTimeMillis());
       if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
@@ -874,6 +877,7 @@
         /* Note: Fetcher will only follow meta-redirects coming from the
          * original URL. */ 
         if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
+          
           try {
             parseResult = this.parseUtil.parse(content);
           } catch (Exception e) {
@@ -886,12 +890,44 @@
                   new ParseStatus().getEmptyParse(conf));
             datum.setSignature(signature);
           }
+
+          // If filtering is enabled, run the fetch filters over the content and
+          // parse data; only records that pass are written to the fetch output.
+          if (filtering) {
+            String url = key.toString();
+            try {
+              if (!fetchFilters.filter(url, content, parseResult)) {
+                // Mark the page as gone so that fetching does not continue down
+                // this url path and its urls are not updated.
+                status = CrawlDatum.STATUS_FETCH_GONE;
+                datum.setStatus(status);
+                datum.setFetchTime(System.currentTimeMillis());
+                // Record the reason even if the fetch supplied no protocol status.
+                datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY,
+                  new ProtocolStatus(ProtocolStatus.GONE,
+                    "Gone due to fetch filters"));
+
+                // Drop content and parse result so the page is ignored.
+                content = null;
+                parseResult = null;
+              }
+            }
+            catch (FetchFilterException e) {
+              // Best effort: a failing filter does not fail the fetch.
+              LOG.warn("Skip filtering key, error in fetch filter " + url + ": " +
+                e.getMessage());
+            }
+          }
+
         }
         
         /* Store status code in content So we can read this value during 
          * parsing (as a separate job) and decide to parse or not.
          */
-        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
+        // content is null here when the fetch filters rejected the page.
+        if (content != null) {
+          content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
+        }
       }
 
       try {
@@ -982,6 +1018,7 @@
     this.segmentName = job.get(Nutch.SEGMENT_NAME_KEY);
     this.storingContent = isStoringContent(job);
     this.parsing = isParsing(job);
+    this.filtering = isFiltering(job);   // reads "fetcher.filter", not "fetcher.parse"
 
 //    if (job.getBoolean("fetcher.verbose", false)) {
 //      LOG.setLevel(Level.FINE);
@@ -993,6 +1030,10 @@
   public static boolean isParsing(Configuration conf) {
     return conf.getBoolean("fetcher.parse", true);
   }
+  /** Returns the "fetcher.filter" setting of the configuration; true if unset. */
+  public static boolean isFiltering(Configuration conf) {
+    return conf.getBoolean("fetcher.filter", true);
+  }
 
   public static boolean isStoringContent(Configuration conf) {
     return conf.getBoolean("fetcher.store.content", true);
@@ -1114,7 +1155,7 @@
   
   public int run(String[] args) throws Exception {
 
-    String usage = "Usage: Fetcher <segment> [-threads n] [-noParsing]";
+    String usage = "Usage: Fetcher <segment> [-threads n] [-filtering] [-noParsing]";
 
     if (args.length < 1) {
       System.err.println(usage);
@@ -1129,7 +1170,13 @@
     for (int i = 1; i < args.length; i++) {       // parse command line
       if (args[i].equals("-threads")) {           // found -threads option
         threads =  Integer.parseInt(args[++i]);
-      } else if (args[i].equals("-noParsing")) parsing = false;
+      }
+      else if (args[i].equals("-noParsing")) {
+        parsing = false;
+      }
+      else if (args[i].equals("-filtering")) {    // pass the flag on to the job
+        getConf().setBoolean("fetcher.filter", true);
+      }
     }
 
     getConf().setInt("fetcher.threads.fetch", threads);
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 951473)
+++ src/plugin/build.xml	(working copy)
@@ -29,6 +29,7 @@
      <ant dir="clustering-carrot2" target="deploy"/>
      <ant dir="creativecommons" target="deploy"/>
      <ant dir="feed" target="deploy"/>
+     <ant dir="fetch-safe" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
@@ -130,6 +131,7 @@
     <ant dir="clustering-carrot2" target="clean"/>
     <ant dir="creativecommons" target="clean"/>
     <ant dir="feed" target="clean"/>
+    <ant dir="fetch-safe" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-anchor" target="clean"/>
     <ant dir="index-more" target="clean"/>
Index: src/plugin/fetch-safe/build.xml
===================================================================
--- src/plugin/fetch-safe/build.xml	(revision 0)
+++ src/plugin/fetch-safe/build.xml	(revision 0)
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="fetch-safe" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
Index: src/plugin/fetch-safe/plugin.xml
===================================================================
--- src/plugin/fetch-safe/plugin.xml	(revision 0)
+++ src/plugin/fetch-safe/plugin.xml	(revision 0)
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="fetch-safe"
+   name="Safe Content FetchFilter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="fetch-safe.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.fetcher.filter.safe"
+              name="Nutch Safe Content Fetch Filter"
+              point="org.apache.nutch.fetcher.FetchFilter">
+      <implementation id="SafeFetchFilter"
+        class="org.apache.nutch.fetcher.filter.safe.SafeFetchFilter">
+        <!-- <parameter name="file" value="bad-words.txt"/> -->
+      </implementation>
+   </extension>
+
+</plugin>
Index: src/plugin/fetch-safe/src/java/org/apache/nutch/fetcher/filter/safe/SafeFetchFilter.java
===================================================================
--- src/plugin/fetch-safe/src/java/org/apache/nutch/fetcher/filter/safe/SafeFetchFilter.java	(revision 0)
+++ src/plugin/fetch-safe/src/java/org/apache/nutch/fetcher/filter/safe/SafeFetchFilter.java	(revision 0)
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fetcher.filter.safe;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.fetcher.FetchFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * <p>Filters content for bad words based on a file containing a list of bad
+ * words. Only content that doesn't contain any of the bad words is allowed.</p>
+ *
+ * <p>The bad words file is set up with one word per line; blank lines and
+ * lines starting with # are ignored. If any bad word is found, the filter
+ * returns immediately.</p>
+ *
+ * <p>The bad words file defaults to bad-words.txt in the classpath but can be
+ * overridden using:</p>
+ *
+ * <ul>
+ * <li>the property "fetchfilter.badwords.file" in ./conf/nutch-*.xml, and</li>
+ * <li>the attribute "file" in plugin.xml of this plugin.</li>
+ * </ul>
+ *
+ * <p>The attribute "file" has higher precedence if defined.</p>
+ */
+public class SafeFetchFilter
+  implements FetchFilter {
+
+  private static final Log LOG = LogFactory.getLog(SafeFetchFilter.class);
+
+  private static final String PLUGIN_ID = "fetch-safe";
+  private static final String FILE_PROPERTY = "fetchfilter.badwords.file";
+  private static final String DEFAULT_FILE = "bad-words.txt";
+
+  private Configuration conf;
+  private String badWordsFile = null;
+  private Set<String> badWordSet = new HashSet<String>();
+
+  /**
+   * Reads the bad words, one per line, into the bad word set. Blank lines and
+   * lines starting with # are skipped; words are trimmed and lower-cased so a
+   * stray space in the file cannot stop a word from matching.
+   *
+   * @param configReader Reader over the bad words file; closed on return.
+   *
+   * @throws IOException If the file cannot be read.
+   */
+  private void readBadWordsFile(Reader configReader)
+    throws IOException {
+
+    BufferedReader reader = new BufferedReader(configReader);
+    try {
+      String line = null;
+      while ((line = reader.readLine()) != null) {
+        String word = StringUtils.trim(line);
+        if (StringUtils.isNotBlank(word) && !word.startsWith("#")) {
+          badWordSet.add(StringUtils.lowerCase(word));
+        }
+      }
+    }
+    finally {
+      reader.close();
+    }
+  }
+
+  /**
+   * Returns the "file" attribute declared for this plugin on the FetchFilter
+   * extension point, or null if it is missing or blank.
+   */
+  private static String readFileAttribute(Configuration conf) {
+
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+      FetchFilter.X_POINT_ID);
+    if (point == null) {
+      return null;
+    }
+    for (Extension extension : point.getExtensions()) {
+      if (PLUGIN_ID.equals(extension.getDescriptor().getPluginId())) {
+        String value = extension.getAttribute("file");
+        return StringUtils.isBlank(value) ? null : value;
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Returns true if any whitespace separated word of the text, lower-cased,
+   * is in the bad word set. A null text contains no bad words.
+   */
+  private boolean containsBadWord(String text) {
+
+    // StringUtils.split and lowerCase both return null for a null input
+    String[] words = StringUtils.split(StringUtils.lowerCase(text));
+    if (words == null) {
+      return false;
+    }
+    for (String word : words) {
+      if (badWordSet.contains(word)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Default constructor.
+   */
+  public SafeFetchFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the bad words file to use.
+   *
+   * @param badWordsFile The bad words file, overrides bad-words.txt default.
+   */
+  public SafeFetchFilter(String badWordsFile) {
+    this.badWordsFile = badWordsFile;
+  }
+
+  /**
+   * Sets the configuration and loads the bad words file. The file passed to
+   * the constructor wins over the plugin.xml attribute "file", which wins over
+   * the property "fetchfilter.badwords.file" (default bad-words.txt).
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    String attributeFile = readFileAttribute(conf);
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + PLUGIN_ID
+          + " as " + attributeFile);
+      }
+    }
+    else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+          + PLUGIN_ID);
+      }
+    }
+
+    // constructor file and attribute "file" take precedence if defined
+    String file = conf.get(FILE_PROPERTY, DEFAULT_FILE);
+    if (badWordsFile != null) {
+      file = badWordsFile;
+    }
+    else if (attributeFile != null) {
+      file = attributeFile;
+    }
+
+    // get the bad words file as a classpath resource, falling back to a plain
+    // file path, and populate the bad word set from it
+    try {
+      Reader reader = conf.getConfResourceAsReader(file);
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readBadWordsFile(reader);
+    }
+    catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Rejects the page if the text of any of its parses contains a bad word.
+   * The words on the page are found in a very naive way (split on
+   * whitespace); this simply demonstrates how the FetchFilter interface and
+   * plugin work.
+   *
+   * @return False if a bad word was found, true otherwise, including when
+   * there is no parse result or the filter could not be applied.
+   */
+  @Override
+  public boolean filter(String url, Content content, ParseResult parseResult) {
+
+    if (parseResult == null) {
+      return true;
+    }
+    try {
+      for (Entry<Text, Parse> entry : parseResult) {
+        if (containsBadWord(entry.getValue().getText())) {
+          return false;
+        }
+      }
+    }
+    catch (Exception e) {
+
+      LOG.error("Could not apply filter on: " + url + "\n"
+        + org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+
+    return true;
+  }
+}
Index: src/plugin/nutch-extensionpoints/plugin.xml
===================================================================
--- src/plugin/nutch-extensionpoints/plugin.xml	(revision 951473)
+++ src/plugin/nutch-extensionpoints/plugin.xml	(working copy)
@@ -65,6 +65,10 @@
       name="Nutch URL Normalizer"/>
 
 <extension-point
+      id="org.apache.nutch.fetcher.FetchFilter"
+      name="Nutch Fetch Filter"/>
+      
+<extension-point
       id="org.apache.nutch.analysis.NutchAnalyzer"
       name="Nutch Analysis"/>
 

