Index: src/plugin/index-blacklist-whitelist/src/java/at/scintillation/nutch/BlacklistWhitelistIndexer.java
===================================================================
--- src/plugin/index-blacklist-whitelist/src/java/at/scintillation/nutch/BlacklistWhitelistIndexer.java	(Revision 0)
+++ src/plugin/index-blacklist-whitelist/src/java/at/scintillation/nutch/BlacklistWhitelistIndexer.java	(Revision 0)
@@ -0,0 +1,47 @@
+package at.scintillation.nutch;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Indexing filter which adds the content that has been stripped and stored by the
+ * {@link BlacklistWhitelistParser} to the document.
+ * The index field containing the stripped content is called "strippedContent".
+ * @author Elisabeth Adler
+ */
+public class BlacklistWhitelistIndexer implements IndexingFilter
+{
+    /** Key of the stripped content in the parse data and name of the index field. */
+    private static final String FIELD = "strippedContent";
+
+    private Configuration conf;
+
+    /** Copy the stripped content, if the parse data holds any, into the document. */
+    @Override
+    public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
+        throws IndexingException
+    {
+        String stripped = parse.getData().getMeta(FIELD);
+        if (stripped == null)
+            return doc;
+        doc.add(FIELD, stripped);
+        return doc;
+    }
+
+    public void setConf(Configuration conf)
+    {
+        this.conf = conf;
+    }
+
+    public Configuration getConf()
+    {
+        return this.conf;
+    }
+
+}
\ No newline at end of file
Index: src/plugin/index-blacklist-whitelist/src/java/at/scintillation/nutch/BlacklistWhitelistParser.java
===================================================================
--- src/plugin/index-blacklist-whitelist/src/java/at/scintillation/nutch/BlacklistWhitelistParser.java	(Revision 0)
+++ src/plugin/index-blacklist-whitelist/src/java/at/scintillation/nutch/BlacklistWhitelistParser.java	(Revision 0)
@@ -0,0 +1,259 @@
+package at.scintillation.nutch;
+
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NodeWalker;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * Class to parse the content and apply a blacklist or whitelist. The content is stored in 
+ * the index in the field "strippedContent".<br/>
+ * If a blacklist configuration is provided, all elements plus their subelements are not included in the
+ * final content field which is indexed. If a whitelist configuration is provided, only the elements
+ * and their subelements are included in the indexed field.<br/><br/>
+ * On the basis of <a href="https://issues.apache.org/jira/browse/NUTCH-585">NUTCH-585</a>
+ * 
+ * @author Elisabeth Adler
+ */
+public class BlacklistWhitelistParser implements HtmlParseFilter
+{
+
+    public static final Log LOG = LogFactory.getLog("at.scintillation.nutch");
+
+    private Configuration conf;
+    /** Lower-cased, sorted entries of "parser.html.blacklist"; null if not configured. */
+    private String[] blacklist;
+    /** Lower-cased, sorted entries of "parser.html.whitelist"; null if not configured. */
+    private String[] whitelist;
+
+    /**
+     * Stores the text that remains after applying the whitelist (preferred) or the blacklist in
+     * the content metadata as "strippedContent"; does nothing if neither list is configured.
+     */
+    @Override
+    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc)
+    {
+        boolean useWhitelist = (this.whitelist != null) && (this.whitelist.length > 0);
+        boolean useBlacklist = (this.blacklist != null) && (this.blacklist.length > 0);
+        Parse parse = parseResult.get(content.getUrl());
+        if ((parse == null) || !(useWhitelist || useBlacklist)) // nothing to filter: no null root to walk
+            return parseResult;
+
+        LOG.info(useWhitelist ? "Applying whitelist..." : "Applying blacklist...");
+        // Shallow copy to collect allowed nodes in, or deep copy to strip blacklisted nodes from.
+        DocumentFragment rootToIndex = (DocumentFragment) doc.cloneNode(!useWhitelist);
+        if (useWhitelist)
+            whitelisting(doc, rootToIndex);
+        else
+            blacklisting(rootToIndex);
+        StringBuffer strippedContent = new StringBuffer();
+        getText(strippedContent, rootToIndex); // extract text to index
+        parse.getData().getContentMeta().set("strippedContent", strippedContent.toString());
+        return parseResult;
+    }
+
+    /**
+     * Traverse through the document and empty all elements matching the given
+     * blacklist configuration: a matching node loses its value and all of its
+     * children, so that no text below it is left. The children of nodes which
+     * do not match are checked recursively.
+     * @param pNode Root node
+     */
+    private void blacklisting(Node pNode)
+    {
+        boolean inList = isListed(pNode, this.blacklist);
+        if (LOG.isTraceEnabled())
+            LOG.trace("In blacklist: " + inList + " (" + describe(pNode) + ")");
+
+        if (inList)
+        {
+            // can't remove this node (the child list of its parent is being iterated),
+            // but we can strip it: drop its value and all of its children
+            if (LOG.isTraceEnabled())
+                LOG.trace("Removing " + describe(pNode));
+            pNode.setNodeValue("");
+            while (pNode.hasChildNodes())
+                pNode.removeChild(pNode.getFirstChild());
+            return;
+        }
+
+        // process the children recursively
+        NodeList children = pNode.getChildNodes();
+        if (children != null)
+        {
+            int len = children.getLength();
+            for (int i = 0; i < len; i++)
+                blacklisting(children.item(i));
+        }
+    }
+
+    /**
+     * Traverse through the document and copy all elements matching the given
+     * whitelist configuration to the new node parameter, which will then only
+     * contain all allowed nodes including all their children. A matching node is
+     * copied as a whole; its children are not checked again.
+     * @param pNode Root node
+     * @param newNode node containing only the allowed elements
+     */
+    private void whitelisting(Node pNode, Node newNode)
+    {
+        boolean inList = isListed(pNode, this.whitelist);
+        if (LOG.isTraceEnabled())
+            LOG.trace("In whitelist: " + inList + " (" + describe(pNode) + ")");
+
+        if (inList)
+        {
+            if (LOG.isTraceEnabled())
+                LOG.trace("Using " + describe(pNode));
+            newNode.appendChild(pNode.cloneNode(true));
+            return;
+        }
+
+        // process the children recursively
+        NodeList children = pNode.getChildNodes();
+        if (children != null)
+        {
+            int len = children.getLength();
+            for (int i = 0; i < len; i++)
+                whitelisting(children.item(i), newNode);
+        }
+    }
+
+    /**
+     * Check whether a node is selected by a list of "type", "type#id" or "type.class"
+     * entries. A class entry matches if it equals the complete class attribute or any
+     * single whitespace-separated class name in it, so "div.post" also selects
+     * &lt;div class="post featured"&gt;.
+     * @param node node to check
+     * @param sortedList lower-case entries, sorted as required by binary search
+     * @return true if the node is selected by one of the entries
+     */
+    private static boolean isListed(Node node, String[] sortedList)
+    {
+        String type = node.getNodeName().toLowerCase();
+        if (Arrays.binarySearch(sortedList, type) >= 0)
+            return true;
+        if (!node.hasAttributes())
+            return false;
+
+        Node idAttr = node.getAttributes().getNamedItem("id");
+        if (idAttr != null
+            && Arrays.binarySearch(sortedList, type + "#" + idAttr.getNodeValue().toLowerCase()) >= 0)
+            return true;
+
+        Node classAttr = node.getAttributes().getNamedItem("class");
+        if (classAttr == null)
+            return false;
+        String classes = classAttr.getNodeValue().toLowerCase();
+        if (Arrays.binarySearch(sortedList, type + "." + classes) >= 0) // whole attribute value
+            return true;
+        for (String className : classes.trim().split("\\s+"))
+        {
+            if (Arrays.binarySearch(sortedList, type + "." + className) >= 0)
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Build a short "type#id.class" label of a node for the trace messages.
+     * @param node node to describe
+     * @return the node name followed by its id and class attributes, if present
+     */
+    private static String describe(Node node)
+    {
+        StringBuilder label = new StringBuilder(node.getNodeName());
+        if (node.hasAttributes())
+        {
+            Node idAttr = node.getAttributes().getNamedItem("id");
+            if (idAttr != null)
+                label.append('#').append(idAttr.getNodeValue());
+            Node classAttr = node.getAttributes().getNamedItem("class");
+            if (classAttr != null)
+                label.append('.').append(classAttr.getNodeValue());
+        }
+        return label.toString();
+    }
+
+    /**
+     * Collect the text of all text nodes below the given node in the buffer. Whitespace runs
+     * are collapsed to one blank and the pieces are joined by single blanks; the children of
+     * script and style nodes and of comment nodes are skipped.
+     * Copied from {@link org.apache.nutch.parse.html.DOMContentUtils}.
+     * @param sb buffer the text is appended to
+     * @param node root of the tree to walk
+     * @return always false
+     */
+    private boolean getText(StringBuffer sb, Node node)
+    {
+        NodeWalker nodes = new NodeWalker(node);
+        while (nodes.hasNext())
+        {
+            Node current = nodes.nextNode();
+            String name = current.getNodeName();
+            short kind = current.getNodeType();
+
+            boolean ignored = "script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name)
+                || (kind == Node.COMMENT_NODE);
+            if (ignored)
+                nodes.skipChildren();
+            if (kind != Node.TEXT_NODE)
+                continue;
+
+            // cleanup and trim the value
+            String cleaned = current.getNodeValue().replaceAll("\\s+", " ").trim();
+            if (cleaned.length() == 0)
+                continue;
+            if (sb.length() > 0)
+            {
+                sb.append(' ');
+            }
+            sb.append(cleaned);
+        }
+        return false;
+    }
+
+    /**
+     * Store the configuration and read the lists "parser.html.blacklist" and
+     * "parser.html.whitelist" from it. A list that is not set or blank becomes null.
+     * @param conf configuration to read the lists from
+     */
+    public void setConf(Configuration conf)
+    {
+        this.conf = conf;
+        this.blacklist = readList("parser.html.blacklist", "to ignore elements");
+        this.whitelist = readList("parser.html.whitelist", "to only use elements");
+    }
+
+    /**
+     * Read a comma-separated list property: lower-cased so that there's no case problems, with
+     * whitespace around the entries dropped so that "div#a, div#b" matches like "div#a,div#b".
+     */
+    private String[] readList(String property, String purpose)
+    {
+        String value = getConf().get(property, null);
+        if ((value == null) || (value.trim().length() == 0))
+            return null;
+        value = value.trim().toLowerCase();
+        LOG.info("Configured using [" + property + "] " + purpose + " [" + value + "]...");
+        String[] entries = value.split("\\s*,\\s*");
+        Arrays.sort(entries); // required for binary search
+        return entries;
+    }
+
+    /** Return the configuration last handed to {@link #setConf(Configuration)}. */
+    public Configuration getConf()
+    {
+        return this.conf;
+    }
+}
Index: src/plugin/index-blacklist-whitelist/README.txt
===================================================================
--- src/plugin/index-blacklist-whitelist/README.txt	(Revision 0)
+++ src/plugin/index-blacklist-whitelist/README.txt	(Revision 0)
@@ -0,0 +1,57 @@
+index-blacklist-whitelist plugin
+---------------------------------
+
+The index-blacklist-whitelist plugin takes a list of content elements as parameters to define which parts of the page will be indexed.
+
+1. Enable the blacklist/whitelist plugin:
+Edit your nutch-site.xml file and add "|index-blacklist-whitelist" to the property "plugin.includes".
+Example:
+<property>
+  <name>plugin.includes</name>
+  <value>protocol-http|urlfilter-regex|parse-html|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)|index-blacklist-whitelist</value>
+</property>
+
+2. Define the blacklist or whitelist:
+To define a blacklist or whitelist, specify either a property 'parser.html.blacklist' or 'parser.html.whitelist' in your nutch-site.xml file.
+In the list you can provide the type of the element and optionally the ID or class of the element.
+Depending on which configuration is available, the corresponding list will be used. If both configurations are 
+available, only the whitelist is used. 
+Examples: to define a blacklist for header and footer or a whitelist for the div 'post', use:
+<property>
+  <name>parser.html.blacklist</name>
+  <value>div#header,div#footer</value>
+  <description>
+	A comma-delimited list of css like tags to identify the elements which should
+	NOT be parsed. Use this to tell the HTML parser to ignore the given elements, e.g. site navigation.
+	It is allowed to only specify the element type (required), and optional its class name ('.')
+	or ID ('#'). More complex expressions will not be parsed.
+	Valid examples: div.header,span,p#test,div#main,ul,div.footercol
+	Invalid expressions: div#head#part1,#footer,.inner#post
+	Note that the elements and their children will be silently ignored by the parser,
+	so verify the indexed content with Luke to confirm results.
+	Use either 'parser.html.blacklist' or 'parser.html.whitelist', but not both of them at once. If so,
+	only the whitelist is used.
+  </description>
+</property>
+
+<property>
+  <name>parser.html.whitelist</name>
+  <value>div.post</value>
+  <description>
+	A comma-delimited list of css like tags to identify the elements which should
+	be parsed. Use this to tell the HTML parser to only use the given elements, e.g. content.
+	It is allowed to only specify the element type (required), and optional its class name ('.')
+	or ID ('#'). More complex expressions will not be parsed.
+	Valid examples: div.header,span,p#test
+	Invalid expressions: div#head#part1,#footer,.inner#post
+	Note that all elements which are not listed here will be silently ignored by the parser,
+	so verify the indexed content with Luke to confirm results.
+	Use either 'parser.html.blacklist' or 'parser.html.whitelist', but not both of them at once. If so,
+	only the whitelist is used.
+  </description>
+</property>
+
+3. Add the field holding the filtered content to the Solr schema so that it is indexed:
+In order to have the stripped content indexed by Solr, edit your Solr schema.xml and include the following line:
+<!-- fields for the blacklist/whitelist plugin -->
+<field name="strippedContent" type="text" stored="true" indexed="true"/>
Index: src/plugin/index-blacklist-whitelist/build.xml
===================================================================
--- src/plugin/index-blacklist-whitelist/build.xml	(Revision 0)
+++ src/plugin/index-blacklist-whitelist/build.xml	(Revision 0)
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-blacklist-whitelist" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
Index: src/plugin/index-blacklist-whitelist/ivy.xml
===================================================================
--- src/plugin/index-blacklist-whitelist/ivy.xml	(Revision 0)
+++ src/plugin/index-blacklist-whitelist/ivy.xml	(Revision 0)
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+      <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
Index: src/plugin/index-blacklist-whitelist/plugin.xml
===================================================================
--- src/plugin/index-blacklist-whitelist/plugin.xml	(Revision 0)
+++ src/plugin/index-blacklist-whitelist/plugin.xml	(Revision 0)
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-blacklist-whitelist"
+   name="Blacklist and Whitelist Parser and Indexer"
+   version="1.0.0"
+   provider-name="scintillation.at">
+
+   <runtime>
+      <library name="index-blacklist-whitelist.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="at.scintillation.nutch.BlacklistWhitelistIndexer"
+              name="Nutch Blacklist and Whitelist Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="BlacklistWhitelistIndexer"
+                      class="at.scintillation.nutch.BlacklistWhitelistIndexer"/>
+   </extension>
+   
+	<extension id="at.scintillation.nutch.BlacklistWhitelistParser"
+              name="Nutch Blacklist and Whitelist Parsing Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="BlacklistWhitelistParser"
+                      class="at.scintillation.nutch.BlacklistWhitelistParser"/>
+   </extension>
+
+</plugin>
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(Revision 603)
+++ src/plugin/build.xml	(Arbeitskopie)
@@ -62,6 +62,7 @@
      <ant dir="urlnormalizer-basic" target="deploy"/>
      <ant dir="urlnormalizer-pass" target="deploy"/>
      <ant dir="urlnormalizer-regex" target="deploy"/>
+  	 <ant dir="index-blacklist-whitelist" target="deploy"/>
   </target>
 
   <!-- ====================================================== -->
