Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (revision 511238)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (working copy)
@@ -23,7 +23,10 @@
import java.io.*;
import java.util.regex.*;
-import org.cyberneko.html.parsers.*;
+import org.apache.xerces.xni.parser.XMLDocumentFilter;
+import org.cyberneko.html.filters.ElementRemover;
+import org.cyberneko.html.parsers.DOMFragmentParser;
+
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.*;
@@ -253,6 +256,64 @@
true);
parser.setFeature("http://cyberneko.org/html/features/report-errors",
true);
+ // Build a NekoHTML ElementRemover and register it (below) in the parser's "filters" property.
+ ElementRemover remover = new ElementRemover();
+ remover.acceptElement("a", new String[] { "href", "target", "name", "rel", "title" });
+ remover.acceptElement("b", null); // NOTE(review): null presumably means "keep no attributes" — confirm
+ remover.acceptElement("big", null);
+ remover.acceptElement("blockquote", null);
+ remover.acceptElement("br", null);
+ remover.acceptElement("caption", null);
+ remover.acceptElement("center", null);
+ remover.acceptElement("code", null);
+ remover.acceptElement("div", null);
+ remover.acceptElement("em", null);
+ remover.acceptElement("h1", null);
+ remover.acceptElement("h2", null);
+ remover.acceptElement("h3", null);
+ remover.acceptElement("h4", null);
+ remover.acceptElement("h5", null);
+ remover.acceptElement("h6", null);
+ remover.acceptElement("hr", null);
+ remover.acceptElement("i", null);
+ remover.acceptElement("li", null);
+ remover.acceptElement("ol", null);
+ remover.acceptElement("p", null);
+ remover.acceptElement("pre", null);
+ remover.acceptElement("s", null);
+ remover.acceptElement("small", null);
+ remover.acceptElement("strike", null);
+ remover.acceptElement("strong", null);
+ remover.acceptElement("sub", null);
+ remover.acceptElement("sup", null);
+ remover.acceptElement("table", null);
+ remover.acceptElement("td", null);
+ remover.acceptElement("th", null);
+ remover.acceptElement("tr", null);
+ remover.acceptElement("tt", null);
+ remover.acceptElement("u", null);
+ remover.acceptElement("ul", null);
+ remover.acceptElement("var", null);
+ remover.acceptElement("link", null); // NOTE(review): with null, href is presumably dropped — confirm this is intended
+ remover.acceptElement("head", null);
+ // Elements passed to removeElement(); NOTE(review): presumably dropped along with their content — confirm.
+ remover.removeElement("select");
+ remover.removeElement("map");
+ remover.removeElement("img");
+ remover.removeElement("input");
+ remover.removeElement("form");
+ remover.removeElement("area");
+ remover.removeElement("option");
+ remover.removeElement("title"); // NOTE(review): confirm no later step still needs <title>
+ remover.removeElement("textarea");
+ remover.removeElement("style");
+ remover.removeElement("meta"); // NOTE(review): confirm no later step still needs <meta>
+ remover.removeElement("script");
+ remover.removeElement("noscript");
+ // Install the remover as the only entry in the parser's filters array.
+ XMLDocumentFilter[] filter = new XMLDocumentFilter[] { remover };
+ parser.setProperty("http://cyberneko.org/html/properties/filters", filter);
+
} catch (SAXException e) {}
// convert Document to DocumentFragment
HTMLDocumentImpl doc = new HTMLDocumentImpl();