Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (revision 511238)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (working copy)
@@ -241,19 +241,18 @@
private DocumentFragment parseNeko(InputSource input) throws Exception {
DOMFragmentParser parser = new DOMFragmentParser();
- // some plugins, e.g., creativecommons, need to examine html comments
try {
- parser.setFeature("http://apache.org/xml/features/include-comments",
- true);
- parser.setFeature("http://apache.org/xml/features/augmentations",
- true);
+ parser.setFeature("http://cyberneko.org/html/features/augmentations", true);
parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content",
false);
parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment",
true);
parser.setFeature("http://cyberneko.org/html/features/report-errors",
- true);
- } catch (SAXException e) {}
+ LOG.isTraceEnabled());
+ parser.setProperty("http://cyberneko.org/html/properties/default-encoding", defaultCharEncoding);
+ } catch (SAXException e) {
+ LOG.trace(e);
+ }
// convert Document to DocumentFragment
HTMLDocumentImpl doc = new HTMLDocumentImpl();
doc.setErrorChecking(false);