diff --git src/plugin/lib-nekohtml/ivy.xml src/plugin/lib-nekohtml/ivy.xml
index c35544c..ed70b80 100644
--- src/plugin/lib-nekohtml/ivy.xml
+++ src/plugin/lib-nekohtml/ivy.xml
@@ -36,7 +36,7 @@
-
+
diff --git src/plugin/lib-nekohtml/plugin.xml src/plugin/lib-nekohtml/plugin.xml
index e650bc8..513c9a7 100644
--- src/plugin/lib-nekohtml/plugin.xml
+++ src/plugin/lib-nekohtml/plugin.xml
@@ -16,20 +16,21 @@
limitations under the License.
-->
+ version="1.9.19"
+ provider-name="net.sourceforge.nekohtml">
-
+
diff --git src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index f3ddf05..d05f5e0 100644
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -213,6 +213,7 @@ public class HtmlParser implements Parser {
}
private DocumentFragment parseTagSoup(InputSource input) throws Exception {
+ LOG.debug("Using TagSoup to parse the InputSource");
HTMLDocumentImpl doc = new HTMLDocumentImpl();
DocumentFragment frag = doc.createDocumentFragment();
DOMBuilder builder = new DOMBuilder(doc, frag);
@@ -226,8 +227,11 @@ public class HtmlParser implements Parser {
}
private DocumentFragment parseNeko(InputSource input) throws Exception {
+ LOG.debug("Using NekoHTML to parse the InputSource");
DOMFragmentParser parser = new DOMFragmentParser();
try {
+ parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
parser.setFeature("http://cyberneko.org/html/features/augmentations",
true);
parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
diff --git src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
index 97f8d19..ef3c49a 100644
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -233,6 +233,11 @@ public class TestDOMContentUtils extends TestCase {
conf.setBoolean("parser.html.form.use_action", true);
utils = new DOMContentUtils(conf);
DOMFragmentParser parser= new DOMFragmentParser();
+ try {
+ parser.setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
+ } catch (SAXException e) {}
for (int i= 0; i < testPages.length; i++) {
DocumentFragment node=
new HTMLDocumentImpl().createDocumentFragment();
diff --git src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
index 482a396..373a1e4 100644
--- src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
+++ src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
@@ -180,6 +180,9 @@ public class TestDOMContentUtils extends TestCase {
conf.setBoolean("parser.html.form.use_action", true);
utils = new DOMContentUtils(conf);
DOMFragmentParser parser = new DOMFragmentParser();
+ parser.setFeature(
+ "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
+ true);
for (int i = 0; i < testPages.length; i++) {
DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
try {