Nutch includes a modified version of the TagSoup library. For more information, refer to https://issues.apache.org/jira/browse/NUTCH-567 . The patch below was made against TagSoup 1.1.3. diff -u -b -r -N -x .classpath -x .project /cygdrive/d/_ftp/downloads/tagsoup-1.1.3/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java ./src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java --- /cygdrive/d/_ftp/downloads/tagsoup-1.1.3/src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java 2007-05-10 10:18:08.000000000 +0200 +++ ./src/templates/org/ccil/cowan/tagsoup/HTMLScanner.java 2007-10-17 13:55:52.796875000 +0200 @@ -35,6 +35,8 @@ private int theLastColumn; private int theCurrentLine; private int theCurrentColumn; + private final StringBuilder lastOpenTag = new StringBuilder(30); + private final StringBuilder lastAttribute = new StringBuilder(30); int theState; // Current state int theNextState; // Next state @@ -156,15 +158,18 @@ break; case A_ANAME: h.aname(theOutputBuffer, 0, theSize); + lastAttribute.setLength(0); lastAttribute.append(theOutputBuffer, 0, theSize); theSize = 0; break; case A_ANAME_ADUP: h.aname(theOutputBuffer, 0, theSize); + lastAttribute.setLength(0); lastAttribute.append(theOutputBuffer, 0, theSize); theSize = 0; h.adup(theOutputBuffer, 0, theSize); break; case A_ANAME_ADUP_STAGC: h.aname(theOutputBuffer, 0, theSize); + lastAttribute.setLength(0); lastAttribute.append(theOutputBuffer, 0, theSize); theSize = 0; h.adup(theOutputBuffer, 0, theSize); h.stagc(theOutputBuffer, 0, theSize); @@ -215,7 +220,17 @@ // System.err.println("%%" + new String(theOutputBuffer, 0, theSize)); h.entity(theOutputBuffer, savedSize + 1, theSize - savedSize - 1); int ent = h.getEntity(); -// System.err.println("%% value = " + ent); + + final String lastTag = lastOpenTag.toString().toLowerCase(); + final String lastAttr = lastAttribute.toString().toLowerCase(); + if (ch != ';' && + (("a".equals(lastTag) && "href".equals(lastAttr)) + || ("img".equals(lastTag) && "src".equals(lastAttr)))) { + // 
Copy verbatim, this is not a valid entity and it's better + // not to replace it (it may be an URI parameter). + unread(r, ch); + theCurrentColumn--; + } else { if (ent != 0) { theSize = savedSize; if (ent >= 0x80 && ent <= 0x9F) { @@ -239,6 +254,7 @@ unread(r, ch); theCurrentColumn--; } + } theNextState = savedState; break; case A_ETAG: @@ -251,10 +267,16 @@ break; case A_GI: h.gi(theOutputBuffer, 0, theSize); + lastOpenTag.setLength(0); + lastOpenTag.append(theOutputBuffer, 0, theSize); + theSize = 0; break; case A_GI_STAGC: h.gi(theOutputBuffer, 0, theSize); + lastOpenTag.setLength(0); + lastOpenTag.append(theOutputBuffer, 0, theSize); + theSize = 0; h.stagc(theOutputBuffer, 0, theSize); break; diff -u -b -r -N -x .classpath -x .project /cygdrive/d/_ftp/downloads/tagsoup-1.1.3/src/test/org/ccil/cowan/tagsoup/URIParsingTest.java ./src/test/org/ccil/cowan/tagsoup/URIParsingTest.java --- /cygdrive/d/_ftp/downloads/tagsoup-1.1.3/src/test/org/ccil/cowan/tagsoup/URIParsingTest.java 1970-01-01 01:00:00.000000000 +0100 +++ ./src/test/org/ccil/cowan/tagsoup/URIParsingTest.java 2007-10-17 13:50:06.750000000 +0200 @@ -0,0 +1,44 @@ +package org.ccil.cowan.tagsoup; + +import java.io.StringReader; +import java.io.StringWriter; + +import junit.framework.TestCase; + +import org.xml.sax.InputSource; +import org.xml.sax.XMLReader; + +/** + * Test if URIs are parsed properly. + */ +public class URIParsingTest extends TestCase { + /** + * + */ + public void testUnescapedAmpersands() throws Exception { + final String [][] testCases = new String [][] { + {"", + ""}, + + {"

", + "

"}, + }; + + for (int i = 0; i < testCases.length; i++) { + final StringWriter out = new StringWriter(); + final StringReader in = new StringReader(testCases[i][0]); + + final XMLWriter h = new XMLWriter(out); + h.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes"); + + final XMLReader r = new Parser(); + r.setProperty(Parser.schemaProperty, new HTMLSchema()); + r.setFeature(Parser.namespacesFeature, false); + r.setContentHandler(h); + + r.parse(new InputSource(in)); + + assertEquals(testCases[i][1].trim(), out.toString().trim()); + } + } +}