Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (revision 1685992)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (working copy)
@@ -21,7 +21,7 @@
import java.util.Map;
import java.net.URL;
import java.net.MalformedURLException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.io.*;
import java.util.regex.*;
@@ -30,10 +30,8 @@
import org.xml.sax.SAXException;
import org.w3c.dom.*;
import org.apache.html.dom.*;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
@@ -48,7 +46,8 @@
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
- private static final int CHUNK_SIZE = 2000;
+ // NUTCH-2042 (cf. TIKA-357): increased from 2000 to 8192 bytes (8 KiB)
+ private static final int CHUNK_SIZE = 8192;
// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
@@ -87,12 +86,7 @@
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
// {U+0041, U+0082, U+00B7}.
String str = "";
- try {
- str = new String(content, 0, length, Charset.forName("ASCII").toString());
- } catch (UnsupportedEncodingException e) {
- // code should never come here, but just in case...
- return null;
- }
+ str = new String(content, 0, length, StandardCharsets.US_ASCII);
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;