Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(revision 1685992)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(working copy)
@@ -21,7 +21,7 @@
 import java.util.Map;
 import java.net.URL;
 import java.net.MalformedURLException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.io.*;
 import java.util.regex.*;
 
@@ -30,10 +30,8 @@
 import org.xml.sax.SAXException;
 import org.w3c.dom.*;
 import org.apache.html.dom.*;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
@@ -48,7 +46,8 @@
   // I used 1000 bytes at first, but found that some documents have
   // meta tag well past the first 1000 bytes.
   // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
-  private static final int CHUNK_SIZE = 2000;
+  // NUTCH-2042 (cf. TIKA-357): increased from 2000 to 8192 bytes (8 kB)
+  private static final int CHUNK_SIZE = 8192;
 
   // NUTCH-1006 Meta equiv with single quotes not accepted
   private static Pattern metaPattern = Pattern.compile(
@@ -87,12 +86,7 @@
     // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
     // {U+0041, U+0082, U+00B7}.
     String str = "";
-    try {
-      str = new String(content, 0, length, Charset.forName("ASCII").toString());
-    } catch (UnsupportedEncodingException e) {
-      // code should never come here, but just in case...
-      return null;
-    }
+    str = new String(content, 0, length, StandardCharsets.US_ASCII);
 
     Matcher metaMatcher = metaPattern.matcher(str);
     String encoding = null;
