Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (revision 1686242)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (working copy)
@@ -27,6 +27,7 @@
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -67,7 +68,8 @@
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
- private static final int CHUNK_SIZE = 2000;
+ // NUTCH-2042 (cf. TIKA-357): increased to 8 KiB (8192 bytes)
+ private static final int CHUNK_SIZE = 8192;
// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
@@ -111,14 +113,8 @@
// to just inflate each byte to a 16-bit value by padding.
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
// {U+0041, U+0082, U+00B7}.
- String str = "";
- try {
- str = new String(content.array(), content.arrayOffset()
- + content.position(), length, Charset.forName("ASCII").toString());
- } catch (UnsupportedEncodingException e) {
- // code should never come here, but just in case...
- return null;
- }
+ String str = new String(content.array(), content.arrayOffset()
+ + content.position(), length, StandardCharsets.US_ASCII);
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;