diff --git src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index 0a7226f..5c38ee2 100644
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -75,6 +75,9 @@ public class HtmlParser implements Parser {
private static Pattern charsetPattern =
Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
Pattern.CASE_INSENSITIVE);
+ private static Pattern charsetPatternHTML5 = // HTML5 <meta charset="..."> declaration; group(1) is the charset name
+ Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+ Pattern.CASE_INSENSITIVE);
private static Collection FIELDS = new HashSet();
@@ -86,15 +89,16 @@ public class HtmlParser implements Parser {
/**
* Given a <code>ByteBuffer</code> representing an html file of an
- * unknown encoding, read out 'charset' parameter in the meta tag
+ * unknown encoding, read out 'charset' parameter in the meta tag
* from the first <code>CHUNK_SIZE</code> bytes.
* If there's no meta tag for Content-Type or no charset is specified,
+ * the content is checked for a Unicode Byte Order Mark (BOM).
+ * This will also cover non-byte oriented character encodings (UTF-16 only).
+ * If no character set can be determined,
* <code>null</code> is returned.
- * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
- * can't be handled with this.
- * We need to do something similar to what's done by mozilla
- * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
- * See also http://www.w3.org/TR/REC-xml/#sec-guessing
+ * See also http://www.w3.org/International/questions/qa-html-encoding-declarations,
+ * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+ * http://www.w3.org/TR/REC-xml/#sec-guessing
*
*
* @param content <code>ByteBuffer</code> representation of an html file
@@ -123,6 +127,30 @@ public class HtmlParser implements Parser {
if (charsetMatcher.find())
encoding = new String(charsetMatcher.group(1));
}
+ if (encoding == null) {
+ // check for HTML5 meta charset
+ metaMatcher = charsetPatternHTML5.matcher(str);
+ if (metaMatcher.find()) {
+ encoding = new String(metaMatcher.group(1));
+ }
+ }
+ if (encoding == null) {
+ // check for BOM
+ if (length >= 3
+ && content.get(0) == (byte) 0xEF
+ && content.get(1) == (byte) 0xBB
+ && content.get(2) == (byte) 0xBF) {
+ encoding = "UTF-8";
+ } else if (length >= 2) {
+ if (content.get(0) == (byte)0xFF
+ && content.get(1) == (byte)0xFE) {
+ encoding = "UTF-16LE";
+ } else if (content.get(0) == (byte)0xFE
+ && content.get(1) == (byte)0xFF) {
+ encoding = "UTF-16BE";
+ }
+ }
+ }
return encoding;
}
diff --git src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
new file mode 100644
index 0000000..c540b70
--- /dev/null
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class TestHtmlParser {
+
+ public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
+
+ private static final String encodingTestKeywords =
+ "français, español, русский язык, čeština, ελληνικά";
+ private static final String encodingTestBody =
+ "\n - français\n
- español\n
- русский язык\n
- čeština\n
- ελληνικά\n
";
+ private static final String encodingTestContent =
+ "" + encodingTestKeywords + "\n"
+ + "\n"
+ + "\n" + encodingTestBody + "\n