Index: src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
===================================================================
--- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java	(revision 0)
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java	(revision 0)
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.charset.Charset;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import junit.framework.TestCase;
+
+/**
+ * Checks that {@link HtmlParser} decodes pages correctly when the character
+ * encoding is declared by meta http-equiv, HTML5 meta charset, or a BOM.
+ */
+public class TestHtmlParser extends TestCase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
+
+  /** Non-ASCII sample words; used as title, meta keywords and body text. */
+  private static final String encodingTestKeywords =
+      "français, español, русский язык, čeština, ελληνικά";
+  private static final String encodingTestBody =
+      "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
+  /** Shared tail of every test page: rest of the head, body, closing tags. */
+  private static final String encodingTestContent =
+      "<title>" + encodingTestKeywords + "</title>\n"
+          + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+          + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  /** Test pages as { case name, charset used to encode the page, page start }. */
+  private static final String[][] encodingTestPages = {
+    { "HTML4, utf-8, meta http-equiv, no quotes",
+      "utf-8",
+      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+          + "<html>\n<head>\n"
+          + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+          + encodingTestContent
+    },
+    { "HTML4, utf-8, meta http-equiv, single quotes",
+      "utf-8",
+      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+          + "<html>\n<head>\n"
+          + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+          + encodingTestContent
+    },
+    { "XHTML, utf-8, meta http-equiv, double quotes",
+      "utf-8",
+      "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n"
+          + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+          + encodingTestContent
+    },
+    { "HTML5, utf-8, meta charset",
+      "utf-8",
+      "<!DOCTYPE html>\n<html>\n<head>\n"
+          + "<meta charset=\"utf-8\">"
+          + encodingTestContent
+    },
+    { "HTML5, utf-8, BOM",
+      "utf-8",
+      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent
+    },
+    { "HTML5, utf-16, BOM",
+      // not "utf-16": Java's UTF-16 encoder writes its own BOM, doubling U+FEFF
+      "utf-16be",
+      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent
+    }
+  };
+
+  private Configuration conf;
+  private Parser parser;
+
+  /** Sets up an {@link HtmlParser} configured via NutchConfiguration.create(). */
+  public TestHtmlParser(String name) {
+    super(name);
+    conf = NutchConfiguration.create();
+    parser = new HtmlParser();
+    parser.setConf(conf);
+  }
+
+  /** Parses the raw bytes as a text/html document fetched from a dummy URL. */
+  protected Parse parse(byte[] contentBytes) {
+    String dummyUrl = "http://dummy.url/";
+    return parser.getParse(
+        new Content(dummyUrl, dummyUrl, contentBytes, "text/html", new Metadata(),
+            conf)).get(dummyUrl);
+  }
+
+  /** Encodes each test page, parses it and checks title, text and keywords. */
+  public void testEncodingDetection() {
+    for (String[] testPage : encodingTestPages) {
+      String name = testPage[0];
+      Charset charset = Charset.forName(testPage[1]);
+      byte[] contentBytes = testPage[2].getBytes(charset);
+      Parse parse = parse(contentBytes);
+      String text = parse.getText();
+      String title = parse.getData().getTitle();
+      String keywords = parse.getData().getMeta("keywords");
+      LOG.info(name);
+      LOG.info("title:\t{}", title);
+      LOG.info("keywords:\t{}", keywords);
+      LOG.info("text:\t{}", text);
+      assertEquals("Title not extracted properly (" + name + ")", encodingTestKeywords, title);
+      for (String keyword : encodingTestKeywords.split(",\\s*")) {
+        assertTrue(keyword + " not found in text (" + name + ")", text.contains(keyword));
+      }
+      // keywords are compared only if the parse exposes them (value may be null)
+      if (keywords != null) {
+        assertEquals("Keywords not extracted properly (" + name + ")",
+            encodingTestKeywords, keywords);
+      }
+    }
+  }
+}
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(revision 1577224)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java	(working copy)
@@ -56,6 +56,9 @@
   private static Pattern charsetPattern =
     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                     Pattern.CASE_INSENSITIVE);
+  private static Pattern charsetPatternHTML5 =
+      Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+                      Pattern.CASE_INSENSITIVE);
   
   private String parserImpl;
 
@@ -64,13 +67,13 @@
    * <em>unknown</em> encoding,  read out 'charset' parameter in the meta tag   
    * from the first <code>CHUNK_SIZE</code> bytes.
    * If there's no meta tag for Content-Type or no charset is specified,
+   * an HTML5 <code>&lt;meta charset&gt;</code> tag is tried, then a Unicode Byte
+   * Order Mark (BOM); the BOM check also covers UTF-16 (but not UTF-32).
+   * If no character set can be determined,
    * <code>null</code> is returned.  <br />
-   * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
-   * can't be handled with this. 
-   * We need to do something similar to what's done by mozilla
-   * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
-   * See also http://www.w3.org/TR/REC-xml/#sec-guessing
-   * <br />
+   * See also http://www.w3.org/International/questions/qa-html-encoding-declarations,
+   * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+   * http://www.w3.org/TR/REC-xml/#sec-guessing
    *
    * @param content <code>byte[]</code> representation of an html file
    */
@@ -99,6 +102,30 @@
       if (charsetMatcher.find()) 
         encoding = new String(charsetMatcher.group(1));
     }
+    if (encoding == null) {
+      // check for HTML5 meta charset
+      metaMatcher = charsetPatternHTML5.matcher(str);
+      if (metaMatcher.find()) {
+        encoding = new String(metaMatcher.group(1));
+      }
+    }
+    if (encoding == null) {
+      // check for BOM
+      if (content.length >= 3
+          && content[0] == (byte) 0xEF
+          && content[1] == (byte) 0xBB
+          && content[2] == (byte) 0xBF) {
+        encoding = "UTF-8";
+      } else if (content.length >= 2) {
+        if (content[0] == (byte)0xFF
+            && content[1] == (byte)0xFE) {
+          encoding = "UTF-16LE";
+        } else if (content[0] == (byte)0xFE
+            && content[1] == (byte)0xFF) {
+          encoding = "UTF-16BE";
+        }
+      }
+    }
 
     return encoding;
   }
