Extend the character-encoding sniffing in HtmlParser: when the HTTP-EQUIV
Content-Type meta tag yields no charset, look for an HTML5 <meta charset=...>
declaration, then fall back to a Unicode Byte Order Mark (UTF-8, UTF-16LE,
UTF-16BE). Add TestHtmlParser, which parses the same multilingual page
declared via HTML4, XHTML and HTML5 meta tags and via BOM only.

diff --git src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index 0a7226f..5c38ee2 100644
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -75,6 +75,9 @@ public class HtmlParser implements Parser {
   private static Pattern charsetPattern =
     Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
                     Pattern.CASE_INSENSITIVE);
+  private static Pattern charsetPatternHTML5 =
+    Pattern.compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
+                    Pattern.CASE_INSENSITIVE);
 
   private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
@@ -86,15 +89,16 @@ public class HtmlParser implements Parser {
 
   /**
    * Given a <code>ByteBuffer</code> representing an html file of an
-   * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag   
+   * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
    * from the first <code>CHUNK_SIZE</code> bytes.
    * If there's no meta tag for Content-Type or no charset is specified,
+   * the content is checked for a Unicode Byte Order Mark (BOM).
+   * This will also cover non-byte oriented character encodings (UTF-16 only).
+   * If no character set can be determined,
    * <code>null</code> is returned. <br />
-   * FIXME: non-byte oriented character encodings (UTF-16, UTF-32)
-   * can't be handled with this.
-   * We need to do something similar to what's done by mozilla
-   * (http://lxr.mozilla.org/seamonkey/source/parser/htmlparser/src/nsParser.cpp#1993).
-   * See also http://www.w3.org/TR/REC-xml/#sec-guessing
+   * See also http://www.w3.org/International/questions/qa-html-encoding-declarations,
+   * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and
+   * http://www.w3.org/TR/REC-xml/#sec-guessing
    * <br />
    *
    * @param content <code>ByteBuffer</code> representation of an html file
@@ -123,6 +127,30 @@ public class HtmlParser implements Parser {
       if (charsetMatcher.find())
         encoding = new String(charsetMatcher.group(1));
     }
+    if (encoding == null) {
+      // check for HTML5 meta charset
+      metaMatcher = charsetPatternHTML5.matcher(str);
+      if (metaMatcher.find()) {
+        encoding = new String(metaMatcher.group(1));
+      }
+    }
+    if (encoding == null) {
+      // check for BOM
+      if (length >= 3
+          && content.get(0) == (byte) 0xEF
+          && content.get(1) == (byte) 0xBB
+          && content.get(2) == (byte) 0xBF) {
+        encoding = "UTF-8";
+      } else if (length >= 2) {
+        if (content.get(0) == (byte)0xFF
+            && content.get(1) == (byte)0xFE) {
+          encoding = "UTF-16LE";
+        } else if (content.get(0) == (byte)0xFE
+            && content.get(1) == (byte)0xFF) {
+          encoding = "UTF-16BE";
+        }
+      }
+    }
 
     return encoding;
   }
diff --git src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
new file mode 100644
index 0000000..c540b70
--- /dev/null
+++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -0,0 +1,157 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.junit.Before;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+public class TestHtmlParser {
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestHtmlParser.class);
+
+  private static final String encodingTestKeywords =
+      "français, español, русский язык, čeština, ελληνικά";
+  private static final String encodingTestBody =
+      "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
+  private static final String encodingTestContent =
+      "<title>" + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages= {
+    {
+      "HTML4, utf-8, meta http-equiv, no quotes",
+      "utf-8",
+      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+          + "<html>\n<head>\n"
+          + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+          + encodingTestContent
+    },
+    {
+      "HTML4, utf-8, meta http-equiv, single quotes",
+      "utf-8",
+      "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+          + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+          + "<html>\n<head>\n"
+          + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+          + encodingTestContent
+    },
+    {
+      "XHTML, utf-8, meta http-equiv, double quotes",
+      "utf-8",
+      "<?xml version=\"1.0\"?>\n"
+          + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n"
+          + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+          + encodingTestContent
+    },
+    {
+      "HTML5, utf-8, meta charset",
+      "utf-8",
+      "<!DOCTYPE html>\n<html>\n<head>\n"
+          + "<meta charset=\"utf-8\">"
+          + encodingTestContent
+    },
+    {
+      "HTML5, utf-8, BOM",
+      "utf-8",
+      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
+          + encodingTestContent
+    },
+    {
+      "HTML5, utf-16, BOM",
+      "utf-16",
+      "\ufeff<!DOCTYPE html>\n<html>\n<head>\n"
+          + encodingTestContent
+    }
+  };
+
+  private Configuration conf;
+  private Parser parser;
+
+  private static final String dummyUrl = "http://dummy.url/";
+
+
+  @Before
+  public void setup() {
+    conf = NutchConfiguration.create();
+    parser = new HtmlParser();
+    parser.setConf(conf);
+  }
+
+  protected WebPage page(byte[] contentBytes) {
+    WebPage page = new WebPage();
+    page.setBaseUrl(new Utf8(dummyUrl));
+    page.setContent(ByteBuffer.wrap(contentBytes));
+    page.setContentType(new Utf8("text/html"));
+    return page;
+  }
+
+  protected Parse parse(WebPage page) {
+    return parser.getParse(dummyUrl, page);
+  }
+
+
+  @Test
+  public void testEncodingDetection() {
+    for (String[] testPage : encodingTestPages) {
+      String name = testPage[0];
+      Charset charset = Charset.forName(testPage[1]);
+      byte[] contentBytes = testPage[2].getBytes(charset);
+      //Parse parse = parse(contentBytes);
+      WebPage page = page(contentBytes);
+      Parse parse = parse(page);
+      String text = parse.getText();
+      String title = parse.getTitle();
+      //String keywords = parse.getMeta("keywords");
+      String keywords = Bytes.toString(page
+          .getFromMetadata(new Utf8("keywords")));
+      LOG.info(name);
+      LOG.info("title:\t" + title);
+      LOG.info("keywords:\t" + keywords);
+      LOG.info("text:\t" + text);
+      assertEquals("Title not extracted properly (" + name + ")",
+          encodingTestKeywords, title);
+      for (String keyword : encodingTestKeywords.split(",\\s*")) {
+        assertTrue(keyword + " not found in text (" + name + ")",
+            text.contains(keyword));
+      }
+      if (keywords != null) {
+        assertEquals("Keywords not extracted properly (" + name + ")",
+            encodingTestKeywords, keywords);
+      }
+    }
+  }
+
+}