diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 66100f1..4baa359 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -50,8 +50,9 @@ public class HtmlParser implements Parser { "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" + "([^'\\\"]+)['\\\"]\\s*/>"); - private static final Pattern CONTENT_TYPE_PATTERN = - Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)"); + // TIKA-350: handle charset as first element in content-type + private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile( + "(?i)(?:;|)\\s*charset\\s*=\\s*([^\r;\\s]*)"); /** * TIKA-332: Check for meta http-equiv tag with charset info in @@ -86,7 +87,8 @@ public class HtmlParser implements Parser { } } - // No charset in a meta http-equiv tag, so detect from actual content bytes. + // No charset in a meta http-equiv tag; check whether it is in the passed content-encoding + // hint or the passed content-type hint. 
CharsetDetector detector = new CharsetDetector(); String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); if (incomingCharset == null) { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index da90393..18bc5af 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -336,5 +336,28 @@ public class HtmlParserTest extends TestCase { assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } + /** + * Test case for TIKA-350 + * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a> + */ + public void testUsingFunkyCharsetInContentTypeHeader() throws Exception { + final String test = + "the name is \u00e1ndre" + + ""; + + Metadata metadata = new Metadata(); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html"); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + }