diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 66100f1..4baa359 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -50,8 +50,9 @@ public class HtmlParser implements Parser { "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" + "([^'\\\"]+)['\\\"]\\s*/>"); - private static final Pattern CONTENT_TYPE_PATTERN = - Pattern.compile("(?i);\\s*charset\\s*=\\s*(.*)"); + // TIKA-350: handle charset as first element in content-type + private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile( + "(?i)(?:;|)\\s*charset\\s*=\\s*([^\r;\\s]*)"); /** * TIKA-332: Check for meta http-equiv tag with charset info in @@ -86,7 +87,8 @@ public class HtmlParser implements Parser { } } - // No charset in a meta http-equiv tag, so detect from actual content bytes. + // No charset in a meta http-equiv tag, see if it's in the passed content-encoding + // hint, or the passed content-type hint. CharsetDetector detector = new CharsetDetector(); String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING); if (incomingCharset == null) { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index da90393..18bc5af 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -336,5 +336,28 @@ public class HtmlParserTest extends TestCase { assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } + /** + * Test case for TIKA-350 + * @see TIKA-350 + */ + public void testUsingFunkyCharsetInContentTypeHeader() throws Exception { + final String test = + "