// Patch notes: free text placed ahead of the first "diff --git" header.
//
// What the visible hunks do: add a test named testUsingCharsetInContentTypeHeader
// in two places. Each copy parses the UTF-8 bytes of the same short test string
// (it contains one non-ASCII letter, a-acute) twice:
//   1. with a fresh, empty Metadata, then asserts that
//      Metadata.CONTENT_ENCODING is "UTF-8";
//   2. with Metadata.CONTENT_TYPE preset to "text/html; charset=ISO-8859-1",
//      then asserts that Metadata.CONTENT_ENCODING is "ISO-8859-1".
// The first copy calls new HtmlParser().parse(...). The second copy, added to
// TXTParserTest, calls parse(...) on a `parser` that is declared outside this
// hunk. The TXTParserTest Javadoc ties the test to TIKA-341.
//
// NOTE(review): this copy of the patch has lost its line breaks; the hunk lines
//   are joined by single spaces, so it will not apply as-is.
// NOTE(review): text appears to be missing between `.compile("(?is)` and
//   `TIKA-XXX`. The remainder of the HtmlParser.java hunk and the header of the
//   file that receives the first test are not present. Presumably stripped
//   during extraction; regenerate the patch from version control. TODO confirm.
// NOTE(review): the TXTParserTest hunk header announces 6 old / 33 new lines and
//   27 `+` markers are visible, which is consistent if every marker started its
//   own line; that hunk looks complete apart from the collapsed whitespace.
// NOTE(review): `@see TIKA-341` carries no link target; the markup may have been
//   lost the same way. Confirm against the original commit.
// NOTE(review): the TXTParserTest copy presets an HTML content type
//   ("text/html; ..."); confirm this is intended rather than "text/plain".
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 7656367..22cc3fd 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -47,7 +47,8 @@ public class HtmlParser implements Parser { private static final int META_TAG_BUFFER_SIZE = 4096; private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern .compile("(?is)TIKA-XXX + */ + public void testUsingCharsetInContentTypeHeader() throws Exception { + final String test = "the name is \u00e1ndre"; + + Metadata metadata = new Metadata(); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java index 3bdd351..c39ce2d 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java @@ -140,6 +140,33 @@ public class TXTParserTest extends TestCase { assertEquals("en", metadata.get(Metadata.LANGUAGE)); } + /** + * Test case for TIKA-341: using charset in content-type + * + * @see TIKA-341 + */ + public void testUsingCharsetInContentTypeHeader() throws Exception { + // Could be UTF-8 or ISO 8859-1 or ... 
+ // u00e1 is latin small letter a with acute + final String test2 = "the name is \u00e1ndre"; + + Metadata metadata = new Metadata(); + parser.parse( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); + + metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); + parser.parse( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + + private void assertExtractText(String msg, String expected, byte[] input) throws Exception { ContentHandler handler = new BodyContentHandler() {