diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index 7656367..22cc3fd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -47,7 +47,8 @@ public class HtmlParser implements Parser {
private static final int META_TAG_BUFFER_SIZE = 4096;
private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern
.compile("(?is)TIKA-XXX
+ */
+ public void testUsingCharsetInContentTypeHeader() throws Exception {
+ final String test = "
the name is \u00e1ndre";
+ // No charset declared in the metadata: UTF-8 is expected to be reported.
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ // Content-Type declares ISO-8859-1: that charset is expected to be reported, although the bytes are UTF-8.
+ metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 3bdd351..c39ce2d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -140,6 +140,33 @@ public class TXTParserTest extends TestCase {
assertEquals("en", metadata.get(Metadata.LANGUAGE));
}
+ /**
+ * Test case for TIKA-341: a charset declared in the Content-Type metadata is reported as the content encoding.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
+ */
+ public void testUsingCharsetInContentTypeHeader() throws Exception {
+ // The bytes of this text could be read as UTF-8 or ISO 8859-1 or ...
+ // u00e1 is LATIN SMALL LETTER A WITH ACUTE, the only non-ASCII character in it.
+ final String test2 = "the name is \u00e1ndre";
+ // No charset declared in the metadata: UTF-8 is expected to be reported.
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ // Content-Type declares ISO-8859-1: that charset is expected to be reported, although the bytes are UTF-8.
+ metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+ parser.parse(
+ new ByteArrayInputStream(test2.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
private void assertExtractText(String msg, String expected, byte[] input)
throws Exception {
ContentHandler handler = new BodyContentHandler() {