diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 1c10261..9b6699d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -61,13 +61,18 @@ public class HtmlParser implements Parser { if (Charset.isSupported(match.getName())) { metadata.set(Metadata.CONTENT_ENCODING, match.getName()); - // Is the encoding language-specific (KOI8-R, SJIS, etc.)? + // TIKA-339: Don't set language, as it's typically not a very good + // guess, and it can create ambiguity if another (better) language + // value is specified by a meta tag in the HTML (or via HTTP response + // header). + /* String language = match.getLanguage(); if (language != null) { metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); metadata.set(Metadata.LANGUAGE, match.getLanguage()); } - + */ + break; } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java index 158e4fb..4510ca3 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ -88,8 +88,8 @@ public class TXTParser implements Parser { // Is the encoding language-specific (KOI8-R, SJIS, etc.)? 
String language = match.getLanguage(); if (language != null) { - metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); - metadata.set(Metadata.LANGUAGE, match.getLanguage()); + metadata.add(Metadata.CONTENT_LANGUAGE, language); + metadata.add(Metadata.LANGUAGE, language); } break; diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 7d255c0..0f7c90a 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -232,4 +232,19 @@ public class HtmlParserTest extends TestCase { assertEquals("\u017d", metadata.get(Metadata.TITLE)); } + /** + * Test case for TIKA-339: Don't use language returned by CharsetDetector + * @see TIKA-339 + */ + public void testIgnoreCharsetDetectorLanguage() throws Exception { + String test = "Simple Content"; + Metadata metadata = new Metadata(); + metadata.add(Metadata.CONTENT_LANGUAGE, "en"); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE)); + } + } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java index 7906dd1..3bdd351 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java @@ -122,6 +122,23 @@ public class TXTParserTest extends TestCase { assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } + /** + * Test case for TIKA-339: don't override incoming language + * + * @see TIKA-339 + */ + public void testRetainIncomingLanguage() throws Exception { + final String test = "Simple Content"; + + Metadata metadata = new Metadata(); 
+ metadata.set(Metadata.LANGUAGE, "en"); + + parser.parse( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("en", metadata.get(Metadata.LANGUAGE)); + } private void assertExtractText(String msg, String expected, byte[] input) throws Exception {