diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 1c10261..9b6699d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -61,13 +61,18 @@ public class HtmlParser implements Parser { if (Charset.isSupported(match.getName())) { metadata.set(Metadata.CONTENT_ENCODING, match.getName()); - // Is the encoding language-specific (KOI8-R, SJIS, etc.)? + // TIKA-339: Don't set language, as it's typically not a very good + // guess, and it can create ambiguity if another (better) language + // value is specified by a meta tag in the HTML (or via HTTP response + // header). + /* String language = match.getLanguage(); if (language != null) { metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); metadata.set(Metadata.LANGUAGE, match.getLanguage()); } - + */ + break; } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java index 158e4fb..4510ca3 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/txt/TXTParser.java @@ -88,8 +88,8 @@ public class TXTParser implements Parser { // Is the encoding language-specific (KOI8-R, SJIS, etc.)? String language = match.getLanguage(); if (language != null) { - metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); - metadata.set(Metadata.LANGUAGE, match.getLanguage()); + metadata.add(Metadata.CONTENT_LANGUAGE, language); + metadata.add(Metadata.LANGUAGE, language); } break; diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 7d255c0..0f7c90a 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -232,4 +232,19 @@ public class HtmlParserTest extends TestCase { assertEquals("\u017d", metadata.get(Metadata.TITLE)); } + /** + * Test case for TIKA-339: Don't use language returned by CharsetDetector + * @see TIKA-339 + */ + public void testIgnoreCharsetDetectorLanguage() throws Exception { + String test = "