### Eclipse Workspace Patch 1.0
#P tika-parsers
Index: src/main/java/org/apache/tika/parser/txt/TXTParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/txt/TXTParser.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/txt/TXTParser.java (working copy)
@@ -25,6 +25,8 @@
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Set;
import org.apache.tika.exception.TikaException;
@@ -33,26 +35,21 @@
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Plain text parser. The text encoding of the document stream is
* automatically detected based on the byte patterns found at the
- * beginning of the stream. The input metadata key
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING} is used
- * as an encoding hint if the automatic encoding detection fails.
+ * beginning of the stream, unless the input metadata key
+ * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} contains
+ * a valid charset=xxx parameter.
*
* This parser sets the following output metadata entries:
*
* - {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}
- * text/plain
- * - {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING}
- * - The detected text encoding of the document.
- * -
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_LANGUAGE} and
- * {@link org.apache.tika.metadata.DublinCore#LANGUAGE}
- *
+ * text/plain; charset=detected text encoding
*
*/
@SuppressWarnings("serial")
@@ -76,43 +73,50 @@
}
// Detect the content encoding (the stream is reset to the beginning)
- CharsetDetector detector = new CharsetDetector();
- String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+ String charset = null;
+ MediaType mt = null;
String incomingType = metadata.get(Metadata.CONTENT_TYPE);
- if (incomingCharset == null && incomingType != null) {
+ if (incomingType != null) {
// TIKA-341: Use charset in content-type
- MediaType mt = MediaType.parse(incomingType);
+ mt = MediaType.parse(incomingType);
if (mt != null) {
- incomingCharset = mt.getParameters().get("charset");
+ String incomingCharset = CharsetUtils.clean(mt.getParameters().get("charset"));
+ if (CharsetUtils.isSupported(incomingCharset)) {
+ charset = incomingCharset;
+ }
}
}
- if (incomingCharset != null) {
- detector.setDeclaredEncoding(incomingCharset);
- }
-
- detector.setText(stream);
- for (CharsetMatch match : detector.detectAll()) {
- if (Charset.isSupported(match.getName())) {
- metadata.set(Metadata.CONTENT_ENCODING, match.getName());
- break;
+ if (charset == null) {
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(stream);
+ for (CharsetMatch match : detector.detectAll()) {
+ if (CharsetUtils.isSupported(match.getName())) {
+ charset = match.getName();
+ break;
+ }
}
}
-
- String encoding = metadata.get(Metadata.CONTENT_ENCODING);
- if (encoding == null) {
+
+ if (charset == null) {
throw new TikaException(
"Text encoding could not be detected and no encoding"
+ " hint is available in document metadata");
}
- // TIKA-341: Only stomp on content-type after we're done trying to
- // use it to guess at the charset.
- metadata.set(Metadata.CONTENT_TYPE, "text/plain");
+ // Set or update the metadata content-type with the charset.
+ if (mt == null) {
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=" + charset);
+ } else {
+ Map<String, String> params = new HashMap<String, String>(mt.getParameters());
+ params.put("charset", charset);
+ mt = new MediaType(mt, params);
+ metadata.set(Metadata.CONTENT_TYPE, mt.toString());
+ }
try {
Reader reader =
- new BufferedReader(new InputStreamReader(stream, encoding));
+ new BufferedReader(new InputStreamReader(stream, charset));
// TIKA-240: Drop the BOM when extracting plain text
reader.mark(1);
@@ -137,7 +141,7 @@
xhtml.endDocument();
} catch (UnsupportedEncodingException e) {
throw new TikaException(
- "Unsupported text encoding: " + encoding, e);
+ "Unsupported text encoding: " + charset, e);
}
}
Index: src/main/java/org/apache/tika/parser/html/HtmlParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/HtmlParser.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/html/HtmlParser.java (working copy)
@@ -20,7 +20,9 @@
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -79,6 +81,9 @@
* TODO: Move this into core, along with CharsetDetector
*/
private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
+
+ // First see if we have valid charset in a tag.
+ String metaTagCharset = null;
stream.mark(META_TAG_BUFFER_SIZE);
char[] buffer = new char[META_TAG_BUFFER_SIZE];
InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
@@ -98,70 +103,105 @@
// TIKA-459: improve charset handling.
String charset = CharsetUtils.clean(keyValue[1]);
if (CharsetUtils.isSupported(charset)) {
- metadata.set(Metadata.CONTENT_ENCODING, charset);
- return charset;
+ metaTagCharset = charset;
+ break;
}
}
}
}
}
- // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
- // hint, or the passed content-type hint.
- CharsetDetector detector = new CharsetDetector();
- String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+ // Now see if we have a charset in the passed content-type hint.
+ String metaDataCharset = null;
String incomingType = metadata.get(Metadata.CONTENT_TYPE);
- if (incomingCharset == null && incomingType != null) {
+ if (incomingType != null) {
// TIKA-341: Use charset in content-type
MediaType mt = MediaType.parse(incomingType);
if (mt != null) {
- String charset = mt.getParameters().get("charset");
- if ((charset != null) && Charset.isSupported(charset)) {
- incomingCharset = charset;
+ String charset = CharsetUtils.clean(mt.getParameters().get("charset"));
+ if ((charset != null) && CharsetUtils.isSupported(charset)) {
+ metaDataCharset = charset;
}
}
}
- if (incomingCharset != null) {
- detector.setDeclaredEncoding(incomingCharset);
+ // If we have both and they agree, or we have one or the other,
+ // then we're all set.
+ String resultingCharset = null;
+ if (metaTagCharset != null) {
+ if (metaDataCharset == null) {
+ resultingCharset = metaTagCharset;
+ } else if (metaDataCharset.equals(metaTagCharset)) {
+ resultingCharset = metaTagCharset;
+ } else {
+ // disagreement, so use detector
+ }
+ } else if (metaDataCharset != null) {
+ resultingCharset = metaDataCharset;
+ } else {
+ // no charset, so use detector.
}
+
+ if (resultingCharset == null) {
+ // If we have a disagreement, and that's why we're using the detector,
+ // favor meta tag charset info over response header charset info.
+ CharsetDetector detector = new CharsetDetector();
+ if (metaTagCharset != null) {
+ detector.setDeclaredEncoding(metaTagCharset);
+ } else if (metaDataCharset != null) {
+ detector.setDeclaredEncoding(metaDataCharset);
+ }
- // TIKA-341 without enabling input filtering (stripping of tags) the
- // short HTML tests don't work well.
- detector.enableInputFilter(true);
- detector.setText(stream);
- for (CharsetMatch match : detector.detectAll()) {
- if (Charset.isSupported(match.getName())) {
- metadata.set(Metadata.CONTENT_ENCODING, match.getName());
+ // TIKA-341 without enabling input filtering (stripping of tags) the
+ // short HTML tests don't work well.
+ detector.enableInputFilter(true);
+ detector.setText(stream);
+ for (CharsetMatch match : detector.detectAll()) {
+ if (CharsetUtils.isSupported(match.getName())) {
+ resultingCharset = match.getName();
- // TIKA-339: Don't set language, as it's typically not a very good
- // guess, and it can create ambiguity if another (better) language
- // value is specified by a meta tag in the HTML (or via HTTP response
- // header).
- /*
+ // TIKA-339: Don't set language, as it's typically not a very good
+ // guess, and it can create ambiguity if another (better) language
+ // value is specified by a meta tag in the HTML (or via HTTP response
+ // header).
+ /*
String language = match.getLanguage();
if (language != null) {
metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
metadata.set(Metadata.LANGUAGE, match.getLanguage());
}
- */
-
- break;
+ */
+
+ break;
+ }
}
}
-
- String encoding = metadata.get(Metadata.CONTENT_ENCODING);
- if (encoding == null) {
- if (Charset.isSupported(DEFAULT_CHARSET)) {
- encoding = DEFAULT_CHARSET;
+
+ if (resultingCharset == null) {
+ if (CharsetUtils.isSupported(DEFAULT_CHARSET)) {
+ resultingCharset = DEFAULT_CHARSET;
} else {
- encoding = Charset.defaultCharset().name();
+ resultingCharset = Charset.defaultCharset().name();
}
-
- metadata.set(Metadata.CONTENT_ENCODING, encoding);
}
-
- return encoding;
+
+ // Now we want to update the CONTENT_TYPE field in the metadata so that
+ // the encoding that we've picked will get returned.
+ MediaType mt = null;
+ if (incomingType != null) {
+ mt = MediaType.parse(incomingType);
+ }
+
+ if (mt == null) {
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=" + resultingCharset);
+ } else {
+ Map<String, String> params = new HashMap<String, String>(mt.getParameters());
+ params.put("charset", resultingCharset);
+ mt = new MediaType(mt, params);
+ metadata.set(Metadata.CONTENT_TYPE, mt.toString());
+ }
+
+ return resultingCharset;
}
public void parse(
@@ -177,13 +217,15 @@
// TODO: Is this still needed, given our use of TagSoup?
stream = new CloseShieldInputStream(stream);
- // Prepare the input source using the encoding hint if available
+ // Prepare the input source using the encoding hint if available.
+ // Save off CONTENT_TYPE in case XHTMLDowngradeHandler winds up
+ // stepping on what we have carefully constructed for the encoding.
InputSource source = new InputSource(stream);
source.setEncoding(getEncoding(stream, metadata));
-
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+
// Get the HTML mapper from the parse context
- HtmlMapper mapper =
- context.get(HtmlMapper.class, new HtmlParserMapper());
+ HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());
// Parse the HTML document
org.ccil.cowan.tagsoup.Parser parser =
@@ -200,6 +242,10 @@
new HtmlHandler(mapper, handler, metadata)));
parser.parse(source);
+
+ // During HTML parsing, we'll propagate any tags into metadata.
+ // But this steps on the encoding that we derived earlier, so restore.
+ metadata.set(Metadata.CONTENT_TYPE, contentType);
}
/**
Index: src/main/java/org/apache/tika/parser/mbox/MboxParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/mbox/MboxParser.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/mbox/MboxParser.java (working copy)
@@ -83,8 +83,7 @@
BufferedReader reader = new BufferedReader(isr);
- metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
- metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
+ metadata.set(Metadata.CONTENT_TYPE, String.format("%s; charset=%s", MBOX_MIME_TYPE, "us-ascii"));
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
Index: src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (working copy)
@@ -155,5 +155,19 @@
.getResourceAsStream(name);
}
+ /**
+ * Test case for TIKA-431
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-431">TIKA-431</a>
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ Parser parser = new MboxParser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/simple.mbox");
+ ContentHandler handler = mock(DefaultHandler.class);
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
}
Index: src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.ContentHandler;
@@ -34,7 +35,7 @@
public void testEnglishText() throws Exception {
String text =
- "Hello, World! This is simple UTF-8 text content written"
+ "Hello, World! This is simple ascii text content written"
+ " in English to test autodetection of both the character"
+ " encoding and the language of the input stream.";
@@ -47,7 +48,7 @@
new ParseContext());
String content = writer.toString();
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
// TIKA-501: Remove language detection from TXTParser
assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
@@ -67,8 +68,7 @@
parser.parse(
new ByteArrayInputStream(text.getBytes("UTF-8")),
handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
assertTrue(handler.toString().contains(text));
}
@@ -78,7 +78,7 @@
Metadata metadata = new Metadata();
parser.parse(
new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("\n", handler.toString());
}
@@ -97,31 +97,6 @@
}
/**
- * Test case for TIKA-335: using incoming charset
- *
- * @see TIKA-335
- */
- public void testUseIncomingCharsetAsHint() throws Exception {
- // Could be UTF-8 or ISO 8859-1 or ...
- // u00e1 is latin small letter a with acute
- final String test2 = "the name is \u00e1ndre";
-
- Metadata metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(test2.getBytes("UTF-8")),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
-
- metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
- parser.parse(
- new ByteArrayInputStream(test2.getBytes("UTF-8")),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
- /**
* Test case for TIKA-341: using charset in content-type
*
* @see TIKA-341
@@ -136,15 +111,15 @@
new ByteArrayInputStream(test2.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-1");
parser.parse(
new ByteArrayInputStream(test2.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
}
private void assertExtractText(String msg, String expected, byte[] input)
@@ -156,7 +131,7 @@
};
Metadata metadata = new Metadata();
parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("text/plain"));
assertEquals(msg, expected, handler.toString());
}
@@ -187,8 +162,22 @@
metadata,
new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("IBM866", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
}
+ /**
+ * Test case for TIKA-431
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-431">TIKA-431</a>
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ handler, metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
}
Index: src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
===================================================================
--- src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (working copy)
@@ -69,8 +69,7 @@
// sub part without damaging the main metadata
Metadata submd = new Metadata();
- submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
- submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+ submd.set(Metadata.CONTENT_TYPE, String.format("%s; charset=%s", body.getMimeType(), body.getCharset()));
try {
BodyContentHandler bch = new BodyContentHandler(handler);
Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy)
@@ -36,6 +36,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
@@ -119,7 +120,7 @@
String content = new Tika().parseToString(
HtmlParserTest.class.getResourceAsStream(path), metadata);
- assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
@@ -244,7 +245,7 @@
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
}
/**
@@ -274,16 +275,28 @@
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("UTF-8", getCharsetFromContentType(metadata));
metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
}
+ private String getCharsetFromContentType(Metadata metadata) {
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ if (contentType != null) {
+ MediaType mt = MediaType.parse(contentType);
+ if (mt != null) {
+ return mt.getParameters().get("charset");
+ }
+ }
+
+ return null;
+ }
+
/**
* Test case for HTML content like
* ">div<foo>br<bar>/div>" that should result
@@ -332,7 +345,7 @@
new HtmlParser().parse (
new ByteArrayInputStream(test1.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
// Some HTML pages have errors like ';;' versus '; ' as separator
String test2 =
@@ -344,7 +357,7 @@
new HtmlParser().parse (
new ByteArrayInputStream(test2.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
}
/**
@@ -360,14 +373,14 @@
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("UTF-8", getCharsetFromContentType(metadata));
metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
}
@@ -382,7 +395,7 @@
HtmlParserTest.class.getResourceAsStream(path),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("windows-1251", getCharsetFromContentType(metadata));
}
/**
@@ -567,7 +580,7 @@
String result = sw.toString();
// tag for Content-Type should exist, but nothing for Language
- assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html\"/>.*$", result));
+ assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
assertFalse(Pattern.matches("(?s).*<meta name=\"language\".*$", result));
}

+ /**
+ * Test case for TIKA-431
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-431">TIKA-431</a>
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ String test =
+ ""
+ + "the name is \u00e1ndre"
+ + "";
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
}
Index: src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (working copy)
@@ -224,4 +224,18 @@
.getResourceAsStream(name);
}
+ /**
+ * Test for TIKA-431 - not all headers may be present
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
}
Index: src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (working copy)
@@ -78,8 +78,8 @@
ContentHandler handler = new BodyContentHandler();
new AutoDetectParser().parse(input, handler, metadata);
- assertEquals("Bad content type: " + tp,
- tp.realType, metadata.get(Metadata.CONTENT_TYPE));
+ assertTrue("Bad content type: " + tp,
+ metadata.get(Metadata.CONTENT_TYPE).startsWith(tp.realType));
if (tp.expectedContentFragment != null) {
assertTrue("Expected content not found: " + tp,