### Eclipse Workspace Patch 1.0
#P tika-parsers
Index: src/main/java/org/apache/tika/parser/txt/TXTParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/txt/TXTParser.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/txt/TXTParser.java (working copy)
@@ -25,6 +25,8 @@
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
import java.util.Set;
import org.apache.tika.exception.TikaException;
@@ -33,26 +35,21 @@
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.CharsetUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Plain text parser. The text encoding of the document stream is
* automatically detected based on the byte patterns found at the
- * beginning of the stream. The input metadata key
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING} is used
- * as an encoding hint if the automatic encoding detection fails.
+ * beginning of the stream, unless the input metadata key
+ * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE} contains
+ * a valid charset=xxx parameter.
*
* This parser sets the following output metadata entries:
*
* - {@link org.apache.tika.metadata.HttpHeaders#CONTENT_TYPE}
- * text/plain
- * - {@link org.apache.tika.metadata.HttpHeaders#CONTENT_ENCODING}
- * - The detected text encoding of the document.
- * -
- * {@link org.apache.tika.metadata.HttpHeaders#CONTENT_LANGUAGE} and
- * {@link org.apache.tika.metadata.DublinCore#LANGUAGE}
- *
+ * text/plain; charset=detected text encoding
*
*/
@SuppressWarnings("serial")
@@ -76,43 +73,50 @@
}
// Detect the content encoding (the stream is reset to the beginning)
- CharsetDetector detector = new CharsetDetector();
- String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+ String charset = null;
+ MediaType mt = null;
String incomingType = metadata.get(Metadata.CONTENT_TYPE);
- if (incomingCharset == null && incomingType != null) {
+ if (incomingType != null) {
// TIKA-341: Use charset in content-type
- MediaType mt = MediaType.parse(incomingType);
+ mt = MediaType.parse(incomingType);
if (mt != null) {
- incomingCharset = mt.getParameters().get("charset");
+ String incomingCharset = CharsetUtils.clean(mt.getParameters().get("charset"));
+ if (CharsetUtils.isSupported(incomingCharset)) {
+ charset = incomingCharset;
+ }
}
}
- if (incomingCharset != null) {
- detector.setDeclaredEncoding(incomingCharset);
- }
-
- detector.setText(stream);
- for (CharsetMatch match : detector.detectAll()) {
- if (Charset.isSupported(match.getName())) {
- metadata.set(Metadata.CONTENT_ENCODING, match.getName());
- break;
+ if (charset == null) {
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(stream);
+ for (CharsetMatch match : detector.detectAll()) {
+ if (CharsetUtils.isSupported(match.getName())) {
+ charset = match.getName();
+ break;
+ }
}
}
-
- String encoding = metadata.get(Metadata.CONTENT_ENCODING);
- if (encoding == null) {
+
+ if (charset == null) {
throw new TikaException(
"Text encoding could not be detected and no encoding"
+ " hint is available in document metadata");
}
- // TIKA-341: Only stomp on content-type after we're done trying to
- // use it to guess at the charset.
- metadata.set(Metadata.CONTENT_TYPE, "text/plain");
+ // Set or update the metadata content-type with the charset.
+ if (mt == null) {
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=" + charset);
+ } else {
+ Map<String, String> params = new HashMap<String, String>(mt.getParameters());
+ params.put("charset", charset);
+ mt = new MediaType(mt, params);
+ metadata.set(Metadata.CONTENT_TYPE, mt.toString());
+ }
try {
Reader reader =
- new BufferedReader(new InputStreamReader(stream, encoding));
+ new BufferedReader(new InputStreamReader(stream, charset));
// TIKA-240: Drop the BOM when extracting plain text
reader.mark(1);
@@ -137,7 +141,7 @@
xhtml.endDocument();
} catch (UnsupportedEncodingException e) {
throw new TikaException(
- "Unsupported text encoding: " + encoding, e);
+ "Unsupported text encoding: " + charset, e);
}
}
Index: src/main/java/org/apache/tika/parser/html/HtmlParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/HtmlParser.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/html/HtmlParser.java (working copy)
@@ -20,7 +20,9 @@
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -79,6 +81,9 @@
* TODO: Move this into core, along with CharsetDetector
*/
private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
+
+ // First see if we have valid charset in a tag.
+ String metaTagCharset = null;
stream.mark(META_TAG_BUFFER_SIZE);
char[] buffer = new char[META_TAG_BUFFER_SIZE];
InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
@@ -98,70 +103,105 @@
// TIKA-459: improve charset handling.
String charset = CharsetUtils.clean(keyValue[1]);
if (CharsetUtils.isSupported(charset)) {
- metadata.set(Metadata.CONTENT_ENCODING, charset);
- return charset;
+ metaTagCharset = charset;
+ break;
}
}
}
}
}
- // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
- // hint, or the passed content-type hint.
- CharsetDetector detector = new CharsetDetector();
- String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
+ // Now see if we have a charset in the passed content-type hint.
+ String metaDataCharset = null;
String incomingType = metadata.get(Metadata.CONTENT_TYPE);
- if (incomingCharset == null && incomingType != null) {
+ if (incomingType != null) {
// TIKA-341: Use charset in content-type
MediaType mt = MediaType.parse(incomingType);
if (mt != null) {
- String charset = mt.getParameters().get("charset");
- if ((charset != null) && Charset.isSupported(charset)) {
- incomingCharset = charset;
+ String charset = CharsetUtils.clean(mt.getParameters().get("charset"));
+ if ((charset != null) && CharsetUtils.isSupported(charset)) {
+ metaDataCharset = charset;
}
}
}
- if (incomingCharset != null) {
- detector.setDeclaredEncoding(incomingCharset);
+ // If we have both and they agree, or we have one or the other,
+ // then we're all set.
+ String resultingCharset = null;
+ if (metaTagCharset != null) {
+ if (metaDataCharset == null) {
+ resultingCharset = metaTagCharset;
+ } else if (metaDataCharset.equals(metaTagCharset)) {
+ resultingCharset = metaTagCharset;
+ } else {
+ // disagreement, so use detector
+ }
+ } else if (metaDataCharset != null) {
+ resultingCharset = metaDataCharset;
+ } else {
+ // no charset, so use detector.
}
+
+ if (resultingCharset == null) {
+ // If we have a disagreement, and that's why we're using the detector,
+ // favor meta tag charset info over response header charset info.
+ CharsetDetector detector = new CharsetDetector();
+ if (metaTagCharset != null) {
+ detector.setDeclaredEncoding(metaTagCharset);
+ } else if (metaDataCharset != null) {
+ detector.setDeclaredEncoding(metaDataCharset);
+ }
- // TIKA-341 without enabling input filtering (stripping of tags) the
- // short HTML tests don't work well.
- detector.enableInputFilter(true);
- detector.setText(stream);
- for (CharsetMatch match : detector.detectAll()) {
- if (Charset.isSupported(match.getName())) {
- metadata.set(Metadata.CONTENT_ENCODING, match.getName());
+ // TIKA-341 without enabling input filtering (stripping of tags) the
+ // short HTML tests don't work well.
+ detector.enableInputFilter(true);
+ detector.setText(stream);
+ for (CharsetMatch match : detector.detectAll()) {
+ if (CharsetUtils.isSupported(match.getName())) {
+ resultingCharset = match.getName();
- // TIKA-339: Don't set language, as it's typically not a very good
- // guess, and it can create ambiguity if another (better) language
- // value is specified by a meta tag in the HTML (or via HTTP response
- // header).
- /*
+ // TIKA-339: Don't set language, as it's typically not a very good
+ // guess, and it can create ambiguity if another (better) language
+ // value is specified by a meta tag in the HTML (or via HTTP response
+ // header).
+ /*
String language = match.getLanguage();
if (language != null) {
metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
metadata.set(Metadata.LANGUAGE, match.getLanguage());
}
- */
-
- break;
+ */
+
+ break;
+ }
}
}
-
- String encoding = metadata.get(Metadata.CONTENT_ENCODING);
- if (encoding == null) {
- if (Charset.isSupported(DEFAULT_CHARSET)) {
- encoding = DEFAULT_CHARSET;
+
+ if (resultingCharset == null) {
+ if (CharsetUtils.isSupported(DEFAULT_CHARSET)) {
+ resultingCharset = DEFAULT_CHARSET;
} else {
- encoding = Charset.defaultCharset().name();
+ resultingCharset = Charset.defaultCharset().name();
}
-
- metadata.set(Metadata.CONTENT_ENCODING, encoding);
}
-
- return encoding;
+
+ // Now we want to update the CONTENT_TYPE field in the metadata so that
+ // the encoding that we've picked will get returned.
+ MediaType mt = null;
+ if (incomingType != null) {
+ mt = MediaType.parse(incomingType);
+ }
+
+ if (mt == null) {
+ metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=" + resultingCharset);
+ } else {
+ Map<String, String> params = new HashMap<String, String>(mt.getParameters());
+ params.put("charset", resultingCharset);
+ mt = new MediaType(mt, params);
+ metadata.set(Metadata.CONTENT_TYPE, mt.toString());
+ }
+
+ return resultingCharset;
}
public void parse(
@@ -177,13 +217,15 @@
// TODO: Is this still needed, given our use of TagSoup?
stream = new CloseShieldInputStream(stream);
- // Prepare the input source using the encoding hint if available
+ // Prepare the input source using the encoding hint if available.
+ // Save off CONTENT_TYPE in case XHTMLDowngradeHandler winds up
+ // stepping on what we have carefully constructed for the encoding.
InputSource source = new InputSource(stream);
source.setEncoding(getEncoding(stream, metadata));
-
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+
// Get the HTML mapper from the parse context
- HtmlMapper mapper =
- context.get(HtmlMapper.class, new HtmlParserMapper());
+ HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());
// Parse the HTML document
org.ccil.cowan.tagsoup.Parser parser =
@@ -200,6 +242,10 @@
new HtmlHandler(mapper, handler, metadata)));
parser.parse(source);
+
+ // During HTML parsing, we'll propagate any tags into metadata.
+ // But this steps on the encoding that we derived earlier, so restore.
+ metadata.set(Metadata.CONTENT_TYPE, contentType);
}
/**
Index: src/main/java/org/apache/tika/parser/mbox/MboxParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/mbox/MboxParser.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/mbox/MboxParser.java (working copy)
@@ -83,8 +83,7 @@
BufferedReader reader = new BufferedReader(isr);
- metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
- metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
+ metadata.set(Metadata.CONTENT_TYPE, String.format("%s; charset=%s", MBOX_MIME_TYPE, "us-ascii"));
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
Index: src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (working copy)
@@ -155,5 +155,19 @@
.getResourceAsStream(name);
}
+ /**
+ * Test case for TIKA-431
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-431">TIKA-431</a>
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ Parser parser = new MboxParser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/simple.mbox");
+ ContentHandler handler = mock(DefaultHandler.class);
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
}
Index: src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.ContentHandler;
@@ -34,7 +35,7 @@
public void testEnglishText() throws Exception {
String text =
- "Hello, World! This is simple UTF-8 text content written"
+ "Hello, World! This is simple ascii text content written"
+ " in English to test autodetection of both the character"
+ " encoding and the language of the input stream.";
@@ -47,7 +48,7 @@
new ParseContext());
String content = writer.toString();
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
// TIKA-501: Remove language detection from TXTParser
assertNull(metadata.get(Metadata.CONTENT_LANGUAGE));
@@ -67,8 +68,7 @@
parser.parse(
new ByteArrayInputStream(text.getBytes("UTF-8")),
handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
assertTrue(handler.toString().contains(text));
}
@@ -78,7 +78,7 @@
Metadata metadata = new Metadata();
parser.parse(
new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("\n", handler.toString());
}
@@ -97,31 +97,6 @@
}
/**
- * Test case for TIKA-335: using incoming charset
- *
- * @see TIKA-335
- */
- public void testUseIncomingCharsetAsHint() throws Exception {
- // Could be UTF-8 or ISO 8859-1 or ...
- // u00e1 is latin small letter a with acute
- final String test2 = "the name is \u00e1ndre";
-
- Metadata metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(test2.getBytes("UTF-8")),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
-
- metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
- parser.parse(
- new ByteArrayInputStream(test2.getBytes("UTF-8")),
- new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
- }
-
- /**
* Test case for TIKA-341: using charset in content-type
*
* @see TIKA-341
@@ -136,15 +111,15 @@
new ByteArrayInputStream(test2.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
metadata = new Metadata();
- metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
+ metadata.set(Metadata.CONTENT_TYPE, "text/plain; charset=ISO-8859-1");
parser.parse(
new ByteArrayInputStream(test2.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
}
private void assertExtractText(String msg, String expected, byte[] input)
@@ -156,7 +131,7 @@
};
Metadata metadata = new Metadata();
parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
+ assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("text/plain"));
assertEquals(msg, expected, handler.toString());
}
@@ -187,8 +162,22 @@
metadata,
new ParseContext());
- assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("IBM866", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
}
+ /**
+ * Test case for TIKA-431
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-431">TIKA-431</a>
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
+
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ parser.parse(
+ new ByteArrayInputStream(text.getBytes("UTF-8")),
+ handler, metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
}
Index: src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
===================================================================
--- src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (revision 1171650)
+++ src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (working copy)
@@ -69,8 +69,7 @@
// sub part without damaging the main metadata
Metadata submd = new Metadata();
- submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
- submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+ submd.set(Metadata.CONTENT_TYPE, String.format("%s; charset=%s", body.getMimeType(), body.getCharset()));
try {
BodyContentHandler bch = new BodyContentHandler(handler);
Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy)
@@ -36,6 +36,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
@@ -119,7 +120,7 @@
String content = new Tika().parseToString(
HtmlParserTest.class.getResourceAsStream(path), metadata);
- assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
@@ -244,7 +245,7 @@
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
}
/**
@@ -274,16 +275,28 @@
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("UTF-8", getCharsetFromContentType(metadata));
metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
}
+ private String getCharsetFromContentType(Metadata metadata) {
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ if (contentType != null) {
+ MediaType mt = MediaType.parse(contentType);
+ if (mt != null) {
+ return mt.getParameters().get("charset");
+ }
+ }
+
+ return null;
+ }
+
/**
* Test case for HTML content like
* ">div<foo>br<bar>/div>" that should result
@@ -332,7 +345,7 @@
new HtmlParser().parse (
new ByteArrayInputStream(test1.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
// Some HTML pages have errors like ';;' versus '; ' as separator
String test2 =
@@ -344,7 +357,7 @@
new HtmlParser().parse (
new ByteArrayInputStream(test2.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
}
/**
@@ -360,14 +373,14 @@
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("UTF-8", getCharsetFromContentType(metadata));
metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
new HtmlParser().parse (
new ByteArrayInputStream(test.getBytes("UTF-8")),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("ISO-8859-1", getCharsetFromContentType(metadata));
}
@@ -382,7 +395,7 @@
HtmlParserTest.class.getResourceAsStream(path),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
+ assertEquals("windows-1251", getCharsetFromContentType(metadata));
}
/**
@@ -567,7 +580,7 @@
String result = sw.toString();
// tag for Content-Type should exist, but nothing for Language
- assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html\"/>.*$", result));
+ assertTrue(Pattern.matches("(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$", result));
assertFalse(Pattern.matches("(?s).*<meta name=\"language\".*$", result));
}

+ /**
+ * Test case for TIKA-431
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-431">TIKA-431</a>
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ String test =
+ ""
+ + "the name is \u00e1ndre"
+ + "";
+ Metadata metadata = new Metadata();
+ new HtmlParser().parse (
+ new ByteArrayInputStream(test.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
}
Index: src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (working copy)
@@ -224,4 +224,18 @@
.getResourceAsStream(name);
}
+ /**
+ * Test for TIKA-431 - not all headers may be present
+ */
+ public void testNotAbusingContentEncoding() throws Exception {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ parser.parse(stream, handler, metadata, new ParseContext());
+ assertEquals(null, metadata.get(Metadata.CONTENT_ENCODING));
+ }
+
+
}
Index: src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (revision 1171650)
+++ src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (working copy)
@@ -78,8 +78,8 @@
ContentHandler handler = new BodyContentHandler();
new AutoDetectParser().parse(input, handler, metadata);
- assertEquals("Bad content type: " + tp,
- tp.realType, metadata.get(Metadata.CONTENT_TYPE));
+ assertTrue("Bad content type: " + tp,
+ metadata.get(Metadata.CONTENT_TYPE).startsWith(tp.realType));
if (tp.expectedContentFragment != null) {
assertTrue("Expected content not found: " + tp,