diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 22c801a..7656367 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -16,9 +16,13 @@ */ package org.apache.tika.parser.html; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import java.nio.charset.Charset; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.tika.exception.TikaException; import org.apache.tika.io.CloseShieldInputStream; @@ -40,12 +44,35 @@ public class HtmlParser implements Parser { // Use the widest, most common charset as our default. private static final String DEFAULT_CHARSET = "windows-1252"; + private static final int META_TAG_BUFFER_SIZE = 4096; + private static final Pattern HTTP_EQUIV_CHARSET_PATTERN = Pattern + .compile("(?is)TIKA-332 + */ + public void testHttpEquivCharset() throws Exception { + String test = "" + + "the name is \u00e1ndre"; + Metadata metadata = new Metadata(); + new HtmlParser().parse ( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); + } + + /** * Test case for TIKA-334 * @see TIKA-334 */