diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java index 5b64001..1c10261 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java @@ -25,30 +25,78 @@ import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.txt.CharsetDetector; +import org.apache.tika.parser.txt.CharsetMatch; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** - * HTML parser. Uses CyberNeko to turn the input document to HTML SAX events, + * HTML parser. Uses TagSoup to turn the input document to HTML SAX events, * and post-processes the events to produce XHTML and metadata expected by * Tika clients. */ public class HtmlParser implements Parser { + // Use the widest, most common charset as our default. + private static final String DEFAULT_CHARSET = "windows-1252"; + + // TODO: Move this into core, along with CharsetDetector + private String getEncoding(InputStream stream, Metadata metadata) throws IOException { + // TODO: Check for TIKA-287 + * Test case for TIKA-334 + * @see TIKA-334 */ - public void testRelativeLinks() throws Exception { - + public void testDetectOfCharset() throws IOException, SAXException, TikaException { + final String test = "Ž"; + Metadata metadata = new Metadata(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes("UTF-8")), + new BodyContentHandler(), metadata, new ParseContext()); + + assertEquals("Ž", metadata.get(Metadata.TITLE)); } + + }