Index: src/test/java/org/apache/tika/TestParsers.java =================================================================== --- src/test/java/org/apache/tika/TestParsers.java (revision 584537) +++ src/test/java/org/apache/tika/TestParsers.java (working copy) @@ -175,19 +175,6 @@ ParserConfig config = tc.getParserConfig("text/html"); Parser parser = ParserFactory.getParser(config); assertNotNull(parser); - - Metadata metadata = new Metadata(); - InputStream stream = new FileInputStream(file); - try { - parser.parse(stream, new DefaultHandler(), metadata); - } finally { - stream.close(); - } - assertEquals("Title : Test Indexation Html", metadata.get(Metadata.TITLE)); - - final String text = metadata.toString(); - final String expected = "Test Indexation Html"; - assertTrue("text contains '" + expected + "'", text.contains(expected)); } public void testZipExtraction() throws Exception { Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java =================================================================== --- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 0) +++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 0) @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringWriter; + +import junit.framework.TestCase; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.WriteOutContentHandler; +import org.xml.sax.SAXException; + +public class HtmlParserTest extends TestCase { + + private Parser parser = new HtmlParser(); + + private static InputStream getStream(String name) { + return Thread.currentThread().getContextClassLoader() + .getResourceAsStream(name); + } + + public void testParseAscii() throws IOException, SAXException, + TikaException { + + StringWriter writer = new StringWriter(); + Metadata metadata = new Metadata(); + + parser.parse(getStream("test-documents/testHTML.html"), + new WriteOutContentHandler(writer), metadata); + String content = writer.toString(); + + assertTrue("Did not contain expected text:" + + "Title : Test Indexation Html", content + .contains("Title : Test Indexation Html")); + + assertTrue("Did not contain expected text:" + "Test Indexation Html", + content.contains("Test Indexation Html")); + + assertTrue("Did not contain expected text:" + "Indexation du fichier", + content.contains("Indexation du fichier")); + + } + + public void testParseUTF8() throws IOException, SAXException, TikaException { + + StringWriter writer = new StringWriter(); + Metadata metadata = new Metadata(); + + parser.parse(getStream("test-documents/testHTML_utf8.html"), + new WriteOutContentHandler(writer), metadata); + String content = writer.toString(); + + assertTrue("Did not contain expected text:" + + "Title : Tilte with UTF-8 chars öäå", content + .contains("Title : Tilte with UTF-8 chars öäå")); + + assertTrue("Did not contain expected text:" + + 
"Content with UTF-8 chars", content + .contains("Content with UTF-8 chars")); + + assertTrue("Did not contain expected text:" + "åäö", content + .contains("åäö")); + + } + + public void testParseEmpty() throws Exception { + Metadata metadata = new Metadata(); + StringWriter writer = new StringWriter(); + parser.parse(new ByteArrayInputStream(new byte[0]), + new WriteOutContentHandler(writer), metadata); + String content = writer.toString(); + assertEquals("", content); + } + +} Index: src/test/resources/test-documents/testHTML_utf8.html =================================================================== --- src/test/resources/test-documents/testHTML_utf8.html (revision 0) +++ src/test/resources/test-documents/testHTML_utf8.html (revision 0) @@ -0,0 +1,9 @@ + 
+åäö
+ + \ No newline at end of file Index: src/main/java/org/apache/tika/utils/Utils.java =================================================================== --- src/main/java/org/apache/tika/utils/Utils.java (revision 584537) +++ src/main/java/org/apache/tika/utils/Utils.java (working copy) @@ -16,6 +16,7 @@ */ package org.apache.tika.utils; +import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; @@ -24,18 +25,25 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.io.Reader; import java.util.ArrayList; import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; import org.apache.log4j.Logger; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.HttpHeaders; +import org.apache.tika.metadata.Metadata; import org.jdom.Document; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; +import com.ibm.icu.text.CharsetDetector; +import com.ibm.icu.text.CharsetMatch; + /** * Class util * @@ -121,4 +129,45 @@ } } + /** + * Try to detect the encoding of the input stream and return a Reader for + * its content. An encoding hint can be submitted as part of {@link Metadata} + * under key {@link HttpHeaders#CONTENT_ENCODING}. + * + * After successful detection, fills Metadata with the detected content encoding + * and, when the match reports one, the content language ({@link HttpHeaders#CONTENT_LANGUAGE}). + * Throws TikaException when the detector returns no match. + * @return the Reader obtained from the best charset match. 
+ */ + public static Reader getUTF8Reader(InputStream stream, Metadata metadata) throws TikaException, IOException{ + CharsetDetector detector = new CharsetDetector(); + + // Use the declared character encoding, if available + String encoding = metadata.get(Metadata.CONTENT_ENCODING); + if (encoding != null) { + detector.setDeclaredEncoding(encoding); + } + + // CharsetDetector expects a stream to support marks + if (!stream.markSupported()) { + stream = new BufferedInputStream(stream); + } + + detector.setText(stream); + + CharsetMatch match = detector.detect(); + if (match == null) { + throw new TikaException("Unable to detect character encoding"); + } + + metadata.set(Metadata.CONTENT_ENCODING, match.getName()); + String language = match.getLanguage(); + if (language != null) { + metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); + metadata.set(Metadata.LANGUAGE, match.getLanguage()); + } + + return match.getReader(); + } + } Index: src/main/java/org/apache/tika/parser/txt/TXTParser.java =================================================================== --- src/main/java/org/apache/tika/parser/txt/TXTParser.java (revision 584537) +++ src/main/java/org/apache/tika/parser/txt/TXTParser.java (working copy) @@ -16,7 +16,6 @@ */ package org.apache.tika.parser.txt; -import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; @@ -25,12 +24,10 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; import org.apache.tika.parser.XHTMLContentHandler; +import org.apache.tika.utils.Utils; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import com.ibm.icu.text.CharsetDetector; -import com.ibm.icu.text.CharsetMatch; - /** * Text parser */ @@ -39,38 +36,13 @@ public void parse( InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { - CharsetDetector detector = new CharsetDetector(); - - // Use the declared 
character encoding, if available - String encoding = metadata.get(Metadata.CONTENT_ENCODING); - if (encoding != null) { - detector.setDeclaredEncoding(encoding); - } - - // CharsetDetector expects a stream to support marks - if (!stream.markSupported()) { - stream = new BufferedInputStream(stream); - } - - detector.setText(stream); - - CharsetMatch match = detector.detect(); - if (match == null) { - throw new TikaException("Unable to detect character encoding"); - } - + + Reader reader = Utils.getUTF8Reader(stream, metadata); metadata.set(Metadata.CONTENT_TYPE, "text/plain"); - metadata.set(Metadata.CONTENT_ENCODING, match.getName()); - String language = match.getLanguage(); - if (language != null) { - metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage()); - metadata.set(Metadata.LANGUAGE, match.getLanguage()); - } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); xhtml.startElement("p"); - Reader reader = match.getReader(); char[] buffer = new char[4096]; for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { xhtml.characters(buffer, 0, n); Index: src/main/java/org/apache/tika/parser/html/HtmlParser.java =================================================================== --- src/main/java/org/apache/tika/parser/html/HtmlParser.java (revision 584537) +++ src/main/java/org/apache/tika/parser/html/HtmlParser.java (working copy) @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.InputStream; +import java.io.Reader; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -30,14 +31,33 @@ import org.xml.sax.SAXException; /** - * Simple HTML parser implemented with NekoHTML. + * Simple HTML parser that extracts the title; parses from the charset-detecting Reader when detection succeeds, else from the raw stream. NOTE(review): the raw-stream fallback may see an already consumed stream if the input lacks mark support - confirm. 
*/ public class HtmlParser implements Parser { public void parse(InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException { + final SAXParser parser = new SAXParser(); - final InputSource source = new InputSource(stream); + + final InputSource source; + + Reader utf8Reader; + + try { + utf8Reader = org.apache.tika.utils.Utils.getUTF8Reader( + stream, metadata); + } catch (TikaException ex) { + utf8Reader = null; + } + + if (utf8Reader == null) { + source = new InputSource(stream); + } else { + source = new InputSource(utf8Reader); + } + + parser.setContentHandler(new TitleExtractingContentHandler(handler, metadata)); parser.parse(source);