Index: src/main/java/org/apache/tika/parser/html/HtmlParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/HtmlParser.java (revision 584197)
+++ src/main/java/org/apache/tika/parser/html/HtmlParser.java (working copy)
@@ -1,89 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerException;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.sax.SAXResult;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-import org.w3c.dom.Text;
-import org.w3c.tidy.Tidy;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Html parser
- */
-public class HtmlParser implements Parser {
-
- public void parse(
- InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
- try {
- Tidy tidy = new Tidy();
- tidy.setQuiet(true);
- tidy.setShowWarnings(false);
- tidy.setXHTML(true);
-
- Element root = tidy.parseDOM(stream, null).getDocumentElement();
-
- metadata.set(Metadata.CONTENT_TYPE, "text/html");
- extractElementTxt(root, Metadata.TITLE, "title", metadata);
-
- TransformerFactory factory = TransformerFactory.newInstance();
- Transformer transformer = factory.newTransformer();
- transformer.transform(new DOMSource(root), new SAXResult(handler));
- } catch (TransformerException e) {
- throw new TikaException("Failed to transform DOM to SAX", e);
- }
- }
-
- private void extractElementTxt(
- Element root, String name, String tag, Metadata metadata) {
- NodeList children = root.getElementsByTagName(tag);
- if (children != null) {
- if (children.getLength() > 0) {
- if (children.getLength() == 1) {
- Element node = (Element) children.item(0);
- Text txt = (Text) node.getFirstChild();
- if (txt != null) {
- metadata.set(name, txt.getData());
- }
- } else {
- for (int i = 0; i < children.getLength(); i++) {
- Element node = (Element) children.item(i);
- Text txt = (Text) node.getFirstChild();
- if (txt != null) {
- metadata.add(name, txt.getData());
- }
- }
- }
- }
- }
- }
-
-}
Index: src/main/java/org/apache/tika/parser/html/NekoHtmlParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/NekoHtmlParser.java (revision 0)
+++ src/main/java/org/apache/tika/parser/html/NekoHtmlParser.java (revision 0)
@@ -0,0 +1,146 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ContentHandlerDecorator;
+import org.apache.tika.parser.Parser;
+import org.cyberneko.html.parsers.SAXParser;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * Simple HTML parser implemented with NekoHTML.
+ * <p>
+ * SAX events produced by NekoHTML are routed through a handler that records
+ * the text of the document's title element as {@link Metadata#TITLE}.
+ */
+public class NekoHtmlParser implements Parser {
+
+    /**
+     * Parses an HTML stream with a NekoHTML SAX parser.
+     *
+     * @param stream HTML document to read
+     * @param handler content handler wrapped by the title-extracting handler
+     * @param metadata receives {@link Metadata#TITLE} when a title is found
+     * @throws IOException propagated from the underlying parser
+     * @throws SAXException propagated from the parser or the handler
+     * @throws TikaException declared by the signature; not thrown directly here
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        final SAXParser parser = new SAXParser();
+        parser.setContentHandler(
+                new TitleExtractingContentHandler(handler, metadata));
+        parser.parse(new InputSource(stream));
+    }
+
+    /**
+     * Decorator that accumulates the character data of the first TITLE
+     * element started inside the document head and stores it in the metadata
+     * at the end of the document. Element names are compared ignoring case
+     * because the case NekoHTML reports them in is configurable. Every
+     * callback still delegates to the superclass.
+     */
+    private static class TitleExtractingContentHandler extends
+            ContentHandlerDecorator {
+
+        /** Progress of the search for the title element. */
+        private enum Phase {
+            START, HTML, HEAD, TITLE, IGNORE
+        }
+
+        private final Metadata metadata;
+
+        private final StringBuilder title = new StringBuilder();
+
+        private Phase phase = Phase.START;
+
+        public TitleExtractingContentHandler(final ContentHandler handler,
+                final Metadata metadata) {
+            super(handler);
+            this.metadata = metadata;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String name,
+                Attributes atts) throws SAXException {
+            final String element = elementName(localName, name);
+            if (phase == Phase.START && "HTML".equalsIgnoreCase(element)) {
+                phase = Phase.HTML;
+            } else if (phase == Phase.HTML
+                    && "HEAD".equalsIgnoreCase(element)) {
+                phase = Phase.HEAD;
+            } else if (phase == Phase.HEAD
+                    && "TITLE".equalsIgnoreCase(element)) {
+                phase = Phase.TITLE;
+            }
+            super.startElement(uri, localName, name, atts);
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            if (phase == Phase.TITLE) {
+                title.append(ch, start, length);
+            }
+            super.characters(ch, start, length);
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String name)
+                throws SAXException {
+            final String element = elementName(localName, name);
+            if (phase == Phase.TITLE && "TITLE".equalsIgnoreCase(element)) {
+                phase = Phase.IGNORE;
+            } else if (phase == Phase.HEAD
+                    && "HEAD".equalsIgnoreCase(element)) {
+                // The head closed without a title; a TITLE element met later
+                // in the body is not the document title, so stop looking.
+                phase = Phase.IGNORE;
+            }
+            super.endElement(uri, localName, name);
+        }
+
+        @Override
+        public void endDocument() throws SAXException {
+            // Leave the metadata untouched when no title text was collected,
+            // rather than recording an empty title.
+            if (title.length() > 0) {
+                metadata.set(Metadata.TITLE, title.toString());
+            }
+            super.endDocument();
+        }
+
+        /**
+         * Returns the local name, or the qualified name when the parser
+         * reported no local name (namespace processing disabled).
+         */
+        private static String elementName(String localName, String qName) {
+            if (localName != null && localName.length() > 0) {
+                return localName;
+            }
+            return qName;
+        }
+    }
+}
Index: src/main/resources/tika-config.xml
===================================================================
--- src/main/resources/tika-config.xml (revision 584197)
+++ src/main/resources/tika-config.xml (working copy)
@@ -21,7 +21,7 @@
application/vnd.ms-powerpoint
-
+
text/html
application/x-asp
Index: pom.xml
===================================================================
--- pom.xml (revision 584197)
+++ pom.xml (working copy)
@@ -184,11 +184,6 @@
2.0.8
- jtidy
- jtidy
- 4aug2000r7-dev
-
-
com.ibm.icu
icu4j
3.4.4
@@ -199,6 +194,11 @@
1.2.14
+ nekohtml
+ nekohtml
+ 0.9.5
+
+
junit
junit
3.8.1