### Eclipse Workspace Patch 1.0 #P tika-parsers Index: src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java =================================================================== --- src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (revision 985009) +++ src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (working copy) @@ -21,8 +21,10 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; import de.l3s.boilerpipe.BoilerpipeExtractor; import de.l3s.boilerpipe.BoilerpipeProcessingException; @@ -96,11 +98,28 @@ throw new SAXException(e); } + Attributes emptyAttrs = new AttributesImpl(); + delegate.startDocument(); + delegate.startPrefixMapping("", XHTMLContentHandler.XHTML); + + delegate.startElement(XHTMLContentHandler.XHTML, "html", "html", emptyAttrs); + delegate.startElement(XHTMLContentHandler.XHTML, "head", "head", emptyAttrs); + delegate.startElement(XHTMLContentHandler.XHTML, "title", "title", emptyAttrs); + + if (td.getTitle() != null) { + char[] titleChars = td.getTitle().toCharArray(); + delegate.characters(titleChars, 0, titleChars.length); + } + + delegate.endElement(XHTMLContentHandler.XHTML, "title", "title"); + delegate.endElement(XHTMLContentHandler.XHTML, "head", "head"); + delegate.startElement(XHTMLContentHandler.XHTML, "body", "body", emptyAttrs); + for (TextBlock block : td.getTextBlocks()) { if (block.isContent()) { - delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", null); + delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs); char[] chars = block.getText().toCharArray(); delegate.characters(chars, 0, chars.length); delegate.endElement(XHTMLContentHandler.XHTML, "p", "p"); @@ -108,6 +127,11 @@ } } + 
delegate.endElement(XHTMLContentHandler.XHTML, "body", "body"); + delegate.endElement(XHTMLContentHandler.XHTML, "html", "html"); + + delegate.endPrefixMapping(""); + delegate.endDocument(); } } Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java =================================================================== --- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 986089) +++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy) @@ -555,6 +555,31 @@ } /** + * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer + * as delegate for BoilerpipeContentHandler + * @see TIKA-480 + */ + public void testBoilerplateDelegation() throws Exception { + String path = "/test-documents/boilerplate.html"; + + Metadata metadata = new Metadata(); + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + HtmlParserTest.class.getResourceAsStream(path), + makeHtmlTransformer(sw), metadata, new ParseContext()); + + String content = sw.toString(); + + // Should have <html>, <head>, <title>, <body> elements + assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content)); + assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content)); + assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content)); + assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content)); + } + + + + /** * Create ContentHandler that transforms SAX events into textual HTML output, * and writes it out to <writer> - typically this is a StringWriter. *