### Eclipse Workspace Patch 1.0 #P tika-parsers Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java =================================================================== --- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 985288) +++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy) @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; +import java.io.Writer; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; @@ -416,17 +417,10 @@ "" + "

Simple Content

"; - SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); - TransformerHandler handler = factory.newTransformerHandler(); - handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); - handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); - handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); StringWriter sw = new StringWriter(); - handler.setResult(new StreamResult(sw)); - new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("UTF-8")), - handler, new Metadata(), new ParseContext()); + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); @@ -459,17 +453,10 @@ "" + ""; - SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); - TransformerHandler handler = factory.newTransformerHandler(); - handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); - handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); - handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); StringWriter sw = new StringWriter(); - handler.setResult(new StreamResult(sw)); - new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("UTF-8")), - handler, new Metadata(), new ParseContext()); + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); @@ -486,17 +473,10 @@ "" + ""; - SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); - TransformerHandler handler = factory.newTransformerHandler(); - handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); - handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); - handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); StringWriter sw = new StringWriter(); - handler.setResult(new StreamResult(sw)); - new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("UTF-8")), - handler, new Metadata(), new 
ParseContext()); + makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); @@ -505,33 +485,41 @@ } /** + * Test case for change related to TIKA-463. Verify proper handling of {@code <meta>} tags. + * @see TIKA-463 + */ + public void testMetaTagHandling() throws Exception { + final String test = "

header

some text

"; + + Metadata metadata = new Metadata(); + metadata.add("Content-Type", "text/html; charset=utf-8"); + metadata.add("Language", null); + + StringWriter sw = new StringWriter(); + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes("UTF-8")), + makeHtmlTransformer(sw), metadata, new ParseContext()); + + String result = sw.toString(); + + // <meta> tag for Content-Type should exist, but nothing for Language + assertTrue(Pattern.matches("(?s).*.*$", result)); + assertFalse(Pattern.matches("(?s).* inside of . * @see TIKA-457 */ public void testFBrokenrameset() throws Exception { - final String test2 = " my title " + - "" + - "" + - "" + - "" + - ""; - - SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); - TransformerHandler handler = factory.newTransformerHandler(); - handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); - handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); - handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); - final String test1 = "Title" + "" + ""; StringWriter sw1 = new StringWriter(); - handler.setResult(new StreamResult(sw1)); - new HtmlParser().parse( new ByteArrayInputStream(test1.getBytes("UTF-8")), - handler, new Metadata(), new ParseContext()); + makeHtmlTransformer(sw1), new Metadata(), new ParseContext()); String result = sw1.toString(); @@ -541,12 +529,18 @@ // tag should not exist. assertFalse(Pattern.matches("(?s).*.*$", result)); - StringWriter sw2 = new StringWriter(); - handler.setResult(new StreamResult(sw2)); + // Test the example from the Nutch project. 
+ final String test2 = " my title " + + "" + + "" + + "" + + "" + + ""; + StringWriter sw2 = new StringWriter(); new HtmlParser().parse( new ByteArrayInputStream(test2.getBytes("UTF-8")), - handler, new Metadata(), new ParseContext()); + makeHtmlTransformer(sw2), new Metadata(), new ParseContext()); result = sw2.toString(); @@ -560,5 +554,21 @@ assertFalse(Pattern.matches("(?s).*.*$", result)); } - + /** + * Create ContentHandler that transforms SAX events into textual HTML output, + * and writes it out to {@code writer} - typically this is a StringWriter. + * + * @param writer Where to write resulting HTML text. + * @return ContentHandler suitable for passing to parse() methods. + * @throws Exception if the transformer handler cannot be created + */ + private ContentHandler makeHtmlTransformer(Writer writer) throws Exception { + SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); + TransformerHandler handler = factory.newTransformerHandler(); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); + handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); + handler.setResult(new StreamResult(writer)); + return handler; + } } #P tika-core Index: src/main/java/org/apache/tika/sax/XHTMLContentHandler.java =================================================================== --- src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (revision 985288) +++ src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (working copy) @@ -157,11 +157,15 @@ } for (String value : metadata.getValues(name)) { - AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", "name", "name", "CDATA", name); - attributes.addAttribute("", "content", "content", "CDATA", value); - super.startElement(XHTML, "meta", "meta", attributes); - super.endElement(XHTML, "meta", "meta"); + // Putting null values into attributes causes problems, but is + // allowed by Metadata, so guard against 
that. + if (value != null) { + AttributesImpl attributes = new AttributesImpl(); + attributes.addAttribute("", "name", "name", "CDATA", name); + attributes.addAttribute("", "content", "content", "CDATA", value); + super.startElement(XHTML, "meta", "meta", attributes); + super.endElement(XHTML, "meta", "meta"); + } } }