### Eclipse Workspace Patch 1.0 #P tika-parsers Index: src/main/java/org/apache/tika/parser/html/HtmlHandler.java =================================================================== --- src/main/java/org/apache/tika/parser/html/HtmlHandler.java (revision 985052) +++ src/main/java/org/apache/tika/parser/html/HtmlHandler.java (working copy) @@ -79,7 +79,7 @@ if ("TITLE".equals(name) || titleLevel > 0) { titleLevel++; } - if ("BODY".equals(name) || bodyLevel > 0) { + if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) { bodyLevel++; } if (mapper.isDiscardElement(name) || discardLevel > 0) { Index: src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java =================================================================== --- src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (revision 985052) +++ src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (working copy) @@ -62,6 +62,8 @@ // TIKA-463 - add additional elements that contain URLs put("AREA", "area"); put("IMG", "img"); + put("FRAMESET", "frameset"); + put("FRAME", "frame"); }}; @@ -73,6 +75,7 @@ private static final Map> SAFE_ATTRIBUTES = new HashMap>() {{ put("a", attrSet("rel", "name")); put("img", attrSet("src")); + put("frame", attrSet("src")); // TODO KKr - fill out this set. }}; Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java =================================================================== --- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 985052) +++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy) @@ -477,5 +477,88 @@ assertTrue(Pattern.matches("(?s).*.*$", result)); } + /** + * Test case for TIKA-463. Don't skip elements that have URLs. 
+ * @see TIKA-463 + */ + public void testFrameSrcExtraction() throws Exception { + final String test = "Title" + + "" + + ""; + + SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); + TransformerHandler handler = factory.newTransformerHandler(); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); + handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); + StringWriter sw = new StringWriter(); + handler.setResult(new StreamResult(sw)); + + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes("UTF-8")), + handler, new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // tag should exist, with fully resolved URL + assertTrue(Pattern.matches("(?s).*.*$", result)); + } + + /** + * Test case for TIKA-457. Better handling for broken HTML that has a frameset element nested inside of the body element. + * @see TIKA-457 + */ + public void testFBrokenrameset() throws Exception { + final String test2 = " my title " + + "" + + "" + + "" + + "" + + ""; + + SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); + TransformerHandler handler = factory.newTransformerHandler(); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); + handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); + + final String test1 = "Title" + + "" + + ""; + + StringWriter sw1 = new StringWriter(); + handler.setResult(new StreamResult(sw1)); + + new HtmlParser().parse( + new ByteArrayInputStream(test1.getBytes("UTF-8")), + handler, new Metadata(), new ParseContext()); + + String result = sw1.toString(); + + // tag should exist, with fully resolved URL + assertTrue(Pattern.matches("(?s).*.*$", result)); + + // tag should not exist. 
+ assertFalse(Pattern.matches("(?s).*.*$", result)); + + StringWriter sw2 = new StringWriter(); + handler.setResult(new StreamResult(sw2)); + + new HtmlParser().parse( + new ByteArrayInputStream(test2.getBytes("UTF-8")), + handler, new Metadata(), new ParseContext()); + + result = sw2.toString(); + + // tags should exist, with relative URL (no base element specified) + assertTrue(Pattern.matches("(?s).*.*$", result)); + assertTrue(Pattern.matches("(?s).*.*$", result)); + assertTrue(Pattern.matches("(?s).*.*$", result)); + assertTrue(Pattern.matches("(?s).*.*$", result)); + + // tag should not exist. + assertFalse(Pattern.matches("(?s).*.*$", result)); + } + } #P tika-core Index: src/main/java/org/apache/tika/sax/XHTMLContentHandler.java =================================================================== --- src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (revision 985028) +++ src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (working copy) @@ -59,13 +59,13 @@ * skip them if they get sent to startElement/endElement by mistake. */ private static final Set AUTO = - unmodifiableSet("html", "head", "body"); + unmodifiableSet("html", "head", "body", "frameset"); /** * The elements that get prepended with the {@link #TAB} character. */ private static final Set INDENT = - unmodifiableSet("li", "dd", "dt", "td", "th"); + unmodifiableSet("li", "dd", "dt", "td", "th", "frame"); /** * The elements that get appended with the {@link #NL} character. 
@@ -93,7 +93,8 @@ */ private boolean headStarted = false; private boolean headEnded = false; - + private boolean useFrameset = false; + public XHTMLContentHandler(ContentHandler handler, Metadata metadata) { super(handler); this.metadata = metadata; @@ -138,14 +139,15 @@ * <head> * <title>...</title> * </head> - * <body> + * <body> (or <frameset>) * */ - private void lazyEndHead() throws SAXException { + private void lazyEndHead(boolean isFrameset) throws SAXException { lazyStartHead(); if (!headEnded) { headEnded = true; + useFrameset = isFrameset; // TIKA-478: Emit all metadata values (other than title). We have to call // startElement() and characters() directly to avoid recursive problems. @@ -156,7 +158,8 @@ for (String value : metadata.getValues(name)) { AttributesImpl attributes = new AttributesImpl(); - attributes.addAttribute("", name, name, "CDATA", value); + attributes.addAttribute("", "name", "name", "CDATA", name); + attributes.addAttribute("", "content", "content", "CDATA", value); super.startElement(XHTML, "meta", "meta", attributes); super.endElement(XHTML, "meta", "meta"); } @@ -172,7 +175,12 @@ super.endElement(XHTML, "title", "title"); super.endElement(XHTML, "head", "head"); - super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES); + + if (useFrameset) { + super.startElement(XHTML, "frameset", "frameset", EMPTY_ATTRIBUTES); + } else { + super.startElement(XHTML, "body", "body", EMPTY_ATTRIBUTES); + } } } @@ -186,9 +194,14 @@ */ @Override public void endDocument() throws SAXException { - lazyEndHead(); + lazyEndHead(useFrameset); + + if (useFrameset) { + super.endElement(XHTML, "frameset", "frameset"); + } else { + super.endElement(XHTML, "body", "body"); + } - super.endElement(XHTML, "body", "body"); super.endElement(XHTML, "html", "html"); endPrefixMapping(""); @@ -204,11 +217,13 @@ String uri, String local, String name, Attributes attributes) throws SAXException { - if (!AUTO.contains(name)) { + if (name.equals("frameset")) { + 
lazyEndHead(true); + } else if (!AUTO.contains(name)) { if (HEAD.contains(name)) { lazyStartHead(); } else { - lazyEndHead(); + lazyEndHead(false); } if (XHTML.equals(uri) && INDENT.contains(name)) { @@ -238,7 +253,7 @@ */ @Override public void characters(char[] ch, int start, int length) throws SAXException { - lazyEndHead(); + lazyEndHead(useFrameset); super.characters(ch, start, length); }