### Eclipse Workspace Patch 1.0 #P tika-parsers Index: src/main/java/org/apache/tika/parser/html/HtmlHandler.java =================================================================== --- src/main/java/org/apache/tika/parser/html/HtmlHandler.java (revision 985009) +++ src/main/java/org/apache/tika/parser/html/HtmlHandler.java (working copy) @@ -130,11 +130,19 @@ xhtml.startElement(safe); } else { AttributesImpl newAttributes = new AttributesImpl(atts); - for (int att=0;att SAFE_ELEMENTS = new HashMap() {{ + put("H1", "h1"); + put("H2", "h2"); + put("H3", "h3"); + put("H4", "h4"); + put("H5", "h5"); + put("H6", "h6"); - public String mapSafeElement(String name) { - // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd + put("P", "p"); + put("PRE", "pre"); + put("BLOCKQUOTE", "blockquote"); - if ("H1".equals(name)) return "h1"; - if ("H2".equals(name)) return "h2"; - if ("H3".equals(name)) return "h3"; - if ("H4".equals(name)) return "h4"; - if ("H5".equals(name)) return "h5"; - if ("H6".equals(name)) return "h6"; + put("UL", "ul"); + put("OL", "ol"); + put("MENU", "ul"); + put("LI", "li"); + put("DL", "dl"); + put("DT", "dt"); + put("DD", "dd"); - if ("P".equals(name)) return "p"; - if ("PRE".equals(name)) return "pre"; - if ("BLOCKQUOTE".equals(name)) return "blockquote"; + put("TABLE", "table"); + put("THEAD", "thead"); + put("TBODY", "tbody"); + put("TR", "tr"); + put("TH", "th"); + put("TD", "td"); - if ("UL".equals(name)) return "ul"; - if ("OL".equals(name)) return "ol"; - if ("MENU".equals(name)) return "ul"; - if ("LI".equals(name)) return "li"; - if ("DL".equals(name)) return "dl"; - if ("DT".equals(name)) return "dt"; - if ("DD".equals(name)) return "dd"; + put("ADDRESS", "address"); + + // TIKA-463 - add additional elements that contain URLs + put("AREA", "area"); + put("IMG", "img"); - if ("TABLE".equals(name)) return "table"; - if ("THEAD".equals(name)) return "thead"; - if ("TBODY".equals(name)) return "tbody"; - if ("TR".equals(name)) return "tr"; - if ("TH".equals(name)) return "th"; - if ("TD".equals(name)) return "td"; + }}; + + private static final Set DISCARDABLE_ELEMENTS = new HashSet() {{ + add("STYLE"); + add("SCRIPT"); + }}; - if ("ADDRESS".equals(name)) return "address"; + private static final Map> SAFE_ATTRIBUTES = new HashMap>() {{ + put("a", attrSet("rel", "name")); + put("img", attrSet("src")); + // TODO KKr - fill out this set. + }}; + + private static Set attrSet(String... attrs) { + Set result = new HashSet(); + for (String attr : attrs) { + result.add(attr); + } + return result; + } + + /** + * @since Apache Tika 0.8 + */ + public static final HtmlMapper INSTANCE = new DefaultHtmlMapper(); - return null; + public String mapSafeElement(String name) { + return SAFE_ELEMENTS.get(name); } - /** Normalises an attribute name. Assumes that the element name - * is valid and normalised **/ + /** Normalizes an attribute name. Assumes that the element name + * is valid and normalized + */ public String mapSafeAttribute(String elementName, String attributeName) { - return null; - } + Set safeAttrs = SAFE_ATTRIBUTES.get(elementName); + if ((safeAttrs != null) && safeAttrs.contains(attributeName)) { + return attributeName; + } else { + return null; + } + } public boolean isDiscardElement(String name) { - return "STYLE".equals(name) || "SCRIPT".equals(name); + return DISCARDABLE_ELEMENTS.contains(name); } } Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java =================================================================== --- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 985028) +++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy) @@ -450,6 +450,32 @@ } + /** + * Test case for TIKA-463. Don't skip elements that have URLs. + * @see TIKA-463 + */ + public void testImgUrlExtraction() throws Exception { + final String test = "Title" + + "" + + ""; + SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance(); + TransformerHandler handler = factory.newTransformerHandler(); + handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); + handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); + handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8"); + StringWriter sw = new StringWriter(); + handler.setResult(new StreamResult(sw)); + + new HtmlParser().parse( + new ByteArrayInputStream(test.getBytes("UTF-8")), + handler, new Metadata(), new ParseContext()); + + String result = sw.toString(); + + // tag should exist, with fully resolved URL + assertTrue(Pattern.matches("(?s).*.*$", result)); + } + }