### Eclipse Workspace Patch 1.0
#P tika-parsers
Index: src/main/java/org/apache/tika/parser/html/HtmlHandler.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/HtmlHandler.java (revision 986131)
+++ src/main/java/org/apache/tika/parser/html/HtmlHandler.java (working copy)
@@ -118,7 +118,7 @@
                         resolve(atts.getValue("href").trim()));
                 xhtml.startElement(uri, local, "base", atts);
             } else if ("LINK".equals(name) && atts.getValue("href") != null) {
-                xhtml.startElement(uri, local, "link", atts);
+                startElementWithSafeAttributes("link", atts);
             }
         }
 
@@ -143,23 +143,7 @@
                 else if (atts.getLength() == 0) {
                     xhtml.startElement(safe);
                 } else {
-                    AttributesImpl newAttributes = new AttributesImpl(atts);
-                    for (int att = 0; att < newAttributes.getLength(); att++) {
-                        String normAttrName = mapper.mapSafeAttribute(safe, newAttributes.getLocalName(att));
-                        if (normAttrName == null) {
-                            newAttributes.removeAttribute(att);
-                            att--;
-                        } else {
-                            // We have a remapped attribute name, so set it as it might have changed.
-                            newAttributes.setLocalName(att, normAttrName);
-
-                            // And resolve relative links for the src attribute.
-                            if (normAttrName.equals("src")) {
-                                newAttributes.setValue(att, resolve(newAttributes.getValue(att).trim()));
-                            }
-                        }
-                    }
-                    xhtml.startElement(safe, newAttributes);
+                    startElementWithSafeAttributes(safe, atts);
                 }
             }
         }
@@ -167,6 +151,40 @@
         title.setLength(0);
     }
 
+    /**
+     * Starts an output element whose attributes have been filtered and
+     * normalized by the HTML mapper. Attributes for which
+     * {@code mapper.mapSafeAttribute} returns {@code null} are dropped, the
+     * remaining ones take the mapped local name, and the trimmed values of
+     * {@code src} and {@code href} attributes are passed through
+     * {@code resolve}.
+     *
+     * @param name element name to emit; also used to look up safe attributes
+     * @param atts attributes of the incoming element (copied, not modified)
+     * @throws SAXException if {@code xhtml.startElement} fails
+     */
+    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+        AttributesImpl newAttributes = new AttributesImpl(atts);
+        for (int att = 0; att < newAttributes.getLength(); att++) {
+            String normAttrName = mapper.mapSafeAttribute(name, newAttributes.getLocalName(att));
+            if (normAttrName == null) {
+                newAttributes.removeAttribute(att);
+                // Removal shifts the following attributes down, so revisit this index.
+                att--;
+            } else {
+                // We have a remapped attribute name, so set it as it might have changed.
+                newAttributes.setLocalName(att, normAttrName);
+
+                // And resolve relative links for the href & src attributes.
+                if (normAttrName.equals("src") || normAttrName.equals("href")) {
+                    newAttributes.setValue(att, resolve(newAttributes.getValue(att).trim()));
+                }
+            }
+        }
+
+        xhtml.startElement(name, newAttributes);
+    }
+
     @Override
     public void endElement(
             String uri, String local, String name) throws SAXException {
Index: src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (revision 986131)
+++ src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (working copy)
@@ -78,6 +78,7 @@
         put("a", attrSet("rel", "name"));
         put("img", attrSet("src"));
         put("frame", attrSet("src"));
+        put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
         // TODO KKr - fill out this set.
     }};
 
Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 986131)
+++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy)
@@ -577,6 +577,26 @@
         assertTrue(Pattern.matches("(?s).*.*.*$", content));
     }
 
+    /**
+     * Test case for TIKA-481. Verify href in &lt;link&gt; is resolved.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
+     */
+    public void testLinkHrefResolution() throws Exception {
+        final String test = "<html><head><title>Title</title>" +
+        "<base href=\"http://domain.com\" />" +
+        "<link rel=\"next\" href=\"next.html\" />" +
+        "</head><body></body></html>";
+
+        StringWriter sw = new StringWriter();
+        new HtmlParser().parse(
+                new ByteArrayInputStream(test.getBytes("UTF-8")),
+                makeHtmlTransformer(sw), new Metadata(), new ParseContext());
+
+        String result = sw.toString();
+
+        // <link> tag should exist in <head>, with fully resolved URL
+        assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/?>.*</head>.*$", result));
+    }
     /**