/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.html; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; import java.io.Writer; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import javax.xml.transform.OutputKeys; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.sax.TransformerHandler; import javax.xml.transform.stream.StreamResult; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Geographic; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.LinkContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.TextContentHandler; import org.ccil.cowan.tagsoup.HTMLSchema; import org.ccil.cowan.tagsoup.Schema; import org.junit.Ignore; import org.junit.Test; import 
org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /** * JUnit tests that run HTML input through {@link HtmlParser} directly or * through {@link Tika#parseToString}, then assert on the extracted text, * metadata and SAX events. */ public class HtmlParserTest { /** * Parses the {@code /test-documents/testHTML.html} classpath resource and * checks the title, {@code Author}, {@code refresh}, latitude and longitude * metadata, the {@code href} and {@code name} attribute values seen on * {@code a} elements, and two phrases of body text. * * @throws Exception if parsing fails or the stream cannot be closed */ @Test public void testParseAscii() throws Exception { String path = "/test-documents/testHTML.html"; final StringWriter href = new StringWriter(); final StringWriter name = new StringWriter(); ContentHandler body = new BodyContentHandler(); Metadata metadata = new Metadata(); InputStream stream = HtmlParserTest.class.getResourceAsStream(path); try { /* Records the href of each "a" start tag, or its name when href is absent. */ ContentHandler link = new DefaultHandler() { @Override public void startElement( String u, String l, String n, Attributes a) throws SAXException { if ("a".equals(l)) { if (a.getValue("href") != null) { href.append(a.getValue("href")); } else if (a.getValue("name") != null) { name.append(a.getValue("name")); } } } }; /* The tee feeds the same SAX events to both the body collector and the link recorder. */ new HtmlParser().parse( stream, new TeeContentHandler(body, link), metadata, new ParseContext()); } finally { stream.close(); } assertEquals( "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Tika Developers", metadata.get("Author")); assertEquals("5", metadata.get("refresh")); assertEquals("51.2312", metadata.get(Geographic.LATITUDE)); assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE)); assertEquals("http://www.apache.org/", href.toString()); assertEquals("test-anchor", name.toString()); String content = body.toString(); assertTrue( "Did not contain expected text:" + "Test Indexation Html", content.contains("Test Indexation Html")); assertTrue( "Did not contain expected text:" + "Indexation du fichier", content.contains("Indexation du fichier")); } /** * Disabled through {@code @Ignore}: parses * {@code /test-documents/testXHTML_utf8.html} with {@link Tika#parseToString} * and checks that the text contains three expected phrases, two of which * include non-ASCII characters. */ @Test @Ignore("The file 'testXHTML_utf8.html' is not available fo testing") public void XtestParseUTF8() throws IOException, SAXException, TikaException { String path = "/test-documents/testXHTML_utf8.html"; Metadata metadata = new Metadata(); String content = new Tika().parseToString(
HtmlParserTest.class.getResourceAsStream(path), metadata); assertTrue("Did not contain expected text:" + "Title : Tilte with UTF-8 chars öäå", content .contains("Title : Tilte with UTF-8 chars öäå")); assertTrue("Did not contain expected text:" + "Content with UTF-8 chars", content .contains("Content with UTF-8 chars")); assertTrue("Did not contain expected text:" + "åäö", content .contains("åäö")); } /** * Parses the {@code /test-documents/testXHTML.html} classpath resource with * {@link Tika#parseToString} and checks the reported content type * ({@code application/xhtml+xml}), the title, {@code Author} and * {@code refresh} metadata, and three phrases of extracted text. * * @throws Exception if parsing fails */ @Test public void testXhtmlParsing() throws Exception { String path = "/test-documents/testXHTML.html"; Metadata metadata = new Metadata(); String content = new Tika().parseToString( HtmlParserTest.class.getResourceAsStream(path), metadata); assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE)); assertEquals("Tika Developers", metadata.get("Author")); assertEquals("5", metadata.get("refresh")); assertTrue(content.contains("ability of Apache Tika")); assertTrue(content.contains("extract content")); assertTrue(content.contains("an XHTML document")); } /** * Parses a zero-length byte stream and expects the body handler to have * collected the empty string. * * @throws Exception if parsing fails */ @Test public void testParseEmpty() throws Exception { ContentHandler handler = new BodyContentHandler(); new HtmlParser().parse( new ByteArrayInputStream(new byte[0]), handler, new Metadata(), new ParseContext()); assertEquals("", handler.toString()); } /** * Test case for TIKA-210: parses the UTF-8 bytes of the {@code test} * literal with {@link Tika#parseToString} and expects exactly "test" back. * NOTE(review): the method name refers to characters directly under the * body element, but the literal visible here carries no markup; confirm * against upstream. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a> */ @Test public void testCharactersDirectlyUnderBodyElement() throws Exception { String test = "test"; String content = new Tika().parseToString( new ByteArrayInputStream(test.getBytes("UTF-8"))); assertEquals("test", content); } /** * Test case for TIKA-287: checks, through {@code assertRelativeLink}, the * link URL expected for a series of base URL / relative reference pairs, * including path-relative, root-relative, absolute and query-only references. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a> */ @Test public void testBaseHref() throws Exception { assertRelativeLink( "http://lucene.apache.org/tika/", "http://lucene.apache.org/", "tika/"); assertRelativeLink( "http://domain.com/?pid=1", "http://domain.com", "?pid=1"); assertRelativeLink( "http://domain.com/?pid=2", "http://domain.com?pid=1", "?pid=2"); assertRelativeLink( "http://domain.com/file.html",
"http://domain.com/path/", "/file.html"); assertRelativeLink( "http://domain.com/path/file.html", "http://domain.com/path/", "./file.html"); assertRelativeLink( "http://domain.com/path/file.html", "http://domain.com/path/", "file.html"); assertRelativeLink( "http://domain2.com/newpath", "http://domain.com/path/to/file", "http://domain2.com/newpath"); // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx // Also http://www.ietf.org/rfc/rfc3986.txt // Also http://issues.apache.org/jira/browse/NUTCH-566 // Also http://issues.apache.org/jira/browse/NUTCH-436 assertRelativeLink( "http://domain.com/path/?pid=1", "http://domain.com/path/", "?pid=1"); assertRelativeLink( "http://domain.com/file?pid=1", "http://domain.com/file", "?pid=1"); assertRelativeLink( "http://domain.com/path/d;p?pid=1", "http://domain.com/path/d;p?q#f", "?pid=1"); } /** * Parses a small document with {@link HtmlParser}, collects the * {@code href} attribute of every {@code a} start tag and asserts that * exactly one was seen and that it equals {@code url}. * * @param url the href value the parser is expected to report * @param base base URL of the case under test. NOTE(review): not referenced * in the visible body; the markup that presumably embedded it appears * to be missing from the {@code test} literal, confirm against upstream * @param relative relative reference of the case under test; same caveat * as {@code base} * @throws Exception if parsing fails */ private void assertRelativeLink(String url, String base, String relative) throws Exception { String test = "" + "test"; /* Typed list instead of the raw List/ArrayList, so no unchecked warnings. */ final List<String> links = new ArrayList<String>(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("UTF-8")), new DefaultHandler() { @Override public void startElement( String u, String l, String name, Attributes atts) { /* Constant-first equals: null-safe and consistent with testParseAscii. */ if ("a".equals(name) && atts.getValue("", "href") != null) { links.add(atts.getValue("", "href")); } } }, new Metadata(), new ParseContext()); assertEquals(1, links.size()); assertEquals(url, links.get(0)); } /** * Test case for TIKA-268: expects the extracted text to contain "a" and * "b" but not the run-together "ab". * * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a> */ @Test public void testWhitespaceBetweenTableCells() throws Exception { String test = "
ab
"; String content = new Tika().parseToString( new ByteArrayInputStream(test.getBytes("UTF-8"))); assertTrue(content.contains("a")); assertTrue(content.contains("b")); assertFalse(content.contains("ab")); } /** * Test case for TIKA-332: parses ISO-8859-1 encoded bytes and expects the * {@code Metadata.CONTENT_ENCODING} value to be {@code ISO-8859-1}. * NOTE(review): the test name implies the charset is declared through a * meta http-equiv element, but no such markup is visible in the literal; * confirm against upstream. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a> */ @Test public void testHttpEquivCharset() throws Exception { String test = "" + "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new HtmlParser().parse ( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-892: parses bytes encoded as ISO-8859-1 and expects * {@code Metadata.CONTENT_ENCODING} to be reported as {@code ISO-8859-15}. * NOTE(review): presumably the document declares that charset HTML5-style * (per the test name); the declaring markup is not visible in the literal. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a> */ @Test public void testHtml5Charset() throws Exception { String test = "" + "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-334: parses the UTF-8 bytes of a single non-ASCII * character (U+017D) and expects that same character back as the title * metadata. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a> */ @Test public void testDetectOfCharset() throws Exception { String test = "\u017d"; Metadata metadata = new Metadata(); new HtmlParser().parse ( new ByteArrayInputStream(test.getBytes("UTF-8")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("\u017d", metadata.get(TikaCoreProperties.TITLE)); } /** * Test case for TIKA-341: parses the same text twice. First as UTF-8 bytes * with fresh metadata, expecting {@code UTF-8} as content encoding; then as * ISO-8859-1 bytes with {@code Metadata.CONTENT_TYPE} preset to * "text/html; charset=ISO-8859-1", expecting {@code ISO-8859-1}. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> */ @Test public void testUsingCharsetInContentTypeHeader() throws Exception { final String test = "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new HtmlParser().parse ( new ByteArrayInputStream(test.getBytes("UTF-8")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1"); new HtmlParser().parse ( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), new BodyContentHandler(), metadata, new
ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for HTML content like * {@code <div>foo<br>bar</div>baz} that should result * in three whitespace-separated tokens "foo", "bar" and "baz" instead * of a single token "foobarbaz". * * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a> */ @Test public void testLineBreak() throws Exception { String test = "
foo
bar
baz"; String text = new Tika().parseToString( new ByteArrayInputStream(test.getBytes("US-ASCII"))); String[] parts = text.trim().split("\\s+"); assertEquals(3, parts.length); assertEquals("foo", parts[0]); assertEquals("bar", parts[1]); assertEquals("baz", parts[2]); } /** * Test case for TIKA-339: Don't use language returned by CharsetDetector. * Presets {@code Metadata.CONTENT_LANGUAGE} to "en", parses a short * document and expects the value to still be "en" afterwards. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a> */ @Test public void testIgnoreCharsetDetectorLanguage() throws Exception { String test = "Simple Content"; Metadata metadata = new Metadata(); metadata.add(Metadata.CONTENT_LANGUAGE, "en"); new HtmlParser().parse ( new ByteArrayInputStream(test.getBytes("UTF-8")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE)); } /** * Test case for TIKA-349: parses two documents as ISO-8859-1 bytes and * expects {@code ISO-8859-15} as content encoding for both; per the inline * comment the second one covers a malformed ';;' separator. * NOTE(review): the charset-declaring markup is not visible in either * literal; confirm against upstream. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a> */ @Test public void testHttpEquivCharsetFunkyAttributes() throws Exception { String test1 = "" + "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new HtmlParser().parse ( new ByteArrayInputStream(test1.getBytes("ISO-8859-1")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); // Some HTML pages have errors like ';;' versus '; ' as separator String test2 = "" + "the name is \u00e1ndre" + ""; metadata = new Metadata(); new HtmlParser().parse ( new ByteArrayInputStream(test2.getBytes("ISO-8859-1")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-15", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-350: parses the same text twice. First as UTF-8 bytes * with fresh metadata, expecting {@code UTF-8}; then as ISO-8859-1 bytes * with {@code Metadata.CONTENT_TYPE} preset to the oddly ordered value * "charset=ISO-8859-1;text/html", expecting {@code ISO-8859-1}. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a> */ @Test public void testUsingFunkyCharsetInContentTypeHeader() throws Exception { final String test = "the name is \u00e1ndre" + ""; Metadata metadata = new Metadata(); new HtmlParser().parse ( new ByteArrayInputStream(test.getBytes("UTF-8")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING)); metadata = new Metadata(); metadata.set(Metadata.CONTENT_TYPE,
"charset=ISO-8859-1;text/html"); new HtmlParser().parse ( new ByteArrayInputStream(test.getBytes("ISO-8859-1")), new BodyContentHandler(), metadata, new ParseContext()); assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-357: parses the * {@code /test-documents/big-preamble.html} classpath resource and expects * {@code windows-1251} as content encoding. The resource stream is closed * in a finally block, the same way testParseAscii does it. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a> */ @Test public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception { String path = "/test-documents/big-preamble.html"; Metadata metadata = new Metadata(); InputStream stream = HtmlParserTest.class.getResourceAsStream(path); try { new HtmlParser().parse( stream, new BodyContentHandler(), metadata, new ParseContext()); } finally { /* closing the stream is left to the caller, so do it here */ stream.close(); } assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING)); } /** * Test case for TIKA-420: parses the * {@code /test-documents/boilerplate.html} classpath resource through a * {@code BoilerpipeContentHandler} wrapping a {@code BodyContentHandler} * and checks that the collected text starts and ends with the expected * sentences and contains neither "boilerplate" nor "footer". The resource * stream is closed in a finally block, the same way testParseAscii does it. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a> */ @Test public void testBoilerplateRemoval() throws Exception { String path = "/test-documents/boilerplate.html"; Metadata metadata = new Metadata(); BodyContentHandler handler = new BodyContentHandler(); InputStream stream = HtmlParserTest.class.getResourceAsStream(path); try { new HtmlParser().parse( stream, new BoilerpipeContentHandler(handler), metadata, new ParseContext()); } finally { /* closing the stream is left to the caller, so do it here */ stream.close(); } String content = handler.toString(); assertTrue(content.startsWith("This is the real meat")); assertTrue(content.endsWith("This is the end of the text.\n")); assertFalse(content.contains("boilerplate")); assertFalse(content.contains("footer")); } /** * Test case for TIKA-478. Don't emit {@code <head>} sub-elements inside of {@code <body>}. * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a> */ @Test public void testElementOrdering() throws Exception { final String test = "Title" + "" + "" + "

Simple Content

"; StringWriter sw = new StringWriter(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("UTF-8")), makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); // Title element in head section assertTrue(Pattern.matches("(?s).*Title.*.*$", result)); // No meta elements in body assertFalse(Pattern.matches("(?s).*.*.*$", result)); // meta elements should show up in head section assertTrue(Pattern.matches("(?s).*.*$", result)); // No link elements in body assertFalse(Pattern.matches("(?s).*.*.*$", result)); // link element should be in head section assertTrue(Pattern.matches("(?s).*.*$", result)); // There should be ending elements. assertTrue(Pattern.matches("(?s).*.*$", result)); } /** * Test case for TIKA-463. Don't skip img elements that have URLs: sends the * parser's SAX events to the handler from {@code makeHtmlTransformer(sw)} * and expects the resulting string to contain * src="http://domain.com/image.jpg". * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test public void testImgUrlExtraction() throws Exception { final String test = "Title" + "" + ""; StringWriter sw = new StringWriter(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("UTF-8")), makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); // img tag should exist, with fully resolved URL assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result)); } /** * Test case for TIKA-463. Don't skip frame elements (per the test name) * that have URLs. NOTE(review): the pattern asserted below no longer names * any element or URL; confirm the expected markup against upstream. * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a> */ @Test public void testFrameSrcExtraction() throws Exception { final String test = "Title" + "" + ""; StringWriter sw = new StringWriter(); new HtmlParser().parse( new ByteArrayInputStream(test.getBytes("UTF-8")), makeHtmlTransformer(sw), new Metadata(), new ParseContext()); String result = sw.toString(); // frame tag should exist, with fully resolved URL assertTrue(Pattern.matches("(?s).*.*$", result)); } /** * Test case for TIKA-463. Don't skip elements that have URLs. * @see TIKA-463 */ @Test public void testIFrameSrcExtraction() throws Exception { final String test = "Title" + "" + "