### Eclipse Workspace Patch 1.0 #P tika-parsers Index: src/test/resources/test-documents/boilerplate.html =================================================================== --- src/test/resources/test-documents/boilerplate.html (revision 0) +++ src/test/resources/test-documents/boilerplate.html (revision 0) @@ -0,0 +1,41 @@ + + + + + + Title + + + + + + + +
+ + + + + +
boilerplatetext
+
+ +

This is the real meat of the page, +and represents the text we want. +It has lots of juicy content. + +We assume that it won't get filtered out. +And that all of the lines will be in the +output. +

+ +

+Here's another paragraph of text. +This is the end of the text. +

+ +

footer

+ + + Property changes on: src/test/resources/test-documents/boilerplate.html ___________________________________________________________________ Added: svn:mime-type + text/html Added: svn:eol-style + native Index: src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java =================================================================== --- src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (revision 0) +++ src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (revision 0) @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.html; + +import java.io.Writer; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import de.l3s.boilerpipe.BoilerpipeExtractor; +import de.l3s.boilerpipe.BoilerpipeProcessingException; +import de.l3s.boilerpipe.document.TextBlock; +import de.l3s.boilerpipe.document.TextDocument; +import de.l3s.boilerpipe.extractors.ArticleExtractor; +import de.l3s.boilerpipe.extractors.DefaultExtractor; +import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; + +/** + * Uses the boilerpipe + * library to automatically extract the main content from a web page. + * + * Use this as a {@link ContentHandler} object passed to + * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)} + */ +public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler { + + private ContentHandler delegate; + private BoilerpipeExtractor extractor; + + /** + * Creates a new boilerpipe-based content extractor, using the + * {@link DefaultExtractor} extraction rules and "delegate" as the content handler. + * + * @param delegate + * The {@link ContentHandler} object + */ + public BoilerpipeContentHandler(ContentHandler delegate) { + this(delegate, DefaultExtractor.INSTANCE); + } + + /** + * Creates a content handler that writes XHTML body character events to + * the given writer. + * + * @param writer writer + */ + public BoilerpipeContentHandler(Writer writer) { + this(new WriteOutContentHandler(writer)); + } + + /** + * Creates a new boilerpipe-based content extractor, using the given + * extraction rules. The extracted main content will be passed to the + * content handler. + * + * @param delegate + * The {@link ContentHandler} object + * @param extractor + * Extraction rules to use, e.g. {@link ArticleExtractor} + */ + public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) { + this.delegate = delegate; + this.extractor = extractor; + } + + @Override + public void endDocument() throws SAXException { + super.endDocument(); + + TextDocument td = toTextDocument(); + try { + extractor.process(td); + } catch (BoilerpipeProcessingException e) { + throw new SAXException(e); + } + + delegate.startDocument(); + + for (TextBlock block : td.getTextBlocks()) { + if (block.isContent()) { + delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", null); + char[] chars = block.getText().toCharArray(); + delegate.characters(chars, 0, chars.length); + delegate.endElement(XHTMLContentHandler.XHTML, "p", "p"); + } + } + + delegate.endDocument(); + } +} Property changes on: src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:eol-style + native Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java =================================================================== --- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 961850) +++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy) @@ -31,6 +31,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.TeeContentHandler; +import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -377,4 +378,24 @@ assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING)); } + /** + * Test case for TIKA-420 + * @see TIKA-420 + */ + public void testBoilerplateRemoval() throws Exception { + String path = "/test-documents/boilerplate.html"; + + Metadata metadata = new Metadata(); + WriteOutContentHandler handler = new WriteOutContentHandler(); + new HtmlParser().parse( + HtmlParserTest.class.getResourceAsStream(path), + new BoilerpipeContentHandler(handler), metadata, new ParseContext()); + + String content = handler.toString(); + assertTrue(content.startsWith("This is the real meat")); + assertTrue(content.endsWith("This is the end of the text.")); + assertFalse(content.contains("boilerplate")); + assertFalse(content.contains("footer")); + } + } Index: pom.xml =================================================================== --- pom.xml (revision 961850) +++ pom.xml (working copy) @@ -38,6 +38,16 @@ 3.7-beta1 + + + + maven2-repository.dev.java.net + Java.net Repository for Maven + http://download.java.net/maven/2/ + default + + + ${project.groupId} @@ -123,6 +133,11 @@ 2.4.0-beta-1 + de.l3s.boilerpipe + boilerpipe + 1.0.4 + + junit junit test #P tika-app Index: src/main/java/org/apache/tika/cli/TikaCLI.java =================================================================== --- src/main/java/org/apache/tika/cli/TikaCLI.java (revision 961850) +++ src/main/java/org/apache/tika/cli/TikaCLI.java (working copy) @@ -43,6 +43,7 @@ import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.BoilerpipeContentHandler; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; @@ -88,6 +89,12 @@ } }; + private final OutputType TEXT_MAIN = new OutputType() { + public ContentHandler getContentHandler() throws Exception { + return new BoilerpipeContentHandler(getSystemOutWriter(encoding)); + } + }; + private final OutputType METADATA = new OutputType() { public ContentHandler getContentHandler() throws Exception { final PrintWriter writer = @@ -146,6 +153,8 @@ type = HTML; } else if (arg.equals("-t") || arg.equals("--text")) { type = TEXT; + } else if (arg.equals("-T") || arg.equals("--text-main")) { + type = TEXT_MAIN; } else if (arg.equals("-m") || arg.equals("--metadata")) { type = METADATA; } else { @@ -171,6 +180,7 @@ metadata, context); } finally { input.close(); + System.out.flush(); } } } @@ -188,6 +198,7 @@ out.println(" -x or --xml Output XHTML content (default)"); out.println(" -h or --html Output HTML content"); out.println(" -t or --text Output plain text content"); + out.println(" -T or --text-main Output plain text content (main content only)"); out.println(" -m or --metadata Output only metadata"); out.println(); out.println("Description:");