### Eclipse Workspace Patch 1.0
#P tika-parsers
Index: src/test/resources/test-documents/boilerplate.html
===================================================================
--- src/test/resources/test-documents/boilerplate.html (revision 0)
+++ src/test/resources/test-documents/boilerplate.html (revision 0)
@@ -0,0 +1,41 @@
+
+
+
+
+
+ Title
+
+
+
+
+
+This is the real meat of the page,
+and represents the text we want.
+It has lots of juicy content.
+
+We assume that it won't get filtered out.
+And that all of the lines will be in the
+output.
+
+
+
+Here's another paragraph of text.
+This is the end of the text.
+
+
+footer
+
+
+
Property changes on: src/test/resources/test-documents/boilerplate.html
___________________________________________________________________
Added: svn:mime-type
+ text/html
Added: svn:eol-style
+ native
Index: src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (revision 0)
+++ src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (revision 0)
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ *
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+    private final ContentHandler delegate;
+    private final BoilerpipeExtractor extractor;
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+     *
+     * @param delegate the {@link ContentHandler} that receives the extracted main content
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate) {
+        this(delegate, DefaultExtractor.INSTANCE);
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the {@link DefaultExtractor}
+     * extraction rules, that sends the extracted main content to the given writer.
+     *
+     * @param writer writer wrapped in a {@link WriteOutContentHandler} and used as the delegate
+     */
+    public BoilerpipeContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the given
+     * extraction rules; the extracted main content is passed to the delegate.
+     *
+     * @param delegate the {@link ContentHandler} that receives the extracted main content
+     * @param extractor extraction rules to use, e.g. {@link ArticleExtractor}
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+        this.delegate = delegate;
+        this.extractor = extractor;
+    }
+
+    /**
+     * Runs the extractor on the collected document, then replays every block it
+     * marks as content to the delegate as an XHTML "p" element.
+     * @throws SAXException if boilerpipe processing fails
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+        TextDocument td = toTextDocument();
+        try {
+            extractor.process(td);
+        } catch (BoilerpipeProcessingException e) {
+            throw new SAXException(e);
+        }
+
+        // SAX requires a non-null (possibly empty) Attributes object; null breaks handlers that read it.
+        org.xml.sax.Attributes noAttributes = new org.xml.sax.helpers.AttributesImpl();
+        delegate.startDocument();
+        for (TextBlock block : td.getTextBlocks()) {
+            if (block.isContent()) {
+                delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", noAttributes);
+                char[] chars = block.getText().toCharArray();
+                delegate.characters(chars, 0, chars.length);
+                delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+            }
+        }
+        delegate.endDocument();
+    }
+}
Property changes on: src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Added: svn:eol-style
+ native
Index: src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (revision 961850)
+++ src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (working copy)
@@ -31,6 +31,7 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -377,4 +378,24 @@
assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
}
+    /**
+     * Test case for TIKA-420: only the main text of the page may reach the output.
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
+     */
+    public void testBoilerplateRemoval() throws Exception {
+        String path = "/test-documents/boilerplate.html";
+        WriteOutContentHandler handler = new WriteOutContentHandler();
+        java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
+        try {
+            new HtmlParser().parse(stream, new BoilerpipeContentHandler(handler), new Metadata(), new ParseContext());
+        } finally {
+            stream.close(); // do not rely on parse() to release the resource stream
+        }
+        String content = handler.toString();
+        assertTrue(content.startsWith("This is the real meat"));
+        assertTrue(content.endsWith("This is the end of the text."));
+        assertFalse(content.contains("boilerplate"));
+        assertFalse(content.contains("footer"));
+    }
+
}
Index: pom.xml
===================================================================
--- pom.xml (revision 961850)
+++ pom.xml (working copy)
@@ -38,6 +38,16 @@
3.7-beta1
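+
+  <repositories>
+    <repository>
+      <id>maven2-repository.dev.java.net</id>
+      <name>Java.net Repository for Maven</name>
+      <url>http://download.java.net/maven/2/</url>
+      <layout>default</layout>
+    </repository>
+  </repositories>
+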
${project.groupId}
@@ -123,6 +133,11 @@
2.4.0-beta-1
+      <groupId>de.l3s.boilerpipe</groupId>
+      <artifactId>boilerpipe</artifactId>
+      <version>1.0.4</version>
+    </dependency>
+    <dependency>
junit
junit
test
#P tika-app
Index: src/main/java/org/apache/tika/cli/TikaCLI.java
===================================================================
--- src/main/java/org/apache/tika/cli/TikaCLI.java (revision 961850)
+++ src/main/java/org/apache/tika/cli/TikaCLI.java (working copy)
@@ -43,6 +43,7 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
@@ -88,6 +89,12 @@
}
};
+ private final OutputType TEXT_MAIN = new OutputType() { // output type selected by -T / --text-main
+ public ContentHandler getContentHandler() throws Exception {
+ return new BoilerpipeContentHandler(getSystemOutWriter(encoding)); // main content only; presumably a System.out writer in "encoding" -- confirm
+ }
+ };
+
private final OutputType METADATA = new OutputType() {
public ContentHandler getContentHandler() throws Exception {
final PrintWriter writer =
@@ -146,6 +153,8 @@
type = HTML;
} else if (arg.equals("-t") || arg.equals("--text")) {
type = TEXT;
+ } else if (arg.equals("-T") || arg.equals("--text-main")) {
+ type = TEXT_MAIN;
} else if (arg.equals("-m") || arg.equals("--metadata")) {
type = METADATA;
} else {
@@ -171,6 +180,7 @@
metadata, context);
} finally {
input.close();
+ System.out.flush();
}
}
}
@@ -188,6 +198,7 @@
out.println(" -x or --xml Output XHTML content (default)");
out.println(" -h or --html Output HTML content");
out.println(" -t or --text Output plain text content");
+ out.println(" -T or --text-main Output plain text content (main content only)");
out.println(" -m or --metadata Output only metadata");
out.println();
out.println("Description:");