Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml (revision 1567870)
+++ src/plugin/build.xml (working copy)
@@ -91,6 +91,7 @@
+
Index: src/plugin/headings/build.xml
===================================================================
--- src/plugin/headings/build.xml (revision 1567870)
+++ src/plugin/headings/build.xml (working copy)
@@ -19,4 +19,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
Index: src/plugin/headings/sample/testHeaders.html
===================================================================
--- src/plugin/headings/sample/testHeaders.html (revision 0)
+++ src/plugin/headings/sample/testHeaders.html (revision 0)
@@ -0,0 +1,8 @@
+
+
+
+
+This is a test head h1
+This is a test head h2
+
+
Index: src/plugin/headings/sample/testMultivalueHeaders.html
===================================================================
--- src/plugin/headings/sample/testMultivalueHeaders.html (revision 0)
+++ src/plugin/headings/sample/testMultivalueHeaders.html (revision 0)
@@ -0,0 +1,10 @@
+
+
+
+
+Test header h1
+Test header h1 too
+Test header h2
+Test header h2 too
+Test header h2 with span
+
\ No newline at end of file
Index: src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java
===================================================================
--- src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java (revision 0)
+++ src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java (revision 0)
@@ -0,0 +1,100 @@
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.headings;
+
+import java.util.Set;
+import java.util.TreeSet;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/** Checks that parsing the sample pages puts their h1/h2 text into the parse metadata. */
+public class TestHeadingsParseFilter extends TestCase {
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testHeaders.html";
+  private String sampleFileMultival = "testMultivalueHeaders.html";
+
+  public TestHeadingsParseFilter(String name) {
+    super(name);
+  }
+
+  /**
+   * Fetches and parses a sample file, failing the test on any exception.
+   *
+   * @param fileName name of a file inside the test data directory
+   * @param conf configuration used to build the protocol and the parser
+   * @return the parse metadata of the sample document
+   */
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.toString());
+    }
+    return metadata;
+  }
+
+  /** Default configuration: the h1 and h2 text of the sample page must be present. */
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+    assertEquals("This is a test head h1", parseMeta.get("h1"));
+    assertEquals("This is a test head h2", parseMeta.get("h2"));
+  }
+
+  /** With headings.multivalued enabled, every h1/h2 of the sample must be collected. */
+  public void testMultiValueMetatags() {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("headings.multivalued", true);
+    Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+
+    assertContainsAll(parseMeta, "h1", "Test header h1", "Test header h1 too");
+    assertContainsAll(parseMeta, "h2", "Test header h2", "Test header h2 too",
+        "Test header h2 with span");
+  }
+
+  /** Asserts that each expected value is among the metadata values stored under name. */
+  private static void assertContainsAll(Metadata meta, String name, String... expected) {
+    Set<String> valueSet = new TreeSet<String>();
+    for (String val : meta.getValues(name)) {
+      valueSet.add(val);
+    }
+    for (String val : expected) {
+      assertTrue("One value of metatag with multiple values is missing: " + val,
+          valueSet.contains(val));
+    }
+  }
+}
Index: src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
===================================================================
--- src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (revision 1567870)
+++ src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (working copy)
@@ -19,7 +19,8 @@
import java.util.ArrayList;
import java.util.List;
-import java.util.regex.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
@@ -27,21 +28,55 @@
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
-import org.w3c.dom.*;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
/**
* HtmlParseFilter to retrieve h1 and h2 values from the DOM.
*/
public class HeadingsParseFilter implements HtmlParseFilter {
-
+ private Configuration conf;
+
/**
* Pattern used to strip surpluss whitespace
*/
protected static Pattern whitespacePattern = Pattern.compile("\\s+");
-
- private Configuration conf;
+
+ /**
+ * List of headings to collect
+ */
private String[] headings;
+
+ /**
+ * Whether we are multi valued, i.e. collect multiple instances of the same heading element
+ */
private boolean multiValued = false;
+
+ /**
+ * Limit on the number of instances collected per heading element; same as the headings.limit fallback
+ */
+ private int limit = 32;
+
+ /**
+ * Maximum length of a heading
+ */
+ private int maxLength = 256;
+
+ /**
+ * Minimum length of a heading
+ */
+ private int minLength = 2;
+
+ /**
+ * Whether to ignore headings that are within hyperlinks
+ */
+ private boolean ignoreHyperlinks = true;
+
+ /**
+ * Whether to truncate headings longer than the maximum length; if false they are skipped.
+ */
+ private boolean truncate = true;
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
@@ -70,6 +105,15 @@
headings = conf.getStrings("headings");
multiValued = conf.getBoolean("headings.multivalued", false);
+ limit = conf.getInt("headings.limit", 32);
+ maxLength = conf.getInt("headings.maxlength", 256);
+ minLength = conf.getInt("headings.minlength", 2);
+ truncate = conf.getBoolean("headings.truncate", true);
+ ignoreHyperlinks = conf.getBoolean("headings.ignore.hyperlinks", true);
+
+ if (!multiValued) {
+ limit = 1;
+ }
}
public Configuration getConf() {
@@ -85,16 +129,24 @@
while (walker.hasNext()) {
Node currentNode = walker.nextNode();
+
+ if (ignoreHyperlinks && currentNode.getNodeName().equalsIgnoreCase("a")) {
+ walker.skipChildren();
+ continue;
+ }
if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
+
if (element.equalsIgnoreCase(currentNode.getNodeName())) {
- headings.add(getNodeValue(currentNode));
-
- // Check for multiValued here, if disabled we don't need
- // to discover more headings.
- if (!multiValued) {
+ if (headings.size() >= limit) {
break;
}
+
+ String heading = getNodeValue(currentNode);
+
+ if (heading != null) {
+ headings.add(heading);
+ }
}
}
}
@@ -105,19 +157,39 @@
/**
* Returns the text value of the specified Node and child nodes
*/
- protected static String getNodeValue(Node node) {
+ protected String getNodeValue(Node node) {
+ NodeWalker walker = new NodeWalker(node);
StringBuilder buffer = new StringBuilder();
- NodeList children = node.getChildNodes();
-
- for (int i = 0; i < children.getLength(); i++) {
- if (children.item(i).getNodeType() == Node.TEXT_NODE) {
- buffer.append(children.item(i).getNodeValue());
+ while (walker.hasNext()) {
+ Node currentNode = walker.nextNode();
+
+ NodeList children = currentNode.getChildNodes();
+ for (int i = 0; i < children.getLength(); i++) {
+ if (children.item(i).getNodeType() == Node.TEXT_NODE) {
+ buffer.append(children.item(i).getNodeValue());
+ }
}
+
+ buffer.append(" ");
}
// Return with stripped surplus whitespace
Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
- return matcher.replaceAll(" ").trim();
+ String result = matcher.replaceAll(" ").trim();
+
+ if (result.length() < minLength) {
+ return null;
+ }
+
+ if (result.length() > maxLength) {
+ if (truncate) {
+ return result.substring(0, maxLength);
+ } else {
+ return null;
+ }
+ }
+
+ return result;
}
}