Index: src/java/org/apache/nutch/parse/HtmlParseFilter.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilter.java (revision 537657)
+++ src/java/org/apache/nutch/parse/HtmlParseFilter.java (working copy)
@@ -38,5 +38,5 @@
/** Adds metadata or otherwise modifies a parse of HTML content, given
* the DOM tree of a page. */
- Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc);
+ ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc);
}
Index: src/java/org/apache/nutch/parse/ParseResult.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseResult.java (revision 537657)
+++ src/java/org/apache/nutch/parse/ParseResult.java (working copy)
@@ -123,7 +123,7 @@
public Iterator<Entry<Text, Parse>> iterator() {
return parseMap.entrySet().iterator();
}
-
+
/**
* Remove all results where status is not successful (as determined
* by {@link ParseStatus#isSuccess()}). Note that effects of this operation
@@ -137,6 +137,19 @@
i.remove();
}
}
-
}
+
+  /**
+   * Convenience check: true only if every parse in this result succeeded, as
+   * determined by {@link ParseStatus#isSuccess()}; an empty result yields true.
+   * @return false as soon as one unsuccessful parse is found, true otherwise
+   */
+  public boolean isSuccess() {
+    for (Entry<Text, Parse> entry : this) {
+      if (!entry.getValue().getData().getStatus().isSuccess()) {
+        return false;
+      }
+    }
+    return true;
+  }
}
Index: src/java/org/apache/nutch/parse/HtmlParseFilters.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilters.java (revision 537657)
+++ src/java/org/apache/nutch/parse/HtmlParseFilters.java (working copy)
@@ -57,20 +57,26 @@
}
/** Run all defined filters. */
- public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult,
+                            HTMLMetaTags metaTags, DocumentFragment doc) {
- ParseResult filteredParseResult = new ParseResult(content.getUrl());
-
- for (java.util.Map.Entry<Text, Parse> entry : parseResult) {
- Parse parse = entry.getValue();
- for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
- parse = this.htmlParseFilters[i].filter(content, parse, metaTags, doc);
- if (!parse.getData().getStatus().isSuccess()) break;
- }
- filteredParseResult.put(entry.getKey(),
- new ParseText(parse.getText()), parse.getData());
+    // Apply the filters in order; each one receives the previous one's result.
+    for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
+      // Each filter gets the whole ParseResult and returns the one to use next.
+      parseResult =
+        htmlParseFilters[i].filter(content, parseResult, metaTags, doc);
+
+      // Stop running further filters as soon as any contained parse has failed.
+      if (!parseResult.isSuccess()) {
+        // TODO: decide what to return when dropping failures leaves parseResult
+        // empty; one option is to keep a copy of the pre-filter result as backup.
+
+        // Drop the failed parses so that only successful ones are returned.
+        parseResult.filter();
+        return parseResult;
+      }
}
- return filteredParseResult;
+    return parseResult;
}
}
Index: src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
===================================================================
--- src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (revision 537657)
+++ src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (working copy)
@@ -34,13 +34,14 @@
// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.*;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.StringUtil;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
/**
@@ -63,8 +64,11 @@
/**
* Scan the HTML document looking at possible rel-tags
*/
- public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+ public ParseResult filter(Content content, ParseResult parseResult,
+ HTMLMetaTags metaTags, DocumentFragment doc) {
+ // get parse obj
+ Parse parse = parseResult.get(content.getUrl());
// Trying to find the document's rel-tags
Parser parser = new Parser(doc);
Set tags = parser.getRelTags();
@@ -73,7 +77,7 @@
while (iter.hasNext()) {
metadata.add(REL_TAG, (String) iter.next());
}
- return parse;
+ return parseResult;
}
private static class Parser {
Index: src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
===================================================================
--- src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (revision 537657)
+++ src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (working copy)
@@ -36,11 +36,13 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.apache.oro.text.regex.MatchResult;
import org.apache.oro.text.regex.Pattern;
import org.apache.oro.text.regex.PatternCompiler;
@@ -69,7 +71,11 @@
private Configuration conf;
- public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+ public ParseResult filter(Content content, ParseResult parseResult,
+ HTMLMetaTags metaTags, DocumentFragment doc) {
+
+ Parse parse = parseResult.get(content.getUrl());
+
String url = content.getBaseUrl();
ArrayList outlinks = new ArrayList();
walk(doc, parse, metaTags, url, outlinks);
@@ -85,9 +91,11 @@
parse.getData().getContentMeta(),
parse.getData().getParseMeta());
parseData.setConf(this.conf);
- parse = new ParseImpl(text, parseData);
+
+ // replace original parse obj with new one
+ parseResult.put(content.getUrl(), new ParseText(text), parseData);
}
- return parse;
+ return parseResult;
}
private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List outlinks) {
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (revision 537657)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (working copy)
@@ -29,12 +29,13 @@
// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.*;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
// DOM imports
import org.w3c.dom.DocumentFragment;
@@ -84,8 +85,10 @@
* 3. meta http-equiv (content-language) (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2)
*
Only the first occurence of language is stored.
*/
- public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
-
+ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+
+ Parse parse = parseResult.get(content.getUrl());
+
// Trying to find the document's language
LanguageParser parser = new LanguageParser(doc);
String lang = parser.getLanguage();
@@ -93,7 +96,7 @@
if (lang != null) {
parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
}
- return parse;
+ return parseResult;
}
static class LanguageParser {
Index: src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
===================================================================
--- src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (revision 537657)
+++ src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.metadata.Metadata;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -263,24 +264,37 @@
/** Adds metadata or otherwise modifies a parse of an HTML document, given
* the DOM tree of a page. */
- public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+    // look up the parse stored under the content URL; the walker below writes into its parse metadata
+    Parse parse = parseResult.get(content.getUrl());
+
// construct base url
URL base;
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
- return new ParseStatus(e).getEmptyParse(getConf());
+      // Keep reporting the failure as before: replace this page's entry with
+      // an empty parse carrying the error status instead of swallowing it.
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(),
+          new org.apache.nutch.parse.ParseText(emptyParse.getText()), emptyParse.getData());
+      return parseResult;
}
try {
// extract license metadata
Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
} catch (ParseException e) {
- return new ParseStatus(e).getEmptyParse(getConf());
+      // A license-extraction failure is likewise recorded as a failed parse.
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(),
+          new org.apache.nutch.parse.ParseText(emptyParse.getText()), emptyParse.getData());
+      return parseResult;
}
- return parse;
+
+    return parseResult;
}
public void setConf(Configuration conf) {