Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java (revision 168215)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java (working copy)
@@ -68,6 +68,10 @@
private int threadCount = // max number of threads
NutchConf.get().getInt("fetcher.threads.fetch", 10);
+ private static final float NEW_INJECTED_PAGE_SCORE =
+ NutchConf.get().getFloat("db.score.injected", 2.0f);
+ private static final int MAX_REDIRECT =
+ NutchConf.get().getInt("http.redirect.max", 3);
// All threads (FetcherThread or thread started by it) belong to
// group "fetcher". Each FetcherThread is named as "fetcherXX",
@@ -115,36 +119,75 @@
}
LOG.info("fetching " + url); // fetch the page
-
- Protocol protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getContent(url);
-
- handleFetch(url, fle, content);
-
- synchronized (Fetcher.this) { // update status
- pages++;
- bytes += content.getContent().length;
- if ((pages % 100) == 0) { // show status every 100pp
- status();
+
+ // support multiple redirects, if requested by protocol
+ // or content meta-tags (the latter requires running Fetcher
+ // in parsing mode). Protocol-level redirects take precedence over
+ // content-level redirects. Some plugins can handle redirects
+ // automatically, so that only the final success or failure will be
+ // shown here.
+ boolean refetch = false;
+ int redirCnt = 0;
+ do {
+ // Start every pass with the flag cleared: it may only be set by a
+ // redirect found in THIS pass. Left set, a page fetched after one
+ // redirect would be fetched again and again (redirCnt stops growing).
+ refetch = false;
+ Protocol protocol = ProtocolFactory.getProtocol(url);
+ ProtocolOutput output = protocol.getProtocolOutput(fle);
+ ProtocolStatus pstat = output.getStatus();
+ LOG.fine("protocol status for " + url + ": " + pstat);
+ Content content = output.getContent();
+ switch(pstat.getCode()) {
+ case ProtocolStatus.SUCCESS:
+ if (content != null) {
+ synchronized (Fetcher.this) { // update status
+ pages++;
+ bytes += content.getContent().length;
+ if ((pages % 100) == 0) { // show status every 100pp
+ status();
+ }
+ }
+ ParseStatus ps = handleFetch(url, fle, content);
+ if (ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+ // content-level redirect; url is left untouched when no target is given
+ String redirUrl = ps.getMessage();
+ if (redirUrl != null) {
+ url = redirUrl;
+ refetch = true;
+ redirCnt++;
+ fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+ }
+ }
+ }
+ break;
+ case ProtocolStatus.TEMP_MOVED: // try to redirect immediately
+ // protocol-level redirect; url is left untouched when no target is given
+ String movedUrl = pstat.getMessage();
+ if (movedUrl != null) {
+ url = movedUrl;
+ refetch = true;
+ redirCnt++;
+ // XXX create new entry.
+ fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+ }
+ break;
+ case ProtocolStatus.GONE:
+ //logError(url, fle, e);
+ handleNoFetch(fle, FetcherOutput.NOT_FOUND);
+ break;
+ case ProtocolStatus.RETRY:
+ //logError(url, fle, e);
+ handleNoFetch(fle, FetcherOutput.RETRY);
+ break;
+ default:
+ throw new Exception("Unknown ProtocolStatus: " + pstat.getCode());
}
- }
- } catch (ResourceGone e) { // don't retry
- logError(url, fle, e);
- handleNoFetch(fle, FetcherOutput.NOT_FOUND);
+ } while (refetch && redirCnt < MAX_REDIRECT);
+ if (refetch) { // the last pass still asked for a redirect: limit reached
+ LOG.info("redirect count exceeded, not following to " + url);
+ }
- // dealt with in handleFetch() below
- //} catch (ParseException e) { // don't retry
- // logError(url, fle, e);
- // handleNoFetch(fle, FetcherOutput.CANT_PARSE);
-
- } catch (RetryLater e) { // explicit retry
- logError(url, fle, e);
- handleNoFetch(fle, FetcherOutput.RETRY);
-
- } catch (ProtocolException e) { // implicit retry
- logError(url, fle, e);
- handleNoFetch(fle, FetcherOutput.RETRY);
-
} catch (Throwable t) { // an unchecked exception
if (fle != null) {
logError(url, fle, t); // retry?
@@ -176,33 +208,52 @@
}
}
- private void handleFetch(String url, FetchListEntry fle, Content content) {
+ /**
+ * Writes the fetched content to the output; in parsing mode the content is
+ * parsed first and the parse result is written along with it.
+ *
+ * @return the parse status, or null when the fetcher is not parsing. A
+ * status with minor code ParseStatus.SUCCESS_REDIRECT carries the redirect
+ * target as its message.
+ */
+ private ParseStatus handleFetch(String url, FetchListEntry fle, Content content) {
if (!Fetcher.this.parsing) {
outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
FetcherOutput.SUCCESS),
content, null, null);
- return;
+ return null;
}
- try {
String contentType = content.getContentType();
- Parser parser = ParserFactory.getParser(contentType, url);
- Parse parse = parser.getParse(content);
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- FetcherOutput.SUCCESS),
- content, new ParseText(parse.getText()), parse.getData());
- } catch (ParseException e) {
- // 20041026, xing
- // If fetching succeeds, but parsing fails, content should be saved
- // so that we can try to parse again in separate pass, possibly
- // using better/alternative parser.
- LOG.info("fetch okay, but can't parse " + url + ", reason: "
- + e.getMessage());
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- FetcherOutput.CANT_PARSE),
- content, new ParseText(""),
- new ParseData("", new Outlink[0], new Properties()));
- }
+ Parser parser = null;
+ Parse parse = null;
+ ParseStatus status = null;
+ try {
+ parser = ParserFactory.getParser(contentType, url);
+ parse = parser.getParse(content);
+ status = parse.getStatus();
+ } catch (Exception e) {
+ // lookup or parsing blew up: log it (not stderr) and continue with a
+ // FAILED status so the raw content is still saved below
+ LOG.log(Level.WARNING, "error parsing " + url, e);
+ status = new ParseStatus(e);
+ }
+ if (status == null) { // parser reported nothing: treat as a failure
+ status = new ParseStatus(ParseStatus.FAILED, "parser returned no status");
+ }
+ if (status.isSuccess()) {
+ outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+ FetcherOutput.SUCCESS),
+ content, new ParseText(parse.getText()), parse.getData());
+ } else {
+ LOG.info("fetch okay, but can't parse " + url + ", reason: "
+ + status.toString());
+ outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+ FetcherOutput.CANT_PARSE),
+ content, new ParseText(""),
+ new ParseData("", new Outlink[0], new Properties()));
+ }
+ return status;
}
private void handleNoFetch(FetchListEntry fle, int status) {
@@ -429,7 +467,7 @@
}
// set log level
- fetcher.setLogLevel(Level.parse(logLevel.toUpperCase()));
+ setLogLevel(Level.parse(logLevel.toUpperCase()));
if (showThreadID) {
LogFormatter.setShowThreadIDs(showThreadID);
Index: src/java/org/apache/nutch/protocol/Protocol.java
===================================================================
--- src/java/org/apache/nutch/protocol/Protocol.java (revision 168215)
+++ src/java/org/apache/nutch/protocol/Protocol.java (working copy)
@@ -18,13 +18,21 @@
import java.io.IOException;
+import org.apache.nutch.pagedb.FetchListEntry;
+
/** A retriever of url content. Implemented by protocol extensions. */
public interface Protocol {
/** The name of the extension point. */
public final static String X_POINT_ID = Protocol.class.getName();
- /** Returns the {@link Content} for a url.
+ /** Returns the {@link ProtocolOutput} for a url. This method may be
+ * more limited than {@link #getProtocolOutput(FetchListEntry)}.
* @throws IOException for any errors.
*/
- Content getContent(String url) throws ProtocolException;
+ ProtocolOutput getProtocolOutput(String url);
+
+ /** Returns the {@link ProtocolOutput} for a fetchlist entry.
+ * Failures are reported through the status of the returned output.
+ */
+ ProtocolOutput getProtocolOutput(FetchListEntry fle);
}
Index: src/java/org/apache/nutch/protocol/ProtocolStatus.java
===================================================================
--- src/java/org/apache/nutch/protocol/ProtocolStatus.java (revision 0)
+++ src/java/org/apache/nutch/protocol/ProtocolStatus.java (revision 0)
@@ -0,0 +1,160 @@
+/*
+ * Created on May 4, 2005
+ * Author: Andrzej Bialecki <ab@getopt.org>
+ *
+ */
+package org.apache.nutch.protocol;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.nutch.io.Writable;
+
+/**
+ * Result of a protocol-level fetch attempt: a status code plus optional
+ * string arguments, the first of which serves as the message (for example
+ * the target URL of a redirect or an error description).
+ * <p>
+ * Instances are mutable through {@link #readFields(DataInput)}; never
+ * deserialize into the shared STATUS_SUCCESS / STATUS_FAILED constants.
+ *
+ * @author Andrzej Bialecki <ab@getopt.org>
+ */
+public class ProtocolStatus implements Writable {
+
+ /** Content was retrieved without errors. */
+ public static final int SUCCESS = 1;
+ /** Content was not retrieved. Any further errors may be indicated in args. */
+ public static final int FAILED = 2;
+
+ /** This protocol was not found. Application may attempt to retry later. */
+ public static final int PROTO_NOT_FOUND = 10;
+ /** Resource is gone. */
+ public static final int GONE = 11;
+ /** Resource has moved permanently. */
+ public static final int MOVED = 12;
+ /** Resource has moved temporarily. */
+ public static final int TEMP_MOVED = 13;
+ /** Resource was not found. */
+ public static final int NOT_FOUND = 14;
+ /** Temporary failure. Application may retry immediately. */
+ public static final int RETRY = 15;
+ /** Unspecified exception occurred. Further information may be provided in args. */
+ public static final int EXCEPTION = 16;
+
+ public static final ProtocolStatus STATUS_SUCCESS = new ProtocolStatus(SUCCESS);
+ public static final ProtocolStatus STATUS_FAILED = new ProtocolStatus(FAILED);
+
+ private int code;
+ private String[] args;
+
+ /** Creates an empty status (code 0, no arguments), to be filled by {@link #readFields(DataInput)}. */
+ public ProtocolStatus() {
+ this(0, (String[])null);
+ }
+
+ /** Creates a status with the given code and arguments; args may be null. */
+ public ProtocolStatus(int code, String[] args) {
+ this.code = code;
+ this.args = args;
+ }
+
+ /** Creates a status with the given code and no arguments. */
+ public ProtocolStatus(int code) {
+ this(code, (String[])null);
+ }
+
+ /**
+ * Creates a status whose single argument is the string form of message.
+ * A null message yields a status without arguments, so that
+ * {@link #getMessage()} returns null rather than the text "null".
+ */
+ public ProtocolStatus(int code, Object message) {
+ this.code = code;
+ this.args = (message == null) ? null : new String[]{String.valueOf(message)};
+ }
+
+ /** Creates an EXCEPTION status carrying the string form of t. */
+ public ProtocolStatus(Throwable t) {
+ this(EXCEPTION, t);
+ }
+
+ /**
+ * Restores code and arguments from the layout produced by
+ * {@link #write(DataOutput)}.
+ */
+ public void readFields(DataInput in) throws IOException {
+ code = in.readInt();
+ int count = in.readInt();
+ if (count < 0) {
+ args = null;
+ return;
+ }
+ String[] read = new String[count];
+ for (int i = 0; i < count; i++) {
+ int len = in.readInt();
+ if (len < 0) continue; // null element
+ byte[] utf8 = new byte[len];
+ in.readFully(utf8);
+ read[i] = new String(utf8, "UTF-8");
+ }
+ args = read;
+ }
+
+ /**
+ * Writes the code, then the argument count (-1 for a null array), then
+ * each argument as a length-prefixed UTF-8 byte sequence (-1 for a null
+ * element).
+ */
+ public void write(DataOutput out) throws IOException {
+ out.writeInt(code);
+ if (args == null) {
+ out.writeInt(-1);
+ return;
+ }
+ out.writeInt(args.length);
+ for (int i = 0; i < args.length; i++) {
+ if (args[i] == null) {
+ out.writeInt(-1);
+ } else {
+ byte[] utf8 = args[i].getBytes("UTF-8");
+ out.writeInt(utf8.length);
+ out.write(utf8);
+ }
+ }
+ }
+
+ /** Returns the argument array as stored (not copied); may be null. */
+ public String[] getArgs() {
+ return args;
+ }
+
+ /** Returns the status code. */
+ public int getCode() {
+ return code;
+ }
+
+ /** Returns the first argument, or null when there are no arguments. */
+ public String getMessage() {
+ if (args != null && args.length > 0) return args[0];
+ return null;
+ }
+
+ /** Formats the status as "(code)" followed by the arguments, if any. */
+ public String toString() {
+ StringBuffer res = new StringBuffer();
+ res.append("(" + code + ")");
+ if (args != null) {
+ if (args.length == 1) {
+ res.append(": " + String.valueOf(args[0]));
+ } else {
+ for (int i = 0; i < args.length; i++) {
+ if (args[i] != null)
+ res.append(", args[" + i + "]=" + String.valueOf(args[i]));
+ }
+ }
+ }
+ return res.toString();
+ }
+}
Property changes on: src/java/org/apache/nutch/protocol/ProtocolStatus.java
___________________________________________________________________
Name: svn:eol-style
+ native
Index: src/java/org/apache/nutch/parse/ParserChecker.java
===================================================================
--- src/java/org/apache/nutch/parse/ParserChecker.java (revision 168215)
+++ src/java/org/apache/nutch/parse/ParserChecker.java (working copy)
@@ -67,7 +67,7 @@
LOG.info("fetching: "+url);
Protocol protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getContent(url);
+ Content content = protocol.getProtocolOutput(url).getContent();
if (force) {
content.setContentType(contentType);
Index: src/java/org/apache/nutch/parse/ParseData.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseData.java (revision 168215)
+++ src/java/org/apache/nutch/parse/ParseData.java (working copy)
@@ -21,7 +21,6 @@
import org.apache.nutch.io.*;
import org.apache.nutch.fs.*;
-import org.apache.nutch.util.*;
import org.apache.nutch.tools.UpdateDatabaseTool;
Index: src/java/org/apache/nutch/parse/HtmlParseFilter.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilter.java (revision 168215)
+++ src/java/org/apache/nutch/parse/HtmlParseFilter.java (working copy)
@@ -30,6 +30,5 @@
/** Adds metadata or otherwise modifies a parse of HTML content, given
* the DOM tree of a page. */
- Parse filter(Content content, Parse parse, DocumentFragment doc)
- throws ParseException;
+ Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc);
}
Index: src/java/org/apache/nutch/parse/HtmlParseFilters.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilters.java (revision 168215)
+++ src/java/org/apache/nutch/parse/HtmlParseFilters.java (working copy)
@@ -45,11 +45,11 @@
private HtmlParseFilters() {} // no public ctor
/** Run all defined filters. */
- public static Parse filter(Content content,Parse parse,DocumentFragment doc)
- throws ParseException {
+ public static Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
for (int i = 0 ; i < CACHE.length; i++) {
- parse = CACHE[i].filter(content, parse, doc);
+ parse = CACHE[i].filter(content, parse, metaTags, doc);
+ if (!parse.getStatus().isSuccess()) break;
}
return parse;
Index: src/java/org/apache/nutch/parse/Parse.java
===================================================================
--- src/java/org/apache/nutch/parse/Parse.java (revision 168215)
+++ src/java/org/apache/nutch/parse/Parse.java (working copy)
@@ -20,10 +20,14 @@
* @see Parser#getParse(FetcherOutput,Content)
*/
public interface Parse {
+
/** The textual content of the page. This is indexed, searched, and used when
* generating snippets.*/
String getText();
/** Other data extracted from the page. */
ParseData getData();
+
+ /** Status of parsing. */
+ ParseStatus getStatus();
}
Index: src/java/org/apache/nutch/parse/HTMLMetaTags.java
===================================================================
--- src/java/org/apache/nutch/parse/HTMLMetaTags.java (revision 0)
+++ src/java/org/apache/nutch/parse/HTMLMetaTags.java (revision 0)
@@ -0,0 +1,166 @@
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki <ab@getopt.org>
+ *
+ */
+package org.apache.nutch.parse;
+
+import java.net.URL;
+import java.util.Properties;
+
+/**
+ * @author Andrzej Bialecki <ab@getopt.org>
+ */
+public class HTMLMetaTags {
+ private boolean noIndex = false;
+
+ private boolean noFollow = false;
+
+ private boolean noCache = false;
+
+ private URL baseHref = null;
+
+ private boolean refresh = false;
+
+ private int refreshTime = 0;
+
+ private URL refreshHref = null;
+
+ private Properties generalTags = new Properties();
+
+ private Properties httpEquivTags = new Properties();
+
+ /**
+ * Sets all boolean values to <code>false</code>. Clears all other tags.
+ */
+ public void reset() {
+ noIndex = false;
+ noFollow = false;
+ noCache = false;
+ refresh = false;
+ refreshTime = 0;
+ baseHref = null;
+ refreshHref = null;
+ generalTags.clear();
+ httpEquivTags.clear();
+ }
+
+ /**
+ * Sets <code>noFollow</code> to <code>true</code>.
+ */
+ public void setNoFollow() {
+ noFollow = true;
+ }
+
+ /**
+ * Sets <code>noIndex</code> to <code>true</code>.
+ */
+ public void setNoIndex() {
+ noIndex = true;
+ }
+
+ /**
+ * Sets <code>noCache</code> to <code>true</code>.
+ */
+ public void setNoCache() {
+ noCache = true;
+ }
+
+ /**
+ * Sets <code>refresh</code> to <code>true</code>.
+ */
+ public void setRefresh() {
+ refresh = true;
+ }
+
+ /**
+ * Sets the <code>baseHref</code>.
+ */
+ public void setBaseHref(URL baseHref) {
+ this.baseHref = baseHref;
+ }
+
+ /**
+ * Sets the <code>refreshHref</code>.
+ */
+ public void setRefreshHref(URL refreshHref) {
+ this.refreshHref = refreshHref;
+ }
+
+ /**
+ * Sets the <code>refreshTime</code>.
+ */
+ public void setRefreshTime(int refreshTime) {
+ this.refreshTime = refreshTime;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>noIndex</code>.
+ */
+ public boolean getNoIndex() {
+ return noIndex;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>noFollow</code>.
+ */
+ public boolean getNoFollow() {
+ return noFollow;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>noCache</code>.
+ */
+ public boolean getNoCache() {
+ return noCache;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>refresh</code>.
+ */
+ public boolean getRefresh() {
+ return refresh;
+ }
+
+ /**
+ * A convenience method. Returns the <code>baseHref</code>, if set, or
+ * <code>null</code> otherwise.
+ */
+ public URL getBaseHref() {
+ return baseHref;
+ }
+
+ /**
+ * A convenience method. Returns the <code>refreshHref</code>, if set, or
+ * <code>null</code> otherwise. The value may be invalid if
+ * {@link #getRefresh()} returns <code>false</code>.
+ */
+ public URL getRefreshHref() {
+ return refreshHref;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>refreshTime</code>.
+ * The value may be invalid if {@link #getRefresh()} returns
+ * <code>false</code>.
+ */
+ public int getRefreshTime() {
+ return refreshTime;
+ }
+
+ /**
+ * Returns all collected values of the general meta tags. Property names are
+ * tag names, property values are "content" values.
+ */
+ public Properties getGeneralTags() {
+ return generalTags;
+ }
+
+ /**
+ * Returns all collected values of the "http-equiv" meta tags. Property names
+ * are tag names, property values are "content" values.
+ */
+ public Properties getHttpEquivTags() {
+ return httpEquivTags;
+ }
+}
Index: src/java/org/apache/nutch/parse/ParseImpl.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseImpl.java (revision 168215)
+++ src/java/org/apache/nutch/parse/ParseImpl.java (working copy)
@@ -22,13 +22,23 @@
public class ParseImpl implements Parse {
private String text;
private ParseData data;
+ private ParseStatus status;
+ public ParseImpl(String text, ParseData data, ParseStatus status) {
+ this.text = text;
+ this.data = data;
+ this.status = status;
+ }
+
public ParseImpl(String text, ParseData data) {
this.text = text;
this.data = data;
+ this.status = new ParseStatus(ParseStatus.SUCCESS);
}
public String getText() { return text; }
public ParseData getData() { return data; }
+
+ public ParseStatus getStatus() { return status; }
}
Index: src/java/org/apache/nutch/parse/Parser.java
===================================================================
--- src/java/org/apache/nutch/parse/Parser.java (revision 168215)
+++ src/java/org/apache/nutch/parse/Parser.java (working copy)
@@ -27,5 +27,5 @@
public final static String X_POINT_ID = Parser.class.getName();
/** Creates the parse for some content. */
- Parse getParse(Content c) throws ParseException;
+ Parse getParse(Content c);
}
Index: src/java/org/apache/nutch/parse/ParseStatus.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseStatus.java (revision 0)
+++ src/java/org/apache/nutch/parse/ParseStatus.java (revision 0)
@@ -0,0 +1,207 @@
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki <ab@getopt.org>
+ *
+ */
+package org.apache.nutch.parse;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableUtils;
+
+/**
+ * Outcome of parsing a piece of content: a major code (success or failure),
+ * a minor code refining it, and optional string arguments, the first of
+ * which serves as the message.
+ * <p>
+ * Instances are mutable through {@link #readFields(DataInput)}; never
+ * deserialize into the shared STATUS_SUCCESS / STATUS_FAILURE constants.
+ *
+ * @author Andrzej Bialecki <ab@getopt.org>
+ */
+public class ParseStatus implements Writable {
+
+ // Primary status codes:
+
+ /** Parsing succeeded. */
+ public static final byte SUCCESS = 1;
+ /** General failure. There may be a more specific error message in arguments. */
+ public static final byte FAILED = 2;
+
+ /** Names of the major codes, indexed by code value. */
+ public static final String[] majorCodes = {
+ "undefined",
+ "success",
+ "failed"
+ };
+
+ // Secondary success codes go here:
+
+ /** Parsed content contains a directive to redirect to another URL.
+ * The target URL can be retrieved from the arguments.
+ */
+ public static final short SUCCESS_REDIRECT = 100;
+
+ // Secondary failure codes go here:
+
+ /** Parsing failed. An Exception occurred (which may be retrieved from the arguments). */
+ public static final short FAILED_EXCEPTION = 200;
+ /** Parsing failed. Content was truncated - the parser needs the complete content. */
+ public static final short FAILED_TRUNCATED = 202;
+ /** Parsing failed. Invalid format - the content may be corrupted or of wrong type. */
+ public static final short FAILED_INVALID_FORMAT = 203;
+ /** Parsing failed. Other related parts of the content are needed to complete
+ * parsing. The list of URLs to missing parts may be provided in arguments.
+ * The Fetcher may decide to fetch these parts at once, then put them into
+ * Content.metadata, and supply them for re-parsing.
+ */
+ public static final short FAILED_MISSING_PARTS = 204;
+
+
+ public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
+ public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
+
+ private byte majorCode = 0;
+ private short minorCode = 0;
+ private String[] args = null;
+
+ /** Creates an undefined status (codes 0, no arguments), to be filled by {@link #readFields(DataInput)}. */
+ public ParseStatus() {
+ }
+
+ public ParseStatus(int majorCode, int minorCode, String[] args) {
+ this.args = args;
+ this.majorCode = (byte)majorCode;
+ this.minorCode = (short)minorCode;
+ }
+
+ public ParseStatus(int majorCode) {
+ this(majorCode, 0, (String[])null);
+ }
+
+ public ParseStatus(int majorCode, String[] args) {
+ this(majorCode, 0, args);
+ }
+
+ public ParseStatus(int majorCode, int minorCode) {
+ this(majorCode, minorCode, (String[])null);
+ }
+
+ /** Simplified constructor for passing just a text message. */
+ public ParseStatus(int majorCode, int minorCode, String message) {
+ this(majorCode, minorCode, new String[]{message});
+ }
+
+ /** Simplified constructor for passing just a text message. */
+ public ParseStatus(int majorCode, String message) {
+ this(majorCode, 0, new String[]{message});
+ }
+
+ /** Creates a FAILED / FAILED_EXCEPTION status whose message is t.toString(). */
+ public ParseStatus(Throwable t) {
+ this(FAILED, FAILED_EXCEPTION, new String[]{t.toString()});
+ }
+
+ /**
+ * Restores the fields in the order {@link #write(DataOutput)} emits them.
+ * A status written without arguments is restored with null arguments.
+ */
+ public void readFields(DataInput in) throws IOException {
+ majorCode = in.readByte();
+ minorCode = in.readShort();
+ // NOTE(review): assumes readCompressedStringArray mirrors
+ // writeCompressedStringArray and starts with an int count, matching the
+ // zero count write() emits for "no arguments" - confirm in WritableUtils
+ String[] read = WritableUtils.readCompressedStringArray(in);
+ args = (read == null || read.length == 0) ? null : read;
+ }
+
+ /** Writes the major code, the minor code, then the arguments (a zero count when there are none). */
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(majorCode);
+ out.writeShort(minorCode);
+ if (args != null && args.length > 0)
+ WritableUtils.writeCompressedStringArray(out, args);
+ else out.writeInt(0);
+ }
+
+ /** A convenience method. Returns true if majorCode is SUCCESS, false
+ * otherwise.
+ */
+
+ public boolean isSuccess() {
+ return majorCode == SUCCESS;
+ }
+
+ /** A convenience method. Return a String representation of the first
+ * argument, or null.
+ */
+ public String getMessage() {
+ if (args != null && args.length > 0 && args[0] != null)
+ return args[0].toString();
+ return null;
+ }
+
+ public String[] getArgs() {
+ return args;
+ }
+
+ public int getMajorCode() {
+ return majorCode;
+ }
+
+ public int getMinorCode() {
+ return minorCode;
+ }
+
+ /** A convenience method. Creates an empty Parse instance,
+ * which returns this status.
+ */
+ public Parse getEmptyParse() {
+ return new EmptyParseImpl(this);
+ }
+
+ public String toString() {
+ StringBuffer res = new StringBuffer();
+ // majorCode is an arbitrary byte; fall back to "undefined" outside the table
+ String name = (majorCode >= 0 && majorCode < majorCodes.length) ? majorCodes[majorCode] : majorCodes[0];
+ res.append(name + "(" + majorCode + "," + minorCode + ")");
+ if (args != null) {
+ if (args.length == 1) {
+ res.append(": " + String.valueOf(args[0]));
+ } else {
+ for (int i = 0; i < args.length; i++) {
+ if (args[i] != null)
+ res.append(", args[" + i + "]=" + String.valueOf(args[i]));
+ }
+ }
+ }
+ return res.toString();
+ }
+}
+
+/** Parse that carries only a status; it has no text and no data. */
+class EmptyParseImpl implements Parse {
+
+ private ParseStatus status = null;
+
+ public EmptyParseImpl(ParseStatus status) {
+ this.status = status;
+ }
+
+ /** Always null: check {@link #getStatus()} before using the parse. */
+ public ParseData getData() {
+ return null;
+ }
+
+ public ParseStatus getStatus() {
+ return status;
+ }
+
+ /** Always null: check {@link #getStatus()} before using the parse. */
+ public String getText() {
+ return null;
+ }
+}
+
Index: src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
===================================================================
--- src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (revision 168215)
+++ src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (working copy)
@@ -64,7 +64,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = ProtocolFactory.getProtocol(urlString);
- content = protocol.getContent(urlString);
+ content = protocol.getProtocolOutput(urlString).getContent();
parser = ParserFactory.getParser(content.getContentType(), urlString);
parse = parser.getParse(content);
Index: src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
===================================================================
--- src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (revision 168215)
+++ src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (working copy)
@@ -27,6 +27,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -79,13 +80,13 @@
rootLogger.addAppender(appender);
}
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/pdf"))
- throw new ParseException(
- "Content-Type not application/pdf: "+contentType);
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+ "Content-Type not application/pdf: " + contentType).getEmptyParse();
// in memory representation of pdf file
PDDocument pdf = null;
@@ -100,8 +101,9 @@
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
- throw new ParseException("Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete pdf file.");
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+ "Content truncated at "+raw.length
+ +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
}
PDFParser parser = new PDFParser(
@@ -134,14 +136,15 @@
// formatDate(info.getCreationDate())
// formatDate(info.getModificationDate())
- } catch (ParseException e) {
- throw e;
} catch (CryptographyException e) {
- throw new ParseException("Error decrypting document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Error decrypting document. " + e).getEmptyParse();
} catch (InvalidPasswordException e) {
- throw new ParseException("Can't decrypt document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't decrypt document - invalid password. " + e).getEmptyParse();
} catch (Exception e) { // run time exception
- throw new ParseException("Can't be handled as pdf document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as pdf document. " + e).getEmptyParse();
} finally {
try {
if (pdf != null)
@@ -165,7 +168,7 @@
metadata.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(title, outlinks, metadata);
- return new ParseImpl(text, parseData);
+ return new ParseImpl(text, parseData, ParseStatus.STATUS_SUCCESS);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
}
Index: src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
===================================================================
--- src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (revision 168215)
+++ src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (working copy)
@@ -64,7 +64,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = ProtocolFactory.getProtocol(urlString);
- content = protocol.getContent(urlString);
+ content = protocol.getProtocolOutput(urlString).getContent();
parser = ParserFactory.getParser(content.getContentType(), urlString);
parse = parser.getParse(content);
Index: src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
===================================================================
--- src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (revision 168215)
+++ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (working copy)
@@ -18,6 +18,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -52,13 +53,13 @@
public MSWordParser () {}
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/msword"))
- throw new ParseException(
- "Content-Type not application/msword: "+contentType);
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+ "Content-Type not application/msword: " + contentType).getEmptyParse();
String text = null;
String title = null;
@@ -71,8 +72,9 @@
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
- throw new ParseException("Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete msword file.");
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+ "Content truncated at " + raw.length
+ +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
}
WordExtractor extractor = new WordExtractor();
@@ -86,13 +88,14 @@
extractor = null;
} catch (ParseException e) {
- throw e;
+ return new ParseStatus(e).getEmptyParse();
} catch (FastSavedException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (PasswordProtectedException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (Exception e) { // run time exception
- throw new ParseException("Can't be handled as msword document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as msword document. " + e).getEmptyParse();
} finally {
// nothing so far
}
@@ -117,7 +120,7 @@
Outlink[] outlinks = new Outlink[0];
ParseData parseData = new ParseData(title, outlinks, metadata);
- return new ParseImpl(text, parseData);
+ return new ParseImpl(text, parseData, ParseStatus.STATUS_SUCCESS);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
}
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (revision 168215)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (working copy)
@@ -14,9 +14,9 @@
* limitations under the License.
*/
package org.apache.nutch.analysis.lang;
+import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.ParseException;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.*;
@@ -38,8 +38,7 @@
*
robotsMeta
to appropriate
+ * values, based on any META tags found under the given
+ * node
.
+ */
+ public static final void getMetaTags (
+ HTMLMetaTags metaTags, Node node, URL currURL) {
+
+ // clear whatever metaTags already holds, then collect from the tree at node
+ metaTags.reset();
+ getMetaTagsHelper(metaTags, node, currURL);
+ }
+
+ /**
+ * Walks the DOM tree rooted at node and records META and BASE information
+ * in metaTags. Subtrees under BODY are not searched.
+ *
+ * @param metaTags receives the collected values
+ * @param node root of the (sub)tree to examine
+ * @param currURL URL of the page, used to resolve relative URLs; may be
+ * null, in which case only absolute URLs can be resolved
+ */
+ private static final void getMetaTagsHelper(
+ HTMLMetaTags metaTags, Node node, URL currURL) {
+
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+ String nodeName = node.getNodeName();
+
+ // BODY and META are matched ignoring case, the same way BASE is
+ if ("BODY".equalsIgnoreCase(nodeName)) {
+ // META tags should not be under body
+ return;
+ }
+
+ if ("META".equalsIgnoreCase(nodeName)) {
+ NamedNodeMap attrs = node.getAttributes();
+ Node nameNode = attrs.getNamedItem("name");
+ Node equivNode = attrs.getNamedItem("http-equiv");
+ Node contentNode = attrs.getNamedItem("content");
+
+ // a META tag without a content attribute carries nothing to record
+ if (contentNode != null) {
+ String content = contentNode.getNodeValue();
+
+ if (nameNode != null) {
+ String name = nameNode.getNodeValue().toLowerCase();
+ metaTags.getGeneralTags().setProperty(name, content);
+ if ("robots".equals(name)) {
+ applyRobotsDirectives(metaTags, content);
+ }
+ }
+
+ if (equivNode != null) {
+ String name = equivNode.getNodeValue().toLowerCase();
+ metaTags.getHttpEquivTags().setProperty(name, content);
+ if ("pragma".equals(name)) {
+ if (content.toLowerCase().indexOf("no-cache") >= 0)
+ metaTags.setNoCache();
+ } else if ("refresh".equals(name)) {
+ applyRefresh(metaTags, content, currURL);
+ }
+ }
+ }
+
+ } else if ("BASE".equalsIgnoreCase(nodeName)) {
+ NamedNodeMap attrs = node.getAttributes();
+ Node hrefNode = attrs.getNamedItem("href");
+
+ if (hrefNode != null) {
+ String urlString = hrefNode.getNodeValue();
+
+ URL url = null;
+ try {
+ if (currURL == null)
+ url = new URL(urlString);
+ else
+ url = new URL(currURL, urlString);
+ } catch (Exception ignored) {
+ // unusable href: leave the base unset
+ }
+
+ if (url != null)
+ metaTags.setBaseHref(url);
+ }
+
+ }
+
+ }
+
+ NodeList children = node.getChildNodes();
+ if (children != null) {
+ int len = children.getLength();
+ for (int i = 0; i < len; i++) {
+ getMetaTagsHelper(metaTags, children.item(i), currURL);
+ }
+ }
+ }
+
+ /** Applies the directives of a robots META tag (none, noindex, nofollow). */
+ private static void applyRobotsDirectives(HTMLMetaTags metaTags, String content) {
+ String directives = content.toLowerCase();
+ if (directives.indexOf("none") >= 0) {
+ metaTags.setNoIndex();
+ metaTags.setNoFollow();
+ }
+ // "all" needs no action
+ if (directives.indexOf("noindex") >= 0) {
+ metaTags.setNoIndex();
+ }
+ if (directives.indexOf("nofollow") >= 0) {
+ metaTags.setNoFollow();
+ }
+ }
+
+ /**
+ * Applies an http-equiv refresh tag of the form "time" or "time; URL=target".
+ * The tag is ignored when the time is not an integer.
+ */
+ private static void applyRefresh(HTMLMetaTags metaTags, String content, URL currURL) {
+ int semi = content.indexOf(';');
+ String time = (semi == -1) ? content : content.substring(0, semi);
+ try {
+ metaTags.setRefreshTime(Integer.parseInt(time.trim()));
+ } catch (NumberFormatException ignored) {
+ return; // without a usable time the refresh is not recorded
+ }
+ metaTags.setRefresh();
+ if (semi == -1) return; // just the refresh time, no target
+
+ // locate "URL=" after the separator, in any letter case
+ int start = -1;
+ for (int i = semi + 1; i + 4 <= content.length(); i++) {
+ if (content.regionMatches(true, i, "URL=", 0, 4)) {
+ start = i + 4;
+ break;
+ }
+ }
+ if (start == -1) return;
+
+ String target = content.substring(start).trim();
+ URL refreshUrl = null;
+ try {
+ refreshUrl = new URL(target);
+ } catch (Exception notAbsolute) {
+ try {
+ refreshUrl = new URL(currURL, target);
+ } catch (Exception ignored) {
+ // neither absolute nor resolvable: the href stays null
+ }
+ }
+ metaTags.setRefreshHref(refreshUrl);
+ }
+
+}
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (revision 168215)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java (working copy)
@@ -16,7 +16,7 @@
package org.apache.nutch.protocol.http;
-import java.io.*;
+import java.net.MalformedURLException;
import java.net.URL;
import java.net.InetAddress;
import java.net.UnknownHostException;
@@ -28,13 +28,15 @@
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.db.Page;
+import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.*;
-/** An implementation of the Http protocol. */
+/** An implementation of the Http protocol. */
public class Http implements Protocol {
public static final Logger LOG =
- LogFormatter.getLogger("org.apache.nutch.net.Http");
+ LogFormatter.getLogger("org.apache.nutch.net.Https");
static {
if (NutchConf.get().getBoolean("http.verbose", false))
@@ -170,8 +172,19 @@
}
}
- public Content getContent(String urlString) throws ProtocolException {
+ public ProtocolOutput getProtocolOutput(String urlString) {
+ // Wrap the bare URL in a FetchListEntry and delegate; a malformed URL is reported via a ProtocolStatus built from the exception, not thrown.
try {
+ return getProtocolOutput(new FetchListEntry(true,
+ new Page(urlString, 1.0f), new String[0]));
+ } catch (MalformedURLException mue) {
+ return new ProtocolOutput(null, new ProtocolStatus(mue));
+ }
+ }
+
+ public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+ String urlString = fle.getUrl().toString();
+ try {
URL url = new URL(urlString);
int redirects = 0;
@@ -191,10 +204,10 @@
int code = response.getCode();
if (code == 200) { // got a good response
- return response.toContent(); // return it
+ return new ProtocolOutput(response.toContent()); // return it
} else if (code == 410) { // page is gone
- throw new ResourceGone(url, "Http: " + code);
+ throw new ResourceGone(url, "Https: " + code);
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
@@ -207,8 +220,8 @@
throw new HttpError(code);
}
}
- } catch (IOException e) {
- throw new HttpException(e);
+ } catch (Exception e) {
+ return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
@@ -259,7 +272,7 @@
boolean verbose = false;
String url = null;
- String usage = "Usage: Http [-verbose] [-timeout N] url";
+ String usage = "Usage: Https [-verbose] [-timeout N] url";
if (args.length == 0) {
System.err.println(usage);
@@ -285,7 +298,7 @@
LOG.setLevel(Level.FINE);
}
- Content content = http.getContent(url);
+ Content content = http.getProtocolOutput(url).getContent();
System.out.println("Content Type: " + content.getContentType());
System.out.println("Content Length: " + content.get("Content-Length"));
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (revision 168215)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (working copy)
@@ -247,7 +247,7 @@
ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
while (!doneChunks) {
- Http.LOG.fine("Http: starting chunk");
+ Http.LOG.fine("Https: starting chunk");
readLine(in, line, false);
Index: src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
===================================================================
--- src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (revision 168215)
+++ src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (working copy)
@@ -17,24 +17,24 @@
package org.apache.nutch.protocol.file;
+import org.apache.nutch.db.Page;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.util.LogFormatter;
import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.pagedb.FetchListEntry;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.net.MalformedURLException;
import java.net.URL;
-import java.io.InputStream;
-// 20040528, xing, disabled for now
-//import java.io.Reader;
-import java.io.IOException;
-
/************************************
* File.java deals with file: scheme.
*
@@ -65,10 +65,21 @@
}
/** Set the point at which content is truncated. */
- public void setMaxContentLength(int length) {this.maxContentLength = length;}
+ public void setMaxContentLength(int length) {maxContentLength = length;} // NOTE(review): "this." qualifier was dropped; confirm maxContentLength is still an instance field
- public Content getContent(String urlString) throws FileException {
+ public ProtocolOutput getProtocolOutput(String urlString) {
+ // Wrap the bare URL in a FetchListEntry and delegate; a malformed URL is reported via a ProtocolStatus built from the exception, not thrown.
try {
+ return getProtocolOutput(new FetchListEntry(true,
+ new Page(urlString, 1.0f), new String[0]));
+ } catch (MalformedURLException mue) {
+ return new ProtocolOutput(null, new ProtocolStatus(mue));
+ }
+ }
+
+ public ProtocolOutput getProtocolOutput(FetchListEntry fle) {
+ String urlString = fle.getUrl().toString();
+ try {
URL url = new URL(urlString);
int redirects = 0;
@@ -80,7 +91,7 @@
int code = response.getCode();
if (code == 200) { // got a good response
- return response.toContent(); // return it
+ return new ProtocolOutput(response.toContent()); // return it
} else if (code >= 300 && code < 400) { // handle redirect
if (redirects == MAX_REDIRECTS)
@@ -94,8 +105,8 @@
throw new FileError(code);
}
}
- } catch (IOException e) {
- throw new FileException(e);
+ } catch (Exception e) {
+ return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
@@ -139,7 +150,7 @@
// set log level
LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = file.getContent(urlString);
+ Content content = file.getProtocolOutput(urlString).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.get("Content-Length"));