Index: src/java/org/apache/nutch/fetcher/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/fetcher/Fetcher.java (revision 164986)
+++ src/java/org/apache/nutch/fetcher/Fetcher.java (working copy)
@@ -68,6 +68,8 @@
private int threadCount = // max number of threads
NutchConf.get().getInt("fetcher.threads.fetch", 10);
+ private static final float NEW_INJECTED_PAGE_SCORE =
+ NutchConf.get().getFloat("db.score.injected", 2.0f); // score given to the Page created for a redirect target
// All threads (FetcherThread or thread started by it) belong to
// group "fetcher". Each FetcherThread is named as "fetcherXX",
@@ -115,19 +117,31 @@
}
LOG.info("fetching " + url); // fetch the page
+
+ // support multiple redirects, if requested by content meta directives
+ // NOTE: this requires running Fetcher in parsing mode!
+ boolean refetch = false;
+ do {
+   Protocol protocol = ProtocolFactory.getProtocol(url);
+   // XXX (ab) refactor to check protocol status
+   Content content = protocol.getContent(url);
+   if (content != null) {
+     synchronized (Fetcher.this) { // update status
+       pages++;
+       bytes += content.getContent().length;
+       if ((pages % 100) == 0) { // show status every 100pp
+         status();
+       }
+     }
+   }
+   ParseStatus ps = handleFetch(url, fle, content);
+   // recomputed on every pass: a sticky "true" would re-fetch the redirect target forever
+   refetch = ps != null && ps.getMinorCode() == ParseStatus.SUCCESS_REDIRECT;
+   if (refetch) { // follow the redirect with a fresh fetch-list entry for the target
+     url = ps.getMessage();
+     fle = new FetchListEntry(true, new Page(url, NEW_INJECTED_PAGE_SCORE), new String[0]);
+   }
+ } while (refetch);
- Protocol protocol = ProtocolFactory.getProtocol(url);
- Content content = protocol.getContent(url);
-
- handleFetch(url, fle, content);
-
- synchronized (Fetcher.this) { // update status
- pages++;
- bytes += content.getContent().length;
- if ((pages % 100) == 0) { // show status every 100pp
- status();
- }
- }
} catch (ResourceGone e) { // don't retry
logError(url, fle, e);
handleNoFetch(fle, FetcherOutput.NOT_FOUND);
@@ -176,33 +190,38 @@
}
}
- private void handleFetch(String url, FetchListEntry fle, Content content) {
+ private ParseStatus handleFetch(String url, FetchListEntry fle, Content content) {
if (!Fetcher.this.parsing) {
outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
FetcherOutput.SUCCESS),
content, null, null);
- return;
+ return null; // not parsing, so there is no parse status to report
}
- try {
String contentType = content.getContentType();
- Parser parser = ParserFactory.getParser(contentType, url);
- Parse parse = parser.getParse(content);
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- FetcherOutput.SUCCESS),
- content, new ParseText(parse.getText()), parse.getData());
- } catch (ParseException e) {
- // 20041026, xing
- // If fetching succeeds, but parsing fails, content should be saved
- // so that we can try to parse again in separate pass, possibly
- // using better/alternative parser.
- LOG.info("fetch okay, but can't parse " + url + ", reason: "
- + e.getMessage());
- outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
- FetcherOutput.CANT_PARSE),
- content, new ParseText(""),
- new ParseData("", new Outlink[0], new Properties()));
- }
+ Parser parser = null;
+ Parse parse = null;
+ ParseStatus status = null;
+ try {
+ parser = ParserFactory.getParser(contentType, url);
+ parse = parser.getParse(content); // keep the result: status, text and data are all read from it below
+ status = parse.getParseStatus();
+ } catch (Exception e) { // any failure while locating or running the parser becomes a FAILED status
+ status = new ParseStatus(e);
+ }
+ if (status.isSuccess()) {
+ outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+ FetcherOutput.SUCCESS),
+ content, new ParseText(parse.getText()), parse.getData());
+ } else { // fetch worked but parsing did not: still store the content, flagged CANT_PARSE
+ LOG.info("fetch okay, but can't parse " + url + ", reason: "
+ + status.toString());
+ outputPage(new FetcherOutput(fle, MD5Hash.digest(content.getContent()),
+ FetcherOutput.CANT_PARSE),
+ content, new ParseText(""),
+ new ParseData("", new Outlink[0], new Properties()));
+ }
+ return status; // the caller inspects the minor code to decide whether to follow a redirect
}
private void handleNoFetch(FetchListEntry fle, int status) {
Index: src/java/org/apache/nutch/parse/HtmlParseFilter.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilter.java (revision 164986)
+++ src/java/org/apache/nutch/parse/HtmlParseFilter.java (working copy)
@@ -30,6 +30,5 @@
/** Adds metadata or otherwise modifies a parse of HTML content, given
* the DOM tree of a page. */
- Parse filter(Content content, Parse parse, DocumentFragment doc)
- throws ParseException;
+ Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc);
}
Index: src/java/org/apache/nutch/parse/HtmlParseFilters.java
===================================================================
--- src/java/org/apache/nutch/parse/HtmlParseFilters.java (revision 164986)
+++ src/java/org/apache/nutch/parse/HtmlParseFilters.java (working copy)
@@ -45,11 +45,11 @@
private HtmlParseFilters() {} // no public ctor
/** Run all defined filters. */
- public static Parse filter(Content content,Parse parse,DocumentFragment doc)
- throws ParseException {
+ public static Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) {
for (int i = 0 ; i < CACHE.length; i++) {
- parse = CACHE[i].filter(content, parse, doc);
+ parse = CACHE[i].filter(content, parse, metaTags, doc);
+ if (!parse.getParseStatus().isSuccess()) break;
}
return parse;
Index: src/java/org/apache/nutch/parse/Parse.java
===================================================================
--- src/java/org/apache/nutch/parse/Parse.java (revision 164986)
+++ src/java/org/apache/nutch/parse/Parse.java (working copy)
@@ -20,10 +20,14 @@
* @see Parser#getParse(FetcherOutput,Content)
*/
public interface Parse {
+
/** The textual content of the page. This is indexed, searched, and used when
* generating snippets.*/
String getText();
/** Other data extracted from the page. */
ParseData getData();
+
+ /** Status of parsing. Text and data may be null when the status is not a success. */
+ ParseStatus getParseStatus();
}
Index: src/java/org/apache/nutch/parse/HTMLMetaTags.java
===================================================================
--- src/java/org/apache/nutch/parse/HTMLMetaTags.java (revision 0)
+++ src/java/org/apache/nutch/parse/HTMLMetaTags.java (revision 0)
@@ -0,0 +1,166 @@
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki <ab@getopt.org>
+ *
+ */
+package org.apache.nutch.parse;
+
+import java.net.URL;
+import java.util.Properties;
+
+/** Holds the robots, cache, refresh and base-href settings plus the raw META name/content pairs.
+ * @author Andrzej Bialecki <ab@getopt.org>
+ */
+public class HTMLMetaTags {
+ private boolean noIndex = false;
+
+ private boolean noFollow = false;
+
+ private boolean noCache = false;
+
+ private URL baseHref = null;
+
+ private boolean refresh = false;
+
+ private int refreshTime = 0;
+
+ private URL refreshHref = null;
+
+ private Properties generalTags = new Properties();
+
+ private Properties httpEquivTags = new Properties();
+
+ /**
+ * Sets all boolean values to <code>false</code>. Clears all other tags.
+ */
+ public void reset() {
+ noIndex = false;
+ noFollow = false;
+ noCache = false;
+ refresh = false;
+ refreshTime = 0;
+ baseHref = null;
+ refreshHref = null;
+ generalTags.clear();
+ httpEquivTags.clear();
+ }
+
+ /**
+ * Sets <code>noFollow</code> to <code>true</code>.
+ */
+ public void setNoFollow() {
+ noFollow = true;
+ }
+
+ /**
+ * Sets <code>noIndex</code> to <code>true</code>.
+ */
+ public void setNoIndex() {
+ noIndex = true;
+ }
+
+ /**
+ * Sets <code>noCache</code> to <code>true</code>.
+ */
+ public void setNoCache() {
+ noCache = true;
+ }
+
+ /**
+ * Sets <code>refresh</code> to <code>true</code>.
+ */
+ public void setRefresh() {
+ refresh = true;
+ }
+
+ /**
+ * Sets the <code>baseHref</code>.
+ */
+ public void setBaseHref(URL baseHref) {
+ this.baseHref = baseHref;
+ }
+
+ /**
+ * Sets the <code>refreshHref</code>.
+ */
+ public void setRefreshHref(URL refreshHref) {
+ this.refreshHref = refreshHref;
+ }
+
+ /**
+ * Sets the <code>refreshTime</code>.
+ */
+ public void setRefreshTime(int refreshTime) {
+ this.refreshTime = refreshTime;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>noIndex</code>.
+ */
+ public boolean getNoIndex() {
+ return noIndex;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>noFollow</code>.
+ */
+ public boolean getNoFollow() {
+ return noFollow;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>noCache</code>.
+ */
+ public boolean getNoCache() {
+ return noCache;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>refresh</code>.
+ */
+ public boolean getRefresh() {
+ return refresh;
+ }
+
+ /**
+ * A convenience method. Returns the <code>baseHref</code>, if set, or
+ * <code>null</code> otherwise.
+ */
+ public URL getBaseHref() {
+ return baseHref;
+ }
+
+ /**
+ * A convenience method. Returns the <code>refreshHref</code>, if set, or
+ * <code>null</code> otherwise. The value may be invalid if
+ * {@link #getRefresh()} returns <code>false</code>.
+ */
+ public URL getRefreshHref() {
+ return refreshHref;
+ }
+
+ /**
+ * A convenience method. Returns the current value of <code>refreshTime</code>.
+ * The value may be invalid if {@link #getRefresh()} returns
+ * <code>false</code>.
+ */
+ public int getRefreshTime() {
+ return refreshTime;
+ }
+
+ /**
+ * Returns all collected values of the general meta tags. Property names are
+ * tag names, property values are "content" values.
+ */
+ public Properties getGeneralTags() {
+ return generalTags;
+ }
+
+ /**
+ * Returns all collected values of the "http-equiv" meta tags. Property names
+ * are tag names, property values are "content" values.
+ */
+ public Properties getHttpEquivTags() {
+ return httpEquivTags;
+ }
+}
Index: src/java/org/apache/nutch/parse/ParseImpl.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseImpl.java (revision 164986)
+++ src/java/org/apache/nutch/parse/ParseImpl.java (working copy)
@@ -22,13 +22,23 @@
public class ParseImpl implements Parse {
private String text;
private ParseData data;
+ private ParseStatus status;
+ public ParseImpl(String text, ParseData data, ParseStatus status) { // caller supplies the status explicitly
+ this.text = text;
+ this.data = data;
+ this.status = status;
+ }
+
public ParseImpl(String text, ParseData data) {
this.text = text;
this.data = data;
+ this.status = new ParseStatus(ParseStatus.SUCCESS); // no status given: report plain success
}
public String getText() { return text; }
public ParseData getData() { return data; }
+
+ public ParseStatus getParseStatus() { return status; }
}
Index: src/java/org/apache/nutch/parse/Parser.java
===================================================================
--- src/java/org/apache/nutch/parse/Parser.java (revision 164986)
+++ src/java/org/apache/nutch/parse/Parser.java (working copy)
@@ -27,5 +27,5 @@
public final static String X_POINT_ID = Parser.class.getName();
/** Creates the parse for some content. */
- Parse getParse(Content c) throws ParseException;
+ Parse getParse(Content c);
}
Index: src/java/org/apache/nutch/parse/ParseStatus.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseStatus.java (revision 0)
+++ src/java/org/apache/nutch/parse/ParseStatus.java (revision 0)
@@ -0,0 +1,161 @@
+/*
+ * Created on Apr 28, 2005
+ * Author: Andrzej Bialecki <ab@getopt.org>
+ *
+ */
+package org.apache.nutch.parse;
+
+/**
+ * @author Andrzej Bialecki <ab@getopt.org>
+ */
+public class ParseStatus {
+
+ // Primary status codes:
+
+ /** Parsing succeeded. */
+ public static final int SUCCESS = 1;
+ /** General failure. There may be a more specific error message in arguments. */
+ public static final int FAILED = 2;
+
+ public static final String[] majorCodes = {
+ "undefined",
+ "success",
+ "failed"
+ };
+
+ // Secondary success codes go here:
+
+ /** Parsed content contains a directive to redirect to another URL.
+ * The target URL can be retrieved from the arguments.
+ */
+ public static final int SUCCESS_REDIRECT = 100;
+
+ // Secondary failure codes go here:
+
+ /** Parsing failed. An Exception occured (which may be retrieved from the arguments). */
+ public static final int FAILED_EXCEPTION = 200;
+ /** Parsing failed. Invalid format - the content may be corrupted or of wrong type. */
+ public static final int FAILED_TRUNCATED = 202;
+ /** Parsing failed. Invalid format - the content may be corrupted or of wrong type. */
+ public static final int FAILED_INVALID_FORMAT = 203;
+ /** Parsing failed. Other related parts of the content are needed to complete
+ * parsing. The list of URLs to missing parts may be provided in arguments.
+ * The Fetcher may decide to fetch these parts at once, then put them into
+ * Content.metadata, and supply them for re-parsing.
+ */
+ public static final int FAILED_MISSING_PARTS = 204;
+
+
+ public static final ParseStatus STATUS_SUCCESS = new ParseStatus(SUCCESS);
+ public static final ParseStatus STATUS_FAILURE = new ParseStatus(FAILED);
+
+ private int majorCode = 0;
+ private int minorCode = 0;
+ private Object[] args = null;
+
+ public ParseStatus(int majorCode, int minorCode, Object[] args) {
+ this.args = args;
+ this.majorCode = majorCode;
+ this.minorCode = minorCode;
+ }
+
+ public ParseStatus(int majorCode) {
+ this(majorCode, 0, (Object[])null);
+ }
+
+ public ParseStatus(int majorCode, Object[] args) {
+ this(majorCode, 0, args);
+ }
+
+ public ParseStatus(int majorCode, int minorCode) {
+ this(majorCode, minorCode, (Object[])null);
+ }
+
+ /** Simplified constructor for passing just a text message. */
+ public ParseStatus(int majorCode, int minorCode, String message) {
+ this(majorCode, minorCode, new Object[]{message});
+ }
+
+ /** Simplified constructor for passing just a text message. */
+ public ParseStatus(int majorCode, String message) {
+ this(majorCode, 0, new Object[]{message});
+ }
+
+ public ParseStatus(Throwable t) {
+ this(FAILED, FAILED_EXCEPTION, new Object[]{t});
+ }
+
+ /** A convenience method. Returns true if majorCode is SUCCESS, false
+ * otherwise.
+ */
+
+ public boolean isSuccess() {
+ return majorCode == SUCCESS;
+ }
+
+ /** A convenience method. Return a String representation of the first
+ * argument, or null.
+ */
+ public String getMessage() {
+ if (args != null && args.length > 0 && args[0] != null)
+ return args[0].toString();
+ return null;
+ }
+
+ public Object[] getArgs() {
+ return args;
+ }
+
+ public int getMajorCode() {
+ return majorCode;
+ }
+
+ public int getMinorCode() {
+ return minorCode;
+ }
+
+ /** A convenience method. Creates an empty Parse instance,
+ * which returns this status.
+ */
+ public Parse getEmptyParse() {
+ return new EmptyParseImpl(this);
+ }
+
+ public String toString() {
+ StringBuffer res = new StringBuffer();
+ res.append(majorCodes[majorCode] + "(" + majorCode + "," + minorCode + ")");
+ if (args != null) {
+ if (args.length == 1) {
+ res.append(": " + args[0].toString());
+ } else {
+ for (int i = 0; i < args.length; i++) {
+ if (args[i] != null)
+ res.append(", args[" + i + "]=" + args[i].toString());
+ }
+ }
+ }
+ return res.toString();
+ }
+}
+
+/** Minimal {@link Parse} that carries only a status; it has no text and no data. */
+class EmptyParseImpl implements Parse {
+  private final ParseStatus status;
+
+  public EmptyParseImpl(ParseStatus status) {
+    this.status = status;
+  }
+
+  public String getText() {
+    return null;
+  }
+
+  public ParseData getData() {
+    return null;
+  }
+
+  public ParseStatus getParseStatus() {
+    return status;
+  }
+}
+
Index: src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
===================================================================
--- src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (revision 164986)
+++ src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (working copy)
@@ -27,6 +27,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -79,13 +80,13 @@
rootLogger.addAppender(appender);
}
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/pdf"))
- throw new ParseException(
- "Content-Type not application/pdf: "+contentType);
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+ "Content-Type not application/pdf: " + contentType).getEmptyParse();
// in memory representation of pdf file
PDDocument pdf = null;
@@ -100,8 +101,9 @@
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
- throw new ParseException("Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete pdf file.");
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+ "Content truncated at "+raw.length
+ +" bytes. Parser can't handle incomplete pdf file.").getEmptyParse();
}
PDFParser parser = new PDFParser(
@@ -134,14 +136,15 @@
// formatDate(info.getCreationDate())
// formatDate(info.getModificationDate())
- } catch (ParseException e) {
- throw e;
} catch (CryptographyException e) {
- throw new ParseException("Error decrypting document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Error decrypting document. " + e).getEmptyParse();
} catch (InvalidPasswordException e) {
- throw new ParseException("Can't decrypt document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't decrypt document - invalid password. " + e).getEmptyParse();
} catch (Exception e) { // run time exception
- throw new ParseException("Can't be handled as pdf document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as pdf document. " + e).getEmptyParse();
} finally {
try {
if (pdf != null)
@@ -165,7 +168,7 @@
metadata.putAll(content.getMetadata()); // copy through
ParseData parseData = new ParseData(title, outlinks, metadata);
- return new ParseImpl(text, parseData);
+ return new ParseImpl(text, parseData, ParseStatus.STATUS_SUCCESS);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
}
Index: src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java
===================================================================
--- src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (revision 164986)
+++ src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/MSWordParser.java (working copy)
@@ -18,6 +18,7 @@
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -52,13 +53,13 @@
public MSWordParser () {}
- public Parse getParse(Content content) throws ParseException {
+ public Parse getParse(Content content) {
// check that contentType is one we can handle
String contentType = content.getContentType();
if (contentType != null && !contentType.startsWith("application/msword"))
- throw new ParseException(
- "Content-Type not application/msword: "+contentType);
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_INVALID_FORMAT,
+ "Content-Type not application/msword: " + contentType).getEmptyParse();
String text = null;
String title = null;
@@ -71,8 +72,9 @@
String contentLength = content.get("Content-Length");
if (contentLength != null
&& raw.length != Integer.parseInt(contentLength)) {
- throw new ParseException("Content truncated at "+raw.length
- +" bytes. Parser can't handle incomplete msword file.");
+ return new ParseStatus(ParseStatus.FAILED, ParseStatus.FAILED_TRUNCATED,
+ "Content truncated at " + raw.length
+ +" bytes. Parser can't handle incomplete msword file.").getEmptyParse();
}
WordExtractor extractor = new WordExtractor();
@@ -86,13 +88,14 @@
extractor = null;
} catch (ParseException e) {
- throw e;
+ return new ParseStatus(e).getEmptyParse();
} catch (FastSavedException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (PasswordProtectedException e) {
- throw new ParseException(e);
+ return new ParseStatus(e).getEmptyParse();
} catch (Exception e) { // run time exception
- throw new ParseException("Can't be handled as msword document. "+e);
+ return new ParseStatus(ParseStatus.FAILED,
+ "Can't be handled as msword document. " + e).getEmptyParse();
} finally {
// nothing so far
}
@@ -117,7 +120,7 @@
Outlink[] outlinks = new Outlink[0];
ParseData parseData = new ParseData(title, outlinks, metadata);
- return new ParseImpl(text, parseData);
+ return new ParseImpl(text, parseData, ParseStatus.STATUS_SUCCESS);
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
}
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (revision 164986)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (working copy)
@@ -14,9 +14,9 @@
* limitations under the License.
*/
package org.apache.nutch.analysis.lang;
+import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.ParseException;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.*;
@@ -38,8 +38,7 @@
*
+ /**
+ * Sets the indicators in <code>robotsMeta</code> to appropriate
+ * values, based on any META tags found under the given
+ * <code>node</code>.
+ */
+ public static final void getMetaTags (
+ HTMLMetaTags metaTags, Node node, URL currURL) {
+ // clear whatever metaTags already holds before collecting from this tree
+ metaTags.reset();
+ getMetaTagsHelper(metaTags, node, currURL);
+ }
+
+ /**
+  * Walks the DOM below <code>node</code> depth-first and records what its META
+  * and BASE elements say into <code>metaTags</code>. The walk does not descend
+  * into BODY. Element names are compared case-insensitively.
+  *
+  * @param metaTags receives the collected flags, URLs and raw name/content pairs
+  * @param node     root of the (sub)tree to inspect
+  * @param currURL  URL of the page, used to resolve relative refresh and base
+  *                 targets; may be null
+  */
+ private static final void getMetaTagsHelper(
+     HTMLMetaTags metaTags, Node node, URL currURL) {
+
+   if (node.getNodeType() == Node.ELEMENT_NODE) {
+     String tag = node.getNodeName();
+
+     if ("body".equalsIgnoreCase(tag)) {
+       // META tags should not be under body
+       return;
+     }
+
+     if ("meta".equalsIgnoreCase(tag)) {
+       NamedNodeMap attrs = node.getAttributes();
+       Node nameNode = attrs.getNamedItem("name");
+       Node equivNode = attrs.getNamedItem("http-equiv");
+       Node contentNode = attrs.getNamedItem("content");
+
+       if (nameNode != null && contentNode != null) {
+         String name = nameNode.getNodeValue().toLowerCase();
+         metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+         if ("robots".equals(name)) {
+           String directives = contentNode.getNodeValue().toLowerCase();
+
+           // "none" stands for both restrictions; "all" asks for nothing
+           if (directives.indexOf("none") >= 0) {
+             metaTags.setNoIndex();
+             metaTags.setNoFollow();
+           }
+           if (directives.indexOf("noindex") >= 0) {
+             metaTags.setNoIndex();
+           }
+           if (directives.indexOf("nofollow") >= 0) {
+             metaTags.setNoFollow();
+           }
+         } // end if (name == robots)
+       }
+
+       if (equivNode != null && contentNode != null) {
+         String name = equivNode.getNodeValue().toLowerCase();
+         String content = contentNode.getNodeValue();
+         metaTags.getHttpEquivTags().setProperty(name, content);
+         if ("pragma".equals(name)) {
+           if (content.toLowerCase().indexOf("no-cache") >= 0)
+             metaTags.setNoCache();
+         } else if ("refresh".equals(name)) {
+           // content is "<seconds>" or "<seconds>; url=<target>"
+           int idx = content.indexOf(';');
+           String time = (idx == -1) ? content : content.substring(0, idx);
+           try {
+             metaTags.setRefreshTime(Integer.parseInt(time.trim()));
+             // only reached when the delay parsed cleanly
+             metaTags.setRefresh();
+           } catch (NumberFormatException e) {
+             // malformed delay: the refresh flag stays unset and the target is ignored
+           }
+           if (metaTags.getRefresh() && idx != -1) { // set the URL
+             // find "url=" after the delay, ignoring case ("url=" is the common spelling)
+             int urlIdx = -1;
+             for (int i = idx + 1; urlIdx == -1 && i + 4 <= content.length(); i++) {
+               if (content.regionMatches(true, i, "url=", 0, 4)) urlIdx = i;
+             }
+             if (urlIdx != -1) {
+               String url = content.substring(urlIdx + 4).trim();
+               URL refreshUrl = null;
+               try {
+                 refreshUrl = new URL(url);
+               } catch (Exception e) {
+                 // not absolute: try it relative to the current page
+                 try {
+                   refreshUrl = new URL(currURL, url);
+                 } catch (Exception e1) {
+                   // unresolvable target: refreshHref is set to null below
+                 }
+               }
+               metaTags.setRefreshHref(refreshUrl);
+             }
+           }
+         }
+       }
+
+     } else if ("base".equalsIgnoreCase(tag)) {
+       NamedNodeMap attrs = node.getAttributes();
+       Node hrefNode = attrs.getNamedItem("href");
+
+       if (hrefNode != null) {
+         String urlString = hrefNode.getNodeValue();
+
+         URL url = null;
+         try {
+           if (currURL == null)
+             url = new URL(urlString);
+           else
+             url = new URL(currURL, urlString);
+         } catch (Exception e) {
+           // malformed base href: leave any previously recorded base untouched
+         }
+
+         if (url != null)
+           metaTags.setBaseHref(url);
+       }
+
+     }
+
+   }
+
+   NodeList children = node.getChildNodes();
+   if (children != null) {
+     int len = children.getLength();
+     for (int i = 0; i < len; i++) {
+       getMetaTagsHelper(metaTags, children.item(i), currURL);
+     }
+   }
+ }
+
+}