Index: src/java/org/apache/nutch/metadata/Nutch.java
===================================================================
--- src/java/org/apache/nutch/metadata/Nutch.java (revision 424960)
+++ src/java/org/apache/nutch/metadata/Nutch.java (working copy)
@@ -29,5 +29,20 @@
public static final String CHAR_ENCODING_FOR_CONVERSION =
"CharEncodingForConversion";
-
+
+ /** Metadata key under which a site's request that search engines not serve cached copies is recorded; the value is expected to be one of the CACHING_FORBIDDEN_* policy strings. */
+ public static final String CACHING_FORBIDDEN =
+ "CachingForbidden";
+
+ /** Policy value: no restriction, so both the original (cached) content and summaries may be shown. */
+ public static final String CACHING_FORBIDDEN_NONE =
+ "none";
+
+ /** Policy value: show neither the original (cached) content nor summaries. */
+ public static final String CACHING_FORBIDDEN_ALL =
+ "all";
+
+ /** Policy value: hide the original (cached) content, but still show summaries. */
+ public static final String CACHING_FORBIDDEN_CONTENT =
+ "content";
}
Index: src/web/jsp/search.jsp
===================================================================
--- src/web/jsp/search.jsp (revision 424960)
+++ src/web/jsp/search.jsp (working copy)
@@ -8,6 +8,7 @@
import="java.net.*"
import="org.apache.nutch.html.Entities"
+ import="org.apache.nutch.metadata.Nutch"
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.plugin.*"
import="org.apache.nutch.clustering.*"
@@ -212,6 +213,9 @@
String url = detail.getValue("url");
String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
String summary = summaries[i].toHtml(true);
+ String caching = detail.getValue("cache"); // may be null: the field is optional, and absence means no restriction
+ boolean showSummary = caching == null || !caching.equals(Nutch.CACHING_FORBIDDEN_ALL); // summaries are hidden only under the "all" policy
+ boolean showCached = caching == null || caching.equals(Nutch.CACHING_FORBIDDEN_NONE); // any policy other than "none" hides the cached link
if (title == null || title.equals("")) { // use url for docs w/o title
title = url;
@@ -219,12 +223,16 @@
%>
<%=Entities.encode(title)%>
<%@ include file="more.jsp" %>
- <% if (!"".equals(summary)) { %>
+ <% if (!"".equals(summary) && showSummary) { %>
<%=summary%>
<% } %>
<%=Entities.encode(url)%>
- ()
+ <%
+ if (showCached) {
+ %>() <%
+ }
+ %>
(&lang=<%=queryLang%>">)
()
<% if (hit.moreFromDupExcluded()) {
Index: src/web/jsp/cached.jsp
===================================================================
--- src/web/jsp/cached.jsp (revision 424960)
+++ src/web/jsp/cached.jsp (working copy)
@@ -7,6 +7,7 @@
import="org.apache.nutch.searcher.*"
import="org.apache.nutch.parse.ParseData"
import="org.apache.nutch.metadata.Metadata"
+ import="org.apache.nutch.metadata.Nutch"
import="org.apache.hadoop.conf.Configuration"
import="org.apache.nutch.util.NutchConfiguration"
%><%
@@ -66,6 +67,17 @@
FIXME: have to sanitize 'content' : e.g. removing unncessary part
of head elememt
-->
+<%
+ String caching = details.getValue("cache"); // caching policy stored with this hit, if any
+ String url = details.getValue("url");
+ if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { // any policy other than "none" blocks the cached copy
+%>
+Display of this content was administratively prohibited by the webmaster.
+You may visit the original page instead: <%=org.apache.nutch.html.Entities.encode(url)%>.
+<%
+ return; // stop here so none of the cached content below is rendered
+ }
+%>
<% if (contentType.startsWith("text/html")) {%>
<% if (content != null && !content.equals("")) {%>
Index: src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
===================================================================
--- src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (revision 424960)
+++ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (working copy)
@@ -22,6 +22,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
@@ -87,6 +88,11 @@
}
// add title indexed and stored so that it can be displayed
doc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
+
+ String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN); // caching policy recorded in the parse metadata, if any
+ if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) { // only record an actual restriction
+ doc.add(new Field("cache", caching, Field.Store.YES, Field.Index.NO)); // stored for display-time checks, not searchable
+ }
return doc;
}
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (revision 424960)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (working copy)
@@ -32,6 +32,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.*;
@@ -99,6 +100,8 @@
private DOMContentUtils utils;
private HtmlParseFilters htmlParseFilters;
+
+ private String cachingPolicy;
public Parse getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();
@@ -201,10 +204,6 @@
}
}
- if (!metaTags.getNoCache()) { // okay to cache
- // ??? FIXME ???
- }
-
ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
if (metaTags.getRefresh()) {
status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
@@ -216,7 +215,11 @@
Parse parse = new ParseImpl(text, parseData);
// run filters on parse
- return this.htmlParseFilters.filter(content, parse, metaTags, root);
+ parse = this.htmlParseFilters.filter(content, parse, metaTags, root);
+ if (metaTags.getNoCache()) { // page asked not to be cached: record the configured policy
+ parse.getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN, cachingPolicy);
+ }
+ return parse;
}
private DocumentFragment parse(InputSource input) throws Exception {
@@ -300,6 +303,7 @@
this.parserImpl = getConf().get("parser.html.impl", "neko");
this.defaultCharEncoding = getConf().get(
"parser.character.encoding.default", "windows-1252");
+ this.cachingPolicy = getConf().get("parser.html.caching.policy", Nutch.CACHING_FORBIDDEN_CONTENT);
this.utils = new DOMContentUtils(conf);
}
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (revision 424960)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (working copy)
@@ -104,6 +104,11 @@
if (index >= 0) {
metaTags.setNoFollow();
}
+
+ index = directives.indexOf("noarchive"); // a "noarchive" robots directive sets the no-cache flag
+ if (index >= 0) {
+ metaTags.setNoCache();
+ }
}
} // end if (name == robots)