Index: conf/schema.xml
===================================================================
--- conf/schema.xml (revision 1597753)
+++ conf/schema.xml (working copy)
@@ -38,6 +38,7 @@
omitNorms="true" positionIncrementGap="0"/>
+ <fieldType name="binary" class="solr.BinaryField"/>
@@ -117,6 +118,9 @@
+ <field name="binaryContent" type="binary"
+ stored="true" indexed="false"/>
+
<uniqueKey>id</uniqueKey>
<defaultSearchField>content</defaultSearchField>
Index: conf/schema-solr4.xml
===================================================================
--- conf/schema-solr4.xml (revision 1597753)
+++ conf/schema-solr4.xml (working copy)
@@ -31,8 +31,8 @@
+
-
@@ -348,6 +348,9 @@
+ <field name="binaryContent" type="binary"
+ stored="true" indexed="false"/>
+
<uniqueKey>id</uniqueKey>
<defaultSearchField>text</defaultSearchField>
Index: src/java/org/apache/nutch/indexer/IndexingJob.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingJob.java (revision 1597753)
+++ src/java/org/apache/nutch/indexer/IndexingJob.java (working copy)
@@ -72,11 +72,26 @@
index(crawlDb, linkDb, segments, noCommit, deleteGone, params, false,
false);
}
-
+
public void index(Path crawlDb, Path linkDb, List<Path> segments,
boolean noCommit, boolean deleteGone, String params,
boolean filter, boolean normalize) throws IOException {
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ normalize, false);
+ }
+
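+ /** Convenience overload: adds the raw fetched content to the index
+ * documents without Base64 encoding. */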
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+ boolean noCommit, boolean deleteGone, String params,
+ boolean filter, boolean normalize, boolean addBinaryContent) throws IOException {
+ index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+ normalize, addBinaryContent, false);
+ }
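+
+ /** Full form: when addBinaryContent is true the raw fetched content of
+ * each page is added to its index document as a "binaryContent" field;
+ * when base64 is also true that content is Base64-encoded first. */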
+ public void index(Path crawlDb, Path linkDb, List<Path> segments,
+ boolean noCommit, boolean deleteGone, String params,
+ boolean filter, boolean normalize, boolean addBinaryContent,
+ boolean base64) throws IOException {
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("Indexer: starting at " + sdf.format(start));
@@ -86,12 +101,19 @@
LOG.info("Indexer: deleting gone documents: " + deleteGone);
LOG.info("Indexer: URL filtering: " + filter);
- LOG.info("Indexer: URL normalizing: " + normalize);
+ LOG.info("Indexer: URL normalizing: " + normalize);
+ if (addBinaryContent) {
+ if (base64) {
+ LOG.info("Indexer: adding binary content as Base64");
+ } else {
+ LOG.info("Indexer: adding binary content");
+ }
+ }
IndexWriters writers = new IndexWriters(getConf());
LOG.info(writers.describe());
- IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
+ IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job, addBinaryContent);
// NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
// job.set(SolrConstants.SERVER_URL, solrUrl);
@@ -99,7 +121,8 @@
job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
-
+ job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);
+
if (params != null) {
job.set(IndexerMapReduce.INDEXER_PARAMS, params);
}
@@ -128,7 +151,7 @@
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
- .println("Usage: Indexer [-linkdb ] [-params k1=v1&k2=v2...] ( ... | -dir ) [-noCommit] [-deleteGone] [-filter] [-normalize]");
+ .println("Usage: Indexer [-linkdb ] [-params k1=v1&k2=v2...] ( ... | -dir ) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]");
IndexWriters writers = new IndexWriters(getConf());
System.err.println(writers.describe());
return -1;
@@ -144,6 +167,8 @@
boolean deleteGone = false;
boolean filter = false;
boolean normalize = false;
+ boolean addBinaryContent = false;
+ boolean base64 = false;
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-linkdb")) {
@@ -165,6 +190,10 @@
filter = true;
} else if (args[i].equals("-normalize")) {
normalize = true;
+ } else if (args[i].equals("-addBinaryContent")) {
+ addBinaryContent = true;
+ } else if (args[i].equals("-base64")) {
+ base64 = true;
} else if (args[i].equals("-params")) {
params = args[++i];
} else {
@@ -174,7 +203,7 @@
try {
index(crawlDb, linkDb, segments, noCommit, deleteGone, params,
- filter, normalize);
+ filter, normalize, addBinaryContent, base64);
return 0;
} catch (final Exception e) {
LOG.error("Indexer: " + StringUtils.stringifyException(e));
Index: src/java/org/apache/nutch/indexer/IndexerMapReduce.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexerMapReduce.java (revision 1597753)
+++ src/java/org/apache/nutch/indexer/IndexerMapReduce.java (working copy)
@@ -47,8 +47,11 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.commons.codec.binary.Base64;
public class IndexerMapReduce extends Configured
implements Mapper<Text, Writable, Text, NutchWritable>,
@@ -62,10 +65,12 @@
public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
public static final String URL_FILTERING = "indexer.url.filters";
public static final String URL_NORMALIZING = "indexer.url.normalizers";
-
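+ // configuration key: Base64-encode the binary content added to documents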
+ public static final String INDEXER_BINARY_AS_BASE64 = "indexer.binary.base64";
+
private boolean skip = false;
private boolean delete = false;
private boolean deleteRobotsNoIndex = false;
+ private boolean base64 = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
@@ -84,6 +89,7 @@
this.delete = job.getBoolean(INDEXER_DELETE, false);
this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false);
this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
+ this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false);
normalize = job.getBoolean(URL_NORMALIZING, false);
filter = job.getBoolean(URL_FILTERING, false);
@@ -165,6 +171,7 @@
OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
throws IOException {
Inlinks inlinks = null;
+ Content content = null;
CrawlDatum dbDatum = null;
CrawlDatum fetchDatum = null;
ParseData parseData = null;
@@ -209,6 +216,8 @@
}
} else if (value instanceof ParseText) {
parseText = (ParseText)value;
+ } else if (value instanceof Content) {
+ content = (Content)value;
} else if (LOG.isWarnEnabled()) {
LOG.warn("Unrecognized type: "+value.getClass());
}
@@ -314,6 +323,18 @@
doc.setWeight(boost);
// store boost for use by explain and dedup
doc.add("boost", Float.toString(boost));
+
+ if (content != null) {
+ String binary;
+ if (base64) {
+ // Base64-encode the raw fetched bytes directly so truly binary
+ // content (images, PDFs, ...) survives the round trip intact
+ binary = Base64.encodeBase64String(content.getContent());
+ } else {
+ // add the raw content as a plain string
+ binary = new String(content.getContent());
+ }
+ doc.add("binaryContent", binary);
+ }
reporter.incrCounter("IndexerStatus", "Documents added", 1);
@@ -325,7 +346,7 @@
public static void initMRJob(Path crawlDb, Path linkDb,
Collection<Path> segments,
- JobConf job) {
+ JobConf job, boolean addBinaryContent) {
LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
@@ -338,6 +359,10 @@
FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
+
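+ // also feed each segment's raw content directory to the job so the
+ // reducer can see the fetched bytes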
+ if (addBinaryContent) {
+ FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+ }
}
FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));