();
+ }
+ fieldValues.add(value);
+ fields.put(name, fieldValues);
+ }
+
+ /** Adds a lucene field.
+ *
+ * This method is provided for backward-compatibility with
+ * older indexing filters. This should not be used by newer
+ * implementations since this is slower than
+ * {@link NutchDocument#add(String, String)} and will be removed
+ * in a future release.
+ *
+ * @param f Lucene field to be added.
+ * @deprecated Use {@link NutchDocument#add(String, String)} instead and
+ * set index-level metadata for field information.
+ * */
+ public void add(Field f) {
+ String fieldName = f.name();
+ String key = LuceneConstants.FIELD_PREFIX + fieldName;
+ if (f.isStored()) {
+ documentMeta.add(key, LuceneConstants.STORE_YES);
+ } else {
+ documentMeta.add(key, LuceneConstants.STORE_NO);
+ }
+
+ if (f.isIndexed()) {
+ if (f.isTokenized()) {
+ documentMeta.add(key, LuceneConstants.INDEX_TOKENIZED);
+ } else {
+ documentMeta.add(key, LuceneConstants.INDEX_UNTOKENIZED);
+ }
+ } else {
+ documentMeta.add(key, LuceneConstants.INDEX_NO);
+ }
+ }
+
+ private void addFieldUnprotected(String name, String value) {
+ fields.get(name).add(value);
+ }
+
+ public String getFieldValue(String name) {
+ List<String> fieldValues = fields.get(name);
+ if (fieldValues == null) {
+ return null;
+ }
+ if (fieldValues.size() == 0) {
+ return null;
+ }
+ return fieldValues.get(0);
+ }
+
+ public List<String> getFieldValues(String name) {
+ return fields.get(name);
+ }
+
+ public List<String> removeField(String name) {
+ return fields.remove(name);
+ }
+
+ public Collection<String> getFieldNames() {
+ return fields.keySet();
+ }
+
+ /** Iterate over all fields. */
+ public Iterator<Map.Entry<String, List<String>>> fieldIterator() {
+ return fields.entrySet().iterator();
+ }
+
+ public float getScore() {
+ return score;
+ }
+
+ public void setScore(float score) {
+ this.score = score;
+ }
+
+ public Metadata getDocumentMeta() {
+ return documentMeta;
+ }
+
+ @Override
+ public void readFields(DataInput in) throws IOException {
+ byte version = in.readByte();
+ if (version != VERSION) {
+ throw new VersionMismatchException(VERSION, version);
+ }
+ int size = WritableUtils.readVInt(in);
+ for (int i = 0; i < size; i++) {
+ String name = Text.readString(in);
+ int numValues = WritableUtils.readVInt(in);
+ fields.put(name, new ArrayList<String>());
+ for (int j = 0; j < numValues; j++) {
+ String value = Text.readString(in);
+ addFieldUnprotected(name, value);
+ }
+ }
+ score = in.readFloat();
+ documentMeta.readFields(in);
+ }
+
+ @Override
+ public void write(DataOutput out) throws IOException {
+ out.writeByte(VERSION);
+ WritableUtils.writeVInt(out, fields.size());
+ for (Map.Entry<String, List<String>> entry : fields.entrySet()) {
+ Text.writeString(out, entry.getKey());
+ List<String> values = entry.getValue();
+ WritableUtils.writeVInt(out, values.size());
+ for (String value : values) {
+ Text.writeString(out, value);
+ }
+ }
+ out.writeFloat(score);
+ documentMeta.write(out);
+ }
+
+}
Index: src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java
===================================================================
--- src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java (revision 0)
+++ src/java/org/apache/nutch/indexer/lucene/LuceneWriter.java (revision 0)
@@ -0,0 +1,170 @@
+package org.apache.nutch.indexer.lucene;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.nutch.analysis.AnalyzerFactory;
+import org.apache.nutch.analysis.NutchAnalyzer;
+import org.apache.nutch.analysis.NutchDocumentAnalyzer;
+import org.apache.nutch.indexer.Indexer;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchIndexWriter;
+import org.apache.nutch.indexer.NutchSimilarity;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.LogUtil;
+
+public class LuceneWriter implements NutchIndexWriter {
+
+ private IndexWriter writer;
+
+ private AnalyzerFactory analyzerFactory;
+
+ private Path perm;
+
+ private Path temp;
+
+ private FileSystem fs;
+
+ private Map<String, Field.Store> fieldStore;
+
+ private Map<String, Field.Index> fieldIndex;
+
+ public LuceneWriter() {
+ fieldStore = new HashMap<String, Field.Store>();
+ fieldIndex = new HashMap<String, Field.Index>();
+ }
+
+ private Document createLuceneDoc(NutchDocument doc) {
+ Document out = new Document();
+
+ out.setBoost(doc.getScore());
+
+ Iterator<Map.Entry<String, List<String>>> iterator = doc.fieldIterator();
+ Metadata documentMeta = doc.getDocumentMeta();
+ while (iterator.hasNext()) {
+ Map.Entry<String, List<String>> entry = iterator.next();
+ String fieldName = entry.getKey();
+
+ Field.Store store = fieldStore.get(fieldName);
+ Field.Index index = fieldIndex.get(fieldName);
+
+ // default values
+ if (store == null) {
+ store = Field.Store.NO;
+ }
+
+ if (index == null) {
+ index = Field.Index.TOKENIZED;
+ }
+
+ // read document-level field information
+ String[] fieldMetas =
+ documentMeta.getValues(LuceneConstants.FIELD_PREFIX + fieldName);
+ if (fieldMetas.length != 0) {
+ for (String val : fieldMetas) {
+ if (LuceneConstants.STORE_YES.equals(val)) {
+ store = Field.Store.YES;
+ } else if (LuceneConstants.STORE_NO.equals(val)) {
+ store = Field.Store.NO;
+ } else if (LuceneConstants.INDEX_TOKENIZED.equals(val)) {
+ index = Field.Index.TOKENIZED;
+ } else if (LuceneConstants.INDEX_NO.equals(val)) {
+ index = Field.Index.NO;
+ } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(val)) {
+ index = Field.Index.UN_TOKENIZED;
+ } else if (LuceneConstants.INDEX_NO_NORMS.equals(val)) {
+ index = Field.Index.NO_NORMS;
+ }
+ }
+ }
+
+ for (String fieldValue : entry.getValue()) {
+ out.add(new Field(fieldName, fieldValue, store, index));
+ }
+ }
+
+ return out;
+ }
+
+ private void processIndexMeta(Metadata indexMeta) {
+ String[] names = indexMeta.names();
+ for (String key : names) {
+ if (!key.startsWith(LuceneConstants.FIELD_PREFIX)) {
+ continue;
+ }
+ String[] hints = indexMeta.getValues(key);
+ String fieldName =
+ key.substring(LuceneConstants.FIELD_PREFIX.length());
+ for (String hint : hints) {
+ if (LuceneConstants.STORE_YES.equals(hint)) {
+ fieldStore.put(fieldName, Field.Store.YES);
+ } else if (LuceneConstants.STORE_NO.equals(hint)) {
+ fieldStore.put(fieldName, Field.Store.NO);
+ } else if (LuceneConstants.INDEX_TOKENIZED.equals(hint)) {
+ fieldIndex.put(fieldName, Field.Index.TOKENIZED);
+ } else if (LuceneConstants.INDEX_NO.equals(hint)) {
+ fieldIndex.put(fieldName, Field.Index.NO);
+ } else if (LuceneConstants.INDEX_UNTOKENIZED.equals(hint)) {
+ fieldIndex.put(fieldName, Field.Index.UN_TOKENIZED);
+ } else if (LuceneConstants.INDEX_NO_NORMS.equals(hint)) {
+ fieldIndex.put(fieldName, Field.Index.NO_NORMS);
+ }
+ }
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ writer.optimize();
+ writer.close();
+ fs.completeLocalOutput(perm, temp); // copy to dfs
+ fs.createNewFile(new Path(perm, Indexer.DONE_NAME));
+ }
+
+ @Override
+ public void open(Configuration conf, Metadata indexMeta)
+ throws IOException {
+ this.fs = FileSystem.get(conf);
+ perm = new Path(indexMeta.get(LuceneConstants.OUTPUT_DIR));
+ temp = new Path(indexMeta.get(LuceneConstants.TEMP_OUTPUT_DIR));
+
+ fs.delete(perm); // delete old, if any
+ analyzerFactory = new AnalyzerFactory(conf);
+ writer = new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
+ new NutchDocumentAnalyzer(conf), true);
+
+ writer.setMergeFactor(conf.getInt("indexer.mergeFactor", 10));
+ writer.setMaxBufferedDocs(conf.getInt("indexer.minMergeDocs", 100));
+ writer.setMaxMergeDocs(conf
+ .getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
+ writer.setTermIndexInterval(conf.getInt("indexer.termIndexInterval", 128));
+ writer.setMaxFieldLength(conf.getInt("indexer.max.tokens", 10000));
+ writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG));
+ writer.setUseCompoundFile(false);
+ writer.setSimilarity(new NutchSimilarity());
+
+ processIndexMeta(indexMeta);
+ }
+
+ @Override
+ public void write(NutchDocument doc) throws IOException {
+ Document luceneDoc = createLuceneDoc(doc);
+ NutchAnalyzer analyzer = analyzerFactory.get(luceneDoc.get("lang"));
+ if (Indexer.LOG.isDebugEnabled()) {
+ Indexer.LOG.debug("Indexing [" + luceneDoc.get("url")
+ + "] with analyzer " + analyzer + " (" + luceneDoc.get("lang")
+ + ")");
+ }
+ writer.addDocument(luceneDoc, analyzer);
+
+ }
+}
Index: src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java
===================================================================
--- src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java (revision 0)
+++ src/java/org/apache/nutch/indexer/lucene/LuceneConstants.java (revision 0)
@@ -0,0 +1,23 @@
+package org.apache.nutch.indexer.lucene;
+
+public interface LuceneConstants {
+ public static final String LUCENE_PREFIX = "lucene.";
+
+ public static final String FIELD_PREFIX = LUCENE_PREFIX + "field.";
+
+ public static final String OUTPUT_DIR = LUCENE_PREFIX + "output.dir";
+
+ public static final String TEMP_OUTPUT_DIR = LUCENE_PREFIX + "tmp.dir";
+
+ public static final String STORE_YES = "store.yes";
+
+ public static final String STORE_NO = "store.no";
+
+ public static final String INDEX_NO = "index.no";
+
+ public static final String INDEX_NO_NORMS = "index.no_norms";
+
+ public static final String INDEX_TOKENIZED = "index.tokenized";
+
+ public static final String INDEX_UNTOKENIZED = "index.untokenized";
+}
Index: src/java/org/apache/nutch/indexer/IndexingFilters.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingFilters.java (revision 557522)
+++ src/java/org/apache/nutch/indexer/IndexingFilters.java (working copy)
@@ -24,8 +24,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.lucene.document.Document;
-
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.plugin.*;
import org.apache.nutch.parse.Parse;
import org.apache.hadoop.conf.Configuration;
@@ -41,7 +40,9 @@
public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
private IndexingFilter[] indexingFilters;
-
+
+ private Metadata indexMeta = null;
+
public IndexingFilters(Configuration conf) {
/* Get indexingfilter.order property */
String order = conf.get(INDEXINGFILTER_ORDER);
@@ -104,7 +105,7 @@
}
/** Run all defined filters. */
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum,
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
Inlinks inlinks) throws IndexingException {
for (int i = 0; i < this.indexingFilters.length; i++) {
doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
@@ -114,4 +115,15 @@
return doc;
}
+
+ /** Get index-level metadata. */
+ public Metadata getIndexMeta() {
+ if (indexMeta == null) {
+ indexMeta = new Metadata();
+ for (IndexingFilter filter : indexingFilters) {
+ filter.addIndexMeta(indexMeta);
+ }
+ }
+ return indexMeta;
+ }
}
Index: src/java/org/apache/nutch/indexer/Indexer.java
===================================================================
--- src/java/org/apache/nutch/indexer/Indexer.java (revision 557522)
+++ src/java/org/apache/nutch/indexer/Indexer.java (working copy)
@@ -25,17 +25,16 @@
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.parse.*;
-import org.apache.nutch.analysis.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
-import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -45,116 +44,107 @@
import org.apache.nutch.crawl.LinkDb;
import org.apache.nutch.crawl.NutchWritable;
-import org.apache.lucene.index.*;
-import org.apache.lucene.document.*;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
+import org.apache.nutch.indexer.lucene.LuceneWriter;
+import org.apache.nutch.indexer.solr.SolrConstants;
+import org.apache.nutch.indexer.solr.SolrWriter;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
/** Create indexes for segments. */
public class Indexer extends ToolBase implements Reducer, Mapper {
-
+
public static final String DONE_NAME = "index.done";
public static final Log LOG = LogFactory.getLog(Indexer.class);
- /** A utility class used to pass a lucene document from Indexer.reduce
- * to Indexer.OutputFormat.
- * Note: Despite its name, it can't properly wrap a lucene document - it
- * doesn't know how to serialize/deserialize a lucene document.
- */
- private static class LuceneDocumentWrapper implements Writable {
- private Document doc;
-
- public LuceneDocumentWrapper(Document doc) {
- this.doc = doc;
- }
-
- public Document get() {
- return doc;
- }
+ private static final String INDEX_META_KEY = "indexer.index.meta";
+ private static final String LUCENE_ENABLED_KEY = "indexer.lucene.backend.enabled";
+ private static final String SOLR_ENABLED_KEY = "indexer.solr.backend.enabled";
+
+ public static class OutputFormat extends org.apache.hadoop.mapred.OutputFormatBase {
- public void readFields(DataInput in) throws IOException {
- // intentionally left blank
- }
-
- public void write(DataOutput out) throws IOException {
- // intentionally left blank
- }
-
- }
+ public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
+ String name, final Progressable progress) throws IOException {
- /** Unwrap Lucene Documents created by reduce and add them to an index. */
- public static class OutputFormat
- extends org.apache.hadoop.mapred.OutputFormatBase {
- public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
- String name, final Progressable progress) throws IOException {
- final Path perm = new Path(job.getOutputPath(), name);
- final Path temp =
- job.getLocalPath("index/_"+Integer.toString(new Random().nextInt()));
+ final List<NutchIndexWriter> writers = new ArrayList<NutchIndexWriter>();
+ Metadata indexMeta = (Metadata) job.getObject(INDEX_META_KEY);
- fs.delete(perm); // delete old, if any
+ if (job.getBoolean(LUCENE_ENABLED_KEY, false)) {
+ // first set then add to make sure that previous values are removed.
+ indexMeta.set(LuceneConstants.FIELD_PREFIX + "segment",
+ LuceneConstants.STORE_YES);
+ indexMeta.add(LuceneConstants.FIELD_PREFIX + "segment",
+ LuceneConstants.INDEX_NO);
+
+ indexMeta.set(LuceneConstants.FIELD_PREFIX + "digest",
+ LuceneConstants.STORE_YES);
+ indexMeta.add(LuceneConstants.FIELD_PREFIX + "digest",
+ LuceneConstants.INDEX_NO);
+
+ indexMeta.set(LuceneConstants.FIELD_PREFIX + "boost",
+ LuceneConstants.STORE_YES);
+ indexMeta.add(LuceneConstants.FIELD_PREFIX + "boost",
+ LuceneConstants.INDEX_NO);
+
+ indexMeta.set(LuceneConstants.OUTPUT_DIR,
+ new Path(job.getOutputPath(), name).toString());
+
+ Path temp = job.getLocalPath("index/_" +
+ Integer.toString(new Random().nextInt()));
+ indexMeta.set(LuceneConstants.TEMP_OUTPUT_DIR, temp.toString());
- final AnalyzerFactory factory = new AnalyzerFactory(job);
- final IndexWriter writer = // build locally first
- new IndexWriter(fs.startLocalOutput(perm, temp).toString(),
- new NutchDocumentAnalyzer(job), true);
+ writers.add(new LuceneWriter());
+ }
- writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
- writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
- writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
- writer.setTermIndexInterval
- (job.getInt("indexer.termIndexInterval", 128));
- writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
- writer.setInfoStream(LogUtil.getInfoStream(LOG));
- writer.setUseCompoundFile(false);
- writer.setSimilarity(new NutchSimilarity());
+ if (job.getBoolean(SOLR_ENABLED_KEY, false)) {
+ indexMeta.set(SolrConstants.SERVER_URL, job.get("indexer.solr.url"));
+ writers.add(new SolrWriter());
+ }
+
+ for (NutchIndexWriter writer : writers) {
+ writer.open(job, indexMeta);
+ }
return new RecordWriter() {
- boolean closed;
+ boolean closed;
- public void write(WritableComparable key, Writable value)
- throws IOException { // unwrap & index doc
- Document doc = ((LuceneDocumentWrapper) value).get();
- NutchAnalyzer analyzer = factory.get(doc.get("lang"));
- if (LOG.isInfoEnabled()) {
- LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
- " with analyzer " + analyzer +
- " (" + doc.get("lang") + ")");
- }
- writer.addDocument(doc, analyzer);
- progress.progress();
+ public void write(WritableComparable key, Writable value)
+ throws IOException { // unwrap & index doc
+ NutchDocument doc = (NutchDocument)value;
+ for (NutchIndexWriter writer : writers) {
+ writer.write(doc);
}
-
- public void close(final Reporter reporter) throws IOException {
- // spawn a thread to give progress heartbeats
- Thread prog = new Thread() {
- public void run() {
- while (!closed) {
- try {
- reporter.setStatus("closing");
- Thread.sleep(1000);
- } catch (InterruptedException e) { continue; }
- catch (Throwable e) { return; }
- }
- }
- };
+ progress.progress();
+ }
- try {
- prog.start();
- if (LOG.isInfoEnabled()) { LOG.info("Optimizing index."); }
- // optimize & close index
- writer.optimize();
+ public void close(final Reporter reporter) throws IOException {
+ // spawn a thread to give progress heartbeats
+ Thread prog = new Thread() {
+ public void run() {
+ while (!closed) {
+ try {
+ reporter.setStatus("closing");
+ Thread.sleep(1000);
+ } catch (InterruptedException e) { continue; }
+ catch (Throwable e) { return; }
+ }
+ }
+ };
+
+ try {
+ prog.start();
+ for (NutchIndexWriter writer : writers) {
writer.close();
- fs.completeLocalOutput(perm, temp); // copy to dfs
- fs.createNewFile(new Path(perm, DONE_NAME));
- } finally {
- closed = true;
}
+ } finally {
+ closed = true;
}
- };
+ }
+ };
}
}
-
+
private IndexingFilters filters;
private ScoringFilters scfilters;
@@ -170,9 +160,17 @@
setConf(job);
this.filters = new IndexingFilters(getConf());
this.scfilters = new ScoringFilters(getConf());
+
+ // TODO: HACK... Find a proper way to pass indexMeta to OutputFormat.
+ job.setObject(INDEX_META_KEY, this.filters.getIndexMeta());
}
public void close() {}
+
+ public void map(WritableComparable key, Writable value,
+ OutputCollector output, Reporter reporter) throws IOException {
+ output.collect(key, new NutchWritable(value));
+ }
public void reduce(WritableComparable key, Iterator values,
OutputCollector output, Reporter reporter)
@@ -217,27 +215,16 @@
if (!parseData.getStatus().isSuccess()) {
return;
}
-
- Document doc = new Document();
+
+ NutchDocument doc = new NutchDocument();
Metadata metadata = parseData.getContentMeta();
// add segment, used to map from merged index back to segment files
- doc.add(new Field("segment", metadata.get(Nutch.SEGMENT_NAME_KEY),
- Field.Store.YES, Field.Index.NO));
+ doc.add("segment", metadata.get(Nutch.SEGMENT_NAME_KEY));
// add digest, used by dedup
- doc.add(new Field("digest", metadata.get(Nutch.SIGNATURE_KEY),
- Field.Store.YES, Field.Index.NO));
+ doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
-// if (LOG.isInfoEnabled()) {
-// LOG.info("Url: "+key.toString());
-// LOG.info("Title: "+parseData.getTitle());
-// LOG.info(crawlDatum.toString());
-// if (inlinks != null) {
-// LOG.info(inlinks.toString());
-// }
-// }
-
Parse parse = new ParseImpl(parseText, parseData);
try {
// run indexing filters
@@ -262,33 +249,46 @@
return;
}
// apply boost to all indexed fields.
- doc.setBoost(boost);
+ doc.setScore(boost);
// store boost for use by explain and dedup
- doc.add(new Field("boost", Float.toString(boost),
- Field.Store.YES, Field.Index.NO));
+ doc.add("boost", Float.toString(boost));
- output.collect(key, new LuceneDocumentWrapper(doc));
+ output.collect(key, doc);
}
+
+ public void index(Path luceneDir, String solrUrl, Path crawlDb,
+ Path linkDb, Collection<Path> segments)
+ throws IOException {
- public void index(Path indexDir, Path crawlDb, Path linkDb, Path[] segments)
- throws IOException {
+ LOG.info("Indexer: starting");
+ LOG.info("Indexer: crawldb: " + crawlDb);
+ LOG.info("Indexer: linkdb: " + linkDb);
- if (LOG.isInfoEnabled()) {
- LOG.info("Indexer: starting");
- LOG.info("Indexer: linkdb: " + linkDb);
+ JobConf job = new NutchJob(getConf());
+ String jobName = "index";
+ if (luceneDir != null) {
+ jobName += " lucene=" + luceneDir;
+ job.setBoolean(LUCENE_ENABLED_KEY, true);
+ LOG.info("Indexer: luceneDir: " + luceneDir);
+ } else {
+ job.setBoolean(LUCENE_ENABLED_KEY, false);
}
+ if (solrUrl != null) {
+ jobName += " solr=" + solrUrl;
+ job.setBoolean(SOLR_ENABLED_KEY, true);
+ job.set("indexer.solr.url", solrUrl);
+ LOG.info("Indexer: solrUrl: " + solrUrl);
+ } else {
+ job.setBoolean(SOLR_ENABLED_KEY, false);
+ }
+ job.setJobName(jobName);
- JobConf job = new NutchJob(getConf());
- job.setJobName("index " + indexDir);
-
- for (int i = 0; i < segments.length; i++) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Indexer: adding segment: " + segments[i]);
- }
- job.addInputPath(new Path(segments[i], CrawlDatum.FETCH_DIR_NAME));
- job.addInputPath(new Path(segments[i], CrawlDatum.PARSE_DIR_NAME));
- job.addInputPath(new Path(segments[i], ParseData.DIR_NAME));
- job.addInputPath(new Path(segments[i], ParseText.DIR_NAME));
+ for (Path segment : segments) {
+ LOG.info("Indexer: adding segment: " + segment);
+ job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
+ job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
+ job.addInputPath(new Path(segment, ParseData.DIR_NAME));
+ job.addInputPath(new Path(segment, ParseText.DIR_NAME));
}
job.addInputPath(new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -298,10 +298,15 @@
job.setMapperClass(Indexer.class);
job.setReducerClass(Indexer.class);
- job.setOutputPath(indexDir);
+ if (luceneDir == null) {
+ job.setOutputPath(new Path("notused"));
+ } else {
+ job.setOutputPath(luceneDir);
+ }
job.setOutputFormat(OutputFormat.class);
job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(NutchWritable.class);
+ job.setMapOutputValueClass(NutchWritable.class);
+ job.setOutputValueClass(NutchDocument.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) { LOG.info("Indexer: done"); }
@@ -314,19 +319,45 @@
public int run(String[] args) throws Exception {
- if (args.length < 4) {
- System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
+ if (args.length < 5) {
+ System.err.println("Usage: (-lucene <index>) (-solr <url>)" +
+ " <crawldb> <linkdb> <segment> ...");
+
return -1;
}
- Path[] segments = new Path[args.length-3];
- for (int i = 3; i < args.length; i++) {
- segments[i-3] = new Path(args[i]);
+ Path luceneDir = null;
+ String solrUrl = null;
+ Path crawlDb = null;
+ Path linkDb = null;
+ ArrayList<Path> segments = new ArrayList<Path>();
+ int i;
+ for (i = 0; i < args.length; i++) {
+ if (args[i].equals("-lucene")) {
+ luceneDir = new Path(args[++i]);
+ } else if (args[i].equals("-solr")) {
+ solrUrl = args[++i];
+ } else {
+ break;
+ }
}
+
+ if (luceneDir == null && solrUrl == null) {
+ System.err.println("Usage: (-lucene <index>) (-solr <url>)" +
+ " <crawldb> <linkdb> <segment> ...");
+
+ return -1;
+ }
+
+ crawlDb = new Path(args[i++]);
+ linkDb = new Path(args[i++]);
+
+ for (; i < args.length; i++) {
+ segments.add(new Path(args[i]));
+ }
try {
- index(new Path(args[0]), new Path(args[1]), new Path(args[2]),
- segments);
+ index(luceneDir, solrUrl, crawlDb, linkDb, segments);
return 0;
} catch (Exception e) {
LOG.fatal("Indexer: " + StringUtils.stringifyException(e));
@@ -334,9 +365,4 @@
}
}
- public void map(WritableComparable key, Writable value,
- OutputCollector output, Reporter reporter) throws IOException {
- output.collect(key, new NutchWritable(value));
- }
-
}
Index: src/java/org/apache/nutch/scoring/ScoringFilter.java
===================================================================
--- src/java/org/apache/nutch/scoring/ScoringFilter.java (revision 557522)
+++ src/java/org/apache/nutch/scoring/ScoringFilter.java (working copy)
@@ -25,6 +25,7 @@
import org.apache.lucene.document.Document;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.plugin.Pluggable;
@@ -156,6 +157,6 @@
* other scoring strategies by modifying Lucene document directly.
* @throws ScoringFilterException
*/
- public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException;
}
Index: src/java/org/apache/nutch/scoring/ScoringFilters.java
===================================================================
--- src/java/org/apache/nutch/scoring/ScoringFilters.java (revision 557522)
+++ src/java/org/apache/nutch/scoring/ScoringFilters.java (working copy)
@@ -25,6 +25,7 @@
import org.apache.lucene.document.Document;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.plugin.Extension;
@@ -135,7 +136,7 @@
return adjust;
}
- public float indexerScore(Text url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum, parse, inlinks, initScore);
}
Index: src/java/org/apache/nutch/crawl/Inlinks.java
===================================================================
--- src/java/org/apache/nutch/crawl/Inlinks.java (revision 557522)
+++ src/java/org/apache/nutch/crawl/Inlinks.java (working copy)
@@ -69,7 +69,7 @@
/** Return the set of anchor texts. Only a single anchor with a given text
* is permitted from a given domain. */
- public String[] getAnchors() throws IOException {
+ public String[] getAnchors() {
HashMap domainToAnchors = new HashMap();
ArrayList results = new ArrayList();
Iterator it = inlinks.iterator();
Index: src/java/org/apache/nutch/crawl/Crawl.java
===================================================================
--- src/java/org/apache/nutch/crawl/Crawl.java (revision 557522)
+++ src/java/org/apache/nutch/crawl/Crawl.java (working copy)
@@ -131,7 +131,7 @@
linkDbTool.invert(linkDb, segments, true, true, false); // invert links
// index, dedup & merge
- indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments));
+ indexer.index(indexes, null, crawlDb, linkDb, Arrays.asList(fs.listPaths(segments)));
dedup.dedup(new Path[] { indexes });
merger.merge(fs.listPaths(indexes), index, tmpDir);
} else {
Index: src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
===================================================================
--- src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (revision 557522)
+++ src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (working copy)
@@ -22,7 +22,10 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
import org.apache.hadoop.io.Text;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
// Hadoop imports
@@ -48,22 +51,29 @@
// Inherited JavaDoc
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
// Check if some Rel-Tags found, possibly put there by RelTagParser
String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG);
if (tags != null) {
for (int i=0; i *
* ----------------------------- */
Index: src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
===================================================================
--- src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (revision 557522)
+++ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (working copy)
@@ -21,20 +21,20 @@
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.DateTools;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
@@ -46,7 +46,7 @@
private int MAX_TITLE_LENGTH;
private Configuration conf;
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
String host = null;
@@ -58,29 +58,16 @@
}
if (host != null) {
- // add host as un-stored, indexed and tokenized
- doc.add(new Field("host", host, Field.Store.NO, Field.Index.TOKENIZED));
- // add site as un-stored, indexed and un-tokenized
- doc.add(new Field("site", host, Field.Store.NO, Field.Index.UN_TOKENIZED));
+ doc.add("host", host);
+ doc.add("site", host);
}
-
- // url is both stored and indexed, so it's both searchable and returned
- doc.add(new Field("url", url.toString(), Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add("url", url.toString());
+ doc.add("content", parse.getText());
- // content is indexed, so that it's searchable, but not stored in index
- doc.add(new Field("content", parse.getText(), Field.Store.NO, Field.Index.TOKENIZED));
-
- // anchors are indexed, so they're searchable, but not stored in index
- try {
- String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]);
- for (int i = 0; i < anchors.length; i++) {
- doc.add(new Field("anchor", anchors[i], Field.Store.NO, Field.Index.TOKENIZED));
- }
- } catch (IOException ioe) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("BasicIndexingFilter: can't get anchors for " + url.toString());
- }
+ String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]);
+ for (String anchor : anchors) {
+ doc.add("anchor", anchor);
}
// title
@@ -88,22 +75,75 @@
if (title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
title = title.substring(0, MAX_TITLE_LENGTH);
}
- // add title indexed and stored so that it can be displayed
- doc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add("title", title);
+
// add cached content/summary display policy, if available
String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
- doc.add(new Field("cache", caching, Field.Store.YES, Field.Index.NO));
+ doc.add("cache", caching);
}
// add timestamp when fetched, for deduplication
- doc.add(new Field("tstamp",
- DateTools.timeToString(datum.getFetchTime(), DateTools.Resolution.MILLISECOND),
- Field.Store.YES, Field.Index.NO));
+ doc.add("tstamp",
+ DateTools.timeToString(datum.getFetchTime(),
+ DateTools.Resolution.MILLISECOND));
return doc;
}
+
+ public void addIndexMeta(Metadata meta) {
+
+ ///////////////////////////
+ // add lucene hints //
+ ///////////////////////////
+
+ // host is un-stored, indexed and tokenized
+ meta.add(LuceneConstants.FIELD_PREFIX + "host",
+ LuceneConstants.STORE_NO);
+ meta.add(LuceneConstants.FIELD_PREFIX + "host",
+ LuceneConstants.INDEX_TOKENIZED);
+ // site is un-stored, indexed and un-tokenized
+ meta.add(LuceneConstants.FIELD_PREFIX + "site",
+ LuceneConstants.STORE_NO);
+ meta.add(LuceneConstants.FIELD_PREFIX + "site",
+ LuceneConstants.INDEX_UNTOKENIZED);
+
+ // url is both stored and indexed, so it's both searchable and returned
+ meta.add(LuceneConstants.FIELD_PREFIX + "url",
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + "url",
+ LuceneConstants.INDEX_TOKENIZED);
+
+ // content is indexed, so that it's searchable, but not stored in index
+ meta.add(LuceneConstants.FIELD_PREFIX + "content",
+ LuceneConstants.STORE_NO);
+ meta.add(LuceneConstants.FIELD_PREFIX + "content",
+ LuceneConstants.INDEX_TOKENIZED);
+
+ // anchors are indexed, so they're searchable, but not stored in index
+ meta.add(LuceneConstants.FIELD_PREFIX + "anchor",
+ LuceneConstants.STORE_NO);
+ meta.add(LuceneConstants.FIELD_PREFIX + "anchor",
+ LuceneConstants.INDEX_TOKENIZED);
+
+ // title is indexed and stored so that it can be displayed
+ meta.add(LuceneConstants.FIELD_PREFIX + "title",
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + "title",
+ LuceneConstants.INDEX_TOKENIZED);
+
+ meta.add(LuceneConstants.FIELD_PREFIX + "cache",
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + "cache",
+ LuceneConstants.INDEX_NO);
+
+ meta.add(LuceneConstants.FIELD_PREFIX + "tstamp",
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + "tstamp",
+ LuceneConstants.INDEX_NO);
+ }
+
public void setConf(Configuration conf) {
this.conf = conf;
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
Index: src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
===================================================================
--- src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (revision 557522)
+++ src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (working copy)
@@ -22,6 +22,8 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.metadata.Metadata;
@@ -30,11 +32,7 @@
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-// Lucene imports
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Document;
-
/**
* An {@link org.apache.nutch.indexer.IndexingFilter} that
* add a lang
(language) field to the document.
@@ -65,7 +63,7 @@
}
// Inherited JavaDoc
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
// check if LANGUAGE found, possibly put there by HTMLLanguageParser
@@ -92,10 +90,18 @@
lang = "unknown";
}
- doc.add(new Field("lang", lang, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ doc.add("lang", lang);
return doc;
}
+
+ @Override
+ public void addIndexMeta(Metadata meta) {
+ meta.add(LuceneConstants.FIELD_PREFIX + "lang",
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + "lang",
+ LuceneConstants.INDEX_UNTOKENIZED);
+ }
public void setConf(Configuration conf) {
this.conf = conf;
@@ -105,4 +111,5 @@
public Configuration getConf() {
return this.conf;
}
+
}
Index: src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
===================================================================
--- src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (revision 557522)
+++ src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (working copy)
@@ -29,11 +29,9 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
-import org.apache.lucene.document.Document;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.fetcher.Fetcher;
-import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
@@ -158,7 +156,7 @@
}
/** Dampen the boost value by scorePower.*/
- public float indexerScore(Text url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
return (float)Math.pow(dbDatum.getScore(), scorePower) * initScore;
}
}
Index: src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
===================================================================
--- src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (revision 557522)
+++ src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (working copy)
@@ -17,20 +17,19 @@
package org.creativecommons.nutch;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
import org.apache.nutch.metadata.CreativeCommons;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.CreativeCommons;
import org.apache.hadoop.conf.Configuration;
@@ -50,7 +49,7 @@
private Configuration conf;
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
Metadata metadata = parse.getData().getParseMeta();
@@ -86,7 +85,7 @@
/** Add the features represented by a license URL. Urls are of the form
* "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
* license feature. */
- public void addUrlFeatures(Document doc, String urlString) {
+ public void addUrlFeatures(NutchDocument doc, String urlString) {
try {
URL url = new URL(urlString);
@@ -108,9 +107,17 @@
}
}
- private void addFeature(Document doc, String feature) {
- doc.add(new Field(FIELD, feature, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ private void addFeature(NutchDocument doc, String feature) {
+ doc.add(FIELD, feature);
}
+
+ @Override
+ public void addIndexMeta(Metadata meta) {
+ meta.add(LuceneConstants.FIELD_PREFIX + FIELD,
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + FIELD,
+ LuceneConstants.INDEX_UNTOKENIZED);
+ }
public void setConf(Configuration conf) {
this.conf = conf;
Index: src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
===================================================================
--- src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (revision 557522)
+++ src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (working copy)
@@ -25,12 +25,12 @@
//APACHE imports
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
import org.apache.nutch.metadata.Feed;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
@@ -71,7 +71,7 @@
* index.
*
*/
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum,
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
Inlinks inlinks) throws IndexingException {
ParseData parseData = parse.getData();
Metadata parseMeta = parseData.getParseMeta();
@@ -84,35 +84,31 @@
if (authors != null) {
for (String author : authors) {
- doc.add(new Field(Feed.FEED_AUTHOR, author,
- Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(Feed.FEED_AUTHOR, author);
}
}
if (tags != null) {
for (String tag : tags) {
- doc.add(new Field(Feed.FEED_TAGS, tag,
- Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(Feed.FEED_TAGS, tag);
}
}
if (feed != null)
- doc.add(new Field(Feed.FEED, feed, Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(Feed.FEED, feed);
SimpleDateFormat sdf = new SimpleDateFormat(dateFormatStr);
sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
if (published != null) {
Date date = new Date(Long.parseLong(published));
String dateString = sdf.format(date);
- doc.add(new Field(PUBLISHED_DATE, dateString,
- Field.Store.YES, Field.Index.NO_NORMS));
+ doc.add(PUBLISHED_DATE, dateString);
}
if (updated != null) {
Date date = new Date(Long.parseLong(updated));
String dateString = sdf.format(date);
- doc.add(new Field(UPDATED_DATE, dateString,
- Field.Store.YES, Field.Index.NO_NORMS));
+ doc.add(UPDATED_DATE, dateString);
}
return doc;
@@ -126,6 +122,35 @@
return conf;
}
+ @Override
+ public void addIndexMeta(Metadata meta) {
+ meta.add(LuceneConstants.FIELD_PREFIX + Feed.FEED_AUTHOR,
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + Feed.FEED_AUTHOR,
+ LuceneConstants.INDEX_TOKENIZED);
+
+ meta.add(LuceneConstants.FIELD_PREFIX + Feed.FEED_TAGS,
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + Feed.FEED_TAGS,
+ LuceneConstants.INDEX_TOKENIZED);
+
+ meta.add(LuceneConstants.FIELD_PREFIX + Feed.FEED,
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + Feed.FEED,
+ LuceneConstants.INDEX_TOKENIZED);
+
+ meta.add(LuceneConstants.FIELD_PREFIX + PUBLISHED_DATE,
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + PUBLISHED_DATE,
+ LuceneConstants.INDEX_NO_NORMS);
+
+ meta.add(LuceneConstants.FIELD_PREFIX + UPDATED_DATE,
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + UPDATED_DATE,
+ LuceneConstants.INDEX_NO_NORMS);
+
+ }
+
/**
* Sets the {@link Configuration} object used to configure this
* {@link IndexingFilter}.
Index: src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
===================================================================
--- src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (revision 557522)
+++ src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (working copy)
@@ -19,17 +19,18 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
import org.apache.nutch.collection.CollectionManager;
import org.apache.nutch.crawl.CrawlDatum;
@@ -62,14 +63,22 @@
* @param doc
* @param url
*/
- private void addSubCollectionField(Document doc, String url) {
+ private void addSubCollectionField(NutchDocument doc, String url) {
String collname = CollectionManager.getCollectionManager(getConf()).getSubCollections(url);
- doc.add(new Field(FIELD_NAME, collname, Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(FIELD_NAME, collname);
}
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
String sUrl = url.toString();
addSubCollectionField(doc, sUrl);
return doc;
}
+
+ @Override
+ public void addIndexMeta(Metadata meta) {
+ meta.add(LuceneConstants.FIELD_PREFIX + FIELD_NAME,
+ LuceneConstants.STORE_YES);
+ meta.add(LuceneConstants.FIELD_PREFIX + FIELD_NAME,
+ LuceneConstants.INDEX_TOKENIZED);
+ }
}
Index: src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
===================================================================
--- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (revision 557522)
+++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (working copy)
@@ -27,9 +27,6 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
@@ -39,6 +36,8 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.lucene.LuceneConstants;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -82,7 +81,7 @@
/** Get the MimeTypes resolver instance. */
private MimeTypes MIME;
- public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
String url_s = url.toString();
@@ -97,7 +96,7 @@
// Add time related meta info. Add last-modified if present. Index date as
// last-modified, or, if that's not present, use fetch time.
- private Document addTime(Document doc, ParseData data,
+ private NutchDocument addTime(NutchDocument doc, ParseData data,
String url, CrawlDatum datum) {
long time = -1;
@@ -105,7 +104,7 @@
if (lastModified != null) { // try parse last-modified
time = getTime(lastModified,url); // use as time
// store as string
- doc.add(new Field("lastModified", new Long(time).toString(), Field.Store.YES, Field.Index.NO));
+ doc.add("lastModified", Long.toString(time));
}
if (time == -1) { // if no last-modified
@@ -119,7 +118,7 @@
String dateString = sdf.format(new Date(time));
// un-stored, indexed and un-tokenized
- doc.add(new Field("date", dateString, Field.Store.NO, Field.Index.UN_TOKENIZED));
+ doc.add("date", dateString);
return doc;
}
@@ -169,17 +168,17 @@
}
// Add Content-Length
- private Document addLength(Document doc, ParseData data, String url) {
+ private NutchDocument addLength(NutchDocument doc, ParseData data, String url) {
String contentLength = data.getMeta(Response.CONTENT_LENGTH);
if (contentLength != null)
- doc.add(new Field("contentLength", contentLength, Field.Store.YES, Field.Index.NO));
+ doc.add("contentLength", contentLength);
return doc;
}
// Add Content-Type and its primaryType and subType
- private Document addType(Document doc, ParseData data, String url) {
+ private NutchDocument addType(NutchDocument doc, ParseData data, String url) {
MimeType mimeType = null;
String contentType = data.getMeta(Response.CONTENT_TYPE);
if (contentType == null) {
@@ -199,7 +198,7 @@
try {
mimeType = new MimeType(contentType);
} catch (MimeTypeException e) {
- if (LOG.isWarnEnabled()) { LOG.warn(url + e.toString()); }
+ LOG.warn(url + e.toString());
mimeType = null;
}
}
@@ -225,14 +224,13 @@
// type:vnd.ms-powerpoint
// all case insensitive.
// The query filter is implemented in TypeQueryFilter.java
- doc.add(new Field("type", contentType, Field.Store.NO, Field.Index.UN_TOKENIZED));
- doc.add(new Field("type", primaryType, Field.Store.NO, Field.Index.UN_TOKENIZED));
- doc.add(new Field("type", subType, Field.Store.NO, Field.Index.UN_TOKENIZED));
+ doc.add("type", contentType);
+ doc.add("type", primaryType);
+ doc.add("type", subType);
// add its primaryType and subType to respective fields
- // as stored, indexed and un-tokenized
- doc.add(new Field("primaryType", primaryType, Field.Store.YES, Field.Index.UN_TOKENIZED));
- doc.add(new Field("subType", subType, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ doc.add("primaryType", primaryType);
+ doc.add("subType", subType);
return doc;
}
@@ -261,7 +259,7 @@
}
}
- private Document resetTitle(Document doc, ParseData data, String url) {
+ private NutchDocument resetTitle(NutchDocument doc, ParseData data, String url) {
String contentDisposition = data.getMeta(Metadata.CONTENT_DISPOSITION);
if (contentDisposition == null)
return doc;
@@ -270,13 +268,51 @@
for (int i=0; i