Index: conf/schema-solr4.xml
===================================================================
--- conf/schema-solr4.xml	(revision 1693468)
+++ conf/schema-solr4.xml	(working copy)
@@ -32,7 +32,9 @@
     <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
     <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
 
+    <fieldtype name="binary" class="solr.BinaryField"/>
 
+
     <!--
       Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
     -->
@@ -405,6 +407,10 @@
 
     <!-- fields for tld plugin -->    
     <field name="tld" type="string" stored="false" indexed="false"/>
+
+    <!-- field containing segment's raw binary content if indexed with -addBinaryContent -->
+    <field name="binaryContent" type="binary" stored="true" indexed="false"/>
+
  </fields>
  <uniqueKey>id</uniqueKey>
  <defaultSearchField>text</defaultSearchField>
Index: conf/schema.xml
===================================================================
--- conf/schema.xml	(revision 1693468)
+++ conf/schema.xml	(working copy)
@@ -39,6 +39,7 @@
         <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
             omitNorms="true" positionIncrementGap="0"/>
         <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
+        <fieldtype name="binary" class="solr.BinaryField"/>
 
         <fieldType name="text" class="solr.TextField"
             positionIncrementGap="100">
@@ -176,6 +177,9 @@
         <!-- fields for tld plugin -->    
         <field name="tld" type="string" stored="false" indexed="false"/>
 
+        <!-- field containing segment's raw binary content if indexed with -addBinaryContent -->
++       <field name="binaryContent" type="binary" stored="true" indexed="false"/>
+
         <!-- to work with Solr 4.9 and beyond that use RealTimeGetHandler -->
         <field name="_version_" type="long" indexed="true" stored="true"/>
 
Index: ivy/ivy.xml
===================================================================
--- ivy/ivy.xml	(revision 1693468)
+++ ivy/ivy.xml	(working copy)
@@ -71,21 +71,21 @@
 		<dependency org="com.google.guava" name="guava" rev="11.0.2" />
 		<dependency org="com.google.code.crawler-commons" name="crawler-commons"
 			rev="0.5" />
-                <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
-                <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.0.4"/>
-                <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.0.4"/>
-                <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.0.4"/>
-                <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.0.4"/>
-                <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1" /> 
-                <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" />
-                <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" />	
+        <dependency org="org.apache.cxf" name="cxf" rev="3.0.4"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.0.4"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.0.4"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.0.4"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.0.4"/>
+        <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1" /> 
+        <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" />
+        <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" />	
 
-                <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
+        <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
 
-                 <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
-                 <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
-                 <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />
-                 <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.3.0" />
+        <dependency org="org.apache.mahout" name="mahout-math" rev="0.8" />
+        <dependency org="org.apache.mahout" name="mahout-core" rev="0.8" />
+        <dependency org="org.apache.lucene" name="lucene-core" rev="4.3.0" />
+        <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.3.0" />
               
 		<!--Configuration: test -->
 
Index: src/java/org/apache/nutch/indexer/IndexerMapReduce.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexerMapReduce.java	(revision 1693468)
+++ src/java/org/apache/nutch/indexer/IndexerMapReduce.java	(working copy)
@@ -22,6 +22,8 @@
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.binary.StringUtils;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -34,7 +36,6 @@
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
-import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.CrawlDb;
 import org.apache.nutch.crawl.Inlinks;
@@ -48,6 +49,7 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 
@@ -64,10 +66,12 @@
   public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
   public static final String URL_FILTERING = "indexer.url.filters";
   public static final String URL_NORMALIZING = "indexer.url.normalizers";
+  public static final String INDEXER_BINARY_AS_BASE64 = "indexer.binary.base64";
 
   private boolean skip = false;
   private boolean delete = false;
   private boolean deleteRobotsNoIndex = false;
+  private boolean base64 = false;
   private IndexingFilters filters;
   private ScoringFilters scfilters;
 
@@ -91,6 +95,7 @@
     this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX,
         false);
     this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
+    this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false);
 
     normalize = job.getBoolean(URL_NORMALIZING, false);
     filter = job.getBoolean(URL_FILTERING, false);
@@ -159,7 +164,7 @@
 
   public void map(Text key, Writable value,
       OutputCollector<Text, NutchWritable> output, Reporter reporter)
-      throws IOException {
+          throws IOException {
 
     String urlString = filterUrl(normalizeUrl(key.toString()));
     if (urlString == null) {
@@ -173,10 +178,11 @@
 
   public void reduce(Text key, Iterator<NutchWritable> values,
       OutputCollector<Text, NutchIndexAction> output, Reporter reporter)
-      throws IOException {
+          throws IOException {
     Inlinks inlinks = null;
     CrawlDatum dbDatum = null;
     CrawlDatum fetchDatum = null;
+    Content content = null;
     ParseData parseData = null;
     ParseText parseText = null;
 
@@ -219,6 +225,8 @@
         }
       } else if (value instanceof ParseText) {
         parseText = (ParseText) value;
+      } else if (value instanceof Content) {
+        content = (Content) value;
       } else if (LOG.isWarnEnabled()) {
         LOG.warn("Unrecognized type: " + value.getClass());
       }
@@ -327,6 +335,18 @@
     // store boost for use by explain and dedup
     doc.add("boost", Float.toString(boost));
 
+    if (content != null) {
+      // Raw binary content of the page: Base64-encode the original bytes
+      // when requested, otherwise store them as a UTF-8 string.
+      String binary;
+      if (base64) {
+        binary = Base64.encodeBase64String(content.getContent());
+      } else {
+        binary = StringUtils.newStringUtf8(content.getContent());
+      }
+      doc.add("binaryContent", binary);
+    }
+
     reporter.incrCounter("IndexerStatus", "indexed (add/update)", 1);
 
     NutchIndexAction action = new NutchIndexAction(doc, NutchIndexAction.ADD);
@@ -337,7 +357,7 @@
   }
 
   public static void initMRJob(Path crawlDb, Path linkDb,
-      Collection<Path> segments, JobConf job) {
+      Collection<Path> segments, JobConf job, boolean addBinaryContent) {
 
     LOG.info("IndexerMapReduce: crawldb: " + crawlDb);
 
@@ -352,6 +372,10 @@
           CrawlDatum.PARSE_DIR_NAME));
       FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
       FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
+
+      if (addBinaryContent) {
+        FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+      }
     }
 
     FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
@@ -367,7 +391,7 @@
         }
       } catch (IOException e) {
         LOG.warn("Failed to use linkDb ({}) for indexing: {}", linkDb,
-            StringUtils.stringifyException(e));
+            org.apache.hadoop.util.StringUtils.stringifyException(e));
       }
     }
 
Index: src/java/org/apache/nutch/indexer/IndexingJob.java
===================================================================
--- src/java/org/apache/nutch/indexer/IndexingJob.java	(revision 1693468)
+++ src/java/org/apache/nutch/indexer/IndexingJob.java	(working copy)
@@ -83,7 +83,23 @@
   public void index(Path crawlDb, Path linkDb, List<Path> segments,
       boolean noCommit, boolean deleteGone, String params, boolean filter,
       boolean normalize) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+        normalize, false);
+  }
 
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params, boolean filter,
+      boolean normalize, boolean addBinaryContent) throws IOException {
+    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+        normalize, addBinaryContent, false);
+  }
+
+  public void index(Path crawlDb, Path linkDb, List<Path> segments,
+      boolean noCommit, boolean deleteGone, String params,
+      boolean filter, boolean normalize, boolean addBinaryContent,
+      boolean base64) throws IOException {
+
+
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("Indexer: starting at " + sdf.format(start));
@@ -94,11 +110,17 @@
     LOG.info("Indexer: deleting gone documents: " + deleteGone);
     LOG.info("Indexer: URL filtering: " + filter);
     LOG.info("Indexer: URL normalizing: " + normalize);
-
+    if (addBinaryContent) {
+      if (base64) {
+        LOG.info("Indexer: adding binary content as Base64");
+      } else {
+        LOG.info("Indexer: adding binary content");
+      }
+    }
     IndexWriters writers = new IndexWriters(getConf());
     LOG.info(writers.describe());
 
-    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);
+    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job, addBinaryContent);
 
     // NOW PASSED ON THE COMMAND LINE AS A HADOOP PARAM
     // job.set(SolrConstants.SERVER_URL, solrUrl);
@@ -106,6 +128,7 @@
     job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
     job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
     job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
+    job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);
 
     if (params != null) {
       job.set(IndexerMapReduce.INDEXER_PARAMS, params);
@@ -141,7 +164,8 @@
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
       System.err
-      .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
+      //.println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
+      .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]");
       IndexWriters writers = new IndexWriters(getConf());
       System.err.println(writers.describe());
       return -1;
@@ -157,6 +181,8 @@
     boolean deleteGone = false;
     boolean filter = false;
     boolean normalize = false;
+    boolean addBinaryContent = false;
+    boolean base64 = false;
 
     for (int i = 1; i < args.length; i++) {
       if (args[i].equals("-linkdb")) {
@@ -180,6 +206,10 @@
         filter = true;
       } else if (args[i].equals("-normalize")) {
         normalize = true;
+      } else if (args[i].equals("-addBinaryContent")) {
+        addBinaryContent = true;
+      } else if (args[i].equals("-base64")) {
+        base64 = true;
       } else if (args[i].equals("-params")) {
         params = args[++i];
       } else {
@@ -188,8 +218,8 @@
     }
 
     try {
-      index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
-          normalize);
+      index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
+          normalize, addBinaryContent, base64);
       return 0;
     } catch (final Exception e) {
       LOG.error("Indexer: " + StringUtils.stringifyException(e));
