Index: conf/log4j.properties
===================================================================
--- conf/log4j.properties	(revision 4916)
+++ conf/log4j.properties	(working copy)
@@ -59,7 +59,7 @@
 log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout
 log4j.logger.org.apache.nutch.hostdb.UpdateHostDb=INFO,cmdstdout
 log4j.logger.org.apache.nutch.hostdb.ReadHostDb=INFO,cmdstdout
-
+log4j.logger.org.apache.nutch.util.resolve.ResolveProtocols=INFO,cmdstdout
 log4j.logger.org.apache.nutch=INFO
 log4j.logger.org.apache.hadoop=WARN
 
Index: src/bin/nutch
===================================================================
--- src/bin/nutch	(revision 4916)
+++ src/bin/nutch	(working copy)
@@ -95,6 +95,7 @@
   echo "  warc              exports crawled data from segments at the WARC format"
   echo "  updatehostdb      update the host db with records from the crawl db"
   echo "  readhostdb        read / dump host db"
+  echo "  resolveprotocols  run the protocol resolver on the crawl db"
   echo " or"
   echo "  CLASSNAME         run the class named CLASSNAME"
   echo "Most commands print help when invoked w/o parameters."
@@ -302,6 +303,8 @@
   CLASS=org.apache.nutch.hostdb.UpdateHostDb
 elif [ "$COMMAND" = "readhostdb" ] ; then
   CLASS=org.apache.nutch.hostdb.ReadHostDb
+elif [ "$COMMAND" = "resolveprotocols" ] ; then
+  CLASS=org.apache.nutch.util.resolve.ResolveProtocols
 else
   CLASS=$COMMAND
 fi
Index: src/java/org/apache/nutch/util/resolve/ResolveProtocols.java
===================================================================
--- src/java/org/apache/nutch/util/resolve/ResolveProtocols.java	(revision 0)
+++ src/java/org/apache/nutch/util/resolve/ResolveProtocols.java	(working copy)
@@ -0,0 +1,142 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util.resolve;
+
+import java.text.SimpleDateFormat;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.TimingUtil;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Resolves the preferred protocol (http or https) per host in the CrawlDb.
+ * For every host the reducer compares the paths of its http and https URLs;
+ * if enough paths occur under both protocols, the host is assumed to serve
+ * the same content over https and a host/protocol rule is emitted. The
+ * resulting rules can, for example, feed a protocol-rewriting URL normalizer.
+ *
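+ * Example invocation via the new shell command (paths are examples):
+ * <pre>
+ *   bin/nutch resolveprotocols -crawldb crawl/crawldb -output protocol-rules
+ * </pre>
+ *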
+ * @author markus@openindex.io
+ */
+public class ResolveProtocols extends Configured implements Tool {
+  
+  public static final String HTTP = "http";
+  public static final String HTTPS = "https";
+  
+  public static final Logger LOG = LoggerFactory.getLogger(ResolveProtocols.class);
+  
+  public ResolveProtocols() {}
+
+  public ResolveProtocols(Configuration conf) {
+    setConf(conf);
+  }
+  
+  public void resolveProtocols(Path crawlDb, Path output) throws Exception {
+    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+    long start = System.currentTimeMillis();
+    LOG.info("ResolveProtocols: starting at " + sdf.format(start));
+
+    Configuration conf = getConf();
+    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
+    conf.set("mapred.textoutputformat.separator", "\t");
+    
+    Job job = Job.getInstance(conf, "ResolveProtocols");
+    job.setJarByClass(ResolveProtocols.class);
+
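+    // The CrawlDb keeps its data as SequenceFiles of <Text, CrawlDatum>
+    // entries under the "current" subdirectory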
+    FileInputFormat.addInputPath(job, new Path(crawlDb, "current"));
+    FileOutputFormat.setOutputPath(job, output);
+
+    job.setMapperClass(ResolveProtocolsMapper.class);
+    job.setReducerClass(ResolveProtocolsReducer.class);
+    job.setInputFormatClass(SequenceFileInputFormat.class);
+    job.setOutputFormatClass(TextOutputFormat.class);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(Text.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(Text.class);
+
+    boolean success = job.waitForCompletion(true);
+    if (!success) {
+      String message = "ResolveProtocols job did not succeed";
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+
+    long end = System.currentTimeMillis();
+    LOG.info("ResolveProtocols: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ResolveProtocols(), args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
+    if (args.length < 4) {
+      System.err.println("Usage: ResolveProtocols -crawldb <crawldb> -output <output>");
+      return -1;
+    }
+
+    Path output = null;
+    Path crawlDb = null;
+    
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-output")) {
+        output = new Path(args[i + 1]);
+        LOG.info("ResolveProtocols: output: " + output);
+        i++;
+      } else if (args[i].equals("-crawldb")) {
+        crawlDb = new Path(args[i + 1]);
+        LOG.info("ResolveProtocols: crawldb: " + crawlDb);
+        i++;
+      }
+    }
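+
+    if (crawlDb == null || output == null) {
+      // Both arguments are required; bail out with the usage message
+      System.err.println("Usage: ResolveProtocols -crawldb <crawldb> -output <output>");
+      return -1;
+    }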
+
+    try {
+      resolveProtocols(crawlDb, output);
+
+      return 0;
+    } catch (Exception e) {
+      LOG.error("ResolveProtocols: " + StringUtils.stringifyException(e));
+      return -1;
+    }
+  }
+}
\ No newline at end of file
Index: src/java/org/apache/nutch/util/resolve/ResolveProtocolsMapper.java
===================================================================
--- src/java/org/apache/nutch/util/resolve/ResolveProtocolsMapper.java	(revision 0)
+++ src/java/org/apache/nutch/util/resolve/ResolveProtocolsMapper.java	(working copy)
@@ -0,0 +1,51 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util.resolve;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.URLUtil;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Writes out URLs keyed by host. The CrawlDatum status does not matter
+ * here; every URL contributes to the per-host protocol statistics.
+ */
+public class ResolveProtocolsMapper extends Mapper<Text, CrawlDatum, Text, Text> {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ResolveProtocolsMapper.class);
+
+  /**
+   * Emits the URL keyed by its host.
+   *
+   * @param key the URL
+   * @param crawlDatum the crawl datum (unused)
+   * @param context the mapper context
+   */
+  public void map(Text key, CrawlDatum crawlDatum, Context context) throws IOException, InterruptedException {
+    // URLUtil.getHost() returns null for malformed URLs; skip those records
+    String host = URLUtil.getHost(key.toString());
+    if (host == null) {
+      return;
+    }
+
+    // Write out the URL, keyed by host
+    context.write(new Text(host), key);
+  }
+}
\ No newline at end of file
Index: src/java/org/apache/nutch/util/resolve/ResolveProtocolsReducer.java
===================================================================
--- src/java/org/apache/nutch/util/resolve/ResolveProtocolsReducer.java	(revision 0)
+++ src/java/org/apache/nutch/util/resolve/ResolveProtocolsReducer.java	(working copy)
@@ -0,0 +1,108 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util.resolve;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Decides per host whether an https rule should be emitted, based on how
+ * many URL paths are available via both the http and the https protocol.
+ *
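+ * An emitted rule is a tab-separated host/protocol pair, e.g. for a
+ * hypothetical host:
+ * <pre>www.example.org	https</pre>
+ *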
+ */
+public class ResolveProtocolsReducer extends Reducer<Text, Text, Text, Text> {
+
+  public static final Logger LOG = LoggerFactory.getLogger(ResolveProtocolsReducer.class);
+  
+  /**
+   * Maximum number of URL paths we sample per host and protocol.
+   */
+  protected int sampleSize = 1000;
+  
+  /**
+   * Minimum number of duplicate paths required before a rule is emitted.
+   */
+  protected int minDuplicates = 10;
+  
+  /**
+   * Counts http and https occurrences per host and emits an https rule
+   * when enough paths are reachable via both protocols.
+   */
+  public void reduce(Text host, Iterable<Text> values, Context context) throws IOException, InterruptedException {
+    int numHttp = 0;
+    int numHttps = 0;
+    int numDuplicates = 0;
+
+    // Values are not sorted by protocol, so sample the paths of both
+    // protocols and compare them afterwards
+    Set<String> httpUrlPaths = new HashSet<String>(sampleSize);
+    List<String> httpsUrlPaths = new ArrayList<String>(sampleSize);
+
+    for (Text value : values) {
+      // Get the URL's path and protocol, skipping malformed records
+      URL url;
+      try {
+        url = new URL(value.toString());
+      } catch (MalformedURLException e) {
+        continue;
+      }
+      String path = url.getPath();
+      String protocol = url.getProtocol();
+
+      // Count http and https occurrences
+      if (protocol.equals(ResolveProtocols.HTTP)) {
+        if (httpUrlPaths.size() < sampleSize) {
+          httpUrlPaths.add(path);
+        }
+        numHttp++;
+      } else if (protocol.equals(ResolveProtocols.HTTPS)) {
+        if (httpsUrlPaths.size() < sampleSize) {
+          httpsUrlPaths.add(path);
+        }
+        numHttps++;
+      }
+    }
+
+    // An https path is a duplicate if the same path was also seen via http
+    for (String path : httpsUrlPaths) {
+      if (httpUrlPaths.contains(path)) {
+        numDuplicates++;
+      }
+    }
+
+    LOG.info("ResolveProtocols: " + host + " duplicates: " + numDuplicates
+        + " http: " + numHttp + " https: " + numHttps);
+    
+    // Enough duplicate paths? Then emit an https rule for this host
+    if (numDuplicates >= minDuplicates) {
+      context.write(host, new Text(ResolveProtocols.HTTPS));
+    }
+    
+    // Do not emit a rule for http-only hosts; this allows us to discover sites migrating to https later on
+  }
+}
\ No newline at end of file
