From f0196c800c5176eb8642fd68eba64d475bc0d66c Mon Sep 17 00:00:00 2001
From: Asitang Mishra <asitang@gmail.com>
Date: Tue, 24 Mar 2015 00:36:45 -0700
Subject: [PATCH] NUTCH-1941 Optional rotation of HTTP user agent names (patch 3)

---
 .../apache/nutch/protocol/http/api/HttpBase.java   | 59 ++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 846be0e..f180539 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -17,10 +17,14 @@
 package org.apache.nutch.protocol.http.api;
 
 // JDK imports
+import java.io.BufferedReader;
 import java.io.IOException;
+import java.io.Reader;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.Random;
 import java.util.Set;
 
 // Logging imports
@@ -56,6 +60,13 @@ public abstract class HttpBase implements Protocol {
 
   private HttpRobotRulesParser robots = null;
 
+  private int agentIndex = 0; // NUTCH-1941: index in useragentnames of the next agent to use
+  private int urlCount = 0;// NUTCH-1941: rotateAgentName calls left before the next switch
+  private Random random = null; // NUTCH-1941: draws how many calls each agent serves
+  private int rotationInterval = 0; //NUTCH-1941: upper bound of that draw (agent.rotate.interval)
+  private ArrayList<String> useragentnames; // NUTCH-1941: one agent name per line of the agents file
+  private int agentfileset = 0;// NUTCH-1941: set to 1 by setConf after it loads the agents file
+
   /** The proxy hostname. */
   protected String proxyHost = null;
 
@@ -124,6 +135,26 @@ public abstract class HttpBase implements Protocol {
     robots = new HttpRobotRulesParser();
   }
 
+// NUTCH-1941
+
+  /**
+   * NUTCH-1941: switches userAgent to the next configured name once the
+   * current name has used up its randomly drawn number of calls.
+   */
+  private void rotateAgentName() {
+    if (urlCount > 0) {
+      urlCount--; // the current agent still has calls left
+      return;
+    }
+    if (useragentnames == null || useragentnames.isEmpty()) {
+      return; // nothing to rotate through: keep the configured agent
+    }
+    userAgent = useragentnames.get(agentIndex);
+    agentIndex = (agentIndex + 1) % useragentnames.size();
+    // Random.nextInt(bound) throws for bound <= 0, so use at least 1
+    urlCount = random.nextInt(Math.max(1, rotationInterval)) + 1;
+  }
+
   // Inherited Javadoc
   public void setConf(Configuration conf) {
     this.conf = conf;
@@ -143,6 +174,30 @@ public abstract class HttpBase implements Protocol {
     this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
     this.robots.setConf(conf);
 
+
+    // NUTCH-1941: optional user agent rotation; on failure it is not enabled here
+    try {
+      if (conf.getBoolean("agent.rotate", false)) {
+        BufferedReader br = new BufferedReader(conf.getConfResourceAsReader(
+            conf.get("agent.rotate.file", "agents.txt")));
+        useragentnames = new ArrayList<String>();
+        try {
+          for (String line = br.readLine(); line != null; line = br.readLine()) {
+            if (line.trim().length() > 0) { // a blank line is not an agent name
+              useragentnames.add(line.trim());
+            }
+          }
+        } finally {
+          br.close(); // release the file even when reading fails
+        }
+        rotationInterval = Math.max(1, conf.getInt("agent.rotate.interval", 50));
+        random = new Random(); // nextInt(bound) needs bound >= 1, hence the clamp
+        agentfileset = useragentnames.isEmpty() ? 0 : 1; // rotate only with >= 1 agent
+      }
+    } catch (Exception e) {
+      logger.warn("there was some problem fetching agents rotation file, so using the default agent name set in the nutch-site.xml ", e);
+    }
+
     String[] protocols = conf.getStrings("http.tls.supported.protocols",
         "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
     String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
@@ -312,6 +367,10 @@ public abstract class HttpBase implements Protocol {
   }
 
   public String getUserAgent() {
+    if (agentfileset == 1) { // NUTCH-1941: rotation was enabled by setConf
+      rotateAgentName(); // may switch userAgent to the next configured name
+    }
+
     return userAgent;
   }
 
-- 
1.9.5 (Apple Git-50.3)

