Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 1721535)
+++ src/plugin/build.xml	(working copy)
@@ -51,6 +51,7 @@
      <ant dir="nutch-extensionpoints" target="deploy"/>
      <ant dir="protocol-file" target="deploy"/>
      <ant dir="protocol-ftp" target="deploy"/>
+     <ant dir="protocol-htmlunit" target="deploy"/>
      <ant dir="protocol-http" target="deploy"/>
      <ant dir="protocol-httpclient" target="deploy"/>
      <ant dir="lib-selenium" target="deploy"/>
@@ -162,6 +163,7 @@
     <ant dir="nutch-extensionpoints" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
     <ant dir="protocol-ftp" target="clean"/>
+    <ant dir="protocol-htmlunit" target="clean"/>
     <ant dir="protocol-http" target="clean"/>
     <ant dir="protocol-httpclient" target="clean"/>
     <ant dir="lib-selenium" target="clean"/>
Index: src/plugin/protocol-htmlunit/build.xml
===================================================================
--- src/plugin/protocol-htmlunit/build.xml	(revision 0)
+++ src/plugin/protocol-htmlunit/build.xml	(working copy)
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-htmlunit" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies: the lib-http jar this plugin compiles against -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath: lib-http jars found under ${nutch.root}/build, plus the test conf directory -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+  <!-- Deploy unit test dependencies (lib-http, nutch-extensionpoints) and copy the non-Java test resources to ${build.test} -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-http"/>
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <copy toDir="${build.test}">
+      <fileset dir="${src.test}" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <!-- for junit test: the block below is disabled; it would copy the "jsp" directory to ${build.test}/data -->
+  <!--
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+    <fileset dir="jsp"/>
+  </copy>-->
+
+</project>

Property changes on: src/plugin/protocol-htmlunit/build.xml
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: src/plugin/protocol-htmlunit/ivy.xml
===================================================================
--- src/plugin/protocol-htmlunit/ivy.xml	(revision 0)
+++ src/plugin/protocol-htmlunit/ivy.xml	(working copy)
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!-- the published artifact takes its name from the module name -->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="net.sourceforge.htmlunit" name="htmlunit" rev="2.19" />
+  </dependencies>
+
+</ivy-module>
Index: src/plugin/protocol-htmlunit/plugin.xml
===================================================================
--- src/plugin/protocol-htmlunit/plugin.xml	(revision 0)
+++ src/plugin/protocol-htmlunit/plugin.xml	(working copy)
@@ -0,0 +1,80 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-htmlunit"
+   name="HtmlUnit Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.apache.org">
+
+  <runtime>
+    <library name="protocol-htmlunit.jar">
+      <export name="*"/>
+    </library>
+    <library name="commons-codec-1.10.jar"/>
+    <library name="commons-collections-3.2.1.jar"/>
+    <library name="commons-io-2.4.jar"/>
+    <library name="commons-lang3-3.4.jar"/>
+    <library name="commons-logging-1.2.jar"/>
+    <library name="cssparser-0.9.18.jar"/>
+    <library name="htmlunit-2.19.jar"/>
+    <library name="htmlunit-core-js-2.17.jar"/>
+    <library name="httpclient-4.5.1.jar"/>
+    <library name="httpcore-4.4.3.jar"/>
+    <library name="httpmime-4.5.1.jar"/>
+    <library name="jetty-io-9.2.13.v20150730.jar"/>
+    <library name="jetty-util-9.2.13.v20150730.jar"/>
+    <library name="nekohtml-1.9.22.jar"/>
+    <library name="sac-1.3.jar"/>
+    <library name="serializer-2.7.2.jar"/>
+    <library name="websocket-api-9.2.13.v20150730.jar"/>
+    <library name="websocket-client-9.2.13.v20150730.jar"/>
+    <library name="websocket-common-9.2.13.v20150730.jar"/>
+    <library name="xalan-2.7.2.jar"/>
+    <library name="xercesImpl-2.11.0.jar"/>
+    <library name="xml-apis-1.4.01.jar"/>
+
+    <!--
+      The jars listed above are htmlunit 2.19 together with the libraries
+      it needs at run time. Each name has to match a file in the deployed
+      plugin directory exactly, so this list must be revisited whenever
+      the htmlunit revision declared in ivy.xml changes.
+      TODO(review): confirm the list against a fresh build.
+    -->
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints"/>
+    <import plugin="lib-http"/>
+  </requires>
+
+  <extension id="org.apache.nutch.protocol.htmlunit"
+             name="HttpProtocol"
+             point="org.apache.nutch.protocol.Protocol">
+    <!-- The same class is registered for both schemes; only protocolName differs -->
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="http"/>
+    </implementation>
+
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="https"/>
+    </implementation>
+
+  </extension>
+</plugin>

Property changes on: src/plugin/protocol-htmlunit/plugin.xml
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
===================================================================
--- src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java	(revision 0)
+++ src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java	(working copy)
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.IOException;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Protocol implementation built on {@link HttpBase} that creates an
+ * {@link HttpResponse} for every request.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /** Creates the plugin, passing {@link #LOG} to the {@link HttpBase} constructor. */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Set the {@link org.apache.hadoop.conf.Configuration} object.
+   *
+   * @param conf the configuration, handed unchanged to the superclass
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /** Command-line entry: configures a new instance, then calls the two-argument main(). */
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  /** Returns a new {@link HttpResponse} for the URL; <code>redirect</code> is not used. */
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+}

Property changes on: src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
===================================================================
--- src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java	(revision 0)
+++ src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java	(working copy)
@@ -0,0 +1,138 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.List;
+
+import com.gargoylesoftware.htmlunit.BrowserVersion;
+import com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine;
+import com.gargoylesoftware.htmlunit.html.HtmlPage;
+import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
+import com.gargoylesoftware.htmlunit.OnbeforeunloadHandler;
+import com.gargoylesoftware.htmlunit.ScriptResult;
+import com.gargoylesoftware.htmlunit.StringWebResponse;
+import com.gargoylesoftware.htmlunit.TopLevelWindow;
+import com.gargoylesoftware.htmlunit.util.NameValuePair;
+import com.gargoylesoftware.htmlunit.WebClient;
+import com.gargoylesoftware.htmlunit.WebRequest;
+import com.gargoylesoftware.htmlunit.WebResponse;
+import com.gargoylesoftware.htmlunit.WebResponseData;
+import com.gargoylesoftware.htmlunit.WebWindow;
+import com.gargoylesoftware.htmlunit.WebClient;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.http.api.HttpException;
+
+/**
+ * An HTTP response obtained by loading the URL in a headless HtmlUnit browser.
+ * For HTML the content is the UTF-8 serialized DOM after script execution;
+ * for anything else it is the response body as delivered by HtmlUnit.
+ *
+ * TODO needs to remove http* and jetty* or jersey* from nutch/lib directory otherwise everything
+ * goes crazy! IndexChecker works sofar.
+ */
+public class HttpResponse implements Response {
+
+  private final URL url;
+  private final Metadata headers = new SpellCheckedMetadata();
+  private byte[] content;
+  private int code;
+
+  /**
+   * Fetches <code>url</code> and captures its status code, headers and content.
+   *
+   * @param http the calling protocol plugin (not consulted here)
+   * @param url the URL to fetch
+   * @param datum crawl datum for the URL (not consulted here)
+   * @throws ProtocolException declared for callers; never raised by this code
+   * @throws IOException if the response cannot be loaded or read
+   */
+  public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws ProtocolException, IOException {
+    this.url = url;
+    // try-with-resources closes the browser on every exit path
+    try (WebClient client = new WebClient(BrowserVersion.FIREFOX_38)) {
+      // Don't throw Java exception for script errors
+      client.getOptions().setThrowExceptionOnScriptError(false);
+      // Report 4xx/5xx through getCode() instead of a runtime exception
+      client.getOptions().setThrowExceptionOnFailingStatusCode(false);
+      client.getOptions().setCssEnabled(true);
+      client.getOptions().setJavaScriptEnabled(true);
+      // Keep redirects on: disabling them breaks loading of referenced JS and
+      // assets, but the input URL may then be redirected without Nutch knowing
+      client.getOptions().setRedirectEnabled(true);
+      client.setJavaScriptTimeout(3500);
+      client.setAjaxController(new NicelyResynchronizingAjaxController());
+
+      // Non-HTML responses are not HtmlPage instances; keep the generic type
+      com.gargoylesoftware.htmlunit.Page page = client.getPage(url);
+      // Must follow getPage(): there is nothing to wait for before a page loads
+      client.waitForBackgroundJavaScriptStartingBefore(500);
+      WebResponse webResponse = page.getWebResponse();
+      code = webResponse.getStatusCode();
+      for (NameValuePair header : webResponse.getResponseHeaders()) {
+        headers.add(header.getName(), header.getValue());
+      }
+      if (page instanceof HtmlPage) {
+        content = ((HtmlPage) page).asXml().getBytes(java.nio.charset.StandardCharsets.UTF_8);
+      } else {
+        content = readFully(webResponse.getContentAsStream());
+      }
+    }
+  }
+
+  /** Drains and closes <code>in</code>; a null stream yields an empty array. */
+  private static byte[] readFully(java.io.InputStream in) throws IOException {
+    java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
+    try (java.io.InputStream stream = in) {
+      byte[] buffer = new byte[8192];
+      int n;
+      while (stream != null && (n = stream.read(buffer)) != -1) {
+        out.write(buffer, 0, n);
+      }
+    }
+    return out.toByteArray();
+  }
+
+  public URL getUrl() {
+    return url;
+  }
+
+  public int getCode() {
+    return code;
+  }
+
+  public String getHeader(String name) {
+    return headers.get(name);
+  }
+
+  public Metadata getHeaders() {
+    return headers;
+  }
+
+  public byte[] getContent() {
+    return content;
+  }
+}
\ No newline at end of file
Index: src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
===================================================================
--- src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html	(revision 0)
+++ src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html	(working copy)
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the http and https protocols, using HtmlUnit to load each page.</p>
+</body>
+</html>
