http.proxy.username
and http.auth.username
)
+ * is set in the conf/nutch-site.xml
file
+ *
+ * The realm properties (<code>http.proxy.realm</code> and
+ * <code>http.auth.realm</code>) are used as domain names for NTLM
+ * authentication.
+ *
+ * @author Susam Pal
+ */
+public class Http extends HttpBase {
+
+  static final Log LOG = LogFactory.getLog(Http.class);
+
+  /**
+   * HTTP client used for every fetch made through this plugin instance.
+   * NOTE(review): a default-constructed HttpClient is backed by a
+   * single-connection manager; confirm whether this instance is shared
+   * between fetcher threads before relying on concurrent use.
+   */
+  private final HttpClient client = new HttpClient();
+
+  // Proxy credentials, read from http.proxy.* in setConf()
+  private String proxyUser;
+  private String proxyPassword;
+  private String proxyRealm;
+
+  // Web server credentials, read from http.auth.* in setConf()
+  private String authUser;
+  private String authPassword;
+  private String authHost;
+  private String authRealm;
+
+  /**
+   * Constructs this plugin.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Reads the configuration from the Nutch configuration files and sets
+   * the configuration. Every credential property defaults to the empty
+   * string, so the fields are never <code>null</code> once this method
+   * has run.
+   *
+   * @param conf Configuration
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    this.proxyUser = conf.get("http.proxy.username", "");
+    this.proxyPassword = conf.get("http.proxy.password", "");
+    this.proxyRealm = conf.get("http.proxy.realm", "");
+    this.authUser = conf.get("http.auth.username", "");
+    this.authPassword = conf.get("http.auth.password", "");
+    this.authHost = conf.get("http.auth.host", "");
+    this.authRealm = conf.get("http.auth.realm", "");
+  }
+
+  /**
+   * Returns an authentication scope. An empty realm yields a scope that
+   * is restricted by host and port only.
+   *
+   * @param host  Host
+   * @param port  Port number
+   * @param realm Authentication realm; must not be <code>null</code>,
+   *              use the empty string when no realm is configured
+   * @return Authentication scope
+   */
+  private AuthScope getAuthScope(String host, int port, String realm) {
+    if (realm.length() == 0) {
+      return new AuthScope(host, port);
+    } else {
+      return new AuthScope(host, port, realm);
+    }
+  }
+
+  /**
+   * Configures the HTTP client: proxy settings and proxy credentials,
+   * web server credentials for the host of <code>url</code>, and the
+   * connection and socket timeouts.
+   *
+   * @param url URL to be fetched
+   */
+  private void configureClient(URL url) {
+
+    HostConfiguration hostConf = this.client.getHostConfiguration();
+
+    // HTTP proxy server details. useProxy, proxyHost and proxyPort are
+    // not declared in this class; presumably inherited from HttpBase.
+    if (useProxy) {
+      hostConf.setProxy(this.proxyHost, this.proxyPort);
+
+      if (proxyUser.length() > 0) {
+
+        AuthScope proxyAuthScope = getAuthScope(
+            this.proxyHost, this.proxyPort, this.proxyRealm);
+
+        // NOTE(review): http.auth.host is passed as the third argument
+        // for both proxy and server credentials; confirm that a single
+        // value is intended for both.
+        NTCredentials proxyCredentials = new NTCredentials(
+            this.proxyUser, this.proxyPassword,
+            this.authHost, this.proxyRealm);
+
+        this.client.getState().setProxyCredentials(
+            proxyAuthScope, proxyCredentials);
+      }
+
+    }
+
+    // Web server authentication details
+    if (authUser.length() > 0) {
+
+      // When the URL carries no explicit port, use the default port of
+      // its protocol (80 for http, 443 for https) instead of assuming
+      // port 80; otherwise credentials for https URLs would be scoped
+      // to the wrong port and never be offered to the server.
+      int port = url.getPort();
+      if (port == -1) {
+        port = url.getDefaultPort();
+      }
+
+      AuthScope serverAuthScope = getAuthScope(
+          url.getHost(), port, this.authRealm);
+
+      NTCredentials serverCredentials = new NTCredentials(
+          this.authUser, this.authPassword,
+          this.authHost, this.authRealm);
+
+      this.client.getState().setCredentials(
+          serverAuthScope, serverCredentials);
+    }
+
+    // Connection parameters; the same timeout value is applied to
+    // connection establishment and to socket reads.
+    HttpConnectionManagerParams params =
+        this.client.getHttpConnectionManager().getParams();
+    params.setConnectionTimeout(timeout);
+    params.setSoTimeout(timeout);
+
+  }
+
+  /**
+   * Returns the configured HTTP client.
+   *
+   * @return HTTP client
+   */
+  HttpClient getClient() {
+    return this.client;
+  }
+
+  /**
+   * Fetches the <code>url</code> with a configured HTTP client and
+   * gets the response.
+   *
+   * @param url      URL to be fetched
+   * @param datum    Crawl data
+   * @param redirect Follow redirects if and only if true
+   * @return HTTP response
+   * @throws ProtocolException When an HTTP error occurs
+   * @throws IOException       When a non recoverable error occurs
+   */
+  protected Response getResponse(URL url, CrawlDatum datum,
+      boolean redirect) throws ProtocolException, IOException {
+    configureClient(url);
+    return new HttpResponse(this, url, datum, redirect);
+  }
+
+  /**
+   * Method for unit testing this plugin.
+   *
+   * @param args Command line arguments
+   */
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+}
Index: src/plugin/protocol-http11/src/java/org/apache/nutch/protocol/http11/HttpResponse.java
===================================================================
--- src/plugin/protocol-http11/src/java/org/apache/nutch/protocol/http11/HttpResponse.java (revision 0)
+++ src/plugin/protocol-http11/src/java/org/apache/nutch/protocol/http11/HttpResponse.java (revision 0)
@@ -0,0 +1,185 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.http11;
+
+// JDK imports
+import java.io.InputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.net.URL;
+
+// HttpClient imports
+import org.apache.commons.httpclient.methods.GetMethod;
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.params.HttpMethodParams;
+import org.apache.commons.httpclient.HttpVersion;
+import org.apache.commons.httpclient.HttpException;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+
+/**
+ * An HTTP response.
+ *
+ * @author Susam Pal
+ */
+public class HttpResponse implements Response {
+
+ private byte[] content;
+ private URL url;
+ private int code;
+ private Metadata headers = new SpellCheckedMetadata();
+
+  /**
+   * Fetches the given <code>url</code> and prepares the HTTP response.
+   * The response headers are stored first, then at most
+   * <code>http.getMaxContent()</code> bytes of the body are read; a
+   * gzip or x-gzip encoded body is decompressed before it is stored.
+   *
+   * @param http            An instance of the implementation class
+   *                        of this plugin
+   * @param url             URL to be fetched
+   * @param datum           Crawl data
+   * @param followRedirects Whether to follow redirects; follows
+   *                        redirect if and only if this is true
+   * @throws ProtocolException When an HTTP error occurs
+   * @throws IOException       When a non recoverable error occurs
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum,
+      boolean followRedirects) throws ProtocolException, IOException {
+
+    // Prepare GET method for HTTP request
+    this.url = url;
+    GetMethod get = new GetMethod(url.toString());
+    get.setFollowRedirects(followRedirects);
+    get.setDoAuthentication(true);
+    get.setRequestHeader("Accept-Encoding", "x-gzip, gzip");
+    get.setRequestHeader("User-Agent", http.getUserAgent());
+    if (datum.getModifiedTime() > 0) {
+      get.setRequestHeader("If-Modified-Since",
+          HttpDateFormat.toString(datum.getModifiedTime()));
+    }
+
+    // Set HTTP parameters
+    HttpMethodParams params = get.getParams();
+    if (http.getUseHttp11()) {
+      params.setVersion(HttpVersion.HTTP_1_1);
+    } else {
+      params.setVersion(HttpVersion.HTTP_1_0);
+    }
+
+    try {
+
+      this.code = http.getClient().executeMethod(get);
+
+      // Read headers
+      Header[] headerArray = get.getResponseHeaders();
+      for (int i = 0; i < headerArray.length; i++) {
+        headers.set(headerArray[i].getName(), headerArray[i].getValue());
+      }
+
+      final int maxContent = http.getMaxContent();
+      final String contentLength = getHeader(Response.CONTENT_LENGTH);
+
+      // Use Content-Length only as a sizing hint: ignore a malformed or
+      // negative value and never pre-allocate more than the content
+      // limit, since nothing beyond that limit is stored anyway.
+      int capacity = 0;
+      if (contentLength != null) {
+        try {
+          capacity = Integer.parseInt(contentLength.trim());
+        } catch (NumberFormatException ignored) {
+          // Malformed header; fall back to a growable default buffer
+          capacity = 0;
+        }
+      }
+      capacity = Math.max(0, Math.min(capacity, maxContent));
+      ByteArrayOutputStream out = (capacity > 0)
+          ? new ByteArrayOutputStream(capacity)
+          : new ByteArrayOutputStream();
+
+      // Read content. The stream is null when the response carries no
+      // body (for example a 304 reply to If-Modified-Since); treat that
+      // as empty content instead of failing.
+      InputStream is = get.getResponseBodyAsStream();
+      if (is != null) {
+        byte[] buffer = new byte[4096];
+        int stored = 0;
+        while (stored < maxContent) {
+          int wanted = Math.min(buffer.length, maxContent - stored);
+          int n = is.read(buffer, 0, wanted);
+          if (n == -1) {
+            break;
+          }
+          out.write(buffer, 0, n);
+          stored += n;
+        }
+      }
+      content = out.toByteArray();
+
+      String contentEncoding = getHeader(Response.CONTENT_ENCODING);
+
+      // Trace message
+      StringBuffer fetchTrace = new StringBuffer("url: " + url +
+          "; status code: " + code +
+          "; bytes received: " + content.length);
+      if (contentLength != null) {
+        fetchTrace.append("; Content-Length: " + contentLength);
+      }
+      if (contentEncoding != null) {
+        fetchTrace.append("; Content-Encoding: " + contentEncoding);
+      }
+      if (getHeader(Response.LOCATION) != null) {
+        fetchTrace.append("; Location: " + getHeader(Response.LOCATION));
+      }
+
+      // Handle gzip and x-gzip files. Content-coding names are compared
+      // case-insensitively, and an empty body is left as is because
+      // there is nothing to decompress.
+      if (content.length > 0 &&
+          ("gzip".equalsIgnoreCase(contentEncoding) ||
+           "x-gzip".equalsIgnoreCase(contentEncoding))) {
+        content = http.processGzipEncoded(content, url);
+        fetchTrace.append("; gzip extracted to " + content.length +
+            " bytes");
+      }
+
+      // Log trace message
+      if (Http.LOG.isInfoEnabled()) {
+        Http.LOG.info(fetchTrace);
+      }
+
+    } catch (HttpException ex) {
+      // initCause keeps the original stack trace attached to the
+      // wrapping exception
+      ProtocolException wrapped = new ProtocolException(
+          "ProtocolException while fetching: " + url + " - " + ex);
+      wrapped.initCause(ex);
+      throw wrapped;
+    } catch (IOException ex) {
+      // initCause is used rather than a (String, Throwable) constructor
+      // so the code also compiles on JDKs older than Java 6
+      IOException wrapped = new IOException(
+          "IOException while fetching: " + url + " - " + ex);
+      wrapped.initCause(ex);
+      throw wrapped;
+    } finally {
+      get.releaseConnection();
+    }
+  }
+
+ /* ------------------------- *
+ * Protocol plugin which supports retrieving documents via the HTTP 1.0, +HTTP 1.1 and HTTPS protocols, optionally with Basic, Digest and NTLM +authentication schemes for web server as well as proxy server.
+ + Index: src/plugin/protocol-http11/plugin.xml =================================================================== --- src/plugin/protocol-http11/plugin.xml (revision 0) +++ src/plugin/protocol-http11/plugin.xml (revision 0) @@ -0,0 +1,61 @@ + + + +