diff --git conf/nutch-default.xml conf/nutch-default.xml
index fe2b0db..8a7034b 100644
--- conf/nutch-default.xml
+++ conf/nutch-default.xml
@@ -1453,6 +1453,14 @@
Maximum size of the bulk in bytes.
+
+<property>
+  <name>cloudsearch.endpoint</name>
+  <value></value>
+  <description>The Document Endpoint for the domain. It should include
+  'http://' at the front and '/API_VERSION/documents/batch' at the end.
+  </description>
+</property>
diff --git src/plugin/build.xml src/plugin/build.xml
index 5b6d4a3..7c03868 100755
--- src/plugin/build.xml
+++ src/plugin/build.xml
@@ -34,6 +34,7 @@
+
@@ -120,6 +121,7 @@
+
diff --git src/plugin/indexer-cloudsearch/README.cloudsearch.txt src/plugin/indexer-cloudsearch/README.cloudsearch.txt
new file mode 100644
index 0000000..db5855a
--- /dev/null
+++ src/plugin/indexer-cloudsearch/README.cloudsearch.txt
@@ -0,0 +1,35 @@
+Steps to use:
+
+1. Create a CloudSearch domain
+ note the document endpoint
+2. Checkout nutch (I'm using git)
+ git clone https://github.com/apache/nutch
+3. Switch to 1.7 branch
+ git checkout -t origin/branch-1.7
+4. Apply attached patch
+ I created it with : git diff remotes/origin/branch-1.7 --no-prefix > indexer-cloudsearch.patch
+ applied with: patch -p0 -i ~/code/nutch/indexer-cloudsearch.patch
+5. Edit conf/nutch-default.xml
+ add the document endpoint under the cloudsearch parameters (add http:// on the front and /2011-02-01/documents/batch on the end)
+ change the line with "indexer-solr" to "indexer-cloudsearch"
+6. Build nutch
+ Just "ant" in top directory.
+ builds "runtime" directory, and "local" under that.
+7. cd to nutch/runtime/local
+8. Do step three of the tutorial at http://wiki.apache.org/nutch/NutchTutorial
+ 1) You've done step #1 already
+ 2) Step 2, I didn't have to do, it was all correct already
+ 3) Do step 3, stop before 3.1
+ a) Then do this: bin/nutch crawl urls -dir crawl -depth 3 -topN 5
+ b) 3.2 through 5.x SKIP
+ 4) skip tutorial step 4
+ 5) skip tutorial step 5
+ 6) Parts of step 6.
+ Check that the domain is ready
+ Then just do the one line
+ bin/nutch solrindex http://127.0.0.1:8983/solr/ crawl/crawldb -linkdb crawl/linkdb crawl/segments/*
+ Don't worry about the URL, it's ignored. The real URL comes from nutch-default.xml (set above)
+ (This is a hack, since I'm not sure how to integrate properly. Hopefully someone can help here)
+9. Check logs/hadoop.log
+   Should show the adds sent to CloudSearch. Errors show there, too.
+ Might have to set logging level to info in nutch/runtime/local/conf/log4j.properties
diff --git src/plugin/indexer-cloudsearch/build.xml src/plugin/indexer-cloudsearch/build.xml
new file mode 100644
index 0000000..852b265
--- /dev/null
+++ src/plugin/indexer-cloudsearch/build.xml
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
diff --git src/plugin/indexer-cloudsearch/ivy.xml src/plugin/indexer-cloudsearch/ivy.xml
new file mode 100644
index 0000000..8fa649f
--- /dev/null
+++ src/plugin/indexer-cloudsearch/ivy.xml
@@ -0,0 +1,42 @@
+
+
+
+
+
+
+
+
+
+ Apache Nutch
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git src/plugin/indexer-cloudsearch/plugin.xml src/plugin/indexer-cloudsearch/plugin.xml
new file mode 100644
index 0000000..bb0c9fd
--- /dev/null
+++ src/plugin/indexer-cloudsearch/plugin.xml
@@ -0,0 +1,44 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchBatcher.java src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchBatcher.java
new file mode 100644
index 0000000..eb7fb0c
--- /dev/null
+++ src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchBatcher.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.cloudsearch;
+
+import java.io.IOException;
+
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Handles batching of documents into up to "batchSize" chunks and then posts
+ * them to CloudSearch
+ *
+ * Remember to call flush() at the end!!
+ *
+ * Note: This assumes a single thread per Batcher (and therefore per
+ * IndexWriter). If this is not the case, then synchronization is needed.
+ *
+ */
+public class CloudSearchBatcher {
+ public static final Logger LOG = LoggerFactory
+ .getLogger(CloudSearchBatcher.class);
+
+ /**
+ * Builds a JSONObject that has the SDF headers needed for posting to
+ * CloudSearch
+ *
+ * @param fields
+ * Contains the field names and field values.
+ * @param version
+ * @param id
+ * @return
+ * @throws JSONException
+ */
+ static public JSONObject makeSDF(String id, int version, JSONObject fields)
+ throws JSONException {
+ boolean isAdd = fields != null;
+ JSONObject doc = new JSONObject();
+ if (isAdd)
+ doc.put("fields", fields);
+ doc.put("type", isAdd ? "add" : "delete");
+ doc.put("id", id);
+ doc.put("version", version);
+ doc.put("lang", "en");
+ return doc;
+ }
+
+ /** Maximum batch size allowed by CloudSearch. Non-final for easier testing. */
+ private int batchSize;
+
+ /**
+ * Extra characters in a batch, in addition to the documents. Mostly "[" + "]"
+ * + optional whitespace.
+ */
+ final int overhead = 10;
+
+ JSONArray output;
+
+ CloudSearchPoster poster;
+
+ /**
+ * The number of bytes used by the already-accumulated documents.
+ */
+ int bytesUsed;
+
+ public CloudSearchBatcher(CloudSearchPoster poster) {
+ this.poster = poster;
+ setBatchSizeLimit(5000000);
+ reset();
+ }
+
+ void reset() {
+ output = new JSONArray();
+ bytesUsed = 0;
+ }
+
+ void setBatchSizeLimit(int limit) {
+ batchSize = limit;
+ }
+
+ /**
+ * Adds the document to the current batch. The batch will be posted if it is
+ * full.
+ *
+ * Remember to call flush() after the last document!!
+ *
+ * TODO: check for documents that are too long.
+ *
+ * @param doc
+ * @throws Exception
+ */
+ public void processDocument(String id, int version, JSONObject fields)
+ throws IOException {
+ JSONObject sdf;
+ try {
+ // Wrap the documents fields
+ sdf = makeSDF(id, version, fields);
+ } catch (JSONException je) {
+ throw new IOException(je);
+ }
+
+ String json = sdf.toString();
+
+ final byte[] utf8Bytes = json.getBytes("UTF-8");
+
+ if (bytesUsed + utf8Bytes.length < (batchSize - overhead)) {
+ output.put(sdf);
+ //System.out.printf("%4d %s%n", output.length(), sdf);
+
+ // There is still space left in this batch, accumulate it.
+ bytesUsed += utf8Bytes.length;
+ return;
+ }
+ // Post will clear the current batch.
+ post();
+ output.put(sdf);
+ }
+
+ /** Posts any remaining documents */
+ public void flush() throws Exception {
+ post();
+ }
+
+ int batchNo = 0;
+ int docCount = 0;
+ long bytesTotal = 0;
+ /**
+ * Send the document to CloudSearch using poster.
+ *
+ * @throws Exception
+ */
+ void post() throws IOException {
+ if (output.length() == 0)
+ return;
+ bytesTotal+= bytesUsed;
+ docCount += output.length();
+ String s = String.format("Uploading batch%,6d with %,6d docs, %,8d bytes. Totals: Docs: %,8d Bytes %,10d", batchNo++, output.length(), bytesUsed, docCount, bytesTotal);
+ LOG.info(s);
+ poster.postBatch(output);
+ reset();
+ }
+}
diff --git src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchConstants.java src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchConstants.java
new file mode 100644
index 0000000..32d5e91
--- /dev/null
+++ src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchConstants.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.cloudsearch;
+
/**
 * Configuration keys understood by the CloudSearch index writer.
 */
public interface CloudSearchConstants {

  /** Common prefix shared by all CloudSearch-related configuration keys. */
  String CLOUDSEARCH_PREFIX = "cloudsearch.";

  /**
   * Key for the domain's Document Endpoint URL. The configured value should
   * include the "2011-02-01/documents/batch" suffix.
   */
  String ENDPOINT = CLOUDSEARCH_PREFIX + "endpoint";

}
diff --git src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchException.java src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchException.java
new file mode 100644
index 0000000..0420c93
--- /dev/null
+++ src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchException.java
@@ -0,0 +1,23 @@
+package org.apache.nutch.indexwriter.cloudsearch;
+
+import java.io.IOException;
+
/**
 * Signals a failure while sending documents to Amazon CloudSearch, for
 * example when all upload retries have been exhausted.
 */
public class CloudSearchException extends IOException {

  // Declared explicitly so the serialized form stays stable across edits
  // (IOException is Serializable).
  private static final long serialVersionUID = 1L;

  /** Creates an exception with neither detail message nor cause. */
  public CloudSearchException() {
  }

  /** Creates an exception with the given detail message. */
  public CloudSearchException(String message) {
    super(message);
  }

  /** Creates an exception wrapping the given cause. */
  public CloudSearchException(Throwable cause) {
    super(cause);
  }

  /** Creates an exception with the given detail message and cause. */
  public CloudSearchException(String message, Throwable cause) {
    super(message, cause);
  }

}
diff --git src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
new file mode 100644
index 0000000..7a2742e
--- /dev/null
+++ src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.cloudsearch;
+
+// TODO: Could map floats to UINT fields.
+// TODO: Add mapper, or at least convert publishedDate to published_date in clean()
+
+import java.io.IOException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Date;
+import java.util.Map.Entry;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexerMapReduce;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Writes documents to CloudSearch.
+ */
+public class CloudSearchIndexWriter implements IndexWriter {
+ public static final Logger LOG = LoggerFactory
+ .getLogger(CloudSearchIndexWriter.class);
+
+ private CloudSearchBatcher batcher;
+
+ private Configuration config;
+
+ private boolean delete = false;
+
+ MessageDigest digester;
+
+ @Override
+ public void open(JobConf job, String name) throws IOException {
+ LOG.debug("CloudSearchIndexWriter.open() name= " + name);
+ String endpoint = job.get(CloudSearchConstants.ENDPOINT);
+ CloudSearchPoster poster = new CloudSearchPoster(endpoint);
+ batcher = new CloudSearchBatcher(poster);
+ delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
+ try {
+ digester = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException e) {
+ throw new IOException("CloudSearchIndexWriter.open() " + e.toString(), e);
+ }
+ }
+
+ @Override
+ public void delete(String key) throws IOException {
+ if (!delete)
+ return;
+
+ LOG.debug("Deleting doc with id : " + key);
+ batcher.processDocument(key, getVersion(), null);
+ }
+
+ @Override
+ public void update(NutchDocument doc) throws IOException {
+ write(doc);
+ }
+
+ /**
+ * Generate a unique ID. CloudSearch has restrictions on the ID field, it must
+ * be of the form "[a-z0-9][a-z0-9_]*", so we can't use the URL. Generate a
+ * hash from the URL, and use it.
+ */
+ String getID(String url) {
+ byte[] data = url.getBytes();
+ data = digester.digest(data);
+ String id = Hex.encodeHexString(data);
+ return id;
+ }
+
+ @Override
+ public void write(NutchDocument doc) throws IOException {
+ JSONObject csdoc = new JSONObject();
+
+ // Generate a unique ID.
+ String id = getID((String) doc.getFieldValue("url"));
+
+ for (final Entry e : doc) {
+ for (final Object val : e.getValue().getValues()) {
+ // normalise the string representation for a Date
+ Object val2 = val;
+
+ // Convert dates to an integer
+ if (val instanceof Date) {
+ Date d = (Date) val;
+ val2 = dateToInt(d);
+ }
+
+ if (e.getKey().equals("content") || e.getKey().equals("title")) {
+ val2 = CloudSearchUtils.stripNonCharCodepoints((String) val);
+ }
+ try {
+ csdoc.put(clean(e.getKey()), val2);
+ } catch (JSONException exception) {
+ throw new IOException("CloudSearchIndexWriter: csdoc.put " + exception.toString(), exception);
+ }
+ }
+ }
+ try {
+ int version = getVersion();
+ LOG.debug("Adding doc with id: " + id + " version: " + version);
+ batcher.processDocument(id, version, csdoc);
+ } catch (Exception e) {
+ throw new IOException("CloudSearchIndexWriter.write() " + e.toString(), e);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ // This will flush any unsent documents.
+ commit();
+ }
+
+ @Override
+ public void commit() throws IOException {
+ if (batcher != null) {
+ try {
+ batcher.flush();
+ } catch (Exception e) {
+ throw new IOException("CloudSearchIndexWriter.commit() " + e.toString(), e);
+ }
+ }
+ }
+
+ @Override
+ public Configuration getConf() {
+ return config;
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ config = conf;
+ String endpoint = conf.get(CloudSearchConstants.ENDPOINT);
+ if (endpoint == null) {
+ String message = "Missing CloudSearch URL. Should be set via -D "
+ + CloudSearchConstants.ENDPOINT;
+ message += "\n" + describe();
+ LOG.error(message);
+ throw new RuntimeException(message);
+ }
+ }
+
+ public String describe() {
+ StringBuffer sb = new StringBuffer("CloudSearchIndexWriter\n");
+ sb.append("\t")
+ .append(CloudSearchConstants.ENDPOINT)
+ .append(
+ " : URL of the CloudSearch domain's document endpoint. (mandatory)\n");
+ return sb.toString();
+ }
+
+ int getVersion() {
+ // TODO: If nutch provides a "lastModifiedDate" we should use that for the
+ // version.
+ Date now = new Date();
+ int version = dateToInt(now);
+ // System.out.println("Version:" + version);
+ return version;
+ }
+
+ int dateToInt(Date d) {
+ return (int) (d.getTime() / 1000L);
+ }
+
+ /**
+ * Remove the non-cloudSearch-legal characters. Note that this might convert
+ * two fields to the same name. TODO: Could just use a mapper, like the Solr
+ * Plugin does.
+ *
+ * @param name
+ * @return
+ */
+ String clean(String name) {
+ String lowercase = name.toLowerCase();
+ String noPunctuation = lowercase.replaceAll("[^a-z_0-9]", "_");
+ return noPunctuation;
+ }
+
+ protected void setBatchSizeLimitTest(int batchSize) {
+ this.batcher.setBatchSizeLimit(batchSize);
+ }
+}
diff --git src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchPoster.java src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchPoster.java
new file mode 100644
index 0000000..52f564f
--- /dev/null
+++ src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchPoster.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.cloudsearch;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.HttpStatus;
+import org.apache.http.StatusLine;
+import org.apache.http.client.ClientProtocolException;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.StringEntity;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.util.EntityUtils;
+import org.json.JSONArray;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+//import org.apache.http.impl.client.BasicResponseHandler;
+
+/**
+ * Post documents to CloudSearch via HTTP
+ */
+public class CloudSearchPoster {
+
+ public static final Logger LOG = LoggerFactory
+ .getLogger(CloudSearchPoster.class);
+
+ // The URL of the Document Endpoint of the CloudSearch domain we are posting
+ // to.
+ String docEndPointURL;
+ static public final String UTF8 = "UTF-8";
+ static public final String JSON_MIME = "application/json";
+ int maxTries = 3;
+ int delayBaseMS = 200;
+ double mult = 2.0;
+
+ /**
+ *
+ * @param docEndPoint
+ * The doc endPoint. Looks like:
+ * http://doc-imdb-xyz123abc.us-east-1.cloudsearch
+ * .amazonaws.com/2011-02-01/documents/batch
+ */
+ public CloudSearchPoster(String docEndPoint) {
+ this.docEndPointURL = docEndPoint;
+ }
+
+ // TODO Get this from nutch config.
+ void setMaxTries(int tries) {
+ maxTries = tries;
+
+ }
+
+ /**
+ * Post documents to CloudSearch.
+ *
+ * @param batch
+ * A JSONArray containing documents.
+ * @throws UnsupportedEncodingException
+ *
+ * @throws Exception
+ * If Anything other than a 200 response.
+ */
+
+ public void postBatch(JSONArray batch) throws UnsupportedEncodingException, CloudSearchException {
+ String sdf = batch.toString();
+ StringEntity entity = new StringEntity(sdf, JSON_MIME, UTF8);
+
+ HttpClient httpclient = new DefaultHttpClient();
+ // HttpParams params = httpclient.getParams();
+ // System.out.println(params);
+ // System.out.println(HttpConnectionParams.getConnectionTimeout(params));
+ HttpPost post = new HttpPost(docEndPointURL);
+ post.setEntity(entity);
+
+ int code = 0;
+ Exception lastException = null;
+ int sleepDelayMS = delayBaseMS;
+ for (int retry = 0; retry < maxTries; retry++) {
+ try {
+ HttpResponse response = httpclient.execute(post);
+ HttpEntity responseEntity = response.getEntity();
+ StatusLine sl = response.getStatusLine();
+ code = sl.getStatusCode();
+ if (code != HttpStatus.SC_OK) {
+ LOG.warn("Uploading of documents failed. Http Status = " + code
+ + ". Retries remaining: " + ((maxTries - retry) - 1));
+ String entityString = EntityUtils.toString(responseEntity);
+ LOG.debug(entityString);
+ try {
+ Thread.sleep(sleepDelayMS);
+ sleepDelayMS *= mult;
+ } catch (InterruptedException ie) {
+ }
+ continue;
+ }
+ EntityUtils.consume(responseEntity);
+ return;
+ } catch (HttpResponseException e) {
+ LOG.warn("Exception when posting to CloudSearch " + e.toString()
+ + ". Retries remaining: " + ((maxTries - retry) - 1));
+ lastException = e;
+ } catch (ClientProtocolException e) {
+ LOG.warn("Exception when posting to CloudSearch " + e.toString()
+ + ". Retries remaining: " + ((maxTries - retry) - 1));
+ lastException = e;
+ } catch (IOException e) {
+ LOG.warn("Exception when posting to CloudSearch " + e.toString()
+ + ". Retries remaining: " + ((maxTries - retry) - 1));
+ lastException = e;
+ }
+ }
+ LOG.error("Uploading of documents failed after " + maxTries
+ + " tries." + ((code != 0)?("Http Status = " + code + "."):"") + " " + ((lastException!= null)?lastException:""));
+ throw new CloudSearchException("Retries failed. Could not post.");
+ }
+}
diff --git src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
new file mode 100644
index 0000000..93236c4
--- /dev/null
+++ src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.cloudsearch;
+
/**
 * Utility methods for preparing text for CloudSearch.
 */
public class CloudSearchUtils {

  /** Static utility holder; not instantiable. */
  private CloudSearchUtils() {
  }

  /**
   * Removes characters that CloudSearch rejects in document text.
   *
   * Keeps tab, LF, CR and the range U+0020..U+FFFD; everything else (other
   * control characters, U+FFFE, U+FFFF) is dropped.
   *
   * @param input
   *          the raw text; must not be null
   * @return the text with illegal characters removed
   */
  public static String stripNonCharCodepoints(String input) {
    // Presize: output is at most as long as the input.
    StringBuilder retval = new StringBuilder(input.length());

    for (int i = 0; i < input.length(); i++) {
      char ch = input.charAt(i);

      // Keep only characters that are legal for CloudSearch
      if ((ch == 0x9 || ch == 0xa || ch == 0xd)
          || (ch >= 0x20 && ch <= 0xFFFD)) {
        retval.append(ch);
      }
    }

    return retval.toString();
  }
}