Index: tika-app/pom.xml
===================================================================
--- tika-app/pom.xml (revision 1680458)
+++ tika-app/pom.xml (working copy)
@@ -94,8 +94,7 @@
commons-io
commons-io
- 2.1
- test
+ 2.4
Index: tika-bundle/pom.xml
===================================================================
--- tika-bundle/pom.xml (revision 1680458)
+++ tika-bundle/pom.xml (working copy)
@@ -125,7 +125,8 @@
tika-parsers;inline=true,
- commons-compress, xz, commons-codec, commons-csv, junrar,
+ commons-compress, xz, commons-codec, commons-csv,
+ commons-io, commons-exec, junrar,
pdfbox,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
xmlbeans,
@@ -134,11 +135,11 @@
juniversalchardet,
vorbis-java-core, vorbis-java-tika,
isoparser, aspectjrt,
- metadata-extractor, xmpcore,
- boilerpipe, rome,
+ metadata-extractor, xmpcore, json-simple,
+ boilerpipe, rome, opennlp-tools, opennlp-maxent,
geoapi, sis-metadata, sis-netcdf, sis-utility,
sis-storage, apache-mime4j-core, apache-mime4j-dom,
- jsr-275, jhighlight, java-libpst,
+ jsr-275, jhighlight, java-libpst, jwnl,
netcdf4, grib, cdm, httpservices, jcip-annotations,
jmatio, guava
@@ -184,6 +185,11 @@
opendap.dap.http;resolution:=optional,
opendap.dap;resolution:=optional,
opendap.dap.parser;resolution:=optional,
+ opennlp.maxent;resolution:=optional,
+ opennlp.tools.namefind;resolution:=optional,
+ net.didion.jwnl;resolution:=optional,
+ org.apache.commons.exec;resolution:=optional,
+ org.apache.commons.io;resolution:=optional,
org.apache.commons.httpclient;resolution:=optional,
org.apache.commons.httpclient.auth;resolution:=optional,
org.apache.commons.httpclient.methods;resolution:=optional,
@@ -233,6 +239,7 @@
org.jdom2;resolution:=optional,
org.jdom2.input;resolution:=optional,
org.jdom2.output;resolution:=optional,
+ org.json.simple;resolution:=optional,
org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
org.osgi.framework;resolution:=optional,
Index: tika-parsers/pom.xml
===================================================================
--- tika-parsers/pom.xml (revision 1680458)
+++ tika-parsers/pom.xml (working copy)
@@ -224,6 +224,32 @@
provided
+
+ org.apache.opennlp
+ opennlp-tools
+ 1.5.3
+
+
+
+ commons-io
+ commons-io
+ 2.4
+
+
+
+ org.apache.commons
+ commons-exec
+ 1.3
+
+
+
+ com.googlecode.json-simple
+ json-simple
+ 1.1.1
+
+
+
+
junit
Index: tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java (working copy)
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteException;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.JSONValue;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that tags a document with the geographic locations it mentions.
+ * <p>
+ * Location names are extracted with {@link NameEntityExtractor} and resolved
+ * by running the external <code>lucene-geo-gazetteer</code> command. The
+ * extractor's best name is stored in the <code>Geographic_*</code> metadata
+ * fields and every other resolved name in numbered <code>Optional_*</code>
+ * fields.
+ */
+public class GeoParser extends AbstractParser {
+
+    private static final long serialVersionUID = -2241391757440215491L;
+    private static final MediaType MEDIA_TYPE = MediaType
+            .application("geotopic");
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MEDIA_TYPE);
+    private static final Logger LOG = Logger.getLogger(GeoParser.class.getName());
+
+    /** Name of the external gazetteer executable. */
+    private static final String GAZETTEER_COMMAND = "lucene-geo-gazetteer";
+
+    /** Configuration used when the ParseContext does not supply one. */
+    private final GeoParserConfig config = new GeoParserConfig();
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts location names from the stream, resolves them with the
+     * gazetteer and records the result in the metadata. Returns without
+     * touching the metadata when the parser is not available.
+     *
+     * @param stream   text to analyse
+     * @param handler  content handler; this parser does not write to it
+     * @param metadata receives the Geographic_* and Optional_* fields
+     * @param context  may carry a {@link GeoParserConfig} that overrides the
+     *                 default configuration
+     * @throws IOException if reading the input or running the gazetteer fails
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        // Keep the effective configuration in a local variable so that a
+        // per-call ParseContext setting never overwrites this parser's state.
+        GeoParserConfig activeConfig = context.get(GeoParserConfig.class, config);
+        if (!isAvailable(activeConfig)) {
+            return;
+        }
+
+        // Extract the location names and pick the best one for this input.
+        NameEntityExtractor extractor = new NameEntityExtractor(
+                activeConfig.getNERPath());
+        extractor.getAllNameEntitiesfromInput(stream);
+        extractor.getBestNameEntity();
+        ArrayList<String> locationNameEntities = extractor.locationNameEntities;
+        String bestner = extractor.bestNameEntity;
+
+        // Resolve the names to gazetteer records.
+        HashMap<String, ArrayList<String>> resolvedGeonames =
+                searchGeoNames(locationNameEntities);
+
+        // Each input gets one GeoTag: the best name plus its alternatives.
+        GeoTag geotag = new GeoTag();
+        geotag.toGeoTag(resolvedGeonames, bestner);
+
+        addIfPresent(metadata, "Geographic_NAME", geotag.Geographic_NAME);
+        addIfPresent(metadata, "Geographic_LONGITUDE",
+                geotag.Geographic_LONGTITUDE);
+        addIfPresent(metadata, "Geographic_LATITUDE",
+                geotag.Geographic_LATITUDE);
+        for (int i = 0; i < geotag.alternatives.size(); ++i) {
+            GeoTag alter = (GeoTag) geotag.alternatives.get(i);
+            addIfPresent(metadata, "Optional_NAME" + (i + 1),
+                    alter.Geographic_NAME);
+            addIfPresent(metadata, "Optional_LONGITUDE" + (i + 1),
+                    alter.Geographic_LONGTITUDE);
+            addIfPresent(metadata, "Optional_LATITUDE" + (i + 1),
+                    alter.Geographic_LATITUDE);
+        }
+    }
+
+    /** Adds the value to the metadata unless it is null (nothing resolved). */
+    private static void addIfPresent(Metadata metadata, String name,
+            String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /**
+     * Looks up the given location names with the gazetteer command.
+     *
+     * @param locationNameEntities names to resolve; null and blank names are
+     *        skipped
+     * @return map from each key of the gazetteer's JSON output to its list of
+     *         string values; empty when there is nothing to look up or the
+     *         output is not a JSON array
+     * @throws ExecuteException if the command exits with a non-zero status
+     * @throws IOException if the command cannot be run
+     */
+    public HashMap<String, ArrayList<String>> searchGeoNames(
+            ArrayList<String> locationNameEntities) throws ExecuteException,
+            IOException {
+        HashMap<String, ArrayList<String>> returnHash =
+                new HashMap<String, ArrayList<String>>();
+
+        CommandLine cmdLine = new CommandLine(GAZETTEER_COMMAND);
+        cmdLine.addArgument("-s");
+        boolean hasName = false;
+        if (locationNameEntities != null) {
+            for (String name : locationNameEntities) {
+                if (name != null && name.trim().length() > 0) {
+                    cmdLine.addArgument(name);
+                    hasName = true;
+                }
+            }
+        }
+        if (!hasName) {
+            // Nothing to search for: do not spawn the external process.
+            return returnHash;
+        }
+
+        LOG.fine("Executing: " + cmdLine);
+        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+        DefaultExecutor exec = new DefaultExecutor();
+        exec.setExitValue(0);
+        exec.setWatchdog(new ExecuteWatchdog(60000));
+        exec.setStreamHandler(new PumpStreamHandler(outputStream));
+        exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
+
+        // JSONValue.parse returns null for malformed input, so check the type
+        // before casting instead of failing with a NullPointerException.
+        Object parsed = JSONValue.parse(outputStream.toString("UTF-8"));
+        if (!(parsed instanceof JSONArray)) {
+            LOG.warning("Unexpected output from " + GAZETTEER_COMMAND);
+            return returnHash;
+        }
+
+        JSONArray json = (JSONArray) parsed;
+        for (int i = 0; i < json.size(); i++) {
+            JSONObject obj = (JSONObject) json.get(i);
+            for (Object key : obj.keySet()) {
+                String theKey = (String) key;
+                JSONArray vals = (JSONArray) obj.get(theKey);
+                ArrayList<String> stringVals = new ArrayList<String>(
+                        vals.size());
+                for (int j = 0; j < vals.size(); j++) {
+                    stringVals.add((String) vals.get(j));
+                }
+                returnHash.put(theKey, stringVals);
+            }
+        }
+        return returnHash;
+    }
+
+    /**
+     * Reports whether this parser can run with its default configuration: an
+     * NER model path is set and the ExternalParser check of the gazetteer passes.
+     */
+    public boolean isAvailable() {
+        return isAvailable(config);
+    }
+
+    /** Availability check against an explicit configuration. */
+    private static boolean isAvailable(GeoParserConfig cfg) {
+        String nerPath = cfg.getNERPath();
+        // Test the cheap condition first so that no process is started when
+        // the model path is missing.
+        return nerPath != null && !nerPath.equals("")
+                && ExternalParser.check(
+                        new String[] { GAZETTEER_COMMAND, "--help" }, -1);
+    }
+
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java (working copy)
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.File;
+import java.io.Serializable;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Configuration for {@link GeoParser}: holds the path of the NER model file
+ * used to find location names.
+ */
+public class GeoParserConfig implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+    private static final Logger LOG = Logger
+            .getLogger(GeoParserConfig.class.getName());
+
+    /** Classpath resource, relative to this class, used as the default model. */
+    private static final String DEFAULT_MODEL = "en-ner-location.bin";
+
+    private String nerModelPath = null;
+
+    /**
+     * Creates a configuration pointing at the default model when it is found
+     * as a plain file on the classpath; otherwise the path stays null.
+     */
+    public GeoParserConfig() {
+        URL model = GeoParserConfig.class.getResource(DEFAULT_MODEL);
+        // Only "file:" URLs can be turned into a File; a model packaged inside
+        // a jar would make new File(URI) throw IllegalArgumentException.
+        if (model != null && "file".equals(model.getProtocol())) {
+            try {
+                this.nerModelPath = new File(model.toURI()).getAbsolutePath();
+            } catch (URISyntaxException e) {
+                LOG.log(Level.WARNING, "Cannot locate default NER model", e);
+            } catch (IllegalArgumentException e) {
+                LOG.log(Level.WARNING, "Cannot locate default NER model", e);
+            }
+        }
+    }
+
+    /**
+     * Sets the NER model path. The call is ignored when the path is null, does
+     * not exist or names a directory.
+     *
+     * @param path location of the model file
+     */
+    public void setNERModelPath(String path) {
+        if (path == null) {
+            return;
+        }
+        File file = new File(path);
+        if (file.isDirectory() || !file.exists()) {
+            return;
+        }
+        nerModelPath = path;
+    }
+
+    /**
+     * @return the NER model path, or null when none is configured
+     */
+    public String getNERPath() {
+        return nerModelPath;
+    }
+
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java (revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java (working copy)
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A resolved location (name, longitude, latitude) together with the other
+ * locations resolved for the same input.
+ */
+public class GeoTag {
+    String Geographic_NAME;
+    // NOTE: "LONGTITUDE" is misspelled, but the field is read by other classes
+    // in this package, so the name is kept.
+    String Geographic_LONGTITUDE;
+    String Geographic_LATITUDE;
+    ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();
+
+    /** Sets the name, longitude and latitude of this tag. */
+    public void setMain(String name, String longitude, String latitude) {
+        Geographic_NAME = name;
+        Geographic_LONGTITUDE = longitude;
+        Geographic_LATITUDE = latitude;
+    }
+
+    /** Appends another location to the list of alternatives. */
+    public void addAlternative(GeoTag geotag) {
+        alternatives.add(geotag);
+    }
+
+    /**
+     * Stores resolved geoName entities in this GeoTag. The entry whose key
+     * equals <code>bestNER</code> becomes the main location; every other entry
+     * is added as an alternative. Entries with fewer than three values
+     * (name, longitude, latitude) are skipped.
+     *
+     * @param resolvedGeonames resolved entities; may be null
+     * @param bestNER best name entity among all the extracted entities for
+     *        the input stream; may be null
+     */
+    public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
+            String bestNER) {
+        if (resolvedGeonames == null) {
+            return;
+        }
+        for (Map.Entry<String, ArrayList<String>> entry : resolvedGeonames
+                .entrySet()) {
+            ArrayList<String> cur = entry.getValue();
+            if (cur == null || cur.size() < 3) {
+                // Incomplete record: indexing it would throw.
+                continue;
+            }
+            if (bestNER != null && bestNER.equals(entry.getKey())) {
+                setMain(cur.get(0), cur.get(1), cur.get(2));
+            } else {
+                GeoTag alter = new GeoTag();
+                alter.setMain(cur.get(0), cur.get(1), cur.get(2));
+                addAlternative(alter);
+            }
+        }
+    }
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java (working copy)
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.Span;
+
+import org.apache.commons.io.IOUtils;
+
+/**
+ * Extracts location names from text with an OpenNLP name-finder model and
+ * selects the most frequent one as the best name.
+ */
+public class NameEntityExtractor {
+    private String nerModelPath = null;
+    ArrayList<String> locationNameEntities;
+    String bestNameEntity;
+    private HashMap<String, Integer> tf;
+
+    public NameEntityExtractor(String nerModelpath) {
+        this.locationNameEntities = new ArrayList<String>();
+        this.bestNameEntity = null;
+        this.nerModelPath = nerModelpath;
+        tf = new HashMap<String, Integer>();
+    }
+
+    /**
+     * Uses OpenNLP to extract the location names that appear in the stream
+     * and appends them to {@link #locationNameEntities}. OpenNLP's default
+     * name finder is not very accurate; please refer to its documentation.
+     * <p>
+     * NOTE: the names are split on commas, and when no name is found a single
+     * empty string is added. This is kept as is for existing callers.
+     *
+     * @param stream stream that was passed to the parser; read as UTF-8
+     * @throws InvalidFormatException if the NER model file is invalid
+     * @throws IOException if the model or the stream cannot be read
+     */
+    public void getAllNameEntitiesfromInput(InputStream stream)
+            throws InvalidFormatException, IOException {
+
+        // Load the model and release the file handle even when loading fails.
+        TokenNameFinderModel model;
+        InputStream modelIn = new FileInputStream(nerModelPath);
+        try {
+            model = new TokenNameFinderModel(modelIn);
+        } finally {
+            modelIn.close();
+        }
+        NameFinderME nameFinder = new NameFinderME(model);
+        String[] in = IOUtils.toString(stream, "UTF-8").split(" ");
+
+        Span nameE[] = nameFinder.find(in);
+
+        // Join the names into "[a, b, c]", strip the brackets and split again.
+        String spanNames = Arrays.toString(Span.spansToStrings(nameE, in));
+        spanNames = spanNames.substring(1, spanNames.length() - 1);
+        String[] tmp = spanNames.split(",");
+
+        for (String name : tmp) {
+            this.locationNameEntities.add(name.trim());
+        }
+    }
+
+    /**
+     * Picks the best location entity extracted from the input stream: simply
+     * the most frequent one. If several entities share the highest frequency,
+     * one of them is picked at random. May not be the optimal solution, but
+     * works. As a side effect {@link #locationNameEntities} is replaced by the
+     * distinct names in descending order of frequency.
+     */
+    public void getBestNameEntity() {
+        if (this.locationNameEntities.size() == 0) {
+            return;
+        }
+
+        // Count how often each name occurs.
+        for (String name : this.locationNameEntities) {
+            Integer count = tf.get(name);
+            tf.put(name, count == null ? 1 : count + 1);
+        }
+
+        List<Map.Entry<String, Integer>> list =
+                new ArrayList<Map.Entry<String, Integer>>(tf.entrySet());
+        // Shuffle first: the sort is stable, so ties end up in random order.
+        Collections.shuffle(list);
+        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
+            public int compare(Map.Entry<String, Integer> o1,
+                    Map.Entry<String, Integer> o2) {
+                return o2.getValue().compareTo(o1.getValue()); // descending
+            }
+        });
+
+        // Rewrite the list so that the names are in descending order.
+        this.locationNameEntities.clear();
+        for (Map.Entry<String, Integer> entry : list) {
+            this.locationNameEntities.add(entry.getKey());
+        }
+        // The list is sorted, so its first entry has the highest count.
+        this.bestNameEntity = list.get(0).getKey();
+    }
+
+}
Index: tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
===================================================================
--- tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (revision 1680458)
+++ tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (working copy)
@@ -63,3 +63,5 @@
org.apache.tika.parser.jdbc.SQLite3Parser
org.apache.tika.parser.isatab.ISArchiveParser
org.apache.tika.parser.geoinfo.GeographicInformationParser
+org.apache.tika.parser.geo.topic.GeoParser
+
Index: tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java (revision 0)
+++ tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java (working copy)
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParserTest {
+ private Parser geoparser = new GeoParser();
+ // Parses a text mentioning China and United States and checks the resulting metadata.
+ @Test
+ public void testFunctions() throws UnsupportedEncodingException,
+ IOException, SAXException, TikaException {
+ String text = "The millennial-scale cooling trend that followed the HTM coincides with the decrease in China "
+ + "summer insolation driven by slow changes in Earth's orbit. Despite the nearly linear forcing, the transition from the HTM to "
+ + "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand how feedbacks and perturbations result in rapid changes, "
+ + "a geographically distributed network of United States proxy climate records was examined to study the spatial and temporal patterns of change, and to "
+ + "quantify the magnitude of change during these transitions. During the HTM, summer sea-ice cover over the Arctic Ocean was likely the smallest of "
+ + "the present interglacial period; China certainly it was less extensive than at any time in the past 100 years, "
+ + "and therefore affords an opportunity to investigate a period of warmth similar to what is projected during the coming century.";
+ // Hand a default GeoParserConfig to the parser through the ParseContext.
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ GeoParserConfig config = new GeoParserConfig();
+ context.set(GeoParserConfig.class, config);
+
+ InputStream s = new ByteArrayInputStream(text.getBytes("UTF-8"));
+ /* Skip the checks when isAvailable() reports that the parser cannot run. */
+ if (!((GeoParser)geoparser).isAvailable()) return;
+
+ geoparser.parse(s, new BodyContentHandler(), metadata, context);
+ // Expect China as the main location and United States as the first alternative.
+ assertNotNull(metadata.get("Geographic_NAME"));
+ assertNotNull(metadata.get("Geographic_LONGITUDE"));
+ assertNotNull(metadata.get("Geographic_LATITUDE"));
+ assertEquals("China", metadata.get("Geographic_NAME"));
+ assertEquals("United States", metadata.get("Optional_NAME1"));
+ assertEquals("27.33931", metadata.get("Geographic_LATITUDE"));
+ assertEquals("-108.60288", metadata.get("Geographic_LONGITUDE"));
+ assertEquals("39.76", metadata.get("Optional_LATITUDE1"));
+ assertEquals("-98.5", metadata.get("Optional_LONGITUDE1"));
+
+ }
+ // Parses an empty text and checks that no Geographic_* metadata is set.
+ @Test
+ public void testNulls() throws UnsupportedEncodingException, IOException,
+ SAXException, TikaException {
+ String text = "";
+
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ GeoParserConfig config = new GeoParserConfig();
+ context.set(GeoParserConfig.class, config);
+ geoparser.parse(new ByteArrayInputStream(text.getBytes("UTF-8")),
+ new BodyContentHandler(), metadata, context);
+ assertNull(metadata.get("Geographic_NAME"));
+ assertNull(metadata.get("Geographic_LONGITUDE"));
+ assertNull(metadata.get("Geographic_LATITUDE"));
+
+ }
+}