Index: tika-bundle/pom.xml
===================================================================
--- tika-bundle/pom.xml	(revision 1683145)
+++ tika-bundle/pom.xml	(working copy)
@@ -140,7 +140,8 @@
               geoapi, sis-metadata, sis-netcdf, sis-utility, 
               sis-storage, apache-mime4j-core, apache-mime4j-dom,
               jsr-275, jhighlight, java-libpst, jwnl, 
-              netcdf4, grib, cdm, httpservices, jcip-annotations, 
+              netcdf4, grib, cdm, httpservices, jcip-annotations,
+              ctakes-core, findstructapi, openaifsm, ctakes-type-system, lucene-core, uimaj-core, uimafit-core, commons-lang, spring-beans,
               jmatio, guava
             </Embed-Dependency>
             <Embed-Transitive>true</Embed-Transitive>
@@ -151,7 +152,7 @@
               org.apache.tika.parser.*,
             </Export-Package>
             <Import-Package>
-              !org.junit,
+              !org.junit, 
               *,
               org.apache.tika.fork,
               android.util;resolution:=optional,
Index: tika-example/pom.xml
===================================================================
--- tika-example/pom.xml	(revision 1683145)
+++ tika-example/pom.xml	(working copy)
@@ -104,11 +104,13 @@
         <artifactId>jackrabbit-core</artifactId>
         <version>2.3.6</version>
     </dependency>   	
+<!--
 	<dependency>
 	    <groupId>org.apache.lucene</groupId>
 	    <artifactId>lucene-core</artifactId>
 	    <version>3.5.0</version>
 	</dependency>	
+-->
 	<dependency>
 	    <groupId>commons-io</groupId>
 	    <artifactId>commons-io</artifactId>
Index: tika-parsers/pom.xml
===================================================================
--- tika-parsers/pom.xml	(revision 1683145)
+++ tika-parsers/pom.xml	(working copy)
@@ -326,6 +326,27 @@
 		<artifactId>geoapi</artifactId>
 		<version>3.0.0</version>
 	</dependency>
+    <!-- Apache cTAKES -->
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+      <version>3.2.2</version>
+<!--
+      <exclusions>
+        <exclusion>
+          <groupId>org.apache.lucene</groupId>
+          <artifactId>lucene-core</artifactId>
+        </exclusion>
+      </exclusions>
+-->
+    </dependency>
+<!--
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-type-system</artifactId>
+      <version>3.2.2</version>
+    </dependency>
+-->
   </dependencies>
 
   <build>
Index: tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESAnnotationProperty.java	(working copy)
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+public enum CTAKESAnnotationProperty {
+	BEGIN("start"),
+	END("end"),
+	CONDITIONAL("conditional"),
+	CONFIDENCE("confidence"),
+	DISCOVERY_TECNIQUE("discoveryTechnique"), // sic: the misspelled constant name is referenced by CTAKESUtils
+	GENERIC("generic"),
+	HISTORY_OF("historyOf"),
+	ID("id"),
+	ONTOLOGY_CONCEPT_ARR("ontologyConceptArr"),
+	POLARITY("polarity");
+	
+	private final String name; // label reported by getName(); fixed for each constant
+	
+	CTAKESAnnotationProperty(String name) {
+		this.name = name;
+	}
+	/** Returns the label of this property, e.g. {@code "start"} for {@link #BEGIN}. */
+	public String getName() {
+		return name;
+	}
+}
\ No newline at end of file
Index: tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESConfig.java	(working copy)
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.Serializable;
+import java.util.Properties;
+
+import org.apache.tika.io.NullOutputStream;
+
+/**
+ * Configuration for {@link CTAKESContentHandler}.
+ * 
+ * This class allows enabling cTAKES and setting its parameters.
+ * 
+ */
+public class CTAKESConfig implements Serializable {
+	/**
+	 * Serial version UID
+	 */
+	private static final long serialVersionUID = -1599741171775528923L;
+	
+	// Path to XML descriptor for AnalysisEngine
+	private String aeDescriptorPath = "/ctakes-core/desc/analysis_engine/SentencesAndTokensAggregate.xml";
+	
+	// UMLS username
+	private String UMLSUser = "";
+	
+	// UMLS password
+	private String UMLSPass = "";
+	
+	// Enables formatted output
+	private boolean prettyPrint = true;
+	
+	// Type of cTAKES (UIMA) serializer
+	private CTAKESSerializer serializerType = CTAKESSerializer.XMI;
+	
+	// OutputStream object used for CAS serialization; transient because an OutputStream is not Serializable
+	private transient OutputStream stream = NullOutputStream.NULL_OUTPUT_STREAM;
+	
+	// Enables CAS serialization
+	private boolean serialize = false;
+	
+	// Enables text analysis using cTAKES
+	// (on by default)
+	private boolean text = true;
+	
+	// List of metadata to analyze using cTAKES
+	private String[] metadata = null;
+	
+	// List of annotation properties to add to metadata in addition to text covered by an annotation
+	private CTAKESAnnotationProperty[] annotationProps = null;
+	
+	// Character used to separate the annotation properties into metadata
+	private char separatorChar = ':';
+
+	/**
+	 * Creates a configuration from the {@code CTAKESConfig.properties} resource located next to this class, if present.
+	 */
+	public CTAKESConfig() {
+		init(CTAKESConfig.class.getResourceAsStream("CTAKESConfig.properties"));
+	}
+	
+	/**
+	 * Loads properties from InputStream and then tries to close InputStream.
+	 * @param stream {@link InputStream} object used to read properties.
+	 */
+	public CTAKESConfig(InputStream stream) {
+		init(stream);
+	}
+	
+	private void init(InputStream stream) {
+		// Overlay the stream's properties (if any) on the defaults; a non-null stream is always closed.
+		Properties props = new Properties();
+		if (stream != null) {
+			try {
+				props.load(stream);
+			} catch (IOException e) {
+				// Best effort: fall back to the defaults for anything that was not read. TODO warning
+			} finally {
+				try {
+					stream.close();
+				} catch (IOException ioe) {
+					// TODO warning
+				}
+			}
+		}
+		setAeDescriptorPath(props.getProperty("aeDescriptorPath", getAeDescriptorPath()));
+		setUMLSUser(props.getProperty("UMLSUser", getUMLSUser()));
+		setUMLSPass(props.getProperty("UMLSPass", getUMLSPass()));
+		setText(Boolean.valueOf(props.getProperty("text", Boolean.toString(isText()))));
+		// Blank/absent values: metadata becomes an empty array (never null); the other two keep their defaults.
+		String names = props.getProperty("metadata", getMetadataAsString()).trim();
+		setMetadata(names.isEmpty() ? new String[0] : names.split(","));
+		String annotations = props.getProperty("annotationProps", "").trim();
+		if (!annotations.isEmpty()) { setAnnotationProps(annotations.split(",")); }
+		String separator = props.getProperty("separatorChar", "");
+		if (!separator.isEmpty()) { setSeparatorChar(separator.charAt(0)); }
+	}
+	
+	/**
+	 * Returns the path to the XML descriptor for the AnalysisEngine.
+	 * @return the path to the XML descriptor for the AnalysisEngine.
+	 */
+	public String getAeDescriptorPath() {
+		return aeDescriptorPath;
+	}
+	
+	/**
+	 * Returns the UMLS username.
+	 * @return the UMLS username (empty by default).
+	 */
+	public String getUMLSUser() {
+		return UMLSUser;
+	}
+	
+	/**
+	 * Returns the UMLS password.
+	 * @return the UMLS password (empty by default).
+	 */
+	public String getUMLSPass() {
+		return UMLSPass;
+	}
+	
+	/**
+	 * Returns {@code true} if formatted output is enabled, {@code false} otherwise.
+	 * @return {@code true} if formatted output is enabled (the default), {@code false} otherwise.
+	 */
+	public boolean isPrettyPrint() {
+		return prettyPrint;
+	}
+	
+	/**
+	 * Returns the type of cTAKES (UIMA) serializer used to write the CAS.
+	 * @return the type of cTAKES serializer ({@code XMI} by default).
+	 */
+	public CTAKESSerializer getSerializerType() {
+		return serializerType;
+	}
+	
+	/**
+	 * Returns the {@link OutputStream} object used to write the CAS.
+	 * @return the stream used to write the CAS; a no-op stream if none is set, never {@code null}.
+	 */
+	public OutputStream getStream() {
+		return (stream != null) ? stream : NullOutputStream.NULL_OUTPUT_STREAM;
+	}
+	
+	/**
+	 * Returns {@code true} if CAS serialization is enabled, {@code false} otherwise.
+	 * @return {@code true} if CAS serialization output is enabled, {@code false} otherwise.
+	 */
+	public boolean isSerialize() {
+		return serialize;
+	}
+	/** Returns {@code true} if the document text is analyzed using cTAKES, {@code false} otherwise. */
+	public boolean isText() {
+		return text;
+	}
+	/** Returns the names of the metadata to analyze using cTAKES; {@code null} if none was ever set. */
+	public String[] getMetadata() {
+		return metadata;
+	}
+	
+	// Comma-joins the configured metadata names; an unset list is rendered as the empty string.
+	public String getMetadataAsString() {
+		StringBuilder joined = new StringBuilder();
+		if (metadata != null) {
+			// Prefix every element but the first with the delimiter.
+			String delimiter = "";
+			for (String name : metadata) {
+				joined.append(delimiter).append(name);
+				delimiter = ",";
+			}
+		}
+		return joined.toString();
+	}
+	
+	/** Returns the annotation properties added to each annotation value; {@code null} if none is set. */
+	public CTAKESAnnotationProperty[] getAnnotationProps() {
+		return annotationProps;
+	}
+	
+	/** Builds the schema string: "coveredText" followed by each property label, each preceded by the separator. */
+	public String getAnnotationPropsAsString() {
+		StringBuilder schema = new StringBuilder("coveredText");
+		CTAKESAnnotationProperty[] selected = (annotationProps != null) ? annotationProps : new CTAKESAnnotationProperty[0];
+		for (CTAKESAnnotationProperty property : selected) {
+			schema.append(separatorChar).append(property.getName());
+		}
+		return schema.toString();
+	}
+	
+	/** Returns the character used to separate the annotation properties within a metadata value. */
+	public char getSeparatorChar() {
+		return separatorChar;
+	}
+
+	/**
+	 * Sets the path to XML descriptor for AnalysisEngine.
+	 * @param aeDescriptorPath the path to XML descriptor for AnalysisEngine.
+	 */
+	public void setAeDescriptorPath(String aeDescriptorPath) {
+		this.aeDescriptorPath = aeDescriptorPath;
+	}
+
+	/**
+	 * Sets the UMLS username.
+	 * @param uMLSUser the UMLS username.
+	 */
+	public void setUMLSUser(String uMLSUser) {
+		this.UMLSUser = uMLSUser;
+	}
+
+	/**
+	 * Sets the UMLS password.
+	 * @param uMLSPass the UMLS password.
+	 */
+	public void setUMLSPass(String uMLSPass) {
+		this.UMLSPass = uMLSPass;
+	}
+
+	/**
+	 * Enables the formatted output for serializer.
+	 * @param prettyPrint {@code true} to enable formatted output, {@code false} otherwise.
+	 */
+	public void setPrettyPrint(boolean prettyPrint) {
+		this.prettyPrint = prettyPrint;
+	}
+
+	/**
+	 * Sets the type of cTAKES (UIMA) serializer used to write CAS. 
+	 * @param serializerType the type of cTAKES serializer.
+	 */
+	public void setSerializerType(CTAKESSerializer serializerType) {
+		this.serializerType = serializerType;
+	}
+	/** Sets the {@link OutputStream} object used for CAS serialization. */
+	public void setOutputStream(OutputStream stream) {
+		this.stream = stream;
+	}
+	
+	/**
+	 * Enables CAS serialization.
+	 * @param serialize {@code true} to enable CAS serialization, {@code false} otherwise.
+	 */
+	public void setSerialize(boolean serialize) {
+		this.serialize = serialize;
+	}
+	/** Enables or disables the analysis of the document text using cTAKES. */
+	public void setText(boolean text) {
+		this.text = text;
+	}
+	/** Sets the names of the metadata to analyze using cTAKES. */
+	public void setMetadata(String[] metadata) {
+		this.metadata = metadata;
+	}
+	
+	public void setAnnotationProps(CTAKESAnnotationProperty[] annotationProps) {
+		this.annotationProps = annotationProps;
+	}
+	/** Sets the annotation properties from enum constant names (e.g. "BEGIN"); whitespace around a name is ignored. */
+	public void setAnnotationProps(String[] annotationProps) {
+		CTAKESAnnotationProperty[] properties = new CTAKESAnnotationProperty[annotationProps.length];
+		for (int i = 0; i < annotationProps.length; i++) {
+			properties[i] = CTAKESAnnotationProperty.valueOf(annotationProps[i].trim());
+		}
+		setAnnotationProps(properties);
+	}
+	/** Sets the character used to separate the annotation properties within a metadata value. */
+	public void setSeparatorChar(char separatorChar) {
+		this.separatorChar = separatorChar;
+	}
+}
\ No newline at end of file
Index: tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESContentHandler.java	(working copy)
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.net.MalformedURLException;
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Class used to extract biomedical information while parsing. 
+ *
+ * <p>
+ * This class relies on <a href="http://ctakes.apache.org/">Apache cTAKES</a> 
+ * that is a natural language processing system for extraction of information 
+ * from electronic medical record clinical free-text.
+ * </p>
+ *
+ */
+public class CTAKESContentHandler extends ContentHandlerDecorator {
+	// Prefix used for metadata including cTAKES annotations
+	public static final String CTAKES_META_PREFIX = "ctakes:";
+	
+	// Configuration object for CTAKESContentHandler
+	private final CTAKESConfig config;
+	
+	// StringBuilder object used to build the clinical free-text for cTAKES
+	private final StringBuilder sb;
+	
+	// Metadata object used for cTAKES annotations
+	private final Metadata metadata;
+	
+	public CTAKESContentHandler(ContentHandler handler, Metadata metadata, CTAKESConfig config) throws MalformedURLException {
+		super(handler); // handler is the decorated (wrapped) ContentHandler
+		this.metadata = metadata;
+		this.config = config;
+		this.sb = new StringBuilder(); // collects the character data handed to cTAKES (see characters())
+	}
+	/** Creates a handler that uses a default {@link CTAKESConfig}. */
+	public CTAKESContentHandler(ContentHandler handler, Metadata metadata) throws MalformedURLException {
+		this(handler, metadata, new CTAKESConfig());
+	}
+	/** Creates a handler around a {@link DefaultHandler} with a new {@link Metadata} and a default {@link CTAKESConfig}. */
+	public CTAKESContentHandler() throws MalformedURLException {
+		this(new DefaultHandler(), new Metadata());
+	}
+	
+	@Override
+	public void characters(char[] ch, int start, int length) throws SAXException {
+		// Buffer the text for cTAKES (only when text analysis is enabled), then pass the event on.
+		boolean analyzeText = config.isText();
+		if (analyzeText) { sb.append(ch, start, length); }
+		super.characters(ch, start, length);
+	}
+
+	@Override
+	public void endDocument() throws SAXException {
+		// Runs cTAKES over the selected metadata values plus the buffered text and stores the annotations as metadata.
+		try {
+			// create an Analysis Engine
+			AnalysisEngine ae = CTAKESUtils.getAnalysisEngine(config.getAeDescriptorPath(), config.getUMLSUser(), config.getUMLSPass());
+			// get the shared JCas and clear whatever a previous document left in it
+			JCas jcas = CTAKESUtils.getJCas(ae);
+			jcas.reset();
+			
+			StringBuilder metaText = new StringBuilder();
+			String[] metaNames = config.getMetadata();
+			if (metaNames != null) {
+				for (String name : metaNames) {
+					for (String value : metadata.getValues(name)) {
+						metaText.append(value);
+						metaText.append(System.lineSeparator());
+					}
+				}
+			}
+			
+			// analyze text
+			jcas.setDocumentText(metaText.toString() + sb.toString());
+			ae.process(jcas);
+			
+			// add annotations to metadata
+			metadata.add(CTAKES_META_PREFIX + "schema", config.getAnnotationPropsAsString());
+			CTAKESAnnotationProperty[] annotationPros = config.getAnnotationProps();
+			for (IdentifiedAnnotation annotation : JCasUtil.select(jcas, IdentifiedAnnotation.class)) {
+				StringBuilder annotationBuilder = new StringBuilder();
+				annotationBuilder.append(annotation.getCoveredText());
+				if (annotationPros != null) {
+					for (CTAKESAnnotationProperty property : annotationPros) {
+						annotationBuilder.append(config.getSeparatorChar());
+						annotationBuilder.append(CTAKESUtils.getAnnotationProperty(annotation, property));
+					}
+				}
+				metadata.add(CTAKES_META_PREFIX + annotation.getType().getShortName(), annotationBuilder.toString());
+			}
+			if (config.isSerialize()) {
+				// serialize data
+				CTAKESUtils.serialize(config.getSerializerType(), config.isPrettyPrint(), config.getStream());
+			}
+		} catch (Exception e) {
+			throw new SAXException(e.getMessage(), e); // keep the cause instead of printing the stack trace
+		}
+		super.endDocument(); // the decorated handler must be told about the end of the document too
+	}
+	
+	/**
+	 * Returns metadata that includes cTAKES annotations.
+	 * @return {@link Metadata} object that includes cTAKES annotations.
+	 */
+	public Metadata getMetadata() {
+		return metadata;
+	}
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESParser.java	(working copy)
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ParserDecorator;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * CTAKESParser decorates {@link AutoDetectParser} and leverages {@link
+ * CTAKESContentHandler} to extract biomedical information from clinical text.
+ * <p>
+ * ...
+ * </p>
+ * ...
+ * <p>
+ * CTAKESConfig config = new CTAKESConfig();<br />
+ * parseContext.set(CTAKESConfig.class, config);
+ * </p>
+ */
+public class CTAKESParser extends ParserDecorator {
+	/**
+	 * Serial version UID
+	 */
+	private static final long serialVersionUID = -2313482748027097961L;
+
+	/**
+	 * Creates a parser that decorates a new {@link AutoDetectParser}.
+	 */
+	public CTAKESParser() {
+		super(new AutoDetectParser());
+	}
+
+	/** Parses with the decorated parser, routing SAX events through a {@link CTAKESContentHandler}. */
+	@Override
+	public void parse(InputStream stream, ContentHandler handler,
+			Metadata metadata, ParseContext context) throws IOException,
+			SAXException, TikaException {
+		// Build the default configuration only when the context does not supply one.
+		CTAKESConfig config = context.get(CTAKESConfig.class);
+		if (config == null) { config = new CTAKESConfig(); }
+		super.parse(stream, new CTAKESContentHandler(handler, metadata, config), metadata, context);
+	}
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESSerializer.java	(working copy)
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.util.XmlCasSerializer;
+
+/**
+ * Enumeration for types of cTAKES (UIMA) CAS serializer supported by cTAKES.
+ * 
+ * A CAS serializer writes a CAS in the given format.
+ *
+ */
+public enum CTAKESSerializer {
+	XCAS(XCASSerializer.class.getName()),
+	XMI(XmiCasSerializer.class.getName()),
+	XML(XmlCasSerializer.class.getName());
+	
+	// Fully qualified name of the UIMA serializer class behind the constant
+	private final String className;
+	CTAKESSerializer(String serializerClassName) {
+		this.className = serializerClassName;
+	}
+	/** Returns the fully qualified class name of the UIMA serializer behind this constant. */
+	public String getClassName() {
+		return className;
+	}
+}
Index: tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java	(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/ctakes/CTAKESUtils.java	(working copy)
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ctakes;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.net.URISyntaxException;
+
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.impl.XCASSerializer;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.cas.impl.XmiSerializationSharedData;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceSpecifier;
+import org.apache.uima.util.InvalidXMLException;
+import org.apache.uima.util.XMLInputSource;
+import org.apache.uima.util.XmlCasSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * This class provides methods to extract biomedical information from plain text
+ * using {@link CTAKESContentHandler} that relies on Apache cTAKES.
+ * 
+ * <p>
+ * Apache cTAKES is built on top of <a href="https://uima.apache.org/">Apache
+ * UIMA</a> framework and <a href="https://opennlp.apache.org/">OpenNLP</a>
+ * toolkit.
+ * </p>
+ *
+ */
+public class CTAKESUtils {
+	// UIMA Analysis Engine: created lazily by getAnalysisEngine() and shared by all callers (no synchronization)
+	private static AnalysisEngine ae = null;
+
+	// JCas object for working with the CAS (Common Analysis System): created lazily by getJCas() and shared
+	private static JCas jcas = null;
+
+	// Name of the system property that receives the UMLS username
+	private final static String CTAKES_UMLS_USER = "ctakes.umlsuser";
+
+	// Name of the system property that receives the UMLS password
+	private final static String CTAKES_UMLS_PASS = "ctakes.umlspw";
+
+	/**
+	 * Returns the shared UIMA Analysis Engine (AE), creating it on first use.
+	 * Only one AE is ever created: once it exists, it is returned as is and the
+	 * arguments of later calls are ignored until {@link #reset()} is called.
+	 * 
+	 * <p>
+	 * An Analysis Engine is a component responsible for analyzing unstructured
+	 * information, discovering and representing semantic content. Unstructured
+	 * information includes, but is not restricted to, text documents.
+	 * </p>
+	 * 
+	 * @param aeDescriptor
+	 *            classpath resource name of the XML file including an
+	 *            AnalysisEngineDescription that contains all of the information
+	 *            needed to instantiate and use an AnalysisEngine.
+	 * @param umlsUser
+	 *            UMLS username for NLM database
+	 * @param umlsPass
+	 *            UMLS password for NLM database
+	 * @return an Analysis Engine for analyzing unstructured information.
+	 * @throws IOException
+	 *             if the descriptor cannot be found or any I/O error occurs.
+	 * @throws InvalidXMLException
+	 *             if the input XML is not valid or does not specify a valid
+	 *             ResourceSpecifier.
+	 * @throws ResourceInitializationException
+	 *             if a failure occurred during production of the resource.
+	 * @throws URISyntaxException
+	 *             declared for backward compatibility; no longer thrown by this
+	 *             implementation.
+	 */
+	public static AnalysisEngine getAnalysisEngine(String aeDescriptor,
+			String umlsUser, String umlsPass) throws IOException,
+			InvalidXMLException, ResourceInitializationException,
+			URISyntaxException {
+		if (ae == null) {
+			// Locate the descriptor on the classpath; a URL (unlike a file path) also works inside a jar.
+			java.net.URL aeDescriptorUrl = CTAKESUtils.class.getResource(aeDescriptor);
+			if (aeDescriptorUrl == null) {
+				throw new IOException("Analysis engine descriptor not found: " + aeDescriptor);
+			}
+
+			// get Resource Specifier from XML
+			XMLInputSource aeIputSource = new XMLInputSource(aeDescriptorUrl);
+			ResourceSpecifier aeSpecifier = UIMAFramework.getXMLParser()
+					.parseResourceSpecifier(aeIputSource);
+
+			// UMLS user ID and password
+			if ((umlsUser != null) && (!umlsUser.isEmpty())
+					&& (umlsPass != null) && (!umlsPass.isEmpty())) {
+				// It is highly recommended that you change UMLS credentials in the XML
+				// configuration file instead of giving user and password using CTAKESConfig.
+				System.setProperty(CTAKES_UMLS_USER, umlsUser);
+				System.setProperty(CTAKES_UMLS_PASS, umlsPass);
+			}
+
+			// create AE
+			ae = UIMAFramework.produceAnalysisEngine(aeSpecifier);
+		}
+		return ae;
+	}
+
+	/**
+	 * Returns the shared JCas, creating it from the given Analysis Engine on
+	 * first use. Only one JCas instance is created: later calls return it as
+	 * is, whatever AE is passed. A JCas is a Java Cover Classes based
+	 * Object-oriented CAS (Common Analysis System) API.
+	 * 
+	 * <p>
+	 * Important: It is highly recommended that you reuse CAS objects rather
+	 * than creating new CAS objects prior to each analysis. This is because CAS
+	 * objects may be expensive to create and may consume a significant amount
+	 * of memory.
+	 * </p>
+	 * 
+	 * @param ae
+	 *            AnalysisEngine used to create the JCas if none exists yet.
+	 * @return the shared JCas object.
+	 * @throws ResourceInitializationException
+	 *             if a CAS could not be created because this AnalysisEngine's
+	 *             CAS metadata (type system, type priorities, or FS indexes)
+	 *             are invalid.
+	 */
+	public static JCas getJCas(AnalysisEngine ae)
+			throws ResourceInitializationException {
+		if (jcas == null) {
+			jcas = ae.newJCas();
+		}
+		return jcas;
+	}
+
+	/**
+	 * Serializes the shared CAS in the given format.
+	 * 
+	 * @param type
+	 *            type of cTAKES (UIMA) serializer used to write CAS; any value
+	 *            other than XCAS and XMI selects the XML serializer.
+	 * @param prettyPrint
+	 *            {@code true} to do pretty printing of output (XCAS and XMI only).
+	 * @param stream
+	 *            {@link OutputStream} object used to print out information extracted by using cTAKES.
+	 * @throws SAXException if there was a SAX exception.
+	 * @throws IOException if any I/O error occurs.
+	 * @throws IllegalStateException if no JCas has been created yet.
+	 */
+	public static void serialize(CTAKESSerializer type, boolean prettyPrint,
+			OutputStream stream) throws SAXException, IOException {
+		if (jcas == null) {
+			throw new IllegalStateException("No JCas available: call getJCas(AnalysisEngine) first");
+		}
+		if (type == CTAKESSerializer.XCAS) {
+			XCASSerializer.serialize(jcas.getCas(), stream, prettyPrint);
+		} else if (type == CTAKESSerializer.XMI) {
+			XmiCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream, prettyPrint, new XmiSerializationSharedData());
+		} else {
+			XmlCasSerializer.serialize(jcas.getCas(), jcas.getTypeSystem(), stream);
+		}
+	}
+
+	/**
+	 * Returns the value of the given property of an annotation, rendered as a string.
+	 * @param annotation {@link IdentifiedAnnotation} object.
+	 * @param property {@link CTAKESAnnotationProperty} enum selecting the value to read.
+	 * @return the value (for ONTOLOGY_CONCEPT_ARR, the comma-separated CUIs of its UmlsConcept entries); {@code null} if property is {@code null}.
+	 */
+	public static String getAnnotationProperty(IdentifiedAnnotation annotation,
+			CTAKESAnnotationProperty property) {
+		String value = null;
+		if (property == CTAKESAnnotationProperty.BEGIN) {
+			value = Integer.toString(annotation.getBegin());
+		} else if (property == CTAKESAnnotationProperty.END) {
+			value = Integer.toString(annotation.getEnd());
+		} else if (property == CTAKESAnnotationProperty.CONDITIONAL) {
+			value = Boolean.toString(annotation.getConditional());
+		} else if (property == CTAKESAnnotationProperty.CONFIDENCE) {
+			value = Float.toString(annotation.getConfidence());
+		} else if (property == CTAKESAnnotationProperty.DISCOVERY_TECNIQUE) {
+			value = Integer.toString(annotation.getDiscoveryTechnique());
+		} else if (property == CTAKESAnnotationProperty.GENERIC) {
+			value = Boolean.toString(annotation.getGeneric());
+		} else if (property == CTAKESAnnotationProperty.HISTORY_OF) {
+			value = Integer.toString(annotation.getHistoryOf());
+		} else if (property == CTAKESAnnotationProperty.ID) {
+			value = Integer.toString(annotation.getId());
+		} else if (property == CTAKESAnnotationProperty.ONTOLOGY_CONCEPT_ARR) {
+			FSArray mentions = annotation.getOntologyConceptArr();
+			StringBuilder sb = new StringBuilder();
+			String delimiter = "";
+			if (mentions != null) {
+				for (int i = 0; i < mentions.size(); i++) {
+					if (mentions.get(i) instanceof UmlsConcept) {
+						UmlsConcept concept = (UmlsConcept) mentions.get(i);
+						// delimiter precedes every CUI but the first, so skipped entries leave no stray comma
+						sb.append(delimiter).append(concept.getCui());
+						delimiter = ",";
+					}
+				}
+			}
+			value = sb.toString();
+		} else if (property == CTAKESAnnotationProperty.POLARITY) {
+			value = Integer.toString(annotation.getPolarity());
+		}
+		return value;
+	}
+
+	/**
+	 * Resets cTAKES objects, if created. This method ensures that new cTAKES
+	 * objects (a.k.a., Analysis Engine and JCas) will be created if getters of
+	 * this class are called.
+	 */
+	public static void reset() {
+		// Analysis Engine (may not have been created yet)
+		if (ae != null) { ae.destroy(); }
+		ae = null;
+
+		// JCas (may not have been created yet)
+		if (jcas != null) { jcas.reset(); }
+		jcas = null;
+	}
+}
