Index: pom.xml
===================================================================
--- pom.xml (revision 726192)
+++ pom.xml (working copy)
@@ -211,6 +211,11 @@
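       <version>3.5-beta4</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi-ooxml</artifactId>
+      <version>3.5-beta4</version>
+    </dependency>
+    <dependency>
       <groupId>net.sourceforge.nekohtml</groupId>
       <artifactId>nekohtml</artifactId>
       <version>1.9.9</version>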
Index: src/main/java/org/apache/tika/metadata/MSOffice.java
===================================================================
--- src/main/java/org/apache/tika/metadata/MSOffice.java (revision 726192)
+++ src/main/java/org/apache/tika/metadata/MSOffice.java (working copy)
@@ -46,5 +46,29 @@
public static final String TEMPLATE = "Template";
public static final String AUTHOR = "Author";
+
+    public static final String TOTAL_TIME = "Total-Time";
+
+    public static final String SLIDE_COUNT = "Slide-Count";
+
+    public static final String PRESENTATION_FORMAT = "Presentation-Format";
+
+    public static final String PARAGRAPH_COUNT = "Paragraph-Count";
+
+    public static final String NOTES = "Notes";
+
+    public static final String MANAGER = "Manager";
+
+    public static final String LINE_COUNT = "Line-Count";
+
+    public static final String CHARACTER_COUNT_WITH_SPACES = "Character-Count-With-Spaces";
+
+    public static final String APPLICATION_VERSION = "Application-Version";
+
+    public static final String VERSION = "Version"; // capitalized like its siblings
+
+    public static final String CONTENT_STATUS = "Content-Status";
+
+    public static final String CATEGORY = "Category"; // capitalized like its siblings
}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (revision 0)
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Common superclass of the Tika OOXML extractors.
+ *
+ * An instance decorates a POI {@link POIXMLTextExtractor} so that the parsed
+ * content of the document is delivered as a sequence of XHTML SAX events.
+ * Subclasses provide the content by implementing
+ * {@link #buildXHTML(XHTMLContentHandler)}.
+ */
+public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+    protected POIXMLTextExtractor extractor;
+
+    public AbstractOOXMLExtractor(POIXMLTextExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    /**
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
+     */
+    public POIXMLDocument getDocument() {
+        return this.extractor.getDocument();
+    }
+
+    /**
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
+     */
+    public MetadataExtractor getMetadataExtractor() {
+        return new MetadataExtractor(this.extractor);
+    }
+
+    /**
+     * Brackets the content written by the subclass with the start and the end
+     * of an XHTML document.
+     */
+    public XHTMLContentHandler getXHTML(ContentHandler handler,
+            Metadata metadata) throws SAXException, XmlException, IOException {
+        XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
+        out.startDocument();
+        buildXHTML(out);
+        out.endDocument();
+        return out;
+    }
+
+    /**
+     * Writes the document content to the given {@link XHTMLContentHandler}.
+     */
+    protected abstract void buildXHTML(XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException;
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (revision 0)
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.POIXMLProperties.CoreProperties;
+import org.apache.poi.POIXMLProperties.ExtendedProperties;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.internal.PackagePropertiesPart;
+import org.openxml4j.util.Nullable;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+
+/**
+ * OOXML metadata extractor.
+ *
+ * Currently POI doesn't support metadata extraction for OOXML.
+ *
+ * @see OOXMLExtractor#getMetadataExtractor()
+ */
+public class MetadataExtractor {
+    private final POIXMLTextExtractor extractor;
+
+    public MetadataExtractor(POIXMLTextExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    /**
+     * Stores the core and the extended properties of the document, as far as
+     * they are set, in the given metadata object.
+     * @throws TikaException if the properties cannot be read
+     */
+    public void extract(Metadata metadata) throws TikaException {
+        try {
+            extractMetadata(extractor.getCoreProperties(), metadata);
+            extractMetadata(extractor.getExtendedProperties(), metadata);
+        } catch (IOException e) {
+            throw new TikaException("Error extracting OOXML metadata", e);
+        } catch (OpenXML4JException e) {
+            throw new TikaException("Error extracting OOXML metadata", e);
+        } catch (XmlException e) {
+            throw new TikaException("Error extracting OOXML metadata", e);
+        }
+    }
+
+    /** Copies the core properties of the package into the metadata. */
+    private void extractMetadata(CoreProperties properties, Metadata metadata) {
+        PackagePropertiesPart core = properties.getUnderlyingProperties();
+        addProperty(metadata, Metadata.CATEGORY, core.getCategoryProperty());
+        addProperty(metadata, Metadata.CONTENT_STATUS,
+                core.getContentStatusProperty());
+        // NOTE(review): unlike its neighbours this is not a get...Property()
+        // call -- confirm it yields the document type, not the part's own.
+        addProperty(metadata, Metadata.CONTENT_TYPE, core.getContentType());
+        addProperty(metadata, Metadata.DATE, core.getCreatedPropertyString());
+        addProperty(metadata, Metadata.CREATOR, core.getCreatorProperty());
+        addProperty(metadata, Metadata.AUTHOR, core.getCreatorProperty());
+        addProperty(metadata, Metadata.DESCRIPTION,
+                core.getDescriptionProperty());
+        addProperty(metadata, Metadata.IDENTIFIER,
+                core.getIdentifierProperty());
+        addProperty(metadata, Metadata.KEYWORDS, core.getKeywordsProperty());
+        addProperty(metadata, Metadata.LANGUAGE, core.getLanguageProperty());
+        addProperty(metadata, Metadata.LAST_AUTHOR,
+                core.getLastModifiedByProperty());
+        addProperty(metadata, Metadata.LAST_PRINTED,
+                core.getLastPrintedPropertyString());
+        addProperty(metadata, Metadata.LAST_MODIFIED,
+                core.getModifiedPropertyString());
+        addProperty(metadata, Metadata.REVISION_NUMBER,
+                core.getRevisionProperty());
+        addProperty(metadata, Metadata.SUBJECT, core.getSubjectProperty());
+        addProperty(metadata, Metadata.TITLE, core.getTitleProperty());
+        addProperty(metadata, Metadata.VERSION, core.getVersionProperty());
+    }
+
+    /** Copies the extended (application) properties into the metadata. */
+    private void extractMetadata(ExtendedProperties properties,
+            Metadata metadata) {
+        CTProperties ext = properties.getUnderlyingProperties();
+        addProperty(metadata, Metadata.APPLICATION_NAME, ext.getApplication());
+        addProperty(metadata, Metadata.APPLICATION_VERSION,
+                ext.getAppVersion());
+        addProperty(metadata, Metadata.CHARACTER_COUNT, ext.getCharacters());
+        addProperty(metadata, Metadata.CHARACTER_COUNT_WITH_SPACES,
+                ext.getCharactersWithSpaces());
+        addProperty(metadata, Metadata.PUBLISHER, ext.getCompany());
+        addProperty(metadata, Metadata.LINE_COUNT, ext.getLines());
+        addProperty(metadata, Metadata.MANAGER, ext.getManager());
+        addProperty(metadata, Metadata.NOTES, ext.getNotes());
+        addProperty(metadata, Metadata.PAGE_COUNT, ext.getPages());
+        addProperty(metadata, Metadata.PARAGRAPH_COUNT, ext.getParagraphs());
+        addProperty(metadata, Metadata.PRESENTATION_FORMAT,
+                ext.getPresentationFormat());
+        addProperty(metadata, Metadata.SLIDE_COUNT, ext.getSlides());
+        addProperty(metadata, Metadata.TEMPLATE, ext.getTemplate());
+        addProperty(metadata, Metadata.TOTAL_TIME, ext.getTotalTime());
+        addProperty(metadata, Metadata.WORD_COUNT, ext.getWords());
+    }
+
+    /** Sets the property if the nullable wrapper holds a value. */
+    private void addProperty(Metadata metadata, String name, Nullable<?> value) {
+        Object held = value.getValue();
+        if (held != null) {
+            addProperty(metadata, name, held.toString());
+        }
+    }
+
+    /** Sets the property unless the value is null. */
+    private void addProperty(Metadata metadata, String name, String value) {
+        if (value != null) {
+            metadata.set(name, value);
+        }
+    }
+
+    /** Sets the property for positive values only; others are skipped. */
+    private void addProperty(Metadata metadata, String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
+        }
+    }
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java (revision 0)
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Interface implemented by all Tika OOXML extractors.
+ *
+ * @see POIXMLTextExtractor
+ */
+public interface OOXMLExtractor {
+
+    /**
+     * Returns the opened document.
+     *
+     * @see POIXMLTextExtractor#getDocument()
+     */
+    POIXMLDocument getDocument();
+
+    /**
+     * Returns the Tika metadata extractor for the document; POI does not yet
+     * support {@link POIXMLTextExtractor#getMetadataTextExtractor()} for OOXML.
+     */
+    MetadataExtractor getMetadataExtractor();
+
+    /**
+     * Returns to clients a {@link XHTMLContentHandler} object representing
+     * the parsed content of the document as XHTML SAX events.
+     */
+    XHTMLContentHandler getXHTML(ContentHandler handler, Metadata metadata)
+            throws SAXException, XmlException, IOException;
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (revision 0)
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+
+/**
+ * Figures out the correct {@link OOXMLExtractor} for the supplied POI
+ * extractor and returns it.
+ */
+public class OOXMLExtractorFactory {
+
+    /** Dispatches on the extractor type itself, so every cast below is safe. */
+    public static OOXMLExtractor createExtractor(POIXMLTextExtractor extractor) {
+        if (extractor instanceof XSLFPowerPointExtractor) {
+            return new XSLFPowerPointExtractorDecorator(
+                    (XSLFPowerPointExtractor) extractor);
+        }
+        if (extractor instanceof XSSFExcelExtractor) {
+            return new XSSFExcelExtractorDecorator(
+                    (XSSFExcelExtractor) extractor);
+        }
+        if (extractor instanceof XWPFWordExtractor) {
+            return new XWPFWordExtractorDecorator((XWPFWordExtractor) extractor);
+        }
+        return new POIXMLTextExtractorDecorator(extractor);
+    }
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (revision 0)
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.InvalidFormatException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/** Office Open XML (OOXML) parser. */
+public class OOXMLParser implements Parser {
+
+    /**
+     * Extracts the metadata first, because the XHTML handler is created from
+     * the same metadata object, and then emits the document content.
+     * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+     *      org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata)
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        try {
+            // fail cleanly if the POI factory returns a non-OOXML extractor
+            Object poi = ExtractorFactory.createExtractor(stream);
+            if (!(poi instanceof POIXMLTextExtractor)) {
+                throw new TikaException("Not an OOXML document");
+            }
+            OOXMLExtractor extractor = OOXMLExtractorFactory
+                    .createExtractor((POIXMLTextExtractor) poi);
+            extractor.getMetadataExtractor().extract(metadata);
+            extractor.getXHTML(handler, metadata);
+        } catch (InvalidFormatException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (OpenXML4JException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (XmlException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        }
+    }
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (revision 0)
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/** Decorator that emits the complete document text as a single paragraph. */
+public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
+    public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) {
+        super(extractor);
+    }
+
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
+        String wholeText = extractor.getText();
+        xhtml.element("p", wholeText);
+    }
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (revision 0)
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.xml.sax.SAXException;
+
+/** Decorator emitting one div per slide: slide text, comments, then notes. */
+public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+    public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
+        super(extractor);
+    }
+
+    /** @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText() */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        XSLFSlideShow slideShow = (XSLFSlideShow) extractor.getDocument();
+        XMLSlideShow xmlSlideShow = new XMLSlideShow(slideShow);
+        XSLFSlideShow rawShow = xmlSlideShow._getXSLFSlideShow();
+
+        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
+            CTSlide rawSlide = slide._getCTSlide();
+            CTSlideIdListEntry slideId = slide._getCTSlideId();
+            CTNotesSlide notes = rawShow.getNotes(slideId);
+            CTCommentList comments = rawShow.getSlideComments(slideId);
+
+            xhtml.startElement("div");
+            extractShapeContent(rawSlide.getCSld().getSpTree(), xhtml);
+            if (comments != null) {
+                for (CTComment comment : comments.getCmArray()) {
+                    xhtml.element("p", comment.getText());
+                }
+            }
+            if (notes != null) {
+                extractShapeContent(notes.getCSld().getSpTree(), xhtml);
+            }
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Emits one paragraph element per non-empty text paragraph of the shapes;
+     * the runs of a paragraph are joined so that it stays a single paragraph.
+     */
+    private void extractShapeContent(CTGroupShape gs, XHTMLContentHandler xhtml)
+            throws SAXException {
+        for (CTShape shape : gs.getSpArray()) {
+            CTTextBody textBody = shape.getTxBody();
+            if (textBody != null) {
+                for (CTTextParagraph paragraph : textBody.getPArray()) {
+                    StringBuilder text = new StringBuilder();
+                    for (CTRegularTextRun run : paragraph.getRArray()) {
+                        if (run.getT() != null) {
+                            text.append(run.getT());
+                        }
+                    }
+                    if (text.length() > 0) {
+                        xhtml.element("p", text.toString());
+                    }
+                }
+            }
+        }
+    }
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (revision 0)
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Comment;
+import org.apache.poi.ss.usermodel.HeaderFooter;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.SAXException;
+
+public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+
+    public XSSFExcelExtractorDecorator(XSSFExcelExtractor extractor) {
+        super(extractor);
+    }
+
+    /**
+     * Emits one div per sheet: name heading, headers, cell table, footers.
+     * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
+
+        for (int i = 0; i < document.getNumberOfSheets(); i++) {
+            xhtml.startElement("div");
+            XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
+            xhtml.element("h1", document.getSheetName(i));
+
+            // Header(s), if present
+            extractHeaderFooter(sheet.getFirstHeader(), xhtml);
+            extractHeaderFooter(sheet.getOddHeader(), xhtml);
+            extractHeaderFooter(sheet.getEvenHeader(), xhtml);
+
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+
+            // Rows and cells
+            for (Object rawR : sheet) {
+                xhtml.startElement("tr");
+                Row row = (Row) rawR;
+                for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
+                    xhtml.startElement("td");
+                    Cell cell = ri.next();
+                    if (cell.getCellType() == Cell.CELL_TYPE_FORMULA
+                            || cell.getCellType() == Cell.CELL_TYPE_STRING) {
+                        xhtml.characters(cell.getRichStringCellValue()
+                                .getString());
+                    } else {
+                        XSSFCell xc = (XSSFCell) cell;
+                        String rawValue = xc.getRawValue();
+                        if (rawValue != null) {
+                            xhtml.characters(rawValue);
+                        }
+                    }
+
+                    // Output the comment in the same cell as the content
+                    Comment comment = cell.getCellComment();
+                    if (comment != null) {
+                        xhtml.characters(comment.getString().getString());
+                    }
+
+                    xhtml.endElement("td");
+                }
+                xhtml.endElement("tr");
+            }
+
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+
+            // Finally footer(s), if present
+            extractHeaderFooter(sheet.getFirstFooter(), xhtml);
+            extractHeaderFooter(sheet.getOddFooter(), xhtml);
+            extractHeaderFooter(sheet.getEvenFooter(), xhtml);
+
+            xhtml.endElement("div");
+        }
+    }
+
+    /** Emits the header or footer text as a paragraph unless it is empty. */
+    private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler xhtml)
+            throws SAXException {
+        String content = ExcelExtractor._extractHeaderFooter(hf);
+        if (content.length() > 0) {
+            xhtml.element("p", content);
+        }
+    }
+}
Index: src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
===================================================================
--- src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (revision 0)
+++ src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (revision 0)
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
+import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
+import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
+import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+import org.xml.sax.SAXException;
+
+public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+    public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
+        super(extractor);
+    }
+
+    /**
+     * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        XWPFDocument document = (XWPFDocument) extractor.getDocument();
+        XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
+
+        // headers: each one that is present becomes a paragraph
+        if (hfPolicy.getFirstPageHeader() != null) {
+            xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
+        }
+        if (hfPolicy.getEvenPageHeader() != null) {
+            xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
+        }
+        if (hfPolicy.getDefaultHeader() != null) {
+            xhtml.element("p", hfPolicy.getDefaultHeader().getText());
+        }
+
+        // first all paragraphs, wrapped in the hyperlink and comments decorators
+        Iterator<XWPFParagraph> i = document.getParagraphsIterator();
+        while (i.hasNext()) {
+            XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
+                    new XWPFHyperlinkDecorator(i.next(), null, true));
+            xhtml.element("p", decorator.getText());
+        }
+
+        // then all document tables
+        extractTableContent(document.getDocument().getBody().getTblArray(),
+                xhtml);
+
+        // footers: each one that is present becomes a paragraph
+        if (hfPolicy.getFirstPageFooter() != null) {
+            xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
+        }
+        if (hfPolicy.getEvenPageFooter() != null) {
+            xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
+        }
+        if (hfPolicy.getDefaultFooter() != null) {
+            xhtml.element("p", hfPolicy.getDefaultFooter().getText());
+        }
+    }
+
+    /**
+     * Low level structured parsing of document tables.
+     */
+    private void extractTableContent(CTTbl[] tables, XHTMLContentHandler xhtml)
+            throws SAXException {
+        for (CTTbl table : tables) {
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+            CTRow[] rows = table.getTrArray();
+            for (CTRow row : rows) {
+                xhtml.startElement("tr");
+                CTTc[] cells = row.getTcArray();
+                for (CTTc tc : cells) {
+                    xhtml.startElement("td");
+                    CTP[] content = tc.getPArray();
+                    for (CTP ctp : content) {
+                        CTR[] inner = ctp.getRArray();
+                        for (CTR ctr : inner) {
+                            CTText[] text = ctr.getTArray();
+                            for (CTText textContent : text) {
+                                xhtml.characters(textContent.getStringValue());
+                            }
+                        }
+                    }
+                    xhtml.endElement("td");
+                }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+        }
+    }
+}
Index: src/main/resources/mime/tika-mimetypes.xml
===================================================================
--- src/main/resources/mime/tika-mimetypes.xml (revision 726192)
+++ src/main/resources/mime/tika-mimetypes.xml (working copy)
@@ -188,6 +188,15 @@
+
+
+
+
+
+
+
+
+
Index: src/main/resources/tika-config.xml
===================================================================
--- src/main/resources/tika-config.xml (revision 726192)
+++ src/main/resources/tika-config.xml (working copy)
@@ -35,6 +35,13 @@
application/vnd.visio
application/vnd.ms-outlook
+
+
+ application/vnd.openxmlformats-package.core-properties+xml
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+ application/vnd.openxmlformats-officedocument.presentationml.presentation
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document
+
text/html
Index: src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (revision 0)
+++ src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (revision 0)
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.opendocument.OpenOfficeParserTest;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.parser.AutoDetectParser;
+
+/**
+ * Tests that the sample OOXML documents (xlsx, pptx, docx) are parsed through
+ * {@link AutoDetectParser}, checking the reported metadata and extracted text.
+ */
+public class OOXMLParserTest extends TestCase {
+
+    /**
+     * Content type asserted for each of the sample documents below.
+     */
+    private static final String EXPECTED_CONTENT_TYPE =
+        "application/vnd.openxmlformats-package.core-properties+xml";
+
+    /**
+     * Parses a test document with an {@link AutoDetectParser}. Fails the test
+     * with a descriptive message if the document is not on the classpath.
+     *
+     * @param resource classpath location of the document
+     * @param metadata receives the metadata extracted by the parser
+     * @return the text collected by a {@link BodyContentHandler}
+     * @throws Exception if the document cannot be parsed
+     */
+    private String parse(String resource, Metadata metadata) throws Exception {
+        // Load through this class rather than an unrelated test class.
+        InputStream input = OOXMLParserTest.class.getResourceAsStream(resource);
+        // Without this check a missing document only shows up as an NPE.
+        assertNotNull("Test document not found: " + resource, input);
+
+        try {
+            Parser parser = new AutoDetectParser();
+            ContentHandler handler = new BodyContentHandler();
+            parser.parse(input, handler, metadata);
+            return handler.toString();
+        } finally {
+            input.close();
+        }
+    }
+
+    /** Checks metadata and cell text of the sample Excel workbook. */
+    public void testExcel() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parse("/test-documents/testEXCEL.xlsx", metadata);
+
+        assertEquals(EXPECTED_CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("Numbers and their Squares"));
+        // Whole numbers must not show up with a trailing ".0".
+        assertTrue(content.contains("9"));
+        assertFalse(content.contains("9.0"));
+        assertTrue(content.contains("196"));
+        assertFalse(content.contains("196.0"));
+    }
+
+    /** Checks metadata and slide text of the sample PowerPoint presentation. */
+    public void testPowerPoint() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parse("/test-documents/testPPT.pptx", metadata);
+
+        assertEquals(EXPECTED_CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("Powerpoint X for Mac"));
+    }
+
+    /** Checks metadata and body text of the sample Word document. */
+    public void testWord() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parse("/test-documents/testWORD.docx", metadata);
+
+        assertEquals(EXPECTED_CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(content.contains("Sample Word Document"));
+    }
+
+}
|