Index: src/test/java/org/apache/tika/TestParsers.java
===================================================================
--- src/test/java/org/apache/tika/TestParsers.java (revision 682234)
+++ src/test/java/org/apache/tika/TestParsers.java (working copy)
@@ -44,15 +44,15 @@
* FIXME the old mechanism does not work anymore when running the tests
* with Maven - need a resource-based one, but this means more changes
* to classes which rely on filenames.
- *
+ *
* String sep = File.separator; StringTokenizer st = new
* StringTokenizer(System.getProperty( "java.class.path"),
* File.pathSeparator);
- *
+ *
* classDir = new File(st.nextToken());
- *
+ *
* config = classDir.getParent() + sep + "config" + sep + "config.xml";
- *
+ *
* String log4j = classDir.getParent() + sep + "Config" + sep + "log4j" +
* sep + "log4j.properties";
*/
@@ -171,6 +171,16 @@
assertNotNull(parser);
}
+ public void testZipFileExtraction() throws Exception {
+ // Extraction with and without an explicit zip MIME type must agree.
+ File archive = getTestFile("test-documents.zip");
+ String withoutType = ParseUtils.getStringContent(archive, tc);
+ String withType = ParseUtils.getStringContent(archive, tc, "application/zip");
+ assertEquals(withoutType, withType);
+ // The configuration must also hand out a parser for the zip MIME type.
+ assertNotNull(tc.getParser("application/zip"));
+ }
+
public void testZipExtraction() throws Exception {
File zip = getTestFile("test-documents.zip");
List parsers = ParseUtils.getParsersFromZip(zip, tc);
Index: src/main/java/org/apache/tika/parser/zip/ZipParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/zip/ZipParser.java (revision 0)
+++ src/main/java/org/apache/tika/parser/zip/ZipParser.java (revision 0)
@@ -0,0 +1,119 @@
+package org.apache.tika.parser.zip;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ParseUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.FileOutputStream;
+import java.io.FileInputStream;
+
+/**
+ * Zip File Parser.
+ *
+ * Each non-directory entry is copied to a temporary file, parsed with the
+ * parser that {@link ParseUtils#getParser} selects for that file, and its
+ * text is emitted inside a <code>div class="file"</code> element.
+ */
+public class ZipParser extends AbstractParser {
+
+    /** Prefix of the temporary files that hold extracted entries. */
+    private static final String TEMP_PREFIX = "TIKA_unzip_";
+
+    /**
+     * Writes one XHTML <code>div</code> per file entry of the zip archive.
+     *
+     * @param stream zip archive; closed once its entries were read or reading failed
+     * @param handler receives the generated XHTML events
+     * @param metadata passed to the XHTML content handler
+     * @throws IOException if the archive or a temporary file cannot be read or written
+     * @throws TikaException if an entry cannot be parsed
+     * @throws SAXException if the handler rejects an event
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, TikaException, SAXException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        ZipInputStream zis = new ZipInputStream(stream);
+        try {
+            ZipEntry ze;
+            while ((ze = zis.getNextEntry()) != null) {
+                // Directory entries carry no content to parse.
+                if (!ze.isDirectory()) {
+                    parseEntry(zis, ze, xhtml);
+                }
+                zis.closeEntry();
+            }
+        } finally {
+            zis.close();
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * Extracts the current entry to a temporary file, parses it and writes
+     * the result to the XHTML output. The temporary file is always deleted.
+     */
+    private void parseEntry(ZipInputStream zis, ZipEntry ze, XHTMLContentHandler xhtml)
+            throws IOException, TikaException, SAXException {
+        xhtml.startElement("div", "class", "file");
+        xhtml.element("h1", ze.getName());
+
+        File temp = File.createTempFile(TEMP_PREFIX, tempSuffix(ze.getName()));
+        try {
+            copyToFile(zis, temp);
+
+            Parser parser = ParseUtils.getParser(temp, TikaConfig.getDefaultConfig());
+            ContentHandler content = new BodyContentHandler();
+            InputStream in = new FileInputStream(temp);
+            try {
+                parser.parse(in, content, new Metadata());
+            } finally {
+                in.close();
+            }
+            xhtml.element("content", content.toString());
+        } finally {
+            temp.delete();
+        }
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Returns the part of the entry name after the last path separator.
+     * Entry names such as "dir/file.txt" are not valid temporary file
+     * suffixes, so only "file.txt" is kept.
+     */
+    private static String tempSuffix(String entryName) {
+        int separator = Math.max(entryName.lastIndexOf('/'), entryName.lastIndexOf('\\'));
+        return entryName.substring(separator + 1);
+    }
+
+    /** Copies the remaining bytes of the stream to the file and closes the file. */
+    private static void copyToFile(InputStream in, File target) throws IOException {
+        OutputStream out = new FileOutputStream(target);
+        try {
+            byte[] buf = new byte[1024];
+            int len;
+            while ((len = in.read(buf)) > 0) {
+                out.write(buf, 0, len);
+            }
+        } finally {
+            out.close();
+        }
+    }
+}
Index: src/main/resources/tika-config.xml
===================================================================
--- src/main/resources/tika-config.xml (revision 682234)
+++ src/main/resources/tika-config.xml (working copy)
@@ -57,7 +57,7 @@
text/plain
-
+
application/vnd.sun.xml.writer
application/vnd.oasis.opendocument.text
application/vnd.oasis.opendocument.graphics
@@ -105,6 +105,10 @@
image/x-xcf
+
+ application/zip
+
+
\ No newline at end of file