From 7e4bb724b9a7ac8f3e79c456dc298b2da1f6fce5 Mon Sep 17 00:00:00 2001
From: Jukka Zitting <jukka@adobe.com>
Date: Mon, 5 Aug 2013 15:40:05 +0300
Subject: [PATCH] TIKA-1149: Improve parser lookup performance

Add SimpleParser and CompositeParser.canParse() to help speed up parser lookups.
---
 .../org/apache/tika/parser/CompositeParser.java    | 90 +++++++++++++++++++---
 .../java/org/apache/tika/parser/DefaultParser.java | 15 ++++
 .../java/org/apache/tika/parser/SimpleParser.java  | 66 ++++++++++++++++
 3 files changed, 159 insertions(+), 12 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/parser/SimpleParser.java

diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 05d1b72..bb6eeca 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,6 +23,7 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
+import java.util.ListIterator;
 import java.util.Map;
 import java.util.Set;
 
@@ -75,6 +76,56 @@ public class CompositeParser extends AbstractParser {
         this(new MediaTypeRegistry());
     }
 
+    /**
+     * Tells whether a particular parser is able to handle documents of
+     * the given media type within the given parse context.
+     *
+     * @param parser parser instance to query
+     * @param type normalized media type of the document
+     * @param context parse context
+     * @return {@code true} if the parser supports the given type
+     */
+    protected boolean canParse(
+            Parser parser, MediaType type, ParseContext context) {
+        if (parser instanceof CompositeParser) {
+            // a nested composite decides for itself
+            CompositeParser composite = (CompositeParser) parser;
+            return composite.canParse(type, context);
+        }
+        Set<MediaType> candidates = parser.getSupportedTypes(context);
+        if (candidates.contains(type)) {
+            return true;
+        }
+        if (parser instanceof SimpleParser) {
+            // a SimpleParser declares normalized types, so a miss is final
+            return false;
+        }
+        // for backwards compatibility: declared types may need normalization
+        for (MediaType candidate : candidates) {
+            if (type.equals(registry.normalize(candidate))) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Tells whether any of the parsers held by <em>this</em> composite can
+     * handle documents of the given media type in the given context.
+     *
+     * @param type normalized media type of the document
+     * @param context parse context
+     * @return {@code true} as soon as one component parser accepts the type
+     */
+    protected boolean canParse(MediaType type, ParseContext context) {
+        boolean supported = false;
+        ListIterator<Parser> candidates = parsers.listIterator();
+        while (!supported && candidates.hasNext()) {
+            supported = canParse(candidates.next(), type, context);
+        }
+        return supported;
+    }
+
     public Map<MediaType, Parser> getParsers(ParseContext context) {
         Map<MediaType, Parser> map = new HashMap<MediaType, Parser>();
         for (Parser parser : parsers) {
@@ -196,25 +247,40 @@ public class CompositeParser extends AbstractParser {
         return getParser(metadata, new ParseContext());
     }
 
-    protected Parser getParser(Metadata metadata, ParseContext context) {
-        Map<MediaType, Parser> map = getParsers(context);
+    /** Picks the parser for the earliest matching type, else the fallback. */
+    protected Parser getParser(List<MediaType> types, ParseContext context) {
+        Parser chosen = getFallback();
+        int limit = types.size(); // a match must beat this index
+        ListIterator<Parser> reverse = parsers.listIterator(parsers.size());
+        while (limit > 0 && reverse.hasPrevious()) {
+            Parser parser = reverse.previous();
+            int i = 0;
+            while (i < limit && !canParse(parser, types.get(i), context)) {
+                i++;
+            }
+            if (i < limit) { // strictly better match than any seen so far
+                chosen = parser;
+                limit = i;
+            }
+        }
+        return chosen;
+    }
+
+    public Parser getParser(Metadata metadata, ParseContext context) {
         MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
         if (type != null) {
-           // We always work on the normalised, canonical form
+           // We always work on the normalized, canonical form
            type = registry.normalize(type);
         }
-        
+
+        // Collect the normalized type and all its ancestors
+        List<MediaType> types = new ArrayList<MediaType>();
         while (type != null) {
-            // Try finding a parser for the type
-            Parser parser = map.get(type);
-            if (parser != null) {
-                return parser;
-            }
-            
-            // Failing that, try for the parent of the type
+            types.add(type);
             type = registry.getSupertype(type);
         }
-        return fallback;
+
+        return getParser(types, context);
     }
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
index 09d844c..9aec225 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
@@ -110,4 +110,19 @@ public class DefaultParser extends CompositeParser {
         return map;
     }
 
+    /** Checks dynamically loaded service providers, then the superclass. */
+    @Override
+    protected boolean canParse(MediaType type, ParseContext context) {
+        if (loader == null) {
+            return super.canParse(type, context);
+        }
+        for (Parser dynamic
+                : loader.loadDynamicServiceProviders(Parser.class)) {
+            if (canParse(dynamic, type, context)) {
+                return true;
+            }
+        }
+        return super.canParse(type, context);
+    }
+
 }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/SimpleParser.java b/tika-core/src/main/java/org/apache/tika/parser/SimpleParser.java
new file mode 100644
index 0000000..a1252d7
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/SimpleParser.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.singleton;
+import static java.util.Collections.unmodifiableSet;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Base class for simple parser implementations that support a fixed set
+ * of <em>normalized</em> media types regardless of the parsing contexts.
+ *
+ * @since Apache Tika 1.5
+ */
+public abstract class SimpleParser extends AbstractParser {
+    /** Serial version UID */
+    private static final long serialVersionUID = 1823643199982533681L;
+
+    /** Unmodifiable private copy of the supported media types */
+    private final Set<MediaType> supportedTypes;
+
+    /** Copies the given types so later changes by the caller are not seen. */
+    protected SimpleParser(Set<MediaType> supportedTypes) {
+        this.supportedTypes =
+                unmodifiableSet(new HashSet<MediaType>(supportedTypes));
+    }
+
+    protected SimpleParser(MediaType supportedType) {
+        this(singleton(supportedType));
+    }
+
+    protected SimpleParser(MediaType... supportedTypes) {
+        this(new HashSet<MediaType>(asList(supportedTypes)));
+    }
+
+    //------------------------------------------------------------< Parser >--
+
+    /**
+     * Returns the set of media types supported by this parser.
+     *
+     * @param context parse context, ignored
+     * @return immutable set of media types
+     */
+    public final Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+}
-- 
1.8.1.msysgit.1

