### Eclipse Workspace Patch 1.0
#P tika-parsers
Index: src/main/java/org/apache/tika/parser/html/HtmlParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/html/HtmlParser.java	(revision 961535)
+++ src/main/java/org/apache/tika/parser/html/HtmlParser.java	(working copy)
@@ -37,6 +37,7 @@
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.utils.CharsetUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -91,21 +92,18 @@
                 for (String attr : attrs) {
                     String[] keyValue = attr.trim().split("=");
                     if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) {
-                    	String charset = keyValue[1];
-                    	try {
-                    		if (Charset.isSupported(charset)) {
-                    			metadata.set(Metadata.CONTENT_ENCODING, charset);
-                    			return charset;
-                    		}
-                    	} catch (IllegalCharsetNameException e){
-                    		// Ignore malformed charset names
+                        // TIKA-459: improve charset handling.
+                    	String charset = CharsetUtils.clean(keyValue[1]);
+                    	if (CharsetUtils.isSupported(charset)) {
+                    	    metadata.set(Metadata.CONTENT_ENCODING, charset);
+                    	    return charset;
                     	}
                     }
                 }
             }
         }
 
-        // No charset in a meta http-equiv tag, see if it's in the passed content-encoding
+        // No (valid) charset in a meta http-equiv tag, see if it's in the passed content-encoding
         // hint, or the passed content-type hint.
         CharsetDetector detector = new CharsetDetector();
         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
#P tika-core
Index: src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
===================================================================
--- src/test/java/org/apache/tika/utils/CharsetUtilsTest.java	(revision 0)
+++ src/test/java/org/apache/tika/utils/CharsetUtilsTest.java	(revision 0)
@@ -0,0 +1,70 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for {@link CharsetUtils}.
+ */
+public class CharsetUtilsTest extends TestCase {
+
+    public void testInvalidCharset() {
+        assertFalse(CharsetUtils.isSupported(" utf-8"));
+        assertFalse(CharsetUtils.isSupported("my charset name"));
+        assertFalse(CharsetUtils.isSupported("charset1; charset2"));
+        assertFalse(CharsetUtils.isSupported(null));
+        assertFalse(CharsetUtils.isSupported(""));
+    }
+
+    public void testValidCharset() {
+        assertTrue(CharsetUtils.isSupported("UTF-8"));
+    }
+
+    public void testUnsupportedCharset() {
+        // Well-formed name that is not expected to resolve to any charset
+        assertFalse(CharsetUtils.isSupported("bogus"));
+    }
+
+    public void testCleaningCharsetName() {
+        assertEquals("UTF-8", CharsetUtils.clean("utf-8"));
+        assertNull(CharsetUtils.clean(""));
+        assertNull(CharsetUtils.clean(null));
+        assertEquals("US-ASCII", CharsetUtils.clean(" us-ascii  "));
+        assertEquals("UTF-8", CharsetUtils.clean("\"utf-8\""));
+        assertEquals("ISO-8859-1", CharsetUtils.clean("ISO-8859-1, latin1"));
+    }
+
+    public void testFunkyNames() {
+        assertNull(CharsetUtils.clean("none"));
+        assertNull(CharsetUtils.clean("no"));
+
+        assertEquals("UTF-8", CharsetUtils.clean("utf-8>"));
+
+        assertEquals("ISO-8859-1", CharsetUtils.clean("iso-8851-1"));
+        assertEquals("ISO-8859-15", CharsetUtils.clean("8859-15"));
+
+        assertEquals("windows-1251", CharsetUtils.clean("cp-1251"));
+        assertEquals("windows-1251", CharsetUtils.clean("win1251"));
+        assertEquals("windows-1251", CharsetUtils.clean("WIN-1251"));
+        assertEquals("windows-1251", CharsetUtils.clean("win-1251"));
+        assertEquals("windows-1252", CharsetUtils.clean("Windows"));
+
+        assertEquals("KOI8-R", CharsetUtils.clean("koi8r"));
+    }
+
+}

Property changes on: src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
___________________________________________________________________
Added: svn:mime-type
   + text/plain
Added: svn:eol-style
   + native

Index: src/main/java/org/apache/tika/utils/CharsetUtils.java
===================================================================
--- src/main/java/org/apache/tika/utils/CharsetUtils.java	(revision 0)
+++ src/main/java/org/apache/tika/utils/CharsetUtils.java	(revision 0)
@@ -0,0 +1,129 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Static helpers for checking and normalizing charset names, including
+ * repair of a few commonly seen malformed names.
+ */
+public class CharsetUtils {
+    // Skips leading spaces and quotes, captures the name up to the first
+    // space, '>', ',', ';' or quote, and ignores whatever follows.
+    private static final Pattern CHARSET_NAME_PATTERN = Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*");
+    private static final Pattern ISO_NAME_PATTERN = Pattern.compile("(?i).*8859-([\\d]+)");
+    private static final Pattern CP_NAME_PATTERN = Pattern.compile("(?i)cp-([\\d]+)");
+    private static final Pattern WIN_NAME_PATTERN = Pattern.compile("(?i)win(|-)([\\d]+)");
+
+    // List of common invalid charset names that we can't fix using
+    // pattern matching + heuristic. Keys are lower case; a null value
+    // marks a name that carries no usable charset.
+    private static final Map<String, String> CHARSET_ALIASES = new HashMap<String, String>();
+
+    static {
+        CHARSET_ALIASES.put("none", null);
+        CHARSET_ALIASES.put("no", null);
+
+        CHARSET_ALIASES.put("iso-8851-1", "iso-8859-1");
+
+        CHARSET_ALIASES.put("windows", "windows-1252");
+
+        CHARSET_ALIASES.put("koi8r", "KOI8-R");
+    }
+
+    /**
+     * Safely return whether {@code charsetName} is supported, without throwing exceptions
+     *
+     * @param charsetName Name of charset (can be null)
+     * @return true if the character set is supported; false if it is not, or if the
+     *         name is null or not legal (see {@link IllegalCharsetNameException})
+     */
+    public static boolean isSupported(String charsetName) {
+        try {
+            return Charset.isSupported(charsetName);
+        } catch (IllegalArgumentException e) {
+            // null name, or IllegalCharsetNameException (a subclass) for a malformed one
+            return false;
+        } catch (Exception e) {
+            // Unexpected exception: deliberately treated as "not supported"
+            return false;
+        }
+    }
+
+    /**
+     * Handle various common charset name errors, and return something
+     * that will be considered valid (and is normalized)
+     *
+     * @param charsetName name of charset to process (can be null)
+     * @return canonical name of the matching charset, or null if the input is
+     *         null or cannot be mapped to a charset known to {@link Charset#forName}
+     */
+    public static String clean(String charsetName) {
+        if (charsetName == null) {
+            return null;
+        }
+
+        // Get rid of cruft around names, like <>, trailing commas, etc.
+        Matcher m = CHARSET_NAME_PATTERN.matcher(charsetName);
+        if (!m.matches()) {
+            return null;
+        }
+
+        String result = m.group(1);
+
+        // Lower-case with a fixed locale: under e.g. a Turkish default locale
+        // "WINDOWS" would otherwise get a dotless i and miss its alias entry.
+        String key = result.toLowerCase(Locale.ENGLISH);
+        if (CHARSET_ALIASES.containsKey(key)) {
+            // Handle common erroneous charset names.
+            result = CHARSET_ALIASES.get(key);
+            if (result == null) {
+                // Alias explicitly marks this name as "no charset"
+                return null;
+            }
+        } else {
+            Matcher iso = ISO_NAME_PATTERN.matcher(result);
+            Matcher cp = CP_NAME_PATTERN.matcher(result);
+            Matcher win = WIN_NAME_PATTERN.matcher(result);
+            if (iso.matches()) {
+                // Handle "iso 8859-x" error
+                result = "iso-8859-" + iso.group(1);
+            } else if (cp.matches()) {
+                // Handle "cp-xxx" error
+                result = "cp" + cp.group(1);
+            } else if (win.matches()) {
+                // Handle "winxxx" and "win-xxx" errors
+                result = "windows-" + win.group(2);
+            }
+        }
+
+        try {
+            return Charset.forName(result).name();
+        } catch (Exception e) {
+            // Malformed or unsupported name: deliberately reported as "no usable charset"
+            return null;
+        }
+    }
+
+}

Property changes on: src/main/java/org/apache/tika/utils/CharsetUtils.java
___________________________________________________________________
Added: svn:mime-type
   + text/plain
Added: svn:eol-style
   + native

