diff --git src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
index 57014b0..3a077e6 100644
--- src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -18,7 +18,8 @@
package org.apache.nutch.indexer.metadata;
import java.util.HashMap;
-import java.util.Map.Entry;
+import java.util.Locale;
+import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -30,17 +31,16 @@ import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
/**
- * Indexer which can be configured to extract metadata from the crawldb, parse metadata or content metadata.
- * You can specify the properties "index.db", "index.parse" or "index.content" who's values are
- * comma-delimited key1, key2, key3.
+ * Indexer which can be configured to extract metadata from the crawldb, parse
+ * metadata or content metadata. You can specify the properties "index.db.md",
+ * "index.parse.md" or "index.content.md" whose values are comma-delimited
+ * key1, key2, key3.
*/
-
public class MetadataIndexer implements IndexingFilter {
private Configuration conf;
- private HashMap staticfields;
- private static String[] dbFieldnames;
- private static String[] parseFieldnames;
- private static String[] contentFieldnames;
+ private String[] dbFieldnames;
+ private Map<String, String> parseFieldnames;
+ private String[] contentFieldnames;
private static final String db_CONF_PROPERTY = "index.db.md";
private static final String parse_CONF_PROPERTY = "index.parse.md";
private static final String content_CONF_PROPERTY = "index.content.md";
@@ -63,14 +63,14 @@ public class MetadataIndexer implements IndexingFilter {
}
// add the fields from parsemd
- if (parseFieldnames != null) {
- for (String metatag : parseFieldnames) {
- for (String value : parse.getData().getParseMeta().getValues(metatag)) {
- if (value != null)
- doc.add(metatag, value);
- }
- }
- }
+ if (parseFieldnames != null) {
+ for (String metatag : parseFieldnames.keySet()) {
+ for (String value : parse.getData().getParseMeta().getValues(metatag)) {
+ if (value != null)
+ doc.add(parseFieldnames.get(metatag), value);
+ }
+ }
+ }
// add the fields from contentmd
if (contentFieldnames != null) {
@@ -88,7 +88,10 @@ public class MetadataIndexer implements IndexingFilter {
public void setConf(Configuration conf) {
this.conf = conf;
dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
- parseFieldnames = conf.getStrings(parse_CONF_PROPERTY);
+ parseFieldnames = new HashMap<String, String>(); // lower-cased name -> configured name
+ for (String metatag : conf.getStringCollection(parse_CONF_PROPERTY)) { // empty, not null, if unset
+ parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
+ }
contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
// TODO check conflict between field names e.g. could have same label
diff --git src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 7dc055d..0c60975 100644
--- src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -18,6 +18,7 @@ package org.apache.nutch.parse.metatags;
import java.util.Enumeration;
import java.util.HashSet;
+import java.util.Locale;
import java.util.Properties;
import java.util.Set;
@@ -35,7 +36,7 @@ import org.w3c.dom.DocumentFragment;
/**
* Parse HTML meta tags (keywords, description) and store them in the parse
* metadata so that they can be indexed with the index-metadata plugin with the
- * prefix 'metatag.'
+ * prefix 'metatag.'. Metatags are matched ignoring case.
*/
public class MetaTagsParser implements HtmlParseFilter {
@@ -50,15 +51,29 @@ public class MetaTagsParser implements HtmlParseFilter {
this.conf = conf;
// specify whether we want a specific subset of metadata
// by default take everything we can find
- String metatags = conf.get("metatags.names", "*");
- String[] values = metatags.split(";");
- for (String val : values)
- metatagset.add(val.toLowerCase());
+ String[] values = conf.getStrings("metatags.names", "*");
+ for (String val : values) {
+ metatagset.add(val.toLowerCase(Locale.ROOT));
+ }
}
public Configuration getConf() {
return this.conf;
}
+
+ /**
+ * Add the value to the parse metadata under the key "metatag." + lower-cased
+ * metatag name, if that name (or '*') is in the configured set of metatags.
+ */
+ private void addIndexedMetatags(Metadata metadata, String metatag, String value) {
+ String lcMetatag = metatag.toLowerCase(Locale.ROOT);
+ if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+ }
+ metadata.add("metatag." + lcMetatag, value);
+ }
+ }
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -68,42 +83,25 @@ public class MetaTagsParser implements HtmlParseFilter {
// check in the metadata first : the tika-parser
// might have stored the values there already
-
for (String mdName : metadata.names()) {
- String value = metadata.get(mdName);
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(mdName.toLowerCase())) {
- LOG.debug("Found meta tag : " + mdName + "\t" + value);
- metadata.add("metatag." + mdName.toLowerCase(), value);
+ for (String value : metadata.getValues(mdName)) {
+ addIndexedMetatags(metadata, mdName, value);
}
}
Metadata generalMetaTags = metaTags.getGeneralTags();
for (String tagName : generalMetaTags.names() ) {
- String[] tagValues = generalMetaTags.getValues(tagName);
-
- for ( String tagValue : tagValues ) {
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(tagName.toLowerCase())) {
- LOG.debug("Found meta tag : " + tagName + "\t" + tagValue);
- metadata.add("metatag." + tagName.toLowerCase(), tagValue);
- }
+ for ( String tagValue : generalMetaTags.getValues(tagName)) {
+ addIndexedMetatags(metadata, tagName, tagValue);
}
}
Properties httpequiv = metaTags.getHttpEquivTags();
- for (Enumeration tagNames = httpequiv.propertyNames(); tagNames
+ for (Enumeration<?> tagNames = httpequiv.propertyNames(); tagNames
.hasMoreElements();) {
String name = (String) tagNames.nextElement();
String value = httpequiv.getProperty(name);
- // check whether the name is in the list of what we want or if
- // specified *
- if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) {
- LOG.debug("Found meta tag : " + name + "\t" + value);
- metadata.add("metatag." + name.toLowerCase(), value);
- }
+ addIndexedMetatags(metadata, name, value);
}
return parseResult;