Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 930265)
+++ conf/nutch-default.xml	(working copy)
@@ -1290,6 +1290,15 @@
   </description>
 </property>
 
+<!-- parse-metatags plugin properties -->
+<property>
+  <name>metatags.names</name>
+  <value>description;keywords</value>
+  <description>Names of the metatags to extract, separated by ';'.
+  Use '*' to extract all metatags. Names are matched case-insensitively.
+  </description>
+</property>
+
 <!-- Temporary Hadoop 0.17.x workaround. -->
 
 <property>
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 930265)
+++ src/plugin/build.xml	(working copy)
@@ -52,6 +52,7 @@
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
+     <ant dir="parse-metatags" target="deploy"/>
      <ant dir="parse-msexcel" target="deploy"/>
      <ant dir="parse-mspowerpoint" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
@@ -99,6 +100,7 @@
      <ant dir="protocol-httpclient" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="parse-html" target="test"/>
+     <ant dir="parse-metatags" target="test"/>
      <!-- <ant dir="parse-mp3" target="test"/> -->
      <ant dir="parse-msexcel" target="test"/>
      <ant dir="parse-mspowerpoint" target="test"/>
@@ -156,6 +158,7 @@
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-html" target="clean"/>
     <ant dir="parse-js" target="clean"/>
+    <ant dir="parse-metatags" target="clean"/>
     <ant dir="parse-mp3" target="clean"/>
     <ant dir="parse-msexcel" target="clean"/>
     <ant dir="parse-mspowerpoint" target="clean"/>
Index: src/plugin/parse-metatags/sample/testMetatags.html
===================================================================
--- src/plugin/parse-metatags/sample/testMetatags.html	(revision 0)
+++ src/plugin/parse-metatags/sample/testMetatags.html	(revision 0)
@@ -0,0 +1,9 @@
+<html>
+<head>
+<meta name="Keywords" content="This is a test of keywords" />
+<meta name="Description" content="This is a test of description" />
+</head>
+<body>
+text of the document
+</body>
+</html>
Index: src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
===================================================================
--- src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java	(revision 0)
+++ src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java	(revision 0)
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/** Checks that the metatags parse filter exposes the sample page's meta tags. */
+public class TestMetatagParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testMetatags.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+
+  public TestMetatagParser(String name) {
+    super(name);
+  }
+
+  /**
+   * Fetches the sample file through the file: protocol, parses it and
+   * compares the "metatag.*" parse metadata with the expected values.
+   * Exceptions propagate so JUnit reports them with their full stack trace.
+   */
+  public void testIt() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    assertNotNull("no content fetched for " + urlString, content);
+
+    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+    assertNotNull("no parse produced for " + urlString, parse);
+
+    // check that we get the same values
+    Metadata parseMeta = parse.getData().getParseMeta();
+    assertEquals(description, parseMeta.get("metatag.description"));
+    assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+}
Index: src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
===================================================================
--- src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java	(revision 0)
+++ src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java	(revision 0)
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.protocol.Content;
+import org.w3c.dom.DocumentFragment;
+
+public class MetaTagsParser implements HtmlParseFilter {
+
+    private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
+	    .getName());
+
+    private Configuration conf;
+
+    private Set<String> metatagset = new HashSet<String>();
+
+    /** Loads "metatags.names" (';'-separated, default "*") as trimmed, lower-cased names. */
+    public void setConf(Configuration conf) {
+	this.conf = conf;
+	metatagset.clear(); // drop names left over from a previous setConf call
+	// trim each entry so "description; keywords" matches; skip empty entries
+	for (String val : conf.get("metatags.names", "*").split(";"))
+	    if (val.trim().length() > 0)
+		metatagset.add(val.trim().toLowerCase());
+    }
+    /** Returns the configuration last passed to setConf (null before that). */
+    public Configuration getConf() {
+	return this.conf;
+    }
+
+    /**
+     * Copies every wanted meta tag into the parse metadata under the key
+     * "metatag." + lower-cased tag name. A tag is wanted when its lower-cased
+     * name is listed in "metatags.names" or when that list contains "*".
+     *
+     * @param content the fetched content; its URL selects the parse to enrich
+     * @param parseResult the parse result whose metadata is updated in place
+     * @param metaTags the general and http-equiv meta tags of the page
+     * @param doc the DOM of the page (not used here)
+     * @return the parseResult that was passed in
+     */
+    public ParseResult filter(Content content, ParseResult parseResult,
+	    HTMLMetaTags metaTags, DocumentFragment doc) {
+
+	Parse parse = parseResult.get(content.getUrl());
+	Metadata metadata = parse.getData().getParseMeta();
+
+	// check in the metadata first : the tika-parser
+	// might have stored the values there already
+	for (String mdName : metadata.names()) {
+	    addIfWanted(metadata, mdName, metadata.get(mdName));
+	}
+
+	copyWanted(metaTags.getGeneralTags(), metadata);
+	copyWanted(metaTags.getHttpEquivTags(), metadata);
+
+	return parseResult;
+    }
+
+    /**
+     * Passes every property of tags, defaults included, to addIfWanted.
+     */
+    private void copyWanted(Properties tags, Metadata metadata) {
+	for (Enumeration<?> tagNames = tags.propertyNames(); tagNames
+		.hasMoreElements();) {
+	    String name = (String) tagNames.nextElement();
+	    addIfWanted(metadata, name, tags.getProperty(name));
+	}
+    }
+
+    /** Adds value under "metatag." + lower-cased name if the name is wanted. */
+    private void addIfWanted(Metadata metadata, String name, String value) {
+	String lowerName = name.toLowerCase();
+	// check whether the name is in the list of what we want or if
+	// specified *
+	if (metatagset.contains("*") || metatagset.contains(lowerName)) {
+	    LOG.debug("Found meta tag : " + name + "\t" + value);
+	    metadata.add("metatag." + lowerName, value);
+	}
+    }
+
+}
Index: src/plugin/parse-metatags/src/java/org/apache/nutch/searcher/MetaTagsQueryFilter.java
===================================================================
--- src/plugin/parse-metatags/src/java/org/apache/nutch/searcher/MetaTagsQueryFilter.java	(revision 0)
+++ src/plugin/parse-metatags/src/java/org/apache/nutch/searcher/MetaTagsQueryFilter.java	(revision 0)
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.searcher;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginDescriptor;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.searcher.Query.Clause;
+import org.apache.nutch.searcher.Query.Phrase;
+import org.apache.nutch.searcher.Query.Term;
+
+public class MetaTagsQueryFilter extends Configured implements QueryFilter {
+  private static final Log LOG = LogFactory.getLog(MetaTagsQueryFilter.class);
+  
+  private Set<String> fields = new HashSet<String>();
+
+  /** Collects the queryable field names from the "fields" and "raw-fields"
+   *  attributes of the parse-metatags plugin extensions. */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) {
+      return;
+    }
+    // retrieve the plugin info and extract the list of fields
+    PluginDescriptor pd = PluginRepository.get(conf).getPluginDescriptor("parse-metatags");
+    for (Extension e : pd.getExtensions()) {
+      for (String attr : new String[] { "fields", "raw-fields" }) {
+        String flds = e.getAttribute(attr);
+        if (flds != null) {
+          // "+" collapses delimiter runs, so "a, b" yields no empty name
+          fields.addAll(Arrays.asList(flds.split("[\\s,]+")));
+        }
+      }
+    }
+    fields.remove(""); // a leading delimiter still produces one empty token
+    LOG.info("Query meta fields: " + fields.toString());
+  }
+
+  /** Adds a Lucene clause for each input clause whose field is a known meta field. */
+  @Override
+  public BooleanQuery filter(Query input, BooleanQuery translation) throws QueryException {
+    for (Clause clause : input.getClauses()) {
+      final String field = clause.getField();
+      if (fields.contains(field)) {
+        final org.apache.lucene.search.Query luceneQuery;
+        if (!clause.isPhrase()) {
+          luceneQuery = new TermQuery(new org.apache.lucene.index.Term(field, clause.getTerm().toString()));
+        } else {
+          PhraseQuery phrase = new PhraseQuery();
+          for (Term term : clause.getPhrase().getTerms()) {
+            phrase.add(new org.apache.lucene.index.Term(field, term.toString()));
+          }
+          luceneQuery = phrase;
+        }
+        Occur occur = Occur.SHOULD;
+        if (clause.isRequired()) occur = Occur.MUST;
+        else if (clause.isProhibited()) occur = Occur.MUST_NOT;
+        translation.add(luceneQuery, occur);
+      }
+    }
+    return translation;
+  }
+
+}
Index: src/plugin/parse-metatags/src/java/org/apache/nutch/indexer/MetaTagsIndexer.java
===================================================================
--- src/plugin/parse-metatags/src/java/org/apache/nutch/indexer/MetaTagsIndexer.java	(revision 0)
+++ src/plugin/parse-metatags/src/java/org/apache/nutch/indexer/MetaTagsIndexer.java	(revision 0)
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.lucene.LuceneWriter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Indexes the description and keywords fields provided by the MetaTagsParser.
+ * Note that we limit ourselves to these 2 types of metadata as we must specify
+ * the exact field names in addIndexBackendOptions.
+ **/
+public class MetaTagsIndexer implements IndexingFilter {
+  
+  public static final Log LOG = LogFactory.getLog(MetaTagsIndexer.class
+      .getName());
+  
+  private Configuration conf;
+  
+  /**
+   * Copies the "metatag.description" and "metatag.keywords" parse metadata
+   * into the "description" and (one value per keyword) "keywords" fields.
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    Metadata metadata = parse.getData().getParseMeta();
+    String description = metadata.get("metatag.description");
+    String keywords = metadata.get("metatag.keywords");
+    if (description != null) {
+      doc.add("description", description);
+      LOG.debug(url.toString() + " : added " + description
+          + " to the description Field");
+    }
+
+    if (keywords != null) {
+      // split the keywords and send them as separate fields
+      // in SOLR this will allow us to specify a gap in order to prevent
+      // cross keywords matching
+      for (String kw : keywords.split(",")) {
+        // skip blanks so "a,,b" or a trailing comma adds no empty value
+        if (kw.trim().length() > 0) doc.add("keywords", kw.trim());
+      }
+      // log the raw string: concatenating the String[] would print its identity
+      LOG.debug(url.toString() + " : added " + keywords + " to the keywords Field");
+    }
+    return doc;
+  }
+  /** Stores the configuration; no settings are read from it here. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+  /** Returns the configuration last passed to setConf (null before that). */
+  public Configuration getConf() {
+    return this.conf;
+  }
+  /** Registers the Lucene options (STORE / INDEX flags) of the two fields added by filter. */
+  public void addIndexBackendOptions(Configuration conf) {
+    LuceneWriter.addFieldOptions("description", LuceneWriter.STORE.YES,
+        LuceneWriter.INDEX.TOKENIZED, conf);
+    LuceneWriter.addFieldOptions("keywords", LuceneWriter.STORE.NO,
+        LuceneWriter.INDEX.TOKENIZED, conf);
+  }
+  
+}
Index: src/plugin/parse-metatags/README.txt
===================================================================
--- src/plugin/parse-metatags/README.txt	(revision 0)
+++ src/plugin/parse-metatags/README.txt	(revision 0)
@@ -0,0 +1,18 @@
+Parse-metatags plugin
+
+The parse-metatags plugin provides an HtmlParseFilter which takes as parameter a list of metatag names, with '*' (all metatags) as the default value. The names are separated by ';'.
+In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml:
+
+<property>
+  <name>metatags.names</name>
+  <value>description;keywords</value>
+</property>
+
+The MetaTagsIndexer uses the output of the parsing above to create two fields, 'keywords' and 'description'. Note that keywords is multivalued.
+The MetaTagsQueryFilter allows querying on the fields above using the Nutch Query API.
+
+This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com
+
+
+
+
Index: src/plugin/parse-metatags/plugin.xml
===================================================================
--- src/plugin/parse-metatags/plugin.xml	(revision 0)
+++ src/plugin/parse-metatags/plugin.xml	(revision 0)
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-metatags"
+   name="MetaTags"
+   version="0.0.1"
+   provider-name="digitalpebble.com">
+   <!-- Plugin jar; the "*" export makes all of its contents visible. -->
+   <runtime>
+      <library name="parse-metatags.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   <!-- HtmlParseFilter: MetaTagsParser stores the selected meta tags as metatag.* parse metadata. -->
+   <extension id="org.apache.nutch.parse.metatags.parser"
+              name="MetaTags Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="MetaTagsParser"
+                      class="org.apache.nutch.parse.MetaTagsParser"/>
+   </extension>
+   <!-- IndexingFilter: MetaTagsIndexer adds the description and keywords index fields. -->
+      <extension id="org.apache.nutch.parse.metatags.indexer"
+              name="MetaTags Indexer"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="MetaTagsIndexer"
+                      class="org.apache.nutch.indexer.MetaTagsIndexer"/>
+   </extension>
+   <!-- QueryFilter: MetaTagsQueryFilter translates query clauses on the fields listed below. -->
+   <extension id="org.apache.nutch.parse.metatags.queryfilter"
+              name="MetaTags QueryFilter"
+              point="org.apache.nutch.searcher.QueryFilter">
+      <implementation id="MetaTagsQueryFilter"
+                      class="org.apache.nutch.searcher.MetaTagsQueryFilter">
+        <parameter name="fields" value="keywords,description"/>
+      </implementation>
+   </extension>  
+
+</plugin>
+
Index: src/plugin/parse-metatags/build.xml
===================================================================
--- src/plugin/parse-metatags/build.xml	(revision 0)
+++ src/plugin/parse-metatags/build.xml	(revision 0)
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-metatags" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Deploy Unit test dependencies -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+	</target>
+
+
+	<!-- for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy file="sample/testMetatags.html" todir="${build.test}/data" />
+
+</project>

Property changes on: src/plugin/parse-metatags/build.xml
___________________________________________________________________
Added: svn:executable
   + *

