### Eclipse Workspace Patch 1.0
#P TrunkTest
Index: src/plugin/parse-metatags/build.xml
===================================================================
--- src/plugin/parse-metatags/build.xml (revision 1451943)
+++ src/plugin/parse-metatags/build.xml (working copy)
@@ -28,6 +28,10 @@
-
+
+
+
+
+
Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
===================================================================
--- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (revision 1451943)
+++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (working copy)
@@ -78,7 +78,7 @@
if (nameNode != null) {
if (contentNode != null) {
String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {
Index: src/java/org/apache/nutch/parse/HTMLMetaTags.java
===================================================================
--- src/java/org/apache/nutch/parse/HTMLMetaTags.java (revision 1451943)
+++ src/java/org/apache/nutch/parse/HTMLMetaTags.java (working copy)
@@ -21,6 +21,8 @@
import java.util.Iterator;
import java.util.Properties;
+import org.apache.nutch.metadata.Metadata;
+
/**
* This class holds the information about HTML "meta" tags extracted from
* a page. Some special tags have convenience methods for easy checking.
@@ -40,10 +42,10 @@
private URL refreshHref = null;
- private Properties generalTags = new Properties();
+ private Metadata generalTags = new Metadata();
private Properties httpEquivTags = new Properties();
-
+
/**
* Sets all boolean values to <code>false</code>. Clears all other tags.
*/
@@ -166,7 +168,7 @@
* Returns all collected values of the general meta tags. Property names are
* tag names, property values are "content" values.
*/
- public Properties getGeneralTags() {
+ public Metadata getGeneralTags() {
return generalTags;
}
@@ -179,26 +181,27 @@
}
public String toString() {
- StringBuffer sb = new StringBuffer();
- sb.append("base=" + baseHref
- + ", noCache=" + noCache
- + ", noFollow=" + noFollow
- + ", noIndex=" + noIndex
- + ", refresh=" + refresh
- + ", refreshHref=" + refreshHref + "\n"
- );
- sb.append(" * general tags:\n");
- Iterator it = generalTags.keySet().iterator();
- while (it.hasNext()) {
- String key = (String)it.next();
- sb.append(" - " + key + "\t=\t" + generalTags.get(key) + "\n");
- }
- sb.append(" * http-equiv tags:\n");
- it = httpEquivTags.keySet().iterator();
- while (it.hasNext()) {
- String key = (String)it.next();
- sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
- }
- return sb.toString();
+ StringBuffer sb = new StringBuffer();
+ sb.append("base=" + baseHref
+ + ", noCache=" + noCache
+ + ", noFollow=" + noFollow
+ + ", noIndex=" + noIndex
+ + ", refresh=" + refresh
+ + ", refreshHref=" + refreshHref + "\n"
+ );
+ sb.append(" * general tags:\n");
+ // general tags are multi-valued: print one line per value, not just the first
+ for (String name : generalTags.names()) {
+ for (String value : generalTags.getValues(name)) {
+ sb.append(" - " + name + "\t=\t" + value + "\n");
+ }
+ }
+ sb.append(" * http-equiv tags:\n");
+ Iterator it = httpEquivTags.keySet().iterator();
+ while (it.hasNext()) {
+ String key = (String)it.next();
+ sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
+ }
+ return sb.toString();
}
}
Index: src/plugin/parse-metatags/sample/testMultivalueMetatags.html
===================================================================
--- src/plugin/parse-metatags/sample/testMultivalueMetatags.html (revision 0)
+++ src/plugin/parse-metatags/sample/testMultivalueMetatags.html (working copy)
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+A test for multi-valued metatags.
+
\ No newline at end of file
Index: src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
===================================================================
--- src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (revision 1451943)
+++ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (working copy)
@@ -49,7 +49,7 @@
private static final void getMetaTagsHelper(
HTMLMetaTags metaTags, Node node, URL currURL) {
-
+
if (node.getNodeType() == Node.ELEMENT_NODE) {
if ("body".equalsIgnoreCase(node.getNodeName())) {
@@ -77,8 +77,8 @@
if (nameNode != null) {
if (contentNode != null) {
- String name = nameNode.getNodeValue().toLowerCase();
- metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
+ String name = nameNode.getNodeValue().toLowerCase();
+ metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
if ("robots".equals(name)) {
if (contentNode != null) {
Index: src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
===================================================================
--- src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (revision 1451943)
+++ src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (working copy)
@@ -17,6 +17,9 @@
package org.apache.nutch.parse.html;
+import java.util.Set;
+import java.util.TreeSet;
+
import junit.framework.TestCase;
import org.apache.hadoop.conf.Configuration;
@@ -35,6 +38,7 @@
private String fileSeparator = System.getProperty("file.separator");
private String sampleDir = System.getProperty("test.data", ".");
private String sampleFile = "testMetatags.html";
+ private String sampleFileMultival = "testMultivalueMetatags.html";
private String description = "This is a test of description";
private String keywords = "This is a test of keywords";
@@ -42,27 +46,58 @@
super(name);
}
- public void testIt() {
- Configuration conf = NutchConfiguration.create();
-
- String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
-
+ public Metadata parseMeta(String fileName, Configuration conf) {
+ Metadata metadata = null;
try {
+ String urlString = "file:" + sampleDir + fileSeparator + fileName;
Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
Content content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
-
Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
- // check that we get the same values
- Metadata parseMeta = parse.getData().getParseMeta();
-
- assertEquals(description, parseMeta.get("metatag.description"));
- assertEquals(keywords, parseMeta.get("metatag.keywords"));
+ metadata = parse.getData().getParseMeta();
} catch (Exception e) {
e.printStackTrace();
fail(e.toString());
}
+ return metadata;
+ }
+
+ /** Parses the default sample file and checks its description and keywords. */
+ public void testIt() {
+ // the parsed meta tags must match the values declared in this test
+ Metadata parsed = parseMeta(sampleFile, NutchConfiguration.create());
+ String actualDescription = parsed.get("metatag.description");
+ String actualKeywords = parsed.get("metatag.keywords");
+ assertEquals(description, actualDescription);
+ assertEquals(keywords, actualKeywords);
+ }
+
+ /** Checks that all values of repeated meta tags end up in the parse metadata. */
+ public void testMultiValueMetatags() {
+ Configuration conf = NutchConfiguration.create();
+ conf.set("metatags.names", "keywords;DC.creator");
+ conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
+ Metadata parseMeta = parseMeta(sampleFileMultival, conf);
+ String failMessage = "One value of metatag with multiple values is missing: ";
+
+ // typed collection instead of the raw Set/TreeSet, so no unchecked add() calls
+ Set<String> valueSet = new TreeSet<String>();
+ for (String val : parseMeta.getValues("metatag.dc.creator")) {
+ valueSet.add(val);
+ }
+ String[] expectedValues1 = {"Doug Cutting", "Michael Cafarella"};
+ for (String val : expectedValues1) {
+ assertTrue(failMessage + val, valueSet.contains(val));
+ }
+
+ valueSet.clear();
+ for (String val : parseMeta.getValues("metatag.keywords")) {
+ valueSet.add(val);
+ }
+ String[] expectedValues2 = {"robot d'indexation", "web crawler", "Webcrawler"};
+ for (String val : expectedValues2) {
+ assertTrue(failMessage + val, valueSet.contains(val));
+ }
}
}
Index: src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
===================================================================
--- src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java (revision 1451943)
+++ src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java (working copy)
@@ -75,16 +75,17 @@
}
}
- Properties generalMetaTags = metaTags.getGeneralTags();
- for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames
- .hasMoreElements();) {
- String name = (String) tagNames.nextElement();
- String value = generalMetaTags.getProperty(name);
+ Metadata generalMetaTags = metaTags.getGeneralTags();
+ for (String tagName : generalMetaTags.names() ) {
+ String[] tagValues = generalMetaTags.getValues(tagName);
+
+ for ( String tagValue : tagValues ) {
// check whether the name is in the list of what we want or if
// specified *
- if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) {
- LOG.debug("Found meta tag : " + name + "\t" + value);
- metadata.add("metatag." + name.toLowerCase(), value);
+ if (metatagset.contains("*") || metatagset.contains(tagName.toLowerCase())) {
+ LOG.debug("Found meta tag : " + tagName + "\t" + tagValue);
+ metadata.add("metatag." + tagName.toLowerCase(), tagValue);
+ }
}
}