### Eclipse Workspace Patch 1.0 #P TrunkTest Index: src/plugin/parse-metatags/build.xml =================================================================== --- src/plugin/parse-metatags/build.xml (revision 1451943) +++ src/plugin/parse-metatags/build.xml (working copy) @@ -28,6 +28,10 @@ - + + + + + Index: src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java =================================================================== --- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (revision 1451943) +++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java (working copy) @@ -78,7 +78,7 @@ if (nameNode != null) { if (contentNode != null) { String name = nameNode.getNodeValue().toLowerCase(); - metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue()); + metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); if ("robots".equals(name)) { if (contentNode != null) { Index: src/java/org/apache/nutch/parse/HTMLMetaTags.java =================================================================== --- src/java/org/apache/nutch/parse/HTMLMetaTags.java (revision 1451943) +++ src/java/org/apache/nutch/parse/HTMLMetaTags.java (working copy) @@ -21,6 +21,8 @@ import java.util.Iterator; import java.util.Properties; +import org.apache.nutch.metadata.Metadata; + /** * This class holds the information about HTML "meta" tags extracted from * a page. Some special tags have convenience methods for easy checking. @@ -40,10 +42,10 @@ private URL refreshHref = null; - private Properties generalTags = new Properties(); + private Metadata generalTags = new Metadata(); private Properties httpEquivTags = new Properties(); - + /** * Sets all boolean values to false. Clears all other tags. */ @@ -166,7 +168,7 @@ * Returns all collected values of the general meta tags. Property names are * tag names, property values are "content" values. */ - public Properties getGeneralTags() { + public Metadata getGeneralTags() { return generalTags; } @@ -179,26 +181,27 @@ } public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append("base=" + baseHref - + ", noCache=" + noCache - + ", noFollow=" + noFollow - + ", noIndex=" + noIndex - + ", refresh=" + refresh - + ", refreshHref=" + refreshHref + "\n" - ); - sb.append(" * general tags:\n"); - Iterator it = generalTags.keySet().iterator(); - while (it.hasNext()) { - String key = (String)it.next(); - sb.append(" - " + key + "\t=\t" + generalTags.get(key) + "\n"); - } - sb.append(" * http-equiv tags:\n"); - it = httpEquivTags.keySet().iterator(); - while (it.hasNext()) { - String key = (String)it.next(); - sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n"); - } - return sb.toString(); + StringBuffer sb = new StringBuffer(); + sb.append("base=" + baseHref + + ", noCache=" + noCache + + ", noFollow=" + noFollow + + ", noIndex=" + noIndex + + ", refresh=" + refresh + + ", refreshHref=" + refreshHref + "\n" + ); + sb.append(" * general tags:\n"); + String[] names = generalTags.names(); + for (String name : names) { + String key = name; + sb.append(" - " + key + "\t=\t" + generalTags.get(key) + "\n"); + } + sb.append(" * http-equiv tags:\n"); + Iterator it = httpEquivTags.keySet().iterator(); + while (it.hasNext()) { + String key = (String)it.next(); + sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n"); + } + return sb.toString(); + } } Index: src/plugin/parse-metatags/sample/testMultivalueMetatags.html =================================================================== --- src/plugin/parse-metatags/sample/testMultivalueMetatags.html (revision 0) +++ src/plugin/parse-metatags/sample/testMultivalueMetatags.html (working copy) @@ -0,0 +1,12 @@ + + + + + + + + + + +A test for multi-valued metatags. + \ No newline at end of file Index: src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java =================================================================== --- src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (revision 1451943) +++ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java (working copy) @@ -49,7 +49,7 @@ private static final void getMetaTagsHelper( HTMLMetaTags metaTags, Node node, URL currURL) { - + if (node.getNodeType() == Node.ELEMENT_NODE) { if ("body".equalsIgnoreCase(node.getNodeName())) { @@ -77,8 +77,8 @@ if (nameNode != null) { if (contentNode != null) { - String name = nameNode.getNodeValue().toLowerCase(); - metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue()); + String name = nameNode.getNodeValue().toLowerCase(); + metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); if ("robots".equals(name)) { if (contentNode != null) { Index: src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java =================================================================== --- src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (revision 1451943) +++ src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (working copy) @@ -17,6 +17,9 @@ package org.apache.nutch.parse.html; +import java.util.Set; +import java.util.TreeSet; + import junit.framework.TestCase; import org.apache.hadoop.conf.Configuration; @@ -35,6 +38,7 @@ private String fileSeparator = System.getProperty("file.separator"); private String sampleDir = System.getProperty("test.data", "."); private String sampleFile = "testMetatags.html"; + private String sampleFileMultival = "testMultivalueMetatags.html"; private String description = "This is a test of description"; private String keywords = "This is a test of keywords"; @@ -42,27 +46,58 @@ super(name); } - public void testIt() { - Configuration conf = NutchConfiguration.create(); - - String urlString = "file:" + sampleDir + fileSeparator + sampleFile; - + public Metadata parseMeta(String fileName, Configuration conf) { + Metadata metadata = null; try { + String urlString = "file:" + sampleDir + fileSeparator + fileName; Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent(); - Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); - - // check that we get the same values - Metadata parseMeta = parse.getData().getParseMeta(); - - assertEquals(description, parseMeta.get("metatag.description")); - assertEquals(keywords, parseMeta.get("metatag.keywords")); + metadata = parse.getData().getParseMeta(); } catch (Exception e) { e.printStackTrace(); fail(e.toString()); } + return metadata; + } + + public void testIt() { + Configuration conf = NutchConfiguration.create(); + + // check that we get the same values + Metadata parseMeta= parseMeta(sampleFile, conf); + + assertEquals(description, parseMeta.get("metatag.description")); + assertEquals(keywords, parseMeta.get("metatag.keywords")); + } + + public void testMultiValueMetatags() { + Configuration conf = NutchConfiguration.create(); + conf.set("metatags.names", "keywords;DC.creator"); + conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator"); + + Metadata parseMeta = parseMeta(sampleFileMultival, conf); + + String failMessage = "One value of metatag with multiple values is missing: "; + + Set valueSet = new TreeSet(); + for (String val : parseMeta.getValues("metatag.dc.creator")) { + valueSet.add(val); + } + String[] expectedValues1 = {"Doug Cutting", "Michael Cafarella"}; + for (String val : expectedValues1) { + assertTrue(failMessage + val, valueSet.contains(val)); + } + + valueSet.clear(); + for (String val : parseMeta.getValues("metatag.keywords")) { + valueSet.add(val); + } + String[] expectedValues2 = {"robot d'indexation", "web crawler", "Webcrawler"}; + for (String val : expectedValues2) { + assertTrue(failMessage + val, valueSet.contains(val)); + } } } Index: src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java =================================================================== --- src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java (revision 1451943) +++ src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java (working copy) @@ -75,16 +75,17 @@ } } - Properties generalMetaTags = metaTags.getGeneralTags(); - for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames - .hasMoreElements();) { - String name = (String) tagNames.nextElement(); - String value = generalMetaTags.getProperty(name); + Metadata generalMetaTags = metaTags.getGeneralTags(); + for (String tagName : generalMetaTags.names() ) { + String[] tagValues = generalMetaTags.getValues(tagName); + + for ( String tagValue : tagValues ) { // check whether the name is in the list of what we want or if // specified * - if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) { - LOG.debug("Found meta tag : " + name + "\t" + value); - metadata.add("metatag." + name.toLowerCase(), value); + if (metatagset.contains("*") || metatagset.contains(tagName.toLowerCase())) { + LOG.debug("Found meta tag : " + tagName + "\t" + tagValue); + metadata.add("metatag." + tagName.toLowerCase(), tagValue); + } } }