diff --git src/java/org/apache/nutch/crawl/DbUpdateMapper.java src/java/org/apache/nutch/crawl/DbUpdateMapper.java index 5bef21d..8b25eb5 100644 --- src/java/org/apache/nutch/crawl/DbUpdateMapper.java +++ src/java/org/apache/nutch/crawl/DbUpdateMapper.java @@ -17,6 +17,7 @@ package org.apache.nutch.crawl; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -27,7 +28,6 @@ import org.apache.nutch.metadata.Nutch; import org.apache.nutch.storage.Mark; import org.apache.nutch.util.NutchJob; import org.slf4j.Logger; -import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.util.StringUtils; import org.apache.nutch.scoring.ScoreDatum; import org.apache.nutch.scoring.ScoringFilterException; @@ -51,6 +51,9 @@ extends GoraMapper { private UrlWithScore urlWithScore = new UrlWithScore(); private NutchWritable nutchWritable = new NutchWritable(); private WebPageWritable pageWritable; + + private WebPage webpageCanonical; + private static final Utf8 YES_STRING = new Utf8("y"); @Override public void map(String key, WebPage page, Context context) @@ -100,6 +103,17 @@ extends GoraMapper { nutchWritable.set(scoreDatum); context.write(urlWithScore, nutchWritable); } + webpageCanonical = new WebPage(); + ByteBuffer getCanonical = page.getFromMetadata(new Utf8("canonical")); + if (getCanonical != null) { + urlWithScore + .setUrl(TableUtil.reverseUrl(new String(getCanonical.array()))); + urlWithScore.setScore(Float.MAX_VALUE); + pageWritable.setWebPage(webpageCanonical); + nutchWritable.set(pageWritable); + Mark.INJECT_MARK.putMark(webpageCanonical, YES_STRING); + context.write(urlWithScore, nutchWritable); + } } @Override diff --git src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java index 56ae989..377245c 100644 --- 
src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java +++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java @@ -26,7 +26,8 @@ import java.util.HashMap; import org.apache.nutch.parse.Outlink; import org.apache.nutch.util.NodeWalker; import org.apache.hadoop.conf.Configuration; - +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.*; /** @@ -37,6 +38,8 @@ import org.w3c.dom.*; * */ public class DOMContentUtils { + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.html"); public static class LinkParams { public String elName; @@ -411,6 +414,45 @@ public class DOMContentUtils { } } } + // Better to pass URLs through the regexp URL normalizer before comparing? + private String normalizeUrl(String url) { + return url.replaceFirst("/$", ""); + } + + /** + * Checks whether the page supplied as a DOM node has a canonical link element; + * if so, compares it to the page URL, and if they differ, the page + * is not canonical. 
+ */ + public URL getDifferentCanonical(Node node, URL url) { + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + if ("link".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + Node rel = attrs.getNamedItem("rel"); + if (rel != null && "canonical".equalsIgnoreCase(rel.getNodeValue())) { + String href = attrs.getNamedItem("href").getNodeValue(); + try { + URL canonical = new URL(href); + if (!checkUrlEquality(canonical, url)) { + return canonical; + } + } catch (MalformedURLException e) { + LOG.warn("There is an error.When check canonical and base url equaltiy."); + } + } + } + } + return null; + } + public boolean checkUrlEquality(URL canonical, URL url) { + return normalizeUrl(canonical.getFile()).equalsIgnoreCase( + normalizeUrl(url.getFile())) + && canonical.getHost().equalsIgnoreCase(url.getHost()); + } } diff --git src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index 0a7226f..e3b5ee5 100644 --- src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -67,6 +67,7 @@ public class HtmlParser implements Parser { // meta tag well past the first 1000 bytes. // (e.g. http://cn.promo.yahoo.com/customcare/music.html) private static final int CHUNK_SIZE = 2000; + private static final String CANONICAL_VALUE = "canonical"; // NUTCH-1006 Meta equiv with single quotes not accepted private static Pattern metaPattern = @@ -184,7 +185,12 @@ public class HtmlParser implements Parser { LOG.error("Failed with the following Exception: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } - + // get the canonical (rel="canonical") link and check it against the base URL. 
+ URL canonical = utils.getDifferentCanonical(root, base); + if (canonical != null) { + LOG.warn("Non canonical page. Ignore: " + base); + return signCanonical(canonical, page); + } // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (LOG.isTraceEnabled()) { @@ -231,7 +237,13 @@ public class HtmlParser implements Parser { return parse; } - + // write canonical url to metadata and return Empty Parse. + public Parse signCanonical(URL base, WebPage wp) { + wp.putToMetadata(new Utf8(CANONICAL_VALUE), + ByteBuffer.wrap(base.toString().getBytes())); + return ParseStatusUtils.getEmptyParse(999, "Non canonical page. Ignore", + getConf()); + } private DocumentFragment parse(InputSource input) throws Exception { if (parserImpl.equalsIgnoreCase("tagsoup")) return parseTagSoup(input); diff --git src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java index 4edfa37..1f53ab8 100644 --- src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java +++ src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java @@ -18,28 +18,38 @@ package org.apache.nutch.parse.html; import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseStatusCodes; +import org.apache.avro.util.Utf8; import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.util.Bytes; import org.apache.nutch.util.NutchConfiguration; import java.io.ByteArrayInputStream; import java.net.MalformedURLException; import java.net.URL; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.StringTokenizer; import org.cyberneko.html.parsers.*; import org.xml.sax.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.*; import org.apache.html.dom.*; - import org.junit.Before; import org.junit.Test; + 
import static org.junit.Assert.*; /** * Unit tests for DOMContentUtils. */ public class TestDOMContentUtils { + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.html"); private static final String[] testPages= { new String(" title " @@ -136,11 +146,13 @@ public class TestDOMContentUtils { + "

test1

" + "
" + "

test2

"), + // Also test canonical new String(" title " + "" + "anchor1" + "anchor2" + "anchor3" + + "canonical" + ""), new String(" title " + "" @@ -165,6 +177,7 @@ public class TestDOMContentUtils { "http://www.nutch.org//", "http://www.nutch.org/", "http://www.nutch.org/", + // Also baseurl to test canonical "http://www.nutch.org/", "http://www.nutch.org/;something" }; @@ -192,7 +205,7 @@ public class TestDOMContentUtils { "ignore ignore", "test1 test2", "test1 test2", - "title anchor1 anchor2 anchor3", + "title anchor1 anchor2 anchor3 canonical", "title anchor1 anchor2 anchor3 anchor4 anchor5" }; @@ -215,6 +228,8 @@ public class TestDOMContentUtils { private static Outlink[][] answerOutlinks; private static Configuration conf; + + private String canonicalUrl = "http://www.canonical.com/"; private static DOMContentUtils utils = null; @Before @@ -286,7 +301,9 @@ public class TestDOMContentUtils { { new Outlink("http://www.nutch.org/;x", "anchor1"), new Outlink("http://www.nutch.org/g;x", "anchor2"), - new Outlink("http://www.nutch.org/g;x?y#s", "anchor3") + new Outlink("http://www.nutch.org/g;x?y#s", "anchor3"), + // canonical + new Outlink("http://www.canonical.com/", ""), }, { new Outlink("http://www.nutch.org/g;something", "anchor1"), @@ -368,6 +385,81 @@ public class TestDOMContentUtils { compareOutlinks(answerOutlinks[i], outlinkArr); } } + + /** + * + * TestCase to test Canonical situations. + */ + @Test + public void testCanonical() { + if (testDOMs[0] == null) + setup(); + for (int i = 0; i < testPages.length; i++) { + if (i == 10) { + try { + compareCanonicalLink(testDOMs[i], new URL(testBaseHrefs[i])); + compareCanonicalParse(testBaseHrefs[i], testPages[i]); + } catch (MalformedURLException e) { + LOG.warn("There is an error.When test canonical link or Parsed canonical page."); + } + } else + continue; + } + } + + /** + * + * If url of page which has canonical tag not equals href attribute of + * value.And canonical enables via confugartion. 
Should return empty Parse. + * Method compares conclusions of Parse. + * + * @param baseUrl + * @param content + */ + public void compareCanonicalParse(String baseUrl, String content) { + + WebPage wp = prepareWebpage(baseUrl, content); + HtmlParser hp = new HtmlParser(); + hp.setConf(conf); + Parse parse = hp.getParse(baseUrl, wp); + ByteBuffer urlFromMetadata = wp.getFromMetadata(new Utf8("canonical")); + assertEquals(parse.getText(), ""); + assertEquals(parse.getTitle(), ""); + assertEquals(parse.getParseStatus().getMajorCode(), ParseStatusCodes.FAILED); + assertEquals(parse.getParseStatus().getMinorCode(), 999); + assertNotNull(wp.getFromMetadata(new Utf8("canonical"))); + assertEquals(Bytes.toString(urlFromMetadata), canonicalUrl); + } + + /** + * Prepare Webpage to pass HtmlParser + * + * @param baseUrl + * @param content + * @return + */ + public WebPage prepareWebpage(String baseUrl, String content) { + WebPage wp = new WebPage(); + wp.setContent(ByteBuffer.wrap(Bytes.toBytes(content))); + wp.setBaseUrl(new Utf8(baseUrl)); + wp.setText(new Utf8("")); + wp.setTitle(new Utf8("")); + wp.setStatus(ParseStatusCodes.FAILED); + return wp; + } + + /** + * Test Baseurl with canonical url equality. 
+ * + * @param node + * @param url + * @throws MalformedURLException + */ + private void compareCanonicalLink(DocumentFragment node, URL url) + throws MalformedURLException { + assertEquals(utils.getDifferentCanonical(node, url), new URL(canonicalUrl)); + } + private static final void appendOutlinks(StringBuffer sb, Outlink[] o) { for (int i= 0; i < o.length; i++) { diff --git src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java index 06dadff..b3fa0d7 100644 --- src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java +++ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java @@ -26,6 +26,8 @@ import java.util.HashMap; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.Outlink; import org.apache.nutch.util.NodeWalker; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @@ -38,6 +40,8 @@ import org.w3c.dom.NodeList; * */ public class DOMContentUtils { + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.tika"); private static class LinkParams { private String elName; @@ -412,6 +416,46 @@ public class DOMContentUtils { } } } + // Better to pass URLs through the regexp URL normalizer before comparing? + private String normalizeUrl(String url) { + return url.replaceFirst("/$", ""); + } + + /** + * Checks whether the page supplied as a DOM node has a canonical link element; + * if so, compares it to the page URL, and if they differ, the page + * is not canonical. 
+ */ + + public URL getDifferentCanonical(Node node, URL url) { + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { + + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + if ("link".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + Node rel = attrs.getNamedItem("rel"); + if (rel != null && "canonical".equalsIgnoreCase(rel.getNodeValue())) { + String href = attrs.getNamedItem("href").getNodeValue(); + try { + URL canonical = new URL(href); + if (!checkUrlEquality(canonical, url)) { + return canonical; + } + } catch (MalformedURLException e) { + LOG.warn("There is an error.When check canonical and base url equaltiy."); + } + } + } + } + return null; + } + public boolean checkUrlEquality(URL canonical, URL url) { + return normalizeUrl(canonical.getFile()).equalsIgnoreCase( + normalizeUrl(url.getFile())) + && canonical.getHost().equalsIgnoreCase(url.getHost()); + } } diff --git src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 52e0841..303b376 100644 --- src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -70,7 +70,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { FIELDS.add(WebPage.Field.BASE_URL); FIELDS.add(WebPage.Field.CONTENT_TYPE); } - + private static final String CANONICAL_VALUE = "canonical"; private Configuration conf; private TikaConfig tikaConfig = null; private DOMContentUtils utils; @@ -162,6 +162,12 @@ public class TikaParser implements org.apache.nutch.parse.Parser { LOG.trace("found " + outlinks.length + " outlinks in " + base); } } + // get the canonical (rel="canonical") link and check it against the base URL. + URL canonical = utils.getDifferentCanonical(root, base); + if (canonical != null) { + LOG.warn("Non canonical page. 
Ignore: " + base); + return signCanonical(canonical, page); + } // populate Nutch metadata with Tika metadata String[] TikaMDNames = tikamd.names(); @@ -198,6 +204,13 @@ public class TikaParser implements org.apache.nutch.parse.Parser { return parse; } + // write canonical url to metadata and return Empty Parse. + public Parse signCanonical(URL base, WebPage wp) { + wp.putToMetadata(new Utf8(CANONICAL_VALUE), + ByteBuffer.wrap(base.toString().getBytes())); + return ParseStatusUtils.getEmptyParse(999, "Non canonical page. Ignore", + getConf()); + } public void setConf(Configuration conf) { this.conf = conf; this.tikaConfig = null; diff --git src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java deleted file mode 100644 index 5d80125..0000000 --- src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java +++ /dev/null @@ -1,421 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.parse.tika; - -import org.junit.Test; -import static org.junit.Assert.*; - -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.tika.DOMBuilder; -import org.apache.nutch.parse.tika.DOMContentUtils; -import org.apache.nutch.parse.tika.TikaParser; -import org.apache.hadoop.conf.Configuration; -import org.apache.html.dom.HTMLDocumentImpl; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.xml.serializer.dom3.LSSerializerImpl; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.DocumentFragment; - -import java.io.ByteArrayInputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.StringTokenizer; - -/** - * Unit tests for DOMContentUtils. - */ -public class DOMContentUtilsTest { - - private static final String[] testPages = { - // 0. - new String( - " title " - + " body " - + " anchor " + ""), - // 1. - new String( - " title " - + " body " - + " home " - + "" - + " " + " bots " - + ""), - // 2. - new String(" " + " " - + " separate this " - + " from this" + "" - + ""), - // 3. - // this one relies on certain neko fixup behavior, possibly - // distributing the anchors into the LI's-but not the other - // anchors (outside of them, instead)! So you get a tree that - // looks like: - // ...
  • home
  • - //
  • 1
  • - //
  • 2
  • - new String(" my title " - + " body " + "" + ""), - // 4. - // test frameset link extraction. The invalid frame in the middle - // will be - // fixed to a third standalone frame. - new String(" my title " - + " " - + "" + "" - + "" + "" - + "" + "" + "" - + "" + "" + "" - + "" + ""), - // 5. - // test and