From 950c8007ad2c3da73be915458fa5b2b01233d971 Mon Sep 17 00:00:00 2001
From: Sammy Yu <syu@brightedge.com>
Date: Thu, 18 Feb 2010 11:36:55 -0800
Subject: [PATCH] Updated with nofollow support for Outlinks, Inlinks, and dependent classes.

---
 src/java/org/apache/nutch/crawl/Inlink.java        |   21 +++++++++++++++---
 src/java/org/apache/nutch/crawl/LinkDb.java        |    3 +-
 src/java/org/apache/nutch/crawl/LinkDbFilter.java  |    3 +-
 src/java/org/apache/nutch/parse/Outlink.java       |   22 ++++++++++++++++---
 .../apache/nutch/parse/html/DOMContentUtils.java   |    8 +++++-
 .../org/apache/nutch/crawl/TestLinkDbMerger.java   |    2 +-
 6 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/Inlink.java b/src/java/org/apache/nutch/crawl/Inlink.java
index a5f7887..12b4bc1 100644
--- a/src/java/org/apache/nutch/crawl/Inlink.java
+++ b/src/java/org/apache/nutch/crawl/Inlink.java
@@ -25,28 +25,38 @@ public class Inlink implements Writable {
 
   private String fromUrl;
   private String anchor;
+  private boolean noFollow = false;
 
   public Inlink() {}
 
-  public Inlink(String fromUrl, String anchor) {
+  public Inlink(String fromUrl, String anchor, boolean noFollow) {
     this.fromUrl = fromUrl;
     this.anchor = anchor;
+    this.noFollow = noFollow;
   }
 
   public void readFields(DataInput in) throws IOException {
     fromUrl = Text.readString(in);
     anchor = Text.readString(in);
+    // noFollow is one boolean byte after the anchor. There is no version marker,
+    // so records written without this byte cannot be read by this method.
+    noFollow = in.readBoolean();
   }
 
   /** Skips over one Inlink in the input. */
   public static void skip(DataInput in) throws IOException {
     Text.skip(in);                                // skip fromUrl
     Text.skip(in);                                // skip anchor
+    // The noFollow flag is a single boolean byte after the anchor (see
+    // write()); consume it directly instead of allocating a Writable.
+    in.readBoolean();                             // skip noFollow
   }
 
   public void write(DataOutput out) throws IOException {
     Text.writeString(out, fromUrl);
     Text.writeString(out, anchor);
+    // Trailing boolean byte for noFollow; readFields() and skip() consume it.
+    out.writeBoolean(noFollow);
   }
 
   public static Inlink read(DataInput in) throws IOException {
@@ -57,6 +67,7 @@ public class Inlink implements Writable {
 
   public String getFromUrl() { return fromUrl; }
   public String getAnchor() { return anchor; }
+  public boolean isNoFollowLink() { return noFollow; }
 
   public boolean equals(Object o) {
     if (!(o instanceof Inlink))
@@ -64,15 +75,17 @@ public class Inlink implements Writable {
     Inlink other = (Inlink)o;
     return
       this.fromUrl.equals(other.fromUrl) &&
-      this.anchor.equals(other.anchor);
+      this.anchor.equals(other.anchor) &&
+      this.noFollow == other.noFollow;
   }
 
   public int hashCode() {
-    return fromUrl.hashCode() ^ anchor.hashCode();
+    return fromUrl.hashCode() ^ anchor.hashCode()
+      ^ Boolean.valueOf(noFollow).hashCode();     // cached instance, same hash
   }
 
   public String toString() {
-    return "fromUrl: " + fromUrl + " anchor: " + anchor;
+    return "fromUrl: " + fromUrl + " anchor: " + anchor + " nofollow: " + noFollow;
   }
 
 }
diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java
index 7d6ea50..c57b6a0 100644
--- a/src/java/org/apache/nutch/crawl/LinkDb.java
+++ b/src/java/org/apache/nutch/crawl/LinkDb.java
@@ -128,7 +128,8 @@ public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData,
       if (anchor.length() > maxAnchorLength) {
         anchor = anchor.substring(0, maxAnchorLength);
       }
-      inlinks.add(new Inlink(fromUrl, anchor));   // collect inverted link
+      boolean noFollow = outlink.isNoFollowLink();
+      inlinks.add(new Inlink(fromUrl, anchor, noFollow));   // collect inverted link
       output.collect(new Text(toUrl), inlinks);
     }
   }
diff --git a/src/java/org/apache/nutch/crawl/LinkDbFilter.java b/src/java/org/apache/nutch/crawl/LinkDbFilter.java
index 645a3b3..faa636e 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbFilter.java
@@ -114,7 +114,8 @@ public class LinkDbFilter implements Mapper<Text, Inlinks, Text, Inlinks> {
         }
       }
       if (fromUrl != null) { 
-        result.add(new Inlink(fromUrl, inlink.getAnchor()));
+        boolean noFollow = inlink.isNoFollowLink();
+        result.add(new Inlink(fromUrl, inlink.getAnchor(), noFollow));
       }
     }
     if (result.size() > 0) { // don't collect empty inlinks
diff --git a/src/java/org/apache/nutch/parse/Outlink.java b/src/java/org/apache/nutch/parse/Outlink.java
index 744140c..c814c69 100644
--- a/src/java/org/apache/nutch/parse/Outlink.java
+++ b/src/java/org/apache/nutch/parse/Outlink.java
@@ -27,7 +27,8 @@ public class Outlink implements Writable {
 
   private String toUrl;
   private String anchor;
-
+  private boolean noFollow = false;
+  
   public Outlink() {}
 
   public Outlink(String toUrl, String anchor) throws MalformedURLException {
@@ -36,20 +37,32 @@ public class Outlink implements Writable {
     this.anchor = anchor;
   }
 
+  public void setNoFollow(boolean noFollow) {
+    this.noFollow = noFollow;
+  }
+  
   public void readFields(DataInput in) throws IOException {
     toUrl = Text.readString(in);
     anchor = Text.readString(in);
+    // noFollow is one boolean byte after the anchor. There is no version marker,
+    // so records written without this byte cannot be read by this method.
+    noFollow = in.readBoolean();
   }
 
   /** Skips over one Outlink in the input. */
   public static void skip(DataInput in) throws IOException {
     Text.skip(in);                                // skip toUrl
     Text.skip(in);                                // skip anchor
+    // The noFollow flag is a single boolean byte after the anchor (see
+    // write()); consume it directly instead of allocating a Writable.
+    in.readBoolean();                             // skip noFollow
   }
 
   public void write(DataOutput out) throws IOException {
     Text.writeString(out, toUrl);
     Text.writeString(out, anchor);
+    // Trailing boolean byte for noFollow; readFields() and skip() consume it.
+    out.writeBoolean(noFollow);
   }
 
   public static Outlink read(DataInput in) throws IOException {
@@ -60,7 +73,7 @@ public class Outlink implements Writable {
 
   public String getToUrl() { return toUrl; }
   public String getAnchor() { return anchor; }
-
+  public boolean isNoFollowLink() { return noFollow; }
 
   public boolean equals(Object o) {
     if (!(o instanceof Outlink))
@@ -68,11 +81,12 @@ public class Outlink implements Writable {
     Outlink other = (Outlink)o;
     return
       this.toUrl.equals(other.toUrl) &&
-      this.anchor.equals(other.anchor);
+      this.anchor.equals(other.anchor) &&
+      this.noFollow == other.noFollow;
   }
 
   public String toString() {
-    return "toUrl: " + toUrl + " anchor: " + anchor;  // removed "\n". toString, not printLine... WD.
+    return "toUrl: " + toUrl + " anchor: " + anchor + " nofollow: " + noFollow;  // removed "\n". toString, not printLine... WD.
   }
 
 }
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 94a09cd..f5a9d3a 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -404,8 +404,12 @@ public class DOMContentUtils {
                 
                 URL url = (base.toString().indexOf(';') > 0) ? 
                   fixEmbeddedParams(base, target) :  new URL(base, target);
-                outlinks.add(new Outlink(url.toString(),
-                                         linkText.toString().trim()));
+                Outlink outlink = new Outlink(url.toString(),
+                                         linkText.toString().trim());
+                // TODO confirm: if an enclosing check already drops nofollow
+                // links, noFollow is always false here and is never recorded.
+                outlink.setNoFollow(noFollow);
+                outlinks.add(outlink);
               } catch (MalformedURLException e) {
                 // don't care
               }
diff --git a/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java b/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
index a8e1b49..f53a7bc 100644
--- a/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
+++ b/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
@@ -149,7 +149,7 @@ public class TestLinkDbMerger extends TestCase {
       Inlinks inlinks = new Inlinks();
       String[] vals = (String[])init.get(key);
       for (int i = 0; i < vals.length; i++) {
-        Inlink in = new Inlink(vals[i], vals[i]);
+        Inlink in = new Inlink(vals[i], vals[i], false);
         inlinks.add(in);
       }
       writer.append(new Text(key), inlinks);
-- 
1.6.5.2

