Index: src/java/org/apache/nutch/db/WebDBReader.java
===================================================================
--- src/java/org/apache/nutch/db/WebDBReader.java	(revision 219862)
+++ src/java/org/apache/nutch/db/WebDBReader.java	(working copy)
@@ -17,13 +17,17 @@
 
 import java.io.*;
 import java.util.*;
+import java.util.regex.*;
+import java.net.URI;
 import java.nio.channels.*;
+import java.util.logging.*;
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.util.*;
 import org.apache.nutch.pagedb.*;
 import org.apache.nutch.linkdb.*;
+import org.apache.nutch.util.LogFormatter;
 
 /**********************************************
  * The WebDBReader implements all the read-only
@@ -35,6 +39,9 @@
 public class WebDBReader implements IWebDBReader {
     static final Page[] PAGE_RECORDS = new Page[0];
     static final Link[] LINK_RECORDS = new Link[0];
+    
+    public static final Logger LOG = 
+            LogFormatter.getLogger("org.apache.nutch.db.WebDBReader");
 
     // filenames
     static final String PAGES_BY_URL = "pagesByURL";
@@ -409,7 +416,7 @@
      */
     public static void main(String argv[]) throws FileNotFoundException, IOException {
         if (argv.length < 2) {
-            System.out.println("Usage: java org.apache.nutch.db.WebDBReader (-local | -ndfs <namenode:port>) <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
+            System.out.println("Usage: java org.apache.nutch.db.WebDBReader (-local | -ndfs <namenode:port>) <db> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-dumplink url] | [-showlinks url] | [-showlinksdeep url] | [-stats] | [-detailstats]");
             return;
 
         }
@@ -521,12 +528,200 @@
                     System.out.println();
                   }
                 }
-            } else if ("-stats".equals(cmd)) {
+            } else if ("-dumplink".equals(cmd)) {
+               String url = argv[i++];
+               Page page =reader.getPage(url.trim());
+               Link[] links = reader.getLinks(page.getMD5());
+               if (links.length > 0) {
+                    System.out.println("from " + page.getURL());
+                    for (int j = 0; j < links.length; j++) {
+                      System.out.println(" to " + links[j].getURL());
+                    }
+                    System.out.println();
+               } 
+            } else if ("-showlinks".equals(cmd)) {
+                String url = argv[i++];
+                Link links[] = reader.getLinks(new UTF8(url.trim()));
+                          
+                for (int j = 0; j < links.length; j++) {
+                    StringBuffer buf = new StringBuffer();
+                    MD5Hash md5 = links[j].getFromID();
+                    Page pages[] = reader.getPages(md5);
+                    try {
+                        buf.append(url +"\t"+ pages[0].getURL() +"\t"+ links[j].getAnchorText());
+                        System.out.println(buf.toString());
+                    }
+                    catch(Exception e){
+                        //no page found for this link's source MD5; skip it (NOTE(review): empty catch swallows all errors — consider logging)
+                    }
+                }
+            } else if ("-showlinksdeep".equals(cmd)) {
+               Integer o = new Integer(0);
+               String url = argv[i++];
+               String fulldomain = new String();
+               
+               //Domain without "www"
+               String domain = new String();
+               try {
+                   fulldomain = new URI(url).getHost();
+               } catch(Exception ex){
+                   LOG.info("Failed to build a URI from the String");
+               }
+               if (fulldomain.regionMatches(0,"www.", 0, 4) && fulldomain.length()>4) {
+                   String[] domains = fulldomain.split("www\\.");
+                   domain = domains[1];
+               }
+               LOG.info("Secondlevel-Domain: "+domain);
+               Pattern pattern = 
+                Pattern.compile(".*\\."+domain+".*" , Pattern.CASE_INSENSITIVE);               
+               //hashtable of URLs that still have to be checked
+               Hashtable h1 = new Hashtable();
+               
+               //hashtable of all URL's
+               Hashtable h2 = new Hashtable();
+               
+               LOG.info("URL: "+url);
+               
+               h1.put(url, o);
+               h2.put(url, o);
+               Enumeration e;
+               
+               while ( (e = h1.keys())!=null && e.hasMoreElements()) {
+                   String u = (String) e.nextElement();
+                   
+                   Page page = reader.getPage(u.trim());
+                   
+                   //Grab all the links from the given MD5 hash.
+                   Link[] links = reader.getLinks(page.getMD5());
+                   if (links.length > 0) {
+                       
+                       //for each link, checks that it is not yet in h1 or h2
+                       //and that its host matches the crawl domain
+                       for (int j = 0; j < links.length; j++) {
+                            String newurl = links[j].getURL().toString();
+                            String newurlhost = "can't not be found";
+                            try {
+                                newurlhost = new URI(newurl).getHost();
+                            } catch(Exception ex) {
+                                LOG.info("Failed to build a URI from the String");
+                            }
+                            Matcher match = pattern.matcher(newurlhost);
+                            
+                            //All matching hosts (incl. subdomains) are stored in h1 and h2. NOTE(review): fulldomain==newurlhost compares references — should be equals(); also newurlhost is dereferenced by matcher() above before the null check here
+                            if (!h1.containsKey(newurl) && newurlhost!=null && 
+                               !h2.containsKey(newurl) && (fulldomain==newurlhost || match.find())) {
+                                 h1.put(newurl, o);
+                                 h2.put(newurl, o);
+                            }
+                        }
+                    }
+                   //remove the processed URL so it is not visited twice
+                   h1.remove(u);
+              }
+              LOG.info("Links on the Domain: "+url);
+              Enumeration eh2 = h2.keys();
+              
+              StringBuffer buf = new StringBuffer();
+              LOG.info("Sublinks found: "+h2.size());
+              while (eh2.hasMoreElements()){
+                  
+                  //Get all the hyperlinks that link TO the indicated URL.
+                  Link links[] = reader.getLinks(new UTF8(((String) eh2.nextElement()).trim()));
+                  for (int j = 0; j < links.length; j++) {
+                        MD5Hash md5 = links[j].getFromID();
+                        Page pages[] = reader.getPages(md5);
+                        try{
+                            //uncomment this to show external links only
+                            //if(!pages[0].getURL().toString().regionMatches(0,url,0,url.length())) {
+                                buf.append(links[j].getURL().toString() +"\t"+ pages[0].getURL() +"\t"+ links[j].getAnchorText()+"\n");
+                            //}
+                        }
+                        catch(Exception e2){
+                            //source page may be missing from the db; skip it (NOTE(review): consider logging instead of an empty catch)
+                        }
+                  }
+              }
+              System.out.println(buf.toString());
+              
+           } else if ("-stats".equals(cmd)) {
                 System.out.println("Stats for " + reader);
                 System.out.println("-------------------------------");
                 System.out.println("Number of pages: " + reader.numPages());
                 System.out.println("Number of links: " + reader.numLinks());
-            } else {
+            
+           } else if("-detailstats".equals(cmd)) {
+                //logs the number of links with and without anchor text, and the
+                //average anchor-text length (NOTE(review): the average divides by counterwithanchortext, which may be zero)
+                System.out.println("Detailstats for " + reader);
+                System.out.println("-------------------------------");
+                System.out.println("Number of pages: " + reader.numPages());
+                System.out.println("Number of links: " + reader.numLinks());
+                
+                int  counter = 1000000;
+                long pages   = 1000000;
+                long counterwithanchortext = 0;
+                long counterwithoutanchortext = 0;
+                long lengthanchor = 0;
+                long counterdeleetedpages = 0;
+                long counterpageswithoutoutlinks = 0;
+                
+                Enumeration e = reader.pagesByMD5();
+                MD5Hash md5 = null;
+                while (e.hasMoreElements()) {
+                  Page page = (Page) e.nextElement();
+                  
+                  //skip all pages with the same hash
+                  if (page.getMD5().equals(md5)) {
+                    continue;
+                  }  
+                  md5 = page.getMD5();
+                  
+                  if (page.getNextFetchTime()==Long.MAX_VALUE) {
+                      counterdeleetedpages++;
+                  }
+                  if (page.getNumOutlinks()<1) {
+                      counterpageswithoutoutlinks++;
+                  }
+                  Link[] links = reader.getLinks(page.getMD5());
+                  if (links.length > 0) {
+                    for (int j = 0; j < links.length; j++) {
+                      int length = links[j].getAnchorText().toString().length();
+                      if (length>0) {
+                          lengthanchor += length;
+                          counterwithanchortext ++;
+                      } else {
+                          counterwithoutanchortext ++;
+
+                      }
+                      
+                    }
+                  }
+                  counter--;
+                  if (counter<1){
+                    LOG.info("Result for the first: " + pages + " pages");
+                    LOG.info("Number of links with anchor text:     " + counterwithanchortext);
+                    LOG.info("Number of links without anchor text:  " + counterwithoutanchortext);
+                    LOG.info("Average length of anchor texts:       " + lengthanchor/counterwithanchortext);
+                    LOG.info("Number of deleted pages:              " + counterdeleetedpages);
+                    LOG.info("Number of pages without outlinks:     " + counterpageswithoutoutlinks);
+                    System.out.println();
+                    pages  += 1000000;
+                    counter = 1000000;
+                  }
+                  
+                }
+                LOG.info("Detailstats for " + reader);
+                LOG.info("-------------------------------");
+                LOG.info("Number of pages: " + reader.numPages());
+                LOG.info("Number of links: " + reader.numLinks());
+                LOG.info("Final result:");
+                LOG.info("Number of links with anchor text:     " + counterwithanchortext);
+                LOG.info("Number of links without anchor text:  " + counterwithoutanchortext);
+                LOG.info("Average length of anchor texts:       " + lengthanchor/counterwithanchortext);
+                LOG.info("Number of deleted pages:              " + counterdeleetedpages);
+                LOG.info("Number of pages without outlinks:     " + counterpageswithoutoutlinks);
+           
+           }else {
                 System.out.println("Sorry, no command with name " + cmd);
             }
         } finally {
