java.lang.Object org.supermind.crawl.CachingFetchedURLs
public class CachingFetchedURLs
Caches URL checksums via a chained scatter table. When there are too many URLs, evicted checksums are persisted.
Field Summary |
---|
Fields inherited from interface org.supermind.crawl.FetchedURLs |
---|
LOG |
Constructor Summary | |
---|---|
CachingFetchedURLs()
|
Method Summary | |
---|---|
void |
close()
|
boolean |
contains(java.net.URL url)
Has the URL already been fetched? |
ScheduledURL |
get(long id)
Get a persisted URL. |
protected long |
getChecksum(java.net.URL url)
Create a 64-bit checksum by merging a 32-bit host checksum with the URL's 32-bit checksum. |
void |
init()
|
void |
insert(ScheduledURL url,
org.apache.nutch.protocol.ProtocolOutput output)
Insert a fetched URL. |
void |
setChecksum(java.util.zip.Checksum checksum)
|
void |
setPurger(ScatterPurger purger)
|
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Constructor Detail |
---|
public CachingFetchedURLs()
Method Detail |
---|
public void close() throws java.io.IOException
close
in interface FetchedURLs
java.io.IOException
public boolean contains(java.net.URL url)
FetchedURLs
contains
in interface FetchedURLs
public ScheduledURL get(long id)
FetchedURLs
get
in interface FetchedURLs
id
- ScheduledURL's id
protected long getChecksum(java.net.URL url)
url
-
public void init() throws java.io.IOException
init
in interface FetchedURLs
java.io.IOException
public void insert(ScheduledURL url, org.apache.nutch.protocol.ProtocolOutput output)
FetchedURLs
insert
in interface FetchedURLs
url
- url
output
- protocol output
public void setChecksum(java.util.zip.Checksum checksum)
public void setPurger(ScatterPurger purger)