Index: conf/top-level-domains.xsd
===================================================================
--- conf/top-level-domains.xsd	(revision 0)
+++ conf/top-level-domains.xsd	(revision 0)
@@ -0,0 +1,103 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+
+<!--
+	!   Document   : top-level-domains.xsd
+	!   Author     : Enis Soztutar - enis.soz.nutch@gmail.com
+	!   Description: This document is the schema for valid tld definitions
+	!                For successful parsing of tld xml files, the xml file 
+	!                should be validated with this xsd. 
+	!   See        : org.apache.nutch.util.tld.TLDReader.java
+	! -->
+	
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+	targetNamespace="http://lucene.apache.org/nutch"
+	xmlns="http://lucene.apache.org/nutch"
+	elementFormDefault="qualified">
+
+	<xs:element name="tlds"> <!-- document root: the three tld lists, always in this order -->
+		<xs:complexType>
+			<xs:sequence>
+				<xs:element name="itlds" minOccurs="1" maxOccurs="1"> <!-- infrastructure tlds -->
+					<xs:complexType>
+						<xs:sequence>
+							<xs:element name="tld" type="gtld" minOccurs="1" maxOccurs="unbounded"/>
+						</xs:sequence>
+					</xs:complexType>
+				</xs:element>
+
+				<xs:element name="gtlds" minOccurs="1" maxOccurs="1"> <!-- generic tlds -->
+					<xs:complexType>
+						<xs:sequence>
+							<xs:element name="tld" type="gtld" minOccurs="1" maxOccurs="unbounded"/>
+						</xs:sequence>
+					</xs:complexType>
+				</xs:element>
+
+				<xs:element name="cctlds" minOccurs="1" maxOccurs="1"> <!-- country code tlds -->
+					<xs:complexType>
+						<xs:sequence>
+							<xs:element name="tld" type="cctld" minOccurs="1" maxOccurs="unbounded"/>
+						</xs:sequence>
+					</xs:complexType>
+				</xs:element>
+
+			</xs:sequence>
+		</xs:complexType>
+	</xs:element>
+
+	<xs:complexType name="gtld"> <!-- one tld entry of the itlds or gtlds list -->
+		<xs:sequence>
+			<xs:element name="status"> <!-- must be exactly one of the values enumerated below -->
+				<xs:simpleType>
+					<xs:restriction base="xs:string">
+						<xs:enumeration value="INFRASTRUCTURE"/>
+						<xs:enumeration value="SPONSORED"/>
+						<xs:enumeration value="UNSPONSORED"/>
+						<xs:enumeration value="STARTUP"/>
+						<xs:enumeration value="PROPOSED"/>
+						<xs:enumeration value="DELETED"/>
+						<xs:enumeration value="PSEUDO_DOMAIN"/>
+					</xs:restriction>
+				</xs:simpleType>
+			</xs:element>
+			<xs:element name="boost" type="xs:float"/>
+			<xs:element name="description" type="xs:string" minOccurs="0" /> <!-- optional free text -->
+		</xs:sequence>
+		<xs:attribute name="domain" type="xs:string" use="required" /> <!-- required: an entry without its domain cannot be identified -->
+	</xs:complexType>
+
+	<xs:complexType name="cctld"> <!-- one tld entry of the cctlds list -->
+		<xs:sequence>
+			<xs:element name="country" type="xs:string" />
+			<xs:element name="status"> <!-- must be exactly one of the values enumerated below -->
+				<xs:simpleType>
+					<xs:restriction base="xs:string">
+						<xs:enumeration value="IN_USE"/>
+						<xs:enumeration value="NOT_IN_USE"/>
+						<xs:enumeration value="DELETED"/>
+					</xs:restriction>
+				</xs:simpleType>
+			</xs:element>
+			<xs:element name="boost" type="xs:float"/>
+			<xs:element name="note" type="xs:string" minOccurs="0" /> <!-- optional free text -->
+		</xs:sequence>
+		<xs:attribute name="domain" type="xs:string" use="required" /> <!-- required: an entry without its domain cannot be identified -->
+	</xs:complexType>
+	
+</xs:schema>
\ No newline at end of file
Index: conf/top-level-domains.xml
===================================================================
--- conf/top-level-domains.xml	(revision 0)
+++ conf/top-level-domains.xml	(revision 0)
@@ -0,0 +1,1933 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+
+<!--
+	!   Document   : top-level-domains.xml
+	!   Author     : Enis Soztutar - enis.soz.nutch@gmail.com
+	!   Description: This document contains top level domains 
+	!                as described by the Internet Assigned Numbers
+	!                Authority (IANA). Top level domains (tlds) are grouped
+	!                into three, namely infrastructure, generic and country 
+	!                code tlds. Infrastructure tlds are only used for 
+	!                technical reasons. Generic tlds represent the type 
+	!                of the organization that they belong to. Those in 
+	!                current use and those waiting for approval are listed.
+	!                Most of the country code tlds correspond to the two 
+	!                letter ISO-3166-1 country codes. 
+	!                Each tld is listed with its domain (such as com), a 
+	!                status enumeration describing the status of the tld, 
+	!                and optionally a description or note for convenience.
+	!                cctlds are listed with additional country name field.
+	!                
+	!                Note : second level domains such as .co.uk are not listed 
+	!                References : 
+	!                   http://www.iana.org
+	!                   http://www.iana.org/gtld/gtld.htm
+	!                   http://www.iana.org/root-whois/index.html
+	!                   http://en.wikipedia.org/wiki/Top-level_domain
+	!                   http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
+	! -->
+<tlds xmlns="http://lucene.apache.org/nutch"
+	  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+      xsi:schemaLocation="http://lucene.apache.org/nutch top-level-domains.xsd">
+      
+	<!--  Infrastructure Top Level Domains -->
+	<itlds>
+		<tld domain="root">
+			<status>INFRASTRUCTURE</status>
+			<boost>1.0</boost>
+			<description>
+				(from http://en.wikipedia.org/wiki/.root)
+				vrsn-end-of-zone-marker-dummy-record.root is a domain
+				name listed in the DNS root zone as a diagnostic marker,
+				whose presence demonstrates the root zone was not
+				truncated upon loading by a root nameserver. It could be
+				argued it represents a top-level domain of .root,
+				although technically no such delegation exists.
+			</description>
+		</tld>
+		
+		<tld domain="arpa">
+			<status>INFRASTRUCTURE</status>
+			<boost>1.0</boost>
+			<description>
+				(from http://en.wikipedia.org/wiki/.arpa) 
+				.arpa is an Internet top-level domain (TLD) used exclusively 
+				for Internet-infrastructure purposes. It does not function
+				as a normal TLD where websites are registered, but
+				rather as a meta-TLD used to look up addresses, and for
+				other purposes.
+			</description>
+		</tld>
+	</itlds>
+
+
+	<!--  Generic Top Level Domains -->
+	<gtlds>
+		<!-- 
+			The following gTLDs are in actual use
+		-->
+
+		<tld domain="aero">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>for the air transport industry</description>
+		</tld>
+		
+		<tld domain="biz">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>for business use</description>
+		</tld>
+		
+		<tld domain="cat">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>for Catalan language/culture</description>
+		</tld>
+		
+		<tld domain="com">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				for commercial organizations, but unrestricted
+			</description>
+		</tld>
+		
+		<tld domain="coop">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>for cooperatives</description>
+		</tld>
+		
+		<tld domain="edu">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				for post-secondary educational establishments
+			</description>
+		</tld>
+		
+		<tld domain="gov">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				for governments and their agencies in the United States
+			</description>
+		</tld>
+		
+		<tld domain="info">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				for informational sites, but unrestricted
+			</description>
+		</tld>
+		
+		<tld domain="int">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				for international organizations established by treaty
+			</description>
+		</tld>
+		
+		<tld domain="jobs">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>for employment-related sites</description>
+		</tld>
+		
+		<tld domain="mil">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>for the US military</description>
+		</tld>
+		
+		<tld domain="mobi">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				for sites catering to mobile devices
+			</description>
+		</tld>
+		
+		<tld domain="museum">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>for museums</description>
+		</tld>
+		
+		<tld domain="name">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>for families and individuals</description>
+		</tld>
+		
+		<tld domain="net">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				originally for network infrastructures, now unrestricted
+			</description>
+		</tld>
+		
+		<tld domain="org">
+			<status>UNSPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				originally for organizations not clearly falling within
+				the other gTLDs, now unrestricted
+			</description>
+		</tld>
+		
+		<tld domain="pro">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>for certain professions</description>
+		</tld>
+		
+		<tld domain="travel">
+			<status>SPONSORED</status>
+			<boost>1.0</boost>
+			<description>
+				for travel agents, airlines, hoteliers, tourism bureaus,
+				etc.
+			</description>
+		</tld>
+
+		<!-- 
+			The following gTLDs are in the process of being approved, 
+			and may be added to the root nameservers in the near future 
+		-->
+
+		<tld domain="asia">
+			<status>STARTUP</status>
+			<boost>1.0</boost>
+			<description>for the Asian community</description>
+		</tld>
+		
+		<tld domain="post">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>for postal services</description>
+		</tld>
+		
+		<tld domain="tel">
+			<status>STARTUP</status>
+			<boost>1.0</boost>
+			<description>
+				for services involving connections between the telephone
+				network and the Internet
+			</description>
+		</tld>
+		
+		<tld domain="geo">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>for geographically related sites</description>
+		</tld>
+		
+		<tld domain="gal">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>
+				for Galicia, a country within Spain
+			</description>
+		</tld>
+		
+		<tld domain="cym">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>
+				for Wales, a country within the UK
+			</description>
+		</tld>
+		
+		<tld domain="sco">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>
+				for Scotland, a country within the UK
+			</description>
+		</tld>
+		
+		<tld domain="kid">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>
+				for websites designed for children
+			</description>
+		</tld>
+		
+		<tld domain="kids">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>
+				for websites designed for children
+			</description>
+		</tld>
+		
+		<tld domain="mail">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>
+				http://en.wikipedia.org/wiki/.mail
+			</description>
+		</tld>
+		
+		<tld domain="web">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>For Web sites of all sorts</description>
+		</tld>
+		
+		<tld domain="xxx">
+			<status>PROPOSED</status>
+			<boost>1.0</boost>
+			<description>For Adult entertainment sites</description>
+		</tld>
+
+		<!-- 
+			The following gTLDs are removed from the registry
+		-->
+		<tld domain="nato">
+			<status>DELETED</status>
+			<boost>1.0</boost>
+			<description>
+				for NATO sites and operations. Replaced by .int
+			</description>
+		</tld>
+
+		<!-- 
+			The following gTLDs are PSEUDO_DOMAINs
+		-->
+		<tld domain="bitnet">
+			<status>PSEUDO_DOMAIN</status>
+			<boost>1.0</boost>
+			<description>
+				identifying a hostname not connected directly to the
+				Internet, but a bitnet network
+			</description>
+		</tld>
+		
+		<tld domain="csnet">
+			<status>PSEUDO_DOMAIN</status>
+			<boost>1.0</boost>
+			<description>
+				identifying a hostname not connected directly to the
+				Internet, but a csnet network
+			</description>
+		</tld>
+		
+		<tld domain="uucp">
+			<status>PSEUDO_DOMAIN</status>
+			<boost>1.0</boost>
+			<description>
+				identifying a hostname not connected directly to the
+				Internet, but a uucp network
+			</description>
+		</tld>
+		
+		<tld domain="local">
+			<status>PSEUDO_DOMAIN</status>
+			<boost>1.0</boost>
+			<description>
+				.local is a pseudo top-level domain used by Apple,
+				Inc.'s Bonjour protocol.
+			</description>
+		</tld>
+		
+		<tld domain="internal">
+			<status>PSEUDO_DOMAIN</status>
+			<boost>1.0</boost>
+			<description>alias of .local</description>
+		</tld>
+		
+		<tld domain="onion">
+			<status>PSEUDO_DOMAIN</status>
+			<boost>1.0</boost>
+			<description>
+				designates an anonymous or pseudonymous address
+				reachable via the Tor network.
+			</description>
+		</tld>
+	</gtlds>
+
+
+	<!--  Country Code Top Level Domains -->
+
+	<cctlds>
+		<tld domain="ac">
+			<country>Ascension Island</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ad">
+			<country>Andorra</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ae">
+			<country>United Arab Emirates</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="af">
+			<country>Afghanistan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ag">
+			<country>Antigua and Barbuda</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ai">
+			<country>Anguilla</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="al">
+			<country>Albania</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="am">
+			<country>Armenia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="an">
+			<country>Netherlands Antilles</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ao">
+			<country>Angola</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="aq">
+			<country>Antarctica</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ar">
+			<country>Argentina</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="as">
+			<country>American Samoa</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="at">
+			<country>Austria</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="au">
+			<country>Australia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="aw">
+			<country>Aruba</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ax">
+			<country>Aland Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="az">
+			<country>Azerbaijan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ba">
+			<country>Bosnia and Herzegovina</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bb">
+			<country>Barbados</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bd">
+			<country>Bangladesh</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="be">
+			<country>Belgium</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bf">
+			<country>Burkina Faso</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bg">
+			<country>Bulgaria</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bh">
+			<country>Bahrain</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bi">
+			<country>Burundi</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bj">
+			<country>Benin</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bm">
+			<country>Bermuda</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bn">
+			<country>Brunei</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bo">
+			<country>Bolivia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="br">
+			<country>Brazil</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bs">
+			<country>Bahamas</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bt">
+			<country>Bhutan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bu">
+			<country>Burma</country>
+			<status>NOT_IN_USE</status>
+			<boost>1.0</boost>
+			<note>not in use since re-naming of country to Myanmar, see .mm</note>
+		</tld>
+
+		<tld domain="bv">
+			<country>Bouvet Island</country>
+			<status>NOT_IN_USE</status>
+			<boost>1.0</boost>
+			<note>not in use; no registrations</note>
+		</tld>
+
+		<tld domain="bw">
+			<country>Botswana</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="by">
+			<country>Belarus</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="bz">
+			<country>Belize</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ca">
+			<country>Canada</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cc">
+			<country>Cocos Keeling Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cd">
+			<country>Democratic Republic of the Congo</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>formerly .zr - Zaire</note>
+		</tld>
+
+		<tld domain="cf">
+			<country>Central African Republic</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cg">
+			<country>Republic of the Congo</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ch">
+			<country>Switzerland</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ci">
+			<country>Côte d'Ivoire</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>Ivory Coast</note>
+		</tld>
+
+		<tld domain="ck">
+			<country>Cook Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cl">
+			<country>Chile</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cm">
+			<country>Cameroon</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cn">
+			<country>People's Republic of China</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="co">
+			<country>Colombia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cr">
+			<country>Costa Rica</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cs">
+			<country>Serbia and Montenegro</country>
+			<status>DELETED</status>
+			<boost>1.0</boost>
+			<note>
+				formerly .yu - Yugoslavia (Note: on June 3, 2006, Montenegro declared 
+				independence, thus dissolving the state union) (.cs code not assigned; no DNS) 
+				(.cs code previously used for Czechoslovakia)
+			</note>
+		</tld>
+
+		<tld domain="cu">
+			<country>Cuba</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cv">
+			<country>Cape Verde</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cx">
+			<country>Christmas Island</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cy">
+			<country>Cyprus</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="cz">
+			<country>Czech Republic</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="dd">
+			<country>German Democratic Republic (East Germany)</country>
+			<status>DELETED</status>
+			<boost>1.0</boost>
+			<note>deleted in 1990</note>
+		</tld>
+
+		<tld domain="de">
+			<country>Germany</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="dj">
+			<country>Djibouti</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="dk">
+			<country>Denmark</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="dm">
+			<country>Dominica</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="do">
+			<country>Dominican Republic</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="dz">
+			<country>Algeria</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ec">
+			<country>Ecuador</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ee">
+			<country>Estonia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="eg">
+			<country>Egypt</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="eh">
+			<country>Western Sahara</country>
+			<status>NOT_IN_USE</status>
+			<boost>1.0</boost>
+			<note>not assigned; no DNS</note>
+		</tld>
+
+		<tld domain="er">
+			<country>Eritrea</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="es">
+			<country>Spain</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="et">
+			<country>Ethiopia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="eu">
+			<country>European Union</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>code "exceptionally reserved" by ISO 3166-1</note>
+		</tld>
+
+		<tld domain="fi">
+			<country>Finland</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="fj">
+			<country>Fiji</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="fk">
+			<country>Falkland Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="fm">
+			<country>Federated States of Micronesia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="fo">
+			<country>Faroe Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="fr">
+			<country>France</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ga">
+			<country>Gabon</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gb">
+			<country>United Kingdom</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>Reserved domain by IANA; deprecated – see .uk</note>
+		</tld>
+
+		<tld domain="gd">
+			<country>Grenada</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ge">
+			<country>Georgia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gf">
+			<country>French Guiana</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gg">
+			<country>Guernsey</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gh">
+			<country>Ghana</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gi">
+			<country>Gibraltar</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gl">
+			<country>Greenland</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gm">
+			<country>Gambia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gn">
+			<country>Guinea</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gp">
+			<country>Guadeloupe</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gq">
+			<country>Equatorial Guinea</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gr">
+			<country>Greece</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gs">
+			<country>South Georgia and the South Sandwich Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gt">
+			<country>Guatemala</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gu">
+			<country>Guam</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gw">
+			<country>Guinea Bissau</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="gy">
+			<country>Guyana</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="hk">
+			<country>Hong Kong</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="hm">
+			<country>Heard Island and McDonald Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="hn">
+			<country>Honduras</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="hr">
+			<country>Croatia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ht">
+			<country>Haiti</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="hu">
+			<country>Hungary</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="id">
+			<country>Indonesia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ie">
+			<country>Ireland</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="il">
+			<country>Israel</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="im">
+			<country>Isle of Man</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="in">
+			<country>India</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="io">
+			<country>British Indian Ocean Territory</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="iq">
+			<country>Iraq</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ir">
+			<country>Iran</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="is">
+			<country>Iceland</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="it">
+			<country>Italy</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="je">
+			<country>Jersey</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="jm">
+			<country>Jamaica</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="jo">
+			<country>Jordan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="jp">
+			<country>Japan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ke">
+			<country>Kenya</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="kg">
+			<country>Kyrgyzstan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="kh">
+			<country>Cambodia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ki">
+			<country>Kiribati</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="km">
+			<country>Comoros</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="kn">
+			<country>Saint Kitts and Nevis</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="kp">
+			<country>North Korea</country>
+			<status>NOT_IN_USE</status>
+			<boost>1.0</boost>
+			<note>not assigned; no DNS</note>
+		</tld>
+
+		<tld domain="kr">
+			<country>South Korea</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="kw">
+			<country>Kuwait</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ky">
+			<country>Cayman Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="kz">
+			<country>Kazakhstan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="la">
+			<country>Laos</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="lb">
+			<country>Lebanon</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="lc">
+			<country>Saint Lucia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="li">
+			<country>Liechtenstein</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="lk">
+			<country>Sri Lanka</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="lr">
+			<country>Liberia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ls">
+			<country>Lesotho</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="lt">
+			<country>Lithuania</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="lu">
+			<country>Luxembourg</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="lv">
+			<country>Latvia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ly">
+			<country>Libya</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ma">
+			<country>Morocco</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mc">
+			<country>Monaco</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="md">
+			<country>Moldova</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="me">
+			<country>Montenegro</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mg">
+			<country>Madagascar</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mh">
+			<country>Marshall Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mk">
+			<country>Republic of Macedonia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ml">
+			<country>Mali</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mm">
+			<country>Myanmar</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>formerly .bu - Burma</note>
+		</tld>
+
+		<tld domain="mn">
+			<country>Mongolia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mo">
+			<country>Macau</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mp">
+			<country>Northern Mariana Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mq">
+			<country>Martinique</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mr">
+			<country>Mauritania</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ms">
+			<country>Montserrat</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mt">
+			<country>Malta</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mu">
+			<country>Mauritius</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mv">
+			<country>Maldives</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mw">
+			<country>Malawi</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mx">
+			<country>Mexico</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="my">
+			<country>Malaysia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="mz">
+			<country>Mozambique</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="na">
+			<country>Namibia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="nc">
+			<country>New Caledonia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ne">
+			<country>Niger</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="nf">
+			<country>Norfolk Island</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ng">
+			<country>Nigeria</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ni">
+			<country>Nicaragua</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="nl">
+			<country>Netherlands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="no">
+			<country>Norway</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="np">
+			<country>Nepal</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="nr">
+			<country>Nauru</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="nu">
+			<country>Niue</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="nz">
+			<country>New Zealand</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="om">
+			<country>Oman</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pa">
+			<country>Panama</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pe">
+			<country>Peru</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pf">
+			<country>French Polynesia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pg">
+			<country>Papua New Guinea</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ph">
+			<country>Philippines</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pk">
+			<country>Pakistan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pl">
+			<country>Poland</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pm">
+			<country>Saint Pierre and Miquelon</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pn">
+			<country>Pitcairn Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pr">
+			<country>Puerto Rico</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ps">
+			<country>Palestinian territories</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pt">
+			<country>Portugal</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="pw">
+			<country>Palau</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="py">
+			<country>Paraguay</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="qa">
+			<country>Qatar</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="re">
+			<country>Réunion</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ro">
+			<country>Romania</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="rs">
+			<country>Serbia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ru">
+			<country>Russia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="rw">
+			<country>Rwanda</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sa">
+			<country>Saudi Arabia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sb">
+			<country>Solomon Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sc">
+			<country>Seychelles</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sd">
+			<country>Sudan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="se">
+			<country>Sweden</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sg">
+			<country>Singapore</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sh">
+			<country>Saint Helena</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="si">
+			<country>Slovenia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sj">
+			<country>Svalbard and Jan Mayen Islands</country>
+			<status>NOT_IN_USE</status>
+			<boost>1.0</boost>
+			<note>not in use; no registrations</note>
+		</tld>
+
+		<tld domain="sk">
+			<country>Slovakia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sl">
+			<country>Sierra Leone</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sm">
+			<country>San Marino</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sn">
+			<country>Senegal</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="so">
+			<country>Somalia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sr">
+			<country>Suriname</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="st">
+			<country>São Tomé and Príncipe</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="su">
+			<country>Soviet Union</country>
+			<status>DELETED</status>
+			<boost>1.0</boost>
+			<note>
+				deprecated; being phased out; code "transitionally reserved" 
+				by ISO 3166-1
+			</note>
+		</tld>
+
+		<tld domain="sv">
+			<country>El Salvador</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sy">
+			<country>Syria</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="sz">
+			<country>Swaziland</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tc">
+			<country>Turks and Caicos Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="td">
+			<country>Chad</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tf">
+			<country>French Southern Territories</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tg">
+			<country>Togo</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="th">
+			<country>Thailand</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tj">
+			<country>Tajikistan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tk">
+			<country>Tokelau</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tl">
+			<country>East Timor</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>formerly .tp</note>
+		</tld>
+
+		<tld domain="tm">
+			<country>Turkmenistan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tn">
+			<country>Tunisia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="to">
+			<country>Tonga</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tp">
+			<country>East Timor</country>
+			<status>DELETED</status>
+			<boost>1.0</boost>
+			<note>deprecated - use .tl; code "transitionally reserved" by ISO 3166-1</note>
+		</tld>
+
+		<tld domain="tr">
+			<country>Turkey</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tt">
+			<country>Trinidad and Tobago</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tv">
+			<country>Tuvalu</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="tw">
+			<country>Republic of China</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>Taiwan</note>
+		</tld>
+
+		<tld domain="tz">
+			<country>Tanzania</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ua">
+			<country>Ukraine</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ug">
+			<country>Uganda</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="uk">
+			<country>United Kingdom</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>code "exceptionally reserved" by ISO 3166-1 (see also .gb)</note>
+		</tld>
+
+		<tld domain="um">
+			<country>United States Minor Outlying Islands</country>
+			<status>DELETED</status>
+			<boost>1.0</boost>
+			<note>see http://en.wikipedia.org/wiki/.um</note>
+		</tld>
+
+		<tld domain="us">
+			<country>United States</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="uy">
+			<country>Uruguay</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="uz">
+			<country>Uzbekistan</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="va">
+			<country>Vatican City</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="vc">
+			<country>Saint Vincent and the Grenadines</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ve">
+			<country>Venezuela</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="vg">
+			<country>British Virgin Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="vi">
+			<country>United States Virgin Islands</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="vn">
+			<country>Vietnam</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="vu">
+			<country>Vanuatu</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="wf">
+			<country>Wallis and Futuna</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="ws">
+			<country>Samoa</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>formerly Western Samoa</note>
+		</tld>
+
+		<tld domain="ye">
+			<country>Yemen</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="yt">
+			<country>Mayotte</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="yu">
+			<country>Yugoslavia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+			<note>
+				subsequently renamed Serbia and Montenegro (code officially 
+				replaced by .cs (see above) but still used; code 
+				"transitionally reserved" by ISO 3166-1)
+			</note>
+		</tld>
+
+		<tld domain="za">
+			<country>South Africa</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="zm">
+			<country>Zambia</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+
+		<tld domain="zr">
+			<country>Zaire</country>
+			<status>DELETED</status>
+			<boost>1.0</boost>
+			<note>replaced by .cd</note>
+		</tld>
+
+		<tld domain="zw">
+			<country>Zimbabwe</country>
+			<status>IN_USE</status>
+			<boost>1.0</boost>
+		</tld>
+	</cctlds>
+
+</tlds>
\ No newline at end of file
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml	(revision 500263)
+++ conf/nutch-default.xml	(working copy)
@@ -48,7 +48,7 @@
 
 <property>
   <name>http.agent.name</name>
-  <value></value>
+  <value>nutchTest</value>
   <description>HTTP 'User-Agent' request header. MUST NOT be empty - 
   please set this to a single word uniquely related to your organization.
 
@@ -738,7 +738,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+  <value>protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|tld|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By
@@ -1012,4 +1012,21 @@
   </description>
 </property>
 
+<!-- top-level-domains plugin properties -->
+<property>
+  <name>top.level.domains.file</name>
+  <value>top-level-domains.xml</value>
+  <description> The file containing the list of top level domains.
+  </description>
+</property>
+
+<property>
+  <name>top.level.domains.index</name>
+  <value>false</value>
+  <description> Whether to build an index on the tld field. If true
+   the field is indexed with Field.Index.UN_TOKENIZED, otherwise it
+   is indexed with Field.Index.NO (i.e. it is not searchable)
+  </description>
+</property>
+
 </configuration>
Index: src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java	(revision 0)
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.tld;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.tld.TLDEntries;
+import org.apache.nutch.util.tld.TLDEntry;
+
+/**
+ * Scoring filter that boosts documents according to their top level domain.
+ * <p>
+ * Only {@link #indexerScore} alters a score: the initial score is multiplied
+ * by the boost of every <code>tld</code> field value of the document that is
+ * known to {@link TLDEntries}. All other scoring hooks leave scores untouched.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDScoringFilter implements ScoringFilter {
+
+  /** Name of the document field that holds the tld value(s). */
+  private static final String TLD_FIELD = "tld";
+
+  private Configuration conf;
+  private TLDEntries tldEntries;
+
+  /**
+   * Multiplies <code>initScore</code> by the boost of each known tld found in
+   * the <code>tld</code> field of the document.
+   * @return the boosted score; equal to <code>initScore</code> when the
+   *         document has no <code>tld</code> values or none of them is known
+   */
+  public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+
+    String[] tlds = doc.getValues(TLD_FIELD);
+    float boost = 1.0f;
+
+    // guard against a null value array
+    if (tlds != null) {
+      for (String tld : tlds) {
+        TLDEntry entry = tldEntries.get(tld);
+        if (entry != null) {
+          boost *= entry.getBoost();
+        }
+      }
+    }
+    return initScore * boost;
+  }
+
+  /** Pass-through: returns <code>adjust</code> unchanged. */
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
+      ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
+      int validCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** Pass-through: returns <code>initSort</code> unchanged. */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  /** No-op. */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List inlinked) throws ScoringFilterException {
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /** Stores the configuration and obtains the shared {@link TLDEntries}. */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    tldEntries = TLDEntries.getInstance(conf);
+  }
+
+}
Index: src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html	(revision 0)
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Scoring plugin.</p><p></p>
+</body>
+</html>
Index: src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDEntry.java
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDEntry.java	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDEntry.java	(revision 0)
@@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.tld;
+
+/**
+ * (From wikipedia) A top-level domain (TLD) is the last part of an
+ * Internet domain name; that is, the letters which follow the final
+ * dot of any domain name. For example, in the domain name
+ * <code>www.website.com</code>, the top-level domain is <code>com</code>.
+ * <p>
+ * The fields of an entry are fixed at construction time.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * @see <a href="http://www.iana.org/">IANA</a>
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">Top-level domain</a>
+ */
+public class TLDEntry {
+
+  /**
+   * Enumeration of the status of the tld. Please see top-level-domains.xml.
+   */
+  public enum Status {
+    INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED,
+    PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE
+  }
+
+  private final String domain;
+  private final Status status;
+  private final float boost;
+
+  /**
+   * @param domain the tld itself, e.g. <code>com</code>
+   * @param status the status of the tld
+   * @param boost  the boost value associated with the tld
+   */
+  public TLDEntry(String domain, Status status, float boost) {
+    this.domain = domain;
+    this.status = status;
+    this.boost = boost;
+  }
+
+  /** A country code tld entry, which additionally carries a country name. */
+  public static class CCTLDEntry extends TLDEntry {
+    private final String countryName;
+
+    public CCTLDEntry(String domain, Status status, float boost, String countryName) {
+      super(domain, status, boost);
+      this.countryName = countryName;
+    }
+
+    public String getCountryName() {
+      return countryName;
+    }
+  }
+
+  public String getDomain() {
+    return domain;
+  }
+
+  public Status getStatus() {
+    return status;
+  }
+
+  public float getBoost() {
+    return boost;
+  }
+
+}
Index: src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDReader.java
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDReader.java	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDReader.java	(revision 0)
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.tld;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.tld.TLDEntry.CCTLDEntry;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+
+/**
+ * For parsing xml files containing tlds.
+ * Parsed xml files should validate against
+ * <code>top-level-domains.xsd</code>; this reader does not run schema
+ * validation itself, it only checks for the elements it reads.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+class TLDReader {
+
+  private static final Log LOG = LogFactory.getLog(TLDReader.class);
+
+  /**
+   * Parses <code>input</code> and adds every tld found under the
+   * <code>itlds</code>, <code>gtlds</code> and <code>cctlds</code> elements
+   * to <code>tldEntries</code>. The stream is not explicitly closed here.
+   * @throws IOException if the document cannot be parsed, its root element
+   *         is not <code>tlds</code>, or one of the three sections is missing
+   */
+  void read(TLDEntries tldEntries, InputStream input) throws IOException {
+    try {
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      factory.setIgnoringComments(true);
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(new InputSource(input));
+
+      Element root = document.getDocumentElement();
+      if (root == null || !root.getTagName().equals("tlds")) {
+        throw new IOException("xml file is not valid");
+      }
+      readITLDs(tldEntries, section(root, "itlds"));
+      readGTLDs(tldEntries, section(root, "gtlds"));
+      readCCTLDs(tldEntries, section(root, "cctlds"));
+    }
+    catch (ParserConfigurationException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw wrap(ex);
+    }
+    catch (SAXException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw wrap(ex);
+    }
+  }
+
+  /** Adds every <code>tld</code> element below <code>el</code> as an itld. */
+  void readITLDs(TLDEntries tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addITLD(readGTLD((Element) children.item(i)));
+    }
+  }
+
+  /** Adds every <code>tld</code> element below <code>el</code> as a gtld. */
+  void readGTLDs(TLDEntries tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addGTLD(readGTLD((Element) children.item(i)));
+    }
+  }
+
+  /** Adds every <code>tld</code> element below <code>el</code> as a cctld. */
+  void readCCTLDs(TLDEntries tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addCCTLD(readCCTLD((Element) children.item(i)));
+    }
+  }
+
+  /**
+   * Builds a {@link TLDEntry} from the <code>domain</code> attribute and the
+   * <code>status</code> and <code>boost</code> children of <code>el</code>.
+   */
+  TLDEntry readGTLD(Element el) {
+    String domain = el.getAttribute("domain");
+    TLDEntry.Status status = TLDEntry.Status.valueOf(requiredText(el, "status"));
+    float boost = Float.parseFloat(requiredText(el, "boost"));
+    return new TLDEntry(domain, status, boost);
+  }
+
+  /**
+   * Builds a {@link CCTLDEntry}; like {@link #readGTLD(Element)} but also
+   * reads the <code>country</code> child.
+   */
+  CCTLDEntry readCCTLD(Element el) {
+    String domain = el.getAttribute("domain");
+    TLDEntry.Status status = TLDEntry.Status.valueOf(requiredText(el, "status"));
+    float boost = Float.parseFloat(requiredText(el, "boost"));
+    // getNodeValue() on an element node is always null, so the country name
+    // has to be taken from the text child of <country>.
+    String countryName = requiredText(el, "country");
+    return new CCTLDEntry(domain, status, boost, countryName);
+  }
+
+  /**
+   * Returns the first descendant of <code>root</code> named
+   * <code>tagName</code>.
+   * @throws IOException if there is no such element
+   */
+  private static Element section(Element root, String tagName)
+      throws IOException {
+    Node node = root.getElementsByTagName(tagName).item(0);
+    if (node == null) {
+      throw new IOException("xml file is not valid: missing <" + tagName + ">");
+    }
+    return (Element) node;
+  }
+
+  /**
+   * Returns the trimmed value of the first child node of the first
+   * <code>tagName</code> descendant of <code>tld</code>.
+   * @throws IllegalArgumentException if that element or its value is missing
+   */
+  private static String requiredText(Element tld, String tagName) {
+    NodeList nodes = tld.getElementsByTagName(tagName);
+    Node text = nodes.getLength() > 0 ? nodes.item(0).getFirstChild() : null;
+    String value = (text == null) ? null : text.getNodeValue();
+    if (value == null) {
+      throw new IllegalArgumentException("tld '" + tld.getAttribute("domain")
+          + "' has no <" + tagName + "> value");
+    }
+    return value.trim();
+  }
+
+  /** Wraps a parser failure in an IOException, keeping it as the cause. */
+  private static IOException wrap(Exception cause) {
+    IOException wrapped = new IOException(cause.getMessage());
+    wrapped.initCause(cause);
+    return wrapped;
+  }
+}
Index: src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDEntries.java
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDEntries.java	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/util/tld/TLDEntries.java	(revision 0)
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.tld;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.tld.TLDEntry.CCTLDEntry;
+
+/**
+ * Storage class for <code>TLDEntry</code>s.
+ * Note: this class is a singleton; the entries are loaded once, when
+ * {@link #getInstance(Configuration)} creates the instance.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDEntries {
+  private static final Log LOG = LogFactory.getLog(TLDEntries.class);
+
+  private HashMap<String, TLDEntry> itlds = new HashMap<String, TLDEntry>();
+  private HashMap<String, TLDEntry> gtlds = new HashMap<String, TLDEntry>();
+  private HashMap<String, CCTLDEntry> cctlds = new HashMap<String, CCTLDEntry>();
+
+  private static TLDEntries instance;
+
+  /**
+   * Private ctor: loads the resource named by <code>top.level.domains.file</code>
+   * through the class loader. Failures are logged rather than thrown, so the
+   * instance may hold only part of the file's entries, or none.
+   */
+  private TLDEntries(Configuration conf) {
+    String file = conf.get("top.level.domains.file", "top-level-domains.xml");
+    InputStream input = this.getClass().getClassLoader().getResourceAsStream(file);
+    if (input == null) {
+      // getResourceAsStream returns null when the resource cannot be found
+      LOG.warn("Top level domains file not found: " + file);
+      return;
+    }
+    try {
+      new TLDReader().read(this, input);
+    }
+    catch (Exception ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+    }
+    finally {
+      try {
+        input.close();
+      }
+      catch (IOException ex) {
+        LOG.warn(StringUtils.stringifyException(ex));
+      }
+    }
+  }
+
+  /**
+   * Singleton instance, lazy instantiation. <code>conf</code> is only read by
+   * the call that creates the instance; later calls return that instance
+   * regardless of the configuration they pass.
+   * @param conf configuration naming the tld file
+   * @return the shared instance
+   */
+  public static synchronized TLDEntries getInstance(Configuration conf) {
+    if (instance == null) {
+      instance = new TLDEntries(conf);
+    }
+    return instance;
+  }
+
+  void addITLD(TLDEntry tld) {
+    itlds.put(tld.getDomain(), tld);
+  }
+
+  void addGTLD(TLDEntry tld) {
+    gtlds.put(tld.getDomain(), tld);
+  }
+
+  void addCCTLD(CCTLDEntry tld) {
+    cctlds.put(tld.getDomain(), tld);
+  }
+
+  /** return whether the extension is a itld */
+  public boolean isITLD(String extension) {
+    return itlds.containsKey(extension);
+  }
+
+  /** return whether the extension is a gtld */
+  public boolean isGTLD(String extension) {
+    return gtlds.containsKey(extension);
+  }
+
+  /** return whether the extension is a cctld */
+  public boolean isCCTLD(String extension) {
+    return cctlds.containsKey(extension);
+  }
+
+  /** return whether the extension is a itld, gtld or cctld */
+  public boolean isTLD(String extension) {
+    return
+        gtlds.containsKey(extension)
+        || cctlds.containsKey(extension)
+        || itlds.containsKey(extension);
+  }
+
+  /**
+   * Return the TLDEntry of the extension. gtlds are consulted first, then
+   * cctlds, then itlds.
+   * @param extension the tld
+   * @return TLDEntry if tld is either itld or gtld,
+   *         CCTLDEntry if tld is cctld,
+   *         null if tld does not exist
+   */
+  public TLDEntry get(String extension) {
+    TLDEntry entry = gtlds.get(extension);
+    if (entry == null) {
+      entry = cctlds.get(extension);
+    }
+    if (entry == null) {
+      entry = itlds.get(extension);
+    }
+    return entry;
+  }
+
+}
Index: src/plugin/tld/src/java/org/apache/nutch/util/tld/package.html
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/util/tld/package.html	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/util/tld/package.html	(revision 0)
@@ -0,0 +1,6 @@
+<html>
+<body>
+<p>Contains classes to parse and store Top Level Domain 
+Information as defined by IANA.</p><p></p>
+</body>
+</html>
Index: src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java	(revision 0)
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.tld;
+
+import java.net.URL;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.tld.TLDEntries;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.hadoop.conf.Configuration;
+
+
+/**
+ * Indexing filter that adds the top level domain extension(s) of a page's
+ * host to the Lucene document in a field named "tld".
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDIndexingFilter implements IndexingFilter {
+  public static final Log LOG = LogFactory.getLog(TLDIndexingFilter.class);
+
+  private Configuration conf;
+  private TLDEntries tlds;
+  /** Whether the "tld" field is indexed (searchable) besides being stored. */
+  private boolean index = false;
+
+  /**
+   * Adds one stored "tld" field per trailing host label that is a known TLD;
+   * at most the last two labels are examined and the first label never is.
+   * Failures are logged as warnings and the document is still returned.
+   */
+  public Document filter(Document doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
+    throws IndexingException {
+
+    try {
+      String[] parts = new URL(urlText.toString()).getHost().split("\\.");
+      // the value is always stored; it is only indexed when configured so
+      Field.Index mode = index ? Field.Index.UN_TOKENIZED : Field.Index.NO;
+
+      // walk backwards over at most two labels, never reaching the first one
+      for (int i = parts.length - 1; i >= parts.length - 2 && i > 0; i--) {
+        if (!tlds.isTLD(parts[i])) {
+          break; // tlds only appear at the end
+        }
+        doc.add(new Field("tld", parts[i], Field.Store.YES, mode));
+      }
+    } catch (Exception ex) {
+      // best effort: keep the URL in the message and the stack trace in the log
+      LOG.warn("Could not add tld field for url " + urlText, ex);
+    }
+
+    return doc;
+  }
+
+  /** Stores the configuration and reads "top.level.domains.index" (default false). */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    tlds = TLDEntries.getInstance(conf);
+    index = conf.getBoolean("top.level.domains.index", false);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}
Index: src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html
===================================================================
--- src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html	(revision 0)
+++ src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html	(revision 0)
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Indexing plugin.</p><p></p>
+</body>
+</html>
Index: src/plugin/tld/plugin.xml
===================================================================
--- src/plugin/tld/plugin.xml	(revision 0)
+++ src/plugin/tld/plugin.xml	(revision 0)
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="tld"
+   name="Top Level Domain Plugin"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="tld.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.tld"
+              name="Top Level Domain Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="TLDIndexingFilter"
+                      class="org.apache.nutch.indexer.tld.TLDIndexingFilter"/>
+   </extension>
+
+   <extension id="org.apache.nutch.scoring.tld"
+              name="Top Level Domain Scoring Filter"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.tld.TLDScoringFilter"
+                      class="org.apache.nutch.scoring.tld.TLDScoringFilter" />
+   </extension>
+
+
+</plugin>

Property changes on: src/plugin/tld/plugin.xml
___________________________________________________________________
Name: svn:executable
   + *

Index: src/plugin/tld/build.xml
===================================================================
--- src/plugin/tld/build.xml	(revision 0)
+++ src/plugin/tld/build.xml	(revision 0)
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="tld" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

Property changes on: src/plugin/tld/build.xml
___________________________________________________________________
Name: svn:executable
   + *

Index: src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
===================================================================
--- src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java	(revision 500263)
+++ src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java	(working copy)
@@ -87,7 +87,7 @@
 
   /** Use {@link CrawlDatum#getScore()}. */
   public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
-    return datum.getScore();
+    return datum.getScore() * initSort;
   }
 
   /** Increase the score by a sum of inlinked scores. */
@@ -149,6 +149,6 @@
 
   /** Dampen the boost value by scorePower.*/
   public float indexerScore(Text url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
-    return (float)Math.pow(dbDatum.getScore(), scorePower);
+    return (float)Math.pow(dbDatum.getScore(), scorePower) * initScore;
   }
 }
Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml	(revision 500263)
+++ src/plugin/build.xml	(working copy)
@@ -68,6 +68,7 @@
      <ant dir="summary-basic" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
      <ant dir="summary-lucene" target="deploy"/>
+     <ant dir="tld" target="deploy"/>
      <ant dir="urlfilter-automaton" target="deploy"/>
      <ant dir="urlfilter-prefix" target="deploy"/>
      <ant dir="urlfilter-regex" target="deploy"/>
@@ -157,6 +158,7 @@
     <ant dir="subcollection" target="clean"/>
     <ant dir="summary-basic" target="clean"/>
     <ant dir="summary-lucene" target="clean"/>
+    <ant dir="tld" target="clean"/>
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>
