diff --git i/conf/nutch-default.xml w/conf/nutch-default.xml
index 67326ee..fd84d49 100644
--- i/conf/nutch-default.xml
+++ w/conf/nutch-default.xml
@@ -1504,6 +1504,15 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
+
+ lang.strip.alpha2
+ false
+ If set to true, and if the detected/identified language code
+ carries an alpha-2 country/region part (for instance, the 'US' in 'en-US'),
+ then strip out that part and keep only the language code (here, 'en').
+
+
+
diff --git i/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java w/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
index cb8f8c1..ecab9a1 100644
--- i/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
+++ w/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
@@ -119,17 +119,22 @@ public class HTMLLanguageParser implements HtmlParseFilter {
/** Try to find the document's language from page headers and metadata */
private String detectLanguage(Parse page, DocumentFragment doc) {
+ boolean stripAlpha2 = conf.getBoolean("lang.strip.alpha2", false);
+
String lang = getLanguageFromMetadata(page.getData().getParseMeta());
if (lang == null) {
- LanguageParser parser = new LanguageParser(doc);
+ LanguageParser parser = new LanguageParser(doc, stripAlpha2);
lang = parser.getLanguage();
}
- if (lang != null) {
- return lang;
+ if (lang == null) {
+ lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
}
- lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
+ // Strip out alpha-2 part from the language code, if needed.
+ if (stripAlpha2) {
+ lang = LanguageParser.parseLanguage(lang);
+ }
return lang;
}
@@ -192,8 +197,8 @@ public class HTMLLanguageParser implements HtmlParseFilter {
private String httpEquiv = null;
private String language = null;
- LanguageParser(Node node) {
- parse(node);
+ LanguageParser(Node node, boolean stripAlpha2) {
+ parse(node, stripAlpha2);
if (htmlAttribute != null) {
language = htmlAttribute;
} else if (dublinCore != null) {
@@ -207,7 +212,7 @@ public class HTMLLanguageParser implements HtmlParseFilter {
return language;
}
- void parse(Node node) {
+ void parse(Node node, boolean stripAlpha2) {
NodeWalker walker = new NodeWalker(node);
while (walker.hasNext()) {
@@ -220,8 +225,11 @@ public class HTMLLanguageParser implements HtmlParseFilter {
// Check for the lang HTML attribute
if (htmlAttribute == null) {
- htmlAttribute = parseLanguage(((Element) currentNode)
- .getAttribute("lang"));
+ htmlAttribute = ((Element) currentNode).getAttribute("lang");
+ }
+
+ if (stripAlpha2) {
+ htmlAttribute = parseLanguage(htmlAttribute);
}
// Check for Meta
@@ -236,7 +244,10 @@ public class HTMLLanguageParser implements HtmlParseFilter {
if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) {
Node valueattr = attrs.getNamedItem("content");
if (valueattr != null) {
- dublinCore = parseLanguage(valueattr.getNodeValue());
+ dublinCore = valueattr.getNodeValue();
+ if (stripAlpha2) {
+ dublinCore = parseLanguage(dublinCore);
+ }
}
}
}
@@ -252,7 +263,10 @@ public class HTMLLanguageParser implements HtmlParseFilter {
.toLowerCase())) {
Node valueattr = attrs.getNamedItem("content");
if (valueattr != null) {
- httpEquiv = parseLanguage(valueattr.getNodeValue());
+ httpEquiv = valueattr.getNodeValue();
+ if (stripAlpha2) {
+ httpEquiv = parseLanguage(httpEquiv);
+ }
}
}
}