Index: conf/schema.xml
===================================================================
--- conf/schema.xml (revision 1418750)
+++ conf/schema.xml (working copy)
@@ -114,6 +114,9 @@
+
+
+
id
content
Index: conf/schema-solr4.xml
===================================================================
--- conf/schema-solr4.xml (revision 1418750)
+++ conf/schema-solr4.xml (working copy)
@@ -346,6 +346,9 @@
+
+
+
id
text
Index: src/java/org/apache/nutch/indexer/solr/SolrWriter.java
===================================================================
--- src/java/org/apache/nutch/indexer/solr/SolrWriter.java (revision 1418743)
+++ src/java/org/apache/nutch/indexer/solr/SolrWriter.java (working copy)
@@ -66,8 +66,8 @@
inputDoc.addField(solrMapping.mapKey(e.getKey()), val2);
String sCopy = solrMapping.mapCopyKey(e.getKey());
- if (sCopy != e.getKey()) {
- inputDoc.addField(sCopy, val2);
+ if (!sCopy.equals(e.getKey())) {
+ inputDoc.addField(sCopy, val2);
}
}
}
Index: src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
===================================================================
--- src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (revision 1418743)
+++ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (working copy)
@@ -44,10 +44,12 @@
/**
* Add (or reset) a few metaData properties as respective fields (if they are
- * available), so that they can be displayed by more.jsp (called by search.jsp).
+ * available), so that they can be accurately used within the search index.
*
- * content-type is indexed to support query by type: last-modifed is indexed to
- * support query by date:
+ * 'lastModified' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
+ * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt
+ * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative
+ * that the content provider wants the filename therein to be used as the title.
*
* Still need to make content-length searchable!
*
@@ -143,7 +145,7 @@
// NUTCH-1010 ContentLength not trimmed
String trimmed = contentLength.toString().trim();
if (!trimmed.isEmpty())
- doc.add("contentLength", trimmed);
+ doc.add("contentLength", trimmed);
}
return doc;
@@ -171,8 +173,23 @@
*/
private NutchDocument addType(NutchDocument doc, WebPage page, String url) {
String mimeType = null;
+ // first try to get the contentType from the HTTP response headers
Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));
- if (contentType == null) {
+ if (contentType != null) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("MoreIndexingFilter: contentType obtained from HttpHeaders.");
+ }
+ }
+ // if not present in HttpHeaders, fall back to the WebPage contentType (independent of log level)
+ else {
+ contentType = page.getContentType();
+ if (LOG.isDebugEnabled()) {
+ if (contentType != null) {
+ LOG.debug("MoreIndexingFilter: contentType obtained from WebPage contentType.");
+ }
+ }
+ }
+ if (contentType == null) {
// Note by Jerome Charron on 20050415:
// Content Type not solved by a previous plugin
// Or unable to solve it... Trying to find it
@@ -200,7 +217,7 @@
// Check if we need to split the content type in sub parts
if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
- String[] parts = getParts(contentType.toString());
+ String[] parts = getParts(mimeType);
for(String part: parts) {
doc.add("type", part);