Index: pom.xml
===================================================================
--- pom.xml (revision 819528)
+++ pom.xml (working copy)
@@ -111,9 +111,15 @@
junit
test
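+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>1.7</version>
+      <scope>test</scope>
+    </dependency>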
-
+
org.apache.felix
Index: src/main/java/org/apache/tika/parser/mbox/MboxParser.java
===================================================================
--- src/main/java/org/apache/tika/parser/mbox/MboxParser.java (revision 0)
+++ src/main/java/org/apache/tika/parser/mbox/MboxParser.java (revision 0)
@@ -0,0 +1,215 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version returns the headers for the first email
+ * via metadata, which means headers from subsequent emails will be lost.
+ */
+public class MboxParser implements Parser {
+    private static final Logger LOGGER = Logger.getLogger(MboxParser.class);
+
+    public static final String MBOX_MIME_TYPE = "application/mbox";
+    public static final String MBOX_RECORD_DIVIDER = "From ";
+    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+
+    private static final String EMAIL_HEADER_METADATA_PREFIX = MboxParser.class.getSimpleName() + "-";
+    private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+
+    private enum ParseStates {
+        START, IN_HEADER, IN_CONTENT
+    }
+
+    /**
+     * Scans the stream line by line: text before the first "From " line is skipped,
+     * a blank line ends a header block, and a body runs to the next "From " line.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, Map context)
+            throws IOException, TikaException, SAXException {
+        InputStreamReader isr;
+        try {
+            // Headers are going to be 7-bit ascii
+            isr = new InputStreamReader(stream, "us-ascii");
+        } catch (UnsupportedEncodingException e) {
+            LOGGER.error("Unexpected exception setting up MboxParser", e);
+            isr = new InputStreamReader(stream);
+        }
+        BufferedReader reader = new BufferedReader(isr);
+
+        metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        ParseStates parseState = ParseStates.START;
+        String multiLine = null; // pending header line, folded continuations appended
+        boolean inQuote = false;
+        int numEmails = 0;
+
+        // Scan line by line, looking for lines that start with "From "
+        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
+            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
+            if (newMessage) {
+                numEmails += 1;
+            }
+            switch (parseState) {
+            case START:
+                if (newMessage) {
+                    parseState = ParseStates.IN_HEADER;
+                    newMessage = false;
+                    // Fall through to IN_HEADER
+                } else {
+                    break;
+                }
+
+            case IN_HEADER:
+                if (newMessage) {
+                    // Pending header belongs to the previous (body-less) message, not this one.
+                    saveHeaderInMetadata(numEmails - 1, metadata, multiLine);
+                    multiLine = curLine;
+                } else if (curLine.length() == 0) {
+                    // Blank line is signal that we're transitioning to the content.
+                    saveHeaderInMetadata(numEmails, metadata, multiLine);
+                    parseState = ParseStates.IN_CONTENT;
+                    // Mimic what PackageParser does between entries.
+                    xhtml.startElement("div", "class", "email-entry");
+                    xhtml.startElement("p");
+                    inQuote = false;
+                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+                    // Folded header: append the continuation to the pending header.
+                    multiLine += " " + curLine.trim();
+                } else {
+                    saveHeaderInMetadata(numEmails, metadata, multiLine);
+                    multiLine = curLine;
+                }
+                break;
+
+            // TODO - use real email parsing support so we can correctly handle things
+            // like multipart messages and quoted-printable encoding. We'd also want
+            // this for charset handling, where content isn't 7-bit ascii.
+            case IN_CONTENT:
+                if (newMessage) {
+                    endMessage(xhtml, inQuote);
+                    parseState = ParseStates.IN_HEADER;
+                    multiLine = curLine;
+                } else {
+                    boolean quoted = curLine.startsWith(">");
+                    if (inQuote) {
+                        if (!quoted) {
+                            xhtml.endElement("q");
+                            inQuote = false;
+                        }
+                    } else if (quoted) {
+                        xhtml.startElement("q");
+                        inQuote = true;
+                    }
+                    xhtml.characters(curLine);
+                    // For plain text email, each line is a real break position.
+                    xhtml.element("br", "");
+                }
+            }
+        }
+
+        if (parseState == ParseStates.IN_HEADER) {
+            saveHeaderInMetadata(numEmails, metadata, multiLine);
+        } else if (parseState == ParseStates.IN_CONTENT) {
+            endMessage(xhtml, inQuote);
+        }
+
+        xhtml.endDocument();
+    }
+
+    /** Closes the quote (if one is open), the paragraph and the div of the current message. */
+    private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
+        if (inQuote) {
+            xhtml.endElement("q");
+        }
+        xhtml.endElement("p");
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Records one complete header line in the metadata. Null lines and headers
+     * of any message after the first are ignored.
+     */
+    private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
+        if ((curLine == null) || (numEmails > 1)) {
+            return;
+        } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+            metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+            return;
+        }
+
+        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+        if (!headerMatcher.matches()) {
+            LOGGER.warn("Malformed email header in mbox file: " + curLine);
+            return;
+        }
+
+        // Fixed locale: the default one may lower-case ASCII unexpectedly (Turkish I).
+        String headerTag = headerMatcher.group(1).toLowerCase(java.util.Locale.ENGLISH);
+        String headerContent = headerMatcher.group(2);
+        if (headerTag.equalsIgnoreCase("From")) {
+            metadata.add(Metadata.AUTHOR, headerContent);
+            metadata.add(Metadata.CREATOR, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, headerContent);
+            metadata.add(Metadata.TITLE, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Date")) {
+            // TODO - parse and convert to ISO format YYYY-MM-DD
+            metadata.add(Metadata.DATE, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+            metadata.add(Metadata.IDENTIFIER, headerContent);
+        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+            metadata.add(Metadata.RELATION, headerContent);
+        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+            // TODO - key off content-type in headers to
+            // set mapping to use for content and convert if necessary.
+            metadata.add(Metadata.CONTENT_TYPE, headerContent);
+            metadata.add(Metadata.FORMAT, headerContent);
+        } else {
+            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+        }
+    }
+
+    /** Parses with an empty context map. */
+    @SuppressWarnings("unchecked")
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, Collections.EMPTY_MAP);
+    }
+}
Property changes on: src/main/java/org/apache/tika/parser/mbox/MboxParser.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Added: svn:eol-style
+ native
Index: src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
===================================================================
--- src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (revision 0)
+++ src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (revision 0)
@@ -0,0 +1,142 @@
+package org.apache.tika.parser.mbox;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.times;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/** Tests for {@link MboxParser}, driven by the mbox files under test-documents. */
+public class MboxParserTest extends TestCase {
+
+    /** Two messages yield two paragraphs, each with its own body text. */
+    public void testSimple() throws Exception {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/simple.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map context = new HashMap();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        verify(handler).startDocument();
+        verify(handler, times(2)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler, times(2)).endElement(XHTMLContentHandler.XHTML, "p", "p");
+        verify(handler).characters("Test content 1".toCharArray(), 0, 14);
+        verify(handler).characters("Test content 2".toCharArray(), 0, 14);
+        verify(handler).endDocument();
+    }
+
+    /** Headers of the message are exposed through the metadata. */
+    public void testHeaders() throws Exception {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/headers.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map context = new HashMap();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        verify(handler).startDocument();
+        verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler).characters("Test content".toCharArray(), 0, 12);
+        verify(handler).endDocument();
+        assertEquals("subject", metadata.get(Metadata.TITLE));
+        assertEquals("subject", metadata.get(Metadata.SUBJECT));
+        assertEquals("", metadata.get(Metadata.AUTHOR));
+        assertEquals("", metadata.get(Metadata.CREATOR));
+        assertEquals("", metadata.get("MboxParser-return-path"));
+        assertEquals("Tue, 9 Jun 2009 23:58:45 -0400", metadata.get(Metadata.DATE));
+    }
+
+    /** A header folded over two lines is joined into a single metadata value. */
+    public void testMultilineHeader() throws Exception {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/multiline.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map context = new HashMap();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        verify(handler).startDocument();
+        verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler).characters("Test content".toCharArray(), 0, 12);
+        verify(handler).endDocument();
+        assertEquals("from xxx by xxx with xxx; date", metadata.get("MboxParser-received"));
+    }
+
+    /** A body line starting with ">" is wrapped in a q element. */
+    public void testQuoted() throws Exception {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/quoted.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map context = new HashMap();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        verify(handler).startDocument();
+        verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"), any(Attributes.class));
+        verify(handler).endElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
+        verify(handler).endElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"));
+        verify(handler).characters("Test content".toCharArray(), 0, 12);
+        verify(handler).characters("> quoted stuff".toCharArray(), 0, 14);
+        verify(handler).endDocument();
+    }
+
+    /** A real-world mailbox produces three paragraphs and three quoted regions. */
+    public void testComplex() throws Exception {
+        Parser parser = new MboxParser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/complex.mbox");
+        ContentHandler handler = mock(DefaultHandler.class);
+        Map context = new HashMap();
+
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        verify(handler).startDocument();
+        verify(handler, times(3)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler, times(3)).endElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"));
+        verify(handler, times(3)).startElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"), any(Attributes.class));
+        verify(handler, times(3)).endElement(eq(XHTMLContentHandler.XHTML), eq("q"), eq("q"));
+        verify(handler).endDocument();
+    }
+
+    /** Opens a classpath test resource, failing with a clear message if it is missing. */
+    private static InputStream getStream(String name) {
+        InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(name);
+        assertNotNull("Test resource not found: " + name, stream);
+        return stream;
+    }
+}
Property changes on: src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
___________________________________________________________________
Added: svn:mime-type
+ text/plain
Added: svn:eol-style
+ native
Index: src/test/resources/test-documents/complex.mbox
===================================================================
--- src/test/resources/test-documents/complex.mbox (revision 0)
+++ src/test/resources/test-documents/complex.mbox (revision 0)
@@ -0,0 +1,291 @@
+From core-user-return-14700-apmail-hadoop-core-user-archive=hadoop.apache.org@hadoop.apache.org Mon Jun 01 04:28:28 2009
+Return-Path:
+Delivered-To: apmail-hadoop-core-user-archive@www.apache.org
+Received: (qmail 19921 invoked from network); 1 Jun 2009 04:28:28 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+ by minotaur.apache.org with SMTP; 1 Jun 2009 04:28:28 -0000
+Received: (qmail 84995 invoked by uid 500); 1 Jun 2009 04:28:38 -0000
+Delivered-To: apmail-hadoop-core-user-archive@hadoop.apache.org
+Received: (qmail 84895 invoked by uid 500); 1 Jun 2009 04:28:38 -0000
+Mailing-List: contact core-user-help@hadoop.apache.org; run by ezmlm
+Precedence: bulk
+List-Help:
+List-Unsubscribe:
+List-Post:
+List-Id:
+Reply-To: core-user@hadoop.apache.org
+Delivered-To: mailing list core-user@hadoop.apache.org
+Received: (qmail 84885 invoked by uid 99); 1 Jun 2009 04:28:38 -0000
+Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 04:28:38 +0000
+X-ASF-Spam-Status: No, hits=1.2 required=10.0
+ tests=SPF_NEUTRAL
+X-Spam-Check-By: apache.org
+Received-SPF: neutral (athena.apache.org: local policy)
+Received: from [69.147.107.21] (HELO mrout2-b.corp.re1.yahoo.com) (69.147.107.21)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 04:28:26 +0000
+Received: from SNV-EXPF01.ds.corp.yahoo.com (snv-expf01.ds.corp.yahoo.com [207.126.227.250])
+ by mrout2-b.corp.re1.yahoo.com (8.13.8/8.13.8/y.out) with ESMTP id n514QYA6099963
+ for ; Sun, 31 May 2009 21:26:35 -0700 (PDT)
+DomainKey-Signature: a=rsa-sha1; s=serpent; d=yahoo-inc.com; c=nofws; q=dns;
+ h=received:user-agent:date:subject:from:to:message-id:
+ thread-topic:thread-index:in-reply-to:mime-version:content-type:
+ content-transfer-encoding:x-originalarrivaltime;
+ b=YVtSNdgjeeSBS1yY3XDolul49i+HrgNG7QszMo9LzGnrwejjgsl5+iUM6EiQgEpV
+Received: from SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.9]) by SNV-EXPF01.ds.corp.yahoo.com with Microsoft SMTPSVC(6.0.3790.3959);
+ Sun, 31 May 2009 21:26:34 -0700
+Received: from 10.66.92.213 ([10.66.92.213]) by SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.58]) with Microsoft Exchange Server HTTP-DAV ;
+ Mon, 1 Jun 2009 04:26:33 +0000
+User-Agent: Microsoft-Entourage/12.17.0.090302
+Date: Mon, 01 Jun 2009 09:56:31 +0530
+Subject: Re: question about when shuffle/sort start working
+From: Jothi Padmanabhan
+To:
+Message-ID:
+Thread-Topic: question about when shuffle/sort start working
+Thread-Index: AcnicSNoBw19cMU8UEaXwAdZ1YYhuw==
+In-Reply-To: <440622.41041.qm@web111005.mail.gq1.yahoo.com>
+Mime-version: 1.0
+Content-type: text/plain;
+ charset="US-ASCII"
+Content-transfer-encoding: 7bit
+X-OriginalArrivalTime: 01 Jun 2009 04:26:34.0501 (UTC) FILETIME=[257EAB50:01C9E271]
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+fetch map outputs for a given map only on the receipt of such events.
+
+Jothi
+
+
+On 5/30/09 10:00 AM, "Jianmin Woo" wrote:
+
+> Hi,
+> I am being confused by the protocol between mapper and reducer. When mapper
+> emitting the (key,value) pair done, is there any signal the mapper send out to
+> hadoop framework in protocol to indicate that map is done and the shuffle/sort
+> can begin for reducer? If there is no this signal in protocol, when the
+> framework begin the shuffle/sort?
+>
+> Thanks,
+> Jianmin
+>
+>
+>
+>
+
+
+From core-user-return-14701-apmail-hadoop-core-user-archive=hadoop.apache.org@hadoop.apache.org Mon Jun 01 05:31:14 2009
+Return-Path:
+Delivered-To: apmail-hadoop-core-user-archive@www.apache.org
+Received: (qmail 38243 invoked from network); 1 Jun 2009 05:31:14 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+ by minotaur.apache.org with SMTP; 1 Jun 2009 05:31:14 -0000
+Received: (qmail 15621 invoked by uid 500); 1 Jun 2009 05:31:24 -0000
+Delivered-To: apmail-hadoop-core-user-archive@hadoop.apache.org
+Received: (qmail 15557 invoked by uid 500); 1 Jun 2009 05:31:24 -0000
+Mailing-List: contact core-user-help@hadoop.apache.org; run by ezmlm
+Precedence: bulk
+List-Help:
+List-Unsubscribe:
+List-Post:
+List-Id:
+Reply-To: core-user@hadoop.apache.org
+Delivered-To: mailing list core-user@hadoop.apache.org
+Received: (qmail 15547 invoked by uid 99); 1 Jun 2009 05:31:24 -0000
+Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 05:31:24 +0000
+X-ASF-Spam-Status: No, hits=2.2 required=10.0
+ tests=HTML_MESSAGE,SPF_PASS
+X-Spam-Check-By: apache.org
+Received-SPF: pass (nike.apache.org: local policy)
+Received: from [68.142.237.94] (HELO n9.bullet.re3.yahoo.com) (68.142.237.94)
+ by apache.org (qpsmtpd/0.29) with SMTP; Mon, 01 Jun 2009 05:31:11 +0000
+Received: from [68.142.237.88] by n9.bullet.re3.yahoo.com with NNFMP; 01 Jun 2009 05:30:50 -0000
+Received: from [67.195.9.82] by t4.bullet.re3.yahoo.com with NNFMP; 01 Jun 2009 05:30:49 -0000
+Received: from [67.195.9.99] by t2.bullet.mail.gq1.yahoo.com with NNFMP; 01 Jun 2009 05:30:49 -0000
+Received: from [127.0.0.1] by omp103.mail.gq1.yahoo.com with NNFMP; 01 Jun 2009 05:28:01 -0000
+X-Yahoo-Newman-Property: ymail-3
+X-Yahoo-Newman-Id: 796121.97519.bm@omp103.mail.gq1.yahoo.com
+Received: (qmail 35264 invoked by uid 60001); 1 Jun 2009 05:30:49 -0000
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=yahoo.com; s=s1024; t=1243834249; bh=R8qzdi/IbLyO8UwpnaujDpT9E+6bJ7nkmZN2803EmRk=; h=Message-ID:X-YMail-OSG:Received:X-Mailer:References:Date:From:Subject:To:In-Reply-To:MIME-Version:Content-Type; b=vq4c6RIDbkuLPYd8mirusIXf6DqTb/IeT55In7W00Y5Sxx1ZiXBb78yE9+TDfXJ0elsEZvqv4ocyvolGE0eGtyYeJA0mZikpRNu6pidxPNpCplOcLHBRz7YQ7iERwv3TagRlWy2Xd3oD9ZeV0A05P7WUOiNNX1PUUJD1IVdrEZo=
+DomainKey-Signature:a=rsa-sha1; q=dns; c=nofws;
+ s=s1024; d=yahoo.com;
+ h=Message-ID:X-YMail-OSG:Received:X-Mailer:References:Date:From:Subject:To:In-Reply-To:MIME-Version:Content-Type;
+ b=6HXZV98ON5vBwmE/xS8stVD0D2F4dkMY7a0suX5KVTb736JdR8G59mqBq/dWcpbFTLiCLtxi18LMb/dU1RKRGOEdn3l3j/jKXhBrhIgfg3qtNskPedXDKBvn7JGXiSkqpA/tUtPjvc0Uuk8/LaA01SQTz40Engg7nD8/EJdIAhA=;
+Message-ID: <592088.35091.qm@web111010.mail.gq1.yahoo.com>
+X-YMail-OSG: KzhhrJYVM1m.MCS6vRpRP2ZZO2PrfnbngosELDCIa91ZqvhJph4RdmzfUW0jw9W04RCSch1K730bPohwNpNBIk2QR_zt4_mfbhfq7YEPkSoz9LSXG90P9vIo5Fc8qyZN0U6vA9gtdyGQTpN5ahvillUH9nAF0TMWv2SvZJLjPlQ0Z0p8oK8ltBwGTgLrM8Jtdn9D29yoRyi3_EpVOfdD9OP.EK50Vr1XwSUYMbnpZ0WGHMwd.Yig7A6Elwadm3YVbfOdx2mfrG.jQsUAxQjRBNvbrOM57.FaE11kHTe9aoBWSeihNg--
+Received: from [216.145.54.7] by web111010.mail.gq1.yahoo.com via HTTP; Sun, 31 May 2009 22:30:49 PDT
+X-Mailer: YahooMailRC/1277.43 YahooMailWebService/0.7.289.10
+References:
+Date: Sun, 31 May 2009 22:30:49 -0700 (PDT)
+From: Jianmin Woo
+Subject: Re: question about when shuffle/sort start working
+To: core-user@hadoop.apache.org
+In-Reply-To:
+MIME-Version: 1.0
+Content-Type: multipart/alternative; boundary="0-1193839393-1243834249=:35091"
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+--0-1193839393-1243834249=:35091
+Content-Type: text/plain; charset=us-ascii
+
+Thanks a lot for your explanation, Jothi.
+
+So is this event generated by hadoop framework? Is there any API in mapper to fire this event? Actually, I am thinking to implement a mapper that will emit some pairs, then fire this event to let the reducer works, the same mapper task then emit some other pairs and repeat. Do you think is this logic feasible by current API?
+
+Thanks,
+Jianmin
+
+
+
+
+
+________________________________
+From: Jothi Padmanabhan
+To: core-user@hadoop.apache.org
+Sent: Monday, June 1, 2009 12:26:31 PM
+Subject: Re: question about when shuffle/sort start working
+
+When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+fetch map outputs for a given map only on the receipt of such events.
+
+Jothi
+
+
+On 5/30/09 10:00 AM, "Jianmin Woo" wrote:
+
+> Hi,
+> I am being confused by the protocol between mapper and reducer. When mapper
+> emitting the (key,value) pair done, is there any signal the mapper send out to
+> hadoop framework in protocol to indicate that map is done and the shuffle/sort
+> can begin for reducer? If there is no this signal in protocol, when the
+> framework begin the shuffle/sort?
+>
+> Thanks,
+> Jianmin
+>
+>
+>
+>
+
+
+
+--0-1193839393-1243834249=:35091--
+
+
+From core-user-return-14702-apmail-hadoop-core-user-archive=hadoop.apache.org@hadoop.apache.org Mon Jun 01 06:04:30 2009
+Return-Path:
+Delivered-To: apmail-hadoop-core-user-archive@www.apache.org
+Received: (qmail 53387 invoked from network); 1 Jun 2009 06:04:29 -0000
+Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3)
+ by minotaur.apache.org with SMTP; 1 Jun 2009 06:04:29 -0000
+Received: (qmail 39066 invoked by uid 500); 1 Jun 2009 06:04:39 -0000
+Delivered-To: apmail-hadoop-core-user-archive@hadoop.apache.org
+Received: (qmail 38970 invoked by uid 500); 1 Jun 2009 06:04:39 -0000
+Mailing-List: contact core-user-help@hadoop.apache.org; run by ezmlm
+Precedence: bulk
+List-Help:
+List-Unsubscribe:
+List-Post:
+List-Id:
+Reply-To: core-user@hadoop.apache.org
+Delivered-To: mailing list core-user@hadoop.apache.org
+Received: (qmail 38955 invoked by uid 99); 1 Jun 2009 06:04:39 -0000
+Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 06:04:39 +0000
+X-ASF-Spam-Status: No, hits=1.2 required=10.0
+ tests=SPF_NEUTRAL
+X-Spam-Check-By: apache.org
+Received-SPF: neutral (athena.apache.org: local policy)
+Received: from [216.145.54.172] (HELO mrout2.yahoo.com) (216.145.54.172)
+ by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 01 Jun 2009 06:04:28 +0000
+Received: from SNV-EXBH01.ds.corp.yahoo.com (snv-exbh01.ds.corp.yahoo.com [207.126.227.249])
+ by mrout2.yahoo.com (8.13.6/8.13.6/y.out) with ESMTP id n5163FGq038852
+ for ; Sun, 31 May 2009 23:03:15 -0700 (PDT)
+DomainKey-Signature: a=rsa-sha1; s=serpent; d=yahoo-inc.com; c=nofws; q=dns;
+ h=received:user-agent:date:subject:from:to:message-id:
+ thread-topic:thread-index:in-reply-to:mime-version:content-type:
+ content-transfer-encoding:x-originalarrivaltime;
+ b=rChE4SCnwtWaZpjhovkiXDKfDiVNdRRvsadSGG9S9bgvOexn/9/5JjEQx1pOR7Nb
+Received: from SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.9]) by SNV-EXBH01.ds.corp.yahoo.com with Microsoft SMTPSVC(6.0.3790.3959);
+ Sun, 31 May 2009 23:03:15 -0700
+Received: from 10.66.92.213 ([10.66.92.213]) by SNV-EXVS08.ds.corp.yahoo.com ([207.126.227.58]) with Microsoft Exchange Server HTTP-DAV ;
+ Mon, 1 Jun 2009 06:03:15 +0000
+User-Agent: Microsoft-Entourage/12.17.0.090302
+Date: Mon, 01 Jun 2009 11:33:13 +0530
+Subject: Re: question about when shuffle/sort start working
+From: Jothi Padmanabhan
+To:
+Message-ID:
+Thread-Topic: question about when shuffle/sort start working
+Thread-Index: AcnifqWrLG6N7GAk7kqy9QalVWfegQ==
+In-Reply-To: <592088.35091.qm@web111010.mail.gq1.yahoo.com>
+Mime-version: 1.0
+Content-type: text/plain;
+ charset="US-ASCII"
+Content-transfer-encoding: 7bit
+X-OriginalArrivalTime: 01 Jun 2009 06:03:15.0462 (UTC) FILETIME=[A7231260:01C9E27E]
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+
+No you cannot raise this event yourself, this event is generated internally
+by the framework.
+
+I am guessing that what you probably want is to have a chain of MapReduce
+Jobs where the output of one is automatically fed as input to another. You
+can look at these classes: JobControl and ChainMapper/ChainReducer.
+
+Jothi
+
+On 6/1/09 11:00 AM, "Jianmin Woo" wrote:
+
+> Thanks a lot for your explanation, Jothi.
+>
+> So is this event generated by hadoop framework? Is there any API in mapper to
+> fire this event? Actually, I am thinking to implement a mapper that will emit
+> some pairs, then fire this event to let the reducer works, the
+> same mapper task then emit some other pairs and repeat. Do you
+> think is this logic feasible by current API?
+>
+> Thanks,
+> Jianmin
+>
+>
+>
+>
+>
+> ________________________________
+> From: Jothi Padmanabhan
+> To: core-user@hadoop.apache.org
+> Sent: Monday, June 1, 2009 12:26:31 PM
+> Subject: Re: question about when shuffle/sort start working
+>
+> When a Mapper completes, MapCompletionEvents are generated. Reducers try to
+> fetch map outputs for a given map only on the receipt of such events.
+>
+> Jothi
+>
+>
+> On 5/30/09 10:00 AM, "Jianmin Woo" wrote:
+>
+>> Hi,
+>> I am being confused by the protocol between mapper and reducer. When mapper
+>> emitting the (key,value) pair done, is there any signal the mapper send out
+>> to
+>> hadoop framework in protocol to indicate that map is done and the
+>> shuffle/sort
+>> can begin for reducer? If there is no this signal in protocol, when the
+>> framework begin the shuffle/sort?
+>>
+>> Thanks,
+>> Jianmin
+>>
+>>
+>>
+>>
+>
+>
+>
+
+
Index: src/test/resources/test-documents/headers.mbox
===================================================================
--- src/test/resources/test-documents/headers.mbox (revision 0)
+++ src/test/resources/test-documents/headers.mbox (revision 0)
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Return-Path:
+Subject: subject
+From:
+Date: Tue, 9 Jun 2009 23:58:45 -0400
+
+Test content
Index: src/test/resources/test-documents/multiline.mbox
===================================================================
--- src/test/resources/test-documents/multiline.mbox (revision 0)
+++ src/test/resources/test-documents/multiline.mbox (revision 0)
@@ -0,0 +1,5 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+Received: from xxx
+ by xxx with xxx; date
+
+Test content
Index: src/test/resources/test-documents/quoted.mbox
===================================================================
--- src/test/resources/test-documents/quoted.mbox (revision 0)
+++ src/test/resources/test-documents/quoted.mbox (revision 0)
@@ -0,0 +1,4 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+
+Test content
+> quoted stuff
\ No newline at end of file
Index: src/test/resources/test-documents/simple.mbox
===================================================================
--- src/test/resources/test-documents/simple.mbox (revision 0)
+++ src/test/resources/test-documents/simple.mbox (revision 0)
@@ -0,0 +1,7 @@
+From envelope-sender-mailbox-name Mon Jun 01 10:00:00 2009
+
+Test content 1
+
+From envelope-sender-mailbox-name Mon Jun 01 11:00:00 2009
+
+Test content 2
Index: src/main/resources/org/apache/tika/tika-config.xml
===================================================================
--- src/main/resources/org/apache/tika/tika-config.xml (revision 819528)
+++ src/main/resources/org/apache/tika/tika-config.xml (working copy)
@@ -160,6 +160,10 @@
audio/x-aiff
+
+ application/mbox
+
+
\ No newline at end of file
Index: src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
===================================================================
--- src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (revision 819528)
+++ src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (working copy)
@@ -179,6 +179,7 @@
+