Index: tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java	(revision 1579038)
+++ tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java	(working copy)
@@ -17,22 +17,18 @@
 package org.apache.tika.parser.mbox;
 
 import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.io.UnsupportedEncodingException;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
 import java.util.Collections;
-import java.util.Date;
-import java.util.Locale;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -41,8 +37,8 @@
 import org.xml.sax.SAXException;
 
 /**
- * Mbox (mailbox) parser. This version returns the headers for the first email
- * via metadata, which means headers from subsequent emails will be lost.
+ * Mbox (mailbox) parser. This version extracts each mail from Mbox and
+ * uses the DelegatingParser to process each mail.
  */
 public class MboxParser extends AbstractParser {
 
@@ -54,16 +50,11 @@
 
     public static final String MBOX_MIME_TYPE = "application/mbox";
     public static final String MBOX_RECORD_DIVIDER = "From ";
-    private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
-    private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+    public static final int MAIL_MAX_SIZE = 50000000;
 
     private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
     private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
 
-    private enum ParseStates {
-        START, IN_HEADER, IN_CONTENT
-    }
-
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -72,179 +63,56 @@
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, TikaException, SAXException {
+      
+        EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+				new ParsingEmbeddedDocumentExtractor(context));
 
-        InputStreamReader isr;
-        try {
-            // Headers are going to be 7-bit ascii
-            isr = new InputStreamReader(stream, "US-ASCII");
-        } catch (UnsupportedEncodingException e) {
-            throw new TikaException("US-ASCII is not supported!", e);
-        }
-
-        BufferedReader reader = new BufferedReader(isr);
-
+        String charsetName = "windows-1252";
+        
         metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
-        metadata.set(Metadata.CONTENT_ENCODING, "us-ascii");
+        metadata.set(Metadata.CONTENT_ENCODING, charsetName);
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
-        ParseStates parseState = ParseStates.START;
-        String multiLine = null;
-        boolean inQuote = false;
-        int numEmails = 0;
+	    InputStreamReader isr = new InputStreamReader(stream, charsetName);
+	    BufferedReader reader = new BufferedReader(isr);
+	    try{
+	    	String curLine = reader.readLine();
+		    do{
+		    	if(curLine.startsWith(MBOX_RECORD_DIVIDER)){
+		    		
+		    		Metadata mailMetadata = new Metadata();
+		    		mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+		    		mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+			    	
+		    		curLine = reader.readLine();
+			    	
+			    	ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+		    		do{
+		    			
+			    		message.write(curLine.getBytes(charsetName));
+			    		message.write(0x0A);
+			    		curLine = reader.readLine();
+			    		
+			    	}while(curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
+		    		
+		    		ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+		    		message = null;
+		    		
+		    		if(extractor.shouldParseEmbedded(mailMetadata))
+		    			extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+		    	
+		    	}else
+		    		curLine = reader.readLine();
+		    	
+		    }while(curLine != null && !Thread.currentThread().isInterrupted());
+	    	
+	    }finally{
+	    	reader.close();
+	    }
 
-        // We're going to scan, line-by-line, for a line that starts with
-        // "From "
-        for (String curLine = reader.readLine(); curLine != null; curLine = reader.readLine()) {
-            boolean newMessage = curLine.startsWith(MBOX_RECORD_DIVIDER);
-            if (newMessage) {
-                numEmails += 1;
-            }
-
-            switch (parseState) {
-            case START:
-                if (newMessage) {
-                    parseState = ParseStates.IN_HEADER;
-                    newMessage = false;
-                    // Fall through to IN_HEADER
-                } else {
-                    break;
-                }
-
-            case IN_HEADER:
-                if (newMessage) {
-                    saveHeaderInMetadata(numEmails, metadata, multiLine);
-                    multiLine = curLine;
-                } else if (curLine.length() == 0) {
-                    // Blank line is signal that we're transitioning to the content.
-                    saveHeaderInMetadata(numEmails, metadata, multiLine);
-                    parseState = ParseStates.IN_CONTENT;
-
-                    // Mimic what PackageParser does between entries.
-                    xhtml.startElement("div", "class", "email-entry");
-                    xhtml.startElement("p");
-                    inQuote = false;
-                } else if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
-                    multiLine += " " + curLine.trim();
-                } else {
-                    saveHeaderInMetadata(numEmails, metadata, multiLine);
-                    multiLine = curLine;
-                }
-
-                break;
-
-                // TODO - use real email parsing support so we can correctly handle
-                // things like multipart messages and quoted-printable encoding.
-                // We'd also want this for charset handling, where content isn't 7-bit
-                // ascii.
-            case IN_CONTENT:
-                if (newMessage) {
-                    endMessage(xhtml, inQuote);
-                    parseState = ParseStates.IN_HEADER;
-                    multiLine = curLine;
-                } else {
-                    boolean quoted = curLine.startsWith(">");
-                    if (inQuote) {
-                        if (!quoted) {
-                            xhtml.endElement("q");
-                            inQuote = false;
-                        }
-                    } else if (quoted) {
-                        xhtml.startElement("q");
-                        inQuote = true;
-                    }
-
-                    xhtml.characters(curLine);
-
-                    // For plain text email, each line is a real break position.
-                    xhtml.element("br", "");
-                }
-            }
-        }
-
-        if (parseState == ParseStates.IN_HEADER) {
-            saveHeaderInMetadata(numEmails, metadata, multiLine);
-        } else if (parseState == ParseStates.IN_CONTENT) {
-            endMessage(xhtml, inQuote);
-        }
-
         xhtml.endDocument();
     }
-
-    private void endMessage(XHTMLContentHandler xhtml, boolean inQuote) throws SAXException {
-        if (inQuote) {
-            xhtml.endElement("q");
-        }
-
-        xhtml.endElement("p");
-        xhtml.endElement("div");
-    }
-
-    private void saveHeaderInMetadata(int numEmails, Metadata metadata, String curLine) {
-        if ((curLine == null) || (numEmails > 1)) {
-            return;
-        } else if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
-            metadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
-            return;
-        }
-
-        Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
-        if (!headerMatcher.matches()) {
-            return; // ignore malformed header lines
-        }
-
-        String headerTag = headerMatcher.group(1).toLowerCase();
-        String headerContent = headerMatcher.group(2);
-
-        if (headerTag.equalsIgnoreCase("From")) {
-            metadata.set(TikaCoreProperties.CREATOR, headerContent);
-        } else if (headerTag.equalsIgnoreCase("To") ||
-        	headerTag.equalsIgnoreCase("Cc") ||
-        	headerTag.equalsIgnoreCase("Bcc")) {
-            Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
-            if(address.find()) {
-        	metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
-            } else if(headerContent.indexOf('@') > -1) {
-        	metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
-            }
-            
-            String property = Metadata.MESSAGE_TO;
-            if (headerTag.equalsIgnoreCase("Cc")) {
-        	property = Metadata.MESSAGE_CC;
-            } else if (headerTag.equalsIgnoreCase("Bcc")) {
-        	property = Metadata.MESSAGE_BCC;
-            }
-            metadata.add(property, headerContent);
-        } else if (headerTag.equalsIgnoreCase("Subject")) {
-            // TODO Move to title in Tika 2.0
-            metadata.add(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_TITLE, 
-                    headerContent);
-        } else if (headerTag.equalsIgnoreCase("Date")) {
-            try {
-                Date date = parseDate(headerContent);
-                metadata.set(TikaCoreProperties.CREATED, date);
-            } catch (ParseException e) {
-                // ignoring date because format was not understood
-            }
-        } else if (headerTag.equalsIgnoreCase("Message-Id")) {
-            metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
-        } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
-            metadata.set(TikaCoreProperties.RELATION, headerContent);
-        } else if (headerTag.equalsIgnoreCase("Content-Type")) {
-            // TODO - key off content-type in headers to
-            // set mapping to use for content and convert if necessary.
-
-            metadata.add(Metadata.CONTENT_TYPE, headerContent);
-            metadata.set(TikaCoreProperties.FORMAT, headerContent);
-        } else {
-            metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
-        }
-    }
     
-    public static Date parseDate(String headerContent) throws ParseException {
-        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
-        return dateFormat.parse(headerContent);
-    }
-
 }
