### Eclipse Workspace Patch 1.0
#P PDFBox
Index: pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java	(revision 1033990)
+++ pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java	(working copy)
@@ -24,6 +24,8 @@
 import java.util.List;
 import java.util.NoSuchElementException;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSBoolean;
 import org.apache.pdfbox.cos.COSDictionary;
@@ -45,6 +47,11 @@
  */
 public class PDFStreamParser extends BaseParser
 {
+    /**
+     * Log instance.
+     */
+    private static final Log log = LogFactory.getLog(PDFStreamParser.class);
+    
     private List<Object> streamObjects = new ArrayList<Object>( 100 );
     private RandomAccess file;
     private PDFOperator lastBIToken = null;
@@ -174,8 +181,15 @@
 					{
 						token = parseNextToken();
 					}
-				} catch (IOException e) {
-					throw new RuntimeException(e);
+				} 
+				catch (IOException e)
+				{
+				    if (!forceParsing) {
+				        throw new RuntimeException(e);
+				    }
+				    log.warn("Error parsing next token.", e);
+				    
+				    token = null;
 				}
 			}
 
@@ -397,7 +411,11 @@
                 //(UnderstandingWebSphereClassLoaders.pdf) which has EI as part
                 //of the image data and will stop parsing prematurely if there is
                 //not a check for <whitespace>EI<whitespace>.
-                while( !(isWhitespace( twoBytesAgo ) &&
+                //
+                //the PDF attached to https://issues.apache.org/jira/browse/PDFBOX-789 (pdf_euba.pdf) 
+                //contains the sequence <0x0C>EI<0x0A>. According to 7.3.8 (which is referenced by 8.9.7),
+                //the data portion should end with an end-of-line marker.
+                while( !(isEOL( twoBytesAgo ) &&
                          lastByte == 'E' &&
                          currentByte == 'I' &&
                          isWhitespace() //&&
Index: pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java	(revision 1033990)
+++ pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java	(working copy)
@@ -186,7 +186,13 @@
             char r = (char)pdfSource.read();
             if( r != 'R' )
             {
-                throw new IOException( "expected='R' actual='" + r + "' " + pdfSource );
+                // Some real-world PDFs contain faulty data. If forceParsing is set, assume it's R.
+                String error = "expected='R' actual='" + r + "' " + pdfSource;
+                
+                if (!forceParsing) {
+                    throw new IOException( error );
+                }
+                log.warn( error );
             }
             COSObjectKey key = new COSObjectKey(((COSInteger) number).intValue(),
                     ((COSInteger) generationNumber).intValue());
@@ -485,6 +491,9 @@
                                                 pdfSource.unread( ENDSTREAM );
                                                 return;
                                             }
+                                            else {
+                                                out.write(ENDSTREAM, 0, 8);
+                                            }
                                         }else{
                                             out.write(ENDSTREAM, 0, 7);
                                         }
@@ -506,6 +515,9 @@
                                     pdfSource.unread( ENDOBJ );
                                     return;
                                 }
+                                else {
+                                    out.write(ENDOBJ, 0, 5);
+                                }
                             }else{
                                 out.write(ENDOBJ, 0, 4);
                             }
@@ -1376,7 +1388,10 @@
         }
         catch( NumberFormatException e )
         {
-            pdfSource.unread(intBuffer.toString().getBytes());
+            // In case of an exception it's best not to unread the data because you run the risk
+            // of getting into an infinite loop (especially when forceParsing is true).
+            // For example, with test-integer-too-large.pdf and forceParsing set to true, the parser
+            // gets stuck in an infinite loop when the data is unread.
             throw new IOException( "Error: Expected an integer type, actual='" + intBuffer + "'" );
         }
         return retval;
Index: pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java	(revision 1033990)
+++ pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java	(working copy)
@@ -17,9 +17,8 @@
 package org.apache.pdfbox.pdfparser;
 
 import java.io.File;
-import java.io.InputStream;
 import java.io.IOException;
-
+import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -34,11 +33,8 @@
 import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.exceptions.WrappedIOException;
 import org.apache.pdfbox.io.RandomAccess;
-
 import org.apache.pdfbox.pdmodel.PDDocument;
-
 import org.apache.pdfbox.pdmodel.fdf.FDFDocument;
-
 import org.apache.pdfbox.persistence.util.COSObjectKey;
 
 /**
@@ -125,6 +121,18 @@
     }
 
     /**
+     * Returns true if parsing should be continued. By default, forceParsing is returned.
+     * This can be overridden to add application-specific handling (for example, to stop
+     * parsing when the number of exceptions thrown exceeds a certain number).
+     *
+     * @param e The exception, if available. Can be null if there is no exception available.
+     */
+    protected boolean isContinueOnError(Exception e)
+    {
+        return forceParsing;
+    }
+    
+    /**
      * This will parse the stream and populate the COSDocument object.  This will close
      * the stream when it is done parsing.
      *
@@ -173,7 +181,7 @@
                     }
                     catch(IOException e)
                     {
-                        if(forceParsing)
+                        if(isContinueOnError(e))
                         {
                             /*
                              * Warning is sent to the PDFBox.log and to the Console that
@@ -508,7 +516,11 @@
                 //" genNumber=" + genNum + " key='" + objectKey + "'" );
                 if( !objectKey.equals( "obj" ) )
                 {
-                    throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
+                    if (!isContinueOnError(null) || !objectKey.equals("o")) {
+                        throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
+                    }
+                    //assume that "o" was meant to be "obj" (this is a workaround for
+                    //the PDF attached to PDFBOX-773, Andersens_Fairy_Tales.pdf).
                 }
             }
             else
@@ -577,38 +589,10 @@
                 }
                 else if( !pdfSource.isEOF() )
                 {
-                    try
-                    {
-                        //It is possible that the endobj  is missing, there
-                        //are several PDFs out there that do that so skip it and move on.
-                        Float.parseFloat( endObjectKey );
-                        pdfSource.unread( SPACE_BYTE );
-                        pdfSource.unread( endObjectKey.getBytes() );
-                    }
-                    catch( NumberFormatException e )
-                    {
-                        //we will try again incase there was some garbage which
-                        //some writers will leave behind.
-                        String secondEndObjectKey = readString();
-                        if( !secondEndObjectKey.equals( "endobj" ) )
-                        {
-                            if( isClosing() )
-                            {
-                                //found a case with 17506.pdf object 41 that was like this
-                                //41 0 obj [/Pattern /DeviceGray] ] endobj
-                                //notice the second array close, here we are reading it
-                                //and ignoring and attempting to continue
-                                pdfSource.read();
-                            }
-                            skipSpaces();
-                            String thirdPossibleEndObj = readString();
-                            if( !thirdPossibleEndObj.equals( "endobj" ) )
-                            {
-                                throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' " +
-                                    "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
-                            }
-                        }
-                    }
+                    //It is possible that the endobj is missing; there
+                    //are several PDFs out there that do that. Unread the
+                    //token and assume that endobj was missing.
+                    pdfSource.unread( endObjectKey.getBytes() );
                 }
             }
             skipSpaces();
Index: pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java	(revision 1033990)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java	(working copy)
@@ -30,6 +30,8 @@
 import java.util.Vector;
 import java.util.regex.Pattern;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.cos.COSDocument;
 import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.exceptions.CryptographyException;
@@ -59,6 +61,10 @@
  */
 public class PDFTextStripper extends PDFStreamEngine
 {
+    /**
+     * Log instance.
+     */
+    private static final Log log = LogFactory.getLog(PDFTextStripper.class);
 
     private static final String thisClassName = PDFTextStripper.class.getSimpleName().toLowerCase();
 
@@ -362,7 +368,16 @@
             if( contentStream != null )
             {
                 COSStream contents = contentStream.getStream();
-                processPage( nextPage, contents );
+                try {
+                    processPage( nextPage, contents );
+                }
+                catch(IOException e)
+                {
+                    if (!isForceParsing()) {
+                        throw e;
+                    }
+                    log.warn("Error processing page.", e);
+                }
             }
         }
     }
Index: pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDStream.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDStream.java	(revision 1037894)
+++ pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDStream.java	(working copy)
@@ -164,7 +164,8 @@
         }
         else if( base instanceof COSArray )
         {
-            if (((COSArray)base).size() > 0) {
+            // With certain PDFs, the array contains no items or its first item is null.
+            if (((COSArray)base).size() > 0 && ((COSArray)base).getObject(0) != null) {
                 retval = new PDStream( new COSStreamArray( (COSArray)base ) );
             }
         }
Index: pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/COSStreamArray.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/COSStreamArray.java	(revision 1033990)
+++ pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/COSStreamArray.java	(working copy)
@@ -205,12 +205,17 @@
         for( int i=0;i<streams.size(); i++ )
         {
             COSStream stream = (COSStream)streams.getObject( i );
-            inputStreams.add( stream.getUnfilteredStream() );
-            //handle the case where there is no whitespace in the
-            //between streams in the contents array, without this
-            //it is possible that two operators will get concatenated
-            //together
-            inputStreams.add( new ByteArrayInputStream( inbetweenStreamBytes ) );
+            
+            // a returned stream can sometimes be null when forceParsing is used
+            if (stream != null)
+            {
+                inputStreams.add( stream.getUnfilteredStream() );
+                //handle the case where there is no whitespace in the
+                //between streams in the contents array, without this
+                //it is possible that two operators will get concatenated
+                //together
+                inputStreams.add( new ByteArrayInputStream( inbetweenStreamBytes ) );
+            }
         }
 
         return new SequenceInputStream( inputStreams.elements() );
