Index: src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
===================================================================
--- src/main/java/org/apache/pdfbox/text/PDFTextStripper.java	(revision 1704612)
+++ src/main/java/org/apache/pdfbox/text/PDFTextStripper.java	(working copy)
@@ -17,6 +17,9 @@
 package org.apache.pdfbox.text;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.text.Normalizer;
@@ -29,14 +32,20 @@
 import java.util.Map;
 import java.util.SortedMap;
 import java.util.SortedSet;
+import java.util.StringTokenizer;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.Vector;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.common.PDStream;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
 import org.apache.pdfbox.util.QuickSort;
@@ -58,6 +67,8 @@
     private static float defaultDropThreshold = 2.5f;
     private static final boolean useCustomQuickSort;
     
+    private static final Log LOG = LogFactory.getLog(PDFTextStripper.class);
+    
     // enable the ability to set the default indent/drop thresholds
     // with -D system properties:
     //    pdftextstripper.indent
@@ -297,8 +308,14 @@
 
         for (PDPage page : pages)
         {
+//            currentPageNo++;
+//            if (page.hasContents())
+//            {
+//                processPage(page);
+//            }
+            PDStream contentStream = page.getStream();
             currentPageNo++;
-            if (page.hasContents())
+            if (contentStream != null)
             {
                 processPage(page);
             }
@@ -1765,28 +1782,28 @@
      * @return a list of strings, one string for every word
      */
     private List<WordWithTextPositions> normalize(List<LineItem> line, boolean isRtlDominant,
-                                                  boolean hasRtl)
+            boolean hasRtl)
     {
         List<WordWithTextPositions> normalized = new LinkedList<WordWithTextPositions>();
         StringBuilder lineBuilder = new StringBuilder();
         List<TextPosition> wordPositions = new ArrayList<TextPosition>();
         // concatenate the pieces of text in opposite order if RTL is dominant
-        if (isRtlDominant)
+        // if (isRtlDominant)
+        // {
+        // int numberOfPositions = line.size();
+        // for (int i = numberOfPositions - 1; i >= 0; i--)
+        // {
+        // lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, line.get(i));
+        // }
+        // }
+        // else
+        // {
+        for (LineItem item : line)
         {
-            int numberOfPositions = line.size();
-            for (int i = numberOfPositions - 1; i >= 0; i--)
-            {
-                lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, line.get(i));
-            }
-        }
-        else
-        {
-            for (LineItem item : line)
-            {
-                lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item);
-            }
+            lineBuilder = normalizeAdd(normalized, lineBuilder, wordPositions, item);
         }
-        if (lineBuilder.length() > 0) 
+        // }
+        if (lineBuilder.length() > 0)
         {
             normalized.add(createWord(lineBuilder.toString(), wordPositions));
         }
@@ -1847,15 +1864,321 @@
         }
         if (builder == null)
         {
-            return word;
+            return handleDirection(word);
         }
         else
         {
             builder.append(word.substring(p, q));
-            return builder.toString();
+            return handleDirection(builder.toString());
+        }
+    }
+    /**
+     * Brings the characters of the given text into reading order. The result depends on how much context the text
+     * carries: for a complete line it is best, for single words or characters the order of the characters in a
+     * word or of the words in a line may be wrong, because the dominant direction is determined per call.
+     * <p>
+     * RTL dominant text is reversed as a whole and its LTR runs are corrected afterwards; LTR dominant text only
+     * gets its RTL runs corrected.
+     *
+     * @param word The word that shall be processed
+     * @return new word with the correct direction of the containing characters
+     */
+    private String handleDirection(String word)
+    {
+        final String result;
+
+        if (isRTLDominant(word, true))
+        {
+            // RTL dominance: turn the whole text around, then put the embedded LTR runs (which were reversed
+            // along with everything else) back into their original order
+            String reversed = new StringBuilder(word).reverse().toString();
+            result = correctLTR(reversed);
+        }
+        else
+        {
+            // LTR dominance: the text keeps its order, only embedded RTL runs are turned around
+            result = correctRTL(word);
+        }
+
+        return result;
+    }
+
+    /**
+     * Reverses every run of strong RTL characters that is embedded in the given (LTR dominant) string.
+     * <p>
+     * Each run is replaced exactly once, at the position where it was found. The matched text is never used as a
+     * regular expression itself, so characters such as '*' or '+' inside a run are harmless, and a run that equals
+     * the reverse of another run is not turned around a second time.
+     *
+     * @param string The string that shall be corrected
+     * @return word in the correct order and direction
+     */
+    private String correctRTL(String string)
+    {
+        // NOTE(review): the second bracket expression is a single (nested) character class, so besides RTL
+        // characters and whitespace a run may also continue with '*' or '+' - left untouched on purpose
+        String pattern = "[" + STRONG_RTL_CHARS + "]+" + "[[\\s]*[" + STRONG_RTL_CHARS + "]+]*";
+
+        Matcher m = Pattern.compile(pattern).matcher(string);
+        StringBuffer corrected = new StringBuffer(string.length());
+
+        while (m.find())
+        {
+            String reversematched = new StringBuilder(m.group()).reverse().toString();
+            // quoteReplacement keeps '$' and backslashes of the replacement literal
+            m.appendReplacement(corrected, Matcher.quoteReplacement(reversematched));
         }
+        m.appendTail(corrected);
+        return corrected.toString();
     }
 
+    // private static String WEAK_CHARS =
+    // "\u0000-\u0040\u005B-\u0060\u007B-\u00BF\u00D7\u00F7\u02B9-\u02FF\u2000-\u2BFF\u2010-\u2029\u202C\u202F-\u2BFF";
+    // private static String STRONG_LTR_CHARS =
+    // "A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF" +
+    // "\u2C00-\uFB1C\uFE00-\uFE6F\uFEFD-\uFFFF\u0000-\u0040\u005B-\u0060\u007B-\u00BF\u00D7\u00F7\u02B9-\u02FF\u2000-\u2BFF\u2010-\u2029\u202C\u202F-\u2BFF";
+    //
+    // bodies of regex character classes (without the brackets) for strong LTR / strong RTL characters
+    private static final String STRONG_LTR_CHARS =
+            "A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF"
+            + "\u2C00-\uFB1C\uFE00-\uFE6F\uFEFD-\uFFFF";
+    private static final String STRONG_RTL_CHARS = "\u0591-\u07FF\uFB1D-\uFDFF\uFE70-\uFEFC";
+
+    // regex character class of characters that have a mirrored counterpart; backslashes keep them literal
+    private static final String MIRRORING_PATTERN_ARRAY = "["
+            + "\\\u0028\\\u0029\\\u003C\\\u003E\\\u005B\\\u005D\\\u007B\\\u007D\\\u00AB\\\u00BB"
+            + "\\\u0F3A-\\\u0F3D" + "\\\u169B\\\u169C"
+            + "\\\u2039\\\u203A\\\u2045\\\u2046\\\u207D\\\u207E\\\u208D\\\u208E"
+            + "\\\u2208-\\\u220D\\\u2215\\\u223C\\\u223D\\\u2243" + "\\\u2252-\\\u2255"
+            + "\\\u2264-\\\u226B" + "\\\u226E\\\u226F" + "\\\u2270-\\\u228B" + "\\\u228F-\u2292"
+            + "\\\u2298" + "\\\u22A2\\\u22A3\\\u22A6\\\u22A8\\\u22A9\\\u22AB" + "\\\u22B0-\\\u22B7"
+            + "\\\u22C9-\\\u22CD" + "\\\u22D0\\\u22D1\\\u22D6-\\\u22ED"
+            + "\\\u22F0-\\\u22F4\\\u22F6\\\u22F7" + "\\\u22FA-\\\u22FE" + "\\\u2308-\\\u230B"
+            + "\\\u2329\\\u232A" + "\\\u2768-\u2775" + "\\\u27C3-\\\u27C6"
+            + "\\\u27C8\\\u27C9\\\u27CB\\\u27CD" + "\\\u27D5\\\u27D6\\\u27DD\\\u27DE"
+            + "\\\u27E2-\\\u27EF" + "\\\u2983-\\\u2998" + "\\\u29B8"
+            + "\\\u29C0\\\u29C1\\\u29C4\\\u29C5\\\u29CF" + "\\\u29D0-\\\u29D2" + "\u29D4\\\u29D5"
+            + "\\\u29D8-\\\u29DB" + "\\\u29F5\\\u29F8\\\u29F9\\\u29FC\\\u29FD"
+            + "\\\u2A2B-\\\u2A2E" + "\\\u2A34\\\u2A35\\\u2A3C\\\u2A3D" + "\\\u2A64\\\u2A65"
+            + "\\\u2A79\\\u2A7A\\\u2A7D-\\\u2A7F" + "\\\u2A80-\\\u2A84" + "\\\u2A8B\\\u2A8C"
+            + "\\\u2A91-\\\u2A9C" + "\\\u2AA1\\\u2AA2" + "\\\u2AA6-\\\u2AAD"
+            + "\\\u2AAF\\\u2AB0\\\u2AB3\\\u2AB4" + "\\\u2ABB-\u2AC6" + "\\\u2ACD-\\\u2AD6"
+            + "\\\u2ADE\\\u2AE3-\\\u2AE5" + "\\\u2AEC\\\u2AED\\\u2AF7-\\\u2AFA"
+            + "\\\u2E02-\\\u2E05" + "\\\u2E09\\\u2E0A\\\u2E0C\u2E0D" + "\u2E1C\\\u2E1D"
+            + "\\\u2E20-\\\u2E29" + "\\\u3008-\\\u3011" + "\\\u3014-\\\u301B" + "\\\uFE59-\uFE5E"
+            + "\\\uFE64\\\uFE65" + "\\\uFF08\uFF09" + "\\\uFF1C\\\uFF1E" + "\\\uFF3B\\\uFF3D"
+            + "\\\uFF5B\\\uFF5D\\\uFF5F" + "\\\uFF60\\\uFF62\\\uFF63" + "]";
+    private static final Map<String, String> MIRRORING_CHAR_MAP = new HashMap<String, String>();
+
+    static
+    {
+        // fills MIRRORING_CHAR_MAP once; mirroring is optional, so a missing or malformed file is only logged
+        InputStream input = PDFTextStripper.class.getClassLoader()
+                .getResourceAsStream("org/apache/pdfbox/text/bidi/BidiMirroring.txt");
+        try
+        {
+            parseBidiFile(input);
+        }
+        catch (Exception e)
+        {
+            LOG.warn("Could not parse BidiMirroring.txt, mirroring char map will be empty: " + e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Fills MIRRORING_CHAR_MAP from a stream of "code point; mirrored code point" lines (hexadecimal). Text after a
+     * '#' is a comment, lines without exactly two fields are skipped, and the stream is closed in any case.
+     *
+     * @param inputStream - The bidi file as inputstream, null if the file could not be found
+     * @throws IOException if the stream is missing, cannot be read or contains a malformed code point
+     */
+    private static void parseBidiFile(InputStream inputStream) throws IOException
+    {
+        if (inputStream == null)
+        {
+            throw new IOException("bidi mirroring data not found");
+        }
+        // the evaluated part of each line is plain ASCII, UTF-8 just makes decoding platform independent
+        LineNumberReader rd = new LineNumberReader(new InputStreamReader(inputStream, "UTF-8"));
+        try
+        {
+            for (String s = rd.readLine(); s != null; s = rd.readLine())
+            {
+                int comment = s.indexOf('#'); // ignore comments
+                if (comment != -1)
+                {
+                    s = s.substring(0, comment);
+                }
+                StringTokenizer st = new StringTokenizer(s, ";");
+                if (st.countTokens() != 2)
+                {
+                    continue; // blank, comment-only or unexpected line
+                }
+                try
+                {
+                    // NOTE(review): the cast only supports code points that fit into a single char
+                    char source = (char) Integer.parseInt(st.nextToken().trim(), 16);
+                    char mirrored = (char) Integer.parseInt(st.nextToken().trim(), 16);
+                    MIRRORING_CHAR_MAP.put(String.valueOf(source), String.valueOf(mirrored));
+                }
+                catch (NumberFormatException e)
+                {
+                    throw new IOException("Malformed code point in line " + rd.getLineNumber(), e);
+                }
+            }
+        }
+        finally
+        {
+            rd.close();
+        }
+    }
+
+    /**
+     * Reverses every LTR run (a number or a group of strong LTR characters) that is embedded in the given string and
+     * mirrors the paired neutral characters afterwards. handleDirection passes RTL dominant text that has already
+     * been reversed as a whole, which is why its LTR runs have to be turned back.
+     * <p>
+     * Each run is replaced exactly once, at the position where it was found. The matched text is never used as a
+     * regular expression itself, so characters such as '.', '*' or '+' inside a run neither match unrelated text
+     * nor cause a PatternSyntaxException.
+     *
+     * @param string The string that shall be corrected
+     * @return word in the correct order and direction
+     */
+    private String correctLTR(String string)
+    {
+        // NOTE(review): the last bracket expression is a single (nested) character class, so besides LTR
+        // characters and whitespace a run may also continue with '*' or '+' - left untouched on purpose
+        String pattern = "([0-9][\\.\\/]?)+|[" + STRONG_LTR_CHARS + "]+" + "[[\\s]*["
+                + STRONG_LTR_CHARS + "]+]*";
+
+        Matcher m = Pattern.compile(pattern).matcher(string);
+        StringBuffer corrected = new StringBuffer(string.length());
+
+        while (m.find())
+        {
+            String reversematched = new StringBuilder(m.group()).reverse().toString();
+            // quoteReplacement keeps '$' and backslashes of the replacement literal
+            m.appendReplacement(corrected, Matcher.quoteReplacement(reversematched));
+        }
+        m.appendTail(corrected);
+        string = corrected.toString();
+
+        // we need to process the neutral characters again, because they are inverted if they are inside or at the
+        // borders of RTL text
+        // that means a "{" must be mirrored to a "}" and so on...
+        string = postProcessNeutrals(string);
+
+        return string;
+    }
+
+    /**
+     * Turns the given text into a regular expression that matches exactly this text. All regex control characters
+     * are neutralised, not only brackets, braces and parentheses: a text containing '.', '*', '+', '?', '|', '^',
+     * '$' or a backslash would otherwise match the wrong text or fail to compile.
+     * <p>
+     * The result is meant to be used as the pattern argument of methods like String.replaceAll; it is not the
+     * original text with backslashes inserted.
+     *
+     * @param regex The text that shall be matched literally
+     * @return a regular expression matching the given text literally
+     */
+    private String consolidateRegexGroupMarkers(String regex)
+    {
+        // Pattern.quote wraps the text into a literal section, so nothing inside it is interpreted as regex
+        // syntax, whatever characters the text contains
+        String literal = Pattern.quote(regex);
+        return literal;
+    }
+
+    /**
+     * Mirrors neutral characters: every character of the word that belongs to MIRRORING_PATTERN_ARRAY (characters
+     * with the Bidi_Mirrored=Yes property for which another Unicode character exists whose glyph is typically the
+     * mirror image) and that has an entry in MIRRORING_CHAR_MAP is replaced by the first character of that entry.
+     * All other characters are kept.
+     * <p>
+     * The word must not be null.
+     *
+     * @param word The word that shall be processed
+     * @return word with mirrored neutral characters
+     */
+    private String postProcessNeutrals(String word)
+    {
+        Pattern mirrorable = Pattern.compile(MIRRORING_PATTERN_ARRAY);
+        Matcher hits = mirrorable.matcher(word);
+
+        // the matcher scans the unmodified word, the replacements go into a copy of it; as one hit is always a
+        // single character, the positions in both stay aligned
+        StringBuilder mirrored = new StringBuilder(word);
+
+        while (hits.find())
+        {
+            int position = hits.start();
+            String key = String.valueOf(word.charAt(position));
+            String counterpart = MIRRORING_CHAR_MAP.get(key);
+
+            // characters without a known counterpart stay as they are
+            if (counterpart == null)
+            {
+                continue;
+            }
+
+            mirrored.setCharAt(position, counterpart.charAt(0));
+        }
+
+        return mirrored.toString();
+    }
+
+
+    /**
+     * Returns true if the text is RTL dominant, false otherwise. A text is RTL dominant if it contains more RTL than
+     * LTR characters; characters that are neither are ignored, except for digits, which are counted as LTR
+     * characters if countDigitsAsLTR is set.
+     * <p>
+     * A text with equally many RTL and LTR characters (including the empty text) is not RTL dominant.
+     *
+     * @param text The text we want to check for RTL dominance
+     * @param countDigitsAsLTR Determines, whether digits are counted as LTR characters
+     * @return True if the given text is RTL dominant, false otherwise
+     */
+    private boolean isRTLDominant(String text, boolean countDigitsAsLTR)
+    {
+        int ltrCount = 0;
+        int rtlCount = 0;
+
+        for (char current : text.toCharArray())
+        {
+            switch (Character.getDirectionality(current))
+            {
+                // strong LTR characters and the explicit LTR embedding / override marks
+                case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
+                case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
+                case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
+                    ltrCount++;
+                    break;
+
+                // strong RTL characters and the explicit RTL embedding / override marks
+                case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
+                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
+                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
+                case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
+                    rtlCount++;
+                    break;
+
+                default:
+                    // neither LTR nor RTL: only digits can still contribute, and only on request
+                    if (countDigitsAsLTR && Character.isDigit(current))
+                    {
+                        ltrCount++;
+                    }
+                    break;
+            }
+        }
+
+        // a tie is decided in favour of LTR
+        return rtlCount > ltrCount;
+    }
+
     /**
      * Used within {@link #normalize(List, boolean, boolean)} to handle a {@link TextPosition}.
      * @return The StringBuilder that must be used when calling this method.
