Index: pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java	(revision 1027422)
+++ pdfbox/src/main/java/org/apache/pdfbox/ExtractText.java	(working copy)
@@ -28,6 +28,7 @@
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
 import org.apache.pdfbox.util.PDFText2HTML;
+import org.apache.pdfbox.util.PDFText2XML;
 import org.apache.pdfbox.util.PDFTextStripper;
 
 /**
@@ -47,6 +48,7 @@
     private static final String SORT = "-sort";
     private static final String IGNORE_BEADS = "-ignoreBeads";
     private static final String HTML = "-html";  // jjb - added simple HTML output
+    private static final String XML = "-xml";  // drodrigguez - added XML output
     private static final String FORCE = "-force"; //enables pdfbox to skip corrupt objects
 
     /**
@@ -68,6 +70,7 @@
     {
         boolean toConsole = false;
         boolean toHTML = false;
+        boolean toXML = false;
         boolean force = false;
         boolean sort = false;
         boolean separateBeads = true;
@@ -113,6 +116,11 @@
                 toHTML = true;
                 ext = ".html";
             }
+            else if(args[i].equals(XML)) {
+                toXML = true;
+                ext = ".xml";
+            }
+            
             else if( args[i].equals( SORT ) )
             {
                 sort = true;
@@ -227,7 +235,11 @@
                 }
                 else
                 {
-                    stripper = new PDFTextStripper(encoding);
+                    if(toXML) {
+                        stripper = new PDFText2XML(encoding);
+                    } else {
+                        stripper = new PDFTextStripper(encoding);
+                    }
                 }
                 stripper.setForceParsing( force );
                 stripper.setSortByPosition( sort );
@@ -260,6 +272,7 @@
             "  -encoding  <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
             "  -console                     Send text to console instead of file\n" +
             "  -html                        Output in HTML format instead of raw text\n" +
+            "  -xml                         Output in XML format instead of raw text\n" +
             "  -sort                        Sort the text before writing\n" +
             "  -ignoreBeads                 Disables the separation by beads\n" +
             "  -force                       Enables pdfbox to ignore corrupt objects\n" +
Index: pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java	(revision 1027422)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/Matrix.java	(working copy)
@@ -295,11 +295,26 @@
          * sqrt(x2) =
          * abs(x)
          */
+
+        /**
+         * DR: The previous code didn't take into account that
+         * the rotation angle &theta; could be 180&ordm;, in which case
+         * single[1] and single[3] would be zero.
+         *
+         * If single[1] and single[3] are both zero we have to check
+         * the sign of single[0].
+         */
         if( !(single[1]==0.0f && single[3]==0.0f) )
         {
             xScale = (float)Math.sqrt(Math.pow(single[0], 2)+
                                       Math.pow(single[1], 2));
         }
+
+        if( single[1]==0.0f && single[3]==0.0f && single[0] < 0.0f )
+        {
+            xScale = -xScale;
+        }
+
         return xScale;
     }
 
@@ -315,10 +330,55 @@
             yScale = (float)Math.sqrt(Math.pow(single[3], 2)+
                                       Math.pow(single[4], 2));
         }
+
+        if( single[1]==0.0f && single[3]==0.0f && single[0] < 0.0f)
+        {
+            yScale = -yScale;
+        }
         return yScale;
     }
 
     /**
+     * Get the rotation angle of this matrix, computed from (single[0], single[1])
+     * and, when both are zero, from (single[3], single[4]).
+     *
+     * @return The rotation angle in radians (PI for a pure 180 degree rotation),
+     *         or 0 when the matrix carries no rotation.
+     */
+    public float getRotation()
+    {
+        // Without off-diagonal terms only a negative single[0] denotes a rotation.
+        boolean offDiagonal = single[1] != 0.0f || single[3] != 0.0f;
+        if( !offDiagonal && !(single[0] < 0.0f) )
+        {
+            return 0.0f;
+        }
+
+        // Length of (single[0], single[1]). For a pure 180 degree rotation it
+        // equals -single[0], so the acos below yields PI.
+        float xScale = (float)Math.sqrt(Math.pow(single[0], 2)+
+                                  Math.pow(single[1], 2));
+        if( xScale != 0 )
+        {
+            if( single[0] != 0.0f )
+            {
+                return (float)Math.acos(single[0]/xScale);
+            }
+            return (float)Math.asin(single[1]/xScale);
+        }
+
+        // single[0] and single[1] are zero: fall back to single[3] and single[4].
+        float yScale = (float)Math.sqrt(Math.pow(single[3], 2)+
+                                  Math.pow(single[4], 2));
+        if( yScale != 0 )
+        {
+            // single[0] is zero on this path, so only single[4] is consulted.
+            return (float)Math.asin(single[4]/yScale);
+        }
+        return 0.0f;
+    }
+
+    /**
      * Get the x position in the matrix.
      * @return The x-position.
      */
Index: pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java	(revision 1027422)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java	(working copy)
@@ -168,8 +168,8 @@
         {
             showCharacter = false;
             String textCharacter = text.getCharacter();
-            float textX = text.getX();
-            float textY = text.getY();
+            float textX = text.getXDirAdj();
+            float textY = text.getYDirAdj();
             List<TextPosition> sameTextCharacters = this.characterListMapping.get( textCharacter );
             if( sameTextCharacters == null )
             {
@@ -194,12 +194,12 @@
             {
                 TextPosition character = (TextPosition)sameTextCharacters.get( i );
                 String charCharacter = character.getCharacter();
-                float charX = character.getX();
-                float charY = character.getY();
+                float charX = character.getXDirAdj();
+                float charY = character.getYDirAdj();
                 //only want to suppress
 
                 if( charCharacter != null &&
-                        //charCharacter.equals( textCharacter ) &&
+                        Math.abs(text.getDir()-character.getDir()) < 0.3/Math.PI &&
                         within( charX, textX, tolerance ) &&
                         within( charY,
                                 textY,
Index: pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2XML.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2XML.java	(revision 0)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2XML.java	(revision 0)
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+import java.io.IOException;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+
+
+/**
+ * Wraps the stripped text in XML: each word is written independently as a
+ * t element with its position, size and rotation, inside line (l) and
+ * page (p) elements. It also includes PDF metadata information.
+ *
+ * @author David Rodriguez - http://www.divisait.com -
+ * @version $Revision$
+ * @since   PDFBOX 1.4
+ */
+public class PDFText2XML extends PDFTextStripper
+{
+    /**
+     * Initial capacity of the buffer used to build the document header.
+     */
+    private static final int INITIAL_PDF_TO_XML_BYTES = 8192;
+
+    /**
+     * Constructor.
+     * @param encoding The output encoding; UTF-8 is used when this is null.
+     * @throws IOException If there is an error during initialization.
+     */
+    public PDFText2XML(String encoding) throws IOException
+    {
+        super((encoding==null)?"UTF-8":encoding);
+        setLineSeparator(systemLineSeparator);
+        setParagraphStart("<l>"+systemLineSeparator);
+        setParagraphEnd("</l>"+systemLineSeparator);
+        setPageStart("<p>");
+        setPageEnd("</p>"+systemLineSeparator);
+        // Articles (beads) get no element of their own, only a line break.
+        setArticleStart(systemLineSeparator);
+        setArticleEnd(systemLineSeparator);
+    }
+
+    /**
+     * {@inheritDoc}
+     */
+    @Override
+    public void endDocument(PDDocument pdf) throws IOException
+    {
+        output.write("</document>");
+    }
+
+    /**
+     * Writes one word as a t element. Its attributes hold the word position
+     * (x, y), size (w, h) and rotation (t); rtl="true" is added when the
+     * normalized text differs from the original text of the word.
+     *
+     * @param word The original word, with its position.
+     * @param normalizedWord The normalized text of the word.
+     * @throws IOException If there is a problem writing to the output.
+     */
+    @Override
+    protected void writeWord(Word word,
+                             String normalizedWord)
+    throws IOException {
+        boolean rtl = !normalizedWord.equals(word.getWord());
+        StringBuilder sb = new StringBuilder();
+        sb.append("<t x=\"").append(word.getWordX()).append("\" y=\"").append(word.getWordY());
+        sb.append("\" w=\"").append(word.getWordMaxWidth()).append("\" h=\"").append(word.getWordMaxHeight());
+        // Close the t attribute here so that rtl becomes an attribute of its own.
+        sb.append("\" t=\"").append(word.getWordTheta()).append("\"");
+        if(rtl) {
+            sb.append(" rtl=\"true\"");
+        }
+        sb.append(">");
+        sb.append(escapeXML(normalizedWord));
+        sb.append("</t>").append(systemLineSeparator);
+        output.write(sb.toString());
+    }
+
+    /**
+     * Writes the XML declaration, the opening document element and, when at
+     * least one entry is set, a meta element with the document information.
+     */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        super.startDocument(pdf);
+        StringBuilder buf = new StringBuilder(INITIAL_PDF_TO_XML_BYTES);
+        buf.append("<?xml version=\"1.0\" encoding=\"").append(outputEncoding).append("\"?>").append(systemLineSeparator);
+        buf.append("<document>").append(systemLineSeparator);
+        String title = document.getDocumentInformation().getTitle();
+        String author = document.getDocumentInformation().getAuthor();
+        String creator = document.getDocumentInformation().getCreator();
+        String keywords = document.getDocumentInformation().getKeywords();
+        String subject = document.getDocumentInformation().getSubject();
+        if(title!=null || author!=null || creator!=null || keywords!=null || subject!=null) {
+            buf.append("<meta>").append(systemLineSeparator);
+            appendMetaElement(buf, "title", title);
+            appendMetaElement(buf, "author", author);
+            appendMetaElement(buf, "creator", creator);
+            appendMetaElement(buf, "keywords", keywords);
+            appendMetaElement(buf, "subject", subject);
+            buf.append("</meta>").append(systemLineSeparator);
+        }
+        output.write(buf.toString());
+    }
+
+    /**
+     * Appends one escaped metadata element to the buffer; null values are skipped.
+     */
+    private void appendMetaElement(StringBuilder buf, String tag, String value)
+    {
+        if(value!=null) {
+            buf.append('<').append(tag).append('>').append(escapeXML(value));
+            buf.append("</").append(tag).append('>').append(systemLineSeparator);
+        }
+    }
+
+    /**
+     * Escape special XML characters; illegal control characters are dropped.
+     *
+     * @param chars The text to escape.
+     * @return The escaped text.
+     */
+    protected String escapeXML(String chars)
+    {
+        StringBuilder escaped = new StringBuilder(chars.length());
+        for (int i = 0; i < chars.length(); i++)
+        {
+            char c = chars.charAt(i);
+            if (c == '"') {
+                escaped.append("&quot;");
+            } else if (c == '&') {
+                escaped.append("&amp;");
+            } else if (c == '<') {
+                escaped.append("&lt;");
+            } else if (c == '>') {
+                escaped.append("&gt;");
+            } else if (c == '\t' || c == '\n' || c == '\r') {
+                // the only control characters that XML 1.0 allows
+                escaped.append("&#").append((int)c).append(';');
+            } else if (c >= 32) {
+                escaped.append(c);
+            }
+            // other control characters are dropped: XML 1.0 does not allow them
+        }
+        return escaped.toString();
+    }
+}

Property changes on: pdfbox\src\main\java\org\apache\pdfbox\util\PDFText2XML.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java	(revision 1027422)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java	(working copy)
@@ -631,28 +631,16 @@
                     previousAveCharWidth = -1;
                 }
 
-                float positionX;
-                float positionY;
-                float positionWidth;
-                float positionHeight;
+                /**
+                 * DR: We always use adjusted coordinates, we don't really mind
+                 * page orientation, but text orientation.
+                 */
+                float positionX = position.getXDirAdj();
+                float positionY = position.getYDirAdj();;
+                float positionWidth = position.getWidthDirAdj();
+                float positionHeight = position.getHeightDir();
+                float positionTheta = position.getDir();
 
-                /* If we are sorting, then we need to use the text direction
-                 * adjusted coordinates, because they were used in the sorting. */
-                if (getSortByPosition())
-                {
-                    positionX = position.getXDirAdj();
-                    positionY = position.getYDirAdj();
-                    positionWidth = position.getWidthDirAdj();
-                    positionHeight = position.getHeightDir();
-                }
-                else
-                {
-                    positionX = position.getX();
-                    positionY = position.getY();
-                    positionWidth = position.getWidth();
-                    positionHeight = position.getHeight();
-                }
-
                 //The current amount of characters in a word
                 int wordCharCount = position.getIndividualWidths().length;
 
@@ -722,9 +710,15 @@
                     /* XXX BC: In theory, this check should really check if the next char is in full range
                      * seen in this line. This is what I tried to do with minYTopForLine, but this caused a lot
                      * of regression test failures.  So, I'm leaving it be for now. */
-                    if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine))
+    
+                    /* DR: We also compare theta: text drawn at a different rotation angle
+                     * belongs to a different line. A small deviation is tolerated; note that
+                     * the threshold 0.3/Math.PI is in radians (about 5.5 degrees, not 0.3 degree).
+                     */
+                    if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine) ||
+                       Math.abs(positionTheta-lastPosition.getTextPosition().getDir()) > 0.3/Math.PI)
                     {
-                        writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+                        writeLine(normalize(line),isRtlDominant,hasRtl);
                         line.clear();
 
                         lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition);
@@ -779,7 +773,7 @@
             // print the final line
             if (line.size() > 0)
             {
-                writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
+                writeLine(normalize(line),isRtlDominant,hasRtl);
                 writeParagraphEnd();
             }
 
@@ -828,6 +822,21 @@
     {
         output.write(getWordSeparator());
     }
+    
+    /**
+     * Write a single word to the output stream. This base implementation
+     * writes only the normalized text and ignores the word object itself.
+     *
+     * @param   word            The original word (with its position, etc.)
+     * @param   normalizedWord  The normalized text of the word.
+     * @throws IOException
+     *             If there is a problem writing the word to the output.
+     */
+    protected void writeWord(Word word, String normalizedWord) 
+    throws IOException
+    {
+        output.write(normalizedWord);
+    }    
 
     /**
      * Write the string in TextPosition to the output stream.
@@ -877,8 +886,8 @@
         {
             showCharacter = false;
             String textCharacter = text.getCharacter();
-            float textX = text.getX();
-            float textY = text.getY();
+            float textX = text.getXDirAdj();
+            float textY = text.getYDirAdj();
             List<TextPosition> sameTextCharacters = (List<TextPosition>)characterListMapping.get( textCharacter );
             if( sameTextCharacters == null )
             {
@@ -903,11 +912,12 @@
             {
                 TextPosition character = sameTextCharacters.get( i );
                 String charCharacter = character.getCharacter();
-                float charX = character.getX();
-                float charY = character.getY();
+                float charX = character.getXDirAdj();
+                float charY = character.getYDirAdj();
                 //only want to suppress
 
                 if( charCharacter != null &&
+                        Math.abs(text.getDir()-character.getDir()) < 0.3/Math.PI &&
                         //charCharacter.equals( textCharacter ) &&
                         within( charX, textX, tolerance ) &&
                         within( charY,
@@ -1803,21 +1813,38 @@
      * Write a list of string containing a whole line of a document.
      * @param line a list with the words of the given line
      * @param isRtlDominant determines if rtl or ltl is dominant
+     * @param hasRtl determines if lines contains rtl formatted text(parts)
      * @throws IOException if something went wrong
      */
-    private void writeLine(List<String> line, boolean isRtlDominant)throws IOException{
+    private void writeLine(List<Word> line, boolean isRtlDominant, boolean hasRtl)
+    throws IOException{
+        
         int numberOfStrings = line.size();
         if (isRtlDominant) {
             for(int i=numberOfStrings-1; i>=0; i--){
                 if (i > 1)
                     writeWordSeparator();
-                writeString(line.get(i));
+                
+                String word = line.get(i).getWord();
+                if (hasRtl) {
+                    word = normalize.makeLineLogicalOrder(word,isRtlDominant);
+                }
+                word = normalize.normalizePres(word);
+                
+                writeWord(line.get(i),word);
             }
         }
         else {
             for(int i=0; i<numberOfStrings; i++){
-                writeString(line.get(i));
-                if (!isRtlDominant && i < numberOfStrings-1)
+                String word = line.get(i).getWord();
+                if (hasRtl) {
+                    word = normalize.makeLineLogicalOrder(word,isRtlDominant);
+                }
+                word = normalize.normalizePres(word);                
+                
+                writeWord(line.get(i),word);
+
+                if (i < numberOfStrings-1)
                     writeWordSeparator();
             }
         }
@@ -1826,36 +1853,45 @@
     /**
      * Normalize the given list of TextPositions.
      * @param line list of TextPositions
-     * @param isRtlDominant determines if rtl or ltl is dominant 
-     * @param hasRtl determines if lines contains rtl formatted text(parts)
      * @return a list of strings, one string for every word
      */
-    private List<String> normalize(List<TextPosition> line, boolean isRtlDominant, boolean hasRtl){
-        LinkedList<String> normalized = new LinkedList<String>();
-        StringBuilder lineBuilder = new StringBuilder();
-        for(TextPosition text : line){
-            if (text instanceof WordSeparator) {
-                String lineStr = lineBuilder.toString();
-                if (hasRtl) {
-                    lineStr = normalize.makeLineLogicalOrder(lineStr,isRtlDominant);
+    private List<Word> normalize(List<TextPosition> line){
+        /* DR: Group the tokens of this line into a list of words. */
+        Word currentWord = null;
+
+        ArrayList<Word> words = new ArrayList<Word>();
+
+        for(TextPosition text : line) {
+            if(text instanceof WordSeparator) {
+                // No word may be open here (e.g. two separators in a row).
+                if(currentWord != null && !currentWord.isEmpty()) {
+                    words.add(currentWord);
                 }
-                lineStr = normalize.normalizePres(lineStr);
-                normalized.add(lineStr);
-                lineBuilder = new StringBuilder();
+                currentWord = null;
+            } else {
+                // DR: Start a new word if none is open and add the token to it.
+                // A token may hold one character or several; in the latter case
+                // spaces inside it split it into several words (presumably done
+                // by Word.addTextPosition - confirm against the Word class).
+                if(currentWord == null) {
+                    currentWord = new Word();
+                }
+                int result = currentWord.addTextPosition(text);
+                if(result == -1) {
+                    // A return of -1 closes the current word: keep it unless empty.
+                    if(!currentWord.isEmpty()) {
+                        words.add(currentWord);
+                    }
+                    currentWord = null;
+                }
+
             }
-            else {
-                lineBuilder.append(text.getCharacter());
-            }
         }
-        if (lineBuilder.length() > 0) {
-            String lineStr = lineBuilder.toString();
-            if (hasRtl) {
-                lineStr = normalize.makeLineLogicalOrder(lineStr,isRtlDominant);
-            }
-            lineStr = normalize.normalizePres(lineStr);
-            normalized.add(lineStr);
-        }
-        return normalized;
+        // Add the last word if it is not empty.
+        if(currentWord!=null && !currentWord.isEmpty()) {
+            words.add(currentWord);
+        }
+        return words;
     }
 
     /**
Index: pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java	(revision 1027422)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/TextPosition.java	(working copy)
@@ -36,7 +36,7 @@
     private float endY;
 
     private float maxTextHeight; // maximum height of text, in display units
-    private int rot; // 0, 90, 180, 270 degrees of page rotation
+    private float rot; // 0, 90, 180, 270 degrees of page rotation
     private float x = Float.NEGATIVE_INFINITY;
     private float y = Float.NEGATIVE_INFINITY;
     private float pageHeight;
@@ -92,12 +92,11 @@
         this.endX = textPositionEnd.getXPosition();
         this.endY = textPositionEnd.getYPosition();
 
-        this.rot = page.findRotation();
-        // make sure it is 0 to 270 and no negative numbers
-        if(this.rot < 0)
-        {
-            rot += 360;
+        int _rot = page.findRotation();
+        if(_rot < 0 ) {
+            _rot += 360;
         }
+        this.rot = ((float)_rot)/(float)Math.PI;
 
         this.maxTextHeight = maxFontH;
         this.pageHeight = page.findMediaBox().getHeight();
@@ -148,7 +147,12 @@
         this.endX = textPositionEnd.getXPosition();
         this.endY = textPositionEnd.getYPosition();
 
-        this.rot = pageRotation;
+        int _rot = pageRotation;
+        if(_rot < 0 ) {
+            _rot += 360;
+        }
+        this.rot = ((float)_rot)/(float)Math.PI;
+
         // make sure it is 0 to 270 and no negative numbers
         if(this.rot < 0)
         {
@@ -189,90 +193,54 @@
     /**
      * Return the direction/orientation of the string in this object
      * based on its text matrix.
-     * @return The direction of the text (0, 90, 180, or 270)
+     * @return The direction of the text
      */
     public float getDir() 
     {
-        float a = textPos.getValue(0,0);
-        float b = textPos.getValue(0,1);
-        float c = textPos.getValue(1,0);
-        float d = textPos.getValue(1,1);
-
-        // 12 0   left to right
-        // 0 12 
-        if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
-        {
-            return 0;
-        }
-        // -12 0   right to left (upside down)
-        // 0 -12
-        else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
-        {
-            return 180;
-        }
-        // 0  12    up
-        // -12 0 
-        else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
-        {
-            return 90;
-        }
-        // 0  -12   down
-        // 12 0 
-        else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
-        {
-            return 270;
-        }
-        return 0;
+        float theta = textPos.getRotation();
+        if(theta < 0) theta = theta+(float)Math.PI;
+        return theta;
     }
 
     /**
      * Return the X starting coordinate of the text, adjusted by 
-     * the given rotation amount.  The rotation adjusts where the 0,0
-     * location is relative to the text. 
+     * the given rotation amount.  
      *  
-     * @param rotation Rotation to apply (0, 90, 180, or 270).  0 will perform no adjustments. 
+     * @param rotation Rotation to apply
      * @return X coordinate
      */
     private float getXRot(float rotation)
     {
-        if (rotation == 0)
-        {
-            return textPos.getValue(2,0);
-        }
-        else if (rotation == 90)
-        {
-            return textPos.getValue(2,1);
-        }
-        else if (rotation == 180)
-        {
-            return pageWidth - textPos.getValue(2,0);
-        }
-        else if (rotation == 270)
-        {
-            return pageHeight - textPos.getValue(2,1);
-        }
-        return 0;
+        float [] rot = new float[]{(float)Math.cos(rotation),(float)Math.sin(rotation)};    
+        
+        // We obtain xPos and yPos, in order to compute
+        // coordinates transformation
+        float xPosition = textPos.getXPosition();
+        float yPosition = textPos.getYPosition();
+        
+        float pageWidthRotated = Math.abs(pageWidth*rot[0]+pageHeight*rot[1]);
+        
+        float xRotated = (xPosition*rot[0] + yPosition*rot[1]);
+        if(xRotated < 0 ) xRotated += pageWidthRotated;
+        return xRotated;
     }
 
     /**
      * This will get the page rotation adjusted x position of the character.
-     * This is adjusted based on page rotation so that the upper 
-     * left is 0,0. 
      *
      * @return The x coordinate of the character.
      */
     public float getX()
     {
+
         if(x==Float.NEGATIVE_INFINITY){
-        	x = getXRot(rot);
+            x = getXRot(rot);
         }
         return x;
     }
 
     /**
-     * This will get the text direction adjusted x position of the character.
-     * This is adjusted based on text direction so that the first character
-     * in that direction is in the upper left at 0,0.
+     * This will get the text direction adjusted x position of the character
      *
      * @return The x coordinate of the text.
      */
@@ -282,72 +250,48 @@
     }
 
     /** 
-     * This will get the y position of the character with 0,0 in lower left. 
-     * This will be adjusted by the given rotation. 
-     * @param rotation Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
+     * This will get the y position this will be adjusted by the rotation
      * 
+     * @param rotation Rotation to apply
+     * 
      * @return The y coordinate of the text
      */
-    private float getYLowerLeftRot(float rotation)
+    private float getYRot(float rotation)
     {
-        if (rotation == 0)
-        {
-            return textPos.getValue(2,1);
-        }
-        else if (rotation == 90)
-        {
-            return pageWidth - textPos.getValue(2,0);
-        }
-        else if (rotation == 180)
-        {
-            return pageHeight - textPos.getValue(2,1);
-        }
-        else if (rotation == 270)
-        {
-            return textPos.getValue(2,0);
-        }
-        return 0;
+        
+        float [] rot = new float[]{(float)Math.cos(rotation),(float)Math.sin(rotation)};    
+        
+        // We obtain xPos and yPos, in order to compute
+        // coordinates transformation
+        float xPosition = textPos.getXPosition();
+        float yPosition = textPos.getYPosition();
+    
+    
+        // We compute pageHeight rotation
+        return (-xPosition*rot[1] + yPosition*rot[0]);
     }
 
     /**
-     * This will get the y position of the text, adjusted so that 0,0 is upper left and 
-     * it is adjusted based on the page rotation. 
+     * This will get the y position of the text
      *
      * @return The adjusted y coordinate of the character.
      */
     public float getY()
     {
     	if(y==Float.NEGATIVE_INFINITY){
-            if ((rot == 0) || (rot == 180))
-            {
-                y = pageHeight - getYLowerLeftRot(rot);
-            }
-            else 
-            {
-                y = pageWidth - getYLowerLeftRot(rot);
-            }
+            y = getYRot(rot); 
     	}
     	return y;
     }
 
     /**
-     * This will get the y position of the text, adjusted so that 0,0 is upper left and 
-     * it is adjusted based on the text direction. 
+     * This will get the y position of the text
      *
      * @return The adjusted y coordinate of the character.
      */
     public float getYDirAdj()
     {
-        float dir = getDir();
-        // some PDFBox code assumes that the 0,0 point is in upper left, not lower left
-        if ((dir == 0) || (dir == 180))
-        {
-            return pageHeight - getYLowerLeftRot(dir);
-        }
-        else
-        {
-            return pageWidth - getYLowerLeftRot(dir);
-        }
+        return getYRot(getDir());
     }
 
 
@@ -355,19 +299,17 @@
     /**
      * Get the length or width of the text, based on a given rotation. 
      * 
-     * @param rotation Rotation that was used to determine coordinates (0,90,180,270)
+     * @param rotation Rotation that was used to determine coordinates; Math.cos/Math.sin read it as radians
      * @return Width of text in display units
      */
     private float getWidthRot(float rotation)
     {
-        if ((rotation == 90) || (rotation == 270)) 
-        {
-            return Math.abs(endY - textPos.getYPosition());
-        }
-        else 
-        {
-            return Math.abs(endX - textPos.getXPosition());
-        }
+        float cos = (float)Math.cos(rotation);
+        float sin = (float)Math.sin(rotation);
+        // Project the signed deltas: abs() of each delta first would cancel out for oblique angles
+        float deltaX = endX - textPos.getXPosition();
+        float deltaY = endY - textPos.getYPosition();
+        return Math.abs(deltaX*cos + deltaY*sin);
     }
 
     /**
@@ -517,6 +459,13 @@
 
         double tp2Xstart = tp2.getXDirAdj();
         double tp2Xend = tp2.getXDirAdj() + tp2.getWidthDirAdj();
+        
+        /*
+         * We also check the rotation angle.
+         */
+        if(Math.abs(getDir()-tp2.getDir()) > 0.3/Math.PI) {
+            return false;
+        }
 
         /*
          * No X overlap at all so return as soon as possible. 
Index: pdfbox/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java	(revision 1027422)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/TextPositionComparator.java	(working copy)
@@ -54,15 +54,15 @@
         
         float pos1YBottom = pos1.getYDirAdj();
         float pos2YBottom = pos2.getYDirAdj();
-        // note that the coordinates have been adjusted so 0,0 is in upper left
-        float pos1YTop = pos1YBottom - pos1.getHeightDir();
-        float pos2YTop = pos2YBottom - pos2.getHeightDir();
+        // y grows upwards here (0,0 is in lower left), so the top lies above the bottom
+        float pos1YTop = pos1YBottom + pos1.getHeightDir();
+        float pos2YTop = pos2YBottom + pos2.getHeightDir();
 
         float yDifference = Math.abs( pos1YBottom-pos2YBottom);
         //we will do a simple tolerance comparison.
         if( yDifference < .1 ||
-            (pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom) ||
-            (pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom))
+            (pos2YBottom <= pos1YTop && pos2YBottom >= pos1YBottom) ||
+            (pos1YBottom <= pos2YTop && pos1YBottom >= pos2YBottom))
         {
             if( x1 < x2 )
             {
@@ -77,7 +77,7 @@
                 retval = 0;
             }
         }
-        else if( pos1YBottom < pos2YBottom )
+        else if( pos1YBottom > pos2YBottom )
         {
             retval = -1;
         }
Index: pdfbox/src/main/java/org/apache/pdfbox/util/Word.java
===================================================================
--- pdfbox/src/main/java/org/apache/pdfbox/util/Word.java	(revision 0)
+++ pdfbox/src/main/java/org/apache/pdfbox/util/Word.java	(revision 0)
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.util;
+
+import java.util.ArrayList;
+
+/**
+ * Stores several TextPositions that together form a word; a word
+ * is a run of positions delimited by a space.
+ * <pre>
+ * $Log$
+ * </pre>
+ *
+ * @author David Rodriguez - http://www.divisait.com -
+ * @since   PDFBOX 1.4
+ */
+public class Word {
+    /** Word start x position, taken from the first position added. */
+    private float wordX;
+
+    /** Word start y position, taken from the first position added. */
+    private float wordY;
+
+    /** Word rotation angle, taken from the first position added. */
+    private float wordTheta;
+
+    /** Height of the tallest position added so far. */
+    private float wordMaxHeight;
+
+    /** Sum of the widths of all the positions added so far. */
+    private float wordMaxWidth;
+
+    /** Word data; null until the first position is added. */
+    private StringBuilder word;
+
+    /**
+     * Adds a position to the current word. We assume that a single
+     * TextPosition never stores several words or word fragments.
+     *
+     * @param   text    The new position to add
+     * @return  0 if the position is added to the current word, or -1
+     *          if its character is null or the word separator (a space)
+     */
+    public int addTextPosition(TextPosition text) {
+        if(text.getCharacter()==null || text.getCharacter().equals(" ")) {
+            return -1;
+        }
+
+        if(word == null) {
+            // The first position fixes the word origin and rotation
+            word = new StringBuilder();
+            wordX = text.getX();
+            wordY = text.getY();
+            wordTheta = text.getTextPos().getRotation();
+        }
+
+        // The word is as tall as its tallest position
+        if(text.getHeight() > wordMaxHeight) {
+            wordMaxHeight = text.getHeight();
+        }
+        wordMaxWidth += text.getWidth();
+
+        // We append the text character to the current word
+        word.append(text.getCharacter());
+        return 0;
+    }
+
+    /** @return the x position of the first position of the word */
+    public float getWordX() {
+        return wordX;
+    }
+
+    /** @return the y position of the first position of the word */
+    public float getWordY() {
+        return wordY;
+    }
+
+    /** @return the rotation angle of the first position of the word */
+    public float getWordTheta() {
+        return wordTheta;
+    }
+
+    /** @return the height of the tallest position of the word */
+    public float getWordMaxHeight() {
+        return wordMaxHeight;
+    }
+
+    /** @return the sum of the widths of the positions of the word */
+    public float getWordMaxWidth() {
+        return wordMaxWidth;
+    }
+
+    /**
+     * Returns the accumulated word.
+     *
+     * @return the word text, or an empty string if no position was added yet
+     */
+    public String getWord() {
+        return word == null ? "" : word.toString();
+    }
+
+    /** @return true if no character has been added to the word yet */
+    public boolean isEmpty() {
+        return (word == null || word.length()==0);
+    }
+}

Property changes on: pdfbox\src\main\java\org\apache\pdfbox\util\Word.java
___________________________________________________________________
Added: svn:eol-style
   + native

