# Patch summary (commentary only; text before the first "diff" header is not part of any hunk).
#
# Purpose: adds font-size and bold/italic markup hooks to PDFTextStripper and implements them in PDFText2HTML.
#
# PDFText2HTML.java
#   - writeHeader(): changes the header lines appended to buf. Several string literals in this copy
#     are empty; presumably the HTML markup inside them was lost during extraction -- TODO confirm
#     against the original patch before applying.
#   - Adds startFontBlock(int), endFontBlock(), endStyle(char) and startStyle(char), each forwarding a
#     string to super.writeString(...). Only startStyle shows its literal intact ("<" + type + ">").
#
# PDFTextStripper.java
#   - Adds the same four methods as protected hooks with empty bodies.
#   - Adds per-run state: currentFont (initial value 12), activeFontBlock, isBold, isItalic.
#   - When a position does not overlap the current line: after writeLine(...), calls endFontBlock(),
#     then endStyle('b') if isBold, then endStyle('i') if isItalic, and clears activeFontBlock.
#   - For each position: currentFont = (int) position.getFontSizeInPt(). If no font block is active,
#     the lower-cased base font name is tested for "bold" and "italic"; startStyle('b'), startStyle('i')
#     and startFontBlock(currentFont) are called in that order and activeFontBlock is set.
#   - For the final line: calls startFontBlock(currentFont) if no block is active, writes the line,
#     then closes font, 'b' and 'i' as above before writeParagraphEnd().
#
# NOTE(review): styles are opened as b, i, font but closed as font, b, i, so 'b' is closed before 'i'.
#   That is not the reverse of the opening order; confirm whether the crossed nesting is intended.
# NOTE(review): endFontBlock() at a line break is unconditional, i.e. not guarded by activeFontBlock;
#   verify it cannot run before any startFontBlock() call.
# NOTE(review): the final-line branch neither sets activeFontBlock after startFontBlock() nor resets
#   it after endFontBlock(); confirm the flag is reinitialised before the next use.
# NOTE(review): position.getFont().getBaseFont() is dereferenced without a null check, and
#   toLowerCase() is called without an explicit Locale -- confirm both are acceptable.
# NOTE(review): bold/italic are only detected while activeFontBlock is false (start of a line), and
#   currentFont is updated for every position but only used when a block is opened; style or size
#   changes in the middle of a line appear to be ignored -- confirm this is the intended scope.
diff -u -r trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java src/main/java/org/apache/pdfbox/util/PDFText2HTML.java --- trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java 2012-01-23 11:50:35.234525938 +0100 +++ src/main/java/org/apache/pdfbox/util/PDFText2HTML.java 2012-01-23 11:39:29.578528524 +0100 @@ -64,14 +64,12 @@ protected void writeHeader() throws IOException { StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES); - buf.append("\n"); buf.append(""); buf.append("" + escape(getTitle()) + "\n"); if(outputEncoding != null) { buf.append("\n"); + + this.outputEncoding + "\" />\n"); } buf.append("\n"); buf.append("\n"); @@ -232,4 +230,23 @@ } return builder.toString(); } + + protected void startFontBlock(int fontsize) throws IOException + { + super.writeString(""); + } + + protected void endFontBlock() throws IOException + { + super.writeString(""); + } + + protected void endStyle(char type) throws IOException + { + super.writeString(""); + } + + protected void startStyle(char type) throws IOException { + super.writeString("<" + type + ">"); + } } diff -u -r trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java src/main/java/org/apache/pdfbox/util/PDFTextStripper.java --- trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java 2012-01-23 11:50:35.234525938 +0100 +++ src/main/java/org/apache/pdfbox/util/PDFTextStripper.java 2012-01-23 11:32:02.426523898 +0100 @@ -517,6 +517,26 @@ //default is to do nothing } + protected void startFontBlock(int fontsize) + throws IOException + { + } + + protected void endFontBlock() + throws IOException + { + } + + protected void endStyle(char type) + throws IOException + { + } + + protected void startStyle(char type) + throws IOException + { + } + private static final float ENDOFLASTTEXTX_RESET_VALUE = -1; private static final float MAXYFORLINE_RESET_VALUE = -Float.MAX_VALUE; private static final float EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE = 
-Float.MAX_VALUE; @@ -539,6 +559,10 @@ float endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE; float lastWordSpacing = LASTWORDSPACING_RESET_VALUE; float maxHeightForLine = MAXHEIGHTFORLINE_RESET_VALUE; + int currentFont = 12; + boolean activeFontBlock = false; + boolean isBold = false; + boolean isItalic = false; PositionWrapper lastPosition = null; PositionWrapper lastLineStartPosition = null; @@ -734,6 +758,18 @@ if(!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) { writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant); + + endFontBlock(); + if (isBold) { + endStyle('b'); + isBold = false; + } + if (isItalic) { + endStyle('i'); + isItalic = false; + } + activeFontBlock = false; + line.clear(); lastLineStartPosition = @@ -775,6 +811,23 @@ } line.add(position); } + + currentFont = (int)position.getFontSizeInPt(); + + if (!activeFontBlock) { + String baseFont = position.getFont().getBaseFont().toLowerCase(); + if (baseFont.contains("bold")) { + isBold = true; + startStyle('b'); + } + if (baseFont.contains("italic")) { + isItalic = true; + startStyle('i'); + } + startFontBlock(currentFont); + activeFontBlock = true; + } + maxHeightForLine = Math.max( maxHeightForLine, positionHeight ); minYTopForLine = Math.min(minYTopForLine,positionY - positionHeight); lastPosition = current; @@ -792,7 +845,20 @@ // print the final line if (line.size() > 0) { + if (!activeFontBlock) { + startFontBlock(currentFont); + } writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant); + + endFontBlock(); + if (isBold) { + endStyle('b'); + isBold = false; + } + if (isItalic) { + endStyle('i'); + isItalic = false; + } writeParagraphEnd(); }