/**
*
*/
package org.apache.pdfbox.util;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.TextPosition;
/**
* rewrite of PDFText2HTML that extends off of PDFTextStripper2,
* using that class' improved instrumentation to improve the
* tagging of text chunks.
*
* @author m.martinez@ll.mit.edu
*
*/
public class PDFText2HTML2 extends PDFTextStripper2 {
private static final int INITIAL_PDF_TO_HTML_BYTES = 8192;
private boolean onFirstPage = true;
private static final String CR = System.getProperty("line.separator");
/**
* @param encoding
* @throws IOException
*/
public PDFText2HTML2(String encoding) throws IOException {
super(encoding);
this.outputEncoding = encoding;
setLineSeparator(CR);
setParagraphStart("
");
setParagraphEnd("
"+CR);
setPageStart("");
setPageEnd("
"+CR);
}
/**
* Write the header to the output document. Now also writes the tag defining
* the character encoding.
*
* @throws IOException
* If there is a problem writing out the header to the document.
*/
protected void writeHeader() throws IOException
{
StringBuffer buf = new StringBuffer(INITIAL_PDF_TO_HTML_BYTES);
buf.append("\n");
buf.append("");
buf.append("" + getTitle() + "\n");
if(outputEncoding != null)
{
buf.append("\n");
}
buf.append("\n");
buf.append("\n");
super.writeString(buf.toString());
}
/**
* {@inheritDoc}
*/
protected void writePage() throws IOException
{
if (onFirstPage)
{
writeHeader();
onFirstPage = false;
}
super.writePage();
}
/**
* {@inheritDoc}
*/
public void endDocument(PDDocument pdf) throws IOException
{
super.writeString("");
}
/**
* This method will attempt to guess the title of the document using
* either the document properties or the first lines of text.
*
* @return returns the title.
*/
protected String getTitle()
{
String titleGuess = document.getDocumentInformation().getTitle();
if(titleGuess != null && titleGuess.length() > 0)
{
return titleGuess;
}
else
{
Iterator textIter = getCharactersByArticle().iterator();
float lastFontSize = -1.0f;
StringBuffer titleText = new StringBuffer();
while (textIter.hasNext())
{
Iterator textByArticle = ((List) textIter.next()).iterator();
while (textByArticle.hasNext())
{
TextPosition position = (TextPosition) textByArticle.next();
float currentFontSize = position.getFontSize();
//If we're past 64 chars we will assume that we're past the title
//64 is arbitrary
if (currentFontSize != lastFontSize || titleText.length() > 64)
{
if (titleText.length() > 0)
{
return titleText.toString();
}
lastFontSize = currentFontSize;
}
if (currentFontSize > 13.0f)
{ // most body text is 12pt
titleText.append(position.getCharacter());
}
}
}
}
return "";
}
/**
* Write out the article separator (div tag) with proper text direction
* information.
*
* @param isltr true if direction of text is left to right
* @throws IOException
* If there is an error writing to the stream.
*/
protected void startArticle(boolean isltr) throws IOException
{
if (isltr)
{
super.writeString("");
}
else
{
super.writeString("
");
}
super.startArticle(isltr);
}
/**
* Write out the article separator.
*
* @throws IOException
* If there is an error writing to the stream.
*/
protected void endArticle() throws IOException
{
super.endArticle();
super.writeString("
");
}
/**
* Write a string to the output stream and escape some HTML characters.
*
* @param chars String to be written to the stream
* @throws IOException
* If there is an error writing to the stream.
*/
protected void writeString(String chars) throws IOException
{
for (int i = 0; i < chars.length(); i++)
{
char c = chars.charAt(i);
// write non-ASCII as named entities
if ((c < 32) || (c > 126))
{
int charAsInt = c;
super.writeString("" + charAsInt + ";");
}
else
{
switch (c)
{
case 34:
super.writeString(""");
break;
case 38:
super.writeString("&");
break;
case 60:
super.writeString("<");
break;
case 62:
super.writeString(">");
break;
default:
super.writeString(String.valueOf(c));
}
}
}
}
}