1 | /* | |
2 | * Copyright 2008 by Kevin Day. | |
3 | * | |
4 | * Contributions copyright 2014 Tizra Inc. | |
5 | * | |
6 | * The contents of this file are subject to the Mozilla Public License Version 1.1 | |
7 | * (the "License"); you may not use this file except in compliance with the License. | |
8 | * You may obtain a copy of the License at http://www.mozilla.org/MPL/ | |
9 | * | |
10 | * Software distributed under the License is distributed on an "AS IS" basis, | |
11 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | |
12 | * for the specific language governing rights and limitations under the License. | |
13 | * | |
14 | * The Original Code is 'iText, a free JAVA-PDF library'. | |
15 | * | |
16 | * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by | |
17 | * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie. | |
18 | * All Rights Reserved. | |
19 | * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer | |
20 | * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved. | |
21 | * | |
22 | * Contributor(s): all the names of the contributors are added in the source code | |
23 | * where applicable. | |
24 | * | |
25 | * Alternatively, the contents of this file may be used under the terms of the | |
26 | * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the | |
27 | * provisions of LGPL are applicable instead of those above. If you wish to | |
28 | * allow use of your version of this file only under the terms of the LGPL | |
29 | * License and not to allow others to use your version of this file under | |
30 | * the MPL, indicate your decision by deleting the provisions above and | |
31 | * replace them with the notice and other provisions required by the LGPL. | |
32 | * If you do not delete the provisions above, a recipient may use your version | |
33 | * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. | |
34 | * | |
35 | * This library is free software; you can redistribute it and/or modify it | |
36 | * under the terms of the MPL as stated above or under the terms of the GNU | |
37 | * Library General Public License as published by the Free Software Foundation; | |
38 | * either version 2 of the License, or any later version. | |
39 | * | |
40 | * This library is distributed in the hope that it will be useful, but WITHOUT | |
41 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
42 | * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more | |
43 | * details. | |
44 | * | |
45 | * If you didn't download this code from the following link, you should check if | |
46 | * you aren't using an obsolete version: | |
47 | * http://www.lowagie.com/iText/ | |
48 | */ | |
49 | package com.lowagie.text.pdf.parser; | |
50 | ||
51 | ||
52 | import com.lowagie.text.ExceptionConverter; | |
53 | import com.lowagie.text.pdf.*; | |
54 | ||
55 | import javax.annotation.Nonnull; | |
56 | import java.io.ByteArrayOutputStream; | |
57 | import java.io.IOException; | |
58 | import java.util.ArrayList; | |
59 | import java.util.List; | |
60 | import java.util.ListIterator; | |
61 | ||
62 | /** | |
63 | * Extracts text from a PDF file. | |
64 | * | |
65 | * @since 2.1.4 | |
66 | */ | |
67 | @SuppressWarnings("WeakerAccess") | |
68 | public class PdfTextExtractor { | |
69 | ||
70 | /** | |
71 | * The PdfReader that holds the PDF file. | |
72 | */ | |
73 | private final PdfReader reader; | |
74 | ||
75 | /** | |
76 | * The {@link TextAssembler} that will receive render notifications and | |
77 | * provide resultant text | |
78 | */ | |
79 | private final TextAssembler renderListener; | |
80 | ||
81 | /** | |
82 | * Creates a new Text Extractor object, using a {@link TextAssembler} as the | |
83 | * render listener | |
84 | * | |
85 | * @param reader the reader with the PDF | |
86 | */ | |
87 | public PdfTextExtractor(PdfReader reader) { | |
88 | this(reader, new MarkedUpTextAssembler(reader)); | |
89 | } | |
90 | ||
91 | /** | |
92 | * Creates a new Text Extractor object, using a {@link TextAssembler} as the | |
93 | * render listener | |
94 | * | |
95 | * @param reader the reader with the PDF | |
96 | * @param usePdfMarkupElements should we use higher level tags for PDF markup entities? | |
97 | */ | |
98 | public PdfTextExtractor(PdfReader reader, boolean usePdfMarkupElements) { | |
99 | this(reader, new MarkedUpTextAssembler(reader, usePdfMarkupElements)); | |
100 | } | |
101 | ||
102 | /** | |
103 | * Creates a new Text Extractor object. | |
104 | * | |
105 | * @param reader the reader with the PDF | |
106 | * @param renderListener the render listener that will be used to analyze renderText | |
107 | * operations and provide resultant text | |
108 | */ | |
109 | public PdfTextExtractor(PdfReader reader, TextAssembler renderListener) { | |
110 | this.reader = reader; | |
111 | this.renderListener = renderListener; | |
112 | } | |
113 | ||
114 | /** | |
115 | * Gets the content bytes of a page. | |
116 | * | |
117 | * @param pageNum the 1-based page number of page you want get the content | |
118 | * stream from | |
119 | * @return a byte array with the effective content stream of a page | |
120 | * @throws IOException | |
121 | */ | |
122 | private byte[] getContentBytesForPage(int pageNum) throws IOException { | |
123 | try (RandomAccessFileOrArray ignored = reader.getSafeFile()) { | |
124 | PdfDictionary pageDictionary = reader.getPageN(pageNum); | |
125 | PdfObject contentObject = pageDictionary.get(PdfName.CONTENTS); | |
126 |
1
1. getContentBytesForPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getContentBytesForPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return getContentBytesFromContentObject(contentObject); |
127 | } | |
128 | } | |
129 | ||
130 | /** | |
131 | * Gets the content bytes from a content object, which may be a reference a | |
132 | * stream or an array. | |
133 | * | |
134 | * @param contentObject the object to read bytes from | |
135 | * @return the content bytes | |
136 | * @throws IOException | |
137 | */ | |
138 | private byte[] getContentBytesFromContentObject(PdfObject contentObject) throws IOException { | |
139 | final byte[] result; | |
140 | switch (contentObject.type()) { | |
141 | case PdfObject.INDIRECT: | |
142 | PRIndirectReference ref = (PRIndirectReference) contentObject; | |
143 | PdfObject directObject = PdfReader.getPdfObject(ref); | |
144 | result = getContentBytesFromContentObject(directObject); | |
145 | break; | |
146 | case PdfObject.STREAM: | |
147 | PRStream stream = (PRStream) PdfReader.getPdfObject(contentObject); | |
148 | result = PdfReader.getStreamBytes(stream); | |
149 | break; | |
150 | case PdfObject.ARRAY: | |
151 | // Stitch together all content before calling processContent(), | |
152 | // because | |
153 | // processContent() resets state. | |
154 | ByteArrayOutputStream allBytes = new ByteArrayOutputStream(); | |
155 | PdfArray contentArray = (PdfArray) contentObject; | |
156 | ListIterator<PdfObject> iter = contentArray.listIterator(); | |
157 |
1
1. getContentBytesFromContentObject : negated conditional → NO_COVERAGE |
while (iter.hasNext()) { |
158 | PdfObject element = iter.next(); | |
159 |
1
1. getContentBytesFromContentObject : removed call to java/io/ByteArrayOutputStream::write → NO_COVERAGE |
allBytes.write(getContentBytesFromContentObject(element)); |
160 | } | |
161 | result = allBytes.toByteArray(); | |
162 | break; | |
163 | default: | |
164 | throw new IllegalStateException("Unable to handle Content of type " + contentObject.getClass()); | |
165 | } | |
166 |
1
1. getContentBytesFromContentObject : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getContentBytesFromContentObject to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return result; |
167 | } | |
168 | ||
169 | /** | |
170 | * Gets the text from a page. | |
171 | * | |
172 | * @param page the 1-based page number of page | |
173 | * @return a String with the content as plain text (without PDF syntax) | |
174 | * @throws IOException on error | |
175 | */ | |
176 | @Nonnull | |
177 | public String getTextFromPage(int page) throws IOException { | |
178 |
1
1. getTextFromPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return getTextFromPage(page, false); |
179 | } | |
180 | ||
181 | /** | |
182 | * get the text from the page | |
183 | * | |
184 | * @param page page number we are interested in | |
185 | * @param useContainerMarkup should we put tags in for PDf markup container elements (not | |
186 | * really HTML at the moment). | |
187 | * @return result of extracting the text, with tags as requested. | |
188 | * @throws IOException on error | |
189 | */ | |
190 | @Nonnull | |
191 | public String getTextFromPage(int page, boolean useContainerMarkup) throws IOException { | |
192 | PdfDictionary pageDict = reader.getPageN(page); | |
193 |
1
1. getTextFromPage : negated conditional → NO_COVERAGE |
if (pageDict == null) { |
194 |
1
1. getTextFromPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return ""; |
195 | } | |
196 | PdfDictionary resources = pageDict.getAsDict(PdfName.RESOURCES); | |
197 | ||
198 |
1
1. getTextFromPage : removed call to com/lowagie/text/pdf/parser/TextAssembler::reset → NO_COVERAGE |
renderListener.reset(); |
199 |
1
1. getTextFromPage : removed call to com/lowagie/text/pdf/parser/TextAssembler::setPage → NO_COVERAGE |
renderListener.setPage(page); |
200 | PdfContentStreamHandler handler = new PdfContentStreamHandler(renderListener); | |
201 |
1
1. getTextFromPage : removed call to com/lowagie/text/pdf/parser/PdfTextExtractor::processContent → NO_COVERAGE |
processContent(getContentBytesForPage(page), resources, handler); |
202 |
1
1. getTextFromPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return handler.getResultantText(); |
203 | } | |
204 | ||
205 | /** | |
206 | * Processes PDF syntax | |
207 | * | |
208 | * @param contentBytes the bytes of a content stream | |
209 | * @param resources the resources that come with the content stream | |
210 | * @param handler interprets events caused by recognition of operations in a | |
211 | * content stream. | |
212 | */ | |
213 | public void processContent(byte[] contentBytes, PdfDictionary resources, | |
214 | PdfContentStreamHandler handler) { | |
215 |
1
1. processContent : removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::pushContext → NO_COVERAGE |
handler.pushContext("div class='t-extracted-page'"); |
216 | try { | |
217 | PdfContentParser ps = new PdfContentParser(new PRTokeniser(contentBytes)); | |
218 | List<PdfObject> operands = new ArrayList<>(); | |
219 |
2
1. processContent : changed conditional boundary → NO_COVERAGE 2. processContent : negated conditional → NO_COVERAGE |
while (ps.parse(operands).size() > 0) { |
220 |
1
1. processContent : Replaced integer subtraction with addition → NO_COVERAGE |
PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1); |
221 |
1
1. processContent : removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::invokeOperator → NO_COVERAGE |
handler.invokeOperator(operator, operands, resources); |
222 | } | |
223 | } catch (Exception e) { | |
224 | throw new ExceptionConverter(e); | |
225 | } | |
226 |
1
1. processContent : removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::popContext → NO_COVERAGE |
handler.popContext(); |
227 | } | |
228 | } | |
Mutations | ||
126 |
1.1 |
|
157 |
1.1 |
|
159 |
1.1 |
|
166 |
1.1 |
|
178 |
1.1 |
|
193 |
1.1 |
|
194 |
1.1 |
|
198 |
1.1 |
|
199 |
1.1 |
|
201 |
1.1 |
|
202 |
1.1 |
|
215 |
1.1 |
|
219 |
1.1 2.2 |
|
220 |
1.1 |
|
221 |
1.1 |
|
226 |
1.1 |