1 | /** | |
2 | * Copyright 2014 by Tizra Inc. | |
3 | * The contents of this file are subject to the Mozilla Public License Version 1.1 | |
4 | * (the "License"); you may not use this file except in compliance with the License. | |
5 | * You may obtain a copy of the License at http://www.mozilla.org/MPL/ | |
6 | * <p> | |
7 | * Software distributed under the License is distributed on an "AS IS" basis, | |
8 | * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License | |
9 | * for the specific language governing rights and limitations under the License. | |
10 | * <p> | |
11 | * The Original Code is 'iText, a free JAVA-PDF library'. | |
12 | * <p> | |
13 | * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by | |
14 | * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie. | |
15 | * All Rights Reserved. | |
16 | * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer | |
17 | * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved. | |
18 | * <p> | |
19 | * Contributor(s): all the names of the contributors are added in the source code | |
20 | * where applicable. | |
21 | * <p> | |
22 | * Alternatively, the contents of this file may be used under the terms of the | |
23 | * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the | |
24 | * provisions of LGPL are applicable instead of those above. If you wish to | |
25 | * allow use of your version of this file only under the terms of the LGPL | |
26 | * License and not to allow others to use your version of this file under | |
27 | * the MPL, indicate your decision by deleting the provisions above and | |
28 | * replace them with the notice and other provisions required by the LGPL. | |
29 | * If you do not delete the provisions above, a recipient may use your version | |
30 | * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE. | |
31 | * <p> | |
32 | * This library is free software; you can redistribute it and/or modify it | |
33 | * under the terms of the MPL as stated above or under the terms of the GNU | |
34 | * Library General Public License as published by the Free Software Foundation; | |
35 | * either version 2 of the License, or any later version. | |
36 | * <p> | |
37 | * This library is distributed in the hope that it will be useful, but WITHOUT | |
38 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
39 | * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more | |
40 | * details. | |
41 | */ | |
42 | package com.lowagie.text.pdf.parser; | |
43 | ||
44 | import com.lowagie.text.pdf.PdfReader; | |
45 | ||
46 | import javax.annotation.Nullable; | |
47 | import java.util.ArrayList; | |
48 | import java.util.List; | |
49 | ||
50 | /** | |
51 | * We'll get called on a variety of marked section content (perhaps including | |
52 | * the results of nested sections), and will assemble it into an order as we | |
53 | * can. | |
54 | * | |
55 | * @author dgd | |
56 | * | |
57 | */ | |
58 | public class MarkedUpTextAssembler implements TextAssembler { | |
59 | /** | |
60 | * our result may be partially processed already, in which case we'll just | |
61 | * add things to it, once ready. | |
62 | */ | |
63 | List<FinalText> result = new ArrayList<>(); | |
64 | private PdfReader reader; | |
65 | @Nullable | |
66 | private ParsedTextImpl inProgress = null; | |
67 | private int page; | |
68 | private int wordIdCounter = 1; | |
69 | private boolean usePdfMarkupElements = false; | |
70 | /** | |
71 | * as we get new content (final or not), we accumulate it until we reach the | |
72 | * end of a parsing unit | |
73 | * | |
74 | * Each parsing unit may have a tag name that should wrap its content | |
75 | */ | |
76 | private List<TextAssemblyBuffer> partialWords = new ArrayList<>(); | |
77 | ||
78 | MarkedUpTextAssembler(PdfReader reader) { | |
79 | this.reader = reader; | |
80 | } | |
81 | ||
82 | MarkedUpTextAssembler(PdfReader reader, boolean usePdfMarkupElements) { | |
83 | this.reader = reader; | |
84 | this.usePdfMarkupElements = usePdfMarkupElements; | |
85 | } | |
86 | ||
87 | /** | |
88 | * Remember an unassembled chunk until we hit the end of this element, or we | |
89 | * hit an assembled chunk, and need to pull things together. | |
90 | * | |
91 | * @param unassembled | |
92 | * chunk of text rendering instruction to contribute to final | |
93 | * text | |
94 | */ | |
95 | @Override | |
96 | public void process(ParsedText unassembled, String contextName) { | |
97 | partialWords.addAll(unassembled.getAsPartialWords()); | |
98 | } | |
99 | ||
100 | /** | |
101 | * Slot fully-assembled chunk into our result at the current location. If | |
102 | * there are unassembled chunks waiting, assemble them first. | |
103 | * | |
104 | * @param completed | |
105 | * This is a chunk from a nested element | |
106 | */ | |
107 | @Override | |
108 | public void process(FinalText completed, String contextName) { | |
109 |
1
1. process : removed call to com/lowagie/text/pdf/parser/MarkedUpTextAssembler::clearAccumulator → NO_COVERAGE |
clearAccumulator(); |
110 | result.add(completed); | |
111 | ||
112 | } | |
113 | ||
114 | /** | |
115 | * {@inheritDoc} | |
116 | * @see com.lowagie.text.pdf.parser.TextAssembler#process(com.lowagie.text.pdf.parser.Word, | |
117 | * String) | |
118 | */ | |
119 | @Override | |
120 | public void process(Word completed, String contextName) { | |
121 | partialWords.add(completed); | |
122 | } | |
123 | ||
124 | /** | |
125 | * | |
126 | */ | |
127 | private void clearAccumulator() { | |
128 | for (TextAssemblyBuffer partialWord : partialWords) { | |
129 | // Visit each partialWord, calling renderText | |
130 |
1
1. clearAccumulator : removed call to com/lowagie/text/pdf/parser/TextAssemblyBuffer::assemble → NO_COVERAGE |
partialWord.assemble(this); |
131 | } | |
132 |
1
1. clearAccumulator : removed call to java/util/List::clear → NO_COVERAGE |
partialWords.clear(); |
133 |
1
1. clearAccumulator : negated conditional → NO_COVERAGE |
if (inProgress != null) { |
134 | result.add(inProgress.getFinalText(reader, page, this, usePdfMarkupElements)); | |
135 | inProgress = null; | |
136 | } | |
137 | } | |
138 | ||
139 | private FinalText concatenateResult(@Nullable String containingElementName) { | |
140 | // null element means that this is a formatting artifact, not content. | |
141 |
1
1. concatenateResult : negated conditional → NO_COVERAGE |
if (containingElementName == null) { |
142 | // at some point, we may want to extract alternate text for some | |
143 | // artifacts. | |
144 |
1
1. concatenateResult : mutated return of Object value for com/lowagie/text/pdf/parser/MarkedUpTextAssembler::concatenateResult to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return null; |
145 | } | |
146 | StringBuilder res = new StringBuilder(); | |
147 |
2
1. concatenateResult : negated conditional → NO_COVERAGE 2. concatenateResult : negated conditional → NO_COVERAGE |
if (usePdfMarkupElements && !containingElementName.isEmpty()) { |
148 | res.append('<').append(containingElementName).append('>'); | |
149 | } | |
150 | for (FinalText item : result) { | |
151 | res.append(item.getText()); | |
152 | } | |
153 | // important, as the stuff buffered in the result is now used up! | |
154 |
1
1. concatenateResult : removed call to java/util/List::clear → NO_COVERAGE |
result.clear(); |
155 |
2
1. concatenateResult : negated conditional → NO_COVERAGE 2. concatenateResult : negated conditional → NO_COVERAGE |
if (usePdfMarkupElements && !containingElementName.isEmpty()) { |
156 | res.append("</"); | |
157 | int spacePos = containingElementName.indexOf(' '); | |
158 |
2
1. concatenateResult : changed conditional boundary → NO_COVERAGE 2. concatenateResult : negated conditional → NO_COVERAGE |
if (spacePos >= 0) { |
159 | containingElementName = containingElementName.substring(0, spacePos); | |
160 | } | |
161 | res.append(containingElementName).append('>'); | |
162 | } | |
163 |
1
1. concatenateResult : mutated return of Object value for com/lowagie/text/pdf/parser/MarkedUpTextAssembler::concatenateResult to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return new FinalText(res.toString()); |
164 | } | |
165 | ||
166 | /** | |
167 | * {@inheritDoc} | |
168 | * @see com.lowagie.text.pdf.parser.TextAssembler#endParsingContext(String) | |
169 | */ | |
170 | @Override | |
171 | public FinalText endParsingContext(@Nullable String containingElementName) { | |
172 |
1
1. endParsingContext : removed call to com/lowagie/text/pdf/parser/MarkedUpTextAssembler::clearAccumulator → NO_COVERAGE |
clearAccumulator(); |
173 |
1
1. endParsingContext : mutated return of Object value for com/lowagie/text/pdf/parser/MarkedUpTextAssembler::endParsingContext to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return concatenateResult(containingElementName); |
174 | } | |
175 | ||
176 | /** | |
177 | * | |
178 | * @see com.lowagie.text.pdf.parser.TextAssembler#reset() | |
179 | */ | |
180 | @Override | |
181 | public void reset() { | |
182 |
1
1. reset : removed call to java/util/List::clear → NO_COVERAGE |
result.clear(); |
183 |
1
1. reset : removed call to java/util/List::clear → NO_COVERAGE |
partialWords.clear(); |
184 | inProgress = null; | |
185 | } | |
186 | ||
187 | @Override | |
188 | public void renderText(FinalText finalText) { | |
189 | result.add(finalText); | |
190 | } | |
191 | ||
192 | /** | |
193 | * Captures text using a simplified algorithm for inserting hard returns and | |
194 | * spaces | |
195 | * | |
196 | * @see com.lowagie.text.pdf.parser.AbstractRenderListener#renderText(java.lang.String, | |
197 | * com.lowagie.text.pdf.parser.GraphicsState, | |
198 | * com.lowagie.text.pdf.parser.Matrix, | |
199 | * com.lowagie.text.pdf.parser.Matrix) | |
200 | */ | |
201 | @Override | |
202 | public void renderText(ParsedTextImpl partialWord) { | |
203 |
1
1. renderText : negated conditional → NO_COVERAGE |
boolean firstRender = inProgress == null; |
204 | boolean hardReturn = false; | |
205 |
1
1. renderText : negated conditional → NO_COVERAGE |
if (firstRender) { |
206 | inProgress = partialWord; | |
207 | return; | |
208 | } | |
209 | Vector start = partialWord.getStartPoint(); | |
210 | Vector lastStart = inProgress.getStartPoint(); | |
211 | Vector lastEnd = inProgress.getEndPoint(); | |
212 | ||
213 | // see | |
214 | // http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html | |
215 | float dist = inProgress.getBaseline() | |
216 | .subtract(lastStart) | |
217 | .cross(lastStart.subtract(start)) | |
218 |
1
1. renderText : Replaced float division with multiplication → NO_COVERAGE |
.lengthSquared() / inProgress.getBaseline().subtract(lastStart).lengthSquared(); |
219 | ||
220 |
1
1. renderText : Replaced float multiplication with division → NO_COVERAGE |
float sameLineThreshold = partialWord.getAscent() * 0.5f; |
221 | // let's try using 25% of current leading for vertical slop. | |
222 |
3
1. renderText : changed conditional boundary → NO_COVERAGE 2. renderText : negated conditional → NO_COVERAGE 3. renderText : negated conditional → NO_COVERAGE |
if (dist > sameLineThreshold || Float.isNaN(dist)) { |
223 | hardReturn = true; | |
224 | } | |
225 | /* | |
226 | * Note: Technically, we should check both the start and end positions, | |
227 | * in case the angle of the text changed without any displacement but | |
228 | * this sort of thing probably doesn't happen much in reality, so we'll | |
229 | * leave it alone for now | |
230 | */ | |
231 | float spacing = lastEnd.subtract(start).length(); | |
232 |
2
1. renderText : negated conditional → NO_COVERAGE 2. renderText : negated conditional → NO_COVERAGE |
if (hardReturn || partialWord.breakBefore()) { |
233 | result.add(inProgress.getFinalText(reader, page, this, usePdfMarkupElements)); | |
234 |
1
1. renderText : negated conditional → NO_COVERAGE |
if (hardReturn) { |
235 | result.add(new FinalText("\n")); | |
236 |
1
1. renderText : negated conditional → NO_COVERAGE |
if (usePdfMarkupElements) { |
237 | result.add(new FinalText("<br class='t-pdf' />")); | |
238 | } | |
239 | } | |
240 | inProgress = partialWord; | |
241 | // System.out.println("<< Hard Return >>"); | |
242 |
4
1. renderText : changed conditional boundary → NO_COVERAGE 2. renderText : Replaced double division with multiplication → NO_COVERAGE 3. renderText : negated conditional → NO_COVERAGE 4. renderText : negated conditional → NO_COVERAGE |
} else if (spacing < partialWord.getSingleSpaceWidth() / 2.3 || inProgress.shouldNotSplit()) { |
243 | inProgress = new Word(inProgress.getText() + partialWord.getText().trim(), | |
244 | partialWord.getAscent(), | |
245 | partialWord.getDescent(), | |
246 | lastStart, | |
247 | partialWord.getEndPoint(), | |
248 | inProgress.getBaseline(), | |
249 | partialWord.getSingleSpaceWidth(), | |
250 | inProgress.shouldNotSplit(), | |
251 | inProgress.breakBefore()); | |
252 | } else { | |
253 | result.add(inProgress.getFinalText(reader, page, this, usePdfMarkupElements)); | |
254 | inProgress = partialWord; | |
255 | } | |
256 | } | |
257 | ||
258 | /** | |
259 | * Getter. | |
260 | * | |
261 | * @see SimpleTextExtractingPdfContentRenderListener#_reader | |
262 | * @return reader | |
263 | */ | |
264 | protected PdfReader getReader() { | |
265 | return reader; | |
266 | } | |
267 | ||
268 | /** | |
269 | * {@inheritDoc} | |
270 | * @see com.lowagie.text.pdf.parser.TextAssembler#setPage(int) | |
271 | */ | |
272 | @Override | |
273 | public void setPage(int page) { | |
274 | this.page = page; | |
275 | } | |
276 | ||
277 | /** | |
278 | * {@inheritDoc} | |
279 | * @see com.lowagie.text.pdf.parser.TextAssembler#getWordId() | |
280 | */ | |
281 | @Override | |
282 | public String getWordId() { | |
283 |
2
1. getWordId : Replaced integer addition with subtraction → NO_COVERAGE 2. getWordId : mutated return of Object value for com/lowagie/text/pdf/parser/MarkedUpTextAssembler::getWordId to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE |
return "word" + wordIdCounter++; |
284 | } | |
285 | ||
286 | } | |
Mutations | ||
109 |
1.1 |
|
130 |
1.1 |
|
132 |
1.1 |
|
133 |
1.1 |
|
141 |
1.1 |
|
144 |
1.1 |
|
147 |
1.1 2.2 |
|
154 |
1.1 |
|
155 |
1.1 2.2 |
|
158 |
1.1 2.2 |
|
163 |
1.1 |
|
172 |
1.1 |
|
173 |
1.1 |
|
182 |
1.1 |
|
183 |
1.1 |
|
203 |
1.1 |
|
205 |
1.1 |
|
218 |
1.1 |
|
220 |
1.1 |
|
222 |
1.1 2.2 3.3 |
|
232 |
1.1 2.2 |
|
234 |
1.1 |
|
236 |
1.1 |
|
242 |
1.1 2.2 3.3 4.4 |
|
283 |
1.1 2.2 |