PdfTextExtractor.java

1
/*
2
 * Copyright 2008 by Kevin Day.
3
 *
4
 * Contributions copyright 2014 Tizra Inc.
5
 *
6
 * The contents of this file are subject to the Mozilla Public License Version 1.1
7
 * (the "License"); you may not use this file except in compliance with the License.
8
 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
9
 *
10
 * Software distributed under the License is distributed on an "AS IS" basis,
11
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12
 * for the specific language governing rights and limitations under the License.
13
 *
14
 * The Original Code is 'iText, a free JAVA-PDF library'.
15
 *
16
 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
17
 * the Initial Developer are Copyright (C) 1999-2008 by Bruno Lowagie.
18
 * All Rights Reserved.
19
 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
20
 * are Copyright (C) 2000-2008 by Paulo Soares. All Rights Reserved.
21
 *
22
 * Contributor(s): all the names of the contributors are added in the source code
23
 * where applicable.
24
 *
25
 * Alternatively, the contents of this file may be used under the terms of the
26
 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
27
 * provisions of LGPL are applicable instead of those above.  If you wish to
28
 * allow use of your version of this file only under the terms of the LGPL
29
 * License and not to allow others to use your version of this file under
30
 * the MPL, indicate your decision by deleting the provisions above and
31
 * replace them with the notice and other provisions required by the LGPL.
32
 * If you do not delete the provisions above, a recipient may use your version
33
 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
34
 *
35
 * This library is free software; you can redistribute it and/or modify it
36
 * under the terms of the MPL as stated above or under the terms of the GNU
37
 * Library General Public License as published by the Free Software Foundation;
38
 * either version 2 of the License, or any later version.
39
 *
40
 * This library is distributed in the hope that it will be useful, but WITHOUT
41
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
42
 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
43
 * details.
44
 *
45
 * If you didn't download this code from the following link, you should check if
46
 * you aren't using an obsolete version:
47
 * http://www.lowagie.com/iText/
48
 */
49
package com.lowagie.text.pdf.parser;
50
51
52
import com.lowagie.text.ExceptionConverter;
53
import com.lowagie.text.pdf.*;
54
55
import javax.annotation.Nonnull;
56
import java.io.ByteArrayOutputStream;
57
import java.io.IOException;
58
import java.util.ArrayList;
59
import java.util.List;
60
import java.util.ListIterator;
61
62
/**
63
 * Extracts text from a PDF file.
64
 *
65
 * @since 2.1.4
66
 */
67
@SuppressWarnings("WeakerAccess")
68
public class PdfTextExtractor {
69
70
    /**
71
     * The PdfReader that holds the PDF file.
72
     */
73
    private final PdfReader reader;
74
75
    /**
76
     * The {@link TextAssembler} that will receive render notifications and
77
     * provide resultant text
78
     */
79
    private final TextAssembler renderListener;
80
81
    /**
82
     * Creates a new Text Extractor object, using a {@link TextAssembler} as the
83
     * render listener
84
     *
85
     * @param reader the reader with the PDF
86
     */
87
    public PdfTextExtractor(PdfReader reader) {
88
        this(reader, new MarkedUpTextAssembler(reader));
89
    }
90
91
    /**
92
     * Creates a new Text Extractor object, using a {@link TextAssembler} as the
93
     * render listener
94
     *
95
     * @param reader               the reader with the PDF
96
     * @param usePdfMarkupElements should we use higher level tags for PDF markup entities?
97
     */
98
    public PdfTextExtractor(PdfReader reader, boolean usePdfMarkupElements) {
99
        this(reader, new MarkedUpTextAssembler(reader, usePdfMarkupElements));
100
    }
101
102
    /**
103
     * Creates a new Text Extractor object.
104
     *
105
     * @param reader         the reader with the PDF
106
     * @param renderListener the render listener that will be used to analyze renderText
107
     *                       operations and provide resultant text
108
     */
109
    public PdfTextExtractor(PdfReader reader, TextAssembler renderListener) {
110
        this.reader = reader;
111
        this.renderListener = renderListener;
112
    }
113
114
    /**
115
     * Gets the content bytes of a page.
116
     *
117
     * @param pageNum the 1-based page number of page you want get the content
118
     *                stream from
119
     * @return a byte array with the effective content stream of a page
120
     * @throws IOException
121
     */
122
    private byte[] getContentBytesForPage(int pageNum) throws IOException {
123
        try (RandomAccessFileOrArray ignored = reader.getSafeFile()) {
124
            PdfDictionary pageDictionary = reader.getPageN(pageNum);
125
            PdfObject contentObject = pageDictionary.get(PdfName.CONTENTS);
126 1 1. getContentBytesForPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getContentBytesForPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE
            return getContentBytesFromContentObject(contentObject);
127
        }
128
    }
129
130
    /**
131
     * Gets the content bytes from a content object, which may be a reference a
132
     * stream or an array.
133
     *
134
     * @param contentObject the object to read bytes from
135
     * @return the content bytes
136
     * @throws IOException
137
     */
138
    private byte[] getContentBytesFromContentObject(PdfObject contentObject) throws IOException {
139
        final byte[] result;
140
        switch (contentObject.type()) {
141
            case PdfObject.INDIRECT:
142
                PRIndirectReference ref = (PRIndirectReference) contentObject;
143
                PdfObject directObject = PdfReader.getPdfObject(ref);
144
                result = getContentBytesFromContentObject(directObject);
145
                break;
146
            case PdfObject.STREAM:
147
                PRStream stream = (PRStream) PdfReader.getPdfObject(contentObject);
148
                result = PdfReader.getStreamBytes(stream);
149
                break;
150
            case PdfObject.ARRAY:
151
                // Stitch together all content before calling processContent(),
152
                // because
153
                // processContent() resets state.
154
                ByteArrayOutputStream allBytes = new ByteArrayOutputStream();
155
                PdfArray contentArray = (PdfArray) contentObject;
156
                ListIterator<PdfObject> iter = contentArray.listIterator();
157 1 1. getContentBytesFromContentObject : negated conditional → NO_COVERAGE
                while (iter.hasNext()) {
158
                    PdfObject element = iter.next();
159 1 1. getContentBytesFromContentObject : removed call to java/io/ByteArrayOutputStream::write → NO_COVERAGE
                    allBytes.write(getContentBytesFromContentObject(element));
160
                }
161
                result = allBytes.toByteArray();
162
                break;
163
            default:
164
                throw new IllegalStateException("Unable to handle Content of type " + contentObject.getClass());
165
        }
166 1 1. getContentBytesFromContentObject : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getContentBytesFromContentObject to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE
        return result;
167
    }
168
169
    /**
170
     * Gets the text from a page.
171
     *
172
     * @param page the 1-based page number of page
173
     * @return a String with the content as plain text (without PDF syntax)
174
     * @throws IOException on error
175
     */
176
    @Nonnull
177
    public String getTextFromPage(int page) throws IOException {
178 1 1. getTextFromPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE
        return getTextFromPage(page, false);
179
    }
180
181
    /**
182
     * get the text from the page
183
     *
184
     * @param page               page number we are interested in
185
     * @param useContainerMarkup should we put tags in for PDf markup container elements (not
186
     *                           really HTML at the moment).
187
     * @return result of extracting the text, with tags as requested.
188
     * @throws IOException on error
189
     */
190
    @Nonnull
191
    public String getTextFromPage(int page, boolean useContainerMarkup) throws IOException {
192
        PdfDictionary pageDict = reader.getPageN(page);
193 1 1. getTextFromPage : negated conditional → NO_COVERAGE
        if (pageDict == null) {
194 1 1. getTextFromPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE
            return "";
195
        }
196
        PdfDictionary resources = pageDict.getAsDict(PdfName.RESOURCES);
197
198 1 1. getTextFromPage : removed call to com/lowagie/text/pdf/parser/TextAssembler::reset → NO_COVERAGE
        renderListener.reset();
199 1 1. getTextFromPage : removed call to com/lowagie/text/pdf/parser/TextAssembler::setPage → NO_COVERAGE
        renderListener.setPage(page);
200
        PdfContentStreamHandler handler = new PdfContentStreamHandler(renderListener);
201 1 1. getTextFromPage : removed call to com/lowagie/text/pdf/parser/PdfTextExtractor::processContent → NO_COVERAGE
        processContent(getContentBytesForPage(page), resources, handler);
202 1 1. getTextFromPage : mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE
        return handler.getResultantText();
203
    }
204
205
    /**
206
     * Processes PDF syntax
207
     *
208
     * @param contentBytes the bytes of a content stream
209
     * @param resources    the resources that come with the content stream
210
     * @param handler      interprets events caused by recognition of operations in a
211
     *                     content stream.
212
     */
213
    public void processContent(byte[] contentBytes, PdfDictionary resources,
214
                               PdfContentStreamHandler handler) {
215 1 1. processContent : removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::pushContext → NO_COVERAGE
        handler.pushContext("div class='t-extracted-page'");
216
        try {
217
            PdfContentParser ps = new PdfContentParser(new PRTokeniser(contentBytes));
218
            List<PdfObject> operands = new ArrayList<>();
219 2 1. processContent : changed conditional boundary → NO_COVERAGE
2. processContent : negated conditional → NO_COVERAGE
            while (ps.parse(operands).size() > 0) {
220 1 1. processContent : Replaced integer subtraction with addition → NO_COVERAGE
                PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1);
221 1 1. processContent : removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::invokeOperator → NO_COVERAGE
                handler.invokeOperator(operator, operands, resources);
222
            }
223
        } catch (Exception e) {
224
            throw new ExceptionConverter(e);
225
        }
226 1 1. processContent : removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::popContext → NO_COVERAGE
        handler.popContext();
227
    }
228
}

Mutations

126

1.1
Location : getContentBytesForPage
Killed by : none
mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getContentBytesForPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE

157

1.1
Location : getContentBytesFromContentObject
Killed by : none
negated conditional → NO_COVERAGE

159

1.1
Location : getContentBytesFromContentObject
Killed by : none
removed call to java/io/ByteArrayOutputStream::write → NO_COVERAGE

166

1.1
Location : getContentBytesFromContentObject
Killed by : none
mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getContentBytesFromContentObject to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE

178

1.1
Location : getTextFromPage
Killed by : none
mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE

193

1.1
Location : getTextFromPage
Killed by : none
negated conditional → NO_COVERAGE

194

1.1
Location : getTextFromPage
Killed by : none
mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE

198

1.1
Location : getTextFromPage
Killed by : none
removed call to com/lowagie/text/pdf/parser/TextAssembler::reset → NO_COVERAGE

199

1.1
Location : getTextFromPage
Killed by : none
removed call to com/lowagie/text/pdf/parser/TextAssembler::setPage → NO_COVERAGE

201

1.1
Location : getTextFromPage
Killed by : none
removed call to com/lowagie/text/pdf/parser/PdfTextExtractor::processContent → NO_COVERAGE

202

1.1
Location : getTextFromPage
Killed by : none
mutated return of Object value for com/lowagie/text/pdf/parser/PdfTextExtractor::getTextFromPage to ( if (x != null) null else throw new RuntimeException ) → NO_COVERAGE

215

1.1
Location : processContent
Killed by : none
removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::pushContext → NO_COVERAGE

219

1.1
Location : processContent
Killed by : none
changed conditional boundary → NO_COVERAGE

2.2
Location : processContent
Killed by : none
negated conditional → NO_COVERAGE

220

1.1
Location : processContent
Killed by : none
Replaced integer subtraction with addition → NO_COVERAGE

221

1.1
Location : processContent
Killed by : none
removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::invokeOperator → NO_COVERAGE

226

1.1
Location : processContent
Killed by : none
removed call to com/lowagie/text/pdf/parser/PdfContentStreamHandler::popContext → NO_COVERAGE

Active mutators

Tests examined


Report generated by PIT 1.4.2