Issue 736 attachment: text_extract.cc (3.4 KB)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include <stdio.h>
#include <string.h>

#include <vector>

#include "core/fxcrt/fx_memory.h"
#include "public/fpdf_dataavail.h"
#include "public/fpdf_text.h"
#include "public/fpdfview.h"

// Serves block reads out of a caller-owned, in-memory byte buffer.
//
// The loader only borrows the buffer: it stores the pointer and length handed
// to the constructor and never copies or frees the data, so the buffer must
// stay alive for as long as GetBlock() may be called.
class TestLoader {
 public:
  // Wraps |len| bytes starting at |pBuf|.
  TestLoader(const char* pBuf, size_t len) : m_pBuf(pBuf), m_Len(len) {}

  // Block-read callback (main() installs it as FPDF_FILEACCESS::m_GetBlock).
  //
  // |param| must point at a TestLoader. Copies |size| bytes starting at
  // offset |pos| of the wrapped buffer into |pBuf|.
  //
  // Returns 1 on success. Returns 0, leaving |pBuf| untouched, when the
  // requested range overflows `unsigned long` or runs past the end of the
  // wrapped buffer.
  static int GetBlock(void* param,
                      unsigned long pos,
                      unsigned char* pBuf,
                      unsigned long size) {
    const TestLoader* self = static_cast<TestLoader*>(param);

    // One-past-the-end offset of the request; wraps around on overflow.
    const unsigned long end = pos + size;
    const bool wrapped = end < pos;
    const bool in_range = !wrapped && end <= self->m_Len;
    if (in_range) {
      memcpy(pBuf, self->m_pBuf + pos, size);
      return 1;
    }
    return 0;
  }

 private:
  const char* const m_pBuf;  // Borrowed; not owned.
  const size_t m_Len;        // Number of readable bytes at m_pBuf.
};

// Availability callback installed as FX_FILEAVAIL::IsDataAvail by main().
//
// main() reads the whole input file into memory before installing this
// callback, so every requested range is reported as available. All three
// arguments are ignored.
FPDF_BOOL Is_Data_Avail(FX_FILEAVAIL* pThis, size_t offset, size_t size) {
  const FPDF_BOOL available = true;
  return available;
}

// Download-hint callback installed as FX_DOWNLOADHINTS::AddSegment by main().
//
// Deliberately does nothing: main() has already loaded the whole file into
// memory, so there is nothing to fetch. All three arguments are ignored.
void Add_Segment(FX_DOWNLOADHINTS* pThis, size_t offset, size_t size) {
  // Intentionally empty.
}

int main(int argc, const char* argv[]) {
if (argc != 2)
return -1;
FPDF_InitLibraryWithConfig(nullptr);
FILE* pFile = nullptr;
pFile = fopen(argv[1], "rb");
if (!pFile)
return -1;
fseek(pFile, 0L, SEEK_END);
size_t len = ftell(pFile);
fseek(pFile, 0L, SEEK_SET);
char* pBuf = new char[len];
fread(pBuf, len, sizeof(char), pFile);
TestLoader loader(pBuf, len);
FPDF_FILEACCESS file_access;
memset(&file_access, '\0', sizeof(file_access));
file_access.m_FileLen = static_cast<unsigned long>(len);
file_access.m_GetBlock = TestLoader::GetBlock;
file_access.m_Param = &loader;
FX_FILEAVAIL file_avail;
memset(&file_avail, '\0', sizeof(file_avail));
file_avail.version = 1;
file_avail.IsDataAvail = Is_Data_Avail;

FX_DOWNLOADHINTS hints;
memset(&hints, '\0', sizeof(hints));
hints.version = 1;
hints.AddSegment = Add_Segment;

FPDF_DOCUMENT doc;
int nRet = PDF_DATA_NOTAVAIL;
bool bIsLinearized = false;
FPDF_AVAIL pdf_avail = FPDFAvail_Create(&file_avail, &file_access);

if (FPDFAvail_IsLinearized(pdf_avail) == PDF_LINEARIZED) {
fprintf(stderr, "Linearized path...\n");
doc = FPDFAvail_GetDocument(pdf_avail, nullptr);
if (doc) {
while (nRet == PDF_DATA_NOTAVAIL) {
nRet = FPDFAvail_IsDocAvail(pdf_avail, &hints);
}
if (nRet == PDF_DATA_ERROR) {
fprintf(stderr, "Unknown error in checking if doc was available.\n");
return -1;
}
nRet = FPDFAvail_IsFormAvail(pdf_avail, &hints);
if (nRet == PDF_FORM_ERROR || nRet == PDF_FORM_NOTAVAIL) {
fprintf(stderr,
"Error %d was returned in checking if form was available.\n",
nRet);
return -1;
}
bIsLinearized = true;
}
} else {
fprintf(stderr, "Non-linearized path...\n");
doc = FPDF_LoadCustomDocument(&file_access, nullptr);
}
int page_count = FPDF_GetPageCount(doc);
int rendered_pages = 0;
int bad_pages = 0;
for (int i = 0; i < page_count; ++i) {
FPDF_PAGE page = FPDF_LoadPage(doc, i);
if (!page) {
delete[] pBuf;
fclose(pFile);
return -1;
}
char filename[256];
snprintf(filename, sizeof(filename), "%s-%d.txt", "page", i + 1);
FILE* fp = fopen(filename, "wb");
if (!fp)
return -1;
FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
int total_chars = FPDFText_CountChars(text_page);
for (int i = 0; i < total_chars; ++i) {
double font_size = FPDFText_GetFontSize(text_page, i);
fprintf(fp, "font size is %f\n", font_size);
}
fclose(fp);
}
delete[] pBuf;
fclose(pFile);
FPDF_DestroyLibrary();
return 0;
}