Files whose content overlap is below 0.90 or whose token counts differ by more than 100 in pdfbox_1_8_8 and pdfbox_1_8_9_20150316

FILE_PATH FILE_EXTENSION DETECTED_FILE_EXTENSION_A DETECTED_FILE_EXTENSION_B TOKEN_COUNT_A TOKEN_COUNT_B Token Count B - Token Count A OVERLAP TOP_10_MORE_IN_A TOP_10_MORE_IN_B
531/531945.pdf.json
pdf
pdf
pdf
57869
57567
-302
0.997
page: 151 | 408a: 1 | 408b: 1 | 410: 1 | 453: 1 | 454: 1 | 455: 1 | 456: 1 | 457: 1 | 458: 1

917/917211.pdf.json
pdf
pdf
pdf
14978
14876
-102
0.997
draft: 25 | eis: 25 | sldfr: 25 | f_5.2_17: 1 | f_5.2_18: 1 | f_5.2_19: 1 | f_5.2_20: 1 | f_5.2_21: 1 | f_5.2_22: 1 | f_5.2_23: 1

524/524061.pdf.json
pdf
pdf
pdf
8768
8662
-106
0.994
5: 5 | cv: 4 | oo: 4 | 2: 3 | 32: 3 | t: 3 | 2032: 2 | 2037: 2 | 2042: 2 | 2047: 2

167/167852.pdf.json
pdf
pdf
pdf
37704
39154
1450
0.981

wkh: 89 | ri: 80 | lq: 41 | d: 34 | wr: 33 | dqg: 29 | h: 20 | lv: 19 | v: 19 | 5h: 13
056/056021.pdf.json
pdf
pdf
pdf
3125
3011
-114
0.981
yes: 32 | 2008: 3 | center: 3 | navesink: 3 | river: 3 | the: 3 | 08: 2 | and: 2 | june: 2 | mammal: 2

805/805736.pdf.json
pdf
pdf
pdf
2728
2603
-125
0.977
xxxxxxxxx1xxxxxxxxx2xxxxxxxxx3xxxxxxxxx4xxxxxxxxx5: 70 | yes: 35 | xxxxxxxxx1x: 10 | xx: 5 | xxxxxxxxx1: 5

719/719091.pdf.json
pdf
pdf
pdf
5944
5464
-480
0.958
not: 30 | on: 30 | print: 30 | size: 30 | to: 30 | actual: 15 | all: 15 | appear: 15 | checked: 15 | form: 15

005/005937.pdf.json
pdf
pdf
pdf
1167
1050
-117
0.945
global: 5 | field: 4 | o: 4 | one: 4 | sep: 4 | forecasts: 3 | monitoring: 3 | of: 3 | the: 3 | to: 3
0: 1 | 012: 1
524/524276.pdf.json
pdf
pdf
pdf
8060
7089
-971
0.936
the: 44 | in: 35 | and: 32 | of: 32 | tanabe: 28 | a: 19 | tsunami: 18 | p: 14 | 1: 12 | 1700: 12

661/661729.pdf.json
pdf
pdf
pdf
15785
9825
-5960
0.767
7: 1045 | 1: 741 | 2: 558 | 3: 414 | 6: 393 | 0: 205 | f: 204 | 27: 202 | 5: 178 | d: 127





-- Content-difference report over the `comparisons` table.
-- Keeps a row only when:
--   * JSON_EX_A/B and SORT_STACK_TRACE_A/B are all NULL,
--   * at least one side has more than 30 tokens, and
--   * overlap is below 0.90 OR the token counts differ by more than 100.
-- Rows are grouped by file extension, then listed from highest to lowest
-- overlap, with larger token-count gaps first among ties.
SELECT
    FILE_PATH,
    FILE_EXTENSION AS "File Extension",
    DETECTED_FILE_EXTENSION_A AS "Detected File Extension A",
    DETECTED_FILE_EXTENSION_B AS "Detected File Extension B",
    TOKEN_COUNT_A AS "Token Count A",
    TOKEN_COUNT_B AS "Token Count B",
    -- Signed delta: negative means B produced fewer tokens than A.
    (TOKEN_COUNT_B-TOKEN_COUNT_A) AS "Token Count B - Token Count A",
    DEC_PATTERN(OVERLAP, '#.###') AS Overlap,
    TOP_10_MORE_IN_A AS "Top 10 Tokens with Higher Counts in A",
    TOP_10_MORE_IN_B AS "Top 10 Tokens with Higher Counts in B"
FROM comparisons
WHERE (
        JSON_EX_A IS NULL
        AND JSON_EX_B IS NULL
        AND SORT_STACK_TRACE_A IS NULL
        AND SORT_STACK_TRACE_B IS NULL
    )
    AND (TOKEN_COUNT_A > 30 OR TOKEN_COUNT_B > 30)
    AND (overlap < 0.90 OR ABS(TOKEN_COUNT_A - TOKEN_COUNT_B) > 100)
ORDER BY
    FILE_EXTENSION,
    overlap DESC,
    ABS(TOKEN_COUNT_A-TOKEN_COUNT_B) DESC