Files with content differing by more than the overlap threshold in PDFBox_1_8_9_20150316_single_thread and pdfbox_1_8_9_20150316

FILE_PATH FILE_EXTENSION DETECTED_FILE_EXTENSION_A DETECTED_FILE_EXTENSION_B TOKEN_COUNT_A TOKEN_COUNT_B Token Count B - Token Count A OVERLAP TOP_10_MORE_IN_A TOP_10_MORE_IN_B
524/524061.pdf.json
pdf
pdf
pdf
8768
8662
-106
0.994
5: 5 | cv: 4 | oo: 4 | 2: 3 | 32: 3 | t: 3 | 2032: 2 | 2037: 2 | 2042: 2 | 2047: 2

005/005937.pdf.json
pdf
pdf
pdf
1167
1050
-117
0.945
global: 5 | field: 4 | o: 4 | one: 4 | sep: 4 | forecasts: 3 | monitoring: 3 | of: 3 | the: 3 | to: 3
0: 1 | 012: 1
524/524276.pdf.json
pdf
pdf
pdf
8060
7089
-971
0.936
the: 44 | in: 35 | and: 32 | of: 32 | tanabe: 28 | a: 19 | tsunami: 18 | p: 14 | 1: 12 | 1700: 12





-- Report query: rows of `comparisons` where none of JSON_EX_A/B and
-- SORT_STACK_TRACE_A/B is set, at least one side has more than 30 tokens,
-- and either the overlap is below 0.90 or the token counts differ by more
-- than 100.
-- NOTE(review): DEC_PATTERN is not defined here; presumably it formats
-- OVERLAP using the '#.###' decimal pattern -- confirm against its definition.
SELECT
    FILE_PATH,
    FILE_EXTENSION                  AS "File Extension",
    DETECTED_FILE_EXTENSION_A       AS "Detected File Extension A",
    DETECTED_FILE_EXTENSION_B       AS "Detected File Extension B",
    TOKEN_COUNT_A                   AS "Token Count A",
    TOKEN_COUNT_B                   AS "Token Count B",
    (TOKEN_COUNT_B - TOKEN_COUNT_A) AS "Token Count B - Token Count A",
    DEC_PATTERN(OVERLAP, '#.###')   AS Overlap,
    TOP_10_MORE_IN_A                AS "Top 10 Tokens with Higher Counts in A",
    TOP_10_MORE_IN_B                AS "Top 10 Tokens with Higher Counts in B"
FROM comparisons
WHERE
    (
        JSON_EX_A IS NULL
        AND JSON_EX_B IS NULL
        AND SORT_STACK_TRACE_A IS NULL
        AND SORT_STACK_TRACE_B IS NULL
    )
    -- at least one side must carry more than 30 tokens
    AND (TOKEN_COUNT_A > 30 OR TOKEN_COUNT_B > 30)
    -- keep a row when overlap is low or the token-count gap is large
    AND (overlap < 0.90 OR ABS(TOKEN_COUNT_A - TOKEN_COUNT_B) > 100)
ORDER BY
    FILE_EXTENSION,
    overlap DESC,
    ABS(TOKEN_COUNT_A - TOKEN_COUNT_B) DESC