Files with content differing by more than the overlap threshold in tika_1_6 and tika_1_8_SNAPSHOT
FILE_EXTENSION | COUNT |
xls | 303 |
pdf | 98 |
doc | 11 |
txt | 6 |
ppt | 3 |
unk | 2 |
rtf | 1 |
-- Count, per file extension, the comparison rows whose extracted content
-- differs noticeably between the two runs (A and B).
-- A row is counted only when:
--   * JSON_EX_* and SORT_STACK_TRACE_* are NULL on both sides
--     (presumably: neither run recorded an exception -- confirm against schema),
--   * at least one side has more than 30 tokens, and
--   * overlap is below 0.90 or the token counts differ by more than 100.
-- Extensions with the most such rows are listed first.
SELECT
    FILE_EXTENSION,
    count(1) AS COUNT
FROM comparisons
WHERE
    (
        JSON_EX_A IS NULL
        AND JSON_EX_B IS NULL
        AND SORT_STACK_TRACE_A IS NULL
        AND SORT_STACK_TRACE_B IS NULL
    )
    AND (
        TOKEN_COUNT_A > 30
        OR TOKEN_COUNT_B > 30
    )
    AND (
        overlap < 0.90
        OR abs(TOKEN_COUNT_A - TOKEN_COUNT_B) > 100
    )
GROUP BY FILE_EXTENSION
ORDER BY COUNT DESC