Files whose content differs between tika_1_6 and tika_1_8_SNAPSHOT beyond the overlap threshold (overlap below 0.90, or token counts differing by more than 100), counted by file extension

FILE_EXTENSION COUNT
xls
303
pdf
98
doc
11
txt
6
ppt
3
unk
2
rtf
1




-- Counts rows of `comparisons` per FILE_EXTENSION, largest count first.
-- A row is counted only when every condition below holds.
select
    FILE_EXTENSION,
    count(*) as COUNT
from comparisons
where
    -- all four exception / stack-trace columns are null
    JSON_EX_A is null
    and JSON_EX_B is null
    and SORT_STACK_TRACE_A is null
    and SORT_STACK_TRACE_B is null
    -- at least one side has more than 30 tokens
    and (TOKEN_COUNT_A > 30 or TOKEN_COUNT_B > 30)
    -- overlap is below 0.90, or the token counts are more than 100 apart
    and (
        overlap < 0.90
        or abs(TOKEN_COUNT_A - TOKEN_COUNT_B) > 100
    )
group by FILE_EXTENSION
order by COUNT desc