Actual file extensions vs. "Detected extensions" in "tika_1_8_SNAPSHOT"

Actual file extension->Detected file extension COUNT
csv->txt
1933
gz->tgz
1567
html->xhtml
1443
unk->txt
766
log->txt
671
txt->eml
457
html->xml
354
f->txt
257
dbase3->bin
193
text->txt
187
dbase3->txt
179
pps->ppt
157
eps->ps
151
text->doc
97
xls->
95
wp->bin
94
unk->bin
82
tex->txt
68
ps->txt
54
kmz->zip
47
troff->txt
39
xml->html
37
ps->tgz
29
hlp->txt
24
html->txt
23
xml->rdf
23
text->ppt
22
sql->txt
19
unk->
19
txt->ps
18
html->css
17
sgml->txt
14
xml->rss
14
html->jpg
13
xml->xhtml
11
gls->bin
10
html->pdf
10
html->doc
8
tmp->txt
8
html->fits
7
ppt->doc
6
tex->bib
6
html->eml
5
html->tiff
5
pdf->txt
5
text->xls
4
pub->txt
3
text->
3
csv->pbm
2
csv->xls
2
data->txt
2
doc->ppt
2
eps->txt
2
html->ps
2
html->tex
2
html->xls
2
text->wmv
2
xls->xlr
2
html->
1
html->elc
1
html->ppt
1
java->html
1
log->html
1
odp->sxi
1
pdf->html
1
ppt->xls
1
py->txt
1
text->iso
1
text->jpg
1
text->ppm
1
text->ppsx
1
text->sit
1
text->xps
1
tmp->doc
1
tmp->html
1
ttf->txt
1
txt->html
1
unk->eml
1
vrml->txt
1
wk1->
1
wp->txt
1




-- Tally every "actual->detected" file-extension pair in comparisons whose two
-- extensions differ, most frequent pair first.
-- Only rows where JSON_EX_B and SORT_STACK_TRACE_B are both null are counted
-- (presumably rows where run B recorded no exception -- TODO confirm).
-- NOTE(review): a NULL DETECTED_FILE_EXTENSION_B makes the <> test UNKNOWN, so
-- such rows are silently excluded; confirm that is intended.
select
    FILE_EXTENSION || '->' || DETECTED_FILE_EXTENSION_B as 'Actual file extension->Detected file extension',
    count(1) as COUNT
from comparisons
where FILE_EXTENSION is not null
    and JSON_EX_B is null
    and SORT_STACK_TRACE_B is null
    and FILE_EXTENSION <> DETECTED_FILE_EXTENSION_B
group by FILE_EXTENSION || '->' || DETECTED_FILE_EXTENSION_B
-- Second sort key: pairs with equal counts come out alphabetically, so the
-- report is reproducible instead of depending on the engine's grouping order.
order by COUNT desc, FILE_EXTENSION || '->' || DETECTED_FILE_EXTENSION_B