Actual file extensions vs. "Detected extensions" in "tika_1_6"
Actual file extension->Detected file extension | COUNT |
csv->txt
| 1933
|
gz->tgz
| 1567
|
html->xhtml
| 1443
|
unk->txt
| 765
|
log->txt
| 671
|
txt->eml
| 457
|
html->xml
| 354
|
f->txt
| 257
|
dbase3->bin
| 193
|
text->txt
| 187
|
dbase3->txt
| 179
|
pps->ppt
| 155
|
eps->ps
| 151
|
text->doc
| 97
|
unk->bin
| 94
|
wp->bin
| 94
|
tex->txt
| 68
|
ps->txt
| 54
|
kmz->zip
| 47
|
troff->txt
| 39
|
xml->html
| 37
|
ps->tgz
| 29
|
hlp->txt
| 24
|
html->txt
| 23
|
xml->rdf
| 23
|
text->ppt
| 22
|
sql->txt
| 19
|
txt->ps
| 18
|
unk->
| 18
|
html->css
| 17
|
sgml->txt
| 14
|
xml->rss
| 14
|
html->jpg
| 13
|
xml->xhtml
| 11
|
gls->bin
| 10
|
html->pdf
| 10
|
html->doc
| 8
|
tmp->txt
| 8
|
html->fits
| 7
|
ppt->doc
| 6
|
tex->bib
| 6
|
html->eml
| 5
|
html->tiff
| 5
|
pdf->txt
| 5
|
txt->emlx
| 5
|
text->xls
| 4
|
pub->txt
| 3
|
text->
| 3
|
csv->pbm
| 2
|
csv->xls
| 2
|
data->txt
| 2
|
doc->ppt
| 2
|
eps->txt
| 2
|
html->ps
| 2
|
html->tex
| 2
|
html->xls
| 2
|
text->wmv
| 2
|
xls->xlr
| 2
|
html->
| 1
|
html->elc
| 1
|
html->ppt
| 1
|
java->html
| 1
|
log->html
| 1
|
odp->sxi
| 1
|
pdf->html
| 1
|
ppt->xls
| 1
|
py->txt
| 1
|
text->iso
| 1
|
text->jpg
| 1
|
text->ppm
| 1
|
text->ppsx
| 1
|
text->sit
| 1
|
text->xps
| 1
|
tmp->doc
| 1
|
tmp->html
| 1
|
ttf->txt
| 1
|
txt->html
| 1
|
unk->eml
| 1
|
unk->emlx
| 1
|
vrml->txt
| 1
|
wk1->
| 1
|
wp->txt
| 1
|
-- Counts files whose actual extension differs from the detected extension,
-- one row per (actual, detected) pair, most frequent mismatches first.
-- NOTE(review): the JSON_EX_A / SORT_STACK_TRACE_A null checks presumably
-- restrict the report to files processed without an exception -- confirm.
select
    FILE_EXTENSION || '->' || DETECTED_FILE_EXTENSION_A
        as "Actual file extension->Detected file extension",
    count(*) as COUNT
from comparisons
where FILE_EXTENSION is not null
    and JSON_EX_A is null
    and SORT_STACK_TRACE_A is null
    -- `<>` is never true when either side is NULL, so rows with a NULL
    -- detected extension are excluded by this predicate as well.
    and FILE_EXTENSION <> DETECTED_FILE_EXTENSION_A
-- Group on the underlying columns rather than the concatenated label so that
-- distinct pairs can never collapse into the same display string.
group by
    FILE_EXTENSION,
    DETECTED_FILE_EXTENSION_A
-- Secondary keys make the order of equal-count rows deterministic.
order by
    COUNT desc,
    FILE_EXTENSION,
    DETECTED_FILE_EXTENSION_A