Languages detected in tika_1_8_SNAPSHOT by language then content type

LANG_ID1_B DETECTED_CONTENT_TYPE_B COUNT
af
text/plain; charset=ISO-8859-1
24
af
application/pdf
3
af
application/vnd.ms-excel
3
af
application/gzip
2
af
text/plain; charset=windows-1252
2
af
application/xml
1
af
text/html; charset=ISO-8859-1
1
af
text/html; charset=iso-8859-1
1
bg
application/pdf
1
bg
text/html; charset=windows-1251
1
bg
text/plain; charset=KOI8-R
1
bn
application/pdf
111
bn
text/plain; charset=windows-1250
5
bn
text/html; charset=UTF-32LE
1
bn
text/plain; charset=windows-1252
1
bn
text/plain; charset=windows-1253
1
bn
text/plain; charset=windows-1255
1
cs
application/pdf
2
cs
text/plain; charset=ISO-8859-1
2
cs
text/html; charset=IBM500
1
cs
text/html; charset=ISO-8859-1
1
cs
text/html; charset=UTF-8
1
da
text/html; charset=ISO-8859-1
46
da
text/plain; charset=ISO-8859-1
34
da
text/html; charset=iso-8859-1
22
da
text/plain; charset=windows-1252
12
da
application/pdf
5
da
application/vnd.ms-excel
4
da
application/msword
3
da
application/xml
1
da
text/html; charset=utf-8
1
de
text/plain; charset=ISO-8859-1
750
de
text/plain; charset=windows-1252
407
de
application/pdf
119
de
application/vnd.ms-excel
98
de
text/html; charset=ISO-8859-1
74
de
text/html; charset=windows-1252
61
de
text/html; charset=iso-8859-1
11
de
application/msword
8
de
text/html; charset=UTF-8
8
de
application/xhtml+xml
5
de
application/vnd.ms-powerpoint
4
de
application/xml
4
de
text/html; charset=utf-8
4
de
text/plain; charset=GB18030
4
de
application/gzip
2
de
application/vnd.ms-excel.sheet.4
2
de
image/jpeg
1
de
text/html
1
de
text/html; charset=us-ascii
1
el
application/pdf
1
en
application/pdf
50508
en
application/msword
15836
en
text/plain; charset=ISO-8859-1
14402
en
text/html; charset=windows-1252
12982
en
text/html; charset=iso-8859-1
12861
en
text/plain; charset=windows-1252
11982
en
application/vnd.ms-powerpoint
11667
en
text/html; charset=ISO-8859-1
10930
en
text/html; charset=UTF-8
8195
en
application/vnd.ms-excel
7388
en
text/html; charset=utf-8
3187
en
application/xml
1641
en
application/xhtml+xml
1420
en
message/rfc822
446
en
application/xml; charset=UTF-8
353
en
text/html
304
en
application/gzip
293
en
application/rtf
243
en
text/html; charset=us-ascii
222
en
application/vnd.ms-excel.sheet.4
92
en
text/html; charset=Windows-1252
74
en
text/plain; charset=UTF-8
73
en
text/plain; charset=GB18030
69
en
text/x-java-source
53
en
application/zip
48
en
text/html; charset=10646
45
en
application/vnd.google-earth.kml+xml
37
en
application/vnd.openxmlformats-officedocument.presentationml.presentation
33
en
application/xhtml+xml; charset=utf-8
24
en
text/html; charset=
24
en
text/html; charset=GB18030
23
en
text/html; charset=US-ASCII
22
en
application/rdf+xml
20
en
application/vnd.openxmlformats-officedocument.wordprocessingml.document
18
en
text/html; charset=macintosh
18
en
text/plain; charset=ISO-8859-15
14
en
application/rss+xml
12
en
application/xhtml+xml; charset=iso-8859-1
11
en
text/css; charset=ISO-8859-1
10
en
text/html' charset=iso-8859-1
9
en
text/html; charset=IBM437
9
en
text; charset=ISO-8859-1
9
en
text/html; charset=windows-1250
8
en
text/html; charset=windows-1251
7
en
text/plain; charset=EUC-KR
7
en
image/jpeg
6
en
text/html charset=ISO-8859-1
6
en
text/html; charset=8859-1
6
en
text/html; charset=iso8859-1
6
en
text-html; charset=Windows-1252
5
en
text/html; charset=gb2312
5
en
text/html; charset=windows-1256
5
en
text/plain; charset=ISO-8859-5
5
en
text/plain; charset=KOI8-R
5
en
text/plain; charset=Shift_JIS
5
en
texthtml; charset=is0-8859-1
5
en
text/css; charset=iso-8859-1
4
en
text/css
3
en
text/html; charset=EUC-JP
3
en
text/html; charset=WINDOWS-1252
3
en
text/html; charset=iso_8859_1
3
en
text/html; iso-8859-1=
3
en
text/plain; charset=IBM855
3
en
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
2
en
application/x-tika-msworks-spreadsheet
2
en
text/html; chaobjrset=windows-1252
2
en
text/html; charset=IBM866
2
en
text/html; charset=ISO-8859-15
2
en
text/html; charset=Shift_JIS
2
en
text/html; charset=WINDOWS-1251
2
en
text/html; charset=big5
2
en
text/html; charset=iso-8859-15
2
en
text/html; charset=shift_jis
2
en
text/html; charset=unicode
2
en
text/html; charset=windows-1254
2
en
text/html; set=iso-8859-1
2
en
text/plain; charset=ISO-2022-JP
2
en
 end-functional polystyrene, interdiffusion, neutron reflectometry, surface, thin film , Diffusion, Reflectometry, Thin Films; c
1
en
Public Affairs Officer, USAID/WBG
1
en
This USAID/Timor-Leste page describes the programmatic activities of USAID in Timor-Leste.
1
en
application/vnd.ms-excel.sheet.3
1
en
application/vnd.openxmlformats-officedocument.presentationml.slideshow
1
en
application/xhtml+xml; charset=UTF-8
1
en
noindex
1
en
text/html   charset=iso-8859-1
1
en
text/html+xml; charset=UTF-8
1
en
text/html/ charset=iso-8859-1
1
en
text/html; charset="iso-8859\<sup\>-1\<\/sup\>"
1
en
text/html; charset=IBM855
1
en
text/html; charset=ISO-2022-JP
1
en
text/html; charset=ISO-8859-9
1
en
text/html; charset=KOI8-R
1
en
text/html; charset=UTF-16LE
1
en
text/html; charset=euc-kr
1
en
text/html; charset=iso-10646
1
en
text/html; charset=iso-8859-1; macromedia dreamweaver 4.0=
1
en
text/html; charset=iso-8859-2
1
en
text/html; charset=ks_c_5601-1987
1
en
text/html; charset=x-mac-roman
1
en
text/plain; charset=IBM866
1
en
text/plain; charset=windows-1251
1
en
text/plain; charset=windows-1255
1
es
application/pdf
400
es
text/plain; charset=ISO-8859-1
169
es
text/html; charset=iso-8859-1
157
es
text/html; charset=UTF-8
125
es
application/msword
118
es
text/html; charset=windows-1252
108
es
text/plain; charset=windows-1252
90
es
application/vnd.ms-excel
73
es
text/html; charset=ISO-8859-1
69
es
application/vnd.ms-powerpoint
49
es
text/html; charset=utf-8
44
es
application/xhtml+xml
16
es
text/html
10
es
application/xml
4
es
message/rfc822
3
es
text/plain; charset=ISO-8859-15
3
es
text/plain; charset=UTF-8
3
es
application/rtf
2
es
application/xhtml+xml; charset=utf-8
1
es
text/html; charset=Windows-1252
1
es
text/html; charset=us-ascii
1
es
text; charset=ISO-8859-1
1
et
text/plain; charset=ISO-8859-1
16
et
text/plain; charset=windows-1252
6
et
application/pdf
4
et
text/html; charset=iso-8859-1
3
et
text/html
2
et
application/vnd.ms-excel
1
fi
text/plain; charset=windows-1252
3
fr
text/plain; charset=ISO-8859-1
261
fr
text/plain; charset=windows-1252
155
fr
application/pdf
145
fr
text/html; charset=ISO-8859-1
51
fr
text/html; charset=windows-1252
42
fr
text/html; charset=UTF-8
30
fr
text/html; charset=iso-8859-1
24
fr
application/msword
17
fr
application/vnd.ms-excel
13
fr
application/vnd.ms-powerpoint
6
fr
application/gzip
4
fr
text/html; charset=utf-8
2
fr
application/vnd.ms-excel.sheet.4
1
fr
application/xml
1
he
application/pdf
1
hi
application/pdf
2
hi
text/plain; charset=windows-1252
1
hr
application/pdf
7
hr
application/msword
4
hr
text/plain; charset=ISO-8859-1
4
hr
text/html; charset=ISO-8859-1
3
hr
text/html; charset=windows-1250
3
hr
text/plain; charset=windows-1252
3
hr
text/html; charset=UTF-8
2
hr
text/html; charset=utf-8
1
hu
text/plain; charset=ISO-8859-1
64
hu
text/plain; charset=windows-1252
48
hu
application/vnd.ms-excel
9
hu
application/pdf
5
hu
text/html; charset=windows-1252
3
hu
text/html; charset=ISO-8859-1
2
hu
text/html; charset=utf-8
2
hu
text/html; charset=UTF-8
1
hu
text/html; charset=iso-8859-1
1
id
text/plain; charset=ISO-8859-1
28
id
text/html; charset=ISO-8859-1
19
id
text/plain; charset=windows-1252
15
id
application/pdf
14
id
text/html; charset=windows-1252
14
id
application/msword
5
id
application/vnd.ms-excel
5
id
text/html; charset=utf-8
2
id
text/html; charset=iso-8859-1
1
it
application/pdf
52
it
text/plain; charset=ISO-8859-1
51
it
text/html; charset=iso-8859-1
34
it
text/plain; charset=windows-1252
33
it
text/html; charset=windows-1252
25
it
text/html; charset=ISO-8859-1
12
it
text/html; charset=utf-8
8
it
application/vnd.ms-excel
7
it
application/msword
4
it
application/gzip
2
it
application/vnd.ms-powerpoint
2
it
application/xml
2
it
text/html
2
it
application/xhtml+xml
1
it
text/html; charset=UTF-8
1
ja
text/html; charset=iso-2022-jp
1
ko
application/pdf
2
ko
text/plain; charset=UTF-8
1
lt
application/pdf
10
lt
text/html; charset=UTF-8
2
lt
text/html; charset=windows-1252
2
lt
text/plain; charset=windows-1252
1
lv
application/vnd.ms-excel
1
mk
application/pdf
1
ne
application/msword
1
nl
application/vnd.ms-excel
40
nl
text/plain; charset=ISO-8859-1
13
nl
text/html; charset=ISO-8859-1
8
nl
application/pdf
6
nl
text/plain; charset=windows-1252
6
nl
text/html; charset=iso-8859-1
3
nl
text/html; charset=windows-1252
3
nl
text/html; charset=UTF-8
1
nl
text/html; charset=utf-8
1
nl
text/plain; charset=UTF-8
1
no
text/plain; charset=ISO-8859-1
12
no
application/vnd.ms-excel
6
no
text/plain; charset=windows-1252
2
no
application/pdf
1
no
text/html; charset=windows-1252
1
pl
text/plain; charset=windows-1252
91
pl
text/plain; charset=ISO-8859-1
59
pl
application/pdf
7
pl
application/vnd.ms-excel
5
pl
application/msword
2
pl
text/html; charset=ISO-8859-1
2
pl
text/html; charset=UTF-8
2
pl
application/xml
1
pl
text/html; charset=windows-1252
1
pl
text/plain; charset=windows-1250
1
pt
text/plain; charset=ISO-8859-1
52
pt
application/pdf
36
pt
text/plain; charset=windows-1252
32
pt
text/html; charset=iso-8859-1
12
pt
application/vnd.ms-excel
10
pt
application/msword
9
pt
text/html; charset=windows-1252
9
pt
text/html; charset=ISO-8859-1
7
pt
text/html; charset=UTF-8
4
pt
text/html; charset=utf-8
2
pt
application/gzip
1
pt
application/vnd.ms-powerpoint
1
pt
application/xml
1
pt
message/rfc822
1
pt
text/plain; charset=IBM500
1
ro
text/plain; charset=ISO-8859-1
61
ro
text/html; charset=ISO-8859-1
14
ro
application/vnd.ms-excel
13
ro
text/plain; charset=windows-1252
11
ro
application/pdf
9
ro
text/html; charset=UTF-8
3
ro
text/html; charset=iso-8859-2
3
ro
application/vnd.ms-powerpoint
2
ro
text/html; charset=iso-8859-1
2
ro
text/html; charset=windows-1252
2
ro
application/msword
1
ro
application/xml
1
ro
text/html; charset=utf-8
1
ro
text/html; charset=windows-1251
1
ru
application/pdf
6
ru
application/msword
5
ru
text/html; charset=UTF-8
1
ru
text/html; charset=windows-1251
1
sk
text/plain; charset=ISO-8859-1
20
sk
text/plain; charset=windows-1252
7
sk
application/pdf
4
sk
application/vnd.ms-excel
2
sk
text/html; charset=windows-1252
1
sk
text/plain; charset=windows-1250
1
sl
text/html; charset=ISO-8859-1
16
sl
text/plain; charset=ISO-8859-1
7
sl
text/plain; charset=windows-1252
6
sl
application/msword
2
sl
application/pdf
2
sl
text/html; charset=windows-1252
2
sl
application/xml
1
so
text/plain; charset=ISO-8859-1
107
so
application/vnd.ms-excel
26
so
text/plain; charset=windows-1252
22
so
application/pdf
6
so
text/html; charset=ISO-8859-1
5
so
application/msword
1
so
text/html; charset=UTF-8
1
so
text/html; charset=windows-1252
1
sq
text/plain; charset=ISO-8859-1
489
sq
text/plain; charset=windows-1252
23
sq
text/html; charset=ISO-8859-1
10
sq
application/pdf
7
sq
text/html; charset=windows-1252
4
sq
application/xml
2
sq
text/html; charset=UTF-8
1
sv
text/plain; charset=ISO-8859-1
19
sv
text/html; charset=ISO-8859-1
9
sv
application/gzip
5
sv
text/html; charset=iso-8859-1
4
sv
text/plain; charset=windows-1252
3
sv
application/pdf
2
sv
application/xml
2
sv
text/html; charset=utf-8
2
sv
application/vnd.ms-excel
1
sv
text/html; charset=macintosh
1
sv
text/html; charset=windows-1252
1
sw
text/html; charset=ISO-8859-1
55
sw
text/plain; charset=ISO-8859-1
17
sw
text/plain; charset=windows-1252
6
sw
text/html; charset=windows-1252
4
sw
application/msword
1
th
application/pdf
4
th
application/vnd.ms-powerpoint
1
th
text/html; charset=UTF-8
1
th
text/plain; charset=windows-1255
1
tl
text/plain; charset=ISO-8859-1
24
tl
application/pdf
16
tl
text/html; charset=ISO-8859-1
10
tl
text/plain; charset=windows-1252
10
tl
text/html; charset=windows-1252
5
tl
application/vnd.ms-excel
3
tl
text/html; charset=utf-8
3
tl
application/gzip
2
tl
application/msword
2
tl
application/xml
2
tl
text/html
1
tr
application/pdf
10
tr
text/html; charset=UTF-8
6
tr
text/plain; charset=ISO-8859-1
5
tr
application/msword
1
vi
text/plain; charset=ISO-8859-1
175
vi
application/pdf
39
vi
text/plain; charset=windows-1252
22
vi
text/html; charset=UTF-8
11
vi
text/html; charset=iso-8859-1
6
vi
application/msword
5
vi
application/gzip
4
vi
application/vnd.ms-excel
4
vi
application/xml
3
vi
text/html; charset=utf-8
3
vi
text/html; charset=windows-1252
3
vi
image/jpeg
1
vi
text/html
1
vi
text/html; charset=ISO-8859-1
1
vi
text/html; charset=csVISCII
1
vi
text/plain; charset=UTF-8
1
zh-cn
text/html; charset=UTF-16
17
zh-cn
application/msword
9
zh-cn
text/html; charset=utf-8
1
zh-cn
text/html; charset=windows-1252
1




select LANG_ID1_B, DETECTED_CONTENT_TYPE_B, count(1) as COUNT from comparisons where LANG_ID1_B is not null group by DETECTED_CONTENT_TYPE_B, LANG_ID1_B order by lang_ID1_B, COUNT desc, DETECTED_CONTENT_TYPE_B;