anonym said:
I am looking for an available Java function or library that takes a
sentence or a text as input and outputs whether the text is in
English or not.
You can look at monograph (single-letter) or digraph (letter-pair)
frequencies and make a guess based on those.
I did some experiments a long time ago.
See the C snippet below for some ideas.
Arne
=====================================================
// Monograph R/I/O analysis: compare the case-folded counts of the letters
// R, I and O pairwise and adjust the per-language scores accordingly.
// NOTE(review): f[] is presumably a per-character occurrence table and
// indicator[] a per-language score array -- confirm against the
// surrounding code, which is not part of this snippet.

// R outnumbers I
if ((f['i'] + f['I']) < (f['r'] + f['R'])) {
    indicator[DK] += 1;
    indicator[FR] -= 1;
}

// O outnumbers R
if ((f['R'] + f['r']) < (f['O'] + f['o'])) {
    indicator[UK] += 1;
    indicator[ES] += 1;
    indicator[DK] -= 1;
}

// I outnumbers O
if ((f['O'] + f['o']) < (f['I'] + f['i'])) {
    indicator[DE] += 1;
    indicator[UK] -= 1;
    indicator[ES] -= 1;
}
// Characteristic digraph analysis: each test sums the table entries for
// one letter pair in three spellings (lower/lower, UPPER/UPPER and
// Upper/lower) and fires when that sum exceeds 0.01*l.
// NOTE(review): ff[] is presumably a letter-pair occurrence table indexed
// by first*256+second, and l presumably the length of the analysed text
// -- confirm against the surrounding code.

// "th"
if (0.01 * l < (ff['t' * 256 + 'h'] + ff['T' * 256 + 'H'] + ff['T' * 256 + 'h'])) {
    indicator[UK] += 1;
    indicator[DK] -= 1;
    indicator[FR] -= 1;
    indicator[DE] -= 1;
    indicator[ES] -= 1;
}

// "ch"
if (0.01 * l < (ff['c' * 256 + 'h'] + ff['C' * 256 + 'H'] + ff['C' * 256 + 'h'])) {
    indicator[DE] += 1;
    indicator[DK] -= 1;
    indicator[FR] -= 1;
    indicator[ES] -= 1;
}

// "ou"
if (0.01 * l < (ff['o' * 256 + 'u'] + ff['O' * 256 + 'U'] + ff['O' * 256 + 'u'])) {
    indicator[UK] += 1;
    indicator[FR] += 1;
    indicator[DE] -= 1;
    indicator[DK] -= 1;
    indicator[ES] -= 1;
}

// "nt"
if (0.01 * l < (ff['n' * 256 + 't'] + ff['N' * 256 + 'T'] + ff['N' * 256 + 't'])) {
    indicator[FR] += 1;
    indicator[UK] -= 1;
    indicator[DE] -= 1;
    indicator[ES] -= 1;
}

// "ue"
if (0.01 * l < (ff['u' * 256 + 'e'] + ff['U' * 256 + 'E'] + ff['U' * 256 + 'e'])) {
    indicator[ES] += 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[FR] -= 1;
    indicator[DE] -= 1;
}

// "la"
if (0.01 * l < (ff['l' * 256 + 'a'] + ff['L' * 256 + 'A'] + ff['L' * 256 + 'a'])) {
    indicator[ES] += 1;
    indicator[DK] -= 1;
    indicator[FR] -= 1;
    indicator[DE] -= 1;
}
// Rarely-used letter analysis: J, K, W and Y are each tested on their
// case-folded count; a test fires when that count exceeds 0.01*l.
// NOTE(review): f[] is presumably a per-character occurrence table and l
// the length of the analysed text -- confirm against the surrounding code.

// J
if (0.01 * l < (f['j'] + f['J'])) {
    indicator[DE] -= 1;
}

// K
if (0.01 * l < (f['k'] + f['K'])) {
    indicator[DK] += 1;
    indicator[FR] -= 1;
    indicator[ES] -= 1;
}

// W
if (0.01 * l < (f['w'] + f['W'])) {
    indicator[UK] += 1;
    indicator[DE] += 1;
    indicator[FR] -= 1;
    indicator[ES] -= 1;
}

// Y
if (0.01 * l < (f['y'] + f['Y'])) {
    indicator[UK] += 1;
    indicator[FR] -= 1;
    indicator[DE] -= 1;
}
// Special character analysis: a single occurrence of any character in a
// group is enough to fire that group's test.
// NOTE(review): UCHAR() is defined outside this snippet; it presumably
// converts the character constant to an unsigned index into f[], which
// would also imply a single-byte source encoding -- confirm.

// Danish letters: AE ligature, O with stroke, A with ring
if (0 < (f[UCHAR('Æ')] + f[UCHAR('Ø')] + f[UCHAR('Å')] +
         f[UCHAR('æ')] + f[UCHAR('ø')] + f[UCHAR('å')])) {
    indicator[DK] += 1;
    indicator[UK] -= 1;
    indicator[FR] -= 1;
    indicator[DE] -= 1;
    indicator[ES] -= 1;
}

// German umlauts
if (0 < (f[UCHAR('Ä')] + f[UCHAR('Ö')] + f[UCHAR('Ü')] +
         f[UCHAR('ä')] + f[UCHAR('ö')] + f[UCHAR('ü')])) {
    indicator[DE] += 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[FR] -= 1;
    indicator[ES] -= 1;
}

// Acute accents on E, I, O
if (0 < (f[UCHAR('É')] + f[UCHAR('Í')] + f[UCHAR('Ó')] +
         f[UCHAR('é')] + f[UCHAR('í')] + f[UCHAR('ó')])) {
    indicator[FR] += 1;
    indicator[ES] += 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[DE] -= 1;
}

// Spanish N with tilde
if (0 < (f[UCHAR('Ñ')] + f[UCHAR('ñ')])) {
    indicator[ES] += 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[FR] -= 1;
    indicator[DE] -= 1;
}

// French C with cedilla
if (0 < (f[UCHAR('Ç')] + f[UCHAR('ç')])) {
    indicator[FR] += 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[DE] -= 1;
    indicator[ES] -= 1;
}

// German sharp s
if (0 < f[UCHAR('ß')]) {
    indicator[DE] += 1;
    indicator[FR] -= 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[ES] -= 1;
}

// Grave accents on A, E, O (ES is left untouched here)
if (0 < (f[UCHAR('À')] + f[UCHAR('È')] + f[UCHAR('Ò')] +
         f[UCHAR('à')] + f[UCHAR('è')] + f[UCHAR('ò')])) {
    indicator[FR] += 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[DE] -= 1;
}

// Circumflex accents on E, I, O (ES is left untouched here)
if (0 < (f[UCHAR('Ê')] + f[UCHAR('Î')] + f[UCHAR('Ô')] +
         f[UCHAR('ê')] + f[UCHAR('î')] + f[UCHAR('ô')])) {
    indicator[FR] += 1;
    indicator[DK] -= 1;
    indicator[UK] -= 1;
    indicator[DE] -= 1;
}