-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlanguageDetection.py
80 lines (66 loc) · 2.72 KB
/
languageDetection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from nltk.util import trigrams as nltk_trigrams
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.probability import FreqDist
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat
'''
from http://misja.posterous.com/language-detection-with-python-nltk?utm_source=twitterfeed&utm_medium=twitter
'''
class LangIdCorpusReader(CorpusReader):
    '''
    LangID corpus reader.

    Reads trigram frequency files in which every line holds a frequency
    count followed by a single space and the trigram itself.
    '''
    CorpusView = StreamBackedCorpusView

    def _get_trigram_weight(self, line):
        '''
        Split a line in a trigram and its frequency count.

        Returns a ``(trigram, count)`` tuple, or ``None`` when the stripped
        line does not consist of exactly two space-separated fields (for
        example a blank line, or the empty string read at end of file).
        '''
        data = line.strip().split(' ')
        if len(data) == 2:
            return (data[1], int(data[0]))
        return None

    def _read_trigram_block(self, stream):
        '''
        Read a block of trigram frequencies.

        Reads up to 20 lines from ``stream`` and returns the parsed
        ``(trigram, count)`` tuples as a list; lines that cannot be parsed
        are skipped.
        '''
        freqs = []
        for _ in range(20):  # Read 20 lines at a time.
            entry = self._get_trigram_weight(stream.readline())
            if entry is not None:
                freqs.append(entry)
        # A real list is returned on purpose: the corpus view takes the
        # length of each block, which a lazy ``filter`` object (Python 3)
        # does not support.
        return freqs

    def freqs(self, fileids=None):
        '''
        Return trigram frequencies for a language from the corpus.

        Builds one corpus view per file selected by ``fileids`` and returns
        their concatenation; the items are ``(trigram, count)`` tuples.
        '''
        return concat([self.CorpusView(path, self._read_trigram_block)
                       for path in self.abspaths(fileids=fileids)])
class LangDetect(object):
    '''
    Trigram based language detector.

    Every configured language gets a frequency distribution of character
    trigrams loaded from the ``langid`` corpus; a text is scored against
    each distribution and the best scoring language code is returned.
    '''
    # Kept as a class attribute for backward compatibility only. Each
    # instance rebinds its own dict in __init__ so that instances no longer
    # share (and accumulate) each other's language profiles.
    language_trigrams = {}
    langid = LazyCorpusLoader('langid', LangIdCorpusReader,
                              r'(?!\.).*\.txt')

    def __init__(self, languages=('nl', 'en', 'fr', 'de', 'es')):
        '''
        Load the trigram profile of every language code in ``languages``.

        For each code ``<lang>`` the corpus file ``<lang>-3grams.txt`` is
        read through the ``langid`` corpus loader.
        '''
        self.language_trigrams = {}
        for lang in languages:
            profile = FreqDist()
            for trigram, count in self.langid.freqs(
                    fileids=lang + "-3grams.txt"):
                # Item assignment instead of FreqDist.inc(): inc() is not
                # available on the Counter based FreqDist of NLTK 3.
                profile[trigram] += count
            self.language_trigrams[lang] = profile

    def detect(self, text):
        '''
        Detect the text's language.

        Returns the language code with the highest score, or ``None`` when
        no language profiles are loaded.
        NOTE: when ``text`` yields no trigrams every score stays 0 and the
        first configured language is returned.
        '''
        if not self.language_trigrams:
            return None

        # Count the trigrams of every token of the lower-cased text.
        trigrams = {}
        for word in nltk_word_tokenize(text.lower()):
            for trigram in self.get_word_trigrams(word):
                trigrams[trigram] = trigrams.get(trigram, 0) + 1
        total = float(sum(trigrams.values()))

        scores = dict((lang, 0) for lang in self.language_trigrams)
        for lang, frequencies in self.language_trigrams.items():
            lang_total = float(frequencies.N())
            if not lang_total:
                # Empty profile: keep the score at 0 instead of dividing
                # by zero.
                continue
            for trigram, count in trigrams.items():
                # normalize and add to the total score
                scores[lang] += ((float(frequencies[trigram]) / lang_total) *
                                 (float(count) / total))
        # max() returns the first best entry, which is the same entry a
        # stable descending sort would put in front.
        return max(scores.items(), key=lambda item: item[1])[0]

    def get_word_trigrams(self, match):
        '''
        Return the character trigrams of the string ``match`` as a list of
        three-character strings.
        '''
        return [''.join(trigram) for trigram in nltk_trigrams(match)
                if trigram is not None]