languageDetection.py

from nltk.util import trigrams as nltk_trigrams
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.probability import FreqDist
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.util import StreamBackedCorpusView, concat

'''
from http://misja.posterous.com/language-detection-with-python-nltk?utm_source=twitterfeed&utm_medium=twitter
'''

class LangIdCorpusReader(CorpusReader):
    '''
    LangID corpus reader
    '''
    CorpusView = StreamBackedCorpusView

    def _get_trigram_weight(self, line):
        '''
        Split a line in a trigram and its frequency count
        '''
        data = line.strip().split(' ')
        if len(data) == 2:
            return (data[1], int(data[0]))

    def _read_trigram_block(self, stream):
        '''
        Read a block of trigram frequencies
        '''
        freqs = []
        for i in range(20): # Read 20 lines at a time.
            freqs.append(self._get_trigram_weight(stream.readline()))
        return filter(lambda x: x != None, freqs)

    def freqs(self, fileids=None):
        '''
        Return trigram frequencies for a language from the corpus        
        '''
        return concat([self.CorpusView(path, self._read_trigram_block) 
                       for path in self.abspaths(fileids=fileids)])

class LangDetect(object):
    language_trigrams = {}
    langid            = LazyCorpusLoader('langid', LangIdCorpusReader,
r'(?!\.).*\.txt')

    def __init__(self, languages=['nl', 'en', 'fr', 'de', 'es']):
        for lang in languages:
            self.language_trigrams[lang] = FreqDist()
            for f in self.langid.freqs(fileids=lang+"-3grams.txt"):
                self.language_trigrams[lang].inc(f[0], f[1])

    def detect(self, text):
        '''
        Detect the text's language
        '''
        words    = nltk_word_tokenize(text.lower())
        trigrams = {}
        scores   = dict([(lang, 0) for lang in self.language_trigrams.keys()])

        for match in words:
            for trigram in self.get_word_trigrams(match):
                if not trigram in trigrams.keys():
                    trigrams[trigram] = 0
                trigrams[trigram] += 1

        total = sum(trigrams.values())

        for trigram, count in trigrams.items():
            for lang, frequencies in self.language_trigrams.items():
                # normalize and add to the total score
                scores[lang] += (float(frequencies[trigram]) /
float(frequencies.N())) * (float(count) / float(total))

        return sorted(scores.items(), key=lambda x: x[1], reverse=True)[0][0]

    def get_word_trigrams(self, match):
        return [''.join(trigram) for trigram in nltk_trigrams(match) if
trigram != None]