annak
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

collocations.py
text/x-python

Download raw (1.2 KB)

#! /usr/bin/env python2


# Finds bigrams (word couples) in a text
#
# USAGE
#
# python collocations.py [-n NUMBER] TEXT     # NUMBER of bigrams to return (default 20)
# python collocations.py "file.txt"     # if the text file is in the current directory
# python collocations.py "../books-AK/Ice/Ice.txt"     # text folder from project directory


import os
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.collocations import BigramCollocationFinder
from optparse import OptionParser


def collocations(text, n=20):
    """
    Returns the n best-scoring bigrams (word couples) found in a text file.

    text -- path to a plain-text file; it is split into a directory and a
            file name, which are handed to NLTK's PlaintextCorpusReader.
    n    -- maximum number of bigrams to return (default 20).

    Words shorter than three characters and English stopwords (compared
    lower-cased) are filtered out, then the remaining bigrams are ranked
    with the likelihood-ratio association measure.

    Returns the list produced by BigramCollocationFinder.nbest().

    NOTE(review): presumably requires the NLTK 'stopwords' corpus data to
    be available locally -- confirm for the deployment environment.
    """
    (txt_dir, txt_name) = os.path.split(text)
    # A bare file name yields an empty directory part; name the current
    # directory explicitly instead of passing '' as the corpus root.
    corpus = PlaintextCorpusReader(txt_dir or os.curdir, [txt_name])

    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # Finds bigrams; the corpus words are iterated directly, no intermediate
    # copy of the whole token sequence is needed.
    finder = BigramCollocationFinder.from_words(corpus.words(txt_name))

    # Built once as a set: the filter below runs for every token, and a list
    # would be scanned linearly on each membership test.
    ignored_words = frozenset(nltk.corpus.stopwords.words('english'))
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)

    return finder.nbest(bigram_measures.likelihood_ratio, n)


# Command-line entry point: prints the best bigrams of the text file given as
# the first positional argument.
if __name__ == '__main__':
    parser = OptionParser(usage="usage: %prog [options] TEXT")
    parser.add_option("-n", "--number", type="int", dest="number", default=20,
                      help="number of bigrams to print (default: %default)")
    (options, args) = parser.parse_args()

    # Without a TEXT argument args[0] would raise a bare IndexError; report a
    # proper usage error instead. Extra positional arguments are still ignored.
    if not args:
        parser.error("missing TEXT argument (path to a text file)")

    bigrams = collocations(args[0], n=options.number)
    print(bigrams)