vj12

vj12.py

# -*- coding: utf-8 -*-

"""
This is the main script for the vj12 publication websites. It runs on Bottle,
a micro web framework written in Python. See <http://bottlepy.org/>.
"""


from __future__ import division


import codecs
import os
import random
import re

from glob import glob
from json import dumps

import bottle
from bottle import run, route, template, static_file
import nltk
from nltk.corpus import PlaintextCorpusReader


PROJECT_DIR = os.path.abspath(os.path.dirname(__file__))
CORPUS_ROOT = os.path.join(PROJECT_DIR, 'texts')
STATIC_DIR = os.path.join(PROJECT_DIR, '..', 'static')

def pick_ascii(light=False):
    if light:
        fn = 'ascii-simple.tpl'
    else:
        fn = random.choice(["ascii.tpl", "ascii2.tpl", "ascii3.tpl", "ascii4.tpl", "ascii5.tpl"])
    f = open("templates/ascii/%s" % fn, "r") 
    background = f.read()
    f.close()
    return background

@route('/')
def home():
    return template('templates/home', background = pick_ascii())


@route('/about/')
def about():
    return template('templates/about', background = pick_ascii())


@route('/kaleidosmatch/')
def kaleidosmatch():
    return template('templates/kaleidosmatch', background = pick_ascii())

@route('/fit_the_annual_report_for_purpose/')
def fit_the_annual_report_for_purpose():
    return template('templates/fit_the_annual_report_for_purpose', background=pick_ascii())

@route('/macro/')
def macro():
    return template('templates/macro', background=pick_ascii(light=True))

@route('/micromacro/')
def micromacro():
    return template('templates/micro', background = pick_ascii())

@route('/moss_ambiguity/')
def moss_ambiguity():
    return template('templates/moss_ambiguity', background = pick_ascii())


# BELOW: EXPERIMENTS


@route('/context')
@route('/context/:filename/:word')
def context(filename = False, word = False):
    """
    An interface to look at permuted indices.
    """
    found_lines = []
    head_length = 0

    if word and re.match("^[a-zA-Z0-9_\(\),\.]+$", filename) and re.match("^[a-zA-Z0-9_]+$", word):
        handler = codecs.open("texts/%s.txt" % filename, "r", "utf-8")
        
        for line in handler:
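            # Capture up to four words of context before (head) and after (tail)
            # each occurrence of the target word (body) on this line.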
            pattern = "(?P<head>^|^\W+|(\w+\W+){1,4})(?P<body>%s)(?P<tail>(\W+\w+){1,4}|\W+$|$)" % word
            match = re.search(pattern, line)
            while match:
                head_length = len (match.group ('head')) if len(match.group ('head')) > head_length else head_length
                found_lines.append ({'head': match.group ('head'), 'body': match.group ('body'), 'tail': match.group ('tail')})
                line = line[match.end('body'):]
                match = re.search (pattern, line)
        
        for x, line in enumerate(found_lines):
            found_lines[x]['head'] = found_lines[x]['head'].rjust (head_length, ' ')
            found_lines[x]['full_line'] = line['head'] + line['body'] + line['tail']
            
    return dumps ({'filename': filename, 'word': word, 'result': found_lines})


@route('/compare/')
def compare():
    return template ('templates/compare', background = pick_ascii())

@route('/view/')
def view():
    return template('templates/view')
    
@route('/text/pair_list/')
def pairList ():
    return dumps (['Fit_for_purpose|The_man_pages', 'To_talk_of_many_things|Systemic_ambiguity', 'Kaleidoscope,_a_genesis|Smatch_(1)'])
    
@route('/text/list/')
def textlist ():
    """
    Generates a JSON array of the available text filenames.
    """
    files = []
    for filename in glob(os.path.join(CORPUS_ROOT, '*.txt')):
        name = os.path.basename(filename)
        (basename, extension) = os.path.splitext(name)
        files.append(basename)

    return dumps (files)
        
@route('/text/:filename')
def text(filename):
    """
    Returns a JSON dictionary containing:

    - data: the content of the file "filename"
    - name: the name of the text, deduced from the file name
    """
    filename = os.path.join(CORPUS_ROOT, "%s.txt" % filename)
    name = os.path.basename(filename)
    (basename, extension) = os.path.splitext(name)
    file_dict = {'name': basename, 'data': None}
    
    with open(filename, 'r') as f:
        file_dict['data'] = f.read()
    
    return dumps(file_dict)

@route('/js')
def js():
    return static_file ('jquery.min.js', 'js/')
    
@route('/collocations/:text/')
def collocations(text):
    text = '%s.txt' % text
    corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
    n_text = nltk.text.Text(corpus.words(text))

    bigram_measures = nltk.collocations.BigramAssocMeasures()

    # Finds bigrams
    from nltk.collocations import BigramCollocationFinder
    
    finder = BigramCollocationFinder.from_words(n_text)
    ignored_words = nltk.corpus.stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigrams = finder.nbest(bigram_measures.likelihood_ratio, 20)

    #return template('templates/split', text=source, word_list=foo)
    return dumps ({'name': text, 'bigrams': bigrams})

@route('/similar/:text/:word/')
def similar (text, word):
    if re.match ("^[a-zA-Z0-9_\(\),\.]+$", text) and re.match ("^[a-zA-Z0-9_]+$", word):
        text = '%s.txt' % text
        
        f = open(os.path.join(CORPUS_ROOT, text), 'r')
        source = f.read()
        f.close()
        
        corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
        n_text = nltk.text.Text(corpus.words(text))
        context_index = nltk.text.ContextIndex(n_text.tokens, filter=lambda x:x.isalpha(), key=lambda s:s.lower())
        word = word.lower()
        wci = context_index._word_to_contexts
        result = []
        
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = nltk.probability.FreqDist(w for w in wci.conditions() for c in wci[w] if c in contexts and not w == word)
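            # NOTE: with the NLTK 2.x FreqDist, keys() is ordered by decreasing
            # frequency, so this keeps the 20 most frequent shared-context words.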
            words = nltk.util.tokenwrap(fd.keys()[:20])
            
            for middle_word in words.split():
                for context in contexts:
                    if re.search ("/" + context[0] + "(\W|\s)+" + middle_word + "(\W|\s)+" + context[1] + "/i", source) != 'none':
                        print (context[0], middle_word, context[1])
                        result.append ({'word': word, 'context_left': context[0], 'context_right': context[1]})
            
        return dumps ({'name': text, 'word': word, 'result': result})    
    
@route('/word_list/:text')
def word_list(text):
    """Returns an alphabetical list of words for the given text."""
    corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
    n_text = nltk.text.Text(corpus.words(text))

    f = open(os.path.join(CORPUS_ROOT, text), 'r')
    source = f.read()
    f.close()

    # TODO: use NLTK built-in functions for this!
    word_list = sorted(set(w.lower() for w in n_text if w.isalpha()))

    # How many distinct words are used? (alphabetic tokens only, stop words included)
    token_count = len(word_list)

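    # Average number of occurrences per distinct token (total tokens / unique tokens)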
    token_usage_average = len(n_text) / len(set(w.lower() for w in n_text))

    # How many times does the word "the" appear?
    the_word_count = n_text.count("the")
    print(the_word_count)
    
    # What percentage of the text does the word "the" cover?
    the_word_coverage = 100 * n_text.count('the') / len(n_text)
    print(the_word_coverage)

    # What are the 50 most used words?
    frequency_distribution = nltk.probability.FreqDist(n_text)
    print(frequency_distribution.keys()[:50])

    # Prints every single word and how many times it appears
    for (word, count) in frequency_distribution.iteritems():
        print(word, count)

    # How many times the word "digital" appears?
    print(frequency_distribution['digital'])

    # Plots the frequency of the 50 most used words
    #frequency_distribution.plot(50, cumulative=True)

    # Find words with more than 10 letters
    long_words = [w for w in n_text if len(w) > 10]
    print(long_words)

    # Find words with more than 10 letters appearing more than 3 times
    common_long_words = sorted([w for w in n_text if len(w) > 10 and frequency_distribution[w] > 3])
    print(common_long_words)

    # What are the bigrams (adjacent word pairs, e.g. "red wine")?
    # NOTE: not very useful yet as we want to find "rare words"
    text_bigrams = nltk.util.bigrams(n_text)
    print(text_bigrams)

    # Distribution of word length in the text
    fdist = nltk.probability.FreqDist([len(w) for w in n_text])
    print(fdist.keys())

    # What is the length of the longest word in the text?
    print(max(fdist))

    # What proportion of the tokens are 3-letter words?
    print(fdist.freq(3))

    print(fdist.tabulate())

    # What are the unusual words?
    def unusual_words(text):
        text_vocab = set(w.lower() for w in text if w.isalpha())
        english_vocab = set(w.lower() for w in nltk.corpus.words.words())
        unusual = text_vocab.difference(english_vocab)
        return sorted(unusual)
    print(unusual_words(n_text))

    # non-stop words
    def non_stop_words(text):
        text_vocab = set(w.lower() for w in text if w.isalpha())
        english_vocab = set(w.lower() for w in nltk.corpus.stopwords.words('english'))
        non_stop = text_vocab.difference(english_vocab)
        return sorted(non_stop)
    print(non_stop_words(word_list))

    # Prints all the words ending in "ed"
    ed_words = [w for w in non_stop_words(word_list) if re.search('ed$', w)]
    print(ed_words)

    # Prints all the words ending in "ing"
    ing_words = [w for w in non_stop_words(word_list) if re.search('ing$', w)]
    print(ing_words)

    # Prints all the words ending in "ed" or "ing"
    ed_or_ing_words = [w for w in non_stop_words(word_list) if re.search('(ing|ed)$', w)]
    print(ed_or_ing_words)

    # Prints all the words containing digits (empty as written, since non_stop_words keeps alphabetic words only)
    numbers = [w for w in non_stop_words(word_list) if re.search('[0-9]+', w)]
    print(numbers)


    # Removes suffixes to find unique lemmas
    # eg. laptop(s)
    # FIXME: returns non-existing words
    # TODO: use NLTK built-in function instead
    def stem(word):
        regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
        stem, suffix = re.findall(regexp, word)[0]
        return stem
    stems = [stem(t) for t in non_stop_words(word_list)]
    print(stems)

    # Tokenizes the text
    # NOTE: what's the difference between this method and the way n_text is produced?
    f = open(os.path.join(CORPUS_ROOT, text), 'r')
    tokens = nltk.word_tokenize(f.read())
    f.close()
    print(tokens)


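    # NOTE: this concordance experiment follows the IndexedText example from the
    # NLTK book; it assumes the 'webtext' corpus is available (nltk.download('webtext')).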
    porter = nltk.PorterStemmer()
    grail = nltk.corpus.webtext.words('grail.txt')
    text = IndexedText(porter, grail)
    text.concordance('lie')


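    # NOTE: lemmatization assumes the WordNet corpus is available (nltk.download('wordnet')).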
    wnl = nltk.WordNetLemmatizer()
    print([wnl.lemmatize(t) for t in tokens])


    
    #word_list = "<br />".join(non_stop_words(word_list))
    foo = ""
    for word in non_stop_words(word_list):
        foo += "<a href='#'>"
        foo += word
        foo += "</a><br />"

    return template('templates/split', text=source, word_list=foo)


class IndexedText(object):
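    """Indexes a text by word stem so that concordance lines can be looked up
    for any inflected form of a word (adapted from the NLTK book)."""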
    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width=40):
        key = self._stem(word)
        wc = int(width / 4)                # words of context
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '%*s'  % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print ldisplay, rdisplay

    def _stem(self, word):
        return self._stemmer.stem(word).lower()

@route('/concordance/:text')
def concordance(text):
    """Returns an alphabetical list of words for the given text."""
    corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
    n_text = nltk.text.Text(corpus.words(text))
    interesting = [
            'data',
            'grey',
            'literature',
            'relation',
            'user',
            'information',
            'error',
            'value',
            'other',
            ]
    # TODO: use NLTK built-in functions for this!
    word_list = sorted(set(w.lower() for w in corpus.words()))
    #word_list = "<br />".join(word_list)
    return template('templates/split', word_list=word_list, text=text)


@route('/static/<filename:path>')
def send_static(filename):
    # NOTE: route filters require bottle >= 0.10.
    return static_file(filename, root=STATIC_DIR)


if __name__ == '__main__':
    bottle.debug(True)
    run(host='localhost', port=8080)