# Download raw (2.4 KB)
"""Report lemma bigrams that occur in more than one markdown text.

Every ``*.md`` file below ``../content`` is parsed with ``pattern.en``.
Within each sentence, adjacent words (punctuation and digits removed) are
paired and keyed by their lemmas. Pairs made of two stopwords are ignored.
Finally, every bigram found in at least two different files is printed
together with its surface forms and the files it appears in.

Expects a comma-separated stopword list in ``stopwords-pattern.txt`` in the
current working directory.
"""
import glob
import os.path

# ``ngrams`` is not used by the lemma-based approach below; the import is kept
# from an earlier variant that built the bigrams from the raw text.
from pattern.en import ngrams, parsetree

# Earlier, explicit selection of essays:
# paths = [
#     "../content/essays/feedback-loops/Feedback-loops-temporal-shifts.md",
#     "../content/essays/turning-into-trees/Tu(r)ning-into-Trees.md",
#     "../content/essays/we-are-all-earth/We-are-all-earth.md",
# ]
paths = glob.glob('../content/**/*.md', recursive=True)

# Tags of tokens that are dropped before pairing (interpunction and symbols).
# '.' is included so that the last word of a sentence is not paired with the
# sentence-closing punctuation mark.
IGNORED_TAGS = frozenset(
    ['O', ',', 'SYM', '(', ')', ':', '', '#', '"', '.'])

# Lemmas that are dropped regardless of their tag: quote characters, the
# caret and single digits.
IGNORED_LEMMAS = frozenset(
    ['^', '’', '‘', '\'', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])


# See https://stackoverflow.com/a/1751478
def chunks(l, n):
    """Yield every run of ``n`` consecutive items of ``l`` (sliding window).

    ``n`` is clamped to at least 1. Nothing is yielded when ``l`` holds fewer
    than ``n`` items.
    """
    n = max(1, n)
    # A sequence of length L contains L - n + 1 windows of size n.
    for i in range(len(l) - n + 1):
        yield l[i:i + n]


with open('stopwords-pattern.txt', 'r', encoding='utf-8') as h:
    # A set, because membership is tested twice for every bigram.
    stopwords = set(map(str.strip, h.read().split(',')))

# {filename: str -> full text: str}
# NOTE(review): keyed by base name, so files with the same name in different
# directories overwrite each other here and count as one text below.
sources = {}

# Lemma bigram as key:
# {(lemmaA: str, lemmaB: str): [(filename: str, a: Word, b: Word), ...]}
bigrams = {}

for p in paths:
    filename = os.path.basename(p)
    with open(p, 'r', encoding='utf-8') as h:
        text = h.read()
    sources[filename] = text

    parsed = parsetree(text, tokenize=True, tags=True, chunks=False,
                       relations=False, lemmata=True)

    for sentence in parsed.sentences:
        # Plain list of the sentence's words without interpunction.
        word_list = [
            w for w in sentence.words
            if w.tag not in IGNORED_TAGS and w.lemma not in IGNORED_LEMMAS
        ]

        for a, b in chunks(word_list, 2):
            # Skip only when BOTH lemmas are stopwords.
            # NOTE(review): an earlier comment here also asked to drop the
            # pair whenever b alone is a stopword; the code never did that.
            # Confirm which rule is wanted.
            if a.lemma in stopwords and b.lemma in stopwords:
                continue

            bigrams.setdefault((a.lemma, b.lemma), []).append(
                (filename, a, b))

# Keep the bigrams that occur in more than one text and report them.
# {(lemmaA: str, lemmaB: str): {filename: str, ...}}
filtered_bigrams = {}

for bigram, entries in bigrams.items():
    texts_set = {entry[0] for entry in entries}

    if len(texts_set) > 1:
        occurrences = [
            '{} {}'.format(a.string, b.string) for _, a, b in entries
        ]
        filtered_bigrams[bigram] = texts_set

        print('')
        print(' -- '.join(bigram))
        print(', '.join(occurrences))
        # Sorted so that the report is identical from run to run.
        print(', '.join(sorted(texts_set)))