alchorisma
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

bigrams.py
text/x-python

Download raw (2.4 KB)

from pattern.en import ngrams, parsetree
import os.path
import glob

# paths = [
#   "../content/essays/feedback-loops/Feedback-loops-temporal-shifts.md",
#   "../content/essays/turning-into-trees/Tu(r)ning-into-Trees.md",
#   "../content/essays/we-are-all-earth/We-are-all-earth.md",

# ]

# Every markdown file below ../content (relative to the working directory).
# Sorted so the files are processed in the same order on every run:
# glob.glob() makes no ordering guarantee of its own.
paths = sorted(glob.glob('../content/**/*.md', recursive=True))

# https://stackoverflow.com/a/1751478
def chunks(l, n):
  """Yield every run of `n` consecutive items of `l` (a sliding window).

  Windows overlap: each one starts one position after the previous one, so
  a sequence of length L yields L - n + 1 windows, each exactly `n` long.

  Args:
    l: A sequence supporting len() and slicing.
    n: Window size. Values below 1 are treated as 1.

  Yields:
    l[i:i+n] for every start index i at which a full window fits. Nothing
    is yielded when `l` is shorter than `n`.
  """
  n = max(1, n)

  # The last full window starts at index len(l) - n, hence the exclusive
  # bound of len(l) - n + 1. A smaller bound silently drops the trailing
  # windows; a negative bound simply produces an empty range.
  for i in range(len(l) - n + 1):
    yield l[i:i+n]

# Load the comma-separated stopword list. It is kept as a set because it is
# only used for membership tests; blank entries (e.g. from a trailing comma)
# are dropped so the empty string never counts as a stopword.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) - confirm.
with open('stopwords-pattern.txt', 'r', encoding='utf-8') as h:
  stopwords = {word.strip() for word in h.read().split(',') if word.strip()}

# Raw text of every processed file, keyed by its base filename.
sources = {}

# bigram as key
# {(lemmaA:str, lemmaB:str): [(filename:str, a:Word, b:Word),]}
bigrams = {}

# Words carrying one of these tags, or having one of these lemmas (quotes,
# caret, single digits), are treated as interpunction and removed from a
# sentence before bigrams are formed. Built once instead of per word.
IGNORED_TAGS = frozenset(['O', ',', 'SYM', '(', ')', ':', '', '#', '"'])
IGNORED_LEMMAS = frozenset(['^', '’', '‘', '\'', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])

for p in paths:
  # NOTE(review): only the base name is used as key, so two files with the
  # same name in different directories would be treated as a single text.
  filename = os.path.basename(p)

  # The file is only needed for reading; parsing happens after it is closed.
  # NOTE(review): assumes the markdown sources are UTF-8 encoded - confirm.
  with open(p, 'r', encoding='utf-8') as h:
    text = h.read()

  sources[filename] = text

  parsed = parsetree(text, tokenize=True, tags=True, chunks=False, relations=False, lemmata=True)

  for sentence in parsed.sentences:
    # Transform into standard list and filter out interpunction
    word_list = [
      w for w in sentence.words
      if w.tag not in IGNORED_TAGS and w.lemma not in IGNORED_LEMMAS
    ]

    for a, b in chunks(word_list, 2):
      # Skip pairs in which both words are stopwords; a pair containing
      # only one stopword is kept.
      # NOTE(review): the intent may have been to also drop pairs whose
      # second word alone is a stopword - confirm before changing.
      if a.lemma in stopwords and b.lemma in stopwords:
        continue

      # Group every occurrence under its lemma pair.
      bigrams.setdefault((a.lemma, b.lemma), []).append((filename, a, b))

    # for bigram in ngrams(text, n=2):
    #   if bigram[0] in stopwords and bigram[1] in stopwords:
    #     # Skip if both bigrams are stopwords
    #     continue

    #   if not bigram in bigrams:
    #     bigrams[bigram] = []

    #   bigrams[bigram].append(filename)

# Loop through all bigrams and keep only the ones that occur in more
# than one text; each kept bigram is printed with its occurrences.
# {(lemmaA:str, lemmaB:str): {filename:str, ...}}
filtered_bigrams = {}
for bigram, entries in bigrams.items():
  # Distinct files in which this lemma pair was found.
  texts_set = {entry[0] for entry in entries}

  if len(texts_set) > 1:
    # The surface forms as they were written in the texts.
    occurrences = ['{} {}'.format(a.string, b.string) for _, a, b in entries]
    filtered_bigrams[bigram] = texts_set
    print('')
    print(' -- '.join(bigram))
    print(', '.join(occurrences))
    # print('{} -- {}'.format(entries[0][1].tag, entries[0][2].tag))
    # Sorted because the iteration order of a set of strings can differ
    # between runs (hash randomisation), which would make the output unstable.
    print(', '.join(sorted(texts_set)))

# print(filtered_bigrams)