species-of-things

Inside this repository

objects.v2.py

# -*- coding: utf-8 -*-

from os import listdir
from os.path import isfile, join 
import codecs 
import re 
import urllib2
import argparse
from sys import stdout

parser = argparse.ArgumentParser()
parser.add_argument('searchwords', type=str, nargs='+', help="Words to search for")
parser.add_argument('-e', '--exclude', dest='exclude', type=str, nargs='*', help="Words to exclude from matches")
parser.add_argument('-o', '--outname', dest='outname', type=str, help="Base name for the output HTML file (defaults to the first search word)")
args = parser.parse_args()
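
# Example invocation (hypothetical search terms; the result is written to
# ../../stories/object-<outname>.html, see below):
#   python objects.v2.py chair table -e armchair -o furniture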

class Node (object):
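  # One node per line of an indented outline; children hold the more deeply
  # indented lines grouped beneath it (one indent step = 8 spaces).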
  def __init__ (self, data = '', parent=None, level=0):
    self.parent = parent
    self.data = data
    self.children = []
    self.level = level
    self.indentsize = 8
    self.indentchar = ' '
    # whole-word pattern: {0} must be bounded by non-word characters or string edges
    self.wordpatt = r"((?<=\W)|(?<=^)){0}((?=\W)|(?=$))"

  def lastChild (self):
    if self.children:
      return self.children[-1]
    return None 

  def hasChildren (self):
    return bool(self.children)

  def include (self, data, level):
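    # walk down the last-child chain until the level gap closes, then
    # attach the new line one level below this node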
    # print data, level, self.level
    if ((level - self.level > 1) and self.children):
      self.lastChild().include(data, level)
    else:
      self.children.append(Node(data=data, parent=self, level=self.level+1))

  def valid (self, search, exclude):
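    # a line matches when it contains at least one search word as a whole
    # word and none of the excluded words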
    if any(re.search(self.wordpatt.format(word), self.data) is not None for word in search) \
      and not any(re.search(self.wordpatt.format(word), self.data) is not None for word in exclude):
        return True

    return False

  def test (self, search, exclude):
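    # True when this node or any of its descendants matches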
    if self.valid(search, exclude):
      return True
    elif any(child.test(search, exclude) for child in self.children):
      return True

    return False

  def clean (self, search, exclude):
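    # prune the subtree: if this node matches, keep everything; otherwise
    # keep only children that match or that still contain matches after
    # their own pruning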
    if not self.valid(search, exclude):
      filtered = [] 
      for child in self.children:
        if child.valid(search, exclude):
          filtered.append(child)
        else:
          child.clean(search, exclude)
          if child.hasChildren():
            filtered.append(child)

      self.children = filtered

  def text (self):
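    # plain-text rendering: rebuild the indentation from the node levels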
    txt = u'{0}{1}\n'.format(self.level * self.indentsize * self.indentchar, self.data)

    for child in self.children:
      txt += child.text()

    return txt

  def html (self, search):
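    # HTML rendering: one div per line, with search words wrapped in
    # highlight spans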
    innerhtml = self.data
    for word in search:
      # print word, self.wordpatt.format(word), '<span class="highlight">{0}</span>'.format(word), innerhtml
      innerhtml = re.sub(self.wordpatt.format(word), u'<span class="highlight">{0}</span>'.format(word), innerhtml)

    html = u'<div class="row level_{0}">{1}</div>\n'.format(self.level, innerhtml)
    for child in self.children:
      html += child.html(search)

    return html 
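
# A matched row renders roughly as (illustrative):
#   <div class="row level_2">... <span class="highlight">chair</span> ...</div>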

pads = [
  'http://species-of-things/ether/p/amazon/export/txt',
  'http://species-of-things/ether/p/polyhistorm/export/txt',
  'http://species-of-things/ether/p/etymologiae/export/txt',
  'http://species-of-things/ether/p/cyclopaedia/export/txt',
  'http://species-of-things/ether/p/encyclopedia/export/txt',
  'http://species-of-things/ether/p/great-inventions/export/txt',
  'http://species-of-things/ether/p/taric/export/txt'
]
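
# each pad is fetched as plain text via its export/txt endpoint
# (presumably an Etherpad instance served under /ether/)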

# pads = [
  # '../classification-systems/amazon.txt',
  # '../classification-systems/cyclopaedia.txt',
  # '../classification-systems/encyclopedia.txt',
  # '../classification-systems/etymologiae.txt',
  # '../classification-systems/greatinventions.txt',
  # '../classification-systems/polyhistor.txt',
  # '../classification-systems/taric.txt'
# ]

# pads may be given as remote URLs or as local file paths
is_url_match = r'^https?://'
padnames = []
files = []

exclude = args.exclude if args.exclude else []
searchwords = args.searchwords
outname = args.outname if args.outname else searchwords[0]

trees = []

stdout.write('Opening pads:\n')
for path in pads:
  stdout.write('  - {0}\n'.format(path))
  tree = None

  if re.match(is_url_match, path):
    # remote pad: fetch the plain-text export and decode it as UTF-8
    handle = urllib2.urlopen(path)
    #print handle.info()
    namematch = re.search(r'(\w+)/export/txt', path)
    padname = namematch.group(1)
    text = [unicode(line, 'utf-8') for line in handle.readlines()]

  else:
    # local file: the pad name is the file name without its extension
    handle = codecs.open(path, 'r', 'utf-8')
    namematch = re.search(r'(\w+)\.\w+$', path)
    padname = namematch.group(1)
    text = handle.readlines()

  padnames.append(padname)

  for line in text:
    # normalise tabs to 8 spaces so indentation depth can be measured uniformly
    line = re.sub('\t', '        ', line)
    match = re.search(r'^(\s*)(.*?)$', line)
    head = match.group(1)
    # one tree level per 8 spaces of leading indentation
    level = len(head) // 8 if head else 0
    data = match.group(2)

    if not tree:
      tree = Node(data=data, level=level)
    else:
      tree.include(data, level)

  trees.append(tree)
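
# At this point each pad is one Node tree whose levels mirror the pad's
# indentation (e.g. a line indented by two tabs / 16 spaces sits two
# levels below the top line).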

stdout.write('Filtering\n')
with codecs.open('../../stories/object-{0}.html'.format(outname), 'w+', encoding='utf-8') as output:
  for i, tree in enumerate(trees):
    stdout.write('  - {0}\n'.format(padnames[i]))
    tree.clean(searchwords, exclude)
    output.write('<div class="source">')
    output.write('<div class="source-title">{0}</div>'.format(padnames[i]))
    output.write(tree.html(searchwords))
    output.write('</div>')
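
# The output file contains one block per pad, roughly:
#   <div class="source"><div class="source-title">padname</div> ...rows... </div>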

# stdout.write('Parsing:\n')
#   for tree in trees:
#     tree.clean(words, exclude)
#     print tree.text()

# 	#output.write('<small class="info">highlighted words: '+strwordsnice+'</small><br>\n')
# 	#output.write('<small class="info">excluded words: '+strexclude+'</small>\n\n')
# 	output.write(u'<div id="main">\n')

# 	for i, lines in enumerate(files):
# 		stdout.write('  - {0}\n'.format(padnames[i]))
# 		output.write(u'\t<div class="source">\n\t\t<div class="source-title"><small>{0}</small></div>\n'.format(padnames[i]))
# 		# output.write(padnames[i])
# 		# output.write('</small><br>')
# 		#lines_lower = [re.sub(u'\s{8}', u'\t', line.lower()) for line in lines]
# 		lines = [re.sub(u'\s{8}', u'\t', line) for line in lines]

# 		for l, line in enumerate(lines):
# 			line = re.sub('\n$', '', line)

# 			# highlight search word
# 			if any(re.search(wordpatt.format(word), line) <> None for word in searchwords):
# 				if not any(re.search(wordpatt.format(word), line) <> None for word in exclude):
# 					level = get_level(line)
# 					for word in searchwords:
# 						line = re.sub(wordpatt.format(word), u'<span cwlass="highlight high{0}">{1}</span>'.format(level, word), line, re.I)
# 					parents = get_parents(l, lines, level, [])
# 					if not parents:
# 						parents = []
# 					children = get_children(l, lines, level)
# 					if not children:
# 						children = []
# 					# # print get_level(line), line, parents, children
# 					# line = line.encode("utf-8")
# 					output.write(u'\t\t<div class="match">\n')
# 					output.write(u'\t\t\t<div class="parents">{0}</div>'.format(u''.join([u'<div class="parent level_{0}">{1}</div>'.format(level, parent) for level, parent in parents[::-1]])))
# 					output.write(u'\t\t\t<div class="line level_{0}"><small class="linenumber">{1}</small>{2}</div>'.format(level, l, line))
# 					output.write(u'\t\t\t<div class="children">{0}</div>'.format(u''.join([u'<div class="child level_{0}">{1}</div>'.format(level, child) for level, child in children])))
# 					output.write(u'\t\t</div>\n')
# 		output.write(u'</div>')
# 	output.write(u'\t</div>\n</div>\n')

# print '*output written*'