No images in this repository’s iceberg at this time
Download raw (7.0 KB)
#-*- coding: utf-8 -*-
"""Filter indented classification outlines by search words and export HTML.

Each "pad" (a URL or a local text file) holds an outline in which every
INDENT_SIZE columns of leading whitespace mark one nesting level.  Every pad
is parsed into a tree of Node objects, pruned down to the branches that
mention one of the search words, and written to a single HTML file with the
search words highlighted.
"""
from os import listdir
from os.path import isfile, join
import codecs
import re
import argparse
from contextlib import closing
from sys import stdout
from xml.sax.saxutils import escape

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

# Number of leading whitespace columns that make up one outline level.
INDENT_SIZE = 8


class Node (object):
    """One line of an outline together with the lines nested below it."""

    def __init__ (self, data = '', parent=None, level=0):
        """Create a node holding the text `data` at depth `level`."""
        self.parent = parent
        self.data = data
        self.children = []
        self.level = level
        self.indentsize = INDENT_SIZE
        self.indentchar = ' '
        # Whole-word template: the word must be delimited by a non-word
        # character or by the start/end of the text.
        self.wordpatt = r"((?<=\W)|(?<=^)){0}((?=\W)|(?=$))"

    def lastChild (self):
        """Return the most recently added child, or None without children."""
        if self.children:
            return self.children[-1]
        return None

    def hasChildren (self):
        """Return True when the node has at least one child."""
        return bool(self.children)

    def include (self, data, level):
        """Insert a line of text at depth `level` below this node.

        Lines more than one level deeper than this node are handed to the
        last child; everything else becomes a direct child of this node.
        """
        if level - self.level > 1 and self.children:
            self.lastChild().include(data, level)
        else:
            self.children.append(
                Node(data=data, parent=self, level=self.level + 1))

    def _matches (self, word):
        """Return True when `word` occurs as a whole word in the node text."""
        return re.search(self.wordpatt.format(word), self.data) is not None

    def valid (self, search, exclude):
        """Return True when the text has a search word and no excluded word."""
        return (any(self._matches(word) for word in search)
                and not any(self._matches(word) for word in exclude))

    def test (self, search, exclude):
        """Return True when this node or any descendant is valid."""
        if self.valid(search, exclude):
            return True
        return any(child.test(search, exclude) for child in self.children)

    def clean (self, search, exclude):
        """Prune, in place, the branches that lead to no valid node.

        A valid node keeps its whole subtree.  Below an invalid node, valid
        children are kept as they are and invalid children are kept only if
        something remains below them after they were cleaned themselves.
        """
        if self.valid(search, exclude):
            return
        kept = []
        for child in self.children:
            if child.valid(search, exclude):
                kept.append(child)
                continue
            child.clean(search, exclude)
            if child.hasChildren():
                kept.append(child)
        self.children = kept

    def text (self):
        """Return the subtree as indented plain text, one node per line."""
        indent = self.level * self.indentsize * self.indentchar
        parts = [u'{0}{1}\n'.format(indent, self.data)]
        parts.extend(child.text() for child in self.children)
        return u''.join(parts)

    def _highlight (self, search):
        """Return the node text HTML-escaped with search words wrapped.

        All words are matched in one pass over the raw text, so a search
        word can never match inside markup inserted for another word.
        """
        if not search:
            return escape(self.data)
        alternatives = '|'.join('(?:{0})'.format(word) for word in search)
        pattern = re.compile(
            self.wordpatt.format('(?:{0})'.format(alternatives)))
        pieces = []
        cursor = 0
        for match in pattern.finditer(self.data):
            if match.start() == match.end():
                # An empty match has nothing to highlight.
                continue
            pieces.append(escape(self.data[cursor:match.start()]))
            pieces.append(u'<span class="highlight">{0}</span>'.format(
                escape(match.group(0))))
            cursor = match.end()
        pieces.append(escape(self.data[cursor:]))
        return u''.join(pieces)

    def html (self, search):
        """Return the subtree as HTML rows with the search words highlighted."""
        parts = [u'<div class="row level_{0}">{1}</div>\n'.format(
            self.level, self._highlight(search))]
        parts.extend(child.html(search) for child in self.children)
        return u''.join(parts)


pads = [
    'http://species-of-things/ether/p/amazon/export/txt',
    'http://species-of-things/ether/p/polyhistorm/export/txt',
    'http://species-of-things/ether/p/etymologiae/export/txt',
    'http://species-of-things/ether/p/cyclopaedia/export/txt',
    'http://species-of-things/ether/p/encyclopedia/export/txt',
    'http://species-of-things/ether/p/great-inventions/export/txt',
    'http://species-of-things/ether/p/taric/export/txt'
]

# Local files are accepted as well:
# pads = [
#     '../classification-systems/amazon.txt',
#     '../classification-systems/cyclopaedia.txt',
#     '../classification-systems/encyclopedia.txt',
#     '../classification-systems/etymologiae.txt',
#     '../classification-systems/greatinventions.txt',
#     '../classification-systems/polyhistor.txt',
#     '../classification-systems/taric.txt'
# ]

is_url_match = r'^http(?:s)?:\/\/'


def parse_args (argv=None):
    """Parse the command line (`argv`, or sys.argv[1:] when None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('searchwords', type=str, nargs='+',
                        help="Words to search for and highlight")
    parser.add_argument('-e', '--exclude', dest='exclude', type=str,
                        nargs='*',
                        help="Words that disqualify a line from matching")
    parser.add_argument('-o', '--outname', dest='outname', type=str,
                        help="Output name")
    return parser.parse_args(argv)


def pad_name (path):
    """Derive a short display name from a pad URL or file path.

    Falls back to the path itself when it has no recognisable name part.
    """
    if re.match(is_url_match, path):
        namematch = re.search(r'(\w+)/export/txt', path)
    else:
        namematch = re.search(r'(\w+)\.\w+$', path)
    return namematch.group(1) if namematch else path


def read_pad (path):
    """Return the lines of a pad as text, closing the handle afterwards."""
    if re.match(is_url_match, path):
        with closing(urlopen(path)) as handle:
            return [line.decode('utf-8') for line in handle.readlines()]
    with codecs.open(path, 'r', 'utf-8') as handle:
        return handle.readlines()


def build_tree (lines):
    """Build a Node tree from outline lines; return None for no lines.

    The first line becomes the root.  A tab counts as INDENT_SIZE spaces so
    that tab-indented and space-indented outlines yield the same levels.
    """
    tree = None
    for line in lines:
        line = line.replace('\t', ' ' * INDENT_SIZE)
        match = re.search(r'^(\s*)(.*?)$', line)
        level = len(match.group(1)) // INDENT_SIZE
        data = match.group(2)
        if tree is None:
            tree = Node(data=data, level=level)
        else:
            tree.include(data, level)
    return tree


def main (argv=None):
    """Fetch every pad, filter it and write the combined HTML report."""
    args = parse_args(argv)
    exclude = args.exclude if args.exclude else []
    searchwords = args.searchwords
    outname = args.outname if args.outname else searchwords[0]

    padnames = []
    trees = []

    stdout.write('Opening pads:\n')
    for path in pads:
        stdout.write(' - {0}\n'.format(path))
        tree = build_tree(read_pad(path))
        if tree is None:
            # An empty pad has no root node to filter or render.
            stdout.write('   (empty pad, skipped)\n')
            continue
        padnames.append(pad_name(path))
        trees.append(tree)

    stdout.write('Filtering\n')
    outpath = '../../stories/object-{0}.html'.format(outname)
    with codecs.open(outpath, 'w+', encoding='utf-8') as output:
        for padname, tree in zip(padnames, trees):
            stdout.write(' - {0}\n'.format(padname))
            tree.clean(searchwords, exclude)
            output.write('<div class="source">')
            output.write(
                '<div class="source-title">{0}</div>'.format(padname))
            output.write(tree.html(searchwords))
            output.write('</div>')


if __name__ == '__main__':
    main()