# No images in this repository's iceberg at this time
# Download raw (4.8 KB)
"""Search outline-style text pads for words and write the matches as HTML.

Every pad is read as a tab-indented outline.  Each line containing one of the
search words (and none of the excluded words) is written to
``../stories/object_<outname>.html`` together with its chain of parent lines
and its subtree of child lines.
"""
# listdir / isfile / join are not used below; the imports are kept untouched
# in case other tooling relies on them.
from os import listdir
from os.path import isfile, join
import codecs
import contextlib
import re
import urllib
import argparse
from sys import stdout

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

pads = [
    'http://10.10.161.238/ether/p/amazon/export/txt',
    'http://10.10.161.238/ether/p/polyhistorm/export/txt',
    'http://10.10.161.238/ether/p/etymologiae/export/txt',
    'http://10.10.161.238/ether/p/cyclopaedia/export/txt',
    'http://10.10.161.238/ether/p/encyclopedia/export/txt',
    'http://10.10.161.238/ether/p/great-inventions/export/txt',
    'http://10.10.161.238/ether/p/taric/export/txt'
]

# Local files are accepted as well (anything not starting with http(s)://),
# for example:
# pads = [
#     '../classification-systems/amazon.txt',
#     '../classification-systems/cyclopaedia.txt',
#     '../classification-systems/encyclopedia.txt',
#     '../classification-systems/etymologiae.txt',
#     '../classification-systems/greatinventions.txt',
#     '../classification-systems/polyhistor.txt',
#     '../classification-systems/taric.txt'
# ]

is_url_match = r'^http(?:s)?:\/\/'


def get_parents(start, lines, level, parents=None):
    """Return the ancestors of the line at index *start*, nearest first.

    Walks backwards from *start*; every time a line with a smaller level than
    the current one is met, it is recorded as ``(level, line)`` and becomes
    the new reference level.  The walk stops once a level-0 line has been
    recorded or the top of *lines* is reached.

    *parents*, when given, is extended in place and returned; otherwise a
    fresh list is created on each call.  An empty list is returned when there
    are no ancestors.
    """
    if parents is None:
        parents = []
    for index in range(start, -1, -1):
        if level <= 0:
            # Nothing can sit above a level-0 line.
            break
        line_level = get_level(lines[index])
        if line_level < level:
            parents.append((line_level, lines[index]))
            level = line_level
    # Always return the list, even when the top of the file is reached
    # before a level-0 line was found.
    return parents


def get_children(start, lines, level):
    """Return the subtree below the line at index *start*.

    Collects ``(level, line)`` for every following line that is nested deeper
    than *level*, stopping at the first line that is not (a line without
    leading tabs, such as a blank line, counts as level 0).  Reaching the end
    of *lines* also ends the subtree.
    """
    children = []
    for index in range(start + 1, len(lines)):
        line_level = get_level(lines[index])
        if line_level <= level:
            break
        children.append((line_level, lines[index]))
    return children


def get_level(line):
    """Return the nesting depth of *line*: its number of leading tabs."""
    return len(line) - len(line.lstrip('\t'))


def _to_text(value, encoding='utf-8'):
    """Decode byte strings (Python 2 command-line arguments) to text.

    NOTE(review): assumes arguments arrive UTF-8 encoded - confirm for the
    terminals this is run from.
    """
    if isinstance(value, bytes):
        return value.decode(encoding)
    return value


def _pad_name(path):
    """Derive a short display name for a pad from its URL or file path."""
    if re.match(is_url_match, path):
        pattern = r'([\w-]+)/export/txt'
    else:
        pattern = r'([\w-]+)\.\w+$'
    found = re.search(pattern, path)
    # Fall back to the full path instead of crashing on an unexpected shape.
    return found.group(1) if found else path


def _read_pad(path):
    """Return the lines of a pad (URL or local file) as decoded text."""
    if re.match(is_url_match, path):
        with contextlib.closing(urlopen(path)) as handle:
            return [raw.decode('utf-8') for raw in handle.readlines()]
    with codecs.open(path, 'r', 'utf-8') as handle:
        return handle.readlines()


def _highlight(line, word_pattern, level):
    """Wrap every search-word occurrence in *line* in a highlight span.

    All words are substituted in one pass so that a word can never match
    inside the markup inserted for another word.
    """
    open_tag = u'<span class="highlight high{0}">'.format(level)
    return word_pattern.sub(
        lambda found: open_tag + found.group(0) + u'</span>', line)


def _render_match(number, line, level, parents, children):
    """Return the HTML fragment for one matching line and its context."""
    parent_html = u''.join(
        u'<div class="parent level_{0}">{1}</div>'.format(
            parent_level, parent.strip('\t\n'))
        # parents are stored nearest-first; display them root-first
        for parent_level, parent in reversed(parents))
    child_html = u''.join(
        u'<div class="child level_{0}">{1}</div>'.format(
            child_level, child.strip('\t\n'))
        for child_level, child in children)
    return u''.join([
        u'\t\t<div class="match t{0}">\n'.format(level),
        u'\t\t\t<div class="parents">{0}</div>'.format(parent_html),
        u'\t\t\t<div class="line t{0}"><small class="linenumber">{1}</small>{2}</div>'.format(
            level, number, line),
        u'\t\t\t<div class="children">{0}</div>'.format(child_html),
        u'\t\t</div>\n',
    ])


def main():
    """Parse the command line, read all pads and write the HTML report."""
    parser = argparse.ArgumentParser()
    parser.add_argument('searchwords', type=str, nargs='+',
                        help="Word(s) to search for and highlight in the pads")
    parser.add_argument('-e', '--exclude', dest='exclude', type=str, nargs='*',
                        help="Skip lines that contain any of these words")
    parser.add_argument('-o', '--outname', dest='outname', type=str,
                        help="Output name")
    args = parser.parse_args()

    searchwords = [_to_text(word) for word in args.searchwords]
    # -e is optional: argparse leaves it as None when it is not given.
    exclude = [_to_text(word) for word in (args.exclude or [])]
    outname = args.outname if args.outname else args.searchwords[0]

    # Pad lines are lower-cased before matching, so the words must be too.
    # Empty words are dropped: they would match (or exclude) every line.
    needles = [word.lower() for word in searchwords if word]
    blocked = [word.lower() for word in exclude if word]
    if not needles:
        parser.error('search words must not be empty')
    # Longest alternative first so a longer word wins over its own prefix.
    word_pattern = re.compile(u'|'.join(
        re.escape(word) for word in sorted(needles, key=len, reverse=True)))
    indent_run = re.compile(r'\s{8}')

    padnames = []
    files = []
    stdout.write('Opening pads:\n')
    for path in pads:
        stdout.write(' - {0}\n'.format(path))
        files.append(_read_pad(path))
        padnames.append(_pad_name(path))

    strwordsnice = u' '.join(searchwords)
    strexclude = u' '.join(exclude)

    stdout.write('Parsing:\n')
    # NOTE(review): pad text is written into the HTML unescaped; escape it if
    # the pads can contain markup that must not be interpreted.
    with codecs.open('../stories/object_{0}.html'.format(outname), 'w', 'utf-8') as output:
        output.write(u'<small class="info">highlighted words: ' + strwordsnice + u'</small><br>\n')
        output.write(u'<small class="info">excluded words: ' + strexclude + u'</small>\n\n')
        output.write(u'<div id="main">\n')

        for padname, raw_lines in zip(padnames, files):
            stdout.write(' - {0}\n'.format(padname))
            output.write(
                u'\t<div class="source">\n\t\t<div class="source-title"><small>{0}</small></div>\n'.format(padname))

            # Normalise: lower-case, and turn each run of 8 whitespace
            # characters into one tab so indentation depth can be counted.
            lines = [indent_run.sub('\t', raw.lower()) for raw in raw_lines]

            for number, current in enumerate(lines):
                if not any(word in current for word in needles):
                    continue
                if any(word in current for word in blocked):
                    continue
                level = get_level(current)
                parents = get_parents(number, lines, level, [])
                children = get_children(number, lines, level)
                shown = _highlight(current.rstrip('\n'), word_pattern, level)
                output.write(_render_match(number, shown, level, parents, children))

            # closes <div class="source">
            output.write(u'</div>')

        # NOTE(review): this emits one more </div> than is opened here (only
        # <div id="main"> is still open) - presumably it closes a wrapper
        # supplied by the page that includes this fragment; confirm.
        output.write(u'\t</div>\n</div>\n')

    stdout.write('*output written*\n')


if __name__ == '__main__':
    main()