clone your own copy | download snapshot

Snapshots | iceberg

No images in this repository’s iceberg at this time

Inside this repository


Download raw (4.8 KB)

from os import listdir
from os.path import isfile, join
import codecs
import re
import urllib
import argparse
from sys import stdout

# Command line: one or more words to look for, optional words that
# disqualify a line, and an optional name for the generated HTML file.
parser = argparse.ArgumentParser(
	description="Search the pads for words and write the matching lines, "
	"with their parent and child lines, to an HTML file.")
parser.add_argument('searchwords', type=str, nargs='+', help="Words to search for and highlight")
# default=[] so that omitting -e yields an empty list instead of None;
# the rest of the script joins and iterates over this value.
parser.add_argument('-e', '--exclude', dest='exclude', type=str, nargs='*', default=[], help="Words that exclude a line from the results")
parser.add_argument('-o', '--outname', dest='outname', type=str, help="Output name")
args = parser.parse_args()

# Sources to search. Each entry is either an http(s) URL ending in
# "<name>/export/txt" or the path of a local text file.
# NOTE(review): the entries of this list are missing from this copy of the
# file (the literal was left unclosed); restore them before running.
pads = [
]

# Example using local files:
# pads = [
# 	'../classification-systems/amazon.txt',
# 	'../classification-systems/cyclopaedia.txt',
# 	'../classification-systems/encyclopedia.txt',
# 	'../classification-systems/etymologiae.txt',
# 	'../classification-systems/greatinventions.txt',
# 	'../classification-systems/polyhistor.txt',
# 	'../classification-systems/taric.txt'
# ]

# Return parents
def get_parents(start, lines, level, parents=None):
	"""Collect the ancestors of the line at index *start*.

	Walks backwards from *start*. Every time a line with a smaller level
	than the current one is met, it is recorded as the next ancestor and
	the search continues for that ancestor's own parent, until a level-0
	line or the top of *lines* is reached.

	start   -- index in *lines* to start searching backwards from
	lines   -- list of tab-indented lines
	level   -- level (as given by get_level) of the line at *start*
	parents -- optional list to append to; a new list is created when omitted

	Returns a list of (level, line) tuples, nearest parent first. The list
	is empty when there is no ancestor.
	"""
	# A literal [] default would be shared between calls.
	if parents is None:
		parents = []

	for index in range(start, -1, -1):
		line_level = get_level(lines[index])
		if line_level < level:
			parents.append((line_level, lines[index]))
			if line_level == 0:
				# Reached a root line: nothing can be above it.
				break
			# Keep climbing: now look for the parent of this ancestor.
			level = line_level

	return parents

# Return all subtrees
def get_children(start, lines, level):
	"""Collect the descendants of the line at index *start*.

	Descendants are the consecutive lines directly after *start* whose
	level is greater than *level*; the first line at the same or a smaller
	level ends the subtree.

	start -- index in *lines* of the line whose children are wanted
	lines -- list of tab-indented lines
	level -- level (as given by get_level) of the line at *start*

	Returns a list of (level, line) tuples in document order. The list is
	empty when the line has no children.
	"""
	children = []
	for index in range(start + 1, len(lines)):
		line_level = get_level(lines[index])
		if line_level <= level:
			# Back at the level of the start line (or above): subtree ended.
			break
		children.append((line_level, lines[index]))
	return children

def get_level(line):
	"""Return the nesting level of *line*: its number of leading tabs."""
	return len(line) - len(line.lstrip('\t'))

# Python 3 exposes urlopen on urllib.request, Python 2 on urllib itself.
try:
	from urllib.request import urlopen
except ImportError:
	from urllib import urlopen

is_url_match = r'^http(?:s)?:\/\/'
# padnames[i] is the display name of the pad whose lines are in files[i].
padnames = []
files = []

# --exclude is optional; treat "not given" as "exclude nothing".
exclude = args.exclude or []
searchwords = args.searchwords
outname = args.outname if args.outname else searchwords[0]

stdout.write('Opening pads:\n')
for path in pads:
	stdout.write('  - {0}\n'.format(path))
	is_remote = bool(re.match(is_url_match, path))
	if is_remote:
		handle = urlopen(path)
		namematch = re.search(r'(\w+)/export/txt', path)
	else:
		handle = codecs.open(path, 'r', 'utf-8')
		namematch = re.search(r'(\w+)\.\w+$', path)

	# Fall back to the full path when it does not follow the expected
	# naming scheme, instead of failing on a missing match.
	padname = namematch.group(1) if namematch else path

	try:
		text = handle.readlines()
	finally:
		handle.close()

	if is_remote:
		# urlopen yields undecoded bytes while codecs.open yields text.
		# NOTE(review): assumes remote pads are UTF-8, like the local ones.
		text = [raw.decode('utf-8') for raw in text]

	files.append(text)
	padnames.append(padname)

def _escape_html(text):
	"""Escape the characters that are significant in HTML text content."""
	return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def _highlight(line, pattern, level):
	"""Return *line* as HTML with every match of *pattern* wrapped in a span.

	*pattern* must contain exactly one capturing group around the whole
	alternation, so that re.split returns the matched words at the odd
	indices and the text in between at the even ones.
	"""
	pieces = []
	for index, piece in enumerate(pattern.split(line)):
		piece = _escape_html(piece)
		if index % 2:
			piece = u'<span class="highlight high{0}">{1}</span>'.format(level, piece)
		pieces.append(piece)
	return u''.join(pieces)


strwords = '-'.join(searchwords)
strwordsnice = ' '.join(searchwords)
strexclude = ' '.join(exclude or [])

# The pad lines are lower-cased before matching, so the words have to be
# lower-cased too. Empty words are dropped because they match every line.
needles = [word.lower() for word in searchwords if word]
blocked = [word.lower() for word in (exclude or []) if word]
# One pattern for all words (longest first), applied in a single pass, so a
# word can never match inside the markup inserted for another word.
highlight_pattern = re.compile(
	'(' + '|'.join(re.escape(word) for word in sorted(needles, key=len, reverse=True)) + ')')

with codecs.open('../stories/object_{0}.html'.format(outname),'w', 'utf-8') as output:
	output.write('<small class="info">highlighted words: '+_escape_html(strwordsnice)+'</small><br>\n')
	output.write('<small class="info">excluded words: '+_escape_html(strexclude)+'</small>\n\n')
	output.write('<div id="main">\n')

	for i, lines in enumerate(files):
		stdout.write('  - {0}\n'.format(padnames[i]))
		output.write('\t<div class="source">\n\t\t<div class="source-title"><small>{0}</small></div>\n'.format(_escape_html(padnames[i])))
		# Turn runs of eight whitespace characters into one tab so that
		# get_level can count the indentation.
		# NOTE(review): \s also matches tabs, so eight consecutive tabs
		# collapse into one as well -- confirm whether ' {8}' was meant.
		lines = [re.sub(r'\s{8}', '\t', line.lower()) for line in lines]

		for l, line in enumerate(lines):
			line = line.rstrip('\n')

			if not any(word in line for word in needles):
				continue
			if any(word in line for word in blocked):
				continue

			level = get_level(line)
			parents = get_parents(l, lines, level, []) or []
			children = get_children(l, lines, level) or []

			# The comprehension variables are deliberately not called
			# "level": in Python 2 they leak into this scope and would
			# overwrite the level of the matched line.
			parents_html = u''.join(
				u'<div class="parent level_{0}">{1}</div>'.format(parent_level, _escape_html(parent.strip('\t\n')))
				for parent_level, parent in parents[::-1])
			children_html = u''.join(
				u'<div class="child level_{0}">{1}</div>'.format(child_level, _escape_html(child.strip('\t\n')))
				for child_level, child in children)

			output.write(u'\t\t<div class="match t{0}">\n'.format(level))
			output.write(u'\t\t\t<div class="parents">{0}</div>'.format(parents_html))
			output.write(u'\t\t\t<div class="line t{0}"><small class="linenumber">{1}</small>{2}</div>'.format(level, l, _highlight(line, highlight_pattern, level)))
			output.write(u'\t\t\t<div class="children">{0}</div>'.format(children_html))
			output.write(u'\t\t</div>\n')

		output.write('\t</div>\n')

	output.write('</div>\n')

print('*output written*')