species-of-things
clone your own copy | download snapshot

Snapshots | iceberg

No images in this repository’s iceberg at this time

Inside this repository

objects.py
text/x-python

Download raw (4.8 KB)

from os import listdir
from os.path import isfile, join
import codecs
import re
import urllib
import argparse
from sys import stdout

# Command-line interface: one or more words to search for, optional words
# that suppress a match, and an optional name for the output file.
parser = argparse.ArgumentParser()
parser.add_argument('searchwords', type=str, nargs='+', help="Word(s) to search for and highlight")
# default=[]: without it argparse leaves args.exclude as None when -e is not
# given, and code that joins or iterates over the excluded words fails.
parser.add_argument('-e', '--exclude', dest='exclude', type=str, nargs='*', default=[], help="Word(s) that exclude a line from the results when they occur in it")
parser.add_argument('-o', '--outname', dest='outname', type=str, help="Output name")
args = parser.parse_args()

# Plain-text export URLs of the pads to read, built from one URL template
# and the name of each pad.
pads = list(
	'http://10.10.161.238/ether/p/{0}/export/txt'.format(pad)
	for pad in (
		'amazon',
		'polyhistorm',
		'etymologiae',
		'cyclopaedia',
		'encyclopedia',
		'great-inventions',
		'taric',
	)
)

# Alternative input, kept for reference: local text files instead of URLs.
# pads = [
# 	'../classification-systems/amazon.txt',
# 	'../classification-systems/cyclopaedia.txt',
# 	'../classification-systems/encyclopedia.txt',
# 	'../classification-systems/etymologiae.txt',
# 	'../classification-systems/greatinventions.txt',
# 	'../classification-systems/polyhistor.txt',
# 	'../classification-systems/taric.txt'
# ]

# Return parents
def get_parents (start, lines, level, parents = None):
	"""Collect the ancestors of a line in a tab-indented outline.

	Scans backwards from index `start`. Every time a line with fewer leading
	tabs than the level currently being looked for is found, it is recorded
	and the search continues for a line that is shallower still, until a
	line at level 0 is recorded or the beginning of `lines` is reached.

	start   -- index in `lines` at which the backward scan begins
	lines   -- list of lines; nesting depth is the number of leading tabs
	level   -- depth of the line whose ancestors are wanted
	parents -- optional list to append to; a new list is created if omitted

	Returns the list of (level, line) tuples, nearest ancestor first. The
	list is returned even when the scan runs out of lines before reaching
	level 0, so ancestors found up to that point are not lost.
	"""
	# A fresh list per call: a list used as default value would be shared
	# between all calls that rely on the default.
	if parents is None:
		parents = []

	for l in range(start, -1, -1):
		line_level = get_level(lines[l])
		if line_level < level:
			parents.append((line_level, lines[l]))
			if line_level <= 0:
				# reached the top of the outline
				break
			# keep scanning upwards, now for the parent of this parent
			level = line_level

	return parents

# Return all subtrees
def get_children (start, lines, level):
	"""Collect the lines nested below a line in a tab-indented outline.

	Scans forward from the line after index `start` and gathers every line
	that has more leading tabs than `level`. The scan stops at the first
	line that is not deeper than `level`, or at the end of `lines`.

	start -- index in `lines` of the line whose descendants are wanted
	lines -- list of lines; nesting depth is the number of leading tabs
	level -- depth of the line at `start`

	Returns a flat list of (level, line) tuples in file order. The list is
	also returned when the subtree runs up to the last line of `lines`.
	"""
	children = []

	for l in range(start+1, len(lines)):
		line_level = get_level(lines[l])
		if line_level <= level:
			# first line that is not nested deeper: the subtree has ended
			break
		children.append((line_level, lines[l]))

	return children

def get_level (line):
	"""Return the number of tab characters at the very start of `line`.

	Tabs that appear after any other character are not counted.
	"""
	without_indent = line.lstrip('\t')
	return len(line) - len(without_indent)

# Script body: read every pad, then write an HTML report of the lines that
# contain a search word (and no excluded word), each together with its
# ancestors and descendants in the tab-indented outline.

# urlopen lives in urllib on Python 2 and in urllib.request on Python 3.
try:
	from urllib.request import urlopen
except ImportError:
	from urllib import urlopen

def _as_text (value):
	"""Return `value` as unicode text; byte strings are decoded as UTF-8."""
	# NOTE(review): UTF-8 is assumed for downloaded pads and for
	# command-line words on Python 2 -- confirm.
	if isinstance(value, bytes):
		return value.decode('utf-8', 'replace')
	return value

def _mark_words (pattern, line, level):
	"""Wrap every match of `pattern` in `line` in a highlight span."""
	opening = u'<span class="highlight high{0}">'.format(level)
	return pattern.sub(lambda found: opening + found.group(0) + u'</span>', line)

is_url_match = r'^http(?:s)?:\/\/'
padnames = []
files = []

# args.exclude is None when -e is not given; treat that as "exclude nothing".
exclude = args.exclude or []
searchwords = args.searchwords
outname = args.outname if args.outname else searchwords[0]

stdout.write('Opening pads:\n')
for path in pads:
	stdout.write('  - {0}\n'.format(path))
	if re.match(is_url_match, path):
		handle = urlopen(path)
		# [\w-] keeps hyphenated pad names such as 'great-inventions' whole
		namematch = re.search(r'([\w-]+)/export/txt', path)
	else:
		handle = codecs.open(path, 'r', 'utf-8')
		namematch = re.search(r'([\w-]+)\.\w+$', path)
	# fall back to the path itself when no name can be extracted from it
	padname = namematch.group(1) if namematch else path

	try:
		text = handle.readlines()
	finally:
		handle.close()

	# urlopen yields undecoded byte strings, codecs.open yields text;
	# normalise so that all later processing works on text only.
	files.append([_as_text(raw) for raw in text])
	padnames.append(padname)

strwords = '-'.join(searchwords)
strwordsnice = u' '.join(_as_text(shown) for shown in searchwords)
strexclude = u' '.join(_as_text(shown) for shown in exclude)

# Pad lines are lowercased before matching, so the words are lowercased too.
# Empty words are dropped because they would be "found" in every line.
match_words = [_as_text(given).lower() for given in searchwords if given]
skip_words = [_as_text(given).lower() for given in exclude if given]
# One pattern for all words, applied in a single pass, so that a later word
# can never match inside the markup inserted for an earlier one. Longest
# alternative first, so a word wins over another word that is its prefix.
highlight = re.compile(u'|'.join(re.escape(term) for term in sorted(match_words, key=len, reverse=True)))

stdout.write('Parsing:\n')
with codecs.open('../stories/object_{0}.html'.format(outname),'w', 'utf-8') as output:
	output.write(u'<small class="info">highlighted words: '+strwordsnice+u'</small><br>\n')
	output.write(u'<small class="info">excluded words: '+strexclude+u'</small>\n\n')
	output.write(u'<div id="main">\n')

	for padname, lines in zip(padnames, files):
		stdout.write('  - {0}\n'.format(padname))
		output.write(u'\t<div class="source">\n\t\t<div class="source-title"><small>{0}</small></div>\n'.format(padname))
		# lowercase, and turn each run of 8 whitespace characters into a tab
		lines = [re.sub(r'\s{8}', '\t', raw.lower()) for raw in lines]

		for l, line in enumerate(lines):
			line = line.rstrip('\n')

			if not any(term in line for term in match_words):
				continue
			if any(term in line for term in skip_words):
				continue

			level = get_level(line)
			marked = _mark_words(highlight, line, level)
			# "or []" because the helpers may return None instead of a list
			parents = get_parents(l, lines, level, []) or []
			children = get_children(l, lines, level) or []

			# Loop variables are named differently from `level` on purpose:
			# on Python 2 list-comprehension variables leak into the
			# enclosing scope and would overwrite it.
			parent_html = u''.join(
				u'<div class="parent level_{0}">{1}</div>'.format(parent_level, parent.strip('\t\n'))
				for parent_level, parent in reversed(parents)
			)
			child_html = u''.join(
				u'<div class="child level_{0}">{1}</div>'.format(child_level, child.strip('\t\n'))
				for child_level, child in children
			)

			output.write(u'\t\t<div class="match t{0}">\n'.format(level))
			output.write(u'\t\t\t<div class="parents">{0}</div>'.format(parent_html))
			output.write(u'\t\t\t<div class="line t{0}"><small class="linenumber">{1}</small>{2}</div>'.format(level, l, marked))
			output.write(u'\t\t\t<div class="children">{0}</div>'.format(child_html))
			output.write(u'\t\t</div>\n')

		# closes <div class="source">
		output.write(u'\t</div>\n')
	# closes <div id="main">
	output.write(u'</div>\n')

stdout.write('*output written*\n')