Download raw (1.5 KB)
#!/usr/bin/env python # -*- coding: utf-8 -*- import re import glob import codecs import os.path import argparse parser = argparse.ArgumentParser("Wrap last sentence of paragraphs. To deal with orphans") parser.add_argument('folder', help="Stories folder, with HTML files") args = parser.parse_args() def wrap_last_sentence (m): groups = m.groupdict() content = groups['content'] if 'content' in groups else '' reference = groups['reference'] if 'reference' in groups and groups['reference'] else '' interpunctions = [m for m in re.finditer('[\.\?\!\:]\s*(?!$)', content)] if (interpunctions): splitindex = interpunctions[-1].end() else: splitindex = content.find(u' ', -80) + 1 head = content[:splitindex] tail = content[splitindex:] if '>' in tail and not re.match(".*\<(\w+)(?:[^\>])*>.+\</\\1\>.*", tail): splitindex += [m for m in re.finditer('>\s*', tail)][-1].end() head = content[:splitindex] tail = content[splitindex:] return u'{0}<span class="last-line">{1}</span>{2}'.format(head, tail, reference) for htmlpath in glob.glob("{0}/*.html".format(os.path.normpath(args.folder))): print "\t{0}".format(htmlpath) htmlstring = None with codecs.open(htmlpath, mode='r', encoding='utf-8') as htmlfile: htmlstring = htmlfile.read() htmlstring = re.sub(u'(?P<content>.{,170}[\.\?\!\)]\s*)(?P<reference><a[^>]*>.*</a>)?(?=<\/p>)', wrap_last_sentence, htmlstring) if htmlstring: with codecs.open(htmlpath, mode='w', encoding='utf-8') as htmlfile: htmlfile.write(htmlstring)