the-riddle
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

wrap_last_sentence_paragraphs.py
text/x-python

Download raw (1.5 KB)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import glob
import codecs
import os.path
import argparse

# Command-line interface: one positional argument naming the folder to scan.
# The summary text is passed as ``description``; the first positional
# parameter of ArgumentParser is ``prog``, so passing the text positionally
# would show it as the program name in the usage line instead.
parser = argparse.ArgumentParser(
  description="Wrap last sentence of paragraphs. To deal with orphans")
parser.add_argument('folder', help="Stories folder, with HTML files")
# Parsed at import time from the process arguments; ``args.folder`` holds the
# folder path given on the command line.
args = parser.parse_args()


# A sentence break is one of . ? ! : plus any whitespace that follows it.
# The negative lookahead only accepts a break when real text comes after it:
# a mark followed solely by more closing punctuation and/or whitespace up to
# the end of the content is rejected, so a trailing "...", ".)" or ". " can
# never end up as the whole wrapped tail.
_SENTENCE_BREAK = re.compile(r'[.?!:]\s*(?![.?!:)\s]*$)')

# An opening tag, at least one character of content and the matching closing
# tag, all reachable from the start of the string without crossing a newline.
_COMPLETE_ELEMENT = re.compile(r'.*<(\w+)[^>]*>.+</\1>.*')

# The end of a tag together with the whitespace that follows it.
_TAG_END = re.compile(r'>\s*')


def wrap_last_sentence(m):
  """Wrap the last sentence of a matched paragraph ending in a span.

  Intended as a replacement callback for ``re.sub``.

  Args:
    m: match object exposing the named groups ``content`` (the text to
      split) and ``reference`` (optional markup appended after the span).
      A group that is missing or did not participate counts as empty.

  Returns:
    The content with its final part enclosed in
    ``<span class="last-line">...</span>``, followed by the reference.
  """
  groups = m.groupdict()
  content = groups.get('content') or u''
  reference = groups.get('reference') or u''

  breaks = list(_SENTENCE_BREAK.finditer(content))
  if breaks:
    # Split right after the last sentence break that still has text after it.
    splitindex = breaks[-1].end()
  else:
    # No usable break: split after the first space found within the last 80
    # characters. find() returns -1 when there is none, so the +1 yields 0
    # and the whole content is wrapped.
    splitindex = content.find(u' ', -80) + 1

  tail = content[splitindex:]

  # The tail contains a tag but no complete element, so the split presumably
  # fell inside an element. Move the split past the last tag end (and the
  # whitespace after it) so the span does not enclose a dangling tag.
  if '>' in tail and not _COMPLETE_ELEMENT.match(tail):
    splitindex += list(_TAG_END.finditer(tail))[-1].end()

  head = content[:splitindex]
  tail = content[splitindex:]

  return u'{0}<span class="last-line">{1}</span>{2}'.format(head, tail, reference)

# End of a paragraph: up to 170 characters (no newlines) ending in . ? ! or )
# plus optional whitespace, optionally followed by an <a>...</a> reference,
# and located directly before a closing </p> (which is not consumed).
# Compiled once, outside the loop, from a raw string so the backslashes reach
# the regex engine unchanged.
paragraph_tail = re.compile(
  r'(?P<content>.{,170}[.?!)]\s*)(?P<reference><a[^>]*>.*</a>)?(?=</p>)')

# Process every *.html file directly inside the folder given on the command line.
for htmlpath in glob.glob("{0}/*.html".format(os.path.normpath(args.folder))):
  # Parenthesised form prints the same text on Python 2 and Python 3.
  print("\t{0}".format(htmlpath))

  with codecs.open(htmlpath, mode='r', encoding='utf-8') as htmlfile:
    original = htmlfile.read()

  # Substitute after the read handle is closed.
  htmlstring = paragraph_tail.sub(wrap_last_sentence, original)

  # Write back only when the substitution changed something, so untouched
  # (or empty) files are not rewritten.
  if htmlstring != original:
    with codecs.open(htmlpath, mode='w', encoding='utf-8') as htmlfile:
      htmlfile.write(htmlstring)