Download raw (1.4 KB)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Insert figure-reference spans in front of <img> tags in story HTML files.

Usage: pass the stories folder as the single positional argument. Every
``*.html`` file found inside a ``book*`` sub-folder of that folder is read
as UTF-8, each matching ``<img ... />`` tag is prefixed with a
``<span class="img_ref">`` element and given an ``id``, and the file is
written back in place when its content changed.

The code is written to run unchanged on both Python 2 and Python 3.
"""

import re
import glob
import codecs
import os.path
import argparse

# Matches <img src="..." alt="..." [class="..."] /> with exactly this
# attribute order and spacing. Rewritten tags gain an ``id`` attribute before
# the closing "/>", so they no longer match and a second run is a no-op.
# NOTE: ``.[^"]+`` requires at least two characters per attribute value; this
# is kept from the original pattern so the set of matched tags is unchanged.
IMG_TAG_RE = re.compile(
    r'<img src="(.[^"]+)" alt="(.[^"]+)" (?:class="(.[^"]+)" )?/>'
)

# Trailing run of word characters in the alt text, used as the figure label
# (for example "12" in "Figure 12").
FIGURE_LABEL_RE = re.compile(r'\w+$')


def insert_image_references(m):
    """Return the replacement markup for one matched ``<img>`` tag.

    Args:
        m: A match object produced by ``IMG_TAG_RE``; group 1 is the ``src``
            value, group 2 the ``alt`` value and group 3 the ``class`` value
            (``None`` when the tag has no class attribute).

    Returns:
        A ``<span class="img_ref">`` element followed by the image tag with
        an added ``id="figure_<label>"`` attribute, where ``<label>`` is the
        trailing run of word characters of the alt text. If the alt text does
        not end in a word character, the matched tag is returned unchanged.
    """
    src = m.group(1)
    alt = m.group(2)
    class_name = m.group(3)

    label_match = FIGURE_LABEL_RE.search(alt)
    if label_match is None:
        # No usable figure label: leave the tag as it is instead of crashing.
        return m.group(0)
    num = label_match.group(0)

    # Only emit a class attribute when the source tag had one; formatting
    # None would otherwise produce the literal text class="None".
    class_attr = ''
    if class_name is not None:
        class_attr = ' class="{0}"'.format(class_name)

    return (
        '<span class="img_ref" data-for="figure_{3}">{3}</span>'
        '<img src="{0}" alt="{1}"{2} id="figure_{3}" />'
    ).format(src, alt, class_attr, num)


def _add_image_references(htmlstring):
    """Return *htmlstring* with every matching ``<img>`` tag rewritten."""
    return IMG_TAG_RE.sub(insert_image_references, htmlstring)


def _process_folder(folder):
    """Rewrite the HTML files in every ``book*`` sub-folder of *folder*."""
    book_pattern = "{0}/book*".format(os.path.normpath(folder))
    for bookfolder in sorted(glob.glob(book_pattern)):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("{0}".format(bookfolder))
        html_pattern = "{0}/*.html".format(os.path.normpath(bookfolder))
        for htmlpath in sorted(glob.glob(html_pattern)):
            print("\t{0}".format(htmlpath))
            with codecs.open(htmlpath, mode='r', encoding='utf-8') as htmlfile:
                original = htmlfile.read()

            updated = _add_image_references(original)

            # Skip the write when nothing was replaced so untouched files
            # are not needlessly rewritten.
            if updated != original:
                with codecs.open(htmlpath, mode='w', encoding='utf-8') as htmlfile:
                    htmlfile.write(updated)


def main():
    """Parse the command line and process the given stories folder."""
    parser = argparse.ArgumentParser("Insert image references")
    parser.add_argument('folder', help="Stories folder, with HTML files")
    args = parser.parse_args()
    _process_folder(args.folder)


if __name__ == "__main__":
    main()