Download raw (3.6 KB)
#! /usr/bin/env python2

# Copyright (C) 2015-2017 Alexandre Leray (Open Source Publishing)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Fetches the images of an HTML document and fixes links to them
#
# Usage:
#
#     <this script> [infile.html] [-o outfile.html]
#
# Without `infile` the document is read from stdin; without `-o` the result
# is written to stdout.  Progress messages go through the logger (stderr),
# so they never mix with the HTML written to stdout.


import logging
import os
import urlparse

import html5lib
import requests
from html5lib.filters import _base


logger = logging.getLogger(__name__)
formatter = logging.Formatter('[%(levelname)s] %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)


class ScriptSrcFilter(_base.Filter):
    """Token filter that logs the ``src`` of every external ``<script>``.

    The token stream is passed through unchanged.
    """

    def __init__(self, source, local_dir="."):
        """Store the wrapped token stream.

        :param source: iterable of html5lib tree-walker tokens.
        :param local_dir: kept for signature parity with ``ImgSrcFilter``;
            this filter does not read it.
        """
        self.source = source
        self.local_dir = local_dir

    def __iter__(self):
        """Yield every token, logging script tags that carry a ``src``."""
        for token in _base.Filter.__iter__(self):
            # Only tag tokens hold an attribute mapping in "data"; text and
            # comment tokens hold a plain string there and have no "name".
            is_script_tag = (token.get("type") in ("StartTag", "EmptyTag")
                             and token.get("name") == "script")
            if is_script_tag:
                src = (token.get("data") or {}).get((None, "src"))
                # Inline scripts have no src attribute; nothing to report.
                if src is not None:
                    logger.info("%s %s", token, src)
            # A filter must re-emit the stream, otherwise the consumer gets
            # nothing to iterate over.
            yield token


class ImgSrcFilter(_base.Filter):
    """Token filter that downloads ``<img>`` sources and relinks them.

    For each ``img`` tag the ``src`` URL is fetched with ``requests`` and
    saved under ``local_dir``; the attribute is then rewritten to
    ``"/" + <local path>``.  Images that cannot be fetched are left
    untouched in the output.
    """

    def __init__(self, source, local_dir="."):
        """Store the wrapped token stream and the download directory.

        :param source: iterable of html5lib tree-walker tokens.
        :param local_dir: directory the images are saved into; created on
            first use if it does not exist.
        """
        self.source = source
        self.local_dir = local_dir

    def __iter__(self):
        """Yield every token, rewriting ``src`` of images that were saved."""
        for token in _base.Filter.__iter__(self):
            if token["type"] == "EmptyTag" and token['name'] == "img":
                src = token["data"].get((None, 'src'))
                # An <img> without src has nothing to download.
                if src:
                    local_path = self._fetch(src)
                    if local_path is not None:
                        token["data"][(None, 'src')] = "/" + local_path
            # Always re-emit the token: a failed download must not remove
            # the image tag from the document.
            yield token

    def _fetch(self, src):
        """Download *src* into ``self.local_dir``.

        :returns: the local file path, or ``None`` when the image could not
            be retrieved or no usable file name could be derived.
        """
        logger.info(" downloading %s", src)
        try:
            response = requests.get(src, stream=True)
        except requests.exceptions.RequestException:
            # Covers relative URLs, unsupported schemes (e.g. data: URIs)
            # and connection failures.
            logger.info("Could not retrieve %s. Skipping.", src)
            return None

        try:
            if not response.ok:
                logger.info("Could not retrieve %s. Skipping.", src)
                return None

            # Use the final URL (after redirects) to name the file.
            parts = urlparse.urlparse(response.url)
            fn = os.path.basename(parts.path)
            if fn in ("", ".", ".."):
                # e.g. the URL path ends with "/": there is no file name
                # to save the image under.
                logger.info("Could not derive a file name from %s. Skipping.",
                            response.url)
                return None
            logger.info(" -> %s", fn)

            if not os.path.exists(self.local_dir):
                os.makedirs(self.local_dir)

            local_path = os.path.join(self.local_dir, fn)
            with open(local_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    handle.write(block)
            return local_path
        finally:
            # With stream=True the connection stays checked out until the
            # body is consumed or the response is closed.
            response.close()


def pull_images(infile):
    """Localize the images of the HTML fragment read from *infile*.

    Images are saved in an ``img`` directory next to *infile* (derived from
    ``infile.name``).

    :param infile: open file object; must provide ``read()`` and ``name``.
    :returns: the serialized HTML with rewritten image sources.
    """
    logger.info("processing %s", infile.name)

    dom = html5lib.parseFragment(infile.read(), treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")
    stream = walker(dom)

    local_dir = os.path.join(os.path.dirname(infile.name), "img")
    stream = ImgSrcFilter(stream, local_dir=local_dir)

    s = html5lib.serializer.HTMLSerializer(quote_attr_values=True,
                                           omit_optional_tags=False)
    output = s.render(stream)

    return output


if __name__ == '__main__':
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    # nargs='?' is required for the stdin default to ever take effect on a
    # positional argument.
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin)
    # const covers a bare "-o" (no file name), which would otherwise yield
    # None and crash on write.
    parser.add_argument('-o', '--outfile', nargs='?',
                        type=argparse.FileType('w'),
                        const=sys.stdout, default=sys.stdout)

    args = parser.parse_args()

    out = pull_images(args.infile)
    args.outfile.write(out.encode("utf-8"))