pullimages.py (repository: the-riddle)

#! /usr/bin/env python2


# Copyright (C) 2015-2017 Alexandre Leray (Open Source Publishing)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Fetches the remote images referenced by an HTML document, saves them next to
# the document in an "img/" directory and rewrites the <img> links so that
# they point at the local copies.
#
# Usage:
#
#     ./pullimages.py infile.html -o outfile.html
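#
# Illustrative example (the URL and paths are hypothetical): with an input
# file article/index.html containing
#
#     <img src="http://example.org/media/photo.jpg"/>
#
# the image is saved as article/img/photo.jpg and the tag is rewritten to
#
#     <img src="/article/img/photo.jpg"/>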


import urlparse
import os
import requests
import html5lib
import logging
from html5lib.filters import _base  # renamed to "base" in html5lib 1.0


logger = logging.getLogger(__name__)
formatter = logging.Formatter('[%(levelname)s] %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)

class ScriptSrcFilter(_base.Filter):
    """Debugging filter (unused below): prints the src of every <script> tag."""
    def __init__(self, source, local_dir="."):
        self.source = source
        self.local_dir = local_dir

    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            if token["type"] == "StartTag" and token["name"] == "script":
                print token, token["data"].get((None, "src"))
            yield token

class ImgSrcFilter(_base.Filter):
    """
    html5lib tree-walker filter that downloads each remote <img> source into
    a local directory and rewrites its src attribute to point at the copy.
    """
    def __init__(self, source, local_dir="."):
        self.source = source
        self.local_dir = local_dir

    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            if token["type"] == "EmptyTag" and token['name'] == "img":
                src = token["data"].get((None, 'src'))

                # only absolute http(s) URLs can be fetched; a missing,
                # relative or data: src is passed through untouched
                if not src or not src.startswith(("http://", "https://")):
                    yield token
                    continue

                print("    downloading {}".format(src))

                response = requests.get(src, stream=True)

                if not response.ok:
                    logger.info("Could not retrieve {}. Skipping.".format(src))
                    # keep the original remote link rather than dropping the tag
                    yield token
                    continue

                # name the local copy after the last path component of the
                # final (post-redirect) URL
                parts = urlparse.urlparse(response.url)
                fn = os.path.basename(parts.path)
                print("        -> {}".format(fn))

                if not os.path.exists(self.local_dir):
                    os.makedirs(self.local_dir)

                local_path = os.path.join(self.local_dir, fn)

                # stream the response to disk in 1 KiB chunks
                with open(local_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        handle.write(block)

                # point the document at the local copy
                token["data"][(None, 'src')] = "/" + local_path

            yield token


def pull_images(infile):
    """Parse infile as an HTML fragment, download its images and return the
    re-serialised markup with localised <img> links."""
    print("processing {}".format(infile.name))
    dom = html5lib.parseFragment(infile.read(), treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")

    # pipe the token stream through the image filter; downloads land in an
    # "img/" directory next to the input file
    stream = walker(dom)
    local_dir = os.path.join(os.path.dirname(infile.name), "img")
    stream = ImgSrcFilter(stream, local_dir=local_dir)

    s = html5lib.serializer.HTMLSerializer(
        quote_attr_values=True, omit_optional_tags=False)

    output = s.render(stream)
    return output
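
# Minimal sketch of programmatic use (the filenames are hypothetical):
#
#     with open("article/index.html") as infile:
#         html = pull_images(infile)
#     with open("article/index.local.html", "w") as outfile:
#         outfile.write(html.encode("utf-8"))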


if __name__ == '__main__':
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'), default=sys.stdout)
    args = parser.parse_args()

    out = pull_images(args.infile)
    args.outfile.write(out.encode("utf-8"))