medor.maquette
clone your own copy | download snapshot

Snapshots | iceberg

No images in this repository’s iceberg at this time

Inside this repository

microtypo.py
text/x-python

Download raw (2.2 KB)

#! /usr/bin/env python


# Copyright (C) 2015 Alexandre Leray

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Fixes French typography in HTML files
#
# Usage:
#
#     ./microtypo.py infile.html outfile.html
#     echo "<p>...</p>" | ./microtypo.py - outfile.html
#     echo "<p>...</p>" | ./microtypo.py | someotherprogram


import codecs
import html5lib

from html5lib_typogrify.french.filters import ellipsis, spaces, dashes, widows_orphans


def fix_french(html):
    """Run the French typography filters over an HTML fragment.

    The fragment is parsed into a DOM tree with html5lib, walked as a token
    stream, passed through the ``dashes``, ``ellipsis``, ``spaces`` and
    ``widows_orphans`` filters (in that order) and serialized back to text.

    :param html: the HTML fragment to process, as text.
    :returns: the serialized HTML produced from the filtered stream.
    """
    dom = html5lib.parseFragment(html, treebuilder="dom")
    walker = html5lib.getTreeWalker("dom")

    # Each filter wraps the previous stream, so the order below is the order
    # in which the filters see the tokens.
    stream = walker(dom)
    stream = dashes.Filter(stream)
    stream = ellipsis.Filter(stream)
    stream = spaces.Filter(stream)
    stream = widows_orphans.Filter(stream)

    # "always" is truthy, so older html5lib releases (boolean option) still
    # quote every attribute value, while newer releases, which only accept
    # "always"/"spec"/"legacy", no longer reject the setting.
    serializer = html5lib.serializer.HTMLSerializer(
        quote_attr_values="always",
        alphabetical_attributes=True,
        omit_optional_tags=False)

    # render() drives the serialization itself; no separate serialize() call
    # is needed.
    return serializer.render(stream)


if __name__ == '__main__':
    # Command-line entry point: read HTML from `infile` (default: stdin),
    # fix its typography and write the result to `outfile` (default: stdout)
    # encoded as UTF-8.
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    args = parser.parse_args()

    # The encoding is handled by hand below, so work on raw bytes.  Python 3
    # text streams expose their byte stream as `.buffer`; Python 2 file
    # objects have no such attribute and already read/write bytes.
    raw_in = getattr(args.infile, "buffer", args.infile)
    raw_out = getattr(args.outfile, "buffer", args.outfile)

    content = raw_in.read()
    try:
        unicode_content = content.decode("utf-8")
    except UnicodeDecodeError:
        # Fall back to Latin-1, which can decode any byte sequence.
        unicode_content = content.decode("iso-8859-1")

    html = fix_french(unicode_content)

    raw_out.write(html.encode("utf-8"))
    # Make sure the bytes written below the text layer reach their
    # destination before the interpreter exits.
    raw_out.flush()