adva-zakai.overbooked
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

cleaner.py
text/x-python

Download raw (578 bytes)

# -*- coding: utf-8 -*-
import codecs
import html5lib
from html5lib_typogrify.french.filters import spaces
from html5lib_typogrify.french.filters import hyphenate


# Read content.html, pass it through the typogrify filters and write the
# re-serialized markup to content.cleaned.html.

# Only parsing reads the input file, so keep it open just for that step.
with codecs.open('content.html', 'rb') as source_file:
    dom = html5lib.parse(source_file, treebuilder="dom", encoding='utf-8')

# Walk the parsed DOM as a token stream and chain both filters over it.
# NOTE(review): presumably these apply French spacing and hyphenation
# rules -- confirm against html5lib_typogrify.
walker = html5lib.getTreeWalker("dom")
stream = walker(dom)
stream = spaces.Filter(stream)
stream = hyphenate.Filter(stream)

serializer = html5lib.serializer.HTMLSerializer()

# The filtered stream is serialized exactly once, by render(), whose result
# is written straight to the UTF-8 output file.
with codecs.open('content.cleaned.html', 'w', 'utf-8') as cleaned_file:
    cleaned_file.write(serializer.render(stream))