# Hosting-page residue, not part of the script: "Download raw (578 bytes)"
# -*- coding: utf-8 -*-
"""Run the html5lib_typogrify French filters over an HTML file.

Reads ``content.html``, parses it into a DOM tree with html5lib, passes the
tree-walker token stream through the ``spaces`` and ``hyphenate`` filters,
and writes the serialized result to ``content.cleaned.html`` as UTF-8.

NOTE(review): what the two filters change exactly is defined in
``html5lib_typogrify.french.filters``; judging by their names they handle
French typographic spacing and hyphenation -- confirm against that package.
"""
import codecs

import html5lib
from html5lib_typogrify.french.filters import spaces
from html5lib_typogrify.french.filters import hyphenate

# Parse the input. The file is opened in binary mode and the encoding is
# handed to html5lib, which does the decoding itself. The handle is only
# needed for the duration of the parse call, which builds the whole DOM.
with codecs.open('content.html', 'rb') as f:
    dom = html5lib.parse(f, treebuilder="dom", encoding='utf-8')

# Walk the DOM as a token stream and chain the filters over it; each filter
# wraps the previous stream.
walker = html5lib.getTreeWalker("dom")
stream = walker(dom)
stream = spaces.Filter(stream)
stream = hyphenate.Filter(stream)

s = html5lib.serializer.HTMLSerializer()

# render() consumes the filtered stream and returns the serialized markup;
# the codecs writer encodes it to UTF-8 on the way out.
with codecs.open('content.cleaned.html', 'w', 'utf-8') as f:
    f.write(s.render(stream))