medor
etree_tags.py

import codecs
import os.path
import re
from functools import partial
from hashlib import sha1

import html5lib
import lxml.etree
import requests

from django import template
from django.conf import settings
from django.utils.safestring import mark_safe

register = template.Library()


@register.filter
def cache(url):
    """Download `url` into settings.CACHE_PATH (keyed by the SHA-1 of the
    URL) on first use and return the path of the local copy."""
    name = sha1(url.encode('utf-8')).hexdigest()
    path = os.path.join(settings.CACHE_PATH, name)

    if not os.path.exists(settings.CACHE_PATH):
        os.makedirs(settings.CACHE_PATH)

    if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as fd:
            for chunk in r.iter_content(1024):
                fd.write(chunk)

    return path
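
# Illustrative behaviour of `cache` (the CACHE_PATH value and the URL are
# assumptions, not values from this project): with
# settings.CACHE_PATH = "/tmp/medor-cache",
#   {{ "http://example.org/doc.html"|cache }}
# fetches the page once, stores it as /tmp/medor-cache/<sha1 of the URL>,
# and renders that local path on later uses without hitting the network again.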


@register.filter
def tree(path):
    """Parse the HTML file at `path` with html5lib and return an lxml tree."""
    with codecs.open(path, "r", "utf-8") as f:
        html = f.read()

    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"),
                                 namespaceHTMLElements=False)
    tree = parser.parse(html)

    return tree


@register.filter
def outline(tree, path=None):
    """Nest a flat run of headings into <section> elements.

    A heading at the same or a shallower level than the one that opened the
    current section starts a new sibling section; deeper headings and all
    non-heading content are collected into the current section and outlined
    recursively. `path` is an optional XPath expression selecting the element
    to outline."""
    pattern = re.compile(r'^h(\d)')
    last_depth = None
    sections = []  # [header, <section />]

    if path:
        root = tree.xpath(path)[0]
    else:
        try:
            root = tree.getroot()
        except AttributeError:  # `tree` is already an element, not a tree
            root = tree

    for child in root.iterchildren():
        tag = child.tag

        if tag is lxml.etree.Comment:
            continue

        match = pattern.match(tag.lower())

        if match:
            depth = int(match.group(1))

            # Same or shallower heading: open a new section. Deeper headings
            # fall through and are collected into the current one.
            if last_depth is None or depth <= last_depth:
                last_depth = depth

                sections.append([child, lxml.etree.Element('section')])
                continue

        if sections:
            sections[-1][1].append(child)

    for section in sections:
        # Outline the collected content recursively, then move the <section>
        # into the heading's place and make the heading its first child.
        outline(section[1])
        section[0].addprevious(section[1])
        section[1].insert(0, section[0])

    return tree
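
# A worked example of what `outline` produces (the input markup is
# illustrative; the snippet relies on the html5lib import at the top of
# this module):
#
#   parser = html5lib.HTMLParser(
#       tree=html5lib.treebuilders.getTreeBuilder("lxml"),
#       namespaceHTMLElements=False)
#   doc = parser.parse("<h1>Title</h1><p>Intro</p><h2>Part</h2><p>Body</p>")
#   outline(doc, "//body")
#
# turns the flat heading sequence in <body> into (whitespace added for
# readability):
#
#   <section>
#     <h1>Title</h1>
#     <p>Intro</p>
#     <section>
#       <h2>Part</h2>
#       <p>Body</p>
#     </section>
#   </section>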


@register.filter
def cleanmw(tree):
    """Clean up MediaWiki markup: drop "[edit]" links, simplify headings,
    wrap each <section>'s content in a div.wrapper and absolutize relative
    image URLs."""
    # Remove the "[edit]" section links.
    for elt in tree.xpath('//span[@class="editsection"]'):
        elt.getparent().remove(elt)

    # Hoist the mw-headline id onto the heading's container (the surrounding
    # <section> once `outline` has run) and keep only the heading text.
    for elt in tree.xpath('//span[@class="mw-headline"]'):
        parent = elt.getparent()
        parent.getparent().attrib['id'] = elt.attrib.pop('id')
        text = elt.text
        parent.remove(elt)
        parent.text = text

    # Wrap the content of every <section> in a <div class="wrapper">.
    for elt in tree.xpath('//section'):
        wrapper = lxml.etree.Element('div')
        wrapper.attrib['class'] = 'wrapper'
        for child in list(elt):
            wrapper.append(child)
        elt.append(wrapper)

    # Make relative image URLs absolute.
    for elt in tree.xpath('//img'):
        src = elt.attrib.get('src', '')
        if src and not src.startswith('http'):
            elt.attrib['src'] = 'http://tunakutafuta.be/' + src

    return tree
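
# Illustrative effect of `cleanmw` on MediaWiki output, after `outline` has
# wrapped the headings (the markup below is a simplified assumption):
#
#   <section>
#     <h2><span class="mw-headline" id="History">History</span>
#         <span class="editsection">[edit]</span></h2>
#     <p><img src="images/map.png"/></p>
#   </section>
#
# becomes
#
#   <section id="History">
#     <div class="wrapper">
#       <h2>History</h2>
#       <p><img src="http://tunakutafuta.be/images/map.png"/></p>
#     </div>
#   </section>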



@register.filter
def xpath(tree, xpath):
    """Apply the XPath expression `xpath` to `tree` and return the result."""
    return tree.xpath(xpath)


@register.filter(is_safe=True)
def serialize(tree):
    """Serialize an lxml tree (or a list of nodes, e.g. the result of the
    `xpath` filter) back to markup and mark it safe for template output."""
    tostring = partial(lxml.etree.tostring, method='xml', pretty_print=True,
                       encoding='unicode')

    if isinstance(tree, list):
        ret = "\n".join([tostring(item) for item in tree])
    else:
        ret = tostring(tree)

    return mark_safe(ret)
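
# A plausible way to chain these filters in a template, assuming this file
# lives in a templatetags/ package and is loaded as "etree_tags" (the load
# name and the URL are assumptions):
#
#   {% load etree_tags %}
#   {{ "http://example.org/wiki/Some_page"|cache|tree|outline:"//body"|cleanmw|serialize }}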