medor
mediawiki_tags.py

import os
import re
import codecs
from urlparse import parse_qsl, urlparse  # Python 2 stdlib

import lxml.etree
import html5lib
import requests

from django import template
from django.conf import settings

register = template.Library()


def cache(url):
    """Download a MediaWiki page once and return the path of the local copy."""
    qs = parse_qsl(urlparse(url).query)
    title = dict(qs)['title']
    filename = os.path.join(settings.CACHE_PATH, title)

    if not os.path.exists(settings.CACHE_PATH):
        os.makedirs(settings.CACHE_PATH)

    # Only hit the wiki if the page has not been fetched before.
    if not os.path.exists(filename):
        r = requests.get(url)
        with open(filename, 'wb') as fd:
            for chunk in r.iter_content(1024):
                fd.write(chunk)

    return filename


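# Example (illustrative only; settings.CACHE_PATH = '/tmp/mw-cache' is an assumed value):
#   cache('http://tunakutafuta.be/index.php?title=Main_Page')
# downloads the page on the first call, writes it to '/tmp/mw-cache/Main_Page'
# and returns that path; later calls with the same title reuse the cached file.
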
def outline(tree):
    """Recursively wrap each heading and its following siblings in a <section>."""
    pattern = re.compile(r'^h(\d)')
    last_depth = None
    sections = []  # list of [heading element, new <section> element]

    for child in tree.iterchildren():
        tag = child.tag

        if tag is lxml.etree.Comment:
            continue

        match = pattern.match(tag.lower())

        if match:
            depth = int(match.group(1))

            # A heading at the same or a shallower level starts a new section;
            # deeper headings fall through and are handled by the recursion.
            if last_depth is None or depth <= last_depth:
                last_depth = depth

                sections.append([child, lxml.etree.Element('section')])
                continue

        # Everything that follows a heading is moved into its section.
        if sections:
            sections[-1][1].append(child)

    for heading, section in sections:
        outline(section)
        heading.addprevious(section)
        section.insert(0, heading)

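# Illustration (not from the original source) of what outline() produces: a flat
# body such as
#   <h2>A</h2><p>a</p><h3>B</h3><p>b</p><h2>C</h2>
# is rewritten, recursively, into nested sections:
#   <section><h2>A</h2><p>a</p><section><h3>B</h3><p>b</p></section></section>
#   <section><h2>C</h2></section>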

def extract(src, id, details=False):
    """Return the <section> with the given id from a cached page, serialized as XML."""
    with codecs.open(src, "r", "utf-8") as f:
        html = f.read()

    parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
    tree = parser.parse(html, encoding=None, parseMeta=True, useChardet=True)

    # Turn the flat heading structure of the article body into nested <section>s.
    outline(tree.xpath("//div[@id='mw-content-text']")[0])

    # Drop the "[edit]" links MediaWiki adds next to every heading.
    for elt in tree.xpath('//span[@class="editsection"]'):
        elt.getparent().remove(elt)

    # Move each headline's id onto its <section> and inline the heading text.
    for elt in tree.xpath('//span[@class="mw-headline"]'):
        parent = elt.getparent()
        parent.getparent().attrib['id'] = elt.attrib.pop('id')
        text = elt.text
        parent.remove(elt)
        parent.text = text

    # Wrap the contents of every section in a <div class="wrapper">.
    for elt in tree.xpath('//section'):
        wrapper = lxml.etree.Element('div')
        wrapper.attrib['class'] = 'wrapper'
        for child in list(elt):
            wrapper.append(child)
        elt.append(wrapper)

    # Make the relative image URLs point back at the wiki.
    for elt in tree.xpath('//img'):
        elt.attrib['src'] = 'http://tunakutafuta.be/' + elt.attrib['src']

    frag = tree.xpath("//section[@id='%s']" % id)[0]

    if details:
        pattern = re.compile(r'^h(\d)')
        # The subsections and their headings now sit inside the wrapper <div>s,
        # so look one level below each of them.
        for section in frag.xpath('./div/section'):
            section.tag = "details"
            for child in section.xpath('./div/*'):
                if isinstance(child.tag, basestring) and pattern.match(child.tag.lower()):
                    child.tag = "summary"

    return lxml.etree.tostring(frag, method='xml', pretty_print=True, xml_declaration=None, encoding="utf-8")

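# Illustrative call (the path and section id are made-up examples):
#   extract('/tmp/mw-cache/Main_Page', 'History')
# returns the serialized <section id="History"> fragment as UTF-8 XML; with
# details=True its first-level subsections are emitted as <details> elements
# whose heading becomes a <summary>.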

def mwinclude(name, id, rev=None):
    """Include section `id` of wiki page `name`, optionally pinned to revision `rev`."""
    url = 'http://tunakutafuta.be/index.php?title=%s' % name
    if rev:
        url += '&oldid=%s' % rev
    src = cache(url)
    return extract(src, id)


register.simple_tag(mwinclude)
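
# Template usage sketch (assuming the templatetags module is loaded under its
# file name, and a Django version whose simple_tag accepts optional arguments):
#   {% load mediawiki_tags %}
#   {% mwinclude "Main_Page" "History" %}
#   {% mwinclude "Main_Page" "History" 1234 %}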


def mwincludedetails(name, id, rev=None):
    """Like mwinclude, but renders first-level subsections as collapsible <details> blocks."""
    url = 'http://tunakutafuta.be/index.php?title=%s' % name
    if rev:
        url += '&oldid=%s' % rev
    src = cache(url)
    return extract(src, id, details=True)


register.simple_tag(mwincludedetails)
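
# The collapsible variant is used the same way (same illustrative page and id):
#   {% mwincludedetails "Main_Page" "History" %}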