medor.www
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

0023_auto_20160714_1426.py
text/x-python

Download raw (2.0 KB)

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from django.db import migrations

import re
import html5lib
from html5lib.filters import _base


def remove_control_characters(html):
    def str_to_int(s, default, base=10):
        if int(s, base) < 0x10000:
            return unichr(int(s, base))
        return default
    html = re.sub(ur"&#(\d+);?", lambda c: str_to_int(c.group(1), c.group(0)), html)
    html = re.sub(ur"&#[xX]([0-9a-fA-F]+);?", lambda c: str_to_int(c.group(1), c.group(0), base=16), html)
    html = re.sub(ur"[\x00-\x08\x0b\x0e-\x1f\x7f]", "", html)
    return html


class BodyContentFilter(_base.Filter):
    def __iter__(self):
        emit = False

        for token in _base.Filter.__iter__(self):

            if token["type"] == "StartTag" and token["name"] == "body":
                emit = True
                continue

            elif token["type"] == "EndTag" and token["name"] == "body":
                emit = False

            if emit:
                yield token


def move_chapo(apps, schema_editor):
    Article = apps.get_model("publish", "Article")

    walker = html5lib.getTreeWalker("lxml")
    s = html5lib.serializer.HTMLSerializer(quote_attr_values=True, omit_optional_tags=False)

    for article in Article.objects.all():
        print(article.title)
        tree = html5lib.parse(remove_control_characters(article.body), treebuilder="lxml", namespaceHTMLElements=False)

        chapo = tree.xpath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' chapeau ')]")

        if len(chapo):
            chapo[0].getparent().remove(chapo[0])
            del chapo[0].attrib["class"]
            stream2 = walker(chapo[0])
            article.lead = s.render(stream2)

        stream = walker(tree)
        stream = BodyContentFilter(stream)
        article.body = s.render(stream)

        article.save()


class Migration(migrations.Migration):

    dependencies = [
        ('publish', '0022_article_lead'),
    ]

    operations = [
        migrations.RunPython(move_chapo),
    ]