#! /usr/bin/env python2

# Copyright (C) 2015 Alexandre Leray

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Pulls stories from our platform
#
# Usage:
#
#     ./<this-script> [where]
#
# `where` is the output directory; it defaults to "stories".

import codecs
import sys

import requests
import html5lib
from html5lib.filters import whitespace
from html5lib_typogrify.french.filters import hyphenate, medor, figures

import settings


# Keyword arguments passed to hyphenate.Filter for each rubric type.
# Rubric types that are not listed use _DEFAULT_HYPHENATION.
_HYPHENATION = {
    "portrait": dict(min_len=7, left=3, right=4),
    "editorial": dict(min_len=7, left=3, right=4),
    "rubrique": dict(min_len=4, left=2, right=2),
}
_DEFAULT_HYPHENATION = dict(min_len=5, left=2, right=3)


def get_client():
    """Return a requests session that has logged in with settings.USER/PASS.

    The login page is fetched first so that the server sets the
    ``csrftoken`` cookie, which is then posted back together with the
    credentials.  Exits the process with a non-zero status if the login
    request does not answer with HTTP 200.
    """
    # NOTE(review): the login URL is empty in this copy of the file; it must
    # be set to the platform's login page before the script can run.
    login_url = ''

    client = requests.session()

    # Retrieve the CSRF token first
    client.get(login_url)  # sets cookie
    csrftoken = client.cookies['csrftoken']

    login_data = dict(
        username=settings.USER,
        password=settings.PASS,
        csrfmiddlewaretoken=csrftoken,
        next='/',
    )
    resp = client.post(login_url, data=login_data,
                       headers=dict(Referer=login_url))

    if resp.status_code != 200:
        # A string argument is printed to stderr and yields exit status 1,
        # so a failed login is visible to the caller and to the shell.
        sys.exit("Login failed (HTTP status %d)" % resp.status_code)

    return client


def pull_stories(where="stories"):
    """Download every article of the issue and save it as filtered HTML.

    The article memberships of the issue are listed through the API; for
    each one the rendered story is fetched, run through the whitespace,
    medor, figures and hyphenate filters, and written as UTF-8 to
    ``<where>/<slug>.html``.

    where -- directory that receives the ``.html`` files.
    """
    client = get_client()

    # NOTE(review): the hostname is empty in this copy of the file; it must
    # be set to the platform's base URL before the script can run.
    hostname = ''
    path = "/fr/api/article-membership/?issue__id=3"

    listing = client.get(hostname + path)

    for membership in listing.json():
        article = membership['article']
        slug = article['slug']
        rubric_type = article['rubric']['type']

        print("Pulling %s" % slug)

        # Gets the story
        ## TODO: namespace the url on the django project
        story = client.get(hostname + "/fr/%s.html" % membership['id'])

        # Using etree is important here because it does not suffer from a bug
        # where a text featuring entities is split into various
        # adjacent text nodes.
        # (thanks html5lib folks for the tip; the reference link is missing
        # from this copy of the file).
        dom = html5lib.parseFragment(story.text, treebuilder="etree")
        walker = html5lib.getTreeWalker("etree")

        stream = walker(dom)
        stream = whitespace.Filter(stream)
        stream = medor.Filter(stream)
        stream = figures.Filter(stream)

        print(rubric_type)

        # Exactly one hyphenation pass per story, chosen by rubric type.
        options = _HYPHENATION.get(rubric_type, _DEFAULT_HYPHENATION)
        stream = hyphenate.Filter(stream, **options)

        serializer = html5lib.serializer.HTMLSerializer(
            quote_attr_values=True, omit_optional_tags=False)
        output = serializer.render(stream)

        # The context manager closes the file even if the write fails.
        with codecs.open("%s/%s.html" % (where, slug), "w", "utf-8") as f:
            f.write(output)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    # nargs='?' makes the positional optional so that the default applies.
    parser.add_argument('where', nargs='?', default="stories")
    args = parser.parse_args()

    pull_stories(where=args.where)