Download raw (3.3 KB)
from django.core.management.base import BaseCommand
from django.utils.html import strip_tags
from django.utils.text import Truncator
from django.core.files import File
from django.core.files.temp import NamedTemporaryFile
import os
import os.path
from urllib.parse import urlparse
import requests
import html5lib
from cssselect import GenericTranslator, SelectorError
from scraper.models import Feed, FeedEntry
def clean(txt):
    """Normalize a scraped string for storage.

    Leading and trailing whitespace is removed first, then every remaining
    newline character is replaced by a single space. No other characters
    (tabs, carriage returns, markup) are altered.
    """
    return txt.strip().replace("\n", " ")
def get_content(parent, selector, attr=None):
    """Return cleaned content of the first node under ``parent`` matching ``selector``.

    ``selector`` is a CSS selector that is translated to XPath and evaluated
    against ``parent``. Only the first match is used.

    * When ``attr`` is truthy, the value of that attribute on the matched
      node is returned (passed through ``clean``), or ``None`` if the node
      does not carry the attribute.
    * Otherwise the node's string value (``string()`` in XPath) is returned,
      passed through ``clean``.

    ``None`` is returned when ``selector`` is empty, cannot be translated
    (``SelectorError``), or matches nothing.
    """
    if not selector:
        return None

    try:
        xpath_expr = GenericTranslator().css_to_xpath(selector)
    except SelectorError:
        return None

    matches = parent.xpath(xpath_expr)
    if not matches:
        return None

    node = matches[0]
    if not attr:
        # No attribute requested: fall back to the node's text content.
        return clean(node.xpath("string()"))
    if attr not in node.attrib:
        return None
    return clean(node.attrib[attr])
class Command(BaseCommand):
    """Scrape every ``Feed`` and store newly discovered posts as ``FeedEntry`` rows.

    For each feed the page at ``feed.url`` is downloaded and parsed, the
    feed's CSS selectors are used to locate posts and to extract each post's
    permalink, title and main image URL, and a ``FeedEntry`` is created for
    every permalink not already stored. The main image is downloaded only
    for newly created entries.

    Network failures and invalid selectors are reported on stderr and the
    affected feed (or image) is skipped, so one bad feed does not abort the
    whole run.
    """

    help = 'Updates the satellite feeds'

    # Seconds to wait on a single HTTP request. Without a timeout,
    # ``requests`` waits indefinitely on a stalled server.
    request_timeout = 30

    def handle(self, *args, **options):
        """Run the scrape over all feeds; progress is written to stdout."""
        self.stdout.write('Scraping satellites')
        for feed in Feed.objects.all():
            self.stdout.write('Scraping %s' % feed.url)

            # Validate the selector before touching the network so that a
            # misconfigured feed costs nothing and is reported, not hidden.
            try:
                post_exp = GenericTranslator().css_to_xpath(feed.post_selector)
            except SelectorError as exc:
                self.stderr.write(
                    'Invalid post selector for %s: %s... Skipping.' % (feed.url, exc)
                )
                continue

            try:
                page = requests.get(feed.url, timeout=self.request_timeout)
                # Do not parse an HTTP error page as if it were the feed.
                page.raise_for_status()
            except requests.RequestException as exc:
                self.stderr.write(
                    'Could not fetch %s: %s... Skipping.' % (feed.url, exc)
                )
                continue

            tree = html5lib.parse(
                page.text, treebuilder="lxml", namespaceHTMLElements=False
            )

            # Posts are processed in reverse document order.
            for entry in reversed(tree.xpath(post_exp)):
                permalink = get_content(
                    entry, feed.permalink_selector, attr=feed.permalink_attribute
                )
                title = get_content(
                    entry, feed.title_selector, attr=feed.title_attribute
                )
                main_image_url = get_content(
                    entry, feed.main_image_selector, attr=feed.main_image_attribute
                )

                if not title:
                    self.stdout.write(
                        'Could not find a title for this entry {}... Skipping.'.format(permalink)
                    )
                    continue
                self.stdout.write('Indexing {} {}'.format(title, permalink))

                # NOTE(review): permalink may be None here when the permalink
                # selector matches nothing; confirm that the FeedEntry model
                # tolerates that before relying on it as the lookup key.
                obj, created = FeedEntry.objects.get_or_create(
                    permalink=permalink,
                    defaults={
                        "title": title,
                        "feed": feed,
                        "is_published": True,
                    },
                )
                if created and main_image_url:
                    self._save_main_image(obj, main_image_url)

            self.stdout.write('Successfully scraped %s' % feed.url)

    def _save_main_image(self, obj, image_url):
        """Download ``image_url`` and store it in ``obj.main_image_original``.

        The stored file name is the last path component of the URL. If the
        download fails (connection error, timeout, invalid URL or HTTP error
        status) the problem is reported on stderr and ``obj`` is left
        without an image.
        """
        try:
            response = requests.get(image_url, timeout=self.request_timeout)
            # Avoid saving an HTML error page as though it were image data.
            response.raise_for_status()
        except requests.RequestException as exc:
            self.stderr.write(
                'Could not download image %s: %s' % (image_url, exc)
            )
            return

        filename = os.path.basename(urlparse(image_url).path)
        # The context manager guarantees the temporary file is closed (and
        # therefore removed) even if saving raises.
        with NamedTemporaryFile(delete=True) as img_temp:
            img_temp.write(response.content)
            img_temp.flush()
            obj.main_image_original.save(filename, File(img_temp))
            obj.save()