# Download raw (1.6 KB)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Fetch configured pages, extract HTML snippets via CSS selectors, and
store previously-unseen snippets (deduplicated by content hash) in MongoDB.

pages collection:
    url: String with page url
    selectors: Array -- css selectors to find desired snippets on the page

snippets collection (items written by this script):
    url: original url
    selector: query used to retrieve content
    timestamp: time when page was fetched
    hash: sha256 hex digest of the snippet html (dedup key together with url)
    html: resulting html
"""
from bs4 import BeautifulSoup
import urllib2
from main.settings import db
from datetime import datetime
import sys
import hashlib


def geturl(url):
    """Download *url* in 4 KB chunks and return the raw response body.

    The connection handle is always closed, even if a read raises
    (the original version leaked it).
    """
    handle = urllib2.urlopen(url)
    try:
        # Collect chunks and join once -- repeated ``buff += chunk`` is
        # quadratic in the response size.
        chunks = []
        while True:
            chunk = handle.read(4096)
            if not chunk:
                break
            chunks.append(chunk)
        return ''.join(chunks)
    finally:
        handle.close()


for page in db.pages.find():
    # Progress messages end with a newline so they don't run together
    # on one terminal line.
    sys.stdout.write('Retrieving: {0}\n'.format(page['url']))
    html = geturl(page['url'])
    sys.stdout.write('Parsing: {0}\n'.format(page['url']))
    doc = BeautifulSoup(html, "html5lib")
    sys.stdout.write('Parsed: {0}\n'.format(page['url']))
    for selector in page['selectors']:
        sys.stdout.write('Fetching snippets for: {0}\n'.format(selector))
        for snippet in doc.select(selector):
            # Use a distinct name: the original rebound ``html`` here,
            # clobbering the downloaded page source.
            snippet_html = snippet.prettify()
            htmlhash = hashlib.sha256(snippet_html.encode('utf-8')).hexdigest()
            # Store a snippet only once per (url, content-hash) pair.
            if db.snippets.find_one({'hash': htmlhash, 'url': page['url']}) is None:
                # NOTE(review): ``insert`` is deprecated in modern pymongo
                # (use ``insert_one``); kept as-is for the driver version
                # this project pins -- confirm before upgrading.
                db.snippets.insert({
                    'url': page['url'],
                    'selector': selector,
                    'timestamp': datetime.now(),
                    'hash': htmlhash,
                    'html': snippet_html
                })