le75
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

scraper.py
text/x-python

Download raw (1.6 KB)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import urllib2
from main.settings import db
from datetime import datetime
import sys
import hashlib

"""

    pages
        url: String with page url
        selectors: Array -- css selectors to find desired snippets on the page

    items:
        url: original url
        selector: query used to retrieve content
        timestamp: time when page was fetched
        html: resulting html

"""

def geturl(url):
    """Download *url* and return the complete response body as a string.

    The response is read in 4 KB chunks so arbitrarily large pages do not
    require one huge read() call.

    :param url: absolute URL to fetch
    :returns: the raw response body
    :raises urllib2.URLError: on connection or HTTP-level failures
    """
    handle = urllib2.urlopen(url)
    try:
        chunks = []
        while True:
            chunk = handle.read(4096)
            if not chunk:
                break
            chunks.append(chunk)
        # join once at the end -- repeated += on a string is quadratic
        return ''.join(chunks)
    finally:
        # always release the underlying socket, even if read() raises
        handle.close()

# Fetch every registered page, extract the snippets matched by its CSS
# selectors, and persist each snippet the first time that exact content
# (SHA-256 of the prettified markup) is seen for that URL.
for page in db.pages.find():
    sys.stdout.write('Retrieving: {0}\n'.format(page['url']))
    page_html = geturl(page['url'])

    sys.stdout.write('Parsing: {0}\n'.format(page['url']))

    # html5lib gives lenient, browser-like parsing of messy real-world HTML
    doc = BeautifulSoup(page_html, "html5lib")
    sys.stdout.write('Parsed: {0}\n'.format(page['url']))

    for selector in page['selectors']:
        sys.stdout.write('Fetching snippets for: {0}\n'.format(selector))

        for snippet in doc.select(selector):
            # prettify() normalizes the markup so hashing is stable
            # across insignificant whitespace differences
            snippet_html = snippet.prettify()
            snippet_hash = hashlib.sha256(snippet_html.encode('utf-8')).hexdigest()

            # deduplicate per (content hash, source URL): only store a
            # snippet we have not recorded for this page before
            if db.snippets.find_one({'hash': snippet_hash, 'url': page['url']}) is None:
                db.snippets.insert({
                    'url': page['url'],
                    'selector': selector,
                    'timestamp': datetime.now(),
                    'hash': snippet_hash,
                    'html': snippet_html
                })