Download raw (933 bytes)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup, NavigableString, Comment


def extract_image_paths(snippet):
    """Return the ``src`` value of every ``<img>`` element in *snippet*.

    Args:
        snippet: HTML markup, parsed here with the ``html5lib`` tree builder.

    Returns:
        A list of ``src`` attribute values in document order. ``<img>``
        elements without a ``src`` attribute are skipped instead of raising
        ``KeyError``.
    """
    soup = BeautifulSoup(snippet, "html5lib")
    return [
        img.attrs['src']
        for img in soup.find_all('img')
        if 'src' in img.attrs
    ]


def extract_paragraphs(snippet):
    """Return every ``<p>`` element found in *snippet*.

    Args:
        snippet: HTML markup, parsed here with the ``html5lib`` tree builder.

    Returns:
        A plain list of the matching BeautifulSoup tags, in document order.
    """
    return list(BeautifulSoup(snippet, "html5lib").find_all('p'))


def extract_text(snippet):
    """Return the text of *snippet* concatenated into a single string.

    The markup is parsed with ``html5lib`` and handed to ``get_content``,
    so HTML comments are not part of the result.

    Args:
        snippet: HTML markup.

    Returns:
        The concatenated text of all plain text nodes.
    """
    return get_content(BeautifulSoup(snippet, "html5lib"))


def get_content(snippet):
    """Recursively concatenate the plain text nodes below a parsed node.

    Args:
        snippet: A parsed BeautifulSoup node (the soup itself or a tag).

    Returns:
        The text of every descendant whose type is exactly
        ``NavigableString``, joined in document order. Comments, and any
        other node that has no ``contents``, contribute an empty string.
    """
    if isinstance(snippet, Comment):
        return ''

    pieces = []
    # Nodes without children (string-like nodes other than plain text) have
    # no ``contents`` attribute; treat them as empty instead of raising
    # AttributeError in the middle of the walk.
    for child in getattr(snippet, 'contents', ()):
        # Exact type match on purpose: Comment and the other NavigableString
        # subclasses are handled by the recursive call, which skips them.
        if type(child) is NavigableString:
            pieces.append(child.string)
        else:
            pieces.append(get_content(child))
    return ''.join(pieces)