le75
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

utils.py
text/x-python

Download raw (933 bytes)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup, NavigableString, Comment

"""
    Returns paths for all images in the snippet
"""
def extract_image_paths(snippet):
    """
        Returns the `src` value of every <img> tag in the snippet.

        snippet -- HTML markup; it is parsed here with the "html5lib"
                   parser.

        Returns a list of `src` attribute values, in the order
        find_all() returns the tags. Images that carry no `src`
        attribute are skipped instead of raising KeyError.
    """
    soup = BeautifulSoup(snippet, "html5lib")
    # Filter on the attribute itself: a truthiness test on the tag does
    # not tell us whether `src` is present.
    return [img['src'] for img in soup.find_all('img') if img.has_attr('src')]

"""
    Returns all the paragraphs
"""
def extract_paragraphs(snippet):
    """
        Returns every <p> element found in the snippet.

        snippet -- HTML markup; it is parsed here with the "html5lib"
                   parser.

        Returns a plain list holding the elements that find_all('p')
        yields (the parsed tag objects, not their text).
    """
    soup = BeautifulSoup(snippet, "html5lib")
    paragraphs = soup.find_all('p')
    return list(paragraphs)

"""
    Returns all (?) text in the snippet concatenated
    in one string
"""
def extract_text(snippet):
    """
        Returns the text of the snippet concatenated into one string.

        snippet -- HTML markup; it is parsed here with the "html5lib"
                   parser.

        The parsed tree is handed to get_content(), which decides what
        counts as text.
    """
    soup = BeautifulSoup(snippet, "html5lib")
    return get_content(soup)
  
def get_content(snippet):
    """
        Returns the plain text below a parsed node as one string.

        snippet -- a parsed BeautifulSoup node: the soup itself, a tag,
                   or a string node.

        Plain NavigableString nodes contribute their text. Comment and
        every other NavigableString subclass (bs4's special string
        types) contribute nothing. Any other node is walked through its
        `contents`, and the pieces are joined in that order.
    """
    if isinstance(snippet, NavigableString):
        # String nodes are leaves with no `contents` to walk. Only the
        # exact NavigableString type counts as text; subclasses such as
        # Comment are markup-level strings and are left out.
        if type(snippet) is NavigableString:
            return snippet.string
        return ''

    # One join instead of repeated string concatenation.
    return ''.join(get_content(child) for child in snippet.contents)