import codecs
import html5lib
# Scrape the tweets contained in a saved HTML page ("anna_kavan.html") and
# print them as a list of dicts with the keys "fullname", "username",
# "timestamp", "tweet", "id" and "img".

# Read the page as UTF-8 text; the context manager guarantees the file handle
# is closed even if reading fails.
with codecs.open("anna_kavan.html", "r", "utf-8") as f:
    html = f.read()

# Build an lxml tree so that XPath queries can be used below.
# namespaceHTMLElements=False keeps element names un-namespaced, which is what
# lets plain expressions such as '//li[...]' match.
parser = html5lib.HTMLParser(
    tree=html5lib.treebuilders.getTreeBuilder("lxml"),
    namespaceHTMLElements=False,
)
# NOTE(review): encoding/parseMeta/useChardet look like keyword arguments of
# older html5lib releases -- confirm they are accepted by the installed version.
tree = parser.parse(html, encoding=None, parseMeta=True, useChardet=True)

tweets = []
for elt in tree.xpath('//li[@data-item-type="tweet"]'):
    # Each lookup takes the first match; an IndexError here means the expected
    # element is missing from this tweet's markup.
    fullname = elt.xpath(
        './/strong[@class="fullname js-action-profile-name show-popup-with-id"]'
    )[0].text

    # The username is the last text fragment inside the username span.
    username_elt = elt.xpath('.//span[@class="username js-action-profile-name"]')[0]
    username = list(username_elt.itertext())[-1]

    # The tweet body is the concatenation of every text fragment found in the
    # paragraph, including text nested in child elements.
    tweet_elt = elt.xpath('.//p[@class="js-tweet-text tweet-text"]')[0]
    tweet = "".join(tweet_elt.itertext())

    # presumably milliseconds since the epoch, judging by the attribute name
    # -- TODO confirm
    date_elt = elt.xpath('.//span[@data-time-ms]')[0]
    timestamp = int(date_elt.attrib['data-time-ms'])

    # Named tweet_id rather than id so the builtin id() is not shadowed.
    id_elt = elt.xpath('.//div[@data-tweet-id]')[0]
    tweet_id = int(id_elt.attrib['data-tweet-id'])

    # A tweet may carry zero or more images; collect each large-image URL.
    img_elts = elt.xpath('.//div[@data-resolved-url-large]')
    data = {
        "fullname": fullname,
        "username": username,
        "timestamp": timestamp,
        "tweet": tweet,
        "id": tweet_id,
        "img": [img.attrib['data-resolved-url-large'] for img in img_elts],
    }
    tweets.append(data)
    # Debug output (disabled):
    #print(fullname)
    #print(username)
    #print(timestamp)
    #print(tweet.encode('utf-8'))

print(tweets)
# Fields of interest for each tweet:
#   name
#   id
#   date
#   text
#   image