portrait-of-a-community
clone your own copy | download snapshot

Snapshots | iceberg

Inside this repository

similarity_matrix.py
text/x-python

Download raw (923 bytes)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import sys

from sklearn.feature_extraction.text import TfidfVectorizer

# CSV export read by this script; it must provide a "description" column.
CSVPATH = 'datas/csv/beyond-the-first-decade_datas_2006-2015.csv'

# The csv module expects a binary file on Python 2, but a text file opened
# with newline='' on Python 3. UTF-8 matches the encoding TfidfVectorizer
# assumes by default when it is handed byte strings (the Python 2 path).
if sys.version_info[0] >= 3:
    _open_kwargs = {'mode': 'r', 'newline': '', 'encoding': 'utf-8'}
else:
    _open_kwargs = {'mode': 'rb'}

with open(CSVPATH, **_open_kwargs) as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
    # Rows whose description is empty (or missing) are dropped, so indices
    # into `descriptions` do not correspond to CSV row numbers.
    descriptions = [row['description'] for row in reader if row['description']]

# One TF-IDF weighted row per description.
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(descriptions)

# Pairwise similarity: matrix[i][j] is the dot product of the TF-IDF rows of
# descriptions i and j (a cosine similarity, given TfidfVectorizer's default
# L2 row normalisation). `.dot()` / `.toarray()` are used instead of
# `*` / `.A` because the latter are deprecated or removed on recent SciPy
# sparse types.
matrix = tfidf.dot(tfidf.T).toarray()

def _most_similar(row, own_index):
    """Return ``(index, score)`` of the best match in *row*, skipping *own_index*.

    *row* holds the similarity scores of one description against every
    description (itself included, at position *own_index*). The description's
    own entry is excluded explicitly rather than assumed to be the unique
    maximum, so an exact duplicate elsewhere in the data is still reported as
    the nearest neighbour instead of the description matching itself.

    On ties the lowest index wins. Returns ``(None, None)`` when there is no
    other description to compare with.
    """
    candidates = [(j, score) for j, score in enumerate(row) if j != own_index]
    if not candidates:
        return None, None
    # max() keeps the first of several equal maxima, i.e. the lowest index.
    return max(candidates, key=lambda pair: pair[1])


# For every description, print it followed by its closest neighbour.
# The print calls take a single string argument so that they behave the same
# under Python 2 (print statement) and Python 3 (print function).
for i, row in enumerate(matrix):
    most_similar_i, most_similar_value = _most_similar(row, i)
    if most_similar_i is None:
        # A lone description has nothing to be compared with.
        continue

    print("A DESCRIPTION")
    print("")
    print(descriptions[i])
    print("")
    print("THE MOST SIMILAR DESCRIPTION @ %s" % most_similar_value)
    print("")
    print(descriptions[most_similar_i])
    print("")