# Return 'similar score' based on two dictionaries' similarity in Python?

import math

# Sample data: each user name maps to a {movie title: rating} dict.
ratings = {
    'Shane': {
        '127 Hours': 3.0,
        'Avatar': 4.0,
        'Nonstop': 5.0,
    },
    'Joe': {
        '127 Hours': 5.0,
        'Taken 3': 4.0,
        'Avatar': 5.0,
        'Nonstop': 3.0,
    },
}

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two numeric sequences.

    The sequences are paired element by element. When their lengths differ,
    only the leading elements up to the shorter length take part in the
    computation (for both the dot product and the magnitudes).

    Args:
        vec1: First sequence of numbers.
        vec2: Second sequence of numbers.

    Returns:
        dot(vec1, vec2) / sqrt(dot(vec1, vec1) * dot(vec2, vec2)) over the
        paired elements, or 0.0 when that denominator is zero (an all-zero
        vector or empty input), where the similarity is undefined.
    """
    sum11 = sum12 = sum22 = 0
    # zip() stops at the shorter input, so a longer vec1 no longer raises
    # IndexError while a longer vec2 is handled exactly as before.
    for x, y in zip(vec1, vec2):
        sum11 += x * x
        sum22 += y * y
        sum12 += x * y

    squared_norm_product = sum11 * sum22
    if not squared_norm_product:
        # Avoid ZeroDivisionError for zero-magnitude or empty vectors.
        return 0.0
    return sum12 / math.sqrt(squared_norm_product)

# Flatten each user's ratings into a plain list of values.  The values come
# out in each dict's own iteration order, so the two lists are paired by
# position rather than by movie title.
list1, list2 = (list(ratings[user].values()) for user in ('Shane', 'Joe'))

sim = cosine_similarity(list1, list2)
print(sim)


output

o/p : 0.9205746178983233


Update: when I use:

ratings={'Shane': {'127 Hours': 5.0, 'Avatar': 4.0, 'Nonstop': 5.0},
'Joe': {'127 Hours': 5.0, 'Taken 3': 4.0, 'Avatar': 5.0, 'Nonstop': 3.0}}


output :0.9574271077563381

Update2: Normalized length and considered keys

from math import*

# Sample data: each user name maps to a {movie title: rating} dict.
ratings = {
    'Shane': {'127 Hours': 5.0, 'Avatar': 4.0, 'Nonstop': 5.0},
    'Joe': {
        '127 Hours': 5.0,
        'Taken 3': 4.0,
        'Avatar': 5.0,
        'Nonstop': 3.0,
    },
    'Bob': {'Panic Room': 5.0, 'Nonstop': 5.0},
}

def square_rooted(x):
    """Return the Euclidean length of ``x``, rounded to three decimals.

    Args:
        x: Iterable of numbers.

    Returns:
        sqrt(sum of squares of the elements), rounded with ``round(..., 3)``.
    """
    sum_of_squares = sum(value * value for value in x)
    magnitude = sqrt(sum_of_squares)
    return round(magnitude, 3)

def cosine_similarity(x, y):
    """Return the cosine similarity of two rating dicts, rounded to 3 places.

    Each dict is treated as a vector indexed by its keys; a key missing from
    one dict counts as a rating of 0 for that dict.  The result does not
    depend on the argument order or on dict iteration order of the keys.

    Args:
        x: Mapping of key -> numeric rating.
        y: Mapping of key -> numeric rating.

    Returns:
        dot(x, y) / (|x| * |y|) rounded to three decimals, or 0.0 when either
        dict is empty or has zero magnitude (similarity is undefined there).
    """
    # A key absent from either dict contributes 0 to the dot product, so only
    # the shared keys matter here.  Each factor comes from its OWN dict.
    numerator = sum(float(x[key]) * float(y[key]) for key in x if key in y)

    # Every rating a dict holds counts toward that dict's magnitude, including
    # ratings for keys the other dict lacks.  Norms are kept at full precision;
    # only the final result is rounded.
    norm_x = sqrt(sum(float(value) ** 2 for value in x.values()))
    norm_y = sqrt(sum(float(value) ** 2 for value in y.values()))

    denominator = norm_x * norm_y
    if not denominator:
        # Avoid ZeroDivisionError for empty or all-zero inputs.
        return 0.0
    return round(numerator / denominator, 3)

# Report the similarity for every pair of users, one heading line followed by
# the score.
for heading, first_user, second_user in (
    ("Similarity between Shane and Joe", 'Shane', 'Joe'),
    ("Similarity between Joe and Bob", 'Joe', 'Bob'),
    ("Similarity between Shane and Bob", 'Shane', 'Bob'),
):
    print(heading)
    print(cosine_similarity(ratings[first_user], ratings[second_user]))


output:

Similarity between Shane and Joe
0.887
Similarity between Joe and Bob
0.346
Similarity between Shane and Bob
0.615


A nice explanation of the differences between Jaccard and cosine similarity: https://datascience.stackexchange.com/questions/5121/applications-and-differences-for-jaccard-similarity-and-cosine-similarity

I am using Python 3.4.

NOTE: I have assigned 0 to missing values, but you could substitute more suitable values instead. See: http://www.analyticsvidhya.com/blog/2015/02/7-steps-data-exploration-preparation-building-model-part-2/

https://en.m.wikipedia.org/wiki/Jaccard_index

and now some cleaned-up sample code.

def jac(s1, s2):
    """Return the Jaccard index of two sets: |s1 & s2| / |s1 | s2|.

    Args:
        s1: A set.
        s2: A set (or any iterable accepted by ``set.union``).

    Returns:
        The ratio as a float, or 0 when the union is empty.
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        # Both inputs empty: avoid dividing by zero.
        return 0

    shared_size = len(s1.intersection(s2))
    return float(shared_size) / union_size

from itertools import permutations

# Sample data: each user name maps to a {movie title: rating} dict.
ratings = {
    'Shane': {
        '127 Hours': 5.0,
        'Avatar': 4.0,
        'Nonstop': 5.0,
    },
    'Joe': {
        '127 Hours': 5.0,
        'Taken 3': 4.0,
        'Avatar': 5.0,
        'Nonstop': 3.0,
    },
    'Bob': {
        'Panic Room': 5.0,
        'Nonstop': 5.0,
    },
}

def common_movie(dict0, dict1):
    """Return the Jaccard index of the two dicts' (key, value) items.

    An entry counts as shared only when both the key and its value match in
    both dicts.
    """
    return jac(set(dict0.items()), set(dict1.items()))

def movies_and_ratings(dict0, dict1):
    """Blend key overlap with exact (key, value) overlap into one score.

    Returns:
        0.3 * Jaccard(keys) + 0.7 * Jaccard(items), so a shared key earns
        partial credit even when the values stored under it differ.
    """
    key_score = jac(set(dict0.keys()), set(dict1.keys()))
    item_score = jac(set(dict0.items()), set(dict1.items()))

    # Weighted mix: key-only matches still count, full matches count more.
    return 0.3 * key_score + 0.7 * item_score

def common_movie_ratings(dict0, dict1):
    """Return the Jaccard index of the values stored under shared keys.

    For the keys present in both dicts, each dict's values are pooled into a
    set, and the two sets are compared.  The comparison is between the pooled
    value sets, not key by key.
    """
    shared_keys = set(dict0.keys()).intersection(set(dict1.keys()))

    values0 = {dict0[key] for key in shared_keys}
    values1 = {dict1[key] for key in shared_keys}

    return jac(values0, values1)

# Print every ordered pair of users along with the three similarity scores.
# Uses print() calls and sorted() so the script runs on Python 3 (the original
# print statements and ``dict.items().sort()`` only work on Python 2).
for pair in permutations(ratings.keys(), 2):

    dict0, dict1 = ratings[pair[0]], ratings[pair[1]]
    print("\n %s vs %s" % pair)

    # Make no assumption about the order items come out of a dictionary:
    # sort them so the display is stable.
    li = sorted(dict0.items())
    print("  %s" % (li,))
    li = sorted(dict1.items())
    print("  %s" % (li,))

    print("     common_movie    :%s" % common_movie(dict0, dict1))
    print("     movies_and_ratings:%s" % movies_and_ratings(dict0, dict1))
    print("     common_movie_ratings  :%s" % common_movie_ratings(dict0, dict1))


The output:

 Shane vs Bob
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
[('Nonstop', 5.0), ('Panic Room', 5.0)]
common_movie    :0.25
movies_and_ratings:0.25
common_movie_ratings  :1.0

Shane vs Joe
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
common_movie    :0.166666666667
movies_and_ratings:0.341666666667
common_movie_ratings  :0.333333333333

Bob vs Shane
[('Nonstop', 5.0), ('Panic Room', 5.0)]
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
common_movie    :0.25
movies_and_ratings:0.25
common_movie_ratings  :1.0

Bob vs Joe
[('Nonstop', 5.0), ('Panic Room', 5.0)]
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
common_movie    :0.0
movies_and_ratings:0.06
common_movie_ratings  :0.0

Joe vs Shane
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
[('127 Hours', 5.0), ('Avatar', 4.0), ('Nonstop', 5.0)]
common_movie    :0.166666666667
movies_and_ratings:0.341666666667
common_movie_ratings  :0.333333333333

Joe vs Bob
[('127 Hours', 5.0), ('Avatar', 5.0), ('Nonstop', 3.0), ('Taken 3', 4.0)]
[('Nonstop', 5.0), ('Panic Room', 5.0)]
common_movie    :0.0
movies_and_ratings:0.06
common_movie_ratings  :0.0