1. 程式人生 > >資料探勘之曼哈頓距離、歐幾裡距離、明氏距離、皮爾遜相關係數、餘弦相似度Python實現程式碼

資料探勘之曼哈頓距離、歐幾裡距離、明氏距離、皮爾遜相關係數、餘弦相似度Python實現程式碼

# -*- coding:utf8 -*-
from math import sqrt


users = {"Angelica": {"Blues Traveler": 3.5, "Broken Bells": 2.0, "Norah Jones": 4.5, "Phoenix": 5.0, "Slightly Stoopid": 1.5, "The Strokes": 2.5, "Vampire Weekend": 2.0},
         "Bill":{"Blues Traveler": 2.0, "Broken Bells": 3.5, "Deadmau5": 4.0, "Phoenix": 2.0, "Slightly Stoopid": 3.5, "Vampire Weekend": 3.0},
         "Chan": {"Blues Traveler": 5.0, "Broken Bells": 1.0, "Deadmau5": 1.0, "Norah Jones": 3.0, "Phoenix": 5, "Slightly Stoopid": 1.0},
         "Dan": {"Blues Traveler": 3.0, "Broken Bells": 4.0, "Deadmau5": 4.5, "Phoenix": 3.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 2.0},
         "Hailey": {"Broken Bells": 4.0, "Deadmau5": 1.0, "Norah Jones": 4.0, "The Strokes": 4.0, "Vampire Weekend": 1.0},
         "Jordyn":  {"Broken Bells": 4.5, "Deadmau5": 4.0, "Norah Jones": 5.0, "Phoenix": 5.0, "Slightly Stoopid": 4.5, "The Strokes": 4.0, "Vampire Weekend": 4.0},
         "Sam": {"Blues Traveler": 5.0, "Broken Bells": 2.0, "Norah Jones": 3.0, "Phoenix": 5.0, "Slightly Stoopid": 4.0, "The Strokes": 5.0},
         "Veronica": {"Blues Traveler": 3.0, "Norah Jones": 5.0, "Phoenix": 4.0, "Slightly Stoopid": 2.5, "The Strokes": 3.0}
        }






def manhattan(rating1, rating2):
    """Computes the Manhattan distance. Both rating1 and rating2 are dictionaries
       of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
    distance = 0
    commonRatings = False 
    for key in rating1:
        if key in rating2:
            distance += abs(rating1[key] - rating2[key])
            commonRatings = True
    if commonRatings:
        return distance
    else:
        return -1 #Indicates no ratings in common
#歐幾裡距離
def euclidean(rating1,rating2):
    """Computes the Euclidean distance. Both rating1 and rating2 are dictionaries
        of the form {'The Strokes': 3.0, 'Slightly Stoopid': 2.5}"""
    distance=0
    commonRatings = False
    for key in rating1:
        if key in rating2:
            #distance += sqrt((rating1[key]-rating2[key])**2)
            distance += (rating1[key] - rating2[key])**2
            commonRatings=True
    if commonRatings:
        return distance
    else:
         return -1




#明氏距離
def minkowski(rating1,rating2,r):
    distance=0
    commonRatings=False
    for key in rating1:
        if key in rating2:
            distance += pow(abs(rating1[key]-rating2[key]),r)
            commonRatings=True
            return pow(distance,1/r)
        else:
            return -1
def computeNearestNeighbor(username, users):
    """creates a sorted list of users based on their distance to username"""
    distances = []
    for user in users:
        if user != username:
            distance = minkowski(users[user], users[username],3)
            distances.append((distance, user))
    # sort based on distance -- closest first
    distances.sort()
    return distances


def recommend(username, users):
    """Give list of recommendations"""
    # first find nearest neighbor
    nearest = computeNearestNeighbor(username, users)[0][1]
    print nearest


    recommendations = []
    # now find bands neighbor rated that user didn't
    neighborRatings = users[nearest]
    userRatings = users[username]
    for artist in neighborRatings:
        if not artist in userRatings:
            recommendations.append((artist, neighborRatings[artist]))
    # using the fn sorted for variety - sort is more efficient
    return sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)
 
# examples - urncomment to run


#print( recommend('Hailey', users))
def pearson(rating1,rating2):
    sum_xy=0
    sum_x=0
    sum_y=0
    sum_x2=0
    sum_y2=0
    n=0
    for key in rating1:
        if key in rating2:
            n += 1
            x = rating1[key]
            y = rating2[key]
            sum_xy += x*y
            sum_x += x
            sum_y += y
            sum_x2 += x**2
            sum_y2 += y**2
    denominnator = sqrt(sum_x2-(sum_x**2)/n)*sqrt(sum_y2-(sum_y**2)/n)
    if denominnator == 0:
        return 0
    else:
        return (sum_xy-(sum_x*sum_y)/n)/denominnator
def cos_like(rating1,rating2):
    innerProd=0
    vector_x=0
    vectoy_y=0
    for key  in rating1:
        if key in rating2:
            x=rating1[key]
            y=rating2[key]
            innerProd += x*y
            vector_x += x**2
            vectoy_y += y**2
    if sqrt(vector_x)*sqrt(vectoy_y)==0:
        return 0
    else:
        return innerProd/(sqrt(vector_x)*sqrt(vectoy_y))
print cos_like(users['Angelica'],users['Bill'])
print pearson(users['Angelica'],users['Bill'])
for list in ( recommend('Veronica', users)):
    print list