1. 程式人生 > >使用者房源推薦—基於內容的推薦演算法(CB)

使用者房源推薦—基於內容的推薦演算法(CB)

CB推薦演算法根據使用者過去喜歡的產品,為使用者推薦和他過去喜歡的產品相似的產品。採用基於特徵的空間向量模型,並用最近鄰方法進行推薦。
演算法步驟:

  • 抽取房源的基本特徵,考慮到租房的實際情況,確定的基本特徵有價格 (house_price) 面積 (house_area),房屋型別 (house_type),地區 (district)

  • 利用一個使用者過去喜歡(及不喜歡)的房源特徵資料,來學習出此使用者的喜好特徵。先將各個特徵進行分類:價格分為10類,面積分為10類,房屋型別分為6類,地區分為9類。再利用歷史資料統計出每個使用者在每個特徵中各類別出現的次數,並除以該使用者的有效收藏房源數取平均,得到該使用者在該特徵上的喜好向量。

  • 對每個待推薦的房源,按同樣的分類方式得到四個特徵向量(所屬類別為1,其餘類別為0):
    價格 t1: (house_price),
    面積 t2: (house_area),
    型別 t3: (house_type),
    地區 t4: (district)。
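設 $T_i$ 為使用者在第 $i$ 個特徵上的喜好向量,$t_i$ 為房源在第 $i$ 個特徵上的特徵向量,分別計算兩者的餘弦相似度:
$$\cos\theta_1=\frac{T_1\cdot t_1}{\|T_1\|\,\|t_1\|},\quad \cos\theta_2=\frac{T_2\cdot t_2}{\|T_2\|\,\|t_2\|},\quad \cos\theta_3=\frac{T_3\cdot t_3}{\|T_3\|\,\|t_3\|},\quad \cos\theta_4=\frac{T_4\cdot t_4}{\|T_4\|\,\|t_4\|}$$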
使用者在實際租房時,更多的是考慮房屋的價格和地區,因此主觀確定4個特徵的權重為
ω=[ω1,ω2,ω3,ω4]=[0.35,0.15,0.15,0.35]
(權值可根據實際推薦進行調整)

則每個房源與使用者喜好的加權相似度為
$$\mathrm{similar\_item}=\omega_1\cos\theta_1+\omega_2\cos\theta_2+\omega_3\cos\theta_3+\omega_4\cos\theta_4$$
對所有的similar_item進行由大到小排序,取前10作為推薦房源。
#coding:utf-8
import pyodbc
import time
import numpy as np

class recommend_house:
    '''房源推薦演算法類'''
    def house_data(self):
        '''
        函式功能:獲取資料庫中收藏使用者的房源資料
        引數:無
        返回值:所有房源資料:rent_house_info,使用者收藏房源資料:user_house_info,使用者id列表:collect_user_id
        '''
cnxn = pyodbc.connect('DSN=zjx;UID=root') cursor = cnxn.cursor() sql = "select DISTINCT uid,h_id from test.shoucang where h_type = 1 order by uid" cursor.execute(sql) user_info = cursor.fetchall() sql = "select id, house_price, house_area, house_type, district,status from test.house_rent_info_geren" cursor.execute(sql) rent_house_info = cursor.fetchall() user_house_info = [] each_user_info = [] now_id = user_info[0][0] late_id = user_info[0][0] each_user_info.append([user_info[0][0],user_info[0][1]]) count = 0 collect_user_id = [] for item in user_info[1:]: now_id = item[0] if now_id == late_id: each_user_info[count].append(item[1]) else: count = count + 1 each_user_info.append([item[0],item[1]]) late_id = item[0] count = 0 for item in each_user_info: user_id = item[0] collect_user_id.append(item[0]) flag = True for house_id in item[1:]: for each_house in rent_house_info: if each_house[0] == house_id: if flag : user_house_info.append([each_house[1:]]) flag = False else: user_house_info[count].append(each_house[1:]) break count = count + 1 return rent_house_info, user_house_info, collect_user_id def cosine_similarity(self, vector_A, vector_B,len_vector_A): ''' 函式功能:計算兩向量的餘弦相似度 引數:向量vector_A,vector_B 返回值:兩向量的餘弦相似度 ''' for i in range(len(vector_B)): if vector_B[i] == 1: index = i break vector_inner = vector_A[index] vector_cos = vector_inner/(len_vector_A) return vector_cos def price_classify(self, count1, price): ''' 函式功能:對房源價格進行分類 引數:計數count1和房源價格 返回值:計數count1 ''' if price <= 1000: count1[0] = count1[0] + 1 elif 1000 < price <= 1500: count1[1] = count1[1] + 1 elif 1500 < price <= 2000: count1[2] = count1[2] + 1 elif 2000 < price <= 2500: count1[3] = count1[3] + 1 elif 2500 < price <= 3000: count1[4] = count1[4] + 1 elif 3000 < price <= 3500: count1[5] = count1[5] + 1 elif 3500 < price <= 4000: count1[6] = count1[6] + 1 elif 4000 < price <= 4500: count1[7] = count1[7] + 1 elif 4500 < price <= 5000: count1[8] = count1[8] + 1 else: count1[9] = count1[9] + 1 return 
count1 def area_classify(self, count2, area): ''' 函式功能:對房源面積進行分類 引數:計數count2和房源面積 返回值:計數count2 ''' if area <= 20: count2[0] = count2[0] + 1 elif 20 < area <= 30: count2[1] = count2[1] + 1 elif 30 < area <= 40: count2[2] = count2[2] + 1 elif 40 <area <= 50: count2[3] = count2[3] + 1 elif 50 < area <= 60: count2[4] = count2[4] + 1 elif 60 < area <= 70: count2[5] = count2[5] + 1 elif 70 < area <= 80: count2[6] = count2[6] + 1 elif 80 < area <= 90: count2[7] = count2[7] + 1 elif 90 < area <= 100: count2[8] = count2[8] + 1 else: count2[9] = count2[9] + 1 return count2 def type_classify(self, count3, room_type): ''' 函式功能:對房源型別進行分類 引數:計數count3和房源型別 返回值:計數count3 ''' if room_type.find('1室') > -1: count3[0] = count3[0] + 1 elif room_type.find('2室') > -1: count3[1] = count3[1] + 1 elif room_type.find('3室') > -1: count3[2] = count3[2] + 1 elif room_type.find('4室') > -1: count3[3] = count3[3] + 1 elif room_type.find('5室') > -1: count3[4] = count3[4] + 1 else: count3[5] = count3[5] + 1 return count3 def district_classify(self, count4, room_district): ''' 函式功能:對房源地區進行分類 引數:計數count4和房源地區 返回值:計數count4 ''' if room_district.find('濱江') > -1: count4[0] = count4[0] + 1 elif room_district.find('西湖') > -1: count4[1] = count4[1] + 1 elif room_district.find('上城') > -1: count4[2] = count4[2] + 1 elif room_district.find('下城') > -1: count4[3] = count4[3] + 1 elif room_district.find('江干') > -1: count4[4] = count4[4] + 1 elif room_district.find('拱墅') > -1: count4[5] = count4[5] + 1 elif room_district.find('蕭山') > -1: count4[6] = count4[6] + 1 elif room_district.find('餘杭') > -1: count4[7] = count4[7] + 1 else: count4[8] = count4[8] + 1 return count4 def count_price(self, collect_house_info): ''' 函式功能:獲取某使用者的房源價格偏好向量 引數:某使用者收藏的房源資料 返回值:某使用者的房源價格偏好向量 ''' count1 = [0]*10 vector_price = [] for item in collect_house_info: count1 = self.price_classify(count1, item[0]) len_collect = float(len(collect_house_info)) for item in count1: vector_price.append(item/len_collect) return vector_price def 
count_area(self, collect_house_info): ''' 函式功能:獲取某使用者的房源面積偏好向量 引數:某使用者收藏的房源資料 返回值:某使用者的房源面積偏好向量 ''' count2 = [0] *10 vector_area = [] area_null = 0 for item in collect_house_info: try: item = list(item) item[1] = int(item[1]) count2 = self.area_classify(count2, item[1]) except: area_null = area_null + 1 len_collect = float(len(collect_house_info)) for item in count2: vector_area.append(item/(len_collect - area_null)) return vector_area def count_type(self, collect_house_info): ''' 函式功能:獲取某使用者的房源型別偏好向量 引數:某使用者收藏的房源資料 返回值:某使用者的房源型別偏好向量 ''' count3 = [0]*6 vector_type = [] type_null = 0 for item in collect_house_info: if item[2] == ''or item[2] == 'null': type_null = type_null + 1 else: count3 = self.type_classify(count3, item[2]) len_collect = float(len(collect_house_info)) for item in count3: vector_type.append(item/(len_collect - type_null)) return vector_type def count_district(self, collect_house_info): ''' 函式功能:獲取某使用者的房源地區偏好向量 引數:某使用者收藏的房源資料 返回值:某使用者的房源地區偏好向量 ''' count4 = [0] * 9 vector_district = [] district_null = 0 for item in collect_house_info: if item[3] == 'null': district_null = district_null + 1 else: count4 = self.district_classify(count4, item[3]) len_collect = float(len(collect_house_info)) for item in count4: vector_district.append(item/(len_collect - district_null)) return vector_district def CB_recommend(self, similar_weight): ''' 函式功能:計算每個房源與使用者偏好向量之間的加權相似度 引數:存放加權相似度的列表similar_weight 返回值:加權相似度的列表similar_weight ''' '''分別計算每個使用者各特徵的偏好向量''' vector_price = self.count_price(collect_house_info) vector_area = self.count_area(collect_house_info) vector_type = self.count_type(collect_house_info) vector_district = self.count_district(collect_house_info) price_cos_sim = [] area_cos_sim = [] type_cos_sim = [] district_cos_sim = [] for item in rent_house_info: if item[5] == 1: similar_weight.append(-9999) else: count1 = [0] * 10 count2 = [0] * 10 count3 = [0] * 6 count4 = [0] * 9 if item[1] is None: sim_price = -9999 price_cos_sim.append(sim_price) else: 
count1 = self.price_classify(count1, item[1]) len_vector_A = np.sqrt(np.inner(vector_price, vector_price)) sim_price = self.cosine_similarity(vector_price, count1,len_vector_A) price_cos_sim.append(sim_price) try: item[2] = int(item[2]) count2 = self.area_classify(count2, item[2]) len_vector_A = np.sqrt(np.inner(vector_area, vector_area)) sim_area = self.cosine_similarity(vector_area, count2, len_vector_A ) area_cos_sim.append(sim_area) except: sim_area = -9999 area_cos_sim.append(sim_area) if item[3] == '' or item[3] == 'null': sim_type = -9999 type_cos_sim.append(sim_type) else: count3 = self.type_classify(count3, item[3]) len_vector_A = np.sqrt(np.inner(vector_type, vector_type)) sim_type = self.cosine_similarity(vector_type, count3, len_vector_A) type_cos_sim.append(sim_type) if item[4] == 'null': sim_district = -9999 district_cos_sim.append(sim_district) else: count4 = self.district_classify(count4, item[4]) len_vector_A = np.sqrt(np.inner(vector_district, vector_district)) sim_district = self.cosine_similarity(vector_district, count4, len_vector_A) district_cos_sim.append(sim_district) weight_cos = 0.35 * sim_price + 0.15 * sim_area + 0.15 * sim_type + 0.35 * sim_district similar_weight.append(weight_cos) return similar_weight if __name__ == '__main__': t1 = time.time() test = recommend_house() rent_house_info, user_house_info, collect_user_id = test.house_data() id_num = 0 print "共有使用者數:",len(collect_user_id) print "輸出格