标题: 求协同过滤算法程序 [打印本页] 作者: harveymao 时间: 2014-7-18 23:21 标题: 求协同过滤算法程序 本人在学校参加暑假培训,需要这个算法做题!求大神赐教,感激不尽+ v$ z" W, M0 ]$ M3 }$ k 作者: 百年孤独 时间: 2014-7-19 09:22
# -*- coding=utf-8 -*-7 P: `* ~9 N/ a* P0 V, ]2 f4 ^ h! t
4 W( f) B- V9 S
import math 3 Y- z$ M1 f) J4 v2 Vimport sys" r( f0 ]& n" [! G# B7 D7 R
from texttable import Texttable 6 _; J! U" D: p* Y+ n3 {; q 5 j! p( {1 N4 r, w 4 y6 [# [) Z) t$ ~& J# ( y+ [" T. o w( ?# U! |4 i# 使用 |A&B|/sqrt(|A || B |)计算余弦距离# ?: Q0 e5 ] Q$ M5 g
# 5 D9 ~/ X+ z. S7 L! Q" d2 z3 _# 2 D2 a$ c" R$ @1 E" ~#3 F! K8 D+ C s! h( s% k6 ~
def calcCosDistSpe(user1,user2): ' |0 u: k; \: k avg_x=0.0 ! L; o% \. ^! ~& k c avg_y=0.0$ ^* b5 c! I! M0 p
for key in user1: O. A! ^7 l% F( a6 Z( @' f' U
avg_x+=key[1] - U$ D* v7 i8 x- Q1 {: l3 d avg_x=avg_x/len(user1) 1 s P: | \5 l5 K5 k9 U $ q" O& u" A3 I8 ^2 v# t1 n
for key in user2: * i# \& ?- c: l avg_y+=key[1] ( o% o i9 e1 l, ~! [ avg_y=avg_y/len(user2)0 d3 y/ T e& Q! ~( v, J3 v
% W* j: n3 S) J* r% Q N u1_u2=0.0 : c$ T/ X( k3 x5 ? for key1 in user1: & _1 N4 v5 `6 e. z1 I3 ~ for key2 in user2:0 Q1 i& V6 n0 Z; y9 \: ]1 s3 ~" J8 k
if key1[1] > avg_x and key2[1]>avg_y and key1[0]==key2[0]:# X3 C5 U S' b8 s9 i2 ~: Y$ a
u1_u2+=1 , D5 ]- N% i$ e, U& Q6 A3 O u1u2=len(user1)*len(user2)*1.0 ; C) N/ ]( E7 I, t sx_sy=u1_u2/math.sqrt(u1u2) & j' _9 p2 h, ^2 X return sx_sy ) `( d0 A; ~( _7 n5 t & b4 b# X+ j0 A) T& ^# T2 F) R- G6 k I: }3 `7 G- Z; R
#- |$ x; s1 Q! d6 S3 ^/ a& @' Q6 n
# 计算余弦距离9 O6 \, Z( ^; O
# ; @- x% ?4 \+ q" G! b# % A: ]5 O) d2 }# s0 M( G/ Idef calcCosDist(user1,user2): * X/ z+ ]% N1 J sum_x=0.0" H0 V1 n- R1 N( I5 h
sum_y=0.0 $ B" E$ M1 j3 I sum_xy=0.0; r' ~4 Z$ O3 c G* y m, t
for key1 in user1: * o$ N& q7 _; U for key2 in user2:! K- f& |! U/ G% \2 Y" |
if key1[0]==key2[0] :4 B8 n6 D2 Z+ f/ A
sum_xy+=key1[1]*key2[1]- @! r9 b$ N* S# P2 K
sum_y+=key2[1]*key2[1]" K/ i! P. D4 q
sum_x+=key1[1]*key1[1]" H3 H( s/ j% j6 z D
! O9 U& _" Y' F# V
if sum_xy == 0.0 : ! y9 U% K0 \5 V1 H/ z# F: x0 f return 0 O+ ^# G% c7 ?' I& y3 m4 x
sx_sy=math.sqrt(sum_x*sum_y) ' j. P& w5 m/ D2 l& @! w' A
return sum_xy/sx_sy # ?4 F6 L/ D. F- Z; B . `: {& Q. {$ s2 C4 k3 l. k9 T : Y4 s5 o) C+ k2 b' F# 6 Q6 [: F0 U, C#: ]& { K2 p& t# [/ K
# 相似余弦距离* T$ ~* t4 B; w/ f: f5 w: a+ u1 |8 Y
# 4 r1 p& K+ A6 _#% y% t: l: R3 L% ^- P8 I& U, I; h/ v+ x
# 5 s# `/ O k7 ?! V1 jdef calcSimlaryCosDist(user1,user2): % q L* H5 B7 |' n sum_x=0.0; X2 w H( x. w4 u
sum_y=0.0% T2 Z g4 Z6 S
sum_xy=0.0 ( `- D3 e1 S0 [ avg_x=0.0" D( s o1 @; q# ?2 _: I/ S6 n- g
avg_y=0.0 9 ~0 x) L$ M" w for key in user1: # z e5 Z' y7 y; o N avg_x+=key[1]4 p1 g7 L1 e- P/ |/ z& d* }
avg_x=avg_x/len(user1)+ ~8 }. N5 R; \$ N% ?
7 p! W9 i o0 e) P for key in user2: T. J( k- S' a# C2 h avg_y+=key[1]0 N, z. _- g- O- x1 E7 M6 s
avg_y=avg_y/len(user2) 3 O2 O3 B- {3 X0 ?7 Q# [; ]! o/ t ; r% x, s: t. {. A
for key1 in user1: 6 m5 |0 \9 E* W2 f" y for key2 in user2: . q& E0 I0 D r3 X5 u if key1[0]==key2[0] :! j6 Z8 Y9 }& C C) W$ p" B
sum_xy+=(key1[1]-avg_x)*(key2[1]-avg_y) 1 q) n* x) M3 q4 w& S! z/ J sum_y+=(key2[1]-avg_y)*(key2[1]-avg_y), q% m4 g+ b ?) |) |4 m
sum_x+=(key1[1]-avg_x)*(key1[1]-avg_x) * d6 z4 d: b' t# h5 a4 M. X8 Y 0 _ {+ B! }/ L) j4 h4 ?2 U
if sum_xy == 0.0 :" q2 R9 D6 P' I) c
return 00 f) Y2 z1 s7 L% e$ C: G
sx_sy=math.sqrt(sum_x*sum_y) 2 F @" c0 w) i! k return sum_xy/sx_sy8 h; G2 {- }9 w. k( {) _
x) O& K6 j4 a! t F* U# S, C. X- }4 U9 Z## h# a; T' N2 o9 k
# 读取文件 ; M5 @; b2 w5 y v9 ~' M- Z) n; q* E8 v5 w# , f& @$ _8 T' d5 M#3 V9 z# E3 u7 p$ y( _9 K" O
def readFile(file_name):) p, w6 Z- R- K7 o7 Z( d- K
contents_lines=[] 2 D4 v, |% j# I3 P( N f=open(file_name,"r") ; t3 F5 p Z! ]5 P contents_lines=f.readlines() 3 D. R4 g9 V N, o% l4 B f.close() 8 H$ {2 u* U- A; H$ d" X1 T return contents_lines( ?+ {! g( X; r! u9 A/ e
$ D! e! `- w4 t/ J, n, T* U- R* R( o
1 h/ l" b* O4 Y, u6 B; v7 G ( g3 T# o5 g+ o: A#" X4 ]8 [+ V2 G& G' E
# 解压rating信息,格式:用户id\t硬盘id\t用户rating\t时间 & j! ~; T, s& r2 K& j/ c5 e) N# 输入:数据集合 : i( ^( | _; z0 t7 `6 E% d: A# 输出:已经解压的排名信息 : v: P+ X2 ^+ j2 v. l6 y" ~#: c2 z- q5 I/ {1 H4 O
def getRatingInformation(ratings): " \! u/ d) z' `1 `( {+ \ rates=[]2 n3 }8 X7 ]/ o2 Y# L
for line in ratings:. e7 a2 Y4 G! `; u, t+ q
rate=line.split("\t") v8 J* N% f; I5 i, n
rates.append([int(rate[0]),int(rate[1]),int(rate[2])]) Q1 S8 ~) D9 }, `
return rates- I2 e$ ^0 v& U6 A" B0 c- k' j
4 X+ h0 z6 ]8 E# I0 O( P. W
4 U8 W% h# S' {5 N/ o4 B- f# 3 b7 a# x" @. W+ r/ S9 g& D# 生成用户评分的数据结构 3 A; G7 e9 [; D5 l5 M7 j/ t! z# + w% S# ~" Z* v: w) N2 T2 L k
# 输入:所以数据 [[2,1,5],[2,4,2]...]# a( R& ^8 x! P# B; e
# 输出:1.用户打分字典 2.电影字典 5 `3 A- j; m3 A8 O# 使用字典,key是用户id,value是用户对电影的评价,$ w# M0 @# J. b8 D8 i) y
# rate_dic[2]=[(1,5),(4,2)].... 表示用户2对电影1的评分是5,对电影4的评分是2 $ A% i2 m. {/ X8 k6 _# / V9 J2 e* _0 k5 f0 n- vdef createUserRankDic(rates):9 V1 n+ Y7 e$ h% K
user_rate_dic={}% |" N1 \* Z6 F9 H; n4 y, [
item_to_user={} ' O9 M! r \7 ]% \ for i in rates:2 p. n2 v$ N2 E( t9 d
user_rank=(i[1],i[2])9 J2 X r7 ?! G: ?! M- X/ I
if i[0] in user_rate_dic: " l# H1 A. S9 x" r; n user_rate_dic[i[0]].append(user_rank) : u) Q/ ^& P' a9 {# d else:. h! Q/ R& [6 Y! l7 _' h" C
user_rate_dic[i[0]]=[user_rank] 5 }8 z7 m1 Q5 m" \ + Z4 m) G0 J; M5 N- c2 B# q8 t) v if i[1] in item_to_user:* j) {6 P5 H( \
item_to_user[i[1]].append(i[0]) & A# f, L: \% \ else: 6 ]+ x. i1 Q6 j5 R2 z5 U C item_to_user[i[1]]=[i[0]]/ h, l. e+ i# O; m( S
$ C2 p6 B3 M8 _8 Z( t& E, [! i: N, b#$ ]( W' m8 J9 D p
# 计算与指定用户最相近的邻居 1 i0 l0 S' q# z3 C" B# 输入:指定用户ID,所以用户数据,所以物品数据8 |" L" ^7 c5 V* @* W- o
# 输出:与指定用户最相邻的邻居列表% @: q- t; O4 Q
#0 E: @+ k2 r% ?
def calcNearestNeighbor(userid,users_dic,item_dic):' D+ F' j8 z& K8 j0 S; O
neighbors=[] 9 H/ n" E& X4 a3 w #neighbors.append(userid) $ Y0 `$ t9 T O for item in users_dic[userid]:2 V& X& j4 K2 S& E! e0 T# j. _
for neighbor in item_dic[item[0]]: 5 Z& s; r% n+ h) C! g3 I if neighbor != userid and neighbor not in neighbors: ; d( n$ r9 B2 o% Y8 u2 _ neighbors.append(neighbor). ~; K( [1 \5 r/ C
; J( D1 Q2 E8 z# @1 Q: G8 k
neighbors_dist=[] o+ `& v0 o) H( m0 `# p: { for neighbor in neighbors: 4 e# h1 g+ J6 h% V, N" N2 p dist=calcSimlaryCosDist(users_dic[userid],users_dic[neighbor]) #calcSimlaryCosDist calcCosDist calcCosDistSpe" {% `3 v* W* R/ {$ R- p0 W
neighbors_dist.append([dist,neighbor]) / b/ O) Q+ L& X% G. o' M neighbors_dist.sort(reverse=True) % d# W g0 `# U( F9 m k5 C- W #print neighbors_dist6 j3 G( v7 r; x2 q1 c; Z5 p0 f
return neighbors_dist i4 u9 J1 d7 Y6 Z. K! W5 U
+ q, t3 T0 |5 Z + T5 V* s5 t3 E) N$ z- K# % ?) _) W2 g8 n: a' n# 使用UserFC进行推荐 * C N" R4 p" n' }4 W# n# 输入:文件名,用户ID,邻居数量. { m: u; K7 r* w; n0 Q8 @
# 输出:推荐的电影ID,输入用户的电影列表,电影对应用户的反序表,邻居列表 1 g: c; i) p5 @4 F6 m: l# ) L+ e2 m+ D" T0 \def recommendByUserFC(file_name,userid,k=5):7 _2 e, F2 Q2 G6 u0 j
1 q. X& s* p! S. v2 ^5 H3 N w* t
#读取文件数据 2 x0 S2 x3 I' H, q8 @* z test_contents=readFile(file_name) # z3 \: w% O4 _8 Y , B7 [4 M" u3 \# b #文件数据格式化成二维数组 List[[用户id,电影id,电影评分]...] * ~4 R2 V d, R# A; Y) J test_rates=getRatingInformation(test_contents) 0 a4 `1 V& g0 B/ |/ F . x4 t* }0 z# m( f6 w #格式化成字典数据 . z @- [8 g2 ?5 k- R
# 1.用户字典:dic[用户id]=[(电影id,电影评分)...] . k' i1 d* j5 C- i4 N # 2.电影字典:dic[电影id]=[用户id1,用户id2...]- D3 u- h( H0 f' G) B+ x
test_dic,test_item_to_user=createUserRankDic(test_rates)' {2 I' a: v7 _
% x% n$ v- s+ i0 k M$ m
#寻找邻居 ' j0 P% Z( Q9 U& H8 d: `) d neighbors=calcNearestNeighbor(userid,test_dic,test_item_to_user)[:k]" X! c( y3 e1 p8 c
6 F$ t4 J1 X" b5 t recommend_dic={}3 I7 ~$ Z7 T$ ^8 O
for neighbor in neighbors:0 s0 h" _9 g Z* j8 e
neighbor_user_id=neighbor[1] 6 m0 X% |6 b+ B6 ^( f4 J movies=test_dic[neighbor_user_id]1 ~3 n `6 l/ Q& F3 `$ I
for movie in movies:7 Q4 U i2 E2 c, [0 U
#print movie1 \2 S# V* Z" Q
if movie[0] not in recommend_dic:: y5 s5 L7 w' v2 s- `( B; d4 m' t+ c
recommend_dic[movie[0]]=neighbor[0] ! ?# K" R7 @& r else:# f0 a+ O. s1 z, t" L* }; c
recommend_dic[movie[0]]+=neighbor[0]3 S4 D: _+ W/ Y. I8 u
#print len(recommend_dic)) k/ g! D l/ i. O6 Q
0 `2 |, M5 v/ l3 d" S% R
#建立推荐列表- T6 a) ~4 O! }% V5 b& n. t) f- I( g
recommend_list=[]0 p& _$ K8 s+ p+ `! Y) v+ F
for key in recommend_dic: / L! I1 ]& s% a' r5 |7 L1 k$ C& W #print key5 E j# f4 P: w/ D" T
recommend_list.append([recommend_dic[key],key]) + H2 c% D" f& K2 i ! c0 j0 G$ N: l
# E5 f$ @2 ^8 N2 D7 y5 w
recommend_list.sort(reverse=True) 9 m. O+ u" l3 C9 m: C #print recommend_list6 G% ]% s0 a: C- u
user_movies = [ i[0] for i in test_dic[userid]] j+ i( C( G5 C% x# a( l - \$ {2 z' |! v& y7 g$ ]- F" Q return [i[1] for i in recommend_list],user_movies,test_item_to_user,neighbors D' p4 \9 V+ u+ f
' K; E6 D: {3 L" ^ " T+ u. s7 J3 U' f/ ]3 U( y 2 C! ?; Q9 _5 f* p/ W% r+ o! S# $ q4 d' s5 e" {% |/ u#2 r' h( |' S! L8 L/ C+ l& }5 b6 Q- x# o
# 获取电影的列表 ( Z" r7 w- o, m3 q7 h# ! [+ s+ V: @' L# r/ z5 l1 k. g6 B# 5 I$ I3 B9 o5 }4 Udef getMoviesList(file_name):& A8 m" q+ I( p& P, F. @6 n
#print sys.getdefaultencoding()! b! Q& S% b4 D
movies_contents=readFile(file_name) 8 L8 \! ~. k! \. T- ~ movies_info={} " L) B8 p3 U- t2 D# R for movie in movies_contents:& G8 }4 i: b$ W+ h5 B
movie_info=movie.split("|") s/ x( [, F6 B' a movies_info[int(movie_info[0])]=movie_info[1:] ' R* E( {* v# ^* F9 c% `2 ] return movies_info* I' i% w' P1 | O5 W5 _7 h ^
( d; e, S# a. \
2 q k! J% h: K: c) j 7 k' j, p& l, a4 c1 R#主程序- u5 ?2 @5 s g
#输入 : 测试数据集合 ( N2 J |4 S, q) V) \0 k5 I$ Nif __name__ == '__main__': & l |: u2 s# P- u6 W2 y U reload(sys)1 ^! f& b) }3 u; a6 B& S& O9 S3 ?
sys.setdefaultencoding('utf-8')' F& u8 Y8 w S
movies=getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item") + Q9 v! b9 u2 T/ I* B% g" M2 a5 H! n- s recommend_list,user_movie,items_movie,neighbors=recommendByUserFC("/Users/wuyinghao/Downloads/ml-100k/u.data",179,80) + T, s4 q- X( B: ]& M neighbors_id=[ i[1] for i in neighbors] 2 @# i3 W/ q4 O D6 Y' p G table = Texttable()' V% S" W3 A" q: y0 ~# z8 [
table.set_deco(Texttable.HEADER)! ?. a) D- Q3 ~6 e
table.set_cols_dtype(['t', # text : c# s5 C0 S3 T* h 't', # float (decimal)# M6 c+ K% T6 q2 m
't']) # automatic 9 j6 Q. t4 [ f- m7 Y& Y, r table.set_cols_align(["l", "l", "l"])8 I; l# Z6 x) S: m! Y, |4 e: h
rows=[]! T& ^6 R4 {; A- z# R
rows.append([u"movie name",u"release", u"from userid"]) & j6 O) q# f( p for movie_id in recommend_list[:20]:" a3 O% m2 o4 M
from_user=[]$ j# P2 o8 a3 o- ~, ]* L- }/ H
for user_id in items_movie[movie_id]: " P- Q7 U! z% P( Q; y" p) C6 N" E if user_id in neighbors_id:4 s! L5 L3 q, t
from_user.append(user_id) , R# X$ W' \7 l" }- A& I rows.append([movies[movie_id][0],movies[movie_id][1],""]) 7 Y1 Z& K3 i, b' Q, Z table.add_rows(rows)+ W( \4 J. ~! W& y6 i5 H0 c/ c
print table.draw()作者: mea_lsc 时间: 2015-4-19 00:25