标题: 求协同过滤算法程序 [打印本页] 作者: harveymao 时间: 2014-7-18 23:21 标题: 求协同过滤算法程序 本人在学校参加暑假培训,需要这个算法做题!求大神赐教,感激不尽 7 x9 T) D" }2 O: `, z作者: 百年孤独 时间: 2014-7-19 09:22
# -*- coding=utf-8 -*- 1 A# v( d8 f9 R" V 5 p3 K, d' d0 Y# { m1 @import math ) V' i1 t, Y* S8 C% \import sys) F" \ [7 J- H4 u
from texttable import Texttable , J5 v$ p7 k1 p) L2 V2 z6 a, w {& |- d5 X6 B' T: |
9 W1 l0 j7 m% ]
# - F! Q! y& Q5 |+ s1 J( _3 f# 使用 |A&B|/sqrt(|A || B |)计算余弦距离 ) n& c/ f) m" h( l# / _+ b" w2 d* l* T) H- y# 0 Q( O7 M# a7 g# . Q* x* I! d9 @( v' _def calcCosDistSpe(user1,user2): $ }5 ]! R3 p3 q" `0 e avg_x=0.0- W, l& R* y6 w1 Y: ^& A, k
avg_y=0.0 ! E1 u9 V8 A" W5 G% ~# m* M for key in user1: 1 t% k2 A. N9 R- D( i. I; K1 o avg_x+=key[1]# d4 I Y: ?0 X* y0 P) v
avg_x=avg_x/len(user1): j. E5 r) T' {. X; R! j3 e+ \
- O! P2 q- g% |6 P. h0 N2 E' o
for key in user2:0 d$ Q8 @2 C1 c% v: Q
avg_y+=key[1] ' M; P" c7 k4 o& b5 o' M4 X! q9 S avg_y=avg_y/len(user2)5 U. U0 w ~1 e& p: X
3 L" o, ?% u0 o9 T3 d2 | u1_u2=0.0 & K' Y1 A: w! q& |% A0 t for key1 in user1: + p' ^4 ]' a/ I+ y3 P% m2 @' Y: z for key2 in user2:& ~9 I2 ]/ y: M. L, E( O, M
if key1[1] > avg_x and key2[1]>avg_y and key1[0]==key2[0]: 9 O$ g$ V4 v6 s; X u1_u2+=1 ' {1 o; B* W/ s u1u2=len(user1)*len(user2)*1.01 l9 _1 W% o" m* b. i! p' P
sx_sy=u1_u2/math.sqrt(u1u2) ) k$ r& ]' m: X9 K return sx_sy+ _1 M/ N0 V4 B
5 J+ N: `5 A8 ?! k9 `: ?9 C# m; P
/ T; B& H% C. T- [) g8 a# / | A- ~' V Q2 `* ]; f% e' k$ @3 C# 计算余弦距离0 r1 |) P, @4 e. P$ g
#- @1 u& [& F* u+ F
# - V! J) Z' R* m4 Q Z+ ` ]def calcCosDist(user1,user2): $ N/ U: Z. M$ K5 H% W sum_x=0.0 " c0 M8 U9 d& j7 O6 O# n* v' W sum_y=0.0 5 `% f8 E' G; j2 K& a s/ ^; a sum_xy=0.0, l% T7 _6 N3 v0 g' b
for key1 in user1: % k& w/ a. |2 V: v' E) { for key2 in user2: " U$ Q3 |/ X* C) r [$ F/ N if key1[0]==key2[0] : 5 z; g* L4 G7 f; A sum_xy+=key1[1]*key2[1] 2 M2 }0 f0 C# T) O3 U$ [: z sum_y+=key2[1]*key2[1] , @8 i; R/ w$ Y% O8 [* V: F% S sum_x+=key1[1]*key1[1]6 o3 F5 C4 q0 u
- x3 R; b2 \2 f% g% X
if sum_xy == 0.0 :- O1 t3 L& J6 ~2 v
return 0 , w" G( K3 \8 y$ e. V2 V' s sx_sy=math.sqrt(sum_x*sum_y) - i) E. E0 p4 d$ E' V& I; K return sum_xy/sx_sy2 j; w8 t( R4 P8 ]4 _& G
6 M( M$ G4 J1 o& `( d
! k3 b$ Q* u3 e
#. x; s" v c9 |3 k" R2 m+ q
# 3 R5 r; r% h- ~/ x0 l# 相似余弦距离* P. d& q! h6 H) n
# , x2 g. H! u: U#% |4 I E# P1 d/ ?0 p( U5 P
#" i8 S M/ S) v# f5 L
def calcSimlaryCosDist(user1,user2): ^+ w7 H. I4 ?. W4 Z sum_x=0.0. k4 G: o! J) e; V" a
sum_y=0.0$ F1 ~. N' W2 V6 g6 I
sum_xy=0.0 3 e$ o# Q6 u" I3 @* z8 F% }7 N avg_x=0.0- p7 u! s) O* v6 v6 T
avg_y=0.0& V3 J( H1 @2 K1 d
for key in user1: \( W, I- J1 O( d. }* r/ {2 y
avg_x+=key[1]2 |$ Y, H; z* }; m q
avg_x=avg_x/len(user1)" h' V1 q" ?* Q$ r* q) k
% f2 K6 Z e8 f2 X. g
for key in user2:- y6 _$ C' S; w1 V6 `$ G7 v
avg_y+=key[1] " V6 J) J" k* q y avg_y=avg_y/len(user2) 0 ?6 \, ` w9 U; z: E) A , M" g0 m# [8 C7 k
for key1 in user1:! q) T* S9 H, H. p- O+ v) s
for key2 in user2: 0 ?$ K9 ?3 P7 x! U3 Q; t if key1[0]==key2[0] :4 R' ?5 P" R3 c5 y; U
sum_xy+=(key1[1]-avg_x)*(key2[1]-avg_y) + B9 Q ~8 t0 y$ x1 B* M sum_y+=(key2[1]-avg_y)*(key2[1]-avg_y)# _2 g0 M" [6 {
sum_x+=(key1[1]-avg_x)*(key1[1]-avg_x) & _/ X( W: L: |% I% f, J. K- X. E . p$ S9 X: T) ~/ F& K8 _
if sum_xy == 0.0 :) ?" e; s* P+ J5 [+ I# ~% x
return 03 ^- S: { R C/ ]0 b: C9 M% R# a$ X. j
sx_sy=math.sqrt(sum_x*sum_y) 4 c8 G$ O" r" u return sum_xy/sx_sy # P, `; u0 _, B/ s6 Y* I3 c : |5 a$ }% Y1 Z) u. e6 p
* k% s2 W3 U; d
#. q; Q+ p3 y( ^4 J; |, x
# 读取文件$ a4 K" O; C9 v4 l4 f1 {
#9 H$ s* \) R( K3 l/ T
# : I4 x t3 j ^( g4 W' L( Wdef readFile(file_name):8 W) V- r7 ]% t$ p
contents_lines=[]! q8 f0 M+ z9 G. a
f=open(file_name,"r")/ x. z' y, I7 B
contents_lines=f.readlines() A r4 L8 H/ Y6 J* R( | f.close() . E" L6 p5 p r return contents_lines9 p, S, h* y i4 [/ e. m# Y4 Y
$ w+ j" N* [: |
7 x( K4 I f& h3 y1 g
5 `8 T' ?# f# ~; e% C- O; b' z8 L
# e0 x9 P" `( t1 v& c
# 解压rating信息,格式:用户id\t硬盘id\t用户rating\t时间 7 Y! K' h0 G7 }% b+ O" Q% e% e: O# 输入:数据集合 0 U& e0 ?( Z# N% Q6 a9 J' C, M& F0 h# 输出:已经解压的排名信息 9 j) x8 \: m) F h#) V2 p6 P4 q6 M- A
def getRatingInformation(ratings):7 a+ c, F' N3 K: ^8 s/ E5 B
rates=[]/ H' E( ?) @" \/ e5 W8 C6 i. ~2 W
for line in ratings:0 f( L6 U' p- V8 a
rate=line.split("\t") 1 Y, }& [% E) l- G+ _: o( \; V0 Y rates.append([int(rate[0]),int(rate[1]),int(rate[2])]) 8 V3 u% s* `5 D p8 B5 z t return rates . o0 L( L5 Q& Q2 @! B 0 c6 p1 u& V2 l8 B/ @# U9 Y0 M/ o. V
# + N K2 a% j: s2 V4 w* d# 生成用户评分的数据结构9 E/ D- G! M4 i D7 A {3 V# V
# ; m0 q; Y3 N9 _5 K7 q# K6 Q* ~
# 输入:所以数据 [[2,1,5],[2,4,2]...]/ r3 B8 t( Y) x* g- K
# 输出:1.用户打分字典 2.电影字典 % _9 N/ \( P1 J! m. n: ?% X# 使用字典,key是用户id,value是用户对电影的评价, $ o1 T7 K2 v$ i: \# rate_dic[2]=[(1,5),(4,2)].... 表示用户2对电影1的评分是5,对电影4的评分是2) {2 ]5 h" t. f9 l- d" L+ |
#1 e& O1 I& A7 f0 {
def createUserRankDic(rates):9 S; S! n+ Z; l+ L- G$ f8 K
user_rate_dic={}1 f& y& q" B {( H) \
item_to_user={} 2 a& }# G) J# F% K1 s- }/ s for i in rates:% U8 @, P2 W; G6 B% L$ Y$ d# |/ L
user_rank=(i[1],i[2])4 f- ?% q% Y) P2 N) k" p
if i[0] in user_rate_dic: ! Z3 H/ [; `2 c- M1 y user_rate_dic[i[0]].append(user_rank)5 Q# Q7 z. E2 c" G$ e! |
else: ' e$ _ J4 s. g( B5 `' K; u user_rate_dic[i[0]]=[user_rank] 1 e7 ~1 g0 U; O& [ 7 c( ^' V0 V; ] if i[1] in item_to_user: + w4 i6 N) ^6 C0 H- K% J8 e3 U item_to_user[i[1]].append(i[0]) 3 U1 K3 j& Q4 J' P- X8 Q0 m else: : n( T) j' \. m3 y8 B- p! ? item_to_user[i[1]]=[i[0]], y- [! e+ Y6 n/ H- X3 W$ I( o% ~$ H
1 P# k( j0 C0 P
return user_rate_dic,item_to_user( ^# n5 U; Z; G9 U" q: O
7 Y; ?8 o& Z. l$ G8 ~- w# J" g* n2 x2 {: l, w- n( O3 z: j. W1 m
# 5 t4 N( Y# c) ~1 V1 l6 V% Q$ O# 计算与指定用户最相近的邻居 : l" e' H$ m; g* O* v# 输入:指定用户ID,所以用户数据,所以物品数据! H# `5 W: t6 [8 _/ v: \; ~
# 输出:与指定用户最相邻的邻居列表4 Y$ Q0 X3 v" E' T. Y& t* ^+ o
#: F( G- F3 ?& ?$ e. o0 b
def calcNearestNeighbor(userid,users_dic,item_dic): : X0 K4 r3 p# |# J: c6 O0 F' Y+ y neighbors=[] 5 N0 p7 Y, n3 L% } #neighbors.append(userid) 6 a3 M5 e" Y& w$ M* K! U/ R# F for item in users_dic[userid]: j( W: n- f* r" Q3 N
for neighbor in item_dic[item[0]]: ! k8 V, [6 z& Q/ G/ J; G6 P" I if neighbor != userid and neighbor not in neighbors: 5 G# w. x j* Y0 E4 r
neighbors.append(neighbor)( _9 d& i' \" p
2 W& b; `8 X. o; c
neighbors_dist=[]; C( ^, ?; K b% b
for neighbor in neighbors: ; Z8 T+ o, l4 |7 B, N8 [; I dist=calcSimlaryCosDist(users_dic[userid],users_dic[neighbor]) #calcSimlaryCosDist calcCosDist calcCosDistSpe 2 O1 ~1 F" T6 J3 w. j& ` neighbors_dist.append([dist,neighbor])3 f& v2 l/ X8 T# I. H `3 B
neighbors_dist.sort(reverse=True) : p6 D6 v1 Z+ t. L, z: v, x5 t #print neighbors_dist $ ?" c$ q: T8 c, P+ ?7 W/ @ return neighbors_dist ' L% S+ C0 p' R% P1 V9 e. V % f- ?% i( L7 @ 0 k- X' q' W& t& [# 1 p$ }7 [/ k" r% L. `+ M d& e# 使用UserFC进行推荐 1 v& X! u( T, f5 o _3 g) j9 b# 输入:文件名,用户ID,邻居数量 3 Q5 ]9 c; z7 m) R! m4 B; E# 输出:推荐的电影ID,输入用户的电影列表,电影对应用户的反序表,邻居列表$ q" q+ _& t' @
#- a$ t4 R6 R( W% }: H2 b
def recommendByUserFC(file_name,userid,k=5): 6 |9 w4 Q( L, G/ Y# J( F# G 0 K6 j, m, ^" a/ _7 h! w
#读取文件数据- u+ P* N6 `9 p9 I% B8 h
test_contents=readFile(file_name) ' }2 r1 V& A* c6 e x8 ]+ y# r" x9 @
#文件数据格式化成二维数组 List[[用户id,电影id,电影评分]...] 6 H7 z) W* J9 o+ f4 B
test_rates=getRatingInformation(test_contents)& B# q' l; H1 P% C* c9 e9 i
9 Q/ M, R) J) U0 n4 X. K r #格式化成字典数据 ) b" ^$ u- M: [% _; j1 h # 1.用户字典:dic[用户id]=[(电影id,电影评分)...]1 I* k: V; C8 i( e6 g1 R
# 2.电影字典:dic[电影id]=[用户id1,用户id2...]: x) H: N5 w1 d6 e0 R, d" q r# }
test_dic,test_item_to_user=createUserRankDic(test_rates) F' A, F2 q. B$ n: C! R6 K" b* n $ r5 @2 [+ |) O& s
#寻找邻居8 Q8 g5 s% b1 H; j: [' n# i/ S
neighbors=calcNearestNeighbor(userid,test_dic,test_item_to_user)[:k]3 [9 Z, Z2 e2 n) g- O0 p- G* C' w
( o4 P5 j6 }6 Q; q6 k5 Z
recommend_dic={}& |- V% {+ J8 K; n: X
for neighbor in neighbors: , C; |8 |& H- Z! \' ` neighbor_user_id=neighbor[1] ; {8 N. p! W3 O5 V+ }7 i movies=test_dic[neighbor_user_id]" `/ x' w I1 c8 W
for movie in movies:$ a! @" B* x! e6 s5 i# [. d, w
#print movie 7 C. u6 Z( N/ c8 D7 Q if movie[0] not in recommend_dic: 8 C! _8 B( q+ {2 b9 O recommend_dic[movie[0]]=neighbor[0]& f9 h" Z( R8 ]+ E/ L0 b* f R; S
else:$ A5 \7 E: i% h4 Z
recommend_dic[movie[0]]+=neighbor[0] : e d# @& Z- k) U$ D9 ]) a #print len(recommend_dic)+ y6 ?4 g* O/ x @ Y% ]9 N
0 k; b! Z% t6 G9 u
#建立推荐列表 * i- f" } p, B recommend_list=[] 6 D6 W* s; W: w5 \ for key in recommend_dic:' m0 C6 A4 O7 A. j7 L& Q: D
#print key, {' b% c& b" I: p3 Y
recommend_list.append([recommend_dic[key],key]), c: b; n) [. H
3 ]5 p- q& L8 p. [$ U& l % t; `* s' @+ U6 n" _ recommend_list.sort(reverse=True)9 ~! T$ O2 K6 V* ]% n1 _6 `5 ?
#print recommend_list 0 l' b% h W% m4 s" B user_movies = [ i[0] for i in test_dic[userid]] 6 B( n8 d. `+ g3 [' g+ |7 a, x+ M2 ^8 l- M
return [i[1] for i in recommend_list],user_movies,test_item_to_user,neighbors' N" H$ ~# I5 o! r7 m& n, C
# Q- u! Y2 T% C; ?. m: a ) W' Q( y3 X! I0 r4 \# W8 c" M- k y
. m" ?: ]; {; y7 X
#- O3 \# R8 B+ K# A, P5 H
# ' W: r2 Q4 o0 ^2 m% r- f/ h# 获取电影的列表 6 t. V+ M8 z. |8 @6 d- ?* V1 a#- `- X) i7 u8 s3 ?' h
# ! S8 Y/ x, ~7 Y& H# U# 2 s. f. @: ~4 C' J2 C$ B& C4 Fdef getMoviesList(file_name):- }3 ^) ]% H" D' K, ?4 W, C
#print sys.getdefaultencoding()& R/ C" R; U) g
movies_contents=readFile(file_name) " a7 q& c7 ]9 S movies_info={}4 F, g5 ^+ Y2 T
for movie in movies_contents: * ], M) J4 r$ ], E7 Q8 ^0 C movie_info=movie.split("|")" b. ` ?& ?( D* U
movies_info[int(movie_info[0])]=movie_info[1:]; S. h. t; W! ]5 ?! v
return movies_info % O0 }. b- `/ m8 V3 } 9 @; N- u" Y9 ?9 ?
- P0 o- T7 S& b& e" \7 l% S
7 H: \( W i9 A; O' F n! P
#主程序6 r- H: e b8 d7 q; ]
#输入 : 测试数据集合 / O T& b, M. V) x' J# lif __name__ == '__main__':8 n$ a5 Q& n; {/ c: i
reload(sys)& ^$ c; ? z1 B
sys.setdefaultencoding('utf-8') + M9 ?9 l* P$ E" t movies=getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")4 E1 W P: |4 `6 D8 [
recommend_list,user_movie,items_movie,neighbors=recommendByUserFC("/Users/wuyinghao/Downloads/ml-100k/u.data",179,80) $ X5 L1 d/ _! |. n0 ^5 h neighbors_id=[ i[1] for i in neighbors] ) Z" V/ O8 {0 q+ k' p% K$ J& d/ X$ ?! n0 t table = Texttable()! i5 l/ H5 d5 L, Z0 c2 s# u
table.set_deco(Texttable.HEADER) 0 @) k2 ]: r: J, A+ O! h# D. b table.set_cols_dtype(['t', # text ' A2 \, l, c0 F$ A4 U 't', # float (decimal) 9 j2 d L0 }9 g 't']) # automatic 1 `; ?/ ]4 C* r table.set_cols_align(["l", "l", "l"]) 9 w$ \3 X) }$ Z; f h7 Y* x; q rows=[] # r4 ?7 m7 b* [5 S# L. [2 x rows.append([u"movie name",u"release", u"from userid"])+ N( A! r2 }- H3 ~
for movie_id in recommend_list[:20]: 5 A& m' q3 J' o1 n from_user=[] z3 P4 S, h( Z" A7 }; l
for user_id in items_movie[movie_id]:9 b2 f+ q: Z4 R# N- b
if user_id in neighbors_id: $ V) u% q: r/ r7 r from_user.append(user_id) ) D" C+ x L+ P# E( j) `9 X: } rows.append([movies[movie_id][0],movies[movie_id][1],""])" t5 _9 |1 R% b7 _' M
table.add_rows(rows), B( s3 t) j3 ` U! e$ q1 x
print table.draw()作者: mea_lsc 时间: 2015-4-19 00:25