- 在线时间
- 2759 小时
- 最后登录
- 2017-9-15
- 注册时间
- 2011-4-3
- 听众数
- 538
- 收听数
- 4
- 能力
- 80 分
- 体力
- 1764 点
- 威望
- 27 点
- 阅读权限
- 150
- 积分
- 5990
- 相册
- 0
- 日志
- 0
- 记录
- 5
- 帖子
- 6675
- 主题
- 3503
- 精华
- 8
- 分享
- 6
- 好友
- 1721
TA的每日心情 | 开心 2017-2-7 15:12 |
|---|
签到天数: 691 天 [LV.9]以坛为家II
群组: 2013年国赛赛前培训 群组: 2014年地区赛数学建模 群组: 数学中国第二期SAS培训 群组: 物联网工程师考试 群组: 2013年美赛优秀论文解 |
# -*- coding=utf-8 -*-
. F" E( i2 }7 H/ ]% n A3 u+ N. A% W) U( E- s7 ~
import math
# _9 T# B2 O K6 Eimport sys
. r2 G4 v( @* x' A6 H( \% V/ dfrom texttable import Texttable9 q$ ^+ k2 m$ i) g# R
0 p: N5 r' E+ Y0 g0 B3 t: q
, f! |9 z, A3 d#1 e7 U1 W$ s7 V8 Y* N6 z8 a T
# 使用 |A&B|/sqrt(|A || B |)计算余弦距离
% a# y! U& s7 P& ~- j- |1 R: F#7 c$ P5 _0 \0 \9 Q, g9 G
#) @& L3 X8 F$ f& J* v
#+ N2 @. g% f2 s
def calcCosDistSpe(user1,user2):, h# V# B6 S$ K0 N& f G! l `
avg_x=0.0
5 I- I: O7 v2 g; i* V3 | avg_y=0.0
! k" s5 p; U- j& Z/ s. e for key in user1:
) f Q4 ~# y" ? s, z8 m5 I avg_x+=key[1]
) T: n4 C& P: | k. \9 z4 M( i avg_x=avg_x/len(user1)
$ V3 x0 F, k$ ]3 G3 T/ P
+ _3 i' g. E+ B. o' A# A9 @/ { for key in user2:5 J1 b# L6 r; \& V2 |8 E% k
avg_y+=key[1]
$ c; j6 m0 g- b1 J) b7 e. B7 C/ B2 C avg_y=avg_y/len(user2)
?! n z4 E) `# U( V/ x/ c$ w E
2 C7 b' b9 U [1 B u1_u2=0.0
) W( s1 G0 g8 h( P for key1 in user1:9 B. W& D1 H5 C6 {8 ^2 z4 I
for key2 in user2:
+ S+ k! f- G2 @6 i2 c3 @# \ if key1[1] > avg_x and key2[1]>avg_y and key1[0]==key2[0]:9 W8 w; P; C$ f
u1_u2+=1$ F( h6 D# ^: U8 [; ~$ Z
u1u2=len(user1)*len(user2)*1.0
3 M5 U u3 u$ ]" W- d8 q sx_sy=u1_u2/math.sqrt(u1u2)
8 \$ }* _* R: N2 q/ h7 { return sx_sy
* k. S) y' T n: o2 c/ Q. \& y+ z# t2 r: w: g
3 G5 G2 D2 o7 k! x0 ?$ W% t
#( ~- N0 B/ c. V! l Y' i
# 计算余弦距离) _+ E, |/ O- g. S& I% y
#
+ ^' J1 o% F0 L, l5 E9 @1 U: V#
/ b; O3 S. C9 f; L$ r7 [7 z5 vdef calcCosDist(user1,user2):/ W6 F4 o8 b, ~- q0 c% n
sum_x=0.0
9 ~5 |1 ]1 j# ]1 A o! ^ sum_y=0.0' O- J0 s1 ^# E
sum_xy=0.0
. V1 W: f& _2 S( ^) S& V for key1 in user1:4 r6 z0 [2 H# a% u1 r2 Y+ |2 v4 i- }
for key2 in user2:/ w2 p6 I# b: N! A$ ?* e
if key1[0]==key2[0] :
: p# f/ H* B4 \0 j sum_xy+=key1[1]*key2[1]+ X5 _, F1 R) M8 ]3 C
sum_y+=key2[1]*key2[1]
( o$ ^6 t9 y9 H* Z sum_x+=key1[1]*key1[1]8 h K) u% v k# T1 `8 n |: c8 b
5 u" M: n2 W( |" p$ [ if sum_xy == 0.0 :" c0 m& Q# E p- r
return 0
( Q3 h+ Q0 r4 D. E! g5 C& M& n b sx_sy=math.sqrt(sum_x*sum_y)
0 i/ g8 e+ k8 W) h, I7 D/ B return sum_xy/sx_sy
) P6 [; P/ d0 a: w4 _9 e: q# f6 y* R8 P9 P
9 F# o+ C! o/ P#
9 r8 |3 I5 I" r+ [# |#' X5 q/ ?& c" K* J) I
# 相似余弦距离
8 m- L# w, U0 L3 x9 H0 C" b#
l/ q. D2 r5 v# k0 l& c#
3 e/ j4 W0 c9 Q+ g& }#3 w4 I# N% }( o, N8 ^! V& j8 A
def calcSimlaryCosDist(user1,user2):
7 D6 ~3 Z6 O* d3 ^ sum_x=0.0* V# Z( ~0 i3 E# d3 U; Q% E
sum_y=0.0
4 h# h) r+ N6 G) g! K/ p0 i sum_xy=0.0
/ a/ l6 P- r% I% }: i8 C avg_x=0.0
; s2 Q; k% O/ S. V avg_y=0.0- L" A% g5 K% o1 z
for key in user1:
: |, F( b, }9 r N( h1 }7 Z avg_x+=key[1]
! n" f9 V0 Y- ]* t+ z avg_x=avg_x/len(user1), O0 v( F% F/ K8 N1 h
6 _3 r/ @% Q( Z for key in user2:
$ u6 G0 P2 k. ~# y' Q2 i5 q- i7 i avg_y+=key[1] C7 g9 G+ c, a) [' c( X
avg_y=avg_y/len(user2) n" q+ X! r8 e9 m* M
9 X* {$ M( U' H; X- L! A6 }
for key1 in user1:
) T7 ?# i+ q: l' h: N for key2 in user2:/ T3 q* U' `$ ~0 s, |; y3 C
if key1[0]==key2[0] :
, T. C) o! o6 M9 r% W6 e+ ?1 ^ sum_xy+=(key1[1]-avg_x)*(key2[1]-avg_y)) }* W) z4 Z u- n& F( }
sum_y+=(key2[1]-avg_y)*(key2[1]-avg_y)# J. b# S) `% x9 H: h, Q; I
sum_x+=(key1[1]-avg_x)*(key1[1]-avg_x)
. n" ], {4 a5 ~; T0 ~ 3 y8 v' f9 M/ W1 {' e8 q/ P
if sum_xy == 0.0 :. N( t) b' T0 v% y) }
return 0* x/ M; e! x* H
sx_sy=math.sqrt(sum_x*sum_y)
- W: V5 ]# @# i9 B _) Y, G return sum_xy/sx_sy5 }* s; [9 K5 C9 [; M& ~) X
( g* y% r* p8 M0 f
' I1 L9 k+ M* h#+ ]+ C/ [% h( u% t( |1 P
# 读取文件
% ?6 M& U) { x1 s' c#+ m( v- U$ x3 B& M- v A% \' B
#: l- T5 R) W c) c$ K4 t: [+ s
def readFile(file_name):
- L; z, ]% [, h" C# ^& R7 y contents_lines=[]
5 ^7 S+ m z. g: M: Z: b/ g/ h f=open(file_name,"r")
4 E. b3 H, ~2 G8 e1 v% ?# I& l contents_lines=f.readlines()
2 |+ [' ` A _ f.close()
6 G/ a. b; T5 ^. H8 V. E return contents_lines; z. U9 n& ^/ d. U
; ~5 y0 w8 }) \8 v7 e" h0 p2 P9 j* H7 g) @5 g' I
' l* U: q& x* Y, k8 d
#5 Q/ {) l9 _5 \+ y
# 解压rating信息,格式:用户id\t硬盘id\t用户rating\t时间
0 l' G! a7 |9 U# 输入:数据集合
l" Z7 E- f# n/ B1 m# 输出:已经解压的排名信息7 r) ~) _; M1 L# O
#
/ C) D$ R1 y/ @/ u% o* Cdef getRatingInformation(ratings):8 x6 K7 i z7 O+ I! N
rates=[]
$ G& [4 M$ z) ]/ Q6 u for line in ratings:
7 t/ K) x+ G6 f, ]. e8 M rate=line.split("\t")
8 M4 f" c$ p# X$ W, N3 I4 L rates.append([int(rate[0]),int(rate[1]),int(rate[2])])- Z( J, A/ z! f& `2 l: k' S a, `( X
return rates
1 R' Z( {: S/ Z0 [7 R4 @. n* s: r2 ` G, P6 k Y$ \( q& E- l
3 r& M* M& R$ K# N0 Q: T3 r/ e* d# Y1 _: L$ K4 ^* c
# 生成用户评分的数据结构
# A! x9 k! o- Z#
" X# e1 f D: Y8 b8 K1 f6 S# 输入:所以数据 [[2,1,5],[2,4,2]...]) e( h# H& u& l1 N# t0 Z0 ]
# 输出:1.用户打分字典 2.电影字典
3 g+ x% g e/ d, E) I/ b3 z2 K7 k( P& h# 使用字典,key是用户id,value是用户对电影的评价,; g1 Z+ ~ c/ |
# rate_dic[2]=[(1,5),(4,2)].... 表示用户2对电影1的评分是5,对电影4的评分是2: `8 }! M5 s& r& [5 f/ a- U8 c2 W
#3 b) N+ v' B5 L" u; M' I
def createUserRankDic(rates):
! D7 o0 c1 i( Z' x user_rate_dic={}- a1 n! W2 U" _' s o. i
item_to_user={}
. |5 K1 q5 J K for i in rates:
) T7 |; t1 J/ E- [ user_rank=(i[1],i[2])
. V1 {7 R9 I& i& B3 b if i[0] in user_rate_dic:" `3 H/ }! Z0 F* I5 F2 L7 H
user_rate_dic[i[0]].append(user_rank)- w& {, I3 E3 i8 Q1 O% p2 J9 X
else:4 T% b2 u. _& `$ r: Y G
user_rate_dic[i[0]]=[user_rank]* L: N* s4 a& ? G8 [& Z( T+ x2 O
* M8 ?2 b9 ~& ?8 r/ z; C% P* f if i[1] in item_to_user:
1 `0 k+ s4 N* p' b2 t" B* n item_to_user[i[1]].append(i[0]); _; X& w5 ~% u; F( R& V& G
else:
0 E/ [% t9 z+ ~ Z! w( c3 d item_to_user[i[1]]=[i[0]]
6 W& q0 `2 J$ q3 A/ r5 @0 g . Q0 Y0 r! b/ v! P$ k M, o
return user_rate_dic,item_to_user) ?! q/ L( z5 V8 G9 x# x/ ^
! a j( b3 `, f4 k( m3 }
4 l9 f7 J4 A. G4 T: X- Z#
" ~' v& f/ \2 c' J% @# 计算与指定用户最相近的邻居
4 u+ q, d$ P6 P! P" _1 Z5 J6 ^0 m# 输入:指定用户ID,所以用户数据,所以物品数据3 p$ ^, ~; s& F" D( H
# 输出:与指定用户最相邻的邻居列表5 M- f/ ?1 h/ s7 s" a# |; P7 c
#
! |& L7 {5 q$ e) @def calcNearestNeighbor(userid,users_dic,item_dic):6 D4 r7 N- J9 u$ q5 z
neighbors=[]
3 p; X' ]3 z9 d5 N* m' T( L2 k* ^ #neighbors.append(userid) A8 D- @+ K" u/ B0 K; E# u- t, i# h
for item in users_dic[userid]:) T9 c5 U6 a/ c, T8 V5 K
for neighbor in item_dic[item[0]]:, s# A. B3 \5 m* S% a) L
if neighbor != userid and neighbor not in neighbors: ! e# d# m1 _5 }' ~: \( X
neighbors.append(neighbor): @( P$ E& K: x( u/ y
4 A* @! l* v/ g8 e; M5 B
neighbors_dist=[]2 o; n6 a" ^8 s9 a) c% w
for neighbor in neighbors:6 A: j7 C3 ^9 K) j
dist=calcSimlaryCosDist(users_dic[userid],users_dic[neighbor]) #calcSimlaryCosDist calcCosDist calcCosDistSpe
$ n6 Y0 d' i- s8 A neighbors_dist.append([dist,neighbor])
3 S: y4 J7 }+ U+ x# V neighbors_dist.sort(reverse=True)
2 y6 o' Z1 U0 ~1 ~- \: \& f# Y. d) j #print neighbors_dist" A; V" q% \2 F. S9 @! x& c) L
return neighbors_dist7 r* y: X# ^8 R7 d1 { Z" V
# l& a6 R- V9 O. w i
$ J# k: I2 C1 k#
) g/ ~3 }3 t8 I3 A) W. `3 Z* U# 使用UserFC进行推荐1 D: i1 @! _, C- i: E% z. T J$ t% A
# 输入:文件名,用户ID,邻居数量
0 G: e; ^, X3 g3 V" }9 D# 输出:推荐的电影ID,输入用户的电影列表,电影对应用户的反序表,邻居列表
6 g7 s7 K- }3 G3 f0 ^# z# n, t: `; q. A
def recommendByUserFC(file_name,userid,k=5):3 Z! f& o$ q' i
! j' \6 s2 e* O" g. Z #读取文件数据- h: ?% @% d8 T; V, l6 u3 X" C
test_contents=readFile(file_name)6 j! }* L: B. w5 _& c: m
z* x3 b3 W! ~; Q$ k6 Z8 L #文件数据格式化成二维数组 List[[用户id,电影id,电影评分]...]
+ h) ?. t$ l& T- E8 H, m test_rates=getRatingInformation(test_contents)' x1 b3 x( d; Q$ z2 [, h& W- F. v
|4 {* m" R# l- J$ o' q
#格式化成字典数据 : b% r# g5 \: D8 h; f$ \ X' \
# 1.用户字典:dic[用户id]=[(电影id,电影评分)...]( X# W* I1 l. n2 ~, N* O, f
# 2.电影字典:dic[电影id]=[用户id1,用户id2...]& p$ i5 y# V! ^ `# J
test_dic,test_item_to_user=createUserRankDic(test_rates)9 C; a8 [+ z3 l% i( c
5 q1 e+ }4 O$ l) | #寻找邻居! q- [+ t0 {; a! h1 {- n: |
neighbors=calcNearestNeighbor(userid,test_dic,test_item_to_user)[:k]
8 h) ]: R4 Z7 _" L ] 9 X/ \; o( Q( ~; [2 U' @4 g; d0 y
recommend_dic={}+ W8 k* I5 r% J$ ~ X- J. H9 D& Y
for neighbor in neighbors:! ~6 J* H8 V; S" Q; Z3 q5 R% L# A
neighbor_user_id=neighbor[1]( M) p# p5 S) _1 c& H
movies=test_dic[neighbor_user_id]. _7 v6 T; B+ v
for movie in movies:
( z7 K) L7 o4 p0 y1 P #print movie. D; }. @8 a9 w+ ^) T; u9 V5 z
if movie[0] not in recommend_dic:7 ?. E- B8 d9 V% n+ Q
recommend_dic[movie[0]]=neighbor[0]1 ~" Q2 w% S9 w& \$ G7 Z
else:4 q' a7 P+ h$ O" _1 k& {
recommend_dic[movie[0]]+=neighbor[0]
/ r/ c7 w6 j. z {3 p #print len(recommend_dic)
% d l- L( W$ V5 |8 W - e9 C8 ~: b; V( N3 ?$ U/ d6 G
#建立推荐列表9 D* V/ x5 P2 c5 G1 I$ s! J. z% S$ K; n
recommend_list=[]
4 r$ r7 S1 X6 l5 _0 t for key in recommend_dic:
; D* s+ _' ^# Q5 [ #print key4 R, d- i+ V' R' s8 F
recommend_list.append([recommend_dic[key],key])
2 k- t( p2 F! |* M2 } * W* Q* H, R i2 i' G
: p7 @& M7 H' z3 h; I! U1 j( o' x recommend_list.sort(reverse=True)# T; E) \+ @2 c) d8 S9 |+ i
#print recommend_list
2 Z8 L+ ]6 E8 j( K2 o' J* m user_movies = [ i[0] for i in test_dic[userid]]
' F! g( |+ A! A; ~9 _
$ G- o" b& ^+ H return [i[1] for i in recommend_list],user_movies,test_item_to_user,neighbors6 ~; m( {) O4 J% n. g
4 Z' m8 k, f1 }! b
4 P5 \* ]# l8 ], D. C" e2 t1 L) `+ _6 T
#
( V+ Y( Z0 h0 C R#
m5 F" ?' t* T# 获取电影的列表
* h R6 ?- h# h4 ?#
& U# ~- o& V1 c) P: M, Q#
0 G, @( Q( ~, O! C! T" ^9 I#
4 i( w! j D1 x7 `def getMoviesList(file_name):) ^" U( C/ Y# V9 P( q! O
#print sys.getdefaultencoding()2 c/ i* ]# t3 m( }1 g; h5 ]
movies_contents=readFile(file_name)4 Q7 ?$ z( T9 S5 |: C0 ?* o. u
movies_info={}: A) w0 o# Z) {
for movie in movies_contents:( n* s, o, L# t) m" _+ P
movie_info=movie.split("|")
2 p9 `" \( e, @% d movies_info[int(movie_info[0])]=movie_info[1:]! K* v2 w1 H1 _9 _
return movies_info
* w7 W$ T* [5 `/ n+ }% V2 o- k( K
4 V8 |& U Z( c/ ` ' K) i! T( o- x
v0 t/ o$ C: `5 u# W C: [1 I5 T#主程序
! x0 W' W; t8 Q3 L/ u6 V#输入 : 测试数据集合" {; ~: y- l) Z y/ Q6 C6 M- J! }
if __name__ == '__main__':
6 L" N' r: I! U. J- K reload(sys)
- k5 h' }; p' \* L* S0 f% h sys.setdefaultencoding('utf-8')
' {/ @5 ]* K# m, V% f! p, D& |1 O movies=getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")% P$ Q' ~; Z. S, Y6 X
recommend_list,user_movie,items_movie,neighbors=recommendByUserFC("/Users/wuyinghao/Downloads/ml-100k/u.data",179,80)+ Q" u9 t* o6 G, O8 I9 F3 R! g
neighbors_id=[ i[1] for i in neighbors]& J2 U( s) \3 @' I2 r
table = Texttable(): t; T8 o+ e6 h5 {' V/ v
table.set_deco(Texttable.HEADER)6 i% ~# L( o: I. A+ g5 W4 b' q
table.set_cols_dtype(['t', # text
* J3 n7 Q) V+ [8 Y2 j/ e 't', # float (decimal)
) @, e4 t5 U$ v 't']) # automatic
3 w/ X; I0 {% \ table.set_cols_align(["l", "l", "l"])
9 X" D) G7 Z ^) |1 }$ a rows=[]
: T* t! @. T* _6 X7 f% ~* u# L rows.append([u"movie name",u"release", u"from userid"])
% Y/ R4 A6 k, f% z for movie_id in recommend_list[:20]:
2 U. |+ F; i0 E2 N$ x from_user=[]% D- O9 N6 o6 n8 C% y( s$ K& B3 o) K
for user_id in items_movie[movie_id]:
: H# c& D" X6 B) E: { if user_id in neighbors_id: E% _: y, T% g6 Z; Z# `
from_user.append(user_id)6 G+ t* {7 e+ {
rows.append([movies[movie_id][0],movies[movie_id][1],""])" C$ h6 P; J4 T4 k
table.add_rows(rows)* I' T/ q2 o8 Y; {2 i" a: J( h% _ Q$ Q
print table.draw() |
|