- 在线时间
- 2759 小时
- 最后登录
- 2017-9-15
- 注册时间
- 2011-4-3
- 听众数
- 538
- 收听数
- 4
- 能力
- 80 分
- 体力
- 1764 点
- 威望
- 27 点
- 阅读权限
- 150
- 积分
- 5990
- 相册
- 0
- 日志
- 0
- 记录
- 5
- 帖子
- 6675
- 主题
- 3503
- 精华
- 8
- 分享
- 6
- 好友
- 1721
TA的每日心情 | 开心 2017-2-7 15:12 |
---|
签到天数: 691 天 [LV.9]以坛为家II
群组: 2013年国赛赛前培训 群组: 2014年地区赛数学建模 群组: 数学中国第二期SAS培训 群组: 物联网工程师考试 群组: 2013年美赛优秀论文解 |
# -*- coding=utf-8 -*-
import math
import sys

from texttable import Texttable


#
# Compute the cosine distance as |A & B| / sqrt(|A| * |B|)
#
#
#
def calcCosDistSpe(user1, user2):
    """Return an overlap-based similarity between two users' rating lists.

    An item counts as "liked" by a user when its rating is strictly above
    that user's own mean rating.  The result is the number of
    (user1 entry, user2 entry) pairs that share an item id and are liked by
    both users, divided by ``sqrt(len(user1) * len(user2))``.

    Args:
        user1: sequence of ``(item_id, rating)`` entries for the first user.
        user2: sequence of ``(item_id, rating)`` entries for the second user.
            Item ids must be hashable.

    Returns:
        The similarity as a float; ``0.0`` when either sequence is empty.
    """
    # An empty rating list has no mean rating; without this guard the mean
    # computation below divides by zero.
    if not user1 or not user2:
        return 0.0

    avg_x = sum((entry[1] for entry in user1), 0.0) / len(user1)
    avg_y = sum((entry[1] for entry in user2), 0.0) / len(user2)

    # Count user2's above-average entries per item id once, so matching needs
    # a single pass over each list instead of a nested scan.
    liked_by_user2 = {}
    for entry in user2:
        if entry[1] > avg_y:
            liked_by_user2[entry[0]] = liked_by_user2.get(entry[0], 0) + 1

    u1_u2 = 0.0
    for entry in user1:
        if entry[1] > avg_x:
            u1_u2 += liked_by_user2.get(entry[0], 0)

    u1u2 = len(user1) * len(user2) * 1.0
    return u1_u2 / math.sqrt(u1u2)

#
# Compute the cosine distance
#
#
def calcCosDist(user1, user2):
    """Return the cosine similarity of two users over their co-rated items.

    Only entry pairs that share an item id contribute: the dot product and
    both squared norms are accumulated over those pairs alone, so items
    rated by just one of the users are ignored entirely.

    Args:
        user1: sequence of ``(item_id, rating)`` entries for the first user.
        user2: sequence of ``(item_id, rating)`` entries for the second user.
            Item ids must be hashable.

    Returns:
        The similarity as a float; ``0.0`` when the dot product is zero,
        which includes the case of no shared items.
    """
    # Index user2's ratings by item id (keeping their order) so each user1
    # entry finds its matches directly instead of rescanning user2.
    ratings_by_item = {}
    for entry in user2:
        ratings_by_item.setdefault(entry[0], []).append(entry[1])

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        rating_x = entry[1]
        for rating_y in ratings_by_item.get(entry[0], ()):
            sum_xy += rating_x * rating_y
            sum_y += rating_y * rating_y
            sum_x += rating_x * rating_x

    # A zero dot product also covers "nothing in common", where both norms
    # are zero and the division below would be undefined.
    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)

#
#
# Mean-centered (adjusted) cosine similarity
#
#
#
def calcSimlaryCosDist(user1, user2):
    """Return the mean-centered cosine similarity of two users.

    Each rating is first shifted by its owner's mean rating (the mean is
    taken over all of that user's entries).  The dot product and both
    squared norms are then accumulated only over entry pairs that share an
    item id.

    Args:
        user1: sequence of ``(item_id, rating)`` entries for the first user.
        user2: sequence of ``(item_id, rating)`` entries for the second user.
            Item ids must be hashable.

    Returns:
        The similarity as a float; ``0.0`` when either sequence is empty or
        when the centered dot product is zero (including no shared items).
    """
    # An empty rating list has no mean rating; without this guard the mean
    # computation below divides by zero.
    if not user1 or not user2:
        return 0.0

    avg_x = sum((entry[1] for entry in user1), 0.0) / len(user1)
    avg_y = sum((entry[1] for entry in user2), 0.0) / len(user2)

    # Index user2's centered ratings by item id (keeping their order) so
    # each user1 entry finds its matches without rescanning user2.
    centered_by_item = {}
    for entry in user2:
        centered_by_item.setdefault(entry[0], []).append(entry[1] - avg_y)

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        dev_x = entry[1] - avg_x
        for dev_y in centered_by_item.get(entry[0], ()):
            sum_xy += dev_x * dev_y
            sum_y += dev_y * dev_y
            sum_x += dev_x * dev_x

    # A zero dot product also covers the cases where a norm is zero and the
    # division below would be undefined.
    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)


#
# Read a file
#
#
def readFile(file_name):
    """Read a text file and return its lines.

    Args:
        file_name: path of the file to open in text read mode.

    Returns:
        A list with one string per line, line terminators included, exactly
        as produced by ``readlines()``.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    # NOTE(review): the file is decoded with the interpreter's default text
    # handling; confirm that matches the encoding of the data files.
    # The context manager closes the handle even if reading fails part-way.
    with open(file_name, "r") as handle:
        return handle.readlines()



#
# Parse the rating records. Line format: user id\titem id\tuser rating\ttimestamp
# Input: the raw data lines
# Output: the parsed rating information
#
def getRatingInformation(ratings):
    """Parse tab-separated rating lines into integer triples.

    Args:
        ratings: iterable of text lines; the first three tab-separated
            fields of each line are converted with ``int``.  Any further
            fields are ignored.

    Returns:
        A list of ``[field0, field1, field2]`` integer lists, one per
        non-blank input line, in input order.

    Raises:
        IndexError: if a non-blank line has fewer than three fields.
        ValueError: if one of the first three fields is not an integer.
    """
    rates = []
    for line in ratings:
        # Blank lines (e.g. a trailing empty line) carry no record; skip
        # them instead of crashing in int().
        if not line.strip():
            continue
        fields = line.split("\t")
        rates.append([int(fields[0]), int(fields[1]), int(fields[2])])
    return rates

#
# Build the data structures holding the users' ratings
#
# Input: all rating records, e.g. [[2,1,5],[2,4,2]...]
# Output: 1. user rating dictionary  2. movie dictionary
# A dictionary is used: the key is the user id, the value is that user's movie ratings,
# rate_dic[2]=[(1,5),(4,2)].... means user 2 rated movie 1 with 5 and movie 4 with 2
#
def createUserRankDic(rates):
    """Group rating records by user and build the item-to-users index.

    Args:
        rates: iterable of records indexable as ``[user_id, item_id,
            rating]``, e.g. ``[[2, 1, 5], [2, 4, 2]]``.

    Returns:
        A ``(user_rate_dic, item_to_user)`` tuple of plain dicts:

        * ``user_rate_dic[user_id]`` is the list of ``(item_id, rating)``
          tuples for that user, in input order.
        * ``item_to_user[item_id]`` is the list of user ids that rated the
          item, in input order.
    """
    user_rate_dic = {}
    item_to_user = {}
    for record in rates:
        user_id = record[0]
        item_id = record[1]
        # setdefault creates the list on first sight of a key and returns the
        # existing one afterwards, so both cases share one statement.
        user_rate_dic.setdefault(user_id, []).append((item_id, record[2]))
        item_to_user.setdefault(item_id, []).append(user_id)
    return user_rate_dic, item_to_user

#
# Find the neighbors nearest to the given user
# Input: the target user id, all user data, all item data
# Output: the list of neighbors nearest to the target user
#
def calcNearestNeighbor(userid, users_dic, item_dic):
    """Rank every user who shares at least one rated item with ``userid``.

    Args:
        userid: key of the target user in ``users_dic``.
        users_dic: mapping of user id to a list of ``(item_id, rating)``
            entries.
        item_dic: mapping of item id to the list of user ids that rated it.

    Returns:
        A list of ``[similarity, neighbor_id]`` lists sorted in descending
        order, where the similarity comes from ``calcSimlaryCosDist``.  The
        target user is never included.  The list is empty when nobody else
        rated any of the target user's items.

    Raises:
        KeyError: if ``userid`` is missing from ``users_dic`` or one of the
            target user's items is missing from ``item_dic``.
    """
    target_ratings = users_dic[userid]

    # Collect candidates in first-seen order.  The set makes the duplicate
    # check O(1); seeding it with the target excludes the user themself.
    neighbors = []
    seen = {userid}
    for item in target_ratings:
        for neighbor in item_dic[item[0]]:
            if neighbor not in seen:
                seen.add(neighbor)
                neighbors.append(neighbor)

    neighbors_dist = [
        [calcSimlaryCosDist(target_ratings, users_dic[neighbor]), neighbor]
        for neighbor in neighbors
    ]
    # Lists compare element-wise: highest similarity first, ties broken by
    # the larger neighbor id.
    neighbors_dist.sort(reverse=True)
    return neighbors_dist

#
# Recommend using UserFC (user-based collaborative filtering)
# Input: file name, user id, number of neighbors
# Output: recommended movie ids, the input user's movie list, the inverted movie-to-users table, the neighbor list
#
def recommendByUserFC(file_name, userid, k=5):
    """Recommend movies to ``userid`` from the ratings of similar users.

    The rating file is read and parsed, the ``k`` most similar neighbors
    are selected, and every movie rated by a neighbor is scored with the
    sum of the similarities of the neighbors who rated it.

    Args:
        file_name: path of the tab-separated rating file.
        userid: id of the user to recommend for.
        k: maximum number of neighbors to use (default 5).

    Returns:
        A 4-tuple ``(recommended, user_movies, item_to_user, neighbors)``:

        * ``recommended``: movie ids ordered by descending score.  Movies the
          target user already rated are NOT filtered out; callers can use
          ``user_movies`` to do so.
        * ``user_movies``: movie ids rated by ``userid``.
        * ``item_to_user``: mapping of movie id to the users who rated it.
        * ``neighbors``: the selected ``[similarity, neighbor_id]`` lists.

    Raises:
        KeyError: if ``userid`` does not occur in the rating file.
    """
    # Load the file and turn it into [[user id, movie id, rating], ...].
    test_contents = readFile(file_name)
    test_rates = getRatingInformation(test_contents)

    # Index the ratings two ways:
    #   1. user dictionary:  dic[user id]  = [(movie id, rating), ...]
    #   2. movie dictionary: dic[movie id] = [user id 1, user id 2, ...]
    test_dic, test_item_to_user = createUserRankDic(test_rates)

    # Keep only the k best-ranked neighbors.
    neighbors = calcNearestNeighbor(userid, test_dic, test_item_to_user)[:k]

    # A movie's score is the summed similarity of the neighbors who rated
    # it; the rating values themselves are not used.
    recommend_dic = {}
    for neighbor in neighbors:
        similarity = neighbor[0]
        for movie in test_dic[neighbor[1]]:
            movie_id = movie[0]
            if movie_id in recommend_dic:
                recommend_dic[movie_id] += similarity
            else:
                recommend_dic[movie_id] = similarity

    # Sort [score, movie id] pairs so the best-scored movie comes first.
    recommend_list = sorted(
        ([score, movie_id] for movie_id, score in recommend_dic.items()),
        reverse=True,
    )
    user_movies = [entry[0] for entry in test_dic[userid]]

    return [entry[1] for entry in recommend_list], user_movies, test_item_to_user, neighbors



#
#
# Load the movie list
#
#
#
def getMoviesList(file_name):
    """Load the movie catalogue from a ``|``-separated text file.

    Args:
        file_name: path of the movie file; the first ``|``-separated field
            of every line is converted with ``int`` and used as the id.

    Returns:
        A dict mapping the integer id to the list of the remaining fields
        of that line.  Fields are kept as read, so the last field of a line
        still carries its line terminator.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    movies_contents = readFile(file_name)
    movies_info = {}
    for movie in movies_contents:
        # Blank lines carry no record; skip them instead of crashing in int().
        if not movie.strip():
            continue
        movie_info = movie.split("|")
        movies_info[int(movie_info[0])] = movie_info[1:]
    return movies_info


# Main program
# Input: the test data set
if __name__ == '__main__':
    # Python 2 only: switch the implicit str/unicode conversion codec to
    # UTF-8.  `reload` and `sys.setdefaultencoding` are unavailable on
    # Python 3, so the branch is skipped there.
    if sys.version_info[0] < 3:
        reload(sys)  # noqa: F821 - builtin on Python 2
        sys.setdefaultencoding('utf-8')

    movies = getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")
    recommend_list, user_movie, items_movie, neighbors = recommendByUserFC(
        "/Users/wuyinghao/Downloads/ml-100k/u.data", 179, 80)
    # A set, because it is only used for membership tests below.
    neighbors_id = set(i[1] for i in neighbors)

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    # All three columns are rendered as text ('t').
    table.set_cols_dtype(['t', 't', 't'])
    table.set_cols_align(["l", "l", "l"])

    rows = [[u"movie name", u"release", u"from userid"]]
    # Show the 20 best-scored recommendations.
    for movie_id in recommend_list[:20]:
        # Neighbors who rated this movie, i.e. the users the recommendation
        # comes from; this fills the "from userid" column.
        from_user = [
            user_id for user_id in items_movie[movie_id]
            if user_id in neighbors_id
        ]
        rows.append([
            movies[movie_id][0],
            movies[movie_id][1],
            ", ".join(str(user_id) for user_id in from_user),
        ])
    table.add_rows(rows)
    # Parenthesised single-argument form works on both Python 2 and 3.
    print(table.draw())