- 在线时间
- 2759 小时
- 最后登录
- 2017-9-15
- 注册时间
- 2011-4-3
- 听众数
- 538
- 收听数
- 4
- 能力
- 80 分
- 体力
- 1764 点
- 威望
- 27 点
- 阅读权限
- 150
- 积分
- 5990
- 相册
- 0
- 日志
- 0
- 记录
- 5
- 帖子
- 6675
- 主题
- 3503
- 精华
- 8
- 分享
- 6
- 好友
- 1721
TA的每日心情 | 开心 2017-2-7 15:12 |
---|
签到天数: 691 天 [LV.9]以坛为家II
群组: 2013年国赛赛前培训 群组: 2014年地区赛数学建模 群组: 数学中国第二期SAS培训 群组: 物联网工程师考试 群组: 2013年美赛优秀论文解 |
# -*- coding=utf-8 -*-
from __future__ import print_function

import math
import sys

from texttable import Texttable

def calcCosDistSpe(user1, user2):
    """Return an overlap similarity between the items two users "liked".

    An entry counts as liked when its rating is strictly greater than the
    mean of all ratings in the same list.  The score is

        (number of matching liked entries) / sqrt(len(user1) * len(user2))

    Args:
        user1: list of (item_id, rating) pairs for the first user.
        user2: list of (item_id, rating) pairs for the second user.

    Returns:
        The similarity as a float; 0.0 when either list is empty.
    """
    # Guard first: both the means and the normaliser divide by the lengths.
    if not user1 or not user2:
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Count user2's liked entries per item once, so matching is a dictionary
    # lookup instead of a nested scan over both lists.
    liked_by_y = {}
    for entry in user2:
        if entry[1] > avg_y:
            liked_by_y[entry[0]] = liked_by_y.get(entry[0], 0) + 1

    common = 0.0
    for entry in user1:
        if entry[1] > avg_x:
            common += liked_by_y.get(entry[0], 0)

    return common / math.sqrt(len(user1) * len(user2) * 1.0)


def calcCosDist(user1, user2):
    """Return the cosine similarity of two users over their co-rated items.

    Only entries whose item id appears in both lists contribute to the dot
    product and to the two squared norms.

    Args:
        user1: list of (item_id, rating) pairs for the first user.
        user2: list of (item_id, rating) pairs for the second user.

    Returns:
        The similarity as a float; 0.0 when the dot product is zero (which
        includes the case of no shared items).
    """
    # Index user2's ratings by item so each user1 entry is matched with a
    # lookup rather than a scan of the whole of user2.
    ratings_y = {}
    for entry in user2:
        ratings_y.setdefault(entry[0], []).append(entry[1])

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        rating_x = entry[1]
        for rating_y in ratings_y.get(entry[0], ()):
            sum_xy += rating_x * rating_y
            sum_y += rating_y * rating_y
            sum_x += rating_x * rating_x

    # A non-zero dot product implies both norms are non-zero, so the
    # division below cannot hit zero.
    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)


def calcSimlaryCosDist(user1, user2):
    """Return a mean-centred cosine similarity between two users.

    Each rating is first reduced by the mean of all ratings in its own list;
    the cosine is then taken over the entries whose item id appears in both
    lists.

    Args:
        user1: list of (item_id, rating) pairs for the first user.
        user2: list of (item_id, rating) pairs for the second user.

    Returns:
        The similarity as a float; 0.0 when either list is empty or when the
        centred dot product is zero.
    """
    # Guard first: the means below divide by the list lengths.
    if not user1 or not user2:
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Index user2's centred ratings by item so matching is a lookup rather
    # than a nested scan.
    centred_y = {}
    for entry in user2:
        centred_y.setdefault(entry[0], []).append(entry[1] - avg_y)

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        dev_x = entry[1] - avg_x
        for dev_y in centred_y.get(entry[0], ()):
            sum_xy += dev_x * dev_y
            sum_y += dev_y * dev_y
            sum_x += dev_x * dev_x

    # A non-zero dot product implies both norms are non-zero.
    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)


def readFile(file_name):
    """Read a text file and return its lines.

    Args:
        file_name: path of the file to read.

    Returns:
        A list with one string per line, line endings included.
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, "r") as f:
        return f.readlines()


def getRatingInformation(ratings):
    """Parse raw rating lines into integer triples.

    Each line is expected to be tab-separated in the order
    user id, item id, rating, timestamp; only the first three fields are
    used.

    Args:
        ratings: iterable of text lines.

    Returns:
        A list of [user_id, item_id, rating] lists of ints, in input order.
        Blank lines are skipped.
    """
    rates = []
    for line in ratings:
        # A trailing empty line would otherwise fail in int('').
        if not line.strip():
            continue
        fields = line.split("\t")
        rates.append([int(fields[0]), int(fields[1]), int(fields[2])])
    return rates


def createUserRankDic(rates):
    """Build the per-user rating index and the item-to-users index.

    Args:
        rates: list of [user_id, item_id, rating] rows,
            e.g. [[2, 1, 5], [2, 4, 2], ...].

    Returns:
        A tuple (user_rate_dic, item_to_user):
        user_rate_dic maps a user id to a list of (item_id, rating) tuples,
            e.g. user_rate_dic[2] == [(1, 5), (4, 2)] means user 2 rated
            item 1 with 5 and item 4 with 2;
        item_to_user maps an item id to the list of user ids that rated it.
        Both lists keep the order in which rows appear in ``rates``.
    """
    user_rate_dic = {}
    item_to_user = {}
    for row in rates:
        user_id, item_id, rating = row[0], row[1], row[2]
        user_rate_dic.setdefault(user_id, []).append((item_id, rating))
        item_to_user.setdefault(item_id, []).append(user_id)
    return user_rate_dic, item_to_user


def calcNearestNeighbor(userid, users_dic, item_dic):
    """Rank the users who share at least one rated item with ``userid``.

    Args:
        userid: id of the user to find neighbours for.
        users_dic: mapping user id -> list of (item_id, rating) pairs.
        item_dic: mapping item id -> list of user ids that rated the item.

    Returns:
        A list of [similarity, neighbor_id] pairs sorted in descending
        order, where similarity is calcSimlaryCosDist of the two users'
        rating lists.  ``userid`` itself is never included.
    """
    own_ratings = users_dic[userid]

    # ``seen`` gives O(1) de-duplication; seeding it with ``userid`` keeps
    # the user from being listed as their own neighbour.
    seen = set([userid])
    neighbors = []
    for item in own_ratings:
        for neighbor in item_dic[item[0]]:
            if neighbor not in seen:
                seen.add(neighbor)
                neighbors.append(neighbor)

    # Alternative metrics defined in this file: calcCosDist, calcCosDistSpe.
    neighbors_dist = [
        [calcSimlaryCosDist(own_ratings, users_dic[neighbor]), neighbor]
        for neighbor in neighbors
    ]
    neighbors_dist.sort(reverse=True)
    return neighbors_dist

def recommendByUserFC(file_name, userid, k=5):
    """Recommend items to a user with user-based collaborative filtering.

    Args:
        file_name: path of the tab-separated ratings file.
        userid: id of the user to recommend for.
        k: maximum number of nearest neighbours to use.

    Returns:
        A 4-tuple (recommended_ids, user_movies, item_to_user, neighbors):
        recommended_ids: item ids rated by the selected neighbours, ordered
            by descending summed neighbour similarity;
        user_movies: item ids rated by ``userid``;
        item_to_user: mapping item id -> list of user ids that rated it;
        neighbors: the selected [similarity, neighbor_id] pairs.
    """
    # Read the raw lines and parse them into [user_id, item_id, rating] rows.
    test_contents = readFile(file_name)
    test_rates = getRatingInformation(test_contents)

    # Build the two indexes:
    #   1. user index: dic[user_id] = [(item_id, rating), ...]
    #   2. item index: dic[item_id] = [user_id1, user_id2, ...]
    test_dic, test_item_to_user = createUserRankDic(test_rates)

    # Keep the k most similar neighbours.
    neighbors = calcNearestNeighbor(userid, test_dic, test_item_to_user)[:k]

    # Score each item by the summed similarity of the neighbours who rated it.
    recommend_dic = {}
    for neighbor in neighbors:
        dist, neighbor_user_id = neighbor[0], neighbor[1]
        for movie in test_dic[neighbor_user_id]:
            recommend_dic[movie[0]] = recommend_dic.get(movie[0], 0) + dist

    # Order by score, highest first (ties fall back to the item id).
    recommend_list = sorted(
        ([score, movie_id] for movie_id, score in recommend_dic.items()),
        reverse=True)

    user_movies = [rated[0] for rated in test_dic[userid]]

    # NOTE(review): recommended ids are not filtered against user_movies, so
    # items the user already rated can appear; callers can filter with the
    # second return value if that is not wanted.
    return ([entry[1] for entry in recommend_list], user_movies,
            test_item_to_user, neighbors)



def getMoviesList(file_name):
    """Load the movie catalogue from a "|"-separated text file.

    Args:
        file_name: path of the catalogue file; the first field of every
            line is the integer movie id.

    Returns:
        A dict mapping the int movie id to the list of the remaining fields
        of that line (strings; the last one keeps its line ending).  Blank
        lines are skipped.
    """
    movies_info = {}
    for movie in readFile(file_name):
        # A trailing empty line would otherwise fail in int('').
        if not movie.strip():
            continue
        movie_info = movie.split("|")
        movies_info[int(movie_info[0])] = movie_info[1:]
    return movies_info


# Main program.
# Input: the test data set (the files at the hard-coded paths below).
if __name__ == '__main__':
    # Python 2 only: switch the interpreter's default codec to UTF-8.
    # Neither reload() as a builtin nor sys.setdefaultencoding exists on
    # Python 3, so the calls are skipped there.
    if sys.version_info[0] < 3:
        reload(sys)
        sys.setdefaultencoding('utf-8')

    movies = getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")
    recommend_list, user_movie, items_movie, neighbors = recommendByUserFC(
        "/Users/wuyinghao/Downloads/ml-100k/u.data", 179, 80)

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    table.set_cols_dtype(['t', 't', 't'])  # all three columns are text
    table.set_cols_align(["l", "l", "l"])

    # NOTE(review): the third column is emitted empty, as before.  The
    # original collected the neighbour ids that rated each movie but never
    # displayed them; that unused computation has been removed.
    rows = [[u"movie name", u"release", u"from userid"]]
    for movie_id in recommend_list[:20]:
        rows.append([movies[movie_id][0], movies[movie_id][1], ""])
    table.add_rows(rows)
    print(table.draw())
|