- 在线时间
- 2759 小时
- 最后登录
- 2017-9-15
- 注册时间
- 2011-4-3
- 听众数
- 538
- 收听数
- 4
- 能力
- 80 分
- 体力
- 1764 点
- 威望
- 27 点
- 阅读权限
- 150
- 积分
- 5990
- 相册
- 0
- 日志
- 0
- 记录
- 5
- 帖子
- 6675
- 主题
- 3503
- 精华
- 3
- 分享
- 6
- 好友
- 1721
TA的每日心情 | 开心 2017-2-7 15:12 |
|---|
签到天数: 691 天 [LV.9]以坛为家II
群组: 2013年国赛赛前培训 群组: 2014年地区赛数学建模 群组: 数学中国第二期SAS培训 群组: 物联网工程师考试 群组: 2013年美赛优秀论文解 |
# -*- coding=utf-8 -*-
import math
import sys

from texttable import Texttable


#
# Compute the cosine distance as |A&B| / sqrt(|A| * |B|)
#
#
#
def calcCosDistSpe(user1, user2):
    """Return a set-overlap similarity between two users' rating lists.

    An item counts as shared when both users rated it strictly above their
    own average rating.  The score is the number of such shared
    (user1 entry, user2 entry) pairs divided by
    ``sqrt(len(user1) * len(user2))``.

    Args:
        user1: sequence of ``(item_id, rating)`` entries of the first user.
        user2: sequence of ``(item_id, rating)`` entries of the second user.
            Item ids must be hashable.

    Returns:
        The similarity as a float; ``0.0`` when either list is empty.
    """
    if not user1 or not user2:
        # No ratings to average or compare; also avoids dividing by zero.
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Per item, count how many of user2's ratings lie above user2's average,
    # so each of user1's entries needs one lookup instead of a full scan.
    liked_by_user2 = {}
    for entry in user2:
        if entry[1] > avg_y:
            liked_by_user2[entry[0]] = liked_by_user2.get(entry[0], 0) + 1

    u1_u2 = 0.0
    for entry in user1:
        if entry[1] > avg_x:
            u1_u2 += liked_by_user2.get(entry[0], 0)

    return u1_u2 / math.sqrt(len(user1) * len(user2) * 1.0)


#
# Compute the cosine distance
#
#
def calcCosDist(user1, user2):
    """Return the cosine similarity of two users over their co-rated items.

    Only items present in both lists contribute: the dot product and both
    squared norms are accumulated over the matching entries.

    NOTE(review): the nesting of the three accumulations could not be
    recovered from the damaged source; this assumes all of them are taken
    over co-rated items only, which keeps the measure symmetric - confirm.

    Args:
        user1: sequence of ``(item_id, rating)`` entries of the first user.
        user2: sequence of ``(item_id, rating)`` entries of the second user.
            Item ids must be hashable.

    Returns:
        The similarity as a float; ``0.0`` when the users share no item or
        the dot product is zero.
    """
    # Group user2's ratings by item so matching is a lookup, not a scan.
    ratings_of_user2 = {}
    for entry in user2:
        ratings_of_user2.setdefault(entry[0], []).append(entry[1])

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        for rating2 in ratings_of_user2.get(entry[0], ()):
            sum_xy += entry[1] * rating2
            sum_y += rating2 * rating2
            sum_x += entry[1] * entry[1]

    if sum_xy == 0.0:
        return 0.0
    sx_sy = math.sqrt(sum_x * sum_y)
    if sx_sy == 0.0:
        # Only reachable through floating-point underflow; never divide by 0.
        return 0.0
    return sum_xy / sx_sy


#
#
# Similarity cosine distance (ratings are mean-centered per user)
#
#
#
def calcSimlaryCosDist(user1, user2):
    """Return a mean-centered cosine similarity of two users.

    Each rating is first reduced by the average of all ratings of its user;
    the centered ratings of the items both users rated are then combined
    like a cosine similarity.

    NOTE(review): the nesting of the three accumulations could not be
    recovered from the damaged source; this assumes all of them are taken
    over co-rated items only, which keeps the measure symmetric - confirm.

    Args:
        user1: sequence of ``(item_id, rating)`` entries of the first user.
        user2: sequence of ``(item_id, rating)`` entries of the second user.
            Item ids must be hashable.

    Returns:
        The similarity as a float; ``0.0`` when either list is empty, the
        users share no item, or the centered dot product is zero.
    """
    if not user1 or not user2:
        # An average over zero ratings is undefined; treat as "not similar".
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Group user2's ratings by item so matching is a lookup, not a scan.
    ratings_of_user2 = {}
    for entry in user2:
        ratings_of_user2.setdefault(entry[0], []).append(entry[1])

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        dev_x = entry[1] - avg_x
        for rating2 in ratings_of_user2.get(entry[0], ()):
            dev_y = rating2 - avg_y
            sum_xy += dev_x * dev_y
            sum_y += dev_y * dev_y
            sum_x += dev_x * dev_x

    if sum_xy == 0.0:
        return 0.0
    sx_sy = math.sqrt(sum_x * sum_y)
    if sx_sy == 0.0:
        # Only reachable through floating-point underflow; never divide by 0.
        return 0.0
    return sum_xy / sx_sy


#
# Read a file
#
#
def readFile(file_name):
    """Read a text file and return its lines.

    Args:
        file_name: path of the file to read.

    Returns:
        A list with one string per line, exactly as produced by
        ``readlines()`` (line endings are kept).
    """
    # The with-block closes the file even when reading raises.
    with open(file_name, "r") as f:
        return f.readlines()


#
# Unpack the rating records; line format: user id\titem id\tuser rating\ttimestamp
# Input: the data set (lines of the ratings file)
# Output: the unpacked rating information
#
def getRatingInformation(ratings):
    """Parse tab-separated rating lines into integer triples.

    Only the first three tab-separated fields of each line are used; any
    further fields are ignored.  Blank lines are skipped.

    Args:
        ratings: iterable of text lines.

    Returns:
        A list of ``[field0, field1, field2]`` lists of ints, in input order.

    Raises:
        ValueError: if one of the first three fields is not an integer.
        IndexError: if a non-blank line has fewer than three fields.
    """
    rates = []
    for line in ratings:
        if not line.strip():
            # e.g. a trailing empty line at the end of the file
            continue
        rate = line.split("\t")
        rates.append([int(rate[0]), int(rate[1]), int(rate[2])])
    return rates


#
# Build the data structures that hold the user ratings
#
# Input: all rating data, e.g. [[2,1,5],[2,4,2]...]
# Output: 1. user rating dictionary  2. movie dictionary
# The user dictionary is keyed by user id; the value lists that user's movie ratings,
# e.g. rate_dic[2]=[(1,5),(4,2)] means user 2 rated movie 1 with 5 and movie 4 with 2
#
def createUserRankDic(rates):
    """Index rating records by user and by item.

    Args:
        rates: iterable of records whose first three elements are
            ``user_id, item_id, rating``.

    Returns:
        A tuple ``(user_rate_dic, item_to_user)`` where
        ``user_rate_dic[user_id]`` is the list of ``(item_id, rating)``
        tuples of that user and ``item_to_user[item_id]`` is the list of
        user ids that rated the item, both in input order.
    """
    user_rate_dic = {}
    item_to_user = {}
    for record in rates:
        user_id, item_id, rating = record[0], record[1], record[2]
        # setdefault creates the list on first sight of a key.
        user_rate_dic.setdefault(user_id, []).append((item_id, rating))
        item_to_user.setdefault(item_id, []).append(user_id)
    return user_rate_dic, item_to_user


#
# Find the neighbors closest to the given user
# Input: the user id, all user data, all item data
# Output: list of the neighbors closest to the given user
#
def calcNearestNeighbor(userid, users_dic, item_dic, dist_func=None):
    """Rank every user that shares at least one item with ``userid``.

    Args:
        userid: id of the user whose neighbors are wanted.
        users_dic: mapping ``user_id -> [(item_id, rating), ...]``.
        item_dic: mapping ``item_id -> [user_id, ...]``.
        dist_func: optional callable ``(ratings_a, ratings_b) -> number``
            used to score a neighbor; defaults to ``calcSimlaryCosDist``
            (``calcCosDist`` and ``calcCosDistSpe`` are drop-in
            alternatives).

    Returns:
        A list of ``[score, neighbor_id]`` pairs sorted in descending order
        (by score, ties by neighbor id).  ``userid`` itself is never listed.

    Raises:
        KeyError: if ``userid`` or one of its items is missing from the
            given dictionaries.
    """
    if dist_func is None:
        dist_func = calcSimlaryCosDist

    own_ratings = users_dic[userid]

    # Collect each candidate once; the set makes the duplicate check O(1).
    # Seeding it with userid keeps the user out of the own neighbor list.
    seen = set([userid])
    neighbors = []
    for item in own_ratings:
        for neighbor in item_dic[item[0]]:
            if neighbor not in seen:
                seen.add(neighbor)
                neighbors.append(neighbor)

    neighbors_dist = [[dist_func(own_ratings, users_dic[neighbor]), neighbor]
                      for neighbor in neighbors]
    neighbors_dist.sort(reverse=True)
    return neighbors_dist


#
# Recommend with UserFC (user-based collaborative filtering)
# Input: file name, user id, number of neighbors
# Output: recommended movie ids, the input user's movie list, the inverted movie-to-users table, the neighbor list
#
def recommendByUserFC(file_name, userid, k=5):
    """Recommend movies to a user from the ratings of the nearest neighbors.

    The ratings file is loaded, the ``k`` best-scoring neighbors of
    ``userid`` are selected, and every movie rated by one of them is scored
    with the sum of the similarity scores of the neighbors that rated it.

    Args:
        file_name: path of the tab-separated ratings file.
        userid: id of the user to recommend for.
        k: number of neighbors to use.

    Returns:
        A tuple ``(recommended, user_movies, item_to_user, neighbors)``:
        movie ids ordered by descending score (ties by descending id), the
        movie ids rated by ``userid``, the ``movie_id -> [user_id, ...]``
        dictionary, and the ``[score, neighbor_id]`` pairs that were used.

    Raises:
        KeyError: if ``userid`` does not occur in the ratings file.
    """
    # Raw lines -> [[user_id, movie_id, rating], ...]
    test_contents = readFile(file_name)
    test_rates = getRatingInformation(test_contents)

    # 1. user dictionary:  dic[user_id]  = [(movie_id, rating), ...]
    # 2. movie dictionary: dic[movie_id] = [user_id1, user_id2, ...]
    test_dic, test_item_to_user = createUserRankDic(test_rates)

    # Keep only the k best-scoring neighbors.
    neighbors = calcNearestNeighbor(userid, test_dic, test_item_to_user)[:k]

    # A movie's score is the summed similarity of the neighbors who rated it;
    # the rating value itself is not used.
    recommend_dic = {}
    for neighbor in neighbors:
        similarity, neighbor_user_id = neighbor[0], neighbor[1]
        for movie in test_dic[neighbor_user_id]:
            recommend_dic[movie[0]] = recommend_dic.get(movie[0], 0.0) + similarity

    recommend_list = [[score, movie_id]
                      for movie_id, score in recommend_dic.items()]
    recommend_list.sort(reverse=True)

    # NOTE(review): movies the user has already rated are not removed from
    # the recommendations; user_movies is returned so a caller can filter.
    user_movies = [movie[0] for movie in test_dic[userid]]

    return ([entry[1] for entry in recommend_list], user_movies,
            test_item_to_user, neighbors)


#
#
# Load the movie list
#
#
#
def getMoviesList(file_name):
    """Load the movie catalogue from a ``|``-separated file.

    Args:
        file_name: path of the movie file; the first field of every line is
            the integer movie id.

    Returns:
        A dict mapping the movie id to the list of the remaining fields of
        its line (as strings, without the trailing line break).  Blank
        lines are skipped.

    Raises:
        ValueError: if the first field of a line is not an integer.
    """
    movies_info = {}
    for movie in readFile(file_name):
        # Drop the line break so it does not end up in the last field.
        movie = movie.rstrip("\r\n")
        if not movie.strip():
            continue
        movie_info = movie.split("|")
        movies_info[int(movie_info[0])] = movie_info[1:]
    return movies_info


# Main program
# Input: the test data set
if __name__ == '__main__':
    # Python 2 only: make UTF-8 the default codec for implicit str/unicode
    # conversions.  Neither call exists on Python 3, where it is not needed.
    if sys.version_info[0] < 3:
        reload(sys)
        sys.setdefaultencoding('utf-8')

    movies = getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")
    recommend_list, user_movie, items_movie, neighbors = recommendByUserFC(
        "/Users/wuyinghao/Downloads/ml-100k/u.data", 179, 80)

    # A set gives O(1) membership tests in the loop below.
    neighbors_id = set(i[1] for i in neighbors)

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    table.set_cols_dtype(['t', 't', 't'])  # all three columns are plain text
    table.set_cols_align(["l", "l", "l"])

    rows = [[u"movie name", u"release", u"from userid"]]
    for movie_id in recommend_list[:20]:
        # Neighbors that rated this movie, i.e. where the recommendation
        # comes from.  The original computed this list but left the column
        # empty.
        from_user = [str(user_id) for user_id in items_movie[movie_id]
                     if user_id in neighbors_id]
        rows.append([movies[movie_id][0], movies[movie_id][1],
                     ", ".join(from_user)])
    table.add_rows(rows)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(table.draw())
|