- 在线时间
- 2759 小时
- 最后登录
- 2017-9-15
- 注册时间
- 2011-4-3
- 听众数
- 538
- 收听数
- 4
- 能力
- 80 分
- 体力
- 1764 点
- 威望
- 27 点
- 阅读权限
- 150
- 积分
- 5990
- 相册
- 0
- 日志
- 0
- 记录
- 5
- 帖子
- 6675
- 主题
- 3503
- 精华
- 8
- 分享
- 6
- 好友
- 1721
TA的每日心情 | 开心 2017-2-7 15:12 |
|---|
签到天数: 691 天 [LV.9]以坛为家II
群组: 2013年国赛赛前培训 群组: 2014年地区赛数学建模 群组: 数学中国第二期SAS培训 群组: 物联网工程师考试 群组: 2013年美赛优秀论文解 |
# -*- coding=utf-8 -*-
import math
import sys

from texttable import Texttable

#
#   Compute the cosine distance as |A&B| / sqrt(|A| * |B|)
#
def calcCosDistSpe(user1, user2):
    """Return |A & B| / sqrt(|user1| * |user2|) for two rating lists.

    Args:
        user1: sequence of ``(item_id, rating)`` entries for the first user.
        user2: sequence of ``(item_id, rating)`` entries for the second user.

    Returns:
        A float. The numerator counts the pairs of entries (one from each
        user) that refer to the same item and are both rated strictly above
        their owner's mean rating; the denominator is the square root of the
        product of the two list lengths. Returns 0.0 when either list is
        empty (the original divided by zero in that case).
    """
    if not user1 or not user2:
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Count, per item, how many of user2's entries are above user2's mean.
    # Keeping a count (rather than a set) reproduces the pair-wise tally of
    # the original nested loop even if an item id appears more than once.
    liked_by_user2 = {}
    for entry in user2:
        if entry[1] > avg_y:
            liked_by_user2[entry[0]] = liked_by_user2.get(entry[0], 0) + 1

    common = 0
    for entry in user1:
        if entry[1] > avg_x:
            common += liked_by_user2.get(entry[0], 0)

    return common / math.sqrt(len(user1) * len(user2) * 1.0)


#
#   Compute the cosine distance
#
def calcCosDist(user1, user2):
    """Return the cosine similarity of two users over their co-rated items.

    Args:
        user1: sequence of ``(item_id, rating)`` entries for the first user.
        user2: sequence of ``(item_id, rating)`` entries for the second user.

    Returns:
        ``sum(x*y) / sqrt(sum(x*x) * sum(y*y))`` where the sums run over the
        pairs of entries that share an item id, or 0.0 when the dot product
        is zero (for example when the users have no item in common).
    """
    # Index user2's ratings by item so each entry of user1 is matched with a
    # dictionary lookup instead of a scan over all of user2.
    ratings_of_user2 = {}
    for entry in user2:
        ratings_of_user2.setdefault(entry[0], []).append(entry[1])

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        rating_x = entry[1]
        for rating_y in ratings_of_user2.get(entry[0], ()):
            sum_xy += rating_x * rating_y
            sum_y += rating_y * rating_y
            sum_x += rating_x * rating_x

    # A non-zero dot product implies both norms are non-zero, so the
    # division below cannot hit zero once this guard has passed.
    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)


#
#   Similarity cosine distance (ratings centred on each user's mean)
#
def calcSimlaryCosDist(user1, user2):
    """Return the mean-centred cosine similarity of two users.

    Each rating is first centred on its owner's mean rating (the mean is
    taken over all of that user's entries); the cosine is then computed over
    the pairs of entries that share an item id, using the same accumulation
    as ``calcCosDist``.

    Args:
        user1: sequence of ``(item_id, rating)`` entries for the first user.
        user2: sequence of ``(item_id, rating)`` entries for the second user.

    Returns:
        A float. 0.0 when either list is empty (the original divided by
        zero) or when the centred dot product is zero.
    """
    if not user1 or not user2:
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Index user2's centred ratings by item to avoid the nested scan.
    centred_user2 = {}
    for entry in user2:
        centred_user2.setdefault(entry[0], []).append(entry[1] - avg_y)

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        dev_x = entry[1] - avg_x
        for dev_y in centred_user2.get(entry[0], ()):
            sum_xy += dev_x * dev_y
            sum_y += dev_y * dev_y
            # Both norms are accumulated over co-rated items only, which
            # keeps the measure symmetric in its two arguments.
            sum_x += dev_x * dev_x

    # A non-zero dot product implies both norms are non-zero.
    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)


#
#   Read a file
#
def readFile(file_name):
    """Return every line of ``file_name`` as a list of strings.

    The file is opened in text read mode and each returned line keeps its
    trailing newline, exactly as ``readlines`` yields it.

    Args:
        file_name: path of the file to read.

    Returns:
        A list with one string per line of the file.
    """
    # The context manager closes the handle even if reading raises.
    with open(file_name, "r") as handle:
        return handle.readlines()


#
#   Parse the rating records; line format: user id\tmovie id\tuser rating\ttimestamp
#   Input:  the data set (list of lines)
#   Output: the parsed rating information
#
def getRatingInformation(ratings):
    """Parse tab-separated rating lines into integer triples.

    Args:
        ratings: iterable of strings; the first three tab-separated fields
            of each line are converted with ``int``. Any further fields are
            ignored.

    Returns:
        A list of ``[field0, field1, field2]`` integer lists, one per
        non-blank input line, in input order.

    Raises:
        ValueError: if one of the first three fields is not an integer.
        IndexError: if a non-blank line has fewer than three fields.
    """
    rates = []
    for line in ratings:
        # Blank lines (e.g. a trailing empty line) carry no record; the
        # original crashed on them.
        if not line.strip():
            continue
        fields = line.split("\t")
        rates.append([int(fields[0]), int(fields[1]), int(fields[2])])
    return rates


#
#   Build the data structures holding the users' ratings
#
#   Input:  all the data, e.g. [[2,1,5],[2,4,2]...]
#   Output: 1. the user rating dictionary  2. the movie dictionary
#   The key of the user dictionary is the user id and the value is that
#   user's movie ratings:
#   rate_dic[2]=[(1,5),(4,2)]... means user 2 rated movie 1 with 5 and movie 4 with 2
#
def createUserRankDic(rates):
    """Group rating triples by user and build the item-to-users index.

    Args:
        rates: iterable of indexable records; element 0 is used as the user
            key, element 1 as the item key and element 2 as the rating.

    Returns:
        A ``(user_rate_dic, item_to_user)`` tuple where
        ``user_rate_dic[user]`` is the list of ``(item, rating)`` tuples of
        that user and ``item_to_user[item]`` is the list of users that have
        a record for that item. Both lists follow input order.
    """
    user_rate_dic = {}
    item_to_user = {}
    for record in rates:
        user, item, rating = record[0], record[1], record[2]
        user_rate_dic.setdefault(user, []).append((item, rating))
        item_to_user.setdefault(item, []).append(user)
    return user_rate_dic, item_to_user

#
#   Find the nearest neighbours of the given user
#   Input:  the user id, all the user data, all the item data
#   Output: the list of the neighbours closest to the given user
#
def calcNearestNeighbor(userid, users_dic, item_dic):
    """Rank every user sharing at least one item with ``userid``.

    Args:
        userid: key of the target user in ``users_dic``.
        users_dic: mapping user -> list of ``(item, rating)`` entries.
        item_dic: mapping item -> list of users having a record for it.

    Returns:
        A list of ``[similarity, user]`` pairs sorted in descending order,
        where the similarity is ``calcSimlaryCosDist`` between the target
        user's entries and the candidate's entries. The target user itself
        is never included.

    Raises:
        KeyError: if ``userid`` is missing from ``users_dic`` or one of its
            items is missing from ``item_dic``.
    """
    own_ratings = users_dic[userid]

    # Collect candidates in first-seen order; the set makes the duplicate
    # check O(1) instead of scanning the growing list. Seeding it with the
    # target user excludes that user from the candidates.
    seen = set([userid])
    neighbors = []
    for entry in own_ratings:
        for candidate in item_dic[entry[0]]:
            if candidate not in seen:
                seen.add(candidate)
                neighbors.append(candidate)

    # Alternative measures defined in this file: calcCosDist, calcCosDistSpe.
    neighbors_dist = [
        [calcSimlaryCosDist(own_ratings, users_dic[candidate]), candidate]
        for candidate in neighbors
    ]
    neighbors_dist.sort(reverse=True)
    return neighbors_dist


#
#   Recommend with user-based collaborative filtering (UserFC)
#   Input:  the file name, the user id, the number of neighbours
#   Output: the recommended movie ids, the movie list of the input user,
#           the inverted movie -> users table, the neighbour list
#
def recommendByUserFC(file_name, userid, k=5):
    """Recommend items to ``userid`` from the ratings of its nearest users.

    Args:
        file_name: path of the rating file, read with ``readFile`` and
            parsed with ``getRatingInformation``.
        userid: key of the user to recommend for.
        k: number of top-ranked neighbours to use (default 5).

    Returns:
        A 4-tuple ``(recommended, user_movies, item_to_user, neighbors)``:
        ``recommended`` is the list of item ids rated by the selected
        neighbours, ordered by descending accumulated neighbour similarity;
        ``user_movies`` is the list of item ids rated by ``userid``;
        ``item_to_user`` is the item -> users dictionary built from the
        file; ``neighbors`` is the list of the selected
        ``[similarity, user]`` pairs.

    Raises:
        KeyError: if ``userid`` has no record in the file.
    """
    # Read the raw lines of the rating file.
    test_contents = readFile(file_name)

    # Turn them into a 2-D list [[user id, movie id, rating], ...].
    test_rates = getRatingInformation(test_contents)

    # Build the two lookup dictionaries:
    #    1. user dictionary:  dic[user id]  = [(movie id, rating), ...]
    #    2. movie dictionary: dic[movie id] = [user id 1, user id 2, ...]
    test_dic, test_item_to_user = createUserRankDic(test_rates)

    # Keep the k most similar neighbours.
    neighbors = calcNearestNeighbor(userid, test_dic, test_item_to_user)[:k]

    # Score every movie rated by a neighbour with the sum of the
    # similarities of the neighbours that rated it.
    recommend_dic = {}
    for neighbor in neighbors:
        similarity = neighbor[0]
        for movie in test_dic[neighbor[1]]:
            movie_id = movie[0]
            if movie_id in recommend_dic:
                recommend_dic[movie_id] += similarity
            else:
                recommend_dic[movie_id] = similarity

    # Build the recommendation list, best score first.
    recommend_list = sorted(
        ([score, movie_id] for movie_id, score in recommend_dic.items()),
        reverse=True)

    # NOTE(review): movies already rated by `userid` are not removed from
    # the recommendations; callers get `user_movies` and can filter.
    user_movies = [entry[0] for entry in test_dic[userid]]

    return ([entry[1] for entry in recommend_list], user_movies,
            test_item_to_user, neighbors)
+ [* d0 O% p8 c. P7 X7 m8 _- ]7 ^ . X& U0 k+ f8 l* m) T. n
; f3 T9 }6 b4 P" ?: z W9 x. i
- U0 I3 M! p( F) e8 K. i
#& h; l7 X# S3 K9 [2 g1 ?
#
0 m0 [, a- d$ C+ B6 e* W6 d# 获取电影的列表" v/ ?! e& v' |
## i' [- K3 Q/ k* D
#, \- U2 @- o8 D9 [' H) O
#
def getMoviesList(file_name):
    """Load the movie catalogue from a ``|``-separated file.

    Args:
        file_name: path of the movie file, read with ``readFile``.

    Returns:
        A dictionary mapping the integer value of the first field of each
        non-blank line to the list of the remaining fields. Lines are not
        stripped, so the last field keeps the line's trailing newline.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    movies_info = {}
    for line in readFile(file_name):
        # Skip blank lines; the original crashed on int('').
        if not line.strip():
            continue
        fields = line.split("|")
        movies_info[int(fields[0])] = fields[1:]
    return movies_info


# Main program
# Input: the test data set
if __name__ == '__main__':
    # Python 2 only: force the default str<->unicode codec to UTF-8.
    # Neither `reload` (as a builtin) nor `sys.setdefaultencoding` exists on
    # Python 3, so the hack is skipped there.
    if sys.version_info[0] < 3:
        reload(sys)
        sys.setdefaultencoding('utf-8')

    movies = getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")
    recommend_list, user_movie, items_movie, neighbors = recommendByUserFC(
        "/Users/wuyinghao/Downloads/ml-100k/u.data", 179, 80)

    # Set of the selected neighbours' ids for O(1) membership tests below.
    neighbors_id = set(neighbor[1] for neighbor in neighbors)

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    table.set_cols_dtype(['t',   # text
                          't',   # text
                          't'])  # text
    table.set_cols_align(["l", "l", "l"])

    rows = [[u"movie name", u"release", u"from userid"]]
    for movie_id in recommend_list[:20]:
        # Neighbours that rated this movie, i.e. where the recommendation
        # comes from. The original computed this list but always printed an
        # empty third column.
        from_user = [user_id for user_id in items_movie[movie_id]
                     if user_id in neighbors_id]
        rows.append([movies[movie_id][0],
                     movies[movie_id][1],
                     ", ".join(str(user_id) for user_id in from_user)])
    table.add_rows(rows)
    # Parenthesised single-argument form works as a Python 2 print
    # statement and as the Python 3 print function.
    print(table.draw())