- 在线时间
- 2759 小时
- 最后登录
- 2017-9-15
- 注册时间
- 2011-4-3
- 听众数
- 538
- 收听数
- 4
- 能力
- 80 分
- 体力
- 1764 点
- 威望
- 27 点
- 阅读权限
- 150
- 积分
- 5990
- 相册
- 0
- 日志
- 0
- 记录
- 5
- 帖子
- 6675
- 主题
- 3503
- 精华
- 3
- 分享
- 6
- 好友
- 1721
TA的每日心情 | 开心 2017-2-7 15:12 |
|---|
签到天数: 691 天 [LV.9]以坛为家II
群组: 2013年国赛赛前培训 群组: 2014年地区赛数学建模 群组: 数学中国第二期SAS培训 群组: 物联网工程师考试 群组: 2013年美赛优秀论文解 |
# -*- coding=utf-8 -*-

import math
import sys

from texttable import Texttable

#
# 使用 |A&B| / sqrt(|A| * |B|) 计算余弦距离
#
#
#
def calcCosDistSpe(user1, user2):
    """Return a set-overlap similarity between two users' rating lists.

    Each argument is a sequence of entries where ``entry[0]`` is an item id
    (must be hashable) and ``entry[1]`` is a numeric rating.  An item counts
    as "liked" by a user when its rating is strictly above that user's own
    mean rating.  The result is::

        (number of matching liked-item pairs) / sqrt(len(user1) * len(user2))

    Returns 0.0 when either list is empty (the mean is undefined there).
    """
    if not user1 or not user2:
        # Avoid dividing by zero when computing the means / the denominator.
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Count how many times each item appears among user2's above-average
    # ratings, so every (user1 entry, user2 entry) match is still counted
    # once per pair without scanning user2 for each user1 entry.
    liked_by_user2 = {}
    for entry in user2:
        if entry[1] > avg_y:
            liked_by_user2[entry[0]] = liked_by_user2.get(entry[0], 0) + 1

    shared = 0.0
    for entry in user1:
        if entry[1] > avg_x:
            shared += liked_by_user2.get(entry[0], 0)

    return shared / math.sqrt(len(user1) * len(user2) * 1.0)


#
# 计算余弦距离
#
#
def calcCosDist(user1, user2):
    """Return the cosine similarity of two users over their common items.

    Each argument is a sequence of entries where ``entry[0]`` is an item id
    (must be hashable) and ``entry[1]`` is a numeric rating.  Only items
    present in both lists contribute to the dot product and to both norms.

    Returns 0.0 when the dot product over the common items is zero, which
    includes the case of no common items or an empty list.
    """
    # Index user2's ratings by item id, keeping list order, so matches are
    # accumulated in the same order as a nested scan would visit them.
    ratings_by_item = {}
    for entry in user2:
        ratings_by_item.setdefault(entry[0], []).append(entry[1])

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        rating_x = entry[1]
        for rating_y in ratings_by_item.get(entry[0], ()):
            sum_xy += rating_x * rating_y
            sum_y += rating_y * rating_y
            sum_x += rating_x * rating_x

    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)

#
#
# 相似余弦距离
#
#
#
def calcSimlaryCosDist(user1, user2):
    """Return a mean-centred cosine similarity of two users' ratings.

    Each argument is a sequence of entries where ``entry[0]`` is an item id
    (must be hashable) and ``entry[1]`` is a numeric rating.  Every rating
    is first centred on its owner's mean, computed over *all* of that
    user's ratings; the cosine is then taken over the items present in
    both lists only.

    Returns 0.0 when either list is empty or when the centred dot product
    over the common items is zero (including no common items).
    """
    if not user1 or not user2:
        # The per-user mean is undefined for an empty list.
        return 0.0

    avg_x = sum(entry[1] for entry in user1) / float(len(user1))
    avg_y = sum(entry[1] for entry in user2) / float(len(user2))

    # Index user2's ratings by item id, preserving list order so the
    # floating-point accumulation order matches a nested scan.
    ratings_by_item = {}
    for entry in user2:
        ratings_by_item.setdefault(entry[0], []).append(entry[1])

    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    for entry in user1:
        dev_x = entry[1] - avg_x
        for rating_y in ratings_by_item.get(entry[0], ()):
            dev_y = rating_y - avg_y
            sum_xy += dev_x * dev_y
            sum_y += dev_y * dev_y
            sum_x += dev_x * dev_x

    if sum_xy == 0.0:
        return 0.0
    return sum_xy / math.sqrt(sum_x * sum_y)


#
# 读取文件
#
#
def readFile(file_name):
    """Return every line of the text file *file_name* as a list of strings.

    Lines keep their trailing newline, exactly as ``readlines()`` yields
    them.  The file is opened in text read mode and is closed even if
    reading raises.
    """
    with open(file_name, "r") as handle:
        return handle.readlines()

#
# 解压rating信息,格式:用户id\t影片id\t用户rating\t时间
# 输入:数据集合
# 输出:已经解压的排名信息
#
def getRatingInformation(ratings):
    """Parse tab-separated rating lines into ``[user, item, rating]`` lists.

    *ratings* is an iterable of text lines whose first three tab-separated
    fields are integers; any further fields (such as a timestamp) are
    ignored.  Lines containing only whitespace are skipped so a trailing
    blank line in the input does not abort parsing.

    Raises ValueError / IndexError for non-blank lines that do not start
    with three integer fields.
    """
    rates = []
    for line in ratings:
        if not line.strip():
            continue
        fields = line.split("\t")
        rates.append([int(fields[0]), int(fields[1]), int(fields[2])])
    return rates


#
# 生成用户评分的数据结构
#
#   输入:所有数据 [[2,1,5],[2,4,2]...]
#   输出:1.用户打分字典 2.电影字典
#   使用字典,key是用户id,value是用户对电影的评价,
#   rate_dic[2]=[(1,5),(4,2)].... 表示用户2对电影1的评分是5,对电影4的评分是2
#
def createUserRankDic(rates):
    """Build the user->ratings and item->users lookup tables.

    *rates* is an iterable of ``[user_id, item_id, rating]`` records.

    Returns a tuple ``(user_rate_dic, item_to_user)`` where
    ``user_rate_dic[user_id]`` is the list of ``(item_id, rating)`` tuples
    for that user and ``item_to_user[item_id]`` is the list of user ids
    that rated the item.  Both lists follow the order of *rates*.
    """
    user_rate_dic = {}
    item_to_user = {}
    for record in rates:
        user_id, item_id, rating = record[0], record[1], record[2]
        user_rate_dic.setdefault(user_id, []).append((item_id, rating))
        item_to_user.setdefault(item_id, []).append(user_id)
    return user_rate_dic, item_to_user

#
# 计算与指定用户最相近的邻居
# 输入:指定用户ID,所有用户数据,所有物品数据
# 输出:与指定用户最相邻的邻居列表
#
def calcNearestNeighbor(userid, users_dic, item_dic):
    """Rank every user who shares at least one item with *userid*.

    *users_dic* maps a user id to a list of ``(item_id, rating)`` entries
    and *item_dic* maps an item id to the list of user ids that rated it.

    Returns a list of ``[similarity, neighbor_id]`` pairs sorted in
    descending order, where similarity is computed by
    ``calcSimlaryCosDist`` (``calcCosDist`` and ``calcCosDistSpe`` accept
    the same arguments and can be swapped in).

    Raises KeyError if *userid* is missing from *users_dic* or one of the
    user's items is missing from *item_dic*.
    """
    target_ratings = users_dic[userid]

    # Collect each candidate once; the set makes the duplicate check O(1)
    # while the list keeps first-seen order.
    seen = set()
    neighbors = []
    for item in target_ratings:
        for neighbor in item_dic[item[0]]:
            if neighbor != userid and neighbor not in seen:
                seen.add(neighbor)
                neighbors.append(neighbor)

    neighbors_dist = [
        [calcSimlaryCosDist(target_ratings, users_dic[neighbor]), neighbor]
        for neighbor in neighbors
    ]
    neighbors_dist.sort(reverse=True)
    return neighbors_dist


#
# 使用UserCF进行推荐
# 输入:文件名,用户ID,邻居数量
# 输出:推荐的电影ID,输入用户的电影列表,电影对应用户的反序表,邻居列表
#
def recommendByUserFC(file_name, userid, k=5):
    """Recommend items to *userid* from the ratings of the k nearest users.

    *file_name* is a ratings file readable by ``readFile`` and parseable by
    ``getRatingInformation``; *k* is the number of nearest neighbors used.

    Returns a 4-tuple:
      1. item ids ordered by descending score, where an item's score is
         the sum of the similarities of the neighbors that rated it;
      2. the item ids rated by *userid*;
      3. the item -> list-of-user-ids table;
      4. the chosen neighbors as ``[similarity, user_id]`` pairs.

    NOTE(review): items that *userid* has already rated are not removed
    from the recommendation list; callers can filter with element 2.
    """
    # Load the file and turn it into [[user_id, item_id, rating], ...].
    test_contents = readFile(file_name)
    test_rates = getRatingInformation(test_contents)

    # Lookup tables:
    #   test_dic[user_id]          -> [(item_id, rating), ...]
    #   test_item_to_user[item_id] -> [user_id, ...]
    test_dic, test_item_to_user = createUserRankDic(test_rates)

    # Keep only the k most similar users.
    neighbors = calcNearestNeighbor(userid, test_dic, test_item_to_user)[:k]

    # Score every item rated by a neighbor with the summed similarity of
    # the neighbors that rated it.
    recommend_dic = {}
    for similarity, neighbor_user_id in neighbors:
        for movie in test_dic[neighbor_user_id]:
            movie_id = movie[0]
            if movie_id in recommend_dic:
                recommend_dic[movie_id] += similarity
            else:
                recommend_dic[movie_id] = similarity

    # Order by score, breaking ties by item id, both descending.
    recommend_list = [[score, movie_id] for movie_id, score in recommend_dic.items()]
    recommend_list.sort(reverse=True)

    user_movies = [entry[0] for entry in test_dic[userid]]
    recommended_ids = [pair[1] for pair in recommend_list]
    return recommended_ids, user_movies, test_item_to_user, neighbors


#
#
# 获取电影的列表
#
#
#
def getMoviesList(file_name):
    """Load a ``|``-separated movie file into a dict keyed by movie id.

    Each non-blank line is split on ``|``; the first field is converted to
    ``int`` and used as the key, and the remaining fields (unmodified, so
    the last one may still carry the line's newline) become the value list.
    Lines containing only whitespace are skipped.
    """
    movies_info = {}
    for line in readFile(file_name):
        if not line.strip():
            continue
        fields = line.split("|")
        movies_info[int(fields[0])] = fields[1:]
    return movies_info


#主程序
#输入 : 测试数据集合
if __name__ == '__main__':
    # Script entry point: print the top-20 recommendations for user 179,
    # using 80 neighbors, as a three-column text table.
    if sys.version_info[0] < 3:
        # Python 2 only: force the implicit str/unicode codec to UTF-8.
        # Neither reload() nor sys.setdefaultencoding() exists on Python 3.
        reload(sys)
        sys.setdefaultencoding('utf-8')

    movies = getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")
    recommend_list, user_movie, items_movie, neighbors = recommendByUserFC(
        "/Users/wuyinghao/Downloads/ml-100k/u.data", 179, 80)
    # Set of neighbor ids for fast membership tests below.
    neighbors_id = set(i[1] for i in neighbors)

    table = Texttable()
    table.set_deco(Texttable.HEADER)
    # All three columns are rendered as text.
    table.set_cols_dtype(['t', 't', 't'])
    table.set_cols_align(["l", "l", "l"])

    rows = [[u"movie name", u"release", u"from userid"]]
    for movie_id in recommend_list[:20]:
        # Neighbors that rated this movie, i.e. where the recommendation
        # came from; shown in the "from userid" column.
        from_user = [user_id for user_id in items_movie[movie_id]
                     if user_id in neighbors_id]
        rows.append([movies[movie_id][0],
                     movies[movie_id][1],
                     ", ".join(str(user_id) for user_id in from_user)])
    table.add_rows(rows)
    print(table.draw())
|