- 在线时间
- 2759 小时
- 最后登录
- 2017-9-15
- 注册时间
- 2011-4-3
- 听众数
- 538
- 收听数
- 4
- 能力
- 80 分
- 体力
- 1764 点
- 威望
- 27 点
- 阅读权限
- 150
- 积分
- 5990
- 相册
- 0
- 日志
- 0
- 记录
- 5
- 帖子
- 6675
- 主题
- 3503
- 精华
- 3
- 分享
- 6
- 好友
- 1721
TA的每日心情 | 开心 2017-2-7 15:12 |
|---|
签到天数: 691 天 [LV.9]以坛为家II
群组: 2013年国赛赛前培训 群组: 2014年地区赛数学建模 群组: 数学中国第二期SAS培训 群组: 物联网工程师考试 群组: 2013年美赛优秀论文解 |
# -*- coding=utf-8 -*-
( }- s# @, D M, @2 D- ^, c/ U' a$ N( f9 B5 y# G
import math6 D2 [$ u/ ^$ F% ?; S
import sys
- L0 M; i/ G; p( J- S* Q8 Bfrom texttable import Texttable
0 D8 P( B5 a% C$ N! i
p/ y% C0 M- V! F M
9 {; Q" r1 r3 }% V$ p#
! O. |3 x" Y- Z: Z) ]+ p7 K* f' X# 使用 |A&B|/sqrt(|A || B |)计算余弦距离
! H1 Z/ D* j4 _) J, k" d#
; f4 M, Y( {; J! g( E#
* a3 A8 ~, N0 Z#
6 E" {2 J, j* v, }& idef calcCosDistSpe(user1,user2):! y/ S0 V$ D, _
avg_x=0.05 F6 M2 K0 d- f
avg_y=0.0
: h4 l G( W+ `9 X' [ for key in user1:
/ G3 O/ R9 ?5 B avg_x+=key[1]
; _4 |& K. V" u1 }: r& L! K avg_x=avg_x/len(user1)
% h9 I* {4 v [8 ?/ M# l 4 K7 ]4 N; R( K8 L
for key in user2:3 t* A* F6 ^2 Z# K2 U
avg_y+=key[1]
, f. w4 T) X" y avg_y=avg_y/len(user2); i; n5 H+ p8 ^" X# c
; `1 N& H2 K7 l; R% W2 C
u1_u2=0.0
* C9 z2 f/ b0 F+ {1 q- c0 _ for key1 in user1: `$ W6 K. w6 ^5 n9 G
for key2 in user2:/ E# s+ }: p3 c, v- O- b; s- X' }+ C
if key1[1] > avg_x and key2[1]>avg_y and key1[0]==key2[0]:
. x8 [. ~4 j; m) Q/ f' A4 P- h u1_u2+=1
, m# o1 M5 A& y1 k( G( v u1u2=len(user1)*len(user2)*1.0
2 b" l9 W! B& N* M sx_sy=u1_u2/math.sqrt(u1u2)# l& @" g/ n7 l3 ~( {
return sx_sy
: e$ v5 G. W3 s) Y5 Y- p1 g7 I( u- G; k) A1 a z, ?3 ~
$ g3 b1 E$ f8 M; d- g#9 d- \' ~% Y4 j) H$ Z9 c
# 计算余弦距离- R0 N a6 k$ k& E* ^7 g1 q
#2 E& e1 e6 j2 Q( B6 l7 e5 B. @
#
- m& S$ g; S- O2 k: j8 O* Edef calcCosDist(user1,user2):
8 ]3 k5 h }$ A1 q! o$ { sum_x=0.0
" k1 q1 o `7 X, j. p8 W sum_y=0.0
' u" W3 p$ }/ u7 q% ^0 y& y& D sum_xy=0.0
) l( O6 ^# `3 X3 w( \: ~# a6 @ for key1 in user1:3 _1 ?5 F2 I! i. `, t7 ^
for key2 in user2:
" h" l! T, d. b F( B if key1[0]==key2[0] :' @ I7 h1 f s& @$ L
sum_xy+=key1[1]*key2[1]
& L0 i+ B7 O5 U R: o* @0 V, l5 { sum_y+=key2[1]*key2[1]
# }8 [8 J( i& G* O: P sum_x+=key1[1]*key1[1]
/ U$ R M3 z# z% l* S- Y1 u
" Q; e! _* f" }/ X, q' A+ ~ if sum_xy == 0.0 :
& b, U0 \- g& i( u9 b( c, X return 0
/ y. c# U$ f) h2 Z% v sx_sy=math.sqrt(sum_x*sum_y) ; s" P) h1 a3 ~, n5 Y2 @
return sum_xy/sx_sy
# @; @+ S2 t. o% U2 N
, @1 S- m3 P3 Z; \% _" E
) G/ ?1 y0 ^; \+ Q; h% G#' J: R* p$ e* Z0 [, i. p* p7 ^* n
#
: j* q2 J! n* i' Q5 O; w# 相似余弦距离
, z$ @' M/ O- A1 B#; s5 W4 r& `0 [9 v
#4 L; t+ b3 W# g$ o- L% t7 `
#
6 n5 ~1 b) Z# T: qdef calcSimlaryCosDist(user1,user2):
$ Y5 L' M) v" H" X5 X/ V sum_x=0.0
8 n+ F2 f& C( n3 j" d sum_y=0.0 r0 ^2 R* X w9 Z- t8 ?: ]/ y9 q
sum_xy=0.0
8 V: L) |5 {+ v3 d: h# D avg_x=0.0
4 E2 g& J1 K* }* R avg_y=0.0
1 V( N2 h2 u* }( V8 W7 s for key in user1:
& J" ]+ M+ F3 l* {' [ avg_x+=key[1]
' Z/ A, I9 I, S$ r: n5 W$ ^ avg_x=avg_x/len(user1). n6 j) W) p Y* i5 z+ N
) k M9 ?. g2 G
for key in user2:
% x1 o# z" i# f' W- |: d avg_y+=key[1]
+ F- h# @! x. K8 P avg_y=avg_y/len(user2)
: _% x2 c5 |/ P& o7 f $ E( D2 ?0 v9 m* W
for key1 in user1:. }; ~& D7 c4 [) D
for key2 in user2:" b7 O' h( S+ |- e" U; P
if key1[0]==key2[0] :
5 [- I. ]/ Z- M5 @1 w e+ y sum_xy+=(key1[1]-avg_x)*(key2[1]-avg_y)
+ f) P4 b, [0 N# i* p5 @ sum_y+=(key2[1]-avg_y)*(key2[1]-avg_y), S1 h) c: X8 d* G8 M
sum_x+=(key1[1]-avg_x)*(key1[1]-avg_x)+ u8 D" ]; @* @# S/ H$ X# ^4 y
! C: R3 O0 @- Y" v t; u
if sum_xy == 0.0 :
% Y% ?! _! s( [4 P4 C$ h return 0+ G, U. n4 s/ E4 c8 Q; k
sx_sy=math.sqrt(sum_x*sum_y)
* `) e( }; J* c' l e return sum_xy/sx_sy) d8 I! T0 {0 R' |" V
* k$ X, V7 m! ~+ [- H. ^) Q2 F
4 p# D2 I/ e+ R; n#
% M1 ]' F; Y: E0 P: j7 ?# 读取文件" |+ f: e l) @& W+ B3 g$ F! A
#9 o6 H# t7 S% u& q# i! m0 N3 Z) @7 b
#" `/ S6 z: e% b9 Z
def readFile(file_name):
, l- U" w5 z4 G! l6 Q contents_lines=[]
! R- e: u) n. r" A. ] f=open(file_name,"r")3 }' M0 v2 E+ _* X
contents_lines=f.readlines()
. v5 U1 N/ ~/ Q8 B% b f.close()
* W0 H5 Q) Y3 R' V4 { return contents_lines* ^7 S' j( v. |) U4 d' p/ i
$ l8 p. A# E4 }- J J& a
8 Q# X2 ~2 h0 P6 e9 r* T: _. p1 P# [* j n2 J h! c9 m. c! i
#
. A h* c @0 A0 i# 解压rating信息,格式:用户id\t硬盘id\t用户rating\t时间
' M/ o; w$ h; A+ K" x; {* H# 输入:数据集合& a" M" E2 m8 k( e! s$ f: q3 j1 g
# 输出:已经解压的排名信息# U4 Y4 S! T1 K2 }. j! [
#. q" n$ d7 _/ v( Y$ v: [" O! Y4 M0 u3 D
def getRatingInformation(ratings):
; M4 ]& @; P$ s! y7 G2 N% t rates=[]
) P, h" K Q, p ], [: k5 @ for line in ratings:
7 V0 L# |1 `! G# D$ [( ~/ Q) `: S rate=line.split("\t")
; @8 e" R9 W4 e* A' m4 ~ rates.append([int(rate[0]),int(rate[1]),int(rate[2])])
( ]$ v' w" t: @' y1 r return rates5 R7 L) R: ]! v5 C
0 r' G/ Y* v# b; P5 E
4 s9 O1 \8 B) R6 H T4 ^& Q# R2 n#
3 a/ W4 ~1 Q5 ~% f7 v# 生成用户评分的数据结构; }% v2 o9 R$ F& V( N
#
" @$ M4 x0 I( H J! `5 m+ {) i# 输入:所以数据 [[2,1,5],[2,4,2]...]4 y8 N3 \+ ]5 |& x7 z
# 输出:1.用户打分字典 2.电影字典
5 Y5 h( H3 {/ B* p6 g c) A4 G# 使用字典,key是用户id,value是用户对电影的评价,, e J* X$ t# H, `: L
# rate_dic[2]=[(1,5),(4,2)].... 表示用户2对电影1的评分是5,对电影4的评分是2( |5 g Z9 r& p# N
#
* g* u+ C3 B5 `def createUserRankDic(rates):: z% h: u$ q- B2 y
user_rate_dic={}
' n9 c n9 V2 C- e3 f4 s item_to_user={}
. F& |5 |1 N1 h0 v2 j4 T+ b$ I for i in rates:
4 ^, O3 u" i& F& J user_rank=(i[1],i[2])
9 \ V0 W: y# S! r0 V if i[0] in user_rate_dic:# t, ?2 g" N) o7 @0 J) Z8 P/ w
user_rate_dic[i[0]].append(user_rank)
9 }% I s7 U$ Y6 ?! U) ^ else:
& ~2 w( G$ g3 w. k- Q" m user_rate_dic[i[0]]=[user_rank]/ d* m+ _1 _' n. |3 y$ h: e8 L
& j( p5 G4 @0 [$ d, z8 Y( P
if i[1] in item_to_user:
& S7 o: g3 R# L5 g2 ` item_to_user[i[1]].append(i[0])$ ~( ?% P% F1 `! F0 N
else:, k$ m& d# {3 R- a0 e6 u
item_to_user[i[1]]=[i[0]]; ?( D; K. J9 c9 M9 p( e
/ ]8 C: E9 R" K4 c% h3 J
return user_rate_dic,item_to_user& N% E0 i I$ I }" N: V
; W2 y8 S1 @# B- @4 T
$ e7 B0 M- h) |( ?. e/ o#
9 g- q F3 D' U Y6 y5 J# 计算与指定用户最相近的邻居
& j9 c% X0 a1 Z: Z8 I `/ h# 输入:指定用户ID,所以用户数据,所以物品数据
9 q: P1 S! n' g# 输出:与指定用户最相邻的邻居列表
. b- z0 T% l2 H+ \#
; m" A1 [' f5 M1 J' j1 K2 Ldef calcNearestNeighbor(userid,users_dic,item_dic):
$ J% j/ j$ i" G- l8 D' E neighbors=[]
$ e4 w* v) L3 u M #neighbors.append(userid)" v5 J( P6 m( F2 J
for item in users_dic[userid]:' d& I9 a% e7 q5 g# ^
for neighbor in item_dic[item[0]]:4 Z Y! z! D/ j; \9 T0 J- q. n
if neighbor != userid and neighbor not in neighbors: ; L) k, w: a9 a, s+ F/ ^- K. I
neighbors.append(neighbor), f/ p' N$ g8 `+ ?: r9 r E: Y9 B
$ T4 u5 E9 e9 {: M neighbors_dist=[]
6 J: t6 b0 s, x for neighbor in neighbors:
9 @: c- ^$ u) f: z dist=calcSimlaryCosDist(users_dic[userid],users_dic[neighbor]) #calcSimlaryCosDist calcCosDist calcCosDistSpe) I; Q4 C* q: Q
neighbors_dist.append([dist,neighbor])
, \/ q t7 v) g neighbors_dist.sort(reverse=True)8 u0 j9 a% }( s6 c
#print neighbors_dist( G$ u" \$ z( h0 f: d/ J
return neighbors_dist5 U' m6 }4 p6 ~
% ?7 q6 T8 a: ~% E9 g! a, f" O n$ _: @
#( h+ b* ^( ~/ Z. {! E
# 使用UserFC进行推荐
1 v0 S$ t0 N. S, Z8 u# 输入:文件名,用户ID,邻居数量* c8 u; X$ [0 F" @
# 输出:推荐的电影ID,输入用户的电影列表,电影对应用户的反序表,邻居列表
# l" L4 z/ W' r1 n#
# Z4 I. O) Y Vdef recommendByUserFC(file_name,userid,k=5):
/ A6 h) \5 i4 H # b- f; F# O- |6 l
#读取文件数据
( @) e0 ^4 {. n' ~ test_contents=readFile(file_name)5 w: y% q" r6 b f) b F" E# E+ v# v
8 r/ b+ ?, ]! ]( O, K6 k3 d4 s5 H n #文件数据格式化成二维数组 List[[用户id,电影id,电影评分]...]
7 J7 r0 n. V2 e' Z test_rates=getRatingInformation(test_contents)
8 x* \1 \% z+ p 1 W/ l7 \: e$ X0 r3 Z3 z
#格式化成字典数据
( \, h" C/ Q o# o& B6 q+ H # 1.用户字典:dic[用户id]=[(电影id,电影评分)...]0 F0 k; v: g M" I$ z: O3 s5 |) G
# 2.电影字典:dic[电影id]=[用户id1,用户id2...]" D+ d4 Z. X. G9 _* Z+ S" G
test_dic,test_item_to_user=createUserRankDic(test_rates)
+ C7 Y a, z2 ?
* J. G0 p" D$ k% r* Z6 ^; m #寻找邻居
7 J8 c0 }7 h5 R3 q neighbors=calcNearestNeighbor(userid,test_dic,test_item_to_user)[:k]* |: a1 X: \# ]% ~0 p- U
9 ]( y2 n! X, v1 Y5 a
recommend_dic={}# a9 L) S: T/ {6 J% }/ z) b7 A8 Y
for neighbor in neighbors:0 R' O1 g" J" k5 [) x8 E" g9 Z
neighbor_user_id=neighbor[1]
' [9 i2 I5 |$ r! y" j movies=test_dic[neighbor_user_id]. d5 M1 U, Y0 G1 z ^
for movie in movies:/ ]3 G0 H: J2 z" X: f
#print movie
, R# Q- C4 J9 C* u% J if movie[0] not in recommend_dic:0 i+ Z1 m! ~% F1 u- F
recommend_dic[movie[0]]=neighbor[0]3 @ S2 G# _* _+ X, ?% P/ a+ Q: Z
else:; j. l7 B; Q% Y! c
recommend_dic[movie[0]]+=neighbor[0]) q$ M$ r. n# t) n
#print len(recommend_dic)
* B$ O; U9 x+ |3 f+ { ! m4 M1 U% n6 }6 C0 j3 s
#建立推荐列表1 ]; D* e% `( i+ q6 O4 R
recommend_list=[]
! J0 c U& d# ^ for key in recommend_dic:
: E# C" E, R" ]$ W #print key
: x" ~& t4 X6 p recommend_list.append([recommend_dic[key],key])
' c0 N" T/ Q7 y' u6 ~ 1 `1 }2 G2 L! w3 m/ p& r* g
% i' @. U) e2 J& J0 ~& X2 f
recommend_list.sort(reverse=True)+ Q5 f9 }! l4 |- W/ Z* ]
#print recommend_list' A4 b) o2 e( U3 `; n9 j
user_movies = [ i[0] for i in test_dic[userid]]
4 I: `5 R' \, h7 k5 a, e) V' f, \* O+ P4 S- v; b
return [i[1] for i in recommend_list],user_movies,test_item_to_user,neighbors! S% c4 x/ O' I: e- |
: h; G: J6 m$ [ J7 ? ' R! d' V) i% }9 T
% I# i( v. M6 f z0 a
#
d1 S9 i5 L' j8 {- a& C#
# i6 F- b6 t/ s5 e0 n' x s# 获取电影的列表# e4 s7 c, G3 ]# G
#0 E5 T9 p% Y$ O$ w) I. l
#
) ?# S( V8 G' }9 l#
$ P; V$ Z! _1 f! V. R" U! Ydef getMoviesList(file_name):8 G3 W7 b/ N' I7 v. M" ?
#print sys.getdefaultencoding()5 u- ?; |/ R; ?) s& v$ r' b
movies_contents=readFile(file_name)* C- W9 Q4 ^: j& ]4 ^6 m' g
movies_info={}
1 p. q- t* n2 t2 G8 o$ \) D$ Q1 k- V for movie in movies_contents:, k& j/ [2 J5 n$ F+ ~; |. [
movie_info=movie.split("|") b$ E% [9 G0 J+ Y! X
movies_info[int(movie_info[0])]=movie_info[1:]
" T I! v' |" l return movies_info9 |) O! _0 m: Q( }* e
8 z# @" r5 h) \, ?) {5 @) @
5 _9 N# b( [+ w. A$ u 0 b. x3 e3 l" i: L3 Q
#主程序
* |' B1 Y( n9 a3 J$ E" _* r#输入 : 测试数据集合
- ]6 u4 a: v5 |0 ]9 }5 A" p' vif __name__ == '__main__':% y& s* f2 A) `" {* P
reload(sys); Q% C' ~" T1 _! j$ S3 }# H
sys.setdefaultencoding('utf-8')
% Q0 Z" M& z8 y, Z2 S movies=getMoviesList("/Users/wuyinghao/Downloads/ml-100k/u.item")' r5 P6 W3 O o) | A6 c0 u
recommend_list,user_movie,items_movie,neighbors=recommendByUserFC("/Users/wuyinghao/Downloads/ml-100k/u.data",179,80)8 _# a' `* x) a
neighbors_id=[ i[1] for i in neighbors]+ B) B% ~' x; u& _2 C
table = Texttable()
7 I: Z8 u' U" _2 h# k table.set_deco(Texttable.HEADER) h% X( _% x3 l8 M2 i# y6 c# P
table.set_cols_dtype(['t', # text
" s$ A! c6 x- u6 }2 [7 e" D; B 't', # float (decimal)
& Q3 n* C) `* Y2 G; z* `# ?! ` 't']) # automatic
4 \4 Y2 m- Z9 {' r' O table.set_cols_align(["l", "l", "l"]). D- L7 o% u) G( q: a7 R5 Q
rows=[]
$ g5 u) ^& @- a0 v3 J rows.append([u"movie name",u"release", u"from userid"])
( ^ p' D) f( N3 D3 J for movie_id in recommend_list[:20]:
+ T4 b. u3 \ P7 V from_user=[]. w2 s/ ?, a& Q; M& m }; R1 j' ~
for user_id in items_movie[movie_id]:
* C C$ [3 H2 v8 Q+ Y if user_id in neighbors_id: ~) N2 p& d# g( Q2 W: H9 f
from_user.append(user_id)# ^8 J( J' n/ {8 o. V! g" k
rows.append([movies[movie_id][0],movies[movie_id][1],""])" _0 M3 I# u6 R( o
table.add_rows(rows)
* n' x: d4 h. n7 T3 O* Q print table.draw() |
|