{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np \n",
"from surprise import SVD\n",
"from surprise import KNNBaseline\n",
"from surprise.model_selection import train_test_split\n",
"from surprise.model_selection import LeaveOneOut\n",
"from surprise import Reader\n",
"from surprise import Dataset\n",
"from surprise import accuracy\n",
"from surprise.model_selection import train_test_split\n",
"from collections import defaultdict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据在此下载 ###\n",
"https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/ez_douban/intro.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"评分数量:2604995\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2410169 | \n",
" 1727 | \n",
" 9984 | \n",
" 2 | \n",
" Beauty and the Beast | \n",
"
\n",
" \n",
" | 1239338 | \n",
" 7328 | \n",
" 1602 | \n",
" 5 | \n",
" Firelight | \n",
"
\n",
" \n",
" | 1780092 | \n",
" 4470 | \n",
" 4008 | \n",
" 5 | \n",
" Kramer vs. Kramer | \n",
"
\n",
" \n",
" | 971816 | \n",
" 620 | \n",
" 1146 | \n",
" 4 | \n",
" Reservoir Dogs | \n",
"
\n",
" \n",
" | 1086919 | \n",
" 5560 | \n",
" 1341 | \n",
" 2 | \n",
" Silent Hill | \n",
"
\n",
" \n",
" | 1106322 | \n",
" 7670 | \n",
" 1362 | \n",
" 3 | \n",
" Tais-toi! | \n",
"
\n",
" \n",
" | 41897 | \n",
" 5241 | \n",
" 42 | \n",
" 2 | \n",
" Closer | \n",
"
\n",
" \n",
" | 2362262 | \n",
" 5224 | \n",
" 9173 | \n",
" 2 | \n",
" Sundae in New York | \n",
"
\n",
" \n",
" | 1117186 | \n",
" 1898 | \n",
" 1383 | \n",
" 4 | \n",
" Zodiac | \n",
"
\n",
" \n",
" | 2816937 | \n",
" 6071 | \n",
" 48616 | \n",
" 3 | \n",
" 舞牛 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" userId movieId rating title\n",
"2410169 1727 9984 2 Beauty and the Beast\n",
"1239338 7328 1602 5 Firelight\n",
"1780092 4470 4008 5 Kramer vs. Kramer\n",
"971816 620 1146 4 Reservoir Dogs\n",
"1086919 5560 1341 2 Silent Hill\n",
"1106322 7670 1362 3 Tais-toi!\n",
"41897 5241 42 2 Closer\n",
"2362262 5224 9173 2 Sundae in New York\n",
"1117186 1898 1383 4 Zodiac\n",
"2816937 6071 48616 3 舞牛"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the Douban movie metadata and the user ratings (paths are relative to the notebook).\n",
"movies = pd.read_csv('./data/douban/movies.csv')\n",
"ratings = pd.read_csv('./data/douban/ratings.csv')\n",
"\n",
"# Attach each rating to its movie, drop the unused timestamp column and\n",
"# discard rows whose movie has no title.\n",
"combine_movie_rating = (\n",
"    pd.merge(ratings, movies, on='movieId', how='inner')\n",
"    .drop(['timestamp'], axis=1)\n",
"    .dropna(axis=0, subset=['title'])\n",
")\n",
"print('评分数量:%d' % len(combine_movie_rating))\n",
"\n",
"# Fixed seed so the preview shows the same rows on every Restart & Run All.\n",
"combine_movie_rating.sample(10, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" title | \n",
" totalRatingCount | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 1 | \n",
" 4 | \n",
" Harry Potter and the Deathly Hallows: Part II | \n",
" 1703 | \n",
"
\n",
" \n",
" | 1 | \n",
" 21 | \n",
" 1 | \n",
" 4 | \n",
" Harry Potter and the Deathly Hallows: Part II | \n",
" 1703 | \n",
"
\n",
" \n",
" | 2 | \n",
" 25 | \n",
" 1 | \n",
" 5 | \n",
" Harry Potter and the Deathly Hallows: Part II | \n",
" 1703 | \n",
"
\n",
" \n",
" | 3 | \n",
" 34 | \n",
" 1 | \n",
" 4 | \n",
" Harry Potter and the Deathly Hallows: Part II | \n",
" 1703 | \n",
"
\n",
" \n",
" | 4 | \n",
" 36 | \n",
" 1 | \n",
" 5 | \n",
" Harry Potter and the Deathly Hallows: Part II | \n",
" 1703 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" userId movieId rating title \\\n",
"0 0 1 4 Harry Potter and the Deathly Hallows: Part II \n",
"1 21 1 4 Harry Potter and the Deathly Hallows: Part II \n",
"2 25 1 5 Harry Potter and the Deathly Hallows: Part II \n",
"3 34 1 4 Harry Potter and the Deathly Hallows: Part II \n",
"4 36 1 5 Harry Potter and the Deathly Hallows: Part II \n",
"\n",
" totalRatingCount \n",
"0 1703 \n",
"1 1703 \n",
"2 1703 \n",
"3 1703 \n",
"4 1703 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of ratings each movie received, as a (movieId, totalRatingCount) frame.\n",
"movie_rating_count = (\n",
"    combine_movie_rating\n",
"    .groupby(['movieId'])['rating']\n",
"    .count()\n",
"    .reset_index()\n",
"    .rename(columns={'rating': 'totalRatingCount'})\n",
")\n",
"\n",
"# Add the per-movie count to every rating row.\n",
"rating_with_totalRatingCount = pd.merge(\n",
"    combine_movie_rating, movie_rating_count, on='movieId'\n",
")\n",
"rating_with_totalRatingCount.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.90 2351.0\n",
"0.91 2441.0\n",
"0.92 2654.0\n",
"0.93 2814.0\n",
"0.94 2958.0\n",
"0.95 3062.0\n",
"0.96 3330.0\n",
"0.97 3731.0\n",
"0.98 4432.0\n",
"0.99 5072.0\n",
"Name: totalRatingCount, dtype: float64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Upper-tail quantiles of totalRatingCount, taken over rating rows;\n",
"# the 0.90 value is what the next cell uses as its popularity cut-off.\n",
"upper_quantile_levels = np.arange(.9, 1, .01)\n",
"rating_with_totalRatingCount['totalRatingCount'].quantile(upper_quantile_levels)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"热门电影数据量:262455\n"
]
}
],
"source": [
"# Keep only ratings of the most popular movies: rows whose movie's rating count\n",
"# is at or above the 0.90 quantile of totalRatingCount. The quantile is taken\n",
"# over rating rows, so this keeps roughly the top 10% of ratings rather than\n",
"# the top 10% of distinct movies.\n",
"rating_with_totalRatingCount = combine_movie_rating.merge(movie_rating_count, left_on='movieId', right_on='movieId')\n",
"\n",
"# Derived from the data instead of hard-coding the value read off the previous\n",
"# cell's output (it evaluated to 2351 when this notebook was last run).\n",
"popular_threshold = rating_with_totalRatingCount['totalRatingCount'].quantile(0.9)\n",
"popular_movies_rating = rating_with_totalRatingCount.query('totalRatingCount>=@popular_threshold')\n",
"print('热门电影数据量:%d' % len(popular_movies_rating))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 0.5501276131831262\n",
"MAE: 0.6968515836030541\n"
]
},
{
"data": {
"text/plain": [
"[Prediction(uid=1426, iid=1130, r_ui=5.0, est=3.6235014684425333, details={'was_impossible': False}),\n",
" Prediction(uid=9899, iid=609, r_ui=4.0, est=4.427492924992114, details={'was_impossible': False}),\n",
" Prediction(uid=10837, iid=150, r_ui=5.0, est=4.827915792544979, details={'was_impossible': False}),\n",
" Prediction(uid=4643, iid=995, r_ui=4.0, est=4.211769405268239, details={'was_impossible': False}),\n",
" Prediction(uid=9769, iid=247, r_ui=5.0, est=4.254900765324765, details={'was_impossible': False}),\n",
" Prediction(uid=5582, iid=170, r_ui=4.0, est=4.31240196218265, details={'was_impossible': False}),\n",
" Prediction(uid=6605, iid=156, r_ui=5.0, est=4.245027154626259, details={'was_impossible': False}),\n",
" Prediction(uid=9884, iid=21, r_ui=4.0, est=3.824987946672277, details={'was_impossible': False}),\n",
" Prediction(uid=17492, iid=96, r_ui=4.0, est=4.72638009370838, details={'was_impossible': False}),\n",
" Prediction(uid=25829, iid=738, r_ui=5.0, est=4.788603891699279, details={'was_impossible': False})]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): the ratings visible in this notebook's outputs are 1-5; confirm\n",
"# that the 0.5 lower bound of the scale really matches the data.\n",
"reader = Reader(rating_scale=(0.5, 5))\n",
"data = Dataset.load_from_df(popular_movies_rating[['userId', 'movieId', 'rating']], reader)\n",
"\n",
"# Hold out 25% of the ratings and fit SVD on the rest; both steps are seeded.\n",
"train, test = train_test_split(data, test_size=.25, random_state=0)\n",
"svd_model = SVD(random_state=0)\n",
"svd_model.fit(train)\n",
"predict = svd_model.test(test)\n",
"\n",
"# Pair each label with its own metric (RMSE -> accuracy.rmse, MAE -> accuracy.mae).\n",
"# Sanity check when reading the numbers: RMSE can never be smaller than MAE.\n",
"print(\"RMSE: \", accuracy.rmse(predict, verbose=False))\n",
"print(\"MAE: \", accuracy.mae(predict, verbose=False))\n",
"predict[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"code_folding": [
0
]
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def get_top_n(predictions, n=10):\n",
"    \"\"\"Group predictions by user and keep each user's n best-scored items.\n",
"\n",
"    Args:\n",
"        predictions: iterable of 5-tuples (uid, iid, true_r, est, details).\n",
"        n: maximum number of (iid, est) pairs kept per user.\n",
"\n",
"    Returns:\n",
"        defaultdict(list) mapping uid -> list of (iid, est) pairs sorted by\n",
"        est, highest first, truncated to n entries.\n",
"    \"\"\"\n",
"    per_user = defaultdict(list)\n",
"    for uid, iid, _true_r, est, _details in predictions:\n",
"        per_user[uid].append((iid, est))\n",
"\n",
"    # Only values of existing keys are replaced, so iterating the keys is safe.\n",
"    for uid in per_user:\n",
"        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)\n",
"        per_user[uid] = ranked[:n]\n",
"\n",
"    return per_user"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# trainset = data.build_full_trainset()\n",
"# svd_model.fit(trainset)\n",
"\n",
"#从训练集中创建一个测试集,该测试集中包含了所有用户没有看过的电影,即该测试集中的数据不在训练集中\n",
"# testset = trainset.build_anti_testset()\n",
"# predictions = svd_model.test(testset)\n",
"\n",
"# top_n = get_top_n(predictions, n=8)\n",
"\n",
"# for uid, user_ratings in top_n.items():\n",
"# print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Leave-one-out split: one rating per user is held out in the test set.\n",
"LOOCV = LeaveOneOut(n_splits=1, random_state=1)\n",
"\n",
"# n_splits=1, so the splitter produces a single (train, test) pair.\n",
"trainSet, testSet = next(iter(LOOCV.split(data)))\n",
"\n",
"# Fit on the training split, then predict the held-out ratings.\n",
"svd_model.fit(trainSet)\n",
"leftOutPredictions = svd_model.test(testSet)\n",
"\n",
"# Build a test set from the training split containing the (user, movie)\n",
"# pairs that are NOT in it, i.e. movies each user has not rated there.\n",
"bigTestSet = trainSet.build_anti_testset()\n",
"\n",
"# Predicted rating for every one of those unseen pairs.\n",
"allPredictions = svd_model.test(bigTestSet)\n",
"\n",
"# Keep the 10 highest-scored unseen movies for each user.\n",
"topNPredicted = get_top_n(allPredictions, n=10)\n",
"\n",
"# Print the 10 movies recommended to each user together with their predicted ratings\n",
"# for uid, user_ratings in topNPredicted.items():\n",
"#     print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 整体命中率"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"整体命中率: 0.20156963596860727\n"
]
}
],
"source": [
"\n",
"def HitRate(topNPredicted, leftOutPredictions):\n",
"    \"\"\"Fraction of left-out ratings whose movie appears in that user's top-N list.\n",
"\n",
"    Args:\n",
"        topNPredicted: mapping of user id -> list of (movieID, predictedRating)\n",
"            pairs. A user missing from the mapping counts as a miss.\n",
"        leftOutPredictions: iterable of sequences whose first two items are\n",
"            the user id and the left-out movie id.\n",
"\n",
"    Returns:\n",
"        hits / total as a float, or 0.0 when leftOutPredictions is empty.\n",
"    \"\"\"\n",
"    hits = 0\n",
"    total = 0\n",
"\n",
"    for leftOut in leftOutPredictions:\n",
"        userID, leftOutMovieID = leftOut[0], leftOut[1]\n",
"        # .get() avoids inserting an empty entry into a defaultdict for every\n",
"        # unknown user and also lets a plain dict be passed in.\n",
"        recommended = topNPredicted.get(int(userID), ())\n",
"        if any(int(movieID) == int(leftOutMovieID) for movieID, _score in recommended):\n",
"            hits += 1\n",
"        total += 1\n",
"\n",
"    # Guard the division: with nothing left out there is nothing to hit.\n",
"    return hits / total if total else 0.0\n",
"# Overall hit rate of the top-10 lists against the left-out ratings.\n",
"overall_hit_rate = HitRate(topNPredicted, leftOutPredictions)\n",
"print(\"整体命中率: \", overall_hit_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 评分命中率"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"评分的命中率: \n",
"1.0 0.06666666666666667\n",
"2.0 0.06976744186046512\n",
"3.0 0.09087495306045813\n",
"4.0 0.16397885892397343\n",
"5.0 0.25305280528052804\n"
]
}
],
"source": [
"def RatingHitRate(topNPredicted, leftOutPredictions):\n",
"    \"\"\"Print the top-N hit rate separately for each actual rating value.\n",
"\n",
"    Args:\n",
"        topNPredicted: mapping of user id -> list of (movieID, predictedRating)\n",
"            pairs. A user missing from the mapping counts as a miss.\n",
"        leftOutPredictions: iterable of 5-tuples\n",
"            (userID, leftOutMovieID, actualRating, estimatedRating, details).\n",
"\n",
"    Returns:\n",
"        None. One line `rating hit_rate` is printed per distinct actual rating,\n",
"        in ascending rating order; nothing is printed for empty input.\n",
"    \"\"\"\n",
"    hits = defaultdict(float)\n",
"    total = defaultdict(float)\n",
"\n",
"    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n",
"        # .get() avoids inserting empty entries into a defaultdict argument.\n",
"        recommended = topNPredicted.get(int(userID), ())\n",
"        if any(int(movieID) == int(leftOutMovieID) for movieID, _score in recommended):\n",
"            hits[actualRating] += 1\n",
"        total[actualRating] += 1\n",
"\n",
"    # Iterate over `total`, not `hits`: a rating value that was never hit must\n",
"    # be reported as 0.0 instead of silently disappearing from the report.\n",
"    for rating in sorted(total):\n",
"        print(rating, hits[rating] / total[rating])\n",
"# Print a header line, then let RatingHitRate print its per-rating lines.\n",
"print(\"评分的命中率: \")\n",
"RatingHitRate(topNPredicted, leftOutPredictions)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 累积命中率 ###"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"累积命中率 (rating >= 4): 0.21934458177342428\n"
]
}
],
"source": [
"def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):\n",
"    \"\"\"Hit rate restricted to left-out ratings of at least ratingCutoff.\n",
"\n",
"    Args:\n",
"        topNPredicted: mapping of user id -> list of (movieID, predictedRating)\n",
"            pairs. A user missing from the mapping counts as a miss.\n",
"        leftOutPredictions: iterable of 5-tuples\n",
"            (userID, leftOutMovieID, actualRating, estimatedRating, details).\n",
"        ratingCutoff: left-out ratings below this value are ignored entirely\n",
"            (they count neither as hits nor towards the total).\n",
"\n",
"    Returns:\n",
"        hits / total over the qualifying ratings as a float, or 0.0 when no\n",
"        left-out rating reaches the cutoff.\n",
"    \"\"\"\n",
"    hits = 0\n",
"    total = 0\n",
"\n",
"    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n",
"        if actualRating < ratingCutoff:\n",
"            continue\n",
"        # .get() avoids inserting empty entries into a defaultdict argument.\n",
"        recommended = topNPredicted.get(int(userID), ())\n",
"        if any(int(movieID) == int(leftOutMovieID) for movieID, _score in recommended):\n",
"            hits += 1\n",
"        total += 1\n",
"\n",
"    # Guard the division: a high cutoff can leave no qualifying ratings at all.\n",
"    return hits / total if total else 0.0\n",
"# Hit rate counted only over left-out ratings of 4.0 or more.\n",
"cumulative_hit_rate = CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0)\n",
"print(\"累积命中率 (rating >= 4): \", cumulative_hit_rate)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 平均互惠命中排名 (Average Reciprocal Hit Rank, ARHR) ###"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"平均互惠命中排名: 0.06899543157247194\n"
]
}
],
"source": [
"def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):\n",
"    \"\"\"Mean of 1/rank of the left-out movie in each user's top-N list.\n",
"\n",
"    A left-out movie found at 1-based position `rank` contributes 1/rank;\n",
"    a movie that is not in the list contributes 0.\n",
"\n",
"    Args:\n",
"        topNPredicted: mapping of user id -> ordered list of\n",
"            (movieID, predictedRating) pairs. A user missing from the mapping\n",
"            counts as a miss.\n",
"        leftOutPredictions: iterable of 5-tuples\n",
"            (userID, leftOutMovieID, actualRating, estimatedRating, details).\n",
"\n",
"    Returns:\n",
"        Sum of reciprocal ranks divided by the number of left-out ratings,\n",
"        or 0.0 when leftOutPredictions is empty.\n",
"    \"\"\"\n",
"    summation = 0\n",
"    total = 0\n",
"\n",
"    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n",
"        # .get() avoids inserting empty entries into a defaultdict argument.\n",
"        recommended = topNPredicted.get(int(userID), ())\n",
"        for rank, (movieID, _score) in enumerate(recommended, start=1):\n",
"            if int(movieID) == int(leftOutMovieID):\n",
"                summation += 1.0 / rank\n",
"                break\n",
"        total += 1\n",
"\n",
"    # Guard the division: with nothing left out the metric is defined as 0.0.\n",
"    return summation / total if total else 0.0\n",
"\n",
"# Average reciprocal hit rank of the top-10 lists against the left-out ratings.\n",
"average_reciprocal_hit_rank = AverageReciprocalHitRank(topNPredicted, leftOutPredictions)\n",
"print(\"平均互惠命中排名: \", average_reciprocal_hit_rank)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}