{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np \n", "from surprise import SVD\n", "from surprise import KNNBaseline\n", "from surprise.model_selection import train_test_split\n", "from surprise.model_selection import LeaveOneOut\n", "from surprise import Reader\n", "from surprise import Dataset\n", "from surprise import accuracy\n", "from surprise.model_selection import train_test_split\n", "from collections import defaultdict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 数据在此下载 ###\n", "https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/ez_douban/intro.ipynb" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "评分数量:2604995\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtitle
2410169172799842Beauty and the Beast
1239338732816025Firelight
1780092447040085Kramer vs. Kramer
97181662011464Reservoir Dogs
1086919556013412Silent Hill
1106322767013623Tais-toi!
418975241422Closer
2362262522491732Sundae in New York
1117186189813834Zodiac
28169376071486163舞牛
\n", "
" ], "text/plain": [ " userId movieId rating title\n", "2410169 1727 9984 2 Beauty and the Beast\n", "1239338 7328 1602 5 Firelight\n", "1780092 4470 4008 5 Kramer vs. Kramer\n", "971816 620 1146 4 Reservoir Dogs\n", "1086919 5560 1341 2 Silent Hill\n", "1106322 7670 1362 3 Tais-toi!\n", "41897 5241 42 2 Closer\n", "2362262 5224 9173 2 Sundae in New York\n", "1117186 1898 1383 4 Zodiac\n", "2816937 6071 48616 3 舞牛" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies = pd.read_csv( './data/douban/movies.csv')\n", "ratings = pd.read_csv('./data/douban/ratings.csv')\n", "\n", "combine_movie_rating= pd.merge(ratings,movies,on='movieId',how='inner')\n", "combine_movie_rating=combine_movie_rating.drop(['timestamp'],axis = 1)\n", "combine_movie_rating = combine_movie_rating.dropna(axis = 0 ,subset=['title'])\n", "print('评分数量:%d' % len(combine_movie_rating))\n", "combine_movie_rating.sample(10)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtitletotalRatingCount
0014Harry Potter and the Deathly Hallows: Part II1703
12114Harry Potter and the Deathly Hallows: Part II1703
22515Harry Potter and the Deathly Hallows: Part II1703
33414Harry Potter and the Deathly Hallows: Part II1703
43615Harry Potter and the Deathly Hallows: Part II1703
\n", "
" ], "text/plain": [ " userId movieId rating title \\\n", "0 0 1 4 Harry Potter and the Deathly Hallows: Part II \n", "1 21 1 4 Harry Potter and the Deathly Hallows: Part II \n", "2 25 1 5 Harry Potter and the Deathly Hallows: Part II \n", "3 34 1 4 Harry Potter and the Deathly Hallows: Part II \n", "4 36 1 5 Harry Potter and the Deathly Hallows: Part II \n", "\n", " totalRatingCount \n", "0 1703 \n", "1 1703 \n", "2 1703 \n", "3 1703 \n", "4 1703 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie_rating_count=pd.DataFrame(combine_movie_rating.\n", " groupby(['movieId'])['rating'].\n", " count().\n", " reset_index().\n", " rename(columns={'rating':'totalRatingCount'}) \n", " )\n", "rating_with_totalRatingCount = combine_movie_rating.merge(movie_rating_count,left_on='movieId',right_on='movieId')\n", "rating_with_totalRatingCount.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.90 2351.0\n", "0.91 2441.0\n", "0.92 2654.0\n", "0.93 2814.0\n", "0.94 2958.0\n", "0.95 3062.0\n", "0.96 3330.0\n", "0.97 3731.0\n", "0.98 4432.0\n", "0.99 5072.0\n", "Name: totalRatingCount, dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rating_with_totalRatingCount['totalRatingCount'].quantile(np.arange(.9,1,.01))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "热门电影数据量:262455\n" ] } ], "source": [ "#取10%的最热门的电影\n", "popular_threshold=2351 \n", "rating_with_totalRatingCount = combine_movie_rating.merge(movie_rating_count,left_on='movieId',right_on='movieId')\n", "popular_movies_rating= rating_with_totalRatingCount.query('totalRatingCount>=@popular_threshold')\n", "print('热门电影数据量:%d' % len(popular_movies_rating))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RMSE: 0.5501276131831262\n", "MAE: 0.6968515836030541\n" ] }, { "data": { "text/plain": [ "[Prediction(uid=1426, iid=1130, r_ui=5.0, est=3.6235014684425333, details={'was_impossible': False}),\n", " Prediction(uid=9899, iid=609, r_ui=4.0, est=4.427492924992114, details={'was_impossible': False}),\n", " Prediction(uid=10837, iid=150, r_ui=5.0, est=4.827915792544979, details={'was_impossible': False}),\n", " Prediction(uid=4643, iid=995, r_ui=4.0, est=4.211769405268239, details={'was_impossible': False}),\n", " Prediction(uid=9769, iid=247, r_ui=5.0, est=4.254900765324765, details={'was_impossible': False}),\n", " Prediction(uid=5582, iid=170, r_ui=4.0, est=4.31240196218265, details={'was_impossible': False}),\n", " Prediction(uid=6605, iid=156, r_ui=5.0, est=4.245027154626259, details={'was_impossible': False}),\n", " Prediction(uid=9884, iid=21, r_ui=4.0, est=3.824987946672277, details={'was_impossible': False}),\n", " Prediction(uid=17492, iid=96, r_ui=4.0, est=4.72638009370838, details={'was_impossible': False}),\n", " Prediction(uid=25829, iid=738, r_ui=5.0, est=4.788603891699279, details={'was_impossible': False})]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "reader = Reader(rating_scale=(0.5, 5))\n", "data = Dataset.load_from_df(popular_movies_rating[['userId', 'movieId', 'rating']], reader)\n", "train, test = train_test_split(data, test_size=.25, random_state=0)\n", "svd_model = SVD(random_state=0)\n", "svd_model.fit(train)\n", "predict = svd_model.test(test) \n", "print(\"RMSE: \",accuracy.mae(predict, verbose=False))\n", "print(\"MAE: \",accuracy.rmse(predict, verbose=False))\n", "predict[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "code_folding": [ 0 ] }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def get_top_n(predictions, n=10):\n", " top_n = defaultdict(list)\n", " for uid, iid, true_r, est, _ in predictions:\n", " top_n[uid].append((iid, est))\n", "\n", " for uid, user_ratings in top_n.items():\n", " user_ratings.sort(key=lambda x: x[1], reverse=True)\n", " top_n[uid] = user_ratings[:n]\n", "\n", " return top_n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# trainset = data.build_full_trainset()\n", "# svd_model.fit(trainset)\n", "\n", "#从训练集中创建一个测试集,该测试集中包含了所有用户没有看过的电影,即该测试集中的数据不在训练集中\n", "# testset = trainset.build_anti_testset()\n", "# predictions = svd_model.test(testset)\n", "\n", "# top_n = get_top_n(predictions, n=8)\n", "\n", "# for uid, user_ratings in top_n.items():\n", "# print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "#定义留一法交叉验证,在测试集中每个用户只保留一条评分记录\n", "LOOCV = LeaveOneOut(n_splits=1, random_state=1)\n", "\n", "for trainSet, testSet in LOOCV.split(data):\n", " \n", " #在训练集上训练模型\n", " svd_model.fit(trainSet)\n", " #在测试集上预测\n", " leftOutPredictions = svd_model.test(testSet)\n", " \n", " #从训练集中创建一个测试集,该测试集中包含了所有用户没有看过的电影,即该测试集中的数据不在训练集中\n", " bigTestSet = trainSet.build_anti_testset()\n", " \n", " #得到所有用户没有看过的所有电影的预测评分\n", " allPredictions = svd_model.test(bigTestSet)\n", " \n", " #从每个用户的未看过的电影的预测评分中抽取前10个得分最高的电影\n", " topNPredicted = get_top_n(allPredictions, n=10)\n", "\n", "#打印为每个用户推荐的10部电影和对它们的评分\n", "# for uid, user_ratings in topNPredicted.items():\n", "# print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 整体命中率" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "整体命中率: 0.20156963596860727\n" ] } ], "source": [ "\n", "def HitRate(topNPredicted, leftOutPredictions):\n", " hits = 0\n", " total = 0\n", " \n", " for leftOut in leftOutPredictions:\n", " userID = leftOut[0]\n", " leftOutMovieID = leftOut[1]\n", " \n", " hit = False\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " if (int(leftOutMovieID) == int(movieID)):\n", " hit = True\n", " break\n", " if (hit) :\n", " hits += 1\n", "\n", " total += 1\n", "\n", " return hits/total\n", "print(\"整体命中率: \", HitRate(topNPredicted, leftOutPredictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 评分命中率" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "评分的命中率: \n", "1.0 0.06666666666666667\n", "2.0 0.06976744186046512\n", "3.0 0.09087495306045813\n", "4.0 0.16397885892397343\n", "5.0 0.25305280528052804\n" ] } ], "source": [ "def RatingHitRate(topNPredicted, leftOutPredictions):\n", " hits = defaultdict(float)\n", " total = defaultdict(float) \n", " for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n", " \n", " hit = False\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " if (int(leftOutMovieID) == movieID):\n", " hit = True\n", " break\n", " if (hit) :\n", " hits[actualRating] += 1\n", " total[actualRating] += 1\n", "\n", " for rating in sorted(hits.keys()):\n", " print(rating, hits[rating] / total[rating])\n", "print(\"评分的命中率: \")\n", "RatingHitRate(topNPredicted, leftOutPredictions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 累积命中率 ###" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "累积命中率 (rating >= 4): 0.21934458177342428\n" ] } ], "source": [ "def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):\n", " hits = 0\n", " total = 0\n", "\n", " for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n", "\n", " if (actualRating >= ratingCutoff): \n", " hit = False\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " if (int(leftOutMovieID) == movieID):\n", " hit = True\n", " break\n", " if (hit) :\n", " hits += 1\n", " total += 1\n", " return hits/total\n", "print(\"累积命中率 (rating >= 4): \", CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 平均互惠命中排名 ###" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "平均互惠命中排名: 0.06899543157247194\n" ] } ], "source": [ "def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):\n", " summation = 0\n", " total = 0\n", " \n", " for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n", " \n", " hitRank = 0\n", " rank = 0\n", " for movieID, predictedRating in topNPredicted[int(userID)]:\n", " rank = rank + 1\n", " if (int(leftOutMovieID) == movieID):\n", " hitRank = rank\n", " break\n", " if (hitRank > 0) :\n", " summation += 1.0 / hitRank\n", "\n", " total += 1\n", "\n", " return summation / total\n", "\n", "print(\"平均互惠命中排名: \", AverageReciprocalHitRank(topNPredicted, leftOutPredictions))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 }