{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np \n",
    "from surprise import SVD\n",
    "from surprise import KNNBaseline\n",
    "from surprise.model_selection import train_test_split\n",
    "from surprise.model_selection import LeaveOneOut\n",
    "from surprise import Reader\n",
    "from surprise import Dataset\n",
    "from surprise import accuracy\n",
    "from surprise.model_selection import train_test_split\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据在此下载 ###\n",
    "https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/ez_douban/intro.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "评分数量：2604995\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2410169</th>\n",
       "      <td>1727</td>\n",
       "      <td>9984</td>\n",
       "      <td>2</td>\n",
       "      <td>Beauty and the Beast</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1239338</th>\n",
       "      <td>7328</td>\n",
       "      <td>1602</td>\n",
       "      <td>5</td>\n",
       "      <td>Firelight</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1780092</th>\n",
       "      <td>4470</td>\n",
       "      <td>4008</td>\n",
       "      <td>5</td>\n",
       "      <td>Kramer vs. Kramer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>971816</th>\n",
       "      <td>620</td>\n",
       "      <td>1146</td>\n",
       "      <td>4</td>\n",
       "      <td>Reservoir Dogs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1086919</th>\n",
       "      <td>5560</td>\n",
       "      <td>1341</td>\n",
       "      <td>2</td>\n",
       "      <td>Silent Hill</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1106322</th>\n",
       "      <td>7670</td>\n",
       "      <td>1362</td>\n",
       "      <td>3</td>\n",
       "      <td>Tais-toi!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41897</th>\n",
       "      <td>5241</td>\n",
       "      <td>42</td>\n",
       "      <td>2</td>\n",
       "      <td>Closer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2362262</th>\n",
       "      <td>5224</td>\n",
       "      <td>9173</td>\n",
       "      <td>2</td>\n",
       "      <td>Sundae in New York</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1117186</th>\n",
       "      <td>1898</td>\n",
       "      <td>1383</td>\n",
       "      <td>4</td>\n",
       "      <td>Zodiac</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2816937</th>\n",
       "      <td>6071</td>\n",
       "      <td>48616</td>\n",
       "      <td>3</td>\n",
       "      <td>舞牛</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         userId  movieId  rating                 title\n",
       "2410169    1727     9984       2  Beauty and the Beast\n",
       "1239338    7328     1602       5             Firelight\n",
       "1780092    4470     4008       5     Kramer vs. Kramer\n",
       "971816      620     1146       4        Reservoir Dogs\n",
       "1086919    5560     1341       2           Silent Hill\n",
       "1106322    7670     1362       3             Tais-toi!\n",
       "41897      5241       42       2                Closer\n",
       "2362262    5224     9173       2    Sundae in New York\n",
       "1117186    1898     1383       4                Zodiac\n",
       "2816937    6071    48616       3                    舞牛"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies = pd.read_csv( './data/douban/movies.csv')\n",
    "ratings = pd.read_csv('./data/douban/ratings.csv')\n",
    "\n",
    "combine_movie_rating= pd.merge(ratings,movies,on='movieId',how='inner')\n",
    "combine_movie_rating=combine_movie_rating.drop(['timestamp'],axis = 1)\n",
    "combine_movie_rating = combine_movie_rating.dropna(axis = 0 ,subset=['title'])\n",
    "print('评分数量：%d' % len(combine_movie_rating))\n",
    "combine_movie_rating.sample(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>title</th>\n",
       "      <th>totalRatingCount</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>Harry Potter and the Deathly Hallows: Part II</td>\n",
       "      <td>1703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>21</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>Harry Potter and the Deathly Hallows: Part II</td>\n",
       "      <td>1703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>Harry Potter and the Deathly Hallows: Part II</td>\n",
       "      <td>1703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>34</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>Harry Potter and the Deathly Hallows: Part II</td>\n",
       "      <td>1703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>36</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>Harry Potter and the Deathly Hallows: Part II</td>\n",
       "      <td>1703</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating                                          title  \\\n",
       "0       0        1       4  Harry Potter and the Deathly Hallows: Part II   \n",
       "1      21        1       4  Harry Potter and the Deathly Hallows: Part II   \n",
       "2      25        1       5  Harry Potter and the Deathly Hallows: Part II   \n",
       "3      34        1       4  Harry Potter and the Deathly Hallows: Part II   \n",
       "4      36        1       5  Harry Potter and the Deathly Hallows: Part II   \n",
       "\n",
       "   totalRatingCount  \n",
       "0              1703  \n",
       "1              1703  \n",
       "2              1703  \n",
       "3              1703  \n",
       "4              1703  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movie_rating_count=pd.DataFrame(combine_movie_rating.\n",
    "                    groupby(['movieId'])['rating'].\n",
    "                    count().\n",
    "                    reset_index().\n",
    "                    rename(columns={'rating':'totalRatingCount'})                   \n",
    "                   )\n",
    "rating_with_totalRatingCount = combine_movie_rating.merge(movie_rating_count,left_on='movieId',right_on='movieId')\n",
    "rating_with_totalRatingCount.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.90    2351.0\n",
       "0.91    2441.0\n",
       "0.92    2654.0\n",
       "0.93    2814.0\n",
       "0.94    2958.0\n",
       "0.95    3062.0\n",
       "0.96    3330.0\n",
       "0.97    3731.0\n",
       "0.98    4432.0\n",
       "0.99    5072.0\n",
       "Name: totalRatingCount, dtype: float64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rating_with_totalRatingCount['totalRatingCount'].quantile(np.arange(.9,1,.01))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "热门电影数据量：262455\n"
     ]
    }
   ],
   "source": [
    "#取10%的最热门的电影\n",
    "popular_threshold=2351 \n",
    "rating_with_totalRatingCount = combine_movie_rating.merge(movie_rating_count,left_on='movieId',right_on='movieId')\n",
    "popular_movies_rating= rating_with_totalRatingCount.query('totalRatingCount>=@popular_threshold')\n",
    "print('热门电影数据量：%d' % len(popular_movies_rating))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RMSE:  0.5501276131831262\n",
      "MAE:  0.6968515836030541\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[Prediction(uid=1426, iid=1130, r_ui=5.0, est=3.6235014684425333, details={'was_impossible': False}),\n",
       " Prediction(uid=9899, iid=609, r_ui=4.0, est=4.427492924992114, details={'was_impossible': False}),\n",
       " Prediction(uid=10837, iid=150, r_ui=5.0, est=4.827915792544979, details={'was_impossible': False}),\n",
       " Prediction(uid=4643, iid=995, r_ui=4.0, est=4.211769405268239, details={'was_impossible': False}),\n",
       " Prediction(uid=9769, iid=247, r_ui=5.0, est=4.254900765324765, details={'was_impossible': False}),\n",
       " Prediction(uid=5582, iid=170, r_ui=4.0, est=4.31240196218265, details={'was_impossible': False}),\n",
       " Prediction(uid=6605, iid=156, r_ui=5.0, est=4.245027154626259, details={'was_impossible': False}),\n",
       " Prediction(uid=9884, iid=21, r_ui=4.0, est=3.824987946672277, details={'was_impossible': False}),\n",
       " Prediction(uid=17492, iid=96, r_ui=4.0, est=4.72638009370838, details={'was_impossible': False}),\n",
       " Prediction(uid=25829, iid=738, r_ui=5.0, est=4.788603891699279, details={'was_impossible': False})]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reader = Reader(rating_scale=(0.5, 5))\n",
    "data = Dataset.load_from_df(popular_movies_rating[['userId', 'movieId', 'rating']], reader)\n",
    "train, test = train_test_split(data, test_size=.25, random_state=0)\n",
    "svd_model = SVD(random_state=0)\n",
    "svd_model.fit(train)\n",
    "predict = svd_model.test(test)   \n",
    "print(\"RMSE: \",accuracy.mae(predict, verbose=False))\n",
    "print(\"MAE: \",accuracy.rmse(predict, verbose=False))\n",
    "predict[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "code_folding": [
     0
    ]
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top_n(predictions, n=10):\n",
    "    top_n = defaultdict(list)\n",
    "    for uid, iid, true_r, est, _ in predictions:\n",
    "        top_n[uid].append((iid, est))\n",
    "\n",
    "    for uid, user_ratings in top_n.items():\n",
    "        user_ratings.sort(key=lambda x: x[1], reverse=True)\n",
    "        top_n[uid] = user_ratings[:n]\n",
    "\n",
    "    return top_n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# trainset = data.build_full_trainset()\n",
    "# svd_model.fit(trainset)\n",
    "\n",
    "#从训练集中创建一个测试集,该测试集中包含了所有用户没有看过的电影，即该测试集中的数据不在训练集中\n",
    "# testset = trainset.build_anti_testset()\n",
    "# predictions = svd_model.test(testset)\n",
    "\n",
    "# top_n = get_top_n(predictions, n=8)\n",
    "\n",
    "# for uid, user_ratings in top_n.items():\n",
    "#      print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#定义留一法交叉验证,在测试集中每个用户只保留一条评分记录\n",
    "LOOCV = LeaveOneOut(n_splits=1, random_state=1)\n",
    "\n",
    "for trainSet, testSet in LOOCV.split(data):\n",
    "    \n",
    "    #在训练集上训练模型\n",
    "    svd_model.fit(trainSet)\n",
    "    #在测试集上预测\n",
    "    leftOutPredictions = svd_model.test(testSet)\n",
    "    \n",
    "    #从训练集中创建一个测试集,该测试集中包含了所有用户没有看过的电影，即该测试集中的数据不在训练集中\n",
    "    bigTestSet = trainSet.build_anti_testset()\n",
    "    \n",
    "    #得到所有用户没有看过的所有电影的预测评分\n",
    "    allPredictions = svd_model.test(bigTestSet)\n",
    "    \n",
    "    #从每个用户的未看过的电影的预测评分中抽取前10个得分最高的电影\n",
    "    topNPredicted = get_top_n(allPredictions, n=10)\n",
    "\n",
    "#打印为每个用户推荐的10部电影和对它们的评分\n",
    "# for uid, user_ratings in topNPredicted.items():\n",
    "#       print(uid, [(iid,round(rating,1)) for (iid, rating) in user_ratings])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 整体命中率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "整体命中率:  0.20156963596860727\n"
     ]
    }
   ],
   "source": [
    "\n",
    "def HitRate(topNPredicted, leftOutPredictions):\n",
    "    hits = 0\n",
    "    total = 0\n",
    " \n",
    "    for leftOut in leftOutPredictions:\n",
    "        userID = leftOut[0]\n",
    "        leftOutMovieID = leftOut[1]\n",
    "        \n",
    "        hit = False\n",
    "        for movieID, predictedRating in topNPredicted[int(userID)]:\n",
    "            if (int(leftOutMovieID) == int(movieID)):\n",
    "                hit = True\n",
    "                break\n",
    "        if (hit) :\n",
    "            hits += 1\n",
    "\n",
    "        total += 1\n",
    "\n",
    "    return hits/total\n",
    "print(\"整体命中率: \", HitRate(topNPredicted, leftOutPredictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 评分命中率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "评分的命中率: \n",
      "1.0 0.06666666666666667\n",
      "2.0 0.06976744186046512\n",
      "3.0 0.09087495306045813\n",
      "4.0 0.16397885892397343\n",
      "5.0 0.25305280528052804\n"
     ]
    }
   ],
   "source": [
    "def RatingHitRate(topNPredicted, leftOutPredictions):\n",
    "    hits = defaultdict(float)\n",
    "    total = defaultdict(float)    \n",
    "    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n",
    "        \n",
    "        hit = False\n",
    "        for movieID, predictedRating in topNPredicted[int(userID)]:\n",
    "            if (int(leftOutMovieID) == movieID):\n",
    "                hit = True\n",
    "                break\n",
    "        if (hit) :\n",
    "            hits[actualRating] += 1\n",
    "        total[actualRating] += 1\n",
    "\n",
    "    for rating in sorted(hits.keys()):\n",
    "        print(rating, hits[rating] / total[rating])\n",
    "print(\"评分的命中率: \")\n",
    "RatingHitRate(topNPredicted, leftOutPredictions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 累积命中率 ###"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "累积命中率 (rating >= 4):  0.21934458177342428\n"
     ]
    }
   ],
   "source": [
    "def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):\n",
    "    hits = 0\n",
    "    total = 0\n",
    "\n",
    "    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n",
    "\n",
    "        if (actualRating >= ratingCutoff):            \n",
    "            hit = False\n",
    "            for movieID, predictedRating in topNPredicted[int(userID)]:\n",
    "                if (int(leftOutMovieID) == movieID):\n",
    "                    hit = True\n",
    "                    break\n",
    "            if (hit) :\n",
    "                hits += 1\n",
    "            total += 1\n",
    "    return hits/total\n",
    "print(\"累积命中率 (rating >= 4): \", CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 平均互惠命中排名 ###"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "平均互惠命中排名:  0.06899543157247194\n"
     ]
    }
   ],
   "source": [
    "def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):\n",
    "    summation = 0\n",
    "    total = 0\n",
    "        \n",
    "    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:\n",
    "        \n",
    "        hitRank = 0\n",
    "        rank = 0\n",
    "        for movieID, predictedRating in topNPredicted[int(userID)]:\n",
    "            rank = rank + 1\n",
    "            if (int(leftOutMovieID) == movieID):\n",
    "                hitRank = rank\n",
    "                break\n",
    "        if (hitRank > 0) :\n",
    "                summation += 1.0 / hitRank\n",
    "\n",
    "        total += 1\n",
    "\n",
    "    return summation / total\n",
    "\n",
    "print(\"平均互惠命中排名: \", AverageReciprocalHitRank(topNPredicted, leftOutPredictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}