Files
Machine-Learning-Collection/ML/Pytorch/recommender_systems/2.non-personalized-recsys/part1-popularity_recsys.ipynb
2023-11-02 18:48:58 -07:00

504 lines
17 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Non-Personalized Recommender Systems: Popularity Based"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os \n",
"\n",
"if os.path.exists('movielens_small.zip'):\n",
" !wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip \n",
" !unzip ml-latest-small.zip\n",
" !rm ml-latest-small.zip\n",
" !mv ml-latest-small movielens_small"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Damped Mean\n",
"\n",
"$$ s(i) = \\frac{\\Sigma_{u \\in U_i} r_i + a \\times \\mu}{|U_i| + a} $$\n",
"\n",
"Where:\n",
"- $ s(i) $: The damped mean rating for item $ i $.\n",
"- $ \\Sigma_{u \\in U_i} r_i $: Sum of the ratings for item $ i $.\n",
"- $ a $: Damping factor, a value that determines the extent of smoothing.\n",
"- $ \\mu $: Global mean rating across all items.\n",
"- $ |U_i| $: Total number of ratings for item $ i $.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "937dd4ed",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def load_data():\n",
" # Load the MovieLens data\n",
" movies_df = pd.read_csv(\"movielens_small/movies.csv\")\n",
" ratings_df = pd.read_csv(\"movielens_small/ratings.csv\")\n",
" return movies_df, ratings_df\n",
"\n",
"def calculate_popularity(movies_df, ratings_df, damping_factor=5):\n",
" # Calculate the number of ratings, mean rating, and sum of ratings for each movie\n",
" num_ratings = ratings_df.groupby(\"movieId\")[\"rating\"].count()\n",
" mean_rating = ratings_df.groupby(\"movieId\")[\"rating\"].mean()\n",
" global_mean = ratings_df[\"rating\"].mean()\n",
" \n",
" # Calculate the damped mean rating for each movie\n",
" damped_numerator = num_ratings * mean_rating + damping_factor * global_mean\n",
" damped_denominator = num_ratings + damping_factor\n",
" damped_mean_rating = damped_numerator / damped_denominator\n",
" \n",
" # Add the popularity data to the movie data\n",
" movies_df['num_ratings'] = movies_df['movieId'].map(num_ratings)\n",
" movies_df['mean_rating'] = movies_df['movieId'].map(mean_rating)\n",
" movies_df['damped_mean_rating'] = movies_df['movieId'].map(damped_mean_rating)\n",
" return movies_df\n",
"\n",
"movies_df, ratings_df = load_data()\n",
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see how using num_ratings compares to mean rating & damped mean rating."
]
},
{
"cell_type": "code",
"execution_count": 198,
"id": "7e649c6f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>num_ratings</th>\n",
" <th>mean_rating</th>\n",
" <th>damped_mean_rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>314</th>\n",
" <td>356</td>\n",
" <td>Forrest Gump (1994)</td>\n",
" <td>Comedy|Drama|Romance|War</td>\n",
" <td>329.0</td>\n",
" <td>4.164134</td>\n",
" <td>4.144589</td>\n",
" </tr>\n",
" <tr>\n",
" <th>277</th>\n",
" <td>318</td>\n",
" <td>Shawshank Redemption, The (1994)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>317.0</td>\n",
" <td>4.429022</td>\n",
" <td>4.400659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>296</td>\n",
" <td>Pulp Fiction (1994)</td>\n",
" <td>Comedy|Crime|Drama|Thriller</td>\n",
" <td>307.0</td>\n",
" <td>4.197068</td>\n",
" <td>4.175128</td>\n",
" </tr>\n",
" <tr>\n",
" <th>510</th>\n",
" <td>593</td>\n",
" <td>Silence of the Lambs, The (1991)</td>\n",
" <td>Crime|Horror|Thriller</td>\n",
" <td>279.0</td>\n",
" <td>4.161290</td>\n",
" <td>4.138462</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1939</th>\n",
" <td>2571</td>\n",
" <td>Matrix, The (1999)</td>\n",
" <td>Action|Sci-Fi|Thriller</td>\n",
" <td>278.0</td>\n",
" <td>4.192446</td>\n",
" <td>4.168457</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title genres \n",
"314 356 Forrest Gump (1994) Comedy|Drama|Romance|War \\\n",
"277 318 Shawshank Redemption, The (1994) Crime|Drama \n",
"257 296 Pulp Fiction (1994) Comedy|Crime|Drama|Thriller \n",
"510 593 Silence of the Lambs, The (1991) Crime|Horror|Thriller \n",
"1939 2571 Matrix, The (1999) Action|Sci-Fi|Thriller \n",
"\n",
" num_ratings mean_rating damped_mean_rating \n",
"314 329.0 4.164134 4.144589 \n",
"277 317.0 4.429022 4.400659 \n",
"257 307.0 4.197068 4.175128 \n",
"510 279.0 4.161290 4.138462 \n",
"1939 278.0 4.192446 4.168457 "
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies_df.sort_values(by=\"num_ratings\", ascending=False).head()"
]
},
{
"cell_type": "code",
"execution_count": 204,
"id": "c6ef332e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>num_ratings</th>\n",
" <th>mean_rating</th>\n",
" <th>damped_mean_rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7656</th>\n",
" <td>88448</td>\n",
" <td>Paper Birds (Pájaros de papel) (2010)</td>\n",
" <td>Comedy|Drama</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8107</th>\n",
" <td>100556</td>\n",
" <td>Act of Killing, The (2012)</td>\n",
" <td>Documentary</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9083</th>\n",
" <td>143031</td>\n",
" <td>Jump In! (2007)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9094</th>\n",
" <td>143511</td>\n",
" <td>Human (2015)</td>\n",
" <td>Documentary</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9096</th>\n",
" <td>143559</td>\n",
" <td>L.A. Slasher (2015)</td>\n",
" <td>Comedy|Crime|Fantasy</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title genres \n",
"7656 88448 Paper Birds (Pájaros de papel) (2010) Comedy|Drama \\\n",
"8107 100556 Act of Killing, The (2012) Documentary \n",
"9083 143031 Jump In! (2007) Comedy|Drama|Romance \n",
"9094 143511 Human (2015) Documentary \n",
"9096 143559 L.A. Slasher (2015) Comedy|Crime|Fantasy \n",
"\n",
" num_ratings mean_rating damped_mean_rating \n",
"7656 1.0 5.0 3.637779 \n",
"8107 1.0 5.0 3.637779 \n",
"9083 1.0 5.0 3.637779 \n",
"9094 1.0 5.0 3.637779 \n",
"9096 1.0 5.0 3.637779 "
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies_df.sort_values(by=\"mean_rating\", ascending=False).head(5)"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "f669fb09",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>num_ratings</th>\n",
" <th>mean_rating</th>\n",
" <th>damped_mean_rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>277</th>\n",
" <td>318</td>\n",
" <td>Shawshank Redemption, The (1994)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>317.0</td>\n",
" <td>4.429022</td>\n",
" <td>4.400659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>659</th>\n",
" <td>858</td>\n",
" <td>Godfather, The (1972)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>192.0</td>\n",
" <td>4.289062</td>\n",
" <td>4.250077</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2226</th>\n",
" <td>2959</td>\n",
" <td>Fight Club (1999)</td>\n",
" <td>Action|Crime|Drama|Thriller</td>\n",
" <td>218.0</td>\n",
" <td>4.272936</td>\n",
" <td>4.239103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>922</th>\n",
" <td>1221</td>\n",
" <td>Godfather: Part II, The (1974)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>129.0</td>\n",
" <td>4.259690</td>\n",
" <td>4.205148</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>50</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>Crime|Mystery|Thriller</td>\n",
" <td>204.0</td>\n",
" <td>4.237745</td>\n",
" <td>4.203344</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224</th>\n",
" <td>260</td>\n",
" <td>Star Wars: Episode IV - A New Hope (1977)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" <td>251.0</td>\n",
" <td>4.231076</td>\n",
" <td>4.203125</td>\n",
" </tr>\n",
" <tr>\n",
" <th>602</th>\n",
" <td>750</td>\n",
" <td>Dr. Strangelove or: How I Learned to Stop Worr...</td>\n",
" <td>Comedy|War</td>\n",
" <td>97.0</td>\n",
" <td>4.268041</td>\n",
" <td>4.196407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>914</th>\n",
" <td>1213</td>\n",
" <td>Goodfellas (1990)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>126.0</td>\n",
" <td>4.250000</td>\n",
" <td>4.194967</td>\n",
" </tr>\n",
" <tr>\n",
" <th>461</th>\n",
" <td>527</td>\n",
" <td>Schindler's List (1993)</td>\n",
" <td>Drama|War</td>\n",
" <td>220.0</td>\n",
" <td>4.225000</td>\n",
" <td>4.193546</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6710</th>\n",
" <td>58559</td>\n",
" <td>Dark Knight, The (2008)</td>\n",
" <td>Action|Crime|Drama|IMAX</td>\n",
" <td>149.0</td>\n",
" <td>4.238255</td>\n",
" <td>4.191922</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title \n",
"277 318 Shawshank Redemption, The (1994) \\\n",
"659 858 Godfather, The (1972) \n",
"2226 2959 Fight Club (1999) \n",
"922 1221 Godfather: Part II, The (1974) \n",
"46 50 Usual Suspects, The (1995) \n",
"224 260 Star Wars: Episode IV - A New Hope (1977) \n",
"602 750 Dr. Strangelove or: How I Learned to Stop Worr... \n",
"914 1213 Goodfellas (1990) \n",
"461 527 Schindler's List (1993) \n",
"6710 58559 Dark Knight, The (2008) \n",
"\n",
" genres num_ratings mean_rating \n",
"277 Crime|Drama 317.0 4.429022 \\\n",
"659 Crime|Drama 192.0 4.289062 \n",
"2226 Action|Crime|Drama|Thriller 218.0 4.272936 \n",
"922 Crime|Drama 129.0 4.259690 \n",
"46 Crime|Mystery|Thriller 204.0 4.237745 \n",
"224 Action|Adventure|Sci-Fi 251.0 4.231076 \n",
"602 Comedy|War 97.0 4.268041 \n",
"914 Crime|Drama 126.0 4.250000 \n",
"461 Drama|War 220.0 4.225000 \n",
"6710 Action|Crime|Drama|IMAX 149.0 4.238255 \n",
"\n",
" damped_mean_rating \n",
"277 4.400659 \n",
"659 4.250077 \n",
"2226 4.239103 \n",
"922 4.205148 \n",
"46 4.203344 \n",
"224 4.203125 \n",
"602 4.196407 \n",
"914 4.194967 \n",
"461 4.193546 \n",
"6710 4.191922 "
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies_df.sort_values(by=\"damped_mean_rating\", ascending=False).head(10)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}