mirror of
https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-02-21 11:18:01 +00:00
added trending, some comments in popularity
This commit is contained in:
@@ -1,458 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 207,
|
|
||||||
"id": "937dd4ed",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"\n",
|
|
||||||
"def load_data():\n",
|
|
||||||
" # Load the MovieLens data\n",
|
|
||||||
" movies_df = pd.read_csv(\"movielens_small/movies.csv\")\n",
|
|
||||||
" ratings_df = pd.read_csv(\"movielens_small/ratings.csv\")\n",
|
|
||||||
" return movies_df, ratings_df\n",
|
|
||||||
"\n",
|
|
||||||
"def calculate_popularity(movies_df, ratings_df, damping_factor=5):\n",
|
|
||||||
" # Calculate the number of ratings and the mean rating for each movie, plus the global mean rating\n",
|
|
||||||
" num_ratings = ratings_df.groupby(\"movieId\")[\"rating\"].count()\n",
|
|
||||||
" mean_rating = ratings_df.groupby(\"movieId\")[\"rating\"].mean()\n",
|
|
||||||
" global_mean = ratings_df[\"rating\"].mean()\n",
|
|
||||||
" \n",
|
|
||||||
" # Calculate the damped mean rating for each movie\n",
|
|
||||||
" damped_numerator = num_ratings * mean_rating + damping_factor * global_mean\n",
|
|
||||||
" damped_denominator = num_ratings + damping_factor\n",
|
|
||||||
" damped_mean_rating = damped_numerator / damped_denominator\n",
|
|
||||||
" \n",
|
|
||||||
" # Add the popularity data to the movie data\n",
|
|
||||||
" movies_df['num_ratings'] = movies_df['movieId'].map(num_ratings)\n",
|
|
||||||
" movies_df['mean_rating'] = movies_df['movieId'].map(mean_rating)\n",
|
|
||||||
" movies_df['damped_mean_rating'] = movies_df['movieId'].map(damped_mean_rating)\n",
|
|
||||||
" return movies_df\n",
|
|
||||||
"\n",
|
|
||||||
"movies_df, ratings_df = load_data()\n",
|
|
||||||
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 198,
|
|
||||||
"id": "7e649c6f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>movieId</th>\n",
|
|
||||||
" <th>title</th>\n",
|
|
||||||
" <th>genres</th>\n",
|
|
||||||
" <th>num_ratings</th>\n",
|
|
||||||
" <th>mean_rating</th>\n",
|
|
||||||
" <th>damped_mean_rating</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>314</th>\n",
|
|
||||||
" <td>356</td>\n",
|
|
||||||
" <td>Forrest Gump (1994)</td>\n",
|
|
||||||
" <td>Comedy|Drama|Romance|War</td>\n",
|
|
||||||
" <td>329.0</td>\n",
|
|
||||||
" <td>4.164134</td>\n",
|
|
||||||
" <td>4.144589</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>277</th>\n",
|
|
||||||
" <td>318</td>\n",
|
|
||||||
" <td>Shawshank Redemption, The (1994)</td>\n",
|
|
||||||
" <td>Crime|Drama</td>\n",
|
|
||||||
" <td>317.0</td>\n",
|
|
||||||
" <td>4.429022</td>\n",
|
|
||||||
" <td>4.400659</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>257</th>\n",
|
|
||||||
" <td>296</td>\n",
|
|
||||||
" <td>Pulp Fiction (1994)</td>\n",
|
|
||||||
" <td>Comedy|Crime|Drama|Thriller</td>\n",
|
|
||||||
" <td>307.0</td>\n",
|
|
||||||
" <td>4.197068</td>\n",
|
|
||||||
" <td>4.175128</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>510</th>\n",
|
|
||||||
" <td>593</td>\n",
|
|
||||||
" <td>Silence of the Lambs, The (1991)</td>\n",
|
|
||||||
" <td>Crime|Horror|Thriller</td>\n",
|
|
||||||
" <td>279.0</td>\n",
|
|
||||||
" <td>4.161290</td>\n",
|
|
||||||
" <td>4.138462</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1939</th>\n",
|
|
||||||
" <td>2571</td>\n",
|
|
||||||
" <td>Matrix, The (1999)</td>\n",
|
|
||||||
" <td>Action|Sci-Fi|Thriller</td>\n",
|
|
||||||
" <td>278.0</td>\n",
|
|
||||||
" <td>4.192446</td>\n",
|
|
||||||
" <td>4.168457</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" movieId title genres \n",
|
|
||||||
"314 356 Forrest Gump (1994) Comedy|Drama|Romance|War \\\n",
|
|
||||||
"277 318 Shawshank Redemption, The (1994) Crime|Drama \n",
|
|
||||||
"257 296 Pulp Fiction (1994) Comedy|Crime|Drama|Thriller \n",
|
|
||||||
"510 593 Silence of the Lambs, The (1991) Crime|Horror|Thriller \n",
|
|
||||||
"1939 2571 Matrix, The (1999) Action|Sci-Fi|Thriller \n",
|
|
||||||
"\n",
|
|
||||||
" num_ratings mean_rating damped_mean_rating \n",
|
|
||||||
"314 329.0 4.164134 4.144589 \n",
|
|
||||||
"277 317.0 4.429022 4.400659 \n",
|
|
||||||
"257 307.0 4.197068 4.175128 \n",
|
|
||||||
"510 279.0 4.161290 4.138462 \n",
|
|
||||||
"1939 278.0 4.192446 4.168457 "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 198,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"movies_df.sort_values(by=\"num_ratings\", ascending=False).head()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 204,
|
|
||||||
"id": "c6ef332e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>movieId</th>\n",
|
|
||||||
" <th>title</th>\n",
|
|
||||||
" <th>genres</th>\n",
|
|
||||||
" <th>num_ratings</th>\n",
|
|
||||||
" <th>mean_rating</th>\n",
|
|
||||||
" <th>damped_mean_rating</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>7656</th>\n",
|
|
||||||
" <td>88448</td>\n",
|
|
||||||
" <td>Paper Birds (Pájaros de papel) (2010)</td>\n",
|
|
||||||
" <td>Comedy|Drama</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>5.0</td>\n",
|
|
||||||
" <td>3.637779</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>8107</th>\n",
|
|
||||||
" <td>100556</td>\n",
|
|
||||||
" <td>Act of Killing, The (2012)</td>\n",
|
|
||||||
" <td>Documentary</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>5.0</td>\n",
|
|
||||||
" <td>3.637779</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>9083</th>\n",
|
|
||||||
" <td>143031</td>\n",
|
|
||||||
" <td>Jump In! (2007)</td>\n",
|
|
||||||
" <td>Comedy|Drama|Romance</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>5.0</td>\n",
|
|
||||||
" <td>3.637779</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>9094</th>\n",
|
|
||||||
" <td>143511</td>\n",
|
|
||||||
" <td>Human (2015)</td>\n",
|
|
||||||
" <td>Documentary</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>5.0</td>\n",
|
|
||||||
" <td>3.637779</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>9096</th>\n",
|
|
||||||
" <td>143559</td>\n",
|
|
||||||
" <td>L.A. Slasher (2015)</td>\n",
|
|
||||||
" <td>Comedy|Crime|Fantasy</td>\n",
|
|
||||||
" <td>1.0</td>\n",
|
|
||||||
" <td>5.0</td>\n",
|
|
||||||
" <td>3.637779</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" movieId title genres \n",
|
|
||||||
"7656 88448 Paper Birds (Pájaros de papel) (2010) Comedy|Drama \\\n",
|
|
||||||
"8107 100556 Act of Killing, The (2012) Documentary \n",
|
|
||||||
"9083 143031 Jump In! (2007) Comedy|Drama|Romance \n",
|
|
||||||
"9094 143511 Human (2015) Documentary \n",
|
|
||||||
"9096 143559 L.A. Slasher (2015) Comedy|Crime|Fantasy \n",
|
|
||||||
"\n",
|
|
||||||
" num_ratings mean_rating damped_mean_rating \n",
|
|
||||||
"7656 1.0 5.0 3.637779 \n",
|
|
||||||
"8107 1.0 5.0 3.637779 \n",
|
|
||||||
"9083 1.0 5.0 3.637779 \n",
|
|
||||||
"9094 1.0 5.0 3.637779 \n",
|
|
||||||
"9096 1.0 5.0 3.637779 "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 204,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"movies_df.sort_values(by=\"mean_rating\", ascending=False).head(5)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 201,
|
|
||||||
"id": "f669fb09",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>movieId</th>\n",
|
|
||||||
" <th>title</th>\n",
|
|
||||||
" <th>genres</th>\n",
|
|
||||||
" <th>num_ratings</th>\n",
|
|
||||||
" <th>mean_rating</th>\n",
|
|
||||||
" <th>damped_mean_rating</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>277</th>\n",
|
|
||||||
" <td>318</td>\n",
|
|
||||||
" <td>Shawshank Redemption, The (1994)</td>\n",
|
|
||||||
" <td>Crime|Drama</td>\n",
|
|
||||||
" <td>317.0</td>\n",
|
|
||||||
" <td>4.429022</td>\n",
|
|
||||||
" <td>4.400659</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>659</th>\n",
|
|
||||||
" <td>858</td>\n",
|
|
||||||
" <td>Godfather, The (1972)</td>\n",
|
|
||||||
" <td>Crime|Drama</td>\n",
|
|
||||||
" <td>192.0</td>\n",
|
|
||||||
" <td>4.289062</td>\n",
|
|
||||||
" <td>4.250077</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2226</th>\n",
|
|
||||||
" <td>2959</td>\n",
|
|
||||||
" <td>Fight Club (1999)</td>\n",
|
|
||||||
" <td>Action|Crime|Drama|Thriller</td>\n",
|
|
||||||
" <td>218.0</td>\n",
|
|
||||||
" <td>4.272936</td>\n",
|
|
||||||
" <td>4.239103</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>922</th>\n",
|
|
||||||
" <td>1221</td>\n",
|
|
||||||
" <td>Godfather: Part II, The (1974)</td>\n",
|
|
||||||
" <td>Crime|Drama</td>\n",
|
|
||||||
" <td>129.0</td>\n",
|
|
||||||
" <td>4.259690</td>\n",
|
|
||||||
" <td>4.205148</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>46</th>\n",
|
|
||||||
" <td>50</td>\n",
|
|
||||||
" <td>Usual Suspects, The (1995)</td>\n",
|
|
||||||
" <td>Crime|Mystery|Thriller</td>\n",
|
|
||||||
" <td>204.0</td>\n",
|
|
||||||
" <td>4.237745</td>\n",
|
|
||||||
" <td>4.203344</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>224</th>\n",
|
|
||||||
" <td>260</td>\n",
|
|
||||||
" <td>Star Wars: Episode IV - A New Hope (1977)</td>\n",
|
|
||||||
" <td>Action|Adventure|Sci-Fi</td>\n",
|
|
||||||
" <td>251.0</td>\n",
|
|
||||||
" <td>4.231076</td>\n",
|
|
||||||
" <td>4.203125</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>602</th>\n",
|
|
||||||
" <td>750</td>\n",
|
|
||||||
" <td>Dr. Strangelove or: How I Learned to Stop Worr...</td>\n",
|
|
||||||
" <td>Comedy|War</td>\n",
|
|
||||||
" <td>97.0</td>\n",
|
|
||||||
" <td>4.268041</td>\n",
|
|
||||||
" <td>4.196407</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>914</th>\n",
|
|
||||||
" <td>1213</td>\n",
|
|
||||||
" <td>Goodfellas (1990)</td>\n",
|
|
||||||
" <td>Crime|Drama</td>\n",
|
|
||||||
" <td>126.0</td>\n",
|
|
||||||
" <td>4.250000</td>\n",
|
|
||||||
" <td>4.194967</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>461</th>\n",
|
|
||||||
" <td>527</td>\n",
|
|
||||||
" <td>Schindler's List (1993)</td>\n",
|
|
||||||
" <td>Drama|War</td>\n",
|
|
||||||
" <td>220.0</td>\n",
|
|
||||||
" <td>4.225000</td>\n",
|
|
||||||
" <td>4.193546</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>6710</th>\n",
|
|
||||||
" <td>58559</td>\n",
|
|
||||||
" <td>Dark Knight, The (2008)</td>\n",
|
|
||||||
" <td>Action|Crime|Drama|IMAX</td>\n",
|
|
||||||
" <td>149.0</td>\n",
|
|
||||||
" <td>4.238255</td>\n",
|
|
||||||
" <td>4.191922</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" movieId title \n",
|
|
||||||
"277 318 Shawshank Redemption, The (1994) \\\n",
|
|
||||||
"659 858 Godfather, The (1972) \n",
|
|
||||||
"2226 2959 Fight Club (1999) \n",
|
|
||||||
"922 1221 Godfather: Part II, The (1974) \n",
|
|
||||||
"46 50 Usual Suspects, The (1995) \n",
|
|
||||||
"224 260 Star Wars: Episode IV - A New Hope (1977) \n",
|
|
||||||
"602 750 Dr. Strangelove or: How I Learned to Stop Worr... \n",
|
|
||||||
"914 1213 Goodfellas (1990) \n",
|
|
||||||
"461 527 Schindler's List (1993) \n",
|
|
||||||
"6710 58559 Dark Knight, The (2008) \n",
|
|
||||||
"\n",
|
|
||||||
" genres num_ratings mean_rating \n",
|
|
||||||
"277 Crime|Drama 317.0 4.429022 \\\n",
|
|
||||||
"659 Crime|Drama 192.0 4.289062 \n",
|
|
||||||
"2226 Action|Crime|Drama|Thriller 218.0 4.272936 \n",
|
|
||||||
"922 Crime|Drama 129.0 4.259690 \n",
|
|
||||||
"46 Crime|Mystery|Thriller 204.0 4.237745 \n",
|
|
||||||
"224 Action|Adventure|Sci-Fi 251.0 4.231076 \n",
|
|
||||||
"602 Comedy|War 97.0 4.268041 \n",
|
|
||||||
"914 Crime|Drama 126.0 4.250000 \n",
|
|
||||||
"461 Drama|War 220.0 4.225000 \n",
|
|
||||||
"6710 Action|Crime|Drama|IMAX 149.0 4.238255 \n",
|
|
||||||
"\n",
|
|
||||||
" damped_mean_rating \n",
|
|
||||||
"277 4.400659 \n",
|
|
||||||
"659 4.250077 \n",
|
|
||||||
"2226 4.239103 \n",
|
|
||||||
"922 4.205148 \n",
|
|
||||||
"46 4.203344 \n",
|
|
||||||
"224 4.203125 \n",
|
|
||||||
"602 4.196407 \n",
|
|
||||||
"914 4.194967 \n",
|
|
||||||
"461 4.193546 \n",
|
|
||||||
"6710 4.191922 "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 201,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"movies_df.sort_values(by=\"damped_mean_rating\", ascending=False).head(10)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.8.16"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -1,176 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "25aa1c78",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "107e909b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Load the transactions data\n",
|
|
||||||
"transactions = pd.read_csv(\"grocery_dataset.csv\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"id": "289a9751",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"itemDescription\n",
|
|
||||||
"whole milk 515.0\n",
|
|
||||||
"other vegetables 361.0\n",
|
|
||||||
"rolls/buns 344.0\n",
|
|
||||||
"soda 271.0\n",
|
|
||||||
"yogurt 242.0\n",
|
|
||||||
"dtype: float64"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"member_purchases = transactions.groupby(['Member_number', 'itemDescription'])['itemDescription'].count().unstack().fillna(0)\n",
|
|
||||||
"item_descriptions = member_purchases.columns\n",
|
|
||||||
"\n",
|
|
||||||
"def simple_association(item_name):\n",
|
|
||||||
" item_basket = member_purchases[member_purchases[item_name] > 0]\n",
|
|
||||||
" co_purchase_counts = item_basket.sum().sort_values(ascending=False).drop(item_name)\n",
|
|
||||||
" return co_purchase_counts.head(5)\n",
|
|
||||||
"\n",
|
|
||||||
"ex_item = item_descriptions[20]\n",
|
|
||||||
"simple_association(ex_item)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "190a1485",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Top 10 recommendations for soda:\n",
|
|
||||||
"\n",
|
|
||||||
"itemDescription\n",
|
|
||||||
"oil 1.246844\n",
|
|
||||||
"beverages 1.162678\n",
|
|
||||||
"sausage 1.014975\n",
|
|
||||||
"grapes 1.001195\n",
|
|
||||||
"shopping bags 0.95459\n",
|
|
||||||
"frozen meals 0.943642\n",
|
|
||||||
"specialty bar 0.936182\n",
|
|
||||||
"butter 0.918418\n",
|
|
||||||
"candy 0.910056\n",
|
|
||||||
"specialty chocolate 0.904846\n",
|
|
||||||
"Name: soda, dtype: object \n",
|
|
||||||
"\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# Function to create a transaction matrix\n",
|
|
||||||
"def create_transaction_matrix(transactions):\n",
|
|
||||||
" # Group the transactions by member number, date and item description\n",
|
|
||||||
" # Count the number of each item bought by each member on each date\n",
|
|
||||||
" # Unstack the item descriptions to create a matrix where rows are transactions and columns are items\n",
|
|
||||||
" # Fill any missing values with 0\n",
|
|
||||||
" # Set the index to be the member number and date\n",
|
|
||||||
" basket = (transactions.groupby(['Member_number', 'Date', 'itemDescription'])['itemDescription']\n",
|
|
||||||
" .count().unstack().reset_index().fillna(0)\n",
|
|
||||||
" .set_index(['Member_number', 'Date']))\n",
|
|
||||||
" \n",
|
|
||||||
" # Convert the counts to True or False\n",
|
|
||||||
" # True if the item was bought in the transaction, False otherwise\n",
|
|
||||||
" return basket.applymap(lambda x: True if x > 0 else False)\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to calculate a lift matrix\n",
|
|
||||||
"def calculate_lift_matrix(basket_sets, min_joint_probability=0.001):\n",
|
|
||||||
" # Calculate the joint probability of each pair of items\n",
|
|
||||||
" probability_pair = pd.DataFrame(index=basket_sets.columns, columns=basket_sets.columns)\n",
|
|
||||||
" for item1 in basket_sets.columns:\n",
|
|
||||||
" for item2 in basket_sets.columns:\n",
|
|
||||||
" joint_probability = (basket_sets[item1] & basket_sets[item2]).sum() / len(basket_sets)\n",
|
|
||||||
" if joint_probability > min_joint_probability:\n",
|
|
||||||
" probability_pair.loc[item1, item2] = joint_probability\n",
|
|
||||||
" else:\n",
|
|
||||||
" probability_pair.loc[item1, item2] = 0\n",
|
|
||||||
"\n",
|
|
||||||
" # Set the diagonal of the joint probability matrix to 0\n",
|
|
||||||
" np.fill_diagonal(probability_pair.values, 0)\n",
|
|
||||||
"\n",
|
|
||||||
" # Calculate the individual probability of each item\n",
|
|
||||||
" probability_item = basket_sets.mean()\n",
|
|
||||||
"\n",
|
|
||||||
" # Calculate the product of the individual probabilities\n",
|
|
||||||
" probability_product = np.outer(probability_item, probability_item)\n",
|
|
||||||
"\n",
|
|
||||||
" # Calculate the lift of each pair of items\n",
|
|
||||||
" lift_matrix = probability_pair.divide(probability_product, fill_value=0)\n",
|
|
||||||
" \n",
|
|
||||||
" return lift_matrix\n",
|
|
||||||
"\n",
|
|
||||||
"# Function to recommend items\n",
|
|
||||||
"def recommend_items(lift_matrix, item, num_recommendations=10):\n",
|
|
||||||
" # Sort the items in the lift matrix for the given item in descending order\n",
|
|
||||||
" # Take the top num_recommendations items\n",
|
|
||||||
" recommended_for_item = lift_matrix[item].sort_values(ascending=False).head(num_recommendations)\n",
|
|
||||||
" \n",
|
|
||||||
" # Print the recommended items\n",
|
|
||||||
" print(f\"Top {num_recommendations} recommendations for {item}:\\n\")\n",
|
|
||||||
" print(recommended_for_item, \"\\n\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Create transaction matrix\n",
|
|
||||||
"basket_sets = create_transaction_matrix(transactions)\n",
|
|
||||||
"\n",
|
|
||||||
"# Calculate the lift matrix\n",
|
|
||||||
"lift_matrix = calculate_lift_matrix(basket_sets)\n",
|
|
||||||
"\n",
|
|
||||||
"# Recommend items for 'soda'\n",
|
|
||||||
"recommend_items(lift_matrix, 'soda')"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.8.16"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "b0c33033",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.8.16"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -1,8 +1,46 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Non-Personalized Recommender Systems: Popularity Based"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 207,
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os \n",
|
||||||
|
"\n",
|
||||||
|
"if os.path.exists('movielens_small.zip'):\n",
|
||||||
|
" !wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip \n",
|
||||||
|
" !unzip ml-latest-small.zip\n",
|
||||||
|
" !rm ml-latest-small.zip\n",
|
||||||
|
" !mv ml-latest-small movielens_small"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Damped Mean\n",
|
||||||
|
"\n",
|
||||||
|
"$$ s(i) = \\frac{\\Sigma_{u \\in U_i} r_i + a \\times \\mu}{|U_i| + a} $$\n",
|
||||||
|
"\n",
|
||||||
|
"Where:\n",
|
||||||
|
"- $ s(i) $: The damped mean rating for item $ i $.\n",
|
||||||
|
"- $ \\Sigma_{u \\in U_i} r_i $: Sum of the ratings given to item $ i $ by the users $ u \\in U_i $ who rated it.\n",
|
||||||
|
"- $ a $: Damping factor, a value that determines the extent of smoothing.\n",
|
||||||
|
"- $ \\mu $: Global mean rating across all items.\n",
|
||||||
|
"- $ |U_i| $: Total number of ratings for item $ i $.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
"id": "937dd4ed",
|
"id": "937dd4ed",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@@ -36,6 +74,13 @@
|
|||||||
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
|
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Let's see how using num_ratings compares to mean rating & damped mean rating."
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 198,
|
"execution_count": 198,
|
||||||
@@ -450,7 +495,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.16"
|
"version": "3.11.3"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
@@ -2,9 +2,124 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 1,
|
||||||
"id": "b0c33033",
|
"id": "b0c33033",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Posts sorted by Reddit's 'Hot' score:\n",
|
||||||
|
" post_id post_title upvotes \n",
|
||||||
|
"9 10 Difference between CNN and RNN? 350 \\\n",
|
||||||
|
"8 9 The future of quantum computing in AI 600 \n",
|
||||||
|
"7 8 Experience with multi-modal learning? 450 \n",
|
||||||
|
"6 7 Looking for resources on probabilistic program... 700 \n",
|
||||||
|
"4 5 Tips for handling imbalanced datasets? 1100 \n",
|
||||||
|
"2 3 Has anyone tried the new reinforcement learnin... 900 \n",
|
||||||
|
"3 4 Discussion: Evolutionary algorithms vs gradien... 800 \n",
|
||||||
|
"5 6 Which GPU is best for neural network training? 300 \n",
|
||||||
|
"0 1 How do I start with machine learning? 600 \n",
|
||||||
|
"1 2 Best practices for deep learning optimization? 400 \n",
|
||||||
|
"\n",
|
||||||
|
" downvotes age_in_seconds age_in_hours reddit_hot hacker_news \n",
|
||||||
|
"9 50 256000 71.111111 8.166010 0.042205 \n",
|
||||||
|
"8 50 128000 35.555556 5.584807 0.227638 \n",
|
||||||
|
"7 50 64000 17.777778 4.024282 0.559318 \n",
|
||||||
|
"6 50 32000 8.888889 3.524024 2.416714 \n",
|
||||||
|
"4 100 8000 2.222222 3.177778 18.779258 \n",
|
||||||
|
"2 100 2000 0.555556 2.947534 38.776074 \n",
|
||||||
|
"3 100 4000 1.111111 2.933987 24.453093 \n",
|
||||||
|
"5 50 16000 4.444444 2.753496 2.886859 \n",
|
||||||
|
"0 100 500 0.138889 2.710081 36.655710 \n",
|
||||||
|
"1 50 1000 0.277778 2.566290 24.588946 \n",
|
||||||
|
"\n",
|
||||||
|
"Posts sorted by Hacker News score:\n",
|
||||||
|
" post_id post_title upvotes \n",
|
||||||
|
"2 3 Has anyone tried the new reinforcement learnin... 900 \\\n",
|
||||||
|
"0 1 How do I start with machine learning? 600 \n",
|
||||||
|
"1 2 Best practices for deep learning optimization? 400 \n",
|
||||||
|
"3 4 Discussion: Evolutionary algorithms vs gradien... 800 \n",
|
||||||
|
"4 5 Tips for handling imbalanced datasets? 1100 \n",
|
||||||
|
"5 6 Which GPU is best for neural network training? 300 \n",
|
||||||
|
"6 7 Looking for resources on probabilistic program... 700 \n",
|
||||||
|
"7 8 Experience with multi-modal learning? 450 \n",
|
||||||
|
"8 9 The future of quantum computing in AI 600 \n",
|
||||||
|
"9 10 Difference between CNN and RNN? 350 \n",
|
||||||
|
"\n",
|
||||||
|
" downvotes age_in_seconds age_in_hours reddit_hot hacker_news \n",
|
||||||
|
"2 100 2000 0.555556 2.947534 38.776074 \n",
|
||||||
|
"0 100 500 0.138889 2.710081 36.655710 \n",
|
||||||
|
"1 50 1000 0.277778 2.566290 24.588946 \n",
|
||||||
|
"3 100 4000 1.111111 2.933987 24.453093 \n",
|
||||||
|
"4 100 8000 2.222222 3.177778 18.779258 \n",
|
||||||
|
"5 50 16000 4.444444 2.753496 2.886859 \n",
|
||||||
|
"6 50 32000 8.888889 3.524024 2.416714 \n",
|
||||||
|
"7 50 64000 17.777778 4.024282 0.559318 \n",
|
||||||
|
"8 50 128000 35.555556 5.584807 0.227638 \n",
|
||||||
|
"9 50 256000 71.111111 8.166010 0.042205 \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import math\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"data = {\n",
|
||||||
|
" 'post_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
|
||||||
|
" 'post_title': [\n",
|
||||||
|
" \"How do I start with machine learning?\",\n",
|
||||||
|
" \"Best practices for deep learning optimization?\",\n",
|
||||||
|
" \"Has anyone tried the new reinforcement learning library?\",\n",
|
||||||
|
" \"Discussion: Evolutionary algorithms vs gradient descent\",\n",
|
||||||
|
" \"Tips for handling imbalanced datasets?\",\n",
|
||||||
|
" \"Which GPU is best for neural network training?\",\n",
|
||||||
|
" \"Looking for resources on probabilistic programming\",\n",
|
||||||
|
" \"Experience with multi-modal learning?\",\n",
|
||||||
|
" \"The future of quantum computing in AI\",\n",
|
||||||
|
" \"Difference between CNN and RNN?\"\n",
|
||||||
|
" ],\n",
|
||||||
|
" 'upvotes': [600, 400, 900, 800, 1100, 300, 700, 450, 600, 350],\n",
|
||||||
|
" 'downvotes': [100, 50, 100, 100, 100, 50, 50, 50, 50, 50],\n",
|
||||||
|
" 'age_in_seconds': [500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000]\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Convert to DataFrame\n",
|
||||||
|
"reddit_df = pd.DataFrame(data)\n",
|
||||||
|
"\n",
|
||||||
|
"# Calculate age in hours from age in seconds\n",
|
||||||
|
"reddit_df['age_in_hours'] = reddit_df['age_in_seconds'] / 3600\n",
|
||||||
|
"\n",
|
||||||
|
"# Reddit's \"Hot\" formula (NOTE: t is passed the post's age in seconds below, so when U > D older posts score higher)\n",
|
||||||
|
"def reddit_hot(U, D, t):\n",
|
||||||
|
" return math.log10(max(abs(U-D), 1)) + np.sign(U-D) * t / 45000\n",
|
||||||
|
"\n",
|
||||||
|
"# Modified Hacker News formula\n",
|
||||||
|
"def hacker_news(U, D, T, P=1, alpha=0.8, gamma=1.8):\n",
|
||||||
|
" return P * pow((U - D - 1), alpha) / pow((T + 2), gamma)\n",
|
||||||
|
"\n",
|
||||||
|
"# Apply the formulas\n",
|
||||||
|
"reddit_df['reddit_hot'] = reddit_df.apply(lambda row: reddit_hot(row['upvotes'], row['downvotes'], row['age_in_seconds']), axis=1)\n",
|
||||||
|
"reddit_df['hacker_news'] = reddit_df.apply(lambda row: hacker_news(row['upvotes'], row['downvotes'], row['age_in_hours']), axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"# Sort by Reddit's \"Hot\" score and print\n",
|
||||||
|
"reddit_df_sorted = reddit_df.sort_values(by='reddit_hot', ascending=False)\n",
|
||||||
|
"print(\"Posts sorted by Reddit's 'Hot' score:\")\n",
|
||||||
|
"print(reddit_df_sorted)\n",
|
||||||
|
"\n",
|
||||||
|
"# Sort by Hacker News score and print\n",
|
||||||
|
"hacker_news_df_sorted = reddit_df.sort_values(by='hacker_news', ascending=False)\n",
|
||||||
|
"print(\"\\nPosts sorted by Hacker News score:\")\n",
|
||||||
|
"print(hacker_news_df_sorted)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": []
|
||||||
}
|
}
|
||||||
@@ -25,7 +140,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.16"
|
"version": "3.11.3"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
Reference in New Issue
Block a user