mirror of
https://github.com/aladdinpersson/Machine-Learning-Collection.git
synced 2026-02-21 11:18:01 +00:00
added intro folder, non-personalized recsys code
This commit is contained in:
@@ -0,0 +1,458 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 207,
|
||||
"id": "937dd4ed",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"def load_data():\n",
|
||||
" # Load the MovieLens data\n",
|
||||
" movies_df = pd.read_csv(\"movielens_small/movies.csv\")\n",
|
||||
" ratings_df = pd.read_csv(\"movielens_small/ratings.csv\")\n",
|
||||
" return movies_df, ratings_df\n",
|
||||
"\n",
|
||||
"def calculate_popularity(movies_df, ratings_df, damping_factor=5):\n",
|
||||
" # Calculate the number of ratings and the mean rating for each movie, plus the global mean rating\n",
|
||||
" num_ratings = ratings_df.groupby(\"movieId\")[\"rating\"].count()\n",
|
||||
" mean_rating = ratings_df.groupby(\"movieId\")[\"rating\"].mean()\n",
|
||||
" global_mean = ratings_df[\"rating\"].mean()\n",
|
||||
" \n",
|
||||
" # Calculate the damped mean rating for each movie\n",
|
||||
" damped_numerator = num_ratings * mean_rating + damping_factor * global_mean\n",
|
||||
" damped_denominator = num_ratings + damping_factor\n",
|
||||
" damped_mean_rating = damped_numerator / damped_denominator\n",
|
||||
" \n",
|
||||
" # Add the popularity data to the movie data\n",
|
||||
" movies_df['num_ratings'] = movies_df['movieId'].map(num_ratings)\n",
|
||||
" movies_df['mean_rating'] = movies_df['movieId'].map(mean_rating)\n",
|
||||
" movies_df['damped_mean_rating'] = movies_df['movieId'].map(damped_mean_rating)\n",
|
||||
" return movies_df\n",
|
||||
"\n",
|
||||
"movies_df, ratings_df = load_data()\n",
|
||||
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 198,
|
||||
"id": "7e649c6f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>movieId</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>genres</th>\n",
|
||||
" <th>num_ratings</th>\n",
|
||||
" <th>mean_rating</th>\n",
|
||||
" <th>damped_mean_rating</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>314</th>\n",
|
||||
" <td>356</td>\n",
|
||||
" <td>Forrest Gump (1994)</td>\n",
|
||||
" <td>Comedy|Drama|Romance|War</td>\n",
|
||||
" <td>329.0</td>\n",
|
||||
" <td>4.164134</td>\n",
|
||||
" <td>4.144589</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>277</th>\n",
|
||||
" <td>318</td>\n",
|
||||
" <td>Shawshank Redemption, The (1994)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>317.0</td>\n",
|
||||
" <td>4.429022</td>\n",
|
||||
" <td>4.400659</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>257</th>\n",
|
||||
" <td>296</td>\n",
|
||||
" <td>Pulp Fiction (1994)</td>\n",
|
||||
" <td>Comedy|Crime|Drama|Thriller</td>\n",
|
||||
" <td>307.0</td>\n",
|
||||
" <td>4.197068</td>\n",
|
||||
" <td>4.175128</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>510</th>\n",
|
||||
" <td>593</td>\n",
|
||||
" <td>Silence of the Lambs, The (1991)</td>\n",
|
||||
" <td>Crime|Horror|Thriller</td>\n",
|
||||
" <td>279.0</td>\n",
|
||||
" <td>4.161290</td>\n",
|
||||
" <td>4.138462</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1939</th>\n",
|
||||
" <td>2571</td>\n",
|
||||
" <td>Matrix, The (1999)</td>\n",
|
||||
" <td>Action|Sci-Fi|Thriller</td>\n",
|
||||
" <td>278.0</td>\n",
|
||||
" <td>4.192446</td>\n",
|
||||
" <td>4.168457</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" movieId title genres \n",
|
||||
"314 356 Forrest Gump (1994) Comedy|Drama|Romance|War \\\n",
|
||||
"277 318 Shawshank Redemption, The (1994) Crime|Drama \n",
|
||||
"257 296 Pulp Fiction (1994) Comedy|Crime|Drama|Thriller \n",
|
||||
"510 593 Silence of the Lambs, The (1991) Crime|Horror|Thriller \n",
|
||||
"1939 2571 Matrix, The (1999) Action|Sci-Fi|Thriller \n",
|
||||
"\n",
|
||||
" num_ratings mean_rating damped_mean_rating \n",
|
||||
"314 329.0 4.164134 4.144589 \n",
|
||||
"277 317.0 4.429022 4.400659 \n",
|
||||
"257 307.0 4.197068 4.175128 \n",
|
||||
"510 279.0 4.161290 4.138462 \n",
|
||||
"1939 278.0 4.192446 4.168457 "
|
||||
]
|
||||
},
|
||||
"execution_count": 198,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"movies_df.sort_values(by=\"num_ratings\", ascending=False).head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 204,
|
||||
"id": "c6ef332e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>movieId</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>genres</th>\n",
|
||||
" <th>num_ratings</th>\n",
|
||||
" <th>mean_rating</th>\n",
|
||||
" <th>damped_mean_rating</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>7656</th>\n",
|
||||
" <td>88448</td>\n",
|
||||
" <td>Paper Birds (Pájaros de papel) (2010)</td>\n",
|
||||
" <td>Comedy|Drama</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8107</th>\n",
|
||||
" <td>100556</td>\n",
|
||||
" <td>Act of Killing, The (2012)</td>\n",
|
||||
" <td>Documentary</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9083</th>\n",
|
||||
" <td>143031</td>\n",
|
||||
" <td>Jump In! (2007)</td>\n",
|
||||
" <td>Comedy|Drama|Romance</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9094</th>\n",
|
||||
" <td>143511</td>\n",
|
||||
" <td>Human (2015)</td>\n",
|
||||
" <td>Documentary</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9096</th>\n",
|
||||
" <td>143559</td>\n",
|
||||
" <td>L.A. Slasher (2015)</td>\n",
|
||||
" <td>Comedy|Crime|Fantasy</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" movieId title genres \n",
|
||||
"7656 88448 Paper Birds (Pájaros de papel) (2010) Comedy|Drama \\\n",
|
||||
"8107 100556 Act of Killing, The (2012) Documentary \n",
|
||||
"9083 143031 Jump In! (2007) Comedy|Drama|Romance \n",
|
||||
"9094 143511 Human (2015) Documentary \n",
|
||||
"9096 143559 L.A. Slasher (2015) Comedy|Crime|Fantasy \n",
|
||||
"\n",
|
||||
" num_ratings mean_rating damped_mean_rating \n",
|
||||
"7656 1.0 5.0 3.637779 \n",
|
||||
"8107 1.0 5.0 3.637779 \n",
|
||||
"9083 1.0 5.0 3.637779 \n",
|
||||
"9094 1.0 5.0 3.637779 \n",
|
||||
"9096 1.0 5.0 3.637779 "
|
||||
]
|
||||
},
|
||||
"execution_count": 204,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"movies_df.sort_values(by=\"mean_rating\", ascending=False).head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 201,
|
||||
"id": "f669fb09",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>movieId</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>genres</th>\n",
|
||||
" <th>num_ratings</th>\n",
|
||||
" <th>mean_rating</th>\n",
|
||||
" <th>damped_mean_rating</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>277</th>\n",
|
||||
" <td>318</td>\n",
|
||||
" <td>Shawshank Redemption, The (1994)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>317.0</td>\n",
|
||||
" <td>4.429022</td>\n",
|
||||
" <td>4.400659</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>659</th>\n",
|
||||
" <td>858</td>\n",
|
||||
" <td>Godfather, The (1972)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>192.0</td>\n",
|
||||
" <td>4.289062</td>\n",
|
||||
" <td>4.250077</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2226</th>\n",
|
||||
" <td>2959</td>\n",
|
||||
" <td>Fight Club (1999)</td>\n",
|
||||
" <td>Action|Crime|Drama|Thriller</td>\n",
|
||||
" <td>218.0</td>\n",
|
||||
" <td>4.272936</td>\n",
|
||||
" <td>4.239103</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>922</th>\n",
|
||||
" <td>1221</td>\n",
|
||||
" <td>Godfather: Part II, The (1974)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>129.0</td>\n",
|
||||
" <td>4.259690</td>\n",
|
||||
" <td>4.205148</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>46</th>\n",
|
||||
" <td>50</td>\n",
|
||||
" <td>Usual Suspects, The (1995)</td>\n",
|
||||
" <td>Crime|Mystery|Thriller</td>\n",
|
||||
" <td>204.0</td>\n",
|
||||
" <td>4.237745</td>\n",
|
||||
" <td>4.203344</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>224</th>\n",
|
||||
" <td>260</td>\n",
|
||||
" <td>Star Wars: Episode IV - A New Hope (1977)</td>\n",
|
||||
" <td>Action|Adventure|Sci-Fi</td>\n",
|
||||
" <td>251.0</td>\n",
|
||||
" <td>4.231076</td>\n",
|
||||
" <td>4.203125</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>602</th>\n",
|
||||
" <td>750</td>\n",
|
||||
" <td>Dr. Strangelove or: How I Learned to Stop Worr...</td>\n",
|
||||
" <td>Comedy|War</td>\n",
|
||||
" <td>97.0</td>\n",
|
||||
" <td>4.268041</td>\n",
|
||||
" <td>4.196407</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>914</th>\n",
|
||||
" <td>1213</td>\n",
|
||||
" <td>Goodfellas (1990)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>126.0</td>\n",
|
||||
" <td>4.250000</td>\n",
|
||||
" <td>4.194967</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>461</th>\n",
|
||||
" <td>527</td>\n",
|
||||
" <td>Schindler's List (1993)</td>\n",
|
||||
" <td>Drama|War</td>\n",
|
||||
" <td>220.0</td>\n",
|
||||
" <td>4.225000</td>\n",
|
||||
" <td>4.193546</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6710</th>\n",
|
||||
" <td>58559</td>\n",
|
||||
" <td>Dark Knight, The (2008)</td>\n",
|
||||
" <td>Action|Crime|Drama|IMAX</td>\n",
|
||||
" <td>149.0</td>\n",
|
||||
" <td>4.238255</td>\n",
|
||||
" <td>4.191922</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" movieId title \n",
|
||||
"277 318 Shawshank Redemption, The (1994) \\\n",
|
||||
"659 858 Godfather, The (1972) \n",
|
||||
"2226 2959 Fight Club (1999) \n",
|
||||
"922 1221 Godfather: Part II, The (1974) \n",
|
||||
"46 50 Usual Suspects, The (1995) \n",
|
||||
"224 260 Star Wars: Episode IV - A New Hope (1977) \n",
|
||||
"602 750 Dr. Strangelove or: How I Learned to Stop Worr... \n",
|
||||
"914 1213 Goodfellas (1990) \n",
|
||||
"461 527 Schindler's List (1993) \n",
|
||||
"6710 58559 Dark Knight, The (2008) \n",
|
||||
"\n",
|
||||
" genres num_ratings mean_rating \n",
|
||||
"277 Crime|Drama 317.0 4.429022 \\\n",
|
||||
"659 Crime|Drama 192.0 4.289062 \n",
|
||||
"2226 Action|Crime|Drama|Thriller 218.0 4.272936 \n",
|
||||
"922 Crime|Drama 129.0 4.259690 \n",
|
||||
"46 Crime|Mystery|Thriller 204.0 4.237745 \n",
|
||||
"224 Action|Adventure|Sci-Fi 251.0 4.231076 \n",
|
||||
"602 Comedy|War 97.0 4.268041 \n",
|
||||
"914 Crime|Drama 126.0 4.250000 \n",
|
||||
"461 Drama|War 220.0 4.225000 \n",
|
||||
"6710 Action|Crime|Drama|IMAX 149.0 4.238255 \n",
|
||||
"\n",
|
||||
" damped_mean_rating \n",
|
||||
"277 4.400659 \n",
|
||||
"659 4.250077 \n",
|
||||
"2226 4.239103 \n",
|
||||
"922 4.205148 \n",
|
||||
"46 4.203344 \n",
|
||||
"224 4.203125 \n",
|
||||
"602 4.196407 \n",
|
||||
"914 4.194967 \n",
|
||||
"461 4.193546 \n",
|
||||
"6710 4.191922 "
|
||||
]
|
||||
},
|
||||
"execution_count": 201,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"movies_df.sort_values(by=\"damped_mean_rating\", ascending=False).head(10)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,176 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "25aa1c78",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "107e909b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the transactions data\n",
|
||||
"transactions = pd.read_csv(\"grocery_dataset.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "289a9751",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"itemDescription\n",
|
||||
"whole milk 515.0\n",
|
||||
"other vegetables 361.0\n",
|
||||
"rolls/buns 344.0\n",
|
||||
"soda 271.0\n",
|
||||
"yogurt 242.0\n",
|
||||
"dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"member_purchases = transactions.groupby(['Member_number', 'itemDescription'])['itemDescription'].count().unstack().fillna(0)\n",
|
||||
"item_descriptions = member_purchases.columns\n",
|
||||
"\n",
|
||||
"def simple_association(item_name):\n",
|
||||
" item_basket = member_purchases[member_purchases[item_name] > 0]\n",
|
||||
" co_purchase_counts = item_basket.sum().sort_values(ascending=False).drop(item_name)\n",
|
||||
" return co_purchase_counts.head(5)\n",
|
||||
"\n",
|
||||
"ex_item = item_descriptions[20]\n",
|
||||
"simple_association(ex_item)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "190a1485",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Top 10 recommendations for soda:\n",
|
||||
"\n",
|
||||
"itemDescription\n",
|
||||
"oil 1.246844\n",
|
||||
"beverages 1.162678\n",
|
||||
"sausage 1.014975\n",
|
||||
"grapes 1.001195\n",
|
||||
"shopping bags 0.95459\n",
|
||||
"frozen meals 0.943642\n",
|
||||
"specialty bar 0.936182\n",
|
||||
"butter 0.918418\n",
|
||||
"candy 0.910056\n",
|
||||
"specialty chocolate 0.904846\n",
|
||||
"Name: soda, dtype: object \n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Function to create a transaction matrix\n",
|
||||
"def create_transaction_matrix(transactions):\n",
|
||||
" # Group the transactions by member number, date and item description\n",
|
||||
" # Count the number of each item bought by each member on each date\n",
|
||||
" # Unstack the item descriptions to create a matrix where rows are transactions and columns are items\n",
|
||||
" # Fill any missing values with 0\n",
|
||||
" # Set the index to be the member number and date\n",
|
||||
" basket = (transactions.groupby(['Member_number', 'Date', 'itemDescription'])['itemDescription']\n",
|
||||
" .count().unstack().reset_index().fillna(0)\n",
|
||||
" .set_index(['Member_number', 'Date']))\n",
|
||||
" \n",
|
||||
" # Convert the counts to True or False\n",
|
||||
" # True if the item was bought in the transaction, False otherwise\n",
|
||||
" return basket.applymap(lambda x: True if x > 0 else False)\n",
|
||||
"\n",
|
||||
"# Function to calculate a lift matrix\n",
|
||||
"def calculate_lift_matrix(basket_sets, min_joint_probability=0.001):\n",
|
||||
" # Calculate the joint probability of each pair of items\n",
|
||||
" probability_pair = pd.DataFrame(index=basket_sets.columns, columns=basket_sets.columns)\n",
|
||||
" for item1 in basket_sets.columns:\n",
|
||||
" for item2 in basket_sets.columns:\n",
|
||||
" joint_probability = (basket_sets[item1] & basket_sets[item2]).sum() / len(basket_sets)\n",
|
||||
" if joint_probability > min_joint_probability:\n",
|
||||
" probability_pair.loc[item1, item2] = joint_probability\n",
|
||||
" else:\n",
|
||||
" probability_pair.loc[item1, item2] = 0\n",
|
||||
"\n",
|
||||
" # Set the diagonal of the joint probability matrix to 0\n",
|
||||
" np.fill_diagonal(probability_pair.values, 0)\n",
|
||||
"\n",
|
||||
" # Calculate the individual probability of each item\n",
|
||||
" probability_item = basket_sets.mean()\n",
|
||||
"\n",
|
||||
" # Calculate the product of the individual probabilities\n",
|
||||
" probability_product = np.outer(probability_item, probability_item)\n",
|
||||
"\n",
|
||||
" # Calculate the lift of each pair of items\n",
|
||||
" lift_matrix = probability_pair.divide(probability_product, fill_value=0)\n",
|
||||
" \n",
|
||||
" return lift_matrix\n",
|
||||
"\n",
|
||||
"# Function to recommend items\n",
|
||||
"def recommend_items(lift_matrix, item, num_recommendations=10):\n",
|
||||
" # Sort the items in the lift matrix for the given item in descending order\n",
|
||||
" # Take the top num_recommendations items\n",
|
||||
" recommended_for_item = lift_matrix[item].sort_values(ascending=False).head(num_recommendations)\n",
|
||||
" \n",
|
||||
" # Print the recommended items\n",
|
||||
" print(f\"Top {num_recommendations} recommendations for {item}:\\n\")\n",
|
||||
" print(recommended_for_item, \"\\n\\n\")\n",
|
||||
"\n",
|
||||
"# Create transaction matrix\n",
|
||||
"basket_sets = create_transaction_matrix(transactions)\n",
|
||||
"\n",
|
||||
"# Calculate the lift matrix\n",
|
||||
"lift_matrix = calculate_lift_matrix(basket_sets)\n",
|
||||
"\n",
|
||||
"# Recommend items for 'soda'\n",
|
||||
"recommend_items(lift_matrix, 'soda')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b0c33033",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,153 @@
|
||||
Summary
|
||||
=======
|
||||
|
||||
This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.
|
||||
|
||||
Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.
|
||||
|
||||
The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follow.
|
||||
|
||||
This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.
|
||||
|
||||
This and other GroupLens data sets are publicly available for download at <http://grouplens.org/datasets/>.
|
||||
|
||||
|
||||
Usage License
|
||||
=============
|
||||
|
||||
Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. The data set may be used for any research purposes under the following conditions:
|
||||
|
||||
* The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group.
|
||||
* The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information).
|
||||
* The user may redistribute the data set, including transformations, so long as it is distributed under these same license conditions.
|
||||
* The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota.
|
||||
* The executable software scripts are provided "as is" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction.
|
||||
|
||||
In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate).
|
||||
|
||||
If you have any further questions or comments, please email <grouplens-info@umn.edu>
|
||||
|
||||
|
||||
Citation
|
||||
========
|
||||
|
||||
To acknowledge use of the dataset in publications, please cite the following paper:
|
||||
|
||||
> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>
|
||||
|
||||
|
||||
Further Information About GroupLens
|
||||
===================================
|
||||
|
||||
GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including:
|
||||
|
||||
* recommender systems
|
||||
* online communities
|
||||
* mobile and ubiquitous technologies
|
||||
* digital libraries
|
||||
* local geographic information systems
|
||||
|
||||
GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit <http://movielens.org> to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at <grouplens-info@cs.umn.edu> - we are always interested in working with external collaborators.
|
||||
|
||||
|
||||
Content and Use of Files
|
||||
========================
|
||||
|
||||
Formatting and Encoding
|
||||
-----------------------
|
||||
|
||||
The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8.
|
||||
|
||||
|
||||
User Ids
|
||||
--------
|
||||
|
||||
MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files).
|
||||
|
||||
|
||||
Movie Ids
|
||||
---------
|
||||
|
||||
Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL <https://movielens.org/movies/1>). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).
|
||||
|
||||
|
||||
Ratings Data File Structure (ratings.csv)
|
||||
-----------------------------------------
|
||||
|
||||
All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:
|
||||
|
||||
userId,movieId,rating,timestamp
|
||||
|
||||
The lines within this file are ordered first by userId, then, within user, by movieId.
|
||||
|
||||
Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).
|
||||
|
||||
Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
|
||||
|
||||
|
||||
Tags Data File Structure (tags.csv)
|
||||
-----------------------------------
|
||||
|
||||
All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format:
|
||||
|
||||
userId,movieId,tag,timestamp
|
||||
|
||||
The lines within this file are ordered first by userId, then, within user, by movieId.
|
||||
|
||||
Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag are determined by each user.
|
||||
|
||||
Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.
|
||||
|
||||
|
||||
Movies Data File Structure (movies.csv)
|
||||
---------------------------------------
|
||||
|
||||
Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:
|
||||
|
||||
movieId,title,genres
|
||||
|
||||
Movie titles are entered manually or imported from <https://www.themoviedb.org/>, and include the year of release in parentheses. Errors and inconsistencies may exist in these titles.
|
||||
|
||||
Genres are a pipe-separated list, and are selected from the following:
|
||||
|
||||
* Action
|
||||
* Adventure
|
||||
* Animation
|
||||
* Children's
|
||||
* Comedy
|
||||
* Crime
|
||||
* Documentary
|
||||
* Drama
|
||||
* Fantasy
|
||||
* Film-Noir
|
||||
* Horror
|
||||
* Musical
|
||||
* Mystery
|
||||
* Romance
|
||||
* Sci-Fi
|
||||
* Thriller
|
||||
* War
|
||||
* Western
|
||||
* (no genres listed)
|
||||
|
||||
|
||||
Links Data File Structure (links.csv)
|
||||
---------------------------------------
|
||||
|
||||
Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:
|
||||
|
||||
movieId,imdbId,tmdbId
|
||||
|
||||
movieId is an identifier for movies used by <https://movielens.org>. E.g., the movie Toy Story has the link <https://movielens.org/movies/1>.
|
||||
|
||||
imdbId is an identifier for movies used by <http://www.imdb.com>. E.g., the movie Toy Story has the link <http://www.imdb.com/title/tt0114709/>.
|
||||
|
||||
tmdbId is an identifier for movies used by <https://www.themoviedb.org>. E.g., the movie Toy Story has the link <https://www.themoviedb.org/movie/862>.
|
||||
|
||||
Use of the resources listed above is subject to the terms of each provider.
|
||||
|
||||
|
||||
Cross-Validation
|
||||
----------------
|
||||
|
||||
Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples.
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,458 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 207,
|
||||
"id": "937dd4ed",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"def load_data():\n",
|
||||
" # Load the MovieLens data\n",
|
||||
" movies_df = pd.read_csv(\"movielens_small/movies.csv\")\n",
|
||||
" ratings_df = pd.read_csv(\"movielens_small/ratings.csv\")\n",
|
||||
" return movies_df, ratings_df\n",
|
||||
"\n",
|
||||
"def calculate_popularity(movies_df, ratings_df, damping_factor=5):\n",
|
||||
" # Calculate the number of ratings and the mean rating for each movie, plus the global mean rating\n",
|
||||
" num_ratings = ratings_df.groupby(\"movieId\")[\"rating\"].count()\n",
|
||||
" mean_rating = ratings_df.groupby(\"movieId\")[\"rating\"].mean()\n",
|
||||
" global_mean = ratings_df[\"rating\"].mean()\n",
|
||||
" \n",
|
||||
" # Calculate the damped mean rating for each movie\n",
|
||||
" damped_numerator = num_ratings * mean_rating + damping_factor * global_mean\n",
|
||||
" damped_denominator = num_ratings + damping_factor\n",
|
||||
" damped_mean_rating = damped_numerator / damped_denominator\n",
|
||||
" \n",
|
||||
" # Add the popularity data to the movie data\n",
|
||||
" movies_df['num_ratings'] = movies_df['movieId'].map(num_ratings)\n",
|
||||
" movies_df['mean_rating'] = movies_df['movieId'].map(mean_rating)\n",
|
||||
" movies_df['damped_mean_rating'] = movies_df['movieId'].map(damped_mean_rating)\n",
|
||||
" return movies_df\n",
|
||||
"\n",
|
||||
"movies_df, ratings_df = load_data()\n",
|
||||
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 198,
|
||||
"id": "7e649c6f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>movieId</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>genres</th>\n",
|
||||
" <th>num_ratings</th>\n",
|
||||
" <th>mean_rating</th>\n",
|
||||
" <th>damped_mean_rating</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>314</th>\n",
|
||||
" <td>356</td>\n",
|
||||
" <td>Forrest Gump (1994)</td>\n",
|
||||
" <td>Comedy|Drama|Romance|War</td>\n",
|
||||
" <td>329.0</td>\n",
|
||||
" <td>4.164134</td>\n",
|
||||
" <td>4.144589</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>277</th>\n",
|
||||
" <td>318</td>\n",
|
||||
" <td>Shawshank Redemption, The (1994)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>317.0</td>\n",
|
||||
" <td>4.429022</td>\n",
|
||||
" <td>4.400659</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>257</th>\n",
|
||||
" <td>296</td>\n",
|
||||
" <td>Pulp Fiction (1994)</td>\n",
|
||||
" <td>Comedy|Crime|Drama|Thriller</td>\n",
|
||||
" <td>307.0</td>\n",
|
||||
" <td>4.197068</td>\n",
|
||||
" <td>4.175128</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>510</th>\n",
|
||||
" <td>593</td>\n",
|
||||
" <td>Silence of the Lambs, The (1991)</td>\n",
|
||||
" <td>Crime|Horror|Thriller</td>\n",
|
||||
" <td>279.0</td>\n",
|
||||
" <td>4.161290</td>\n",
|
||||
" <td>4.138462</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1939</th>\n",
|
||||
" <td>2571</td>\n",
|
||||
" <td>Matrix, The (1999)</td>\n",
|
||||
" <td>Action|Sci-Fi|Thriller</td>\n",
|
||||
" <td>278.0</td>\n",
|
||||
" <td>4.192446</td>\n",
|
||||
" <td>4.168457</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" movieId title genres \n",
|
||||
"314 356 Forrest Gump (1994) Comedy|Drama|Romance|War \\\n",
|
||||
"277 318 Shawshank Redemption, The (1994) Crime|Drama \n",
|
||||
"257 296 Pulp Fiction (1994) Comedy|Crime|Drama|Thriller \n",
|
||||
"510 593 Silence of the Lambs, The (1991) Crime|Horror|Thriller \n",
|
||||
"1939 2571 Matrix, The (1999) Action|Sci-Fi|Thriller \n",
|
||||
"\n",
|
||||
" num_ratings mean_rating damped_mean_rating \n",
|
||||
"314 329.0 4.164134 4.144589 \n",
|
||||
"277 317.0 4.429022 4.400659 \n",
|
||||
"257 307.0 4.197068 4.175128 \n",
|
||||
"510 279.0 4.161290 4.138462 \n",
|
||||
"1939 278.0 4.192446 4.168457 "
|
||||
]
|
||||
},
|
||||
"execution_count": 198,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"movies_df.sort_values(by=\"num_ratings\", ascending=False).head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 204,
|
||||
"id": "c6ef332e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>movieId</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>genres</th>\n",
|
||||
" <th>num_ratings</th>\n",
|
||||
" <th>mean_rating</th>\n",
|
||||
" <th>damped_mean_rating</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>7656</th>\n",
|
||||
" <td>88448</td>\n",
|
||||
" <td>Paper Birds (Pájaros de papel) (2010)</td>\n",
|
||||
" <td>Comedy|Drama</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8107</th>\n",
|
||||
" <td>100556</td>\n",
|
||||
" <td>Act of Killing, The (2012)</td>\n",
|
||||
" <td>Documentary</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9083</th>\n",
|
||||
" <td>143031</td>\n",
|
||||
" <td>Jump In! (2007)</td>\n",
|
||||
" <td>Comedy|Drama|Romance</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9094</th>\n",
|
||||
" <td>143511</td>\n",
|
||||
" <td>Human (2015)</td>\n",
|
||||
" <td>Documentary</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9096</th>\n",
|
||||
" <td>143559</td>\n",
|
||||
" <td>L.A. Slasher (2015)</td>\n",
|
||||
" <td>Comedy|Crime|Fantasy</td>\n",
|
||||
" <td>1.0</td>\n",
|
||||
" <td>5.0</td>\n",
|
||||
" <td>3.637779</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" movieId title genres \n",
|
||||
"7656 88448 Paper Birds (Pájaros de papel) (2010) Comedy|Drama \\\n",
|
||||
"8107 100556 Act of Killing, The (2012) Documentary \n",
|
||||
"9083 143031 Jump In! (2007) Comedy|Drama|Romance \n",
|
||||
"9094 143511 Human (2015) Documentary \n",
|
||||
"9096 143559 L.A. Slasher (2015) Comedy|Crime|Fantasy \n",
|
||||
"\n",
|
||||
" num_ratings mean_rating damped_mean_rating \n",
|
||||
"7656 1.0 5.0 3.637779 \n",
|
||||
"8107 1.0 5.0 3.637779 \n",
|
||||
"9083 1.0 5.0 3.637779 \n",
|
||||
"9094 1.0 5.0 3.637779 \n",
|
||||
"9096 1.0 5.0 3.637779 "
|
||||
]
|
||||
},
|
||||
"execution_count": 204,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"movies_df.sort_values(by=\"mean_rating\", ascending=False).head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 201,
|
||||
"id": "f669fb09",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>movieId</th>\n",
|
||||
" <th>title</th>\n",
|
||||
" <th>genres</th>\n",
|
||||
" <th>num_ratings</th>\n",
|
||||
" <th>mean_rating</th>\n",
|
||||
" <th>damped_mean_rating</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>277</th>\n",
|
||||
" <td>318</td>\n",
|
||||
" <td>Shawshank Redemption, The (1994)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>317.0</td>\n",
|
||||
" <td>4.429022</td>\n",
|
||||
" <td>4.400659</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>659</th>\n",
|
||||
" <td>858</td>\n",
|
||||
" <td>Godfather, The (1972)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>192.0</td>\n",
|
||||
" <td>4.289062</td>\n",
|
||||
" <td>4.250077</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2226</th>\n",
|
||||
" <td>2959</td>\n",
|
||||
" <td>Fight Club (1999)</td>\n",
|
||||
" <td>Action|Crime|Drama|Thriller</td>\n",
|
||||
" <td>218.0</td>\n",
|
||||
" <td>4.272936</td>\n",
|
||||
" <td>4.239103</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>922</th>\n",
|
||||
" <td>1221</td>\n",
|
||||
" <td>Godfather: Part II, The (1974)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>129.0</td>\n",
|
||||
" <td>4.259690</td>\n",
|
||||
" <td>4.205148</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>46</th>\n",
|
||||
" <td>50</td>\n",
|
||||
" <td>Usual Suspects, The (1995)</td>\n",
|
||||
" <td>Crime|Mystery|Thriller</td>\n",
|
||||
" <td>204.0</td>\n",
|
||||
" <td>4.237745</td>\n",
|
||||
" <td>4.203344</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>224</th>\n",
|
||||
" <td>260</td>\n",
|
||||
" <td>Star Wars: Episode IV - A New Hope (1977)</td>\n",
|
||||
" <td>Action|Adventure|Sci-Fi</td>\n",
|
||||
" <td>251.0</td>\n",
|
||||
" <td>4.231076</td>\n",
|
||||
" <td>4.203125</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>602</th>\n",
|
||||
" <td>750</td>\n",
|
||||
" <td>Dr. Strangelove or: How I Learned to Stop Worr...</td>\n",
|
||||
" <td>Comedy|War</td>\n",
|
||||
" <td>97.0</td>\n",
|
||||
" <td>4.268041</td>\n",
|
||||
" <td>4.196407</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>914</th>\n",
|
||||
" <td>1213</td>\n",
|
||||
" <td>Goodfellas (1990)</td>\n",
|
||||
" <td>Crime|Drama</td>\n",
|
||||
" <td>126.0</td>\n",
|
||||
" <td>4.250000</td>\n",
|
||||
" <td>4.194967</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>461</th>\n",
|
||||
" <td>527</td>\n",
|
||||
" <td>Schindler's List (1993)</td>\n",
|
||||
" <td>Drama|War</td>\n",
|
||||
" <td>220.0</td>\n",
|
||||
" <td>4.225000</td>\n",
|
||||
" <td>4.193546</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6710</th>\n",
|
||||
" <td>58559</td>\n",
|
||||
" <td>Dark Knight, The (2008)</td>\n",
|
||||
" <td>Action|Crime|Drama|IMAX</td>\n",
|
||||
" <td>149.0</td>\n",
|
||||
" <td>4.238255</td>\n",
|
||||
" <td>4.191922</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" movieId title \n",
|
||||
"277 318 Shawshank Redemption, The (1994) \\\n",
|
||||
"659 858 Godfather, The (1972) \n",
|
||||
"2226 2959 Fight Club (1999) \n",
|
||||
"922 1221 Godfather: Part II, The (1974) \n",
|
||||
"46 50 Usual Suspects, The (1995) \n",
|
||||
"224 260 Star Wars: Episode IV - A New Hope (1977) \n",
|
||||
"602 750 Dr. Strangelove or: How I Learned to Stop Worr... \n",
|
||||
"914 1213 Goodfellas (1990) \n",
|
||||
"461 527 Schindler's List (1993) \n",
|
||||
"6710 58559 Dark Knight, The (2008) \n",
|
||||
"\n",
|
||||
" genres num_ratings mean_rating \n",
|
||||
"277 Crime|Drama 317.0 4.429022 \\\n",
|
||||
"659 Crime|Drama 192.0 4.289062 \n",
|
||||
"2226 Action|Crime|Drama|Thriller 218.0 4.272936 \n",
|
||||
"922 Crime|Drama 129.0 4.259690 \n",
|
||||
"46 Crime|Mystery|Thriller 204.0 4.237745 \n",
|
||||
"224 Action|Adventure|Sci-Fi 251.0 4.231076 \n",
|
||||
"602 Comedy|War 97.0 4.268041 \n",
|
||||
"914 Crime|Drama 126.0 4.250000 \n",
|
||||
"461 Drama|War 220.0 4.225000 \n",
|
||||
"6710 Action|Crime|Drama|IMAX 149.0 4.238255 \n",
|
||||
"\n",
|
||||
" damped_mean_rating \n",
|
||||
"277 4.400659 \n",
|
||||
"659 4.250077 \n",
|
||||
"2226 4.239103 \n",
|
||||
"922 4.205148 \n",
|
||||
"46 4.203344 \n",
|
||||
"224 4.203125 \n",
|
||||
"602 4.196407 \n",
|
||||
"914 4.194967 \n",
|
||||
"461 4.193546 \n",
|
||||
"6710 4.191922 "
|
||||
]
|
||||
},
|
||||
"execution_count": 201,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"movies_df.sort_values(by=\"damped_mean_rating\", ascending=False).head(10)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,176 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "25aa1c78",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "107e909b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the transactions data\n",
|
||||
"transactions = pd.read_csv(\"grocery_dataset.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "289a9751",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"itemDescription\n",
|
||||
"whole milk 515.0\n",
|
||||
"other vegetables 361.0\n",
|
||||
"rolls/buns 344.0\n",
|
||||
"soda 271.0\n",
|
||||
"yogurt 242.0\n",
|
||||
"dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"member_purchases = transactions.groupby(['Member_number', 'itemDescription'])['itemDescription'].count().unstack().fillna(0)\n",
|
||||
"item_descriptions = member_purchases.columns\n",
|
||||
"\n",
|
||||
"def simple_association(item_name):\n",
|
||||
" item_basket = member_purchases[member_purchases[item_name] > 0]\n",
|
||||
" co_purchase_counts = item_basket.sum().sort_values(ascending=False).drop(item_name)\n",
|
||||
" return co_purchase_counts.head(5)\n",
|
||||
"\n",
|
||||
"ex_item = item_descriptions[20]\n",
|
||||
"simple_association(ex_item)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "190a1485",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Top 10 recommendations for soda:\n",
|
||||
"\n",
|
||||
"itemDescription\n",
|
||||
"oil 1.246844\n",
|
||||
"beverages 1.162678\n",
|
||||
"sausage 1.014975\n",
|
||||
"grapes 1.001195\n",
|
||||
"shopping bags 0.95459\n",
|
||||
"frozen meals 0.943642\n",
|
||||
"specialty bar 0.936182\n",
|
||||
"butter 0.918418\n",
|
||||
"candy 0.910056\n",
|
||||
"specialty chocolate 0.904846\n",
|
||||
"Name: soda, dtype: object \n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Function to create a transaction matrix\n",
|
||||
"def create_transaction_matrix(transactions):\n",
|
||||
" # Group the transactions by member number, date and item description\n",
|
||||
" # Count the number of each item bought by each member on each date\n",
|
||||
" # Unstack the item descriptions to create a matrix where rows are transactions and columns are items\n",
|
||||
" # Fill any missing values with 0\n",
|
||||
" # Set the index to be the member number and date\n",
|
||||
" basket = (transactions.groupby(['Member_number', 'Date', 'itemDescription'])['itemDescription']\n",
|
||||
" .count().unstack().reset_index().fillna(0)\n",
|
||||
" .set_index(['Member_number', 'Date']))\n",
|
||||
" \n",
|
||||
" # Convert the counts to True or False\n",
|
||||
" # True if the item was bought in the transaction, False otherwise\n",
|
||||
" return basket.applymap(lambda x: True if x > 0 else False)\n",
|
||||
"\n",
|
||||
"# Function to calculate a lift matrix\n",
|
||||
"def calculate_lift_matrix(basket_sets, min_joint_probability=0.001):\n",
|
||||
" # Calculate the joint probability of each pair of items\n",
|
||||
" probability_pair = pd.DataFrame(index=basket_sets.columns, columns=basket_sets.columns)\n",
|
||||
" for item1 in basket_sets.columns:\n",
|
||||
" for item2 in basket_sets.columns:\n",
|
||||
" joint_probability = (basket_sets[item1] & basket_sets[item2]).sum() / len(basket_sets)\n",
|
||||
" if joint_probability > min_joint_probability:\n",
|
||||
" probability_pair.loc[item1, item2] = joint_probability\n",
|
||||
" else:\n",
|
||||
" probability_pair.loc[item1, item2] = 0\n",
|
||||
"\n",
|
||||
" # Set the diagonal of the joint probability matrix to 0\n",
|
||||
" np.fill_diagonal(probability_pair.values, 0)\n",
|
||||
"\n",
|
||||
" # Calculate the individual probability of each item\n",
|
||||
" probability_item = basket_sets.mean()\n",
|
||||
"\n",
|
||||
" # Calculate the product of the individual probabilities\n",
|
||||
" probability_product = np.outer(probability_item, probability_item)\n",
|
||||
"\n",
|
||||
" # Calculate the lift of each pair of items\n",
|
||||
" lift_matrix = probability_pair.divide(probability_product, fill_value=0)\n",
|
||||
" \n",
|
||||
" return lift_matrix\n",
|
||||
"\n",
|
||||
"# Function to recommend items\n",
|
||||
"def recommend_items(lift_matrix, item, num_recommendations=10):\n",
|
||||
" # Sort the items in the lift matrix for the given item in descending order\n",
|
||||
" # Take the top num_recommendations items\n",
|
||||
" recommended_for_item = lift_matrix[item].sort_values(ascending=False).head(num_recommendations)\n",
|
||||
" \n",
|
||||
" # Print the recommended items\n",
|
||||
" print(f\"Top {num_recommendations} recommendations for {item}:\\n\")\n",
|
||||
" print(recommended_for_item, \"\\n\\n\")\n",
|
||||
"\n",
|
||||
"# Create transaction matrix\n",
|
||||
"basket_sets = create_transaction_matrix(transactions)\n",
|
||||
"\n",
|
||||
"# Calculate the lift matrix\n",
|
||||
"lift_matrix = calculate_lift_matrix(basket_sets)\n",
|
||||
"\n",
|
||||
"# Recommend items for 'soda'\n",
|
||||
"recommend_items(lift_matrix, 'soda')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b0c33033",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user