added trending, some comments in popularity

This commit is contained in:
Aladdin Persson
2023-11-02 18:48:58 -07:00
parent 0e22471b42
commit 5280b33a5c
5 changed files with 164 additions and 671 deletions

View File

@@ -1,458 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 207,
"id": "937dd4ed",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def load_data():\n",
" # Load the MovieLens data\n",
" movies_df = pd.read_csv(\"movielens_small/movies.csv\")\n",
" ratings_df = pd.read_csv(\"movielens_small/ratings.csv\")\n",
" return movies_df, ratings_df\n",
"\n",
"def calculate_popularity(movies_df, ratings_df, damping_factor=5):\n",
" # Calculate the number of ratings, mean rating, and sum of ratings for each movie\n",
" num_ratings = ratings_df.groupby(\"movieId\")[\"rating\"].count()\n",
" mean_rating = ratings_df.groupby(\"movieId\")[\"rating\"].mean()\n",
" global_mean = ratings_df[\"rating\"].mean()\n",
" \n",
" # Calculate the damped mean rating for each movie\n",
" damped_numerator = num_ratings * mean_rating + damping_factor * global_mean\n",
" damped_denominator = num_ratings + damping_factor\n",
" damped_mean_rating = damped_numerator / damped_denominator\n",
" \n",
" # Add the popularity data to the movie data\n",
" movies_df['num_ratings'] = movies_df['movieId'].map(num_ratings)\n",
" movies_df['mean_rating'] = movies_df['movieId'].map(mean_rating)\n",
" movies_df['damped_mean_rating'] = movies_df['movieId'].map(damped_mean_rating)\n",
" return movies_df\n",
"\n",
"movies_df, ratings_df = load_data()\n",
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
]
},
{
"cell_type": "code",
"execution_count": 198,
"id": "7e649c6f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>num_ratings</th>\n",
" <th>mean_rating</th>\n",
" <th>damped_mean_rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>314</th>\n",
" <td>356</td>\n",
" <td>Forrest Gump (1994)</td>\n",
" <td>Comedy|Drama|Romance|War</td>\n",
" <td>329.0</td>\n",
" <td>4.164134</td>\n",
" <td>4.144589</td>\n",
" </tr>\n",
" <tr>\n",
" <th>277</th>\n",
" <td>318</td>\n",
" <td>Shawshank Redemption, The (1994)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>317.0</td>\n",
" <td>4.429022</td>\n",
" <td>4.400659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>296</td>\n",
" <td>Pulp Fiction (1994)</td>\n",
" <td>Comedy|Crime|Drama|Thriller</td>\n",
" <td>307.0</td>\n",
" <td>4.197068</td>\n",
" <td>4.175128</td>\n",
" </tr>\n",
" <tr>\n",
" <th>510</th>\n",
" <td>593</td>\n",
" <td>Silence of the Lambs, The (1991)</td>\n",
" <td>Crime|Horror|Thriller</td>\n",
" <td>279.0</td>\n",
" <td>4.161290</td>\n",
" <td>4.138462</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1939</th>\n",
" <td>2571</td>\n",
" <td>Matrix, The (1999)</td>\n",
" <td>Action|Sci-Fi|Thriller</td>\n",
" <td>278.0</td>\n",
" <td>4.192446</td>\n",
" <td>4.168457</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title genres \n",
"314 356 Forrest Gump (1994) Comedy|Drama|Romance|War \\\n",
"277 318 Shawshank Redemption, The (1994) Crime|Drama \n",
"257 296 Pulp Fiction (1994) Comedy|Crime|Drama|Thriller \n",
"510 593 Silence of the Lambs, The (1991) Crime|Horror|Thriller \n",
"1939 2571 Matrix, The (1999) Action|Sci-Fi|Thriller \n",
"\n",
" num_ratings mean_rating damped_mean_rating \n",
"314 329.0 4.164134 4.144589 \n",
"277 317.0 4.429022 4.400659 \n",
"257 307.0 4.197068 4.175128 \n",
"510 279.0 4.161290 4.138462 \n",
"1939 278.0 4.192446 4.168457 "
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies_df.sort_values(by=\"num_ratings\", ascending=False).head()"
]
},
{
"cell_type": "code",
"execution_count": 204,
"id": "c6ef332e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>num_ratings</th>\n",
" <th>mean_rating</th>\n",
" <th>damped_mean_rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7656</th>\n",
" <td>88448</td>\n",
" <td>Paper Birds (Pájaros de papel) (2010)</td>\n",
" <td>Comedy|Drama</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8107</th>\n",
" <td>100556</td>\n",
" <td>Act of Killing, The (2012)</td>\n",
" <td>Documentary</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9083</th>\n",
" <td>143031</td>\n",
" <td>Jump In! (2007)</td>\n",
" <td>Comedy|Drama|Romance</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9094</th>\n",
" <td>143511</td>\n",
" <td>Human (2015)</td>\n",
" <td>Documentary</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9096</th>\n",
" <td>143559</td>\n",
" <td>L.A. Slasher (2015)</td>\n",
" <td>Comedy|Crime|Fantasy</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>3.637779</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title genres \n",
"7656 88448 Paper Birds (Pájaros de papel) (2010) Comedy|Drama \\\n",
"8107 100556 Act of Killing, The (2012) Documentary \n",
"9083 143031 Jump In! (2007) Comedy|Drama|Romance \n",
"9094 143511 Human (2015) Documentary \n",
"9096 143559 L.A. Slasher (2015) Comedy|Crime|Fantasy \n",
"\n",
" num_ratings mean_rating damped_mean_rating \n",
"7656 1.0 5.0 3.637779 \n",
"8107 1.0 5.0 3.637779 \n",
"9083 1.0 5.0 3.637779 \n",
"9094 1.0 5.0 3.637779 \n",
"9096 1.0 5.0 3.637779 "
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies_df.sort_values(by=\"mean_rating\", ascending=False).head(5)"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "f669fb09",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>movieId</th>\n",
" <th>title</th>\n",
" <th>genres</th>\n",
" <th>num_ratings</th>\n",
" <th>mean_rating</th>\n",
" <th>damped_mean_rating</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>277</th>\n",
" <td>318</td>\n",
" <td>Shawshank Redemption, The (1994)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>317.0</td>\n",
" <td>4.429022</td>\n",
" <td>4.400659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>659</th>\n",
" <td>858</td>\n",
" <td>Godfather, The (1972)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>192.0</td>\n",
" <td>4.289062</td>\n",
" <td>4.250077</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2226</th>\n",
" <td>2959</td>\n",
" <td>Fight Club (1999)</td>\n",
" <td>Action|Crime|Drama|Thriller</td>\n",
" <td>218.0</td>\n",
" <td>4.272936</td>\n",
" <td>4.239103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>922</th>\n",
" <td>1221</td>\n",
" <td>Godfather: Part II, The (1974)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>129.0</td>\n",
" <td>4.259690</td>\n",
" <td>4.205148</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>50</td>\n",
" <td>Usual Suspects, The (1995)</td>\n",
" <td>Crime|Mystery|Thriller</td>\n",
" <td>204.0</td>\n",
" <td>4.237745</td>\n",
" <td>4.203344</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224</th>\n",
" <td>260</td>\n",
" <td>Star Wars: Episode IV - A New Hope (1977)</td>\n",
" <td>Action|Adventure|Sci-Fi</td>\n",
" <td>251.0</td>\n",
" <td>4.231076</td>\n",
" <td>4.203125</td>\n",
" </tr>\n",
" <tr>\n",
" <th>602</th>\n",
" <td>750</td>\n",
" <td>Dr. Strangelove or: How I Learned to Stop Worr...</td>\n",
" <td>Comedy|War</td>\n",
" <td>97.0</td>\n",
" <td>4.268041</td>\n",
" <td>4.196407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>914</th>\n",
" <td>1213</td>\n",
" <td>Goodfellas (1990)</td>\n",
" <td>Crime|Drama</td>\n",
" <td>126.0</td>\n",
" <td>4.250000</td>\n",
" <td>4.194967</td>\n",
" </tr>\n",
" <tr>\n",
" <th>461</th>\n",
" <td>527</td>\n",
" <td>Schindler's List (1993)</td>\n",
" <td>Drama|War</td>\n",
" <td>220.0</td>\n",
" <td>4.225000</td>\n",
" <td>4.193546</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6710</th>\n",
" <td>58559</td>\n",
" <td>Dark Knight, The (2008)</td>\n",
" <td>Action|Crime|Drama|IMAX</td>\n",
" <td>149.0</td>\n",
" <td>4.238255</td>\n",
" <td>4.191922</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" movieId title \n",
"277 318 Shawshank Redemption, The (1994) \\\n",
"659 858 Godfather, The (1972) \n",
"2226 2959 Fight Club (1999) \n",
"922 1221 Godfather: Part II, The (1974) \n",
"46 50 Usual Suspects, The (1995) \n",
"224 260 Star Wars: Episode IV - A New Hope (1977) \n",
"602 750 Dr. Strangelove or: How I Learned to Stop Worr... \n",
"914 1213 Goodfellas (1990) \n",
"461 527 Schindler's List (1993) \n",
"6710 58559 Dark Knight, The (2008) \n",
"\n",
" genres num_ratings mean_rating \n",
"277 Crime|Drama 317.0 4.429022 \\\n",
"659 Crime|Drama 192.0 4.289062 \n",
"2226 Action|Crime|Drama|Thriller 218.0 4.272936 \n",
"922 Crime|Drama 129.0 4.259690 \n",
"46 Crime|Mystery|Thriller 204.0 4.237745 \n",
"224 Action|Adventure|Sci-Fi 251.0 4.231076 \n",
"602 Comedy|War 97.0 4.268041 \n",
"914 Crime|Drama 126.0 4.250000 \n",
"461 Drama|War 220.0 4.225000 \n",
"6710 Action|Crime|Drama|IMAX 149.0 4.238255 \n",
"\n",
" damped_mean_rating \n",
"277 4.400659 \n",
"659 4.250077 \n",
"2226 4.239103 \n",
"922 4.205148 \n",
"46 4.203344 \n",
"224 4.203125 \n",
"602 4.196407 \n",
"914 4.194967 \n",
"461 4.193546 \n",
"6710 4.191922 "
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies_df.sort_values(by=\"damped_mean_rating\", ascending=False).head(10)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,176 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "25aa1c78",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "107e909b",
"metadata": {},
"outputs": [],
"source": [
"# Load the transactions data\n",
"transactions = pd.read_csv(\"grocery_dataset.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "289a9751",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"itemDescription\n",
"whole milk 515.0\n",
"other vegetables 361.0\n",
"rolls/buns 344.0\n",
"soda 271.0\n",
"yogurt 242.0\n",
"dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"member_purchases = transactions.groupby(['Member_number', 'itemDescription'])['itemDescription'].count().unstack().fillna(0)\n",
"item_descriptions = member_purchases.columns\n",
"\n",
"def simple_association(item_name):\n",
" item_basket = member_purchases[member_purchases[item_name] > 0]\n",
" co_purchase_counts = item_basket.sum().sort_values(ascending=False).drop(item_name)\n",
" return co_purchase_counts.head(5)\n",
"\n",
"ex_item = item_descriptions[20]\n",
"simple_association(ex_item)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "190a1485",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Top 10 recommendations for soda:\n",
"\n",
"itemDescription\n",
"oil 1.246844\n",
"beverages 1.162678\n",
"sausage 1.014975\n",
"grapes 1.001195\n",
"shopping bags 0.95459\n",
"frozen meals 0.943642\n",
"specialty bar 0.936182\n",
"butter 0.918418\n",
"candy 0.910056\n",
"specialty chocolate 0.904846\n",
"Name: soda, dtype: object \n",
"\n",
"\n"
]
}
],
"source": [
"# Function to create a transaction matrix\n",
"def create_transaction_matrix(transactions):\n",
" # Group the transactions by member number, date and item description\n",
" # Count the number of each item bought by each member on each date\n",
" # Unstack the item descriptions to create a matrix where rows are transactions and columns are items\n",
" # Fill any missing values with 0\n",
" # Set the index to be the member number and date\n",
" basket = (transactions.groupby(['Member_number', 'Date', 'itemDescription'])['itemDescription']\n",
" .count().unstack().reset_index().fillna(0)\n",
" .set_index(['Member_number', 'Date']))\n",
" \n",
" # Convert the counts to True or False\n",
" # True if the item was bought in the transaction, False otherwise\n",
" return basket.applymap(lambda x: True if x > 0 else False)\n",
"\n",
"# Function to calculate a lift matrix\n",
"def calculate_lift_matrix(basket_sets, min_joint_probability=0.001):\n",
" # Calculate the joint probability of each pair of items\n",
" probability_pair = pd.DataFrame(index=basket_sets.columns, columns=basket_sets.columns)\n",
" for item1 in basket_sets.columns:\n",
" for item2 in basket_sets.columns:\n",
" joint_probability = (basket_sets[item1] & basket_sets[item2]).sum() / len(basket_sets)\n",
" if joint_probability > min_joint_probability:\n",
" probability_pair.loc[item1, item2] = joint_probability\n",
" else:\n",
" probability_pair.loc[item1, item2] = 0\n",
"\n",
" # Set the diagonal of the joint probability matrix to 0\n",
" np.fill_diagonal(probability_pair.values, 0)\n",
"\n",
" # Calculate the individual probability of each item\n",
" probability_item = basket_sets.mean()\n",
"\n",
" # Calculate the product of the individual probabilities\n",
" probability_product = np.outer(probability_item, probability_item)\n",
"\n",
" # Calculate the lift of each pair of items\n",
" lift_matrix = probability_pair.divide(probability_product, fill_value=0)\n",
" \n",
" return lift_matrix\n",
"\n",
"# Function to recommend items\n",
"def recommend_items(lift_matrix, item, num_recommendations=10):\n",
" # Sort the items in the lift matrix for the given item in descending order\n",
" # Take the top num_recommendations items\n",
" recommended_for_item = lift_matrix[item].sort_values(ascending=False).head(num_recommendations)\n",
" \n",
" # Print the recommended items\n",
" print(f\"Top {num_recommendations} recommendations for {item}:\\n\")\n",
" print(recommended_for_item, \"\\n\\n\")\n",
"\n",
"# Create transaction matrix\n",
"basket_sets = create_transaction_matrix(transactions)\n",
"\n",
"# Calculate the lift matrix\n",
"lift_matrix = calculate_lift_matrix(basket_sets)\n",
"\n",
"# Recommend items for 'soda'\n",
"recommend_items(lift_matrix, 'soda')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,33 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "b0c33033",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,8 +1,46 @@
{ {
"cells": [ "cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Non-Personalized Recommender Systems: Popularity Based"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 207, "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os \n",
"\n",
"if os.path.exists('movielens_small.zip'):\n",
" !wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip \n",
" !unzip ml-latest-small.zip\n",
" !rm ml-latest-small.zip\n",
" !mv ml-latest-small movielens_small"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Damped Mean\n",
"\n",
"$$ s(i) = \\frac{\\sum_{u \\in U_i} r_{u,i} + a \\times \\mu}{|U_i| + a} $$\n",
"\n",
"Where:\n",
"- $ s(i) $: The damped mean rating for item $ i $.\n",
"- $ \\sum_{u \\in U_i} r_{u,i} $: Sum of the ratings given by users $ u \\in U_i $ to item $ i $.\n",
"- $ a $: Damping factor, a value that determines the extent of smoothing.\n",
"- $ \\mu $: Global mean rating across all items.\n",
"- $ |U_i| $: Total number of ratings for item $ i $.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "937dd4ed", "id": "937dd4ed",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@@ -36,6 +74,13 @@
"movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)" "movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see how ranking movies by num_ratings compares to ranking by mean_rating and by damped_mean_rating."
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 198, "execution_count": 198,
@@ -450,7 +495,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.16" "version": "3.11.3"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -2,9 +2,124 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"id": "b0c33033", "id": "b0c33033",
"metadata": {}, "metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Posts sorted by Reddit's 'Hot' score:\n",
" post_id post_title upvotes \n",
"9 10 Difference between CNN and RNN? 350 \\\n",
"8 9 The future of quantum computing in AI 600 \n",
"7 8 Experience with multi-modal learning? 450 \n",
"6 7 Looking for resources on probabilistic program... 700 \n",
"4 5 Tips for handling imbalanced datasets? 1100 \n",
"2 3 Has anyone tried the new reinforcement learnin... 900 \n",
"3 4 Discussion: Evolutionary algorithms vs gradien... 800 \n",
"5 6 Which GPU is best for neural network training? 300 \n",
"0 1 How do I start with machine learning? 600 \n",
"1 2 Best practices for deep learning optimization? 400 \n",
"\n",
" downvotes age_in_seconds age_in_hours reddit_hot hacker_news \n",
"9 50 256000 71.111111 8.166010 0.042205 \n",
"8 50 128000 35.555556 5.584807 0.227638 \n",
"7 50 64000 17.777778 4.024282 0.559318 \n",
"6 50 32000 8.888889 3.524024 2.416714 \n",
"4 100 8000 2.222222 3.177778 18.779258 \n",
"2 100 2000 0.555556 2.947534 38.776074 \n",
"3 100 4000 1.111111 2.933987 24.453093 \n",
"5 50 16000 4.444444 2.753496 2.886859 \n",
"0 100 500 0.138889 2.710081 36.655710 \n",
"1 50 1000 0.277778 2.566290 24.588946 \n",
"\n",
"Posts sorted by Hacker News score:\n",
" post_id post_title upvotes \n",
"2 3 Has anyone tried the new reinforcement learnin... 900 \\\n",
"0 1 How do I start with machine learning? 600 \n",
"1 2 Best practices for deep learning optimization? 400 \n",
"3 4 Discussion: Evolutionary algorithms vs gradien... 800 \n",
"4 5 Tips for handling imbalanced datasets? 1100 \n",
"5 6 Which GPU is best for neural network training? 300 \n",
"6 7 Looking for resources on probabilistic program... 700 \n",
"7 8 Experience with multi-modal learning? 450 \n",
"8 9 The future of quantum computing in AI 600 \n",
"9 10 Difference between CNN and RNN? 350 \n",
"\n",
" downvotes age_in_seconds age_in_hours reddit_hot hacker_news \n",
"2 100 2000 0.555556 2.947534 38.776074 \n",
"0 100 500 0.138889 2.710081 36.655710 \n",
"1 50 1000 0.277778 2.566290 24.588946 \n",
"3 100 4000 1.111111 2.933987 24.453093 \n",
"4 100 8000 2.222222 3.177778 18.779258 \n",
"5 50 16000 4.444444 2.753496 2.886859 \n",
"6 50 32000 8.888889 3.524024 2.416714 \n",
"7 50 64000 17.777778 4.024282 0.559318 \n",
"8 50 128000 35.555556 5.584807 0.227638 \n",
"9 50 256000 71.111111 8.166010 0.042205 \n"
]
}
],
"source": [
"import math\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = {\n",
" 'post_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
" 'post_title': [\n",
" \"How do I start with machine learning?\",\n",
" \"Best practices for deep learning optimization?\",\n",
" \"Has anyone tried the new reinforcement learning library?\",\n",
" \"Discussion: Evolutionary algorithms vs gradient descent\",\n",
" \"Tips for handling imbalanced datasets?\",\n",
" \"Which GPU is best for neural network training?\",\n",
" \"Looking for resources on probabilistic programming\",\n",
" \"Experience with multi-modal learning?\",\n",
" \"The future of quantum computing in AI\",\n",
" \"Difference between CNN and RNN?\"\n",
" ],\n",
" 'upvotes': [600, 400, 900, 800, 1100, 300, 700, 450, 600, 350],\n",
" 'downvotes': [100, 50, 100, 100, 100, 50, 50, 50, 50, 50],\n",
" 'age_in_seconds': [500, 1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000]\n",
"}\n",
"\n",
"\n",
"# Convert to DataFrame\n",
"reddit_df = pd.DataFrame(data)\n",
"\n",
"# Calculate age in hours from age in seconds\n",
"reddit_df['age_in_hours'] = reddit_df['age_in_seconds'] / 3600\n",
"\n",
"# Reddit's \"Hot\" formula\n",
"def reddit_hot(U, D, t):\n",
" return math.log10(max(abs(U-D), 1)) + np.sign(U-D) * t / 45000\n",
"\n",
"# Modified Hacker News formula\n",
"def hacker_news(U, D, T, P=1, alpha=0.8, gamma=1.8):\n",
" return P * pow((U - D - 1), alpha) / pow((T + 2), gamma)\n",
"\n",
"# Apply the formulas\n",
"reddit_df['reddit_hot'] = reddit_df.apply(lambda row: reddit_hot(row['upvotes'], row['downvotes'], row['age_in_seconds']), axis=1)\n",
"reddit_df['hacker_news'] = reddit_df.apply(lambda row: hacker_news(row['upvotes'], row['downvotes'], row['age_in_hours']), axis=1)\n",
"\n",
"# Sort by Reddit's \"Hot\" score and print\n",
"reddit_df_sorted = reddit_df.sort_values(by='reddit_hot', ascending=False)\n",
"print(\"Posts sorted by Reddit's 'Hot' score:\")\n",
"print(reddit_df_sorted)\n",
"\n",
"# Sort by Hacker News score and print\n",
"hacker_news_df_sorted = reddit_df.sort_values(by='hacker_news', ascending=False)\n",
"print(\"\\nPosts sorted by Hacker News score:\")\n",
"print(hacker_news_df_sorted)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [], "outputs": [],
"source": [] "source": []
} }
@@ -25,7 +140,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.16" "version": "3.11.3"
} }
}, },
"nbformat": 4, "nbformat": 4,