Files

1357 lines
46 KiB
Plaintext
Raw Permalink Normal View History

2021-01-30 21:49:15 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "joint-electric",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "quantitative-beverage",
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv(\"train.csv\")\n",
"test = pd.read_csv(\"test.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "twelve-insulin",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>target</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_2</th>\n",
" <th>var_3</th>\n",
" <th>var_4</th>\n",
" <th>var_5</th>\n",
" <th>var_6</th>\n",
" <th>var_7</th>\n",
" <th>...</th>\n",
" <th>var_190</th>\n",
" <th>var_191</th>\n",
" <th>var_192</th>\n",
" <th>var_193</th>\n",
" <th>var_194</th>\n",
" <th>var_195</th>\n",
" <th>var_196</th>\n",
" <th>var_197</th>\n",
" <th>var_198</th>\n",
" <th>var_199</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>train_0</td>\n",
" <td>0</td>\n",
" <td>8.9255</td>\n",
" <td>-6.7863</td>\n",
" <td>11.9081</td>\n",
" <td>5.0930</td>\n",
" <td>11.4607</td>\n",
" <td>-9.2834</td>\n",
" <td>5.1187</td>\n",
" <td>18.6266</td>\n",
" <td>...</td>\n",
" <td>4.4354</td>\n",
" <td>3.9642</td>\n",
" <td>3.1364</td>\n",
" <td>1.6910</td>\n",
" <td>18.5227</td>\n",
" <td>-2.3978</td>\n",
" <td>7.8784</td>\n",
" <td>8.5635</td>\n",
" <td>12.7803</td>\n",
" <td>-1.0914</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>train_1</td>\n",
" <td>0</td>\n",
" <td>11.5006</td>\n",
" <td>-4.1473</td>\n",
" <td>13.8588</td>\n",
" <td>5.3890</td>\n",
" <td>12.3622</td>\n",
" <td>7.0433</td>\n",
" <td>5.6208</td>\n",
" <td>16.5338</td>\n",
" <td>...</td>\n",
" <td>7.6421</td>\n",
" <td>7.7214</td>\n",
" <td>2.5837</td>\n",
" <td>10.9516</td>\n",
" <td>15.4305</td>\n",
" <td>2.0339</td>\n",
" <td>8.1267</td>\n",
" <td>8.7889</td>\n",
" <td>18.3560</td>\n",
" <td>1.9518</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>train_2</td>\n",
" <td>0</td>\n",
" <td>8.6093</td>\n",
" <td>-2.7457</td>\n",
" <td>12.0805</td>\n",
" <td>7.8928</td>\n",
" <td>10.5825</td>\n",
" <td>-9.0837</td>\n",
" <td>6.9427</td>\n",
" <td>14.6155</td>\n",
" <td>...</td>\n",
" <td>2.9057</td>\n",
" <td>9.7905</td>\n",
" <td>1.6704</td>\n",
" <td>1.6858</td>\n",
" <td>21.6042</td>\n",
" <td>3.1417</td>\n",
" <td>-6.5213</td>\n",
" <td>8.2675</td>\n",
" <td>14.7222</td>\n",
" <td>0.3965</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>train_3</td>\n",
" <td>0</td>\n",
" <td>11.0604</td>\n",
" <td>-2.1518</td>\n",
" <td>8.9522</td>\n",
" <td>7.1957</td>\n",
" <td>12.5846</td>\n",
" <td>-1.8361</td>\n",
" <td>5.8428</td>\n",
" <td>14.9250</td>\n",
" <td>...</td>\n",
" <td>4.4666</td>\n",
" <td>4.7433</td>\n",
" <td>0.7178</td>\n",
" <td>1.4214</td>\n",
" <td>23.0347</td>\n",
" <td>-1.2706</td>\n",
" <td>-2.9275</td>\n",
" <td>10.2922</td>\n",
" <td>17.9697</td>\n",
" <td>-8.9996</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>train_4</td>\n",
" <td>0</td>\n",
" <td>9.8369</td>\n",
" <td>-1.4834</td>\n",
" <td>12.8746</td>\n",
" <td>6.6375</td>\n",
" <td>12.2772</td>\n",
" <td>2.4486</td>\n",
" <td>5.9405</td>\n",
" <td>19.2514</td>\n",
" <td>...</td>\n",
" <td>-1.4905</td>\n",
" <td>9.5214</td>\n",
" <td>-0.1508</td>\n",
" <td>9.1942</td>\n",
" <td>13.2876</td>\n",
" <td>-1.5121</td>\n",
" <td>3.9267</td>\n",
" <td>9.5031</td>\n",
" <td>17.9974</td>\n",
" <td>-8.8104</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 202 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code target var_0 var_1 var_2 var_3 var_4 var_5 var_6 \\\n",
"0 train_0 0 8.9255 -6.7863 11.9081 5.0930 11.4607 -9.2834 5.1187 \n",
"1 train_1 0 11.5006 -4.1473 13.8588 5.3890 12.3622 7.0433 5.6208 \n",
"2 train_2 0 8.6093 -2.7457 12.0805 7.8928 10.5825 -9.0837 6.9427 \n",
"3 train_3 0 11.0604 -2.1518 8.9522 7.1957 12.5846 -1.8361 5.8428 \n",
"4 train_4 0 9.8369 -1.4834 12.8746 6.6375 12.2772 2.4486 5.9405 \n",
"\n",
" var_7 ... var_190 var_191 var_192 var_193 var_194 var_195 \\\n",
"0 18.6266 ... 4.4354 3.9642 3.1364 1.6910 18.5227 -2.3978 \n",
"1 16.5338 ... 7.6421 7.7214 2.5837 10.9516 15.4305 2.0339 \n",
"2 14.6155 ... 2.9057 9.7905 1.6704 1.6858 21.6042 3.1417 \n",
"3 14.9250 ... 4.4666 4.7433 0.7178 1.4214 23.0347 -1.2706 \n",
"4 19.2514 ... -1.4905 9.5214 -0.1508 9.1942 13.2876 -1.5121 \n",
"\n",
" var_196 var_197 var_198 var_199 \n",
"0 7.8784 8.5635 12.7803 -1.0914 \n",
"1 8.1267 8.7889 18.3560 1.9518 \n",
"2 -6.5213 8.2675 14.7222 0.3965 \n",
"3 -2.9275 10.2922 17.9697 -8.9996 \n",
"4 3.9267 9.5031 17.9974 -8.8104 \n",
"\n",
"[5 rows x 202 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "appreciated-affairs",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:03<00:00, 59.75it/s]\n"
]
}
],
"source": [
"col_names = [f\"var_{i}\" for i in range(200)]\n",
"for col in tqdm(col_names):\n",
" count = test[col].value_counts()\n",
" uniques = count.index[count == 1]\n",
" test[col + \"_u\"] = test[col].isin(uniques)\n",
"\n",
"test[\"has_unique\"] = test[[col + \"_u\" for col in col_names]].any(axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "mighty-basics",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"100000"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test[\"has_unique\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "sustainable-palestinian",
"metadata": {},
"outputs": [],
"source": [
"real_test = test.loc[test[\"has_unique\"], [\"ID_code\"] + col_names]\n",
"fake_test = test.loc[~test[\"has_unique\"], [\"ID_code\"] + col_names]\n",
"train_and_test = pd.concat([train, real_test], axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "military-tiger",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:43<00:00, 4.64it/s]\n"
]
}
],
"source": [
"for col in tqdm(col_names):\n",
" count = train_and_test[col].value_counts().to_dict()\n",
" train_and_test[col+\"_unique\"] = train_and_test[col].apply(\n",
" lambda x: 1 if count[x] == 1 else 0).values\n",
" fake_test[col+\"_unique\"] = 0 "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "extraordinary-phrase",
"metadata": {},
"outputs": [],
"source": [
"real_test = train_and_test[train_and_test[\"ID_code\"].str.contains(\"test\")].copy()\n",
"real_test.drop([\"target\"], axis=1, inplace=True)\n",
"train = train_and_test[train_and_test[\"ID_code\"].str.contains(\"train\")].copy()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "quantitative-iraqi",
"metadata": {},
"outputs": [],
"source": [
"test = pd.concat([real_test, fake_test], axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "instant-kitty",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>target</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_2</th>\n",
" <th>var_3</th>\n",
" <th>var_4</th>\n",
" <th>var_5</th>\n",
" <th>var_6</th>\n",
" <th>var_7</th>\n",
" <th>...</th>\n",
" <th>var_190_unique</th>\n",
" <th>var_191_unique</th>\n",
" <th>var_192_unique</th>\n",
" <th>var_193_unique</th>\n",
" <th>var_194_unique</th>\n",
" <th>var_195_unique</th>\n",
" <th>var_196_unique</th>\n",
" <th>var_197_unique</th>\n",
" <th>var_198_unique</th>\n",
" <th>var_199_unique</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>train_0</td>\n",
" <td>0.0</td>\n",
" <td>8.9255</td>\n",
" <td>-6.7863</td>\n",
" <td>11.9081</td>\n",
" <td>5.0930</td>\n",
" <td>11.4607</td>\n",
" <td>-9.2834</td>\n",
" <td>5.1187</td>\n",
" <td>18.6266</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>train_1</td>\n",
" <td>0.0</td>\n",
" <td>11.5006</td>\n",
" <td>-4.1473</td>\n",
" <td>13.8588</td>\n",
" <td>5.3890</td>\n",
" <td>12.3622</td>\n",
" <td>7.0433</td>\n",
" <td>5.6208</td>\n",
" <td>16.5338</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>train_2</td>\n",
" <td>0.0</td>\n",
" <td>8.6093</td>\n",
" <td>-2.7457</td>\n",
" <td>12.0805</td>\n",
" <td>7.8928</td>\n",
" <td>10.5825</td>\n",
" <td>-9.0837</td>\n",
" <td>6.9427</td>\n",
" <td>14.6155</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>train_3</td>\n",
" <td>0.0</td>\n",
" <td>11.0604</td>\n",
" <td>-2.1518</td>\n",
" <td>8.9522</td>\n",
" <td>7.1957</td>\n",
" <td>12.5846</td>\n",
" <td>-1.8361</td>\n",
" <td>5.8428</td>\n",
" <td>14.9250</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>train_4</td>\n",
" <td>0.0</td>\n",
" <td>9.8369</td>\n",
" <td>-1.4834</td>\n",
" <td>12.8746</td>\n",
" <td>6.6375</td>\n",
" <td>12.2772</td>\n",
" <td>2.4486</td>\n",
" <td>5.9405</td>\n",
" <td>19.2514</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199995</th>\n",
" <td>train_199995</td>\n",
" <td>0.0</td>\n",
" <td>11.4880</td>\n",
" <td>-0.4956</td>\n",
" <td>8.2622</td>\n",
" <td>3.5142</td>\n",
" <td>10.3404</td>\n",
" <td>11.6081</td>\n",
" <td>5.6709</td>\n",
" <td>15.1516</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199996</th>\n",
" <td>train_199996</td>\n",
" <td>0.0</td>\n",
" <td>4.9149</td>\n",
" <td>-2.4484</td>\n",
" <td>16.7052</td>\n",
" <td>6.6345</td>\n",
" <td>8.3096</td>\n",
" <td>-10.5628</td>\n",
" <td>5.8802</td>\n",
" <td>21.5940</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199997</th>\n",
" <td>train_199997</td>\n",
" <td>0.0</td>\n",
" <td>11.2232</td>\n",
" <td>-5.0518</td>\n",
" <td>10.5127</td>\n",
" <td>5.6456</td>\n",
" <td>9.3410</td>\n",
" <td>-5.4086</td>\n",
" <td>4.5555</td>\n",
" <td>21.5571</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199998</th>\n",
" <td>train_199998</td>\n",
" <td>0.0</td>\n",
" <td>9.7148</td>\n",
" <td>-8.6098</td>\n",
" <td>13.6104</td>\n",
" <td>5.7930</td>\n",
" <td>12.5173</td>\n",
" <td>0.5339</td>\n",
" <td>6.0479</td>\n",
" <td>17.0152</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199999</th>\n",
" <td>train_199999</td>\n",
" <td>0.0</td>\n",
" <td>10.8762</td>\n",
" <td>-5.7105</td>\n",
" <td>12.1183</td>\n",
" <td>8.0328</td>\n",
" <td>11.5577</td>\n",
" <td>0.3488</td>\n",
" <td>5.2839</td>\n",
" <td>15.2058</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200000 rows × 402 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code target var_0 var_1 var_2 var_3 var_4 \\\n",
"0 train_0 0.0 8.9255 -6.7863 11.9081 5.0930 11.4607 \n",
"1 train_1 0.0 11.5006 -4.1473 13.8588 5.3890 12.3622 \n",
"2 train_2 0.0 8.6093 -2.7457 12.0805 7.8928 10.5825 \n",
"3 train_3 0.0 11.0604 -2.1518 8.9522 7.1957 12.5846 \n",
"4 train_4 0.0 9.8369 -1.4834 12.8746 6.6375 12.2772 \n",
"... ... ... ... ... ... ... ... \n",
"199995 train_199995 0.0 11.4880 -0.4956 8.2622 3.5142 10.3404 \n",
"199996 train_199996 0.0 4.9149 -2.4484 16.7052 6.6345 8.3096 \n",
"199997 train_199997 0.0 11.2232 -5.0518 10.5127 5.6456 9.3410 \n",
"199998 train_199998 0.0 9.7148 -8.6098 13.6104 5.7930 12.5173 \n",
"199999 train_199999 0.0 10.8762 -5.7105 12.1183 8.0328 11.5577 \n",
"\n",
" var_5 var_6 var_7 ... var_190_unique var_191_unique \\\n",
"0 -9.2834 5.1187 18.6266 ... 0 0 \n",
"1 7.0433 5.6208 16.5338 ... 0 0 \n",
"2 -9.0837 6.9427 14.6155 ... 0 0 \n",
"3 -1.8361 5.8428 14.9250 ... 0 0 \n",
"4 2.4486 5.9405 19.2514 ... 0 0 \n",
"... ... ... ... ... ... ... \n",
"199995 11.6081 5.6709 15.1516 ... 0 1 \n",
"199996 -10.5628 5.8802 21.5940 ... 0 0 \n",
"199997 -5.4086 4.5555 21.5571 ... 0 0 \n",
"199998 0.5339 6.0479 17.0152 ... 0 0 \n",
"199999 0.3488 5.2839 15.2058 ... 0 0 \n",
"\n",
" var_192_unique var_193_unique var_194_unique var_195_unique \\\n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 1 1 1 0 \n",
"... ... ... ... ... \n",
"199995 0 0 0 0 \n",
"199996 0 1 0 0 \n",
"199997 0 0 0 0 \n",
"199998 0 0 0 0 \n",
"199999 0 0 0 0 \n",
"\n",
" var_196_unique var_197_unique var_198_unique var_199_unique \n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"... ... ... ... ... \n",
"199995 1 0 0 0 \n",
"199996 0 0 0 0 \n",
"199997 0 0 0 1 \n",
"199998 0 0 0 1 \n",
"199999 0 0 0 1 \n",
"\n",
"[200000 rows x 402 columns]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "human-japanese",
"metadata": {},
"outputs": [],
"source": [
"train.to_csv(\"new_shiny_train.csv\", index=False)\n",
"test.to_csv(\"new_shiny_test.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "outer-walter",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>target</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_2</th>\n",
" <th>var_3</th>\n",
" <th>var_4</th>\n",
" <th>var_5</th>\n",
" <th>var_6</th>\n",
" <th>var_7</th>\n",
" <th>...</th>\n",
" <th>var_190_unique</th>\n",
" <th>var_191_unique</th>\n",
" <th>var_192_unique</th>\n",
" <th>var_193_unique</th>\n",
" <th>var_194_unique</th>\n",
" <th>var_195_unique</th>\n",
" <th>var_196_unique</th>\n",
" <th>var_197_unique</th>\n",
" <th>var_198_unique</th>\n",
" <th>var_199_unique</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>train_0</td>\n",
" <td>0.0</td>\n",
" <td>8.9255</td>\n",
" <td>-6.7863</td>\n",
" <td>11.9081</td>\n",
" <td>5.0930</td>\n",
" <td>11.4607</td>\n",
" <td>-9.2834</td>\n",
" <td>5.1187</td>\n",
" <td>18.6266</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>train_1</td>\n",
" <td>0.0</td>\n",
" <td>11.5006</td>\n",
" <td>-4.1473</td>\n",
" <td>13.8588</td>\n",
" <td>5.3890</td>\n",
" <td>12.3622</td>\n",
" <td>7.0433</td>\n",
" <td>5.6208</td>\n",
" <td>16.5338</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>train_2</td>\n",
" <td>0.0</td>\n",
" <td>8.6093</td>\n",
" <td>-2.7457</td>\n",
" <td>12.0805</td>\n",
" <td>7.8928</td>\n",
" <td>10.5825</td>\n",
" <td>-9.0837</td>\n",
" <td>6.9427</td>\n",
" <td>14.6155</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>train_3</td>\n",
" <td>0.0</td>\n",
" <td>11.0604</td>\n",
" <td>-2.1518</td>\n",
" <td>8.9522</td>\n",
" <td>7.1957</td>\n",
" <td>12.5846</td>\n",
" <td>-1.8361</td>\n",
" <td>5.8428</td>\n",
" <td>14.9250</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>train_4</td>\n",
" <td>0.0</td>\n",
" <td>9.8369</td>\n",
" <td>-1.4834</td>\n",
" <td>12.8746</td>\n",
" <td>6.6375</td>\n",
" <td>12.2772</td>\n",
" <td>2.4486</td>\n",
" <td>5.9405</td>\n",
" <td>19.2514</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199995</th>\n",
" <td>train_199995</td>\n",
" <td>0.0</td>\n",
" <td>11.4880</td>\n",
" <td>-0.4956</td>\n",
" <td>8.2622</td>\n",
" <td>3.5142</td>\n",
" <td>10.3404</td>\n",
" <td>11.6081</td>\n",
" <td>5.6709</td>\n",
" <td>15.1516</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199996</th>\n",
" <td>train_199996</td>\n",
" <td>0.0</td>\n",
" <td>4.9149</td>\n",
" <td>-2.4484</td>\n",
" <td>16.7052</td>\n",
" <td>6.6345</td>\n",
" <td>8.3096</td>\n",
" <td>-10.5628</td>\n",
" <td>5.8802</td>\n",
" <td>21.5940</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199997</th>\n",
" <td>train_199997</td>\n",
" <td>0.0</td>\n",
" <td>11.2232</td>\n",
" <td>-5.0518</td>\n",
" <td>10.5127</td>\n",
" <td>5.6456</td>\n",
" <td>9.3410</td>\n",
" <td>-5.4086</td>\n",
" <td>4.5555</td>\n",
" <td>21.5571</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199998</th>\n",
" <td>train_199998</td>\n",
" <td>0.0</td>\n",
" <td>9.7148</td>\n",
" <td>-8.6098</td>\n",
" <td>13.6104</td>\n",
" <td>5.7930</td>\n",
" <td>12.5173</td>\n",
" <td>0.5339</td>\n",
" <td>6.0479</td>\n",
" <td>17.0152</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199999</th>\n",
" <td>train_199999</td>\n",
" <td>0.0</td>\n",
" <td>10.8762</td>\n",
" <td>-5.7105</td>\n",
" <td>12.1183</td>\n",
" <td>8.0328</td>\n",
" <td>11.5577</td>\n",
" <td>0.3488</td>\n",
" <td>5.2839</td>\n",
" <td>15.2058</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200000 rows × 402 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code target var_0 var_1 var_2 var_3 var_4 \\\n",
"0 train_0 0.0 8.9255 -6.7863 11.9081 5.0930 11.4607 \n",
"1 train_1 0.0 11.5006 -4.1473 13.8588 5.3890 12.3622 \n",
"2 train_2 0.0 8.6093 -2.7457 12.0805 7.8928 10.5825 \n",
"3 train_3 0.0 11.0604 -2.1518 8.9522 7.1957 12.5846 \n",
"4 train_4 0.0 9.8369 -1.4834 12.8746 6.6375 12.2772 \n",
"... ... ... ... ... ... ... ... \n",
"199995 train_199995 0.0 11.4880 -0.4956 8.2622 3.5142 10.3404 \n",
"199996 train_199996 0.0 4.9149 -2.4484 16.7052 6.6345 8.3096 \n",
"199997 train_199997 0.0 11.2232 -5.0518 10.5127 5.6456 9.3410 \n",
"199998 train_199998 0.0 9.7148 -8.6098 13.6104 5.7930 12.5173 \n",
"199999 train_199999 0.0 10.8762 -5.7105 12.1183 8.0328 11.5577 \n",
"\n",
" var_5 var_6 var_7 ... var_190_unique var_191_unique \\\n",
"0 -9.2834 5.1187 18.6266 ... 0 0 \n",
"1 7.0433 5.6208 16.5338 ... 0 0 \n",
"2 -9.0837 6.9427 14.6155 ... 0 0 \n",
"3 -1.8361 5.8428 14.9250 ... 0 0 \n",
"4 2.4486 5.9405 19.2514 ... 0 0 \n",
"... ... ... ... ... ... ... \n",
"199995 11.6081 5.6709 15.1516 ... 0 1 \n",
"199996 -10.5628 5.8802 21.5940 ... 0 0 \n",
"199997 -5.4086 4.5555 21.5571 ... 0 0 \n",
"199998 0.5339 6.0479 17.0152 ... 0 0 \n",
"199999 0.3488 5.2839 15.2058 ... 0 0 \n",
"\n",
" var_192_unique var_193_unique var_194_unique var_195_unique \\\n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 1 1 1 0 \n",
"... ... ... ... ... \n",
"199995 0 0 0 0 \n",
"199996 0 1 0 0 \n",
"199997 0 0 0 0 \n",
"199998 0 0 0 0 \n",
"199999 0 0 0 0 \n",
"\n",
" var_196_unique var_197_unique var_198_unique var_199_unique \n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"... ... ... ... ... \n",
"199995 1 0 0 0 \n",
"199996 0 0 0 0 \n",
"199997 0 0 0 1 \n",
"199998 0 0 0 1 \n",
"199999 0 0 0 1 \n",
"\n",
"[200000 rows x 402 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "therapeutic-scratch",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID_code</th>\n",
" <th>var_0</th>\n",
" <th>var_1</th>\n",
" <th>var_2</th>\n",
" <th>var_3</th>\n",
" <th>var_4</th>\n",
" <th>var_5</th>\n",
" <th>var_6</th>\n",
" <th>var_7</th>\n",
" <th>var_8</th>\n",
" <th>...</th>\n",
" <th>var_190_unique</th>\n",
" <th>var_191_unique</th>\n",
" <th>var_192_unique</th>\n",
" <th>var_193_unique</th>\n",
" <th>var_194_unique</th>\n",
" <th>var_195_unique</th>\n",
" <th>var_196_unique</th>\n",
" <th>var_197_unique</th>\n",
" <th>var_198_unique</th>\n",
" <th>var_199_unique</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>test_3</td>\n",
" <td>8.5374</td>\n",
" <td>-1.3222</td>\n",
" <td>12.0220</td>\n",
" <td>6.5749</td>\n",
" <td>8.8458</td>\n",
" <td>3.1744</td>\n",
" <td>4.9397</td>\n",
" <td>20.5660</td>\n",
" <td>3.3755</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>test_7</td>\n",
" <td>17.3035</td>\n",
" <td>-2.4212</td>\n",
" <td>13.3989</td>\n",
" <td>8.3998</td>\n",
" <td>11.0777</td>\n",
" <td>9.6449</td>\n",
" <td>5.9596</td>\n",
" <td>17.8477</td>\n",
" <td>-4.8068</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>test_11</td>\n",
" <td>10.6137</td>\n",
" <td>-2.1898</td>\n",
" <td>8.9090</td>\n",
" <td>3.8014</td>\n",
" <td>13.8602</td>\n",
" <td>-5.9802</td>\n",
" <td>5.5515</td>\n",
" <td>15.4716</td>\n",
" <td>-0.1714</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>test_15</td>\n",
" <td>14.8595</td>\n",
" <td>-4.5378</td>\n",
" <td>13.6483</td>\n",
" <td>5.6480</td>\n",
" <td>9.9144</td>\n",
" <td>1.5190</td>\n",
" <td>5.0358</td>\n",
" <td>13.4524</td>\n",
" <td>-2.5419</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>test_16</td>\n",
" <td>14.1732</td>\n",
" <td>-5.1490</td>\n",
" <td>9.7591</td>\n",
" <td>3.7316</td>\n",
" <td>10.3700</td>\n",
" <td>-21.9202</td>\n",
" <td>7.7130</td>\n",
" <td>18.8749</td>\n",
" <td>0.4680</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 401 columns</p>\n",
"</div>"
],
"text/plain": [
" ID_code var_0 var_1 var_2 var_3 var_4 var_5 var_6 \\\n",
"3 test_3 8.5374 -1.3222 12.0220 6.5749 8.8458 3.1744 4.9397 \n",
"7 test_7 17.3035 -2.4212 13.3989 8.3998 11.0777 9.6449 5.9596 \n",
"11 test_11 10.6137 -2.1898 8.9090 3.8014 13.8602 -5.9802 5.5515 \n",
"15 test_15 14.8595 -4.5378 13.6483 5.6480 9.9144 1.5190 5.0358 \n",
"16 test_16 14.1732 -5.1490 9.7591 3.7316 10.3700 -21.9202 7.7130 \n",
"\n",
" var_7 var_8 ... var_190_unique var_191_unique var_192_unique \\\n",
"3 20.5660 3.3755 ... 0 0 0 \n",
"7 17.8477 -4.8068 ... 0 0 0 \n",
"11 15.4716 -0.1714 ... 0 0 0 \n",
"15 13.4524 -2.5419 ... 0 0 1 \n",
"16 18.8749 0.4680 ... 0 0 0 \n",
"\n",
" var_193_unique var_194_unique var_195_unique var_196_unique \\\n",
"3 0 0 0 0 \n",
"7 0 0 0 0 \n",
"11 0 0 0 1 \n",
"15 0 0 0 0 \n",
"16 0 0 0 0 \n",
"\n",
" var_197_unique var_198_unique var_199_unique \n",
"3 0 0 1 \n",
"7 0 1 0 \n",
"11 0 0 1 \n",
"15 0 0 1 \n",
"16 0 0 0 \n",
"\n",
"[5 rows x 401 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head(5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}