{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "electoral-scientist",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "surrounded-albert",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"train.csv\")\n",
    "test = pd.read_csv(\"test.csv\")\n",
    "test_ids = test[\"PassengerId\"]\n",
    "\n",
    "\n",
    "def clean(data):\n",
    "    \"\"\"Return a copy of ``data`` with unused columns dropped and gaps filled.\n",
    "\n",
    "    Drops the Ticket, PassengerId, Name and Cabin columns, fills missing\n",
    "    SibSp/Parch/Fare/Age values with the median of that column in the\n",
    "    frame passed in, and fills missing Embarked values with \"U\".\n",
    "    The frame passed in is not modified.\n",
    "    \"\"\"\n",
    "    data = data.drop(columns=[\"Ticket\", \"PassengerId\", \"Name\", \"Cabin\"])\n",
    "\n",
    "    cols = [\"SibSp\", \"Parch\", \"Fare\", \"Age\"]\n",
    "    for col in cols:\n",
    "        # Assign the result back instead of calling fillna(inplace=True) on\n",
    "        # a column selection: that is chained assignment, which recent pandas\n",
    "        # warns about and which leaves the frame unchanged under Copy-on-Write.\n",
    "        data[col] = data[col].fillna(data[col].median())\n",
    "\n",
    "    data[\"Embarked\"] = data[\"Embarked\"].fillna(\"U\")\n",
    "    return data\n",
    "\n",
    "\n",
    "data = clean(data)\n",
    "test = clean(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "electronic-wyoming",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpParchFareEmbarked
003male22.0107.2500S
111female38.01071.2833C
213female26.0007.9250S
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Parch Fare Embarked\n", "0 0 3 male 22.0 1 0 7.2500 S\n", "1 1 1 female 38.0 1 0 71.2833 C\n", "2 1 3 female 26.0 0 0 7.9250 S" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first three rows of the cleaned training frame.\n", "data.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "id": "legendary-conditions", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['female' 'male']\n", "['C' 'Q' 'S' 'U']\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpParchFareEmbarked
003122.0107.25002
111038.01071.28330
213026.0007.92502
311035.01053.10002
403135.0008.05002
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Parch Fare Embarked\n", "0 0 3 1 22.0 1 0 7.2500 2\n", "1 1 1 0 38.0 1 0 71.2833 0\n", "2 1 3 0 26.0 0 0 7.9250 2\n", "3 1 1 0 35.0 1 0 53.1000 2\n", "4 0 3 1 35.0 0 0 8.0500 2" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "from sklearn import preprocessing\n",
    "\n",
    "# Turn the string-valued columns into integer codes. The encoder is fitted on\n",
    "# the training column and then reused for the test column, so both frames\n",
    "# share the same label-to-code mapping.\n",
    "encoder = preprocessing.LabelEncoder()\n",
    "categorical_cols = [\"Sex\", \"Embarked\"]\n",
    "\n",
    "for name in categorical_cols:\n",
    "    data[name] = encoder.fit_transform(data[name])\n",
    "    test[name] = encoder.transform(test[name])\n",
    "    print(encoder.classes_)\n",
    "\n",
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "assumed-screening",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Separate the features from the target, then hold out 20% of the rows\n",
    "# for validation (fixed seed so the split is repeatable).\n",
    "X = data.drop(columns=\"Survived\")\n",
    "y = data[\"Survived\"]\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "industrial-internship",
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = LogisticRegression(max_iter=1000, random_state=0)\n",
    "# Trailing semicolon keeps the fitted estimator's repr out of the cell output.\n",
    "clf.fit(X_train, y_train);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fifteen-enemy",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8888888888888888"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# Accuracy on the held-out validation rows.\n",
    "val_predictions = clf.predict(X_val)\n",
    "accuracy_score(y_val, val_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "juvenile-anthropology",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict on the cleaned, encoded test frame.\n",
    "submission_preds = clf.predict(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "virgin-settlement",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair each prediction with the PassengerId saved before the column was dropped.\n",
    "df = pd.DataFrame(\n",
    "    {\n",
    "        \"PassengerId\": test_ids.values,\n",
    "        \"Survived\": submission_preds,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "tribal-bidding",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the submission table to disk without the DataFrame index column.\n",
    "df.to_csv(\"submission.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}