Kaggle: Titanic

The Titanic

Kaggle: Titanic

1
2
3
4
A simple implementation with basic data cleaning, one-hot encoding, and a LightGBM classifier.

Score: 0.76794
Rank: 11846/17593

Import Packages / Load Dataset

1
# Switch the notebook's working directory to the project folder
# (a Google Drive path, presumably mounted in Colab -- confirm).
# The CSV paths in later cells are built from os.getcwd(), so they resolve against it.
%cd /content/drive/My Drive/Kaggle/titanic
/content/drive/My Drive/Kaggle/titanic
1
2
3
4
5
6
7
8
9
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re
import lightgbm
import xgboost
import os
1
# Load the training data from the current working directory.
train_csv_path = os.getcwd() + '/train.csv'
df_train = pd.read_csv(train_csv_path)

Data Exploration

1
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_train.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
1
# Peek at the first five rows to see the raw columns and value formats.
df_train.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
1
2
# Count the missing values in each column.
df_train.isna().sum()
# Age (177) and Cabin (687) have many missing values; Embarked is missing only 2.
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
1
# Fraction of missing values per column (the mean of the boolean NaN mask).
df_train.isna().mean()
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64
1
# Distinct Embarked values; NaN is listed too because unique() does not drop missing values.
df_train.Embarked.unique()
array(['S', 'C', 'Q', nan], dtype=object)
1
2
# Cabin types: the leading character of every non-missing Cabin value
known_cabins = df_train.Cabin.dropna()
known_cabins.str[0].unique()
array(['C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

Preprocessing

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# Build the cleaned training frame `df_clean` from `df_train`.
# Every step below assigns columns in place, so the original row index
# stays unique (0..n-1) instead of being scrambled by a merge/concat round trip.
df_clean = df_train.copy()

# Make Sex a binary attribute (1 = male, 0 = anything else).
df_clean["Sex"] = (df_clean["Sex"] == "male").astype(int)

# Cabin: keep only the leading letter. Missing cabins (and any value that does
# not start with a letter) are put in a separate "Unk" group.
df_clean["Cabin"] = (
    df_clean["Cabin"].str.extract(r"^([a-zA-Z])", expand=False).fillna("Unk")
)

# Title from Name: the first capitalised word that is followed by a period.
df_clean["Title"] = df_clean["Name"].str.extract(r"([A-Z][a-zA-Z]+)\.", expand=False)

# Make rare titles (fewer than 10 passengers) "Other". The counts are computed
# once up front rather than once per row; names with no title also become "Other".
title_counts = df_clean["Title"].value_counts()
common_titles = title_counts[title_counts >= 10].index
df_clean["Title"] = df_clean["Title"].where(
    df_clean["Title"].isin(common_titles), "Other"
)

# Median Age per (Pclass, Title), with columns Pclass / Title / Age.
# Kept as a lookup table because the test-set cell reuses it for imputation.
byPclassAndTitle = (
    df_clean.groupby(["Pclass", "Title"])["Age"].median().reset_index()
)

# Impute Age by the median of the same Pclass & Title group. transform() returns
# a Series aligned to df_clean's own index, so fillna() lines up row by row.
group_median_age = df_clean.groupby(["Pclass", "Title"])["Age"].transform("median")
df_clean["Age"] = df_clean["Age"].fillna(group_median_age)

# Embarked: missing values become their own "unk" category.
df_clean["Embarked"] = df_clean["Embarked"].fillna("unk")
df_clean.sort_values("PassengerId", inplace=True)
1
# Title distribution after the rare titles were grouped into "Other".
df_clean.Title.value_counts()
Mr        517
Miss      182
Mrs       125
Master     40
Other      27
Name: Title, dtype: int64
1
# Embarked distribution after imputation; the previously missing values show up as "unk".
df_clean.Embarked.value_counts()
S      644
C      168
Q       77
unk      2
Name: Embarked, dtype: int64

Model

1
2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
1
# List the columns available after cleaning (the raw columns plus the derived Title).
df_clean.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title'],
      dtype='object')
1
2
3
4
5
6
7
8
9
10
# Feature selection.
# An earlier variant (kept here for reference) also used Cabin and Embarked:
#   kept_cols = ['Pclass', 'Sex', 'Age', 'SibSp',
#                'Parch', 'Fare', 'Cabin', 'Embarked', 'Title']
#   cat_cols = ['Cabin', 'Embarked', 'Title']

# Categorical features, which get one-hot encoded.
cat_cols = ['Title']

# All features fed to the model.
kept_cols = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Title',
]

# Everything that is kept but not categorical is used as-is.
num_cols = [col for col in kept_cols if col not in cat_cols]
1
# Feature matrix: the numeric columns side by side with the one-hot encoded
# categorical columns (the first dummy level of each is dropped).
numeric_part = df_clean[num_cols]
dummy_part = pd.get_dummies(df_clean[cat_cols], drop_first=True)
X = pd.concat([numeric_part, dummy_part], axis=1)
1
# Display the assembled feature matrix.
X
Pclass Sex Age SibSp Parch Fare Title_Miss Title_Mr Title_Mrs Title_Other
0 3 1 22.0 1 0 7.2500 0 1 0 0
1 1 0 38.0 1 0 71.2833 0 0 1 0
2 3 0 26.0 0 0 7.9250 1 0 0 0
3 1 0 35.0 1 0 53.1000 0 0 1 0
4 3 1 35.0 0 0 8.0500 0 1 0 0
... ... ... ... ... ... ... ... ... ... ...
886 2 1 27.0 0 0 13.0000 0 0 0 1
887 1 0 19.0 0 0 30.0000 1 0 0 0
176 3 0 18.0 1 2 23.4500 1 0 0 0
889 1 1 26.0 0 0 30.0000 0 1 0 0
890 3 1 32.0 0 0 7.7500 0 1 0 0

891 rows × 10 columns

1
# Target vector: the Survived column of the cleaned training frame.
y = df_clean.loc[:, 'Survived']
1
2
3
4
5
6
7
8
9
# Hold out 20% of the rows for validation. The seed is fixed so that the split,
# and therefore the validation accuracy printed below, is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# LightGBM classifier with trees capped at depth 5 and at most 15 leaves,
# using all available cores (n_jobs=-1).
clf = lightgbm.LGBMClassifier(
    max_depth=5,
    min_child_weight=0.1,
    n_jobs=-1,
    num_leaves=15,
)
# Alternative model that was tried: clf = xgboost.XGBClassifier()
clf.fit(X=X_train, y=y_train)

# Score the fitted model on the held-out validation rows.
clf.score(X_val, y_val)
0.8715083798882681

Apply the same preprocessing to the test data

1
# Load the test data from the current working directory.
test_csv_path = os.getcwd() + '/test.csv'
df_test = pd.read_csv(test_csv_path)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Build the cleaned test frame `df_test_clean` from `df_test`, mirroring the
# training preprocessing. Columns are assigned in place so the row index of
# df_test is preserved (no merge/concat round trip that would reset it).
df_test_clean = df_test.copy()

# Make Sex a binary attribute (1 = male, 0 = anything else).
df_test_clean["Sex"] = (df_test_clean["Sex"] == "male").astype(int)

# Cabin: keep only the leading letter. Missing cabins (and any value that does
# not start with a letter) are put in a separate "Unk" group.
df_test_clean["Cabin"] = (
    df_test_clean["Cabin"].str.extract(r"^([a-zA-Z])", expand=False).fillna("Unk")
)

# Title from Name: the first capitalised word that is followed by a period.
df_test_clean["Title"] = df_test_clean["Name"].str.extract(
    r"([A-Z][a-zA-Z]+)\.", expand=False
)

# Make rare titles "Other". The frequent titles are hard-coded rather than
# recounted on the test set; names with no title also become "Other".
frequent_titles = ["Mr", "Miss", "Mrs", "Master"]
df_test_clean["Title"] = df_test_clean["Title"].where(
    df_test_clean["Title"].isin(frequent_titles), "Other"
)

# Impute Age by the median of the same Pclass & Title group.
# Use the result from train (byPclassAndTitle: columns Pclass, Title, Age), so no
# statistic is estimated from the test set. Joining on the two key columns keeps
# df_test_clean's index, which lets fillna() line up row by row. A (Pclass, Title)
# pair that never occurred in train has no median and its Age stays NaN.
train_median_age = (
    byPclassAndTitle.set_index(["Pclass", "Title"])["Age"].rename("Age_train_median")
)
age_fallback = df_test_clean[["Pclass", "Title"]].join(
    train_median_age, on=["Pclass", "Title"]
)["Age_train_median"]
df_test_clean["Age"] = df_test_clean["Age"].fillna(age_fallback)

# Embarked: missing values become their own "unk" category.
df_test_clean["Embarked"] = df_test_clean["Embarked"].fillna("unk")
df_test_clean.sort_values("PassengerId", inplace=True)
1
# Test feature matrix: numeric columns plus one-hot encoded categorical columns.
# All dummy levels are kept here (no drop_first): which level counts as "first"
# depends on the categories present in *this* frame, so dropping it could remove
# a different column than the one dropped when encoding the training data.
# The following cell restricts X_test to X_train.columns, which discards the
# surplus level again.
test_numeric_part = df_test_clean[num_cols]
test_dummy_part = pd.get_dummies(df_test_clean[cat_cols], drop_first=False)
X_test = pd.concat([test_numeric_part, test_dummy_part], axis=1)
1
# Align the test features with the columns (and column order) the model was fitted on.
# A dummy column that exists in training but is absent from the test set is created
# and filled with 0; columns that exist only in the test set are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
1
# Predict a 0/1 Survived label for every row of the test feature matrix.
clf.predict(X_test)
array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
1
# Read gender_submission.csv from the current working directory.
# NOTE(review): despite the variable name, this file looks like Kaggle's sample
# submission rather than real test labels -- confirm before treating it as ground truth.
sample_submission_path = os.getcwd() + '/gender_submission.csv'
y_test_truth = pd.read_csv(sample_submission_path)
1
# Score the predictions against the Survived column of gender_submission.csv.
# Rows are compared positionally, so this assumes the file lists passengers in the
# same order as X_test (sorted by PassengerId).
# NOTE(review): this is a real test accuracy only if that file holds true labels;
# the Kaggle score quoted at the top (0.76794) is well below the value printed here -- verify.
clf.score(X_test, y_test_truth.Survived)
0.8827751196172249

Export Prediction

1
2
3
4
# Build the submission from the IDs of the rows that were actually predicted.
# X_test was assembled from df_test_clean without reordering rows, so taking
# PassengerId from df_test_clean pairs each prediction with its own passenger
# instead of relying on gender_submission.csv having the same row order.
y_submission = pd.DataFrame(
    {
        "PassengerId": df_test_clean["PassengerId"].to_numpy(),
        "Survived": clf.predict(X_test),
    }
)
y_submission.set_index("PassengerId", inplace=True)
# Written with PassengerId as the index column, giving a two-column CSV.
y_submission.to_csv(os.getcwd()+'/Submission.csv')