In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [30]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
In [31]:
df = pd.read_csv('titanic_toy.csv')
In [32]:
df.head()
Out[32]: Age Fare Family Survived
0 22.0 7.2500 1 0
1 38.0 71.2833 1 1
2 26.0 7.9250 0 1
3 35.0 53.1000 1 1
4 35.0 8.0500 0 0
In [33]:
df.isnull().mean()
Out[33]: Age 0.198653
Fare 0.050505
Family 0.000000
Survived 0.000000
dtype: float64
In [34]:
X = df.drop(columns=['Survived'])
y = df['Survived']
In [35]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_stat
In [36]:
X_train['Age_99'] = X_train['Age'].fillna(99)
X_train['Age_minus1'] = X_train['Age'].fillna(-1)
X_train['Fare_999'] = X_train['Fare'].fillna(999)
X_train['Fare_minus1'] = X_train['Fare'].fillna(-1)
<ipython-input-36-cb3531bd821d>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-doc
s/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train['Age_99'] = X_train['Age'].fillna(99)
<ipython-input-36-cb3531bd821d>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-doc
s/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train['Age_minus1'] = X_train['Age'].fillna(-1)
<ipython-input-36-cb3531bd821d>:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-doc
s/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train['Fare_999'] = X_train['Fare'].fillna(999)
<ipython-input-36-cb3531bd821d>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-doc
s/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
X_train['Fare_minus1'] = X_train['Fare'].fillna(-1)
In [37]:
print('Original Age variable variance: ', X_train['Age'].var())
print('Age Variance after 99 wala imputation: ', X_train['Age_99'].var())
print('Age Variance after -1 wala imputation: ', X_train['Age_minus1'].var())
print('Original Fare variable variance: ', X_train['Fare'].var())
print('Fare Variance after 999 wala imputation: ', X_train['Fare_999'].var())
print('Fare Variance after -1 wala imputation: ', X_train['Fare_minus1'].var()
Original Age variable variance: 204.3495133904614
Age Variance after 99 wala imputation: 951.7275570187172
Age Variance after -1 wala imputation: 318.0896202624484
Original Fare variable variance: 2448.197913706318
Fare Variance after 999 wala imputation: 47219.20265217623
Fare Variance after -1 wala imputation: 2378.5676784883503
In [38]:
fig = plt.figure()
ax = fig.add_subplot(111)
# original variable distribution
X_train['Age'].plot(kind='kde', ax=ax)
# variable imputed with the median
X_train['Age_99'].plot(kind='kde', ax=ax, color='red')
# variable imputed with the mean
X_train['Age_minus1'].plot(kind='kde', ax=ax, color='green')
# add legends
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')
Out[38]: <matplotlib.legend.Legend at 0x227a8f2a3a0>
In [20]:
fig = plt.figure()
ax = fig.add_subplot(111)
# original variable distribution
X_train['Fare'].plot(kind='kde', ax=ax)
# variable imputed with the median
X_train['Fare_999'].plot(kind='kde', ax=ax, color='red')
# variable imputed with the mean
X_train['Fare_minus1'].plot(kind='kde', ax=ax, color='green')
# add legends
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')
Out[20]: <matplotlib.legend.Legend at 0x227a8bb0430>
In [13]:
X_train.cov()
Out[13]: Age Fare Family Age_99 Age_minus1 Fare_99
Age 204.349513 70.719262 -6.498901 204.349513 204.349513 162.79343
Fare 70.719262 2448.197914 17.258917 -101.671097 125.558364 2448.19791
Family -6.498901 17.258917 2.735252 -7.387287 -4.149246 11.52862
Age_99 204.349513 -101.671097 -7.387287 951.727557 -189.535540 -159.93166
Age_minus1 204.349513 125.558364 -4.149246 -189.535540 318.089620 257.37988
Fare_999 162.793430 2448.197914 11.528625 -159.931663 257.379887 47219.20265
Fare_minus1 63.321188 2448.197914 16.553989 -94.317400 114.394141 762.47498
In [14]:
X_train.corr()
Out[14]: Age Fare Family Age_99 Age_minus1 Fare_999 Fare_m
Age 1.000000 0.092644 -0.299113 1.000000 1.000000 0.051179 0.08
Fare 0.092644 1.000000 0.208268 -0.066273 0.142022 1.000000 1.00
Family -0.299113 0.208268 1.000000 -0.144787 -0.140668 0.032079 0.20
Age_99 1.000000 -0.066273 -0.144787 1.000000 -0.344476 -0.023857 -0.06
Age_minus1 1.000000 0.142022 -0.140668 -0.344476 1.000000 0.066411 0.1
Fare_999 0.051179 1.000000 0.032079 -0.023857 0.066411 1.000000 0.07
Fare_minus1 0.084585 1.000000 0.205233 -0.062687 0.131514 0.071946 1.00
Using Sklearn
In [39]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_stat
In [22]:
imputer1 = SimpleImputer(strategy='constant',fill_value=99)
imputer2 = SimpleImputer(strategy='constant',fill_value=999)
In [40]:
trf = ColumnTransformer([
('imputer1',imputer1,['Age']),
('imputer2',imputer2,['Fare'])
],remainder='passthrough')
In [41]:
trf.fit(X_train)
Out[41]: ColumnTransformer(remainder='passthrough',
transformers=[('imputer1',
SimpleImputer(fill_value=99,
strategy='constant'),
['Age']),
('imputer2',
SimpleImputer(fill_value=999,
strategy='constant'),
['Fare'])])
In [42]:
trf.named_transformers_['imputer1'].statistics_
Out[42]: array([99.])
In [43]:
trf.named_transformers_['imputer2'].statistics_
Out[43]: array([999.])
In [44]:
X_train = trf.transform(X_train)
X_test = trf.transform(X_test)
In [45]:
X_train
Out[45]: array([[ 40. , 27.7208, 0. ],
[ 4. , 16.7 , 2. ],
[ 47. , 9. , 0. ],
...,
[ 71. , 49.5042, 0. ],
[ 99. , 221.7792, 0. ],
[ 99. , 25.925 , 0. ]])
In [ ]: