29/04/2018 PROJECT_B_FINAL
In [144]:
from scipy import stats
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
In [3]:
train_path='/Users/stefanoskarageorgiou/Desktop/train1.csv'
train=pd.read_csv(train_path)
In [4]:
train.head()
Out[4]:
MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities
60 RL 65.0 8450 Pave NaN Reg Lvl AllPub
20 RL 80.0 9600 Pave NaN Reg Lvl AllPub
3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub
4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub
5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub
s × 81 columns
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 1/14
29/04/2018 PROJECT_B_FINAL
In [5]:
train.describe()
Out[5]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond Yea
count 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000 1460.000000 1460.0
mean 730.500000 56.897260 70.049958 10516.828082 6.099315 5.575342 1971.2
std 421.610009 42.300571 24.284752 9981.264932 1.382997 1.112799 30.2
min 1.000000 20.000000 21.000000 1300.000000 1.000000 1.000000 1872.0
25% 365.750000 20.000000 59.000000 7553.500000 5.000000 5.000000 1954.0
50% 730.500000 50.000000 69.000000 9478.500000 6.000000 5.000000 1973.0
75% 1095.250000 70.000000 80.000000 11601.500000 7.000000 6.000000 2000.0
max 1460.000000 190.000000 313.000000 215245.000000 10.000000 9.000000 2010.0
8 rows × 38 columns
In [6]:
train.isnull().sum().sort_values(ascending=False)[:20]
Out[6]:
PoolQC 1453
MiscFeature 1406
Alley 1369
Fence 1179
FireplaceQu 690
LotFrontage 259
GarageCond 81
GarageType 81
GarageYrBlt 81
GarageFinish 81
GarageQual 81
BsmtExposure 38
BsmtFinType2 38
BsmtFinType1 37
BsmtCond 37
BsmtQual 37
MasVnrArea 8
MasVnrType 8
Electrical 1
Utilities 0
dtype: int64
filling the missing values with the mean
In [7]:
train['LotFrontage'].fillna((train['LotFrontage'].mean()), inplace=True)
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 2/14
29/04/2018 PROJECT_B_FINAL
In [8]:
train.Alley=train.Alley.fillna(value="NA")
In [9]:
train['MasVnrArea'].fillna((train['MasVnrArea'].mean()), inplace=True)
In [10]:
train.MasVnrType=train.MasVnrType.fillna(value="NA")
In [11]:
train.BsmtQual=train.BsmtQual.fillna(value="NA")
In [12]:
train.BsmtCond=train.BsmtCond.fillna(value="NA")
In [13]:
train.BsmtExposure=train.BsmtExposure.fillna(value="NA")
In [14]:
train.BsmtFinType1=train.BsmtFinType1.fillna(value="NA")
In [15]:
train.BsmtFinType2=train.BsmtFinType2.fillna(value="NA")
In [16]:
train.Electrical=train.Electrical.fillna(value="NA")
In [17]:
train.FireplaceQu=train.FireplaceQu.fillna(value="NA")
In [18]:
train.GarageType=train.GarageType.fillna(value="NA")
In [19]:
train.GarageYrBlt=train.GarageYrBlt.fillna(value=train.YearBuilt)
In [20]:
train.GarageFinish=train.GarageFinish.fillna(value='NA')
In [21]:
train.GarageQual=train.GarageQual.fillna(value='NA')
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 3/14
29/04/2018 PROJECT_B_FINAL
In [22]:
train.GarageCond=train.GarageCond.fillna(value='NA')
In [23]:
train.PoolQC=train.PoolQC.fillna(value='NA')
In [24]:
train.Fence=train.Fence.fillna(value='NA')
In [25]:
train.MiscFeature=train.MiscFeature.fillna(value='NA')
check if every value is nan
In [26]:
train.isnull().sum().sort_values(ascending=False)[:20]
Out[26]:
SalePrice 0
Heating 0
RoofStyle 0
RoofMatl 0
Exterior1st 0
Exterior2nd 0
MasVnrType 0
MasVnrArea 0
ExterQual 0
ExterCond 0
Foundation 0
BsmtQual 0
BsmtCond 0
BsmtExposure 0
BsmtFinType1 0
BsmtFinSF1 0
BsmtFinType2 0
BsmtFinSF2 0
BsmtUnfSF 0
YearRemodAdd 0
dtype: int64
In [27]:
y=train.SalePrice
In [28]:
X=train.drop('SalePrice',axis=1)
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 4/14
29/04/2018 PROJECT_B_FINAL
In [29]:
mapping1 = {'NA':0, 'Po':1 , 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
mapping2 = {'NA':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4}
mapping3 = {'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
mapping4 = {'NA':0, 'Unf':1, 'RFn':2, 'Fin':3}
In [30]:
X.ExterQual = X.ExterQual.map(mapping1)
X.ExterCond = X.ExterCond.map(mapping1)
X.BsmtQual = X.BsmtQual.map(mapping1)
X.BsmtCond = X.BsmtCond.map(mapping1)
X.HeatingQC = X.HeatingQC.map(mapping1)
X.BsmtExposure = X.BsmtExposure.map(mapping2)
X.BsmtFinType1 = X.BsmtFinType1.map(mapping3)
X.BsmtFinType2 = X.BsmtFinType2.map(mapping3)
X.PoolQC=X.PoolQC.map(mapping1)
X.GarageCond=X.GarageCond.map(mapping1)
X.GarageQual=X.GarageQual.map(mapping1)
X.GarageFinish=X.GarageFinish.map(mapping4)
X.FireplaceQu=X.FireplaceQu.map(mapping1)
X.KitchenQual=X.KitchenQual.map(mapping1)
In [149]:
plt.plot(train['SalePrice'])
plt.show()
In [31]:
X.PoolQC.unique()
Out[31]:
array([0, 5, 2, 4])
In [32]:
X.shape
Out[32]:
(1460, 80)
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 5/14
29/04/2018 PROJECT_B_FINAL
In [33]:
X=pd.get_dummies(X)
In [34]:
X.shape
Out[34]:
(1460, 246)
In [35]:
from sklearn.linear_model import LinearRegression
In [41]:
lr = LinearRegression()
lr.fit(X, y)
Out[41]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=
False)
In [45]:
import math
#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
assert len(y) == len(y_pred)
terms_to_sum = [(math.log(abs(y_pred[i] + 1)) - math.log(y[i] + 1)) ** 2.0 for i
return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5
In [46]:
from sklearn.model_selection import train_test_split
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y,test_size=0.25)
In [47]:
X_tr.shape, X_ts.shape, y_tr.shape, y_ts.shape
Out[47]:
((1095, 246), (365, 246), (1095,), (365,))
In [48]:
lr=LinearRegression()
lr.fit(X_tr,y_tr)
Out[48]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=
False)
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 6/14
29/04/2018 PROJECT_B_FINAL
In [49]:
y_pred=lr.predict(X_ts)
y_true=y_ts.as_matrix()
res=rmsle(y_true,y_pred)
In [50]:
res
Out[50]:
0.23831635704788817
In [134]:
plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8,square=True);
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 7/14
29/04/2018 PROJECT_B_FINAL
In [141]:
data = pd.concat([train['SalePrice'], train['OverallQual']], axis=1)
data.plot.scatter(x='OverallQual', y='SalePrice',ylim=(0,800000))
Out[141]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a21a7b890>
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 8/14
29/04/2018 PROJECT_B_FINAL
In [145]:
sns.distplot(train['OverallQual'], fit=stats.norm);
plt.figure()
stats.probplot(train['SalePrice'],plot=plt)
Out[145]:
((array([-3.30513952, -3.04793228, -2.90489705, ..., 2.90489705,
3.04793228, 3.30513952]),
array([ 34900, 35311, 37900, ..., 625000, 745000, 755000])),
(74160.16474519415, 180921.19589041095, 0.9319665641512986))
In [51]:
#improve_1
In [ ]:
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 9/14
29/04/2018 PROJECT_B_FINAL
In [119]:
corr = train.corr()
print (corr['SalePrice'].sort_values(ascending=False)[:25], '\n')
print (corr['SalePrice'].sort_values(ascending=False)[-25:])
(SalePrice 1.000000
OverallQual 0.790982
GrLivArea 0.708624
GarageCars 0.640409
GarageArea 0.623431
TotalBsmtSF 0.613581
1stFlrSF 0.605852
FullBath 0.560664
TotRmsAbvGrd 0.533723
YearBuilt 0.522897
GarageYrBlt 0.508043
YearRemodAdd 0.507101
MasVnrArea 0.475241
Fireplaces 0.466929
BsmtFinSF1 0.386420
LotFrontage 0.334901
WoodDeckSF 0.324413
2ndFlrSF 0.319334
OpenPorchSF 0.315856
HalfBath 0.284108
LotArea 0.263843
BsmtFullBath 0.227122
BsmtUnfSF 0.214479
BedroomAbvGr 0.168213
ScreenPorch 0.111447
Name: SalePrice, dtype: float64, '\n')
Fireplaces 0.466929
BsmtFinSF1 0.386420
LotFrontage 0.334901
WoodDeckSF 0.324413
2ndFlrSF 0.319334
OpenPorchSF 0.315856
HalfBath 0.284108
LotArea 0.263843
BsmtFullBath 0.227122
BsmtUnfSF 0.214479
BedroomAbvGr 0.168213
ScreenPorch 0.111447
PoolArea 0.092404
MoSold 0.046432
3SsnPorch 0.044584
BsmtFinSF2 -0.011378
BsmtHalfBath -0.016844
MiscVal -0.021190
Id -0.021917
LowQualFinSF -0.025606
YrSold -0.028923
OverallCond -0.077856
MSSubClass -0.084284
EnclosedPorch -0.128578
KitchenAbvGr -0.135907
Name: SalePrice, dtype: float64
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 10/14
29/04/2018 PROJECT_B_FINAL
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [120]:
X_1=train.drop(['SalePrice','MiscVal','BsmtFinSF2','BsmtHalfBath','Id','LowQualFinSF
In [121]:
from sklearn.model_selection import train_test_split
X_1_tr, X_1_ts, y_tr, y_ts = train_test_split(X_1, y,test_size=0.25)
In [122]:
X_1_tr.shape, X_1_ts.shape, y_tr.shape, y_ts.shape
Out[122]:
((1095, 73), (365, 73), (1095,), (365,))
In [123]:
mapping1 = {'NA':0, 'Po':1 , 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
mapping2 = {'NA':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4}
mapping3 = {'NA':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}
mapping4 = {'NA':0, 'Unf':1, 'RFn':2, 'Fin':3}
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 11/14
29/04/2018 PROJECT_B_FINAL
In [124]:
X_1.ExterQual = X_1.ExterQual.map(mapping1)
X_1.ExterCond = X_1.ExterCond.map(mapping1)
X_1.BsmtQual = X_1.BsmtQual.map(mapping1)
X_1.BsmtCond = X_1.BsmtCond.map(mapping1)
X_1.HeatingQC = X_1.HeatingQC.map(mapping1)
X_1.BsmtExposure = X_1.BsmtExposure.map(mapping2)
X_1.BsmtFinType1 = X_1.BsmtFinType1.map(mapping3)
X_1.BsmtFinType2 = X_1.BsmtFinType2.map(mapping3)
X_1.PoolQC=X_1.PoolQC.map(mapping1)
X_1.GarageCond=X_1.GarageCond.map(mapping1)
X_1.GarageQual=X_1.GarageQual.map(mapping1)
X_1.GarageFinish=X_1.GarageFinish.map(mapping4)
X_1.FireplaceQu=X_1.FireplaceQu.map(mapping1)
X_1.KitchenQual=X_1.KitchenQual.map(mapping1)
In [125]:
from sklearn.model_selection import train_test_split
X_1_tr, X_1_ts, y_tr, y_ts = train_test_split(X, y,test_size=0.25)
In [126]:
lr=LinearRegression()
lr.fit(X_1_tr,y_tr)
Out[126]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=
False)
In [127]:
y_pred=lr.predict(X_1_ts)
y_true=y_ts.as_matrix()
res=rmsle(y_true,y_pred)
In [128]:
res
Out[128]:
0.20235051475055066
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 12/14
29/04/2018 PROJECT_B_FINAL
In [150]:
actual_values = y_ts
plt.scatter(y_pred, actual_values, alpha=0.75, color='b')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Linear Regression Model')
plt.show()
In [87]:
# improve 2
In [129]:
from sklearn.linear_model import Lasso
In [131]:
lass = Lasso(alpha=0.01) # alpha determines the strength of the regularization
# the higer the value, the more coefficients will be zero
lass.fit(X_1_tr, y_tr)
/anaconda2/lib/python2.7/site-packages/sklearn/linear_model/coordinate
_descent.py:491: ConvergenceWarning: Objective did not converge. You m
ight want to increase the number of iterations. Fitting data with very
small alpha may cause precision problems.
ConvergenceWarning)
Out[131]:
Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
normalize=False, positive=False, precompute=False, random_state=Non
e,
selection='cyclic', tol=0.0001, warm_start=False)
In [132]:
y_pred=lass.predict(X_1_ts)
y_true=y_ts.as_matrix()
res=rmsle(y_true,y_pred)
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 13/14
29/04/2018 PROJECT_B_FINAL
In [133]:
res
Out[133]:
0.18142288750842797
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
http://localhost:8888/notebooks/PROJECT_B_FINAL.ipynb# 14/14