# import libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import svm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     cross_val_score, StratifiedKFold)
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks, ClusterCentroids
from imblearn.combine import SMOTETomek
%matplotlib inline
# import data
wine = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
    delimiter=";")
EDA
column names
wine.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')
barplot of the response
plt.figure(figsize=(10, 6))
sns.countplot(wine["quality"], palette="muted")
wine["quality"].value_counts()
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64
Data Transformation
We want to transform the numeric quality score into a low/medium/high quality level (categorical):
3, 4 -> low
5, 6 -> medium
7, 8, 9 -> high
quality = wine["quality"].values
category = []
for num in quality:
    if num < 5:
        category.append("Low")
    elif num > 6:
        category.append("High")
    else:
        category.append("Medium")
[(i, category.count(i)) for i in set(category)]
[('Medium', 3655), ('Low', 183), ('High', 1060)]
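As a side note, the same binning can be written in one line with pd.cut; a minimal sketch, equivalent to the loop above (bin edges chosen so that (2, 4] -> Low, (4, 6] -> Medium, (6, 9] -> High; category_alt is a hypothetical name):
category_alt = pd.cut(wine["quality"], bins=[2, 4, 6, 9],
                      labels=["Low", "Medium", "High"]).astype(str).tolist()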
barplot of the response after transformation
plt.figure(figsize=(10, 6))
sns.countplot(category, palette="muted")
Correlation matrix to check features
plt.figure(figsize=(12, 6))
sns.heatmap(wine.corr(), annot=True)
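If you only care about how each feature relates to the response, sorting one column of the correlation matrix is a quick complement to the heatmap; a minimal sketch:
wine.corr()['quality'].drop('quality').sort_values(ascending=False)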
Set up model matrix
quality = wine["quality"].values
category = []
for num in quality:
    if num < 5:
        category.append("Low")
    elif num > 6:
        category.append("High")
    else:
        category.append("Medium")
category = pd.DataFrame(data=category, columns=["category"])
data = pd.concat([wine, category], axis=1)
data.drop(columns="quality", axis=1, inplace=True)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
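LabelEncoder assigns integer codes in alphabetical order of the class names, which is worth printing once since the relabeling steps below rely on it; a quick check with the fitted encoder:
dict(zip(labelencoder_y.classes_,
         labelencoder_y.transform(labelencoder_y.classes_)))
# expected: {'High': 0, 'Low': 1, 'Medium': 2}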
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2018)
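Because the classes are heavily imbalanced, a stratified split would keep the class proportions equal in train and test; a minimal alternative sketch (not what was run here, so downstream numbers would change):
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2018, stratify=y)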
Build models
Random Forest
clf = RandomForestClassifier(random_state=2018, oob_score=True)
param_dist = {"n_estimators": [50, 100, 150, 200, 250],
              'min_samples_leaf': [1, 2, 4]}
rfc_gs = GridSearchCV(clf, param_grid=param_dist, scoring='accuracy', cv=5)
rfc_gs.fit(X_train, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=True, random_state=2018, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100, 150, 200, 250],
                   'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
rfc_gs.best_score_
0.8363961204696274
SVM
# StratifiedKFold
skf = StratifiedKFold(n_splits=4)
pipe_svm = Pipeline([('clf', svm.SVC())])
grid_params = dict(clf__C=[0.1, 0.3, 1, 3, 10],
                   clf__gamma=[0.1, 0.3, 1, 3, 10],
                   clf__kernel=['rbf', 'sigmoid'])
gs_svm = GridSearchCV(estimator=pipe_svm,
                      param_grid=grid_params,
                      scoring='accuracy',
                      cv=skf)
gs_svm.fit(X_train, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None,
                coef0=0.0, decision_function_shape='ovr', degree=3,
                gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__C': [0.1, 0.3, 1, 3, 10],
                   'clf__gamma': [0.1, 0.3, 1, 3, 10],
                   'clf__kernel': ['rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
print(gs_svm.best_score_)
0.8083205717202654
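Note that SVC is sensitive to feature scale and these features span very different ranges; a minimal sketch of the same pipeline with standardization in front (an alternative worth trying, not the model fitted above; grid_params stays valid since the classifier step is still named 'clf'):
pipe_svm_scaled = Pipeline([('scl', StandardScaler()),
                            ('clf', svm.SVC())])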
# just for comparison
pred_svm = gs_svm.predict(X_test)
print(classification_report(y_test, pred_svm))
print("The SVM model accuracy on Test data is %s" %
accuracy_score(y_test, pred_svm))
              precision    recall  f1-score   support

           0       1.00      0.34      0.51       209
           1       1.00      0.06      0.11        35
           2       0.81      1.00      0.90       736

   micro avg       0.83      0.83      0.83       980
   macro avg       0.94      0.47      0.51       980
weighted avg       0.86      0.83      0.79       980

The SVM model accuracy on Test data is 0.826530612244898
# @hidden_cell
# relabel back: 0 means good, 1 means low, 2 means medium, for better visualization
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"
pred_svm_re = list(pred_svm)
for i in range(len(pred_svm_re)):
    if pred_svm_re[i] == 0:
        pred_svm_re[i] = "good"
    if pred_svm_re[i] == 1:
        pred_svm_re[i] = "low"
    if pred_svm_re[i] == 2:
        pred_svm_re[i] = "medium"
y_actu = pd.Series(y_test_re, name='Actual')
y_pred = pd.Series(pred_svm_re, name='Predicted')
svm_confusion = pd.crosstab(y_actu, y_pred)
svm_confusion
Predicted  good  low  medium
Actual
good         72    0     137
low           0    2      33
medium        0    0     736
Decision Tree
clf = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(random_state=42)),
    ('clf', DecisionTreeClassifier(random_state=42))])
criterion = ['gini', 'entropy']
splitter = ['best']
max_depth = [8, 9, 10, 11, 15, 20, 25]
min_samples_leaf = [2, 3, 5]
class_weight = ['balanced', None]
param_grid = [{'clf__class_weight': class_weight,
               'clf__criterion': criterion,
               'clf__splitter': splitter,
               'clf__max_depth': max_depth,
               'clf__min_samples_leaf': min_samples_leaf}]
gs_dt = GridSearchCV(estimator=clf, param_grid=param_grid,
                     scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_dt.fit(X_train, y_train)
Fitting 5 folds for each of 84 candidates, totalling 420 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 321 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 405 out of 420 | elapsed:    6.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:    6.5s finished
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('scl', StandardScaler(copy=True, with_mean=True,
                with_std=True)), ('pca', PCA(copy=True, iterated_power='auto',
                n_components=None, random_state=42, svd_solver='auto',
                tol=0.0, whiten=False)), ('clf',
                DecisionTreeClassifier(class_weight=None, criterion='gini',
                max_depth=None, ma... min_weight_fraction_leaf=0.0,
                presort=False, random_state=42, splitter='best'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'clf__class_weight': ['balanced', None],
                    'clf__criterion': ['gini', 'entropy'],
                    'clf__splitter': ['best'],
                    'clf__max_depth': [8, 9, 10, 11, 15, 20, 25],
                    'clf__min_samples_leaf': [2, 3, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)
print(gs_dt.best_score_)
0.7595712098009189
KNN
# knn with pca grid search
pipe_knn_pca = Pipeline([('pca', PCA()),
                         ('clf', KNeighborsClassifier())])
grid_params_knn_pca = dict(pca__n_components=range(1, 11),
                           clf__n_neighbors=range(1, 10))
gs_knn_pca = GridSearchCV(estimator=pipe_knn_pca,
                          param_grid=grid_params_knn_pca,
                          scoring='accuracy')
gs_knn_pca.fit(X_train, y_train)
/Users/tianqiluke/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2053:
FutureWarning: You should specify a value for 'cv' instead of relying on the default value.
The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('pca', PCA(copy=True, iterated_power='auto',
                n_components=None, random_state=None, svd_solver='auto',
                tol=0.0, whiten=False)), ('clf',
                KNeighborsClassifier(algorithm='auto', leaf_size=30,
                metric='minkowski', metric_params=None, n_jobs=None,
                n_neighbors=5, p=2, weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'pca__n_components': range(1, 11),
                   'clf__n_neighbors': range(1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
print(gs_knn_pca.best_score_)
0.7391526288922919
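KNN distances and PCA components are both dominated by large-scale features (e.g. total sulfur dioxide) when the data is unscaled; a minimal sketch adding standardization before PCA (an alternative setup, not what was run above):
pipe_knn_pca_scaled = Pipeline([('scl', StandardScaler()),
                                ('pca', PCA()),
                                ('clf', KNeighborsClassifier())])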
" Therefore, random Forest gives best result. "
Check its performance on test data
# check its performance on test
pred_rfc = rfc_gs.predict(X_test)
print(classification_report(y_test, pred_rfc))
print("The RF model accuracy on Test data is %s" %
accuracy_score(y_test, pred_rfc))
              precision    recall  f1-score   support

           0       0.79      0.61      0.69       209
           1       0.57      0.11      0.19        35
           2       0.86      0.95      0.90       736

   micro avg       0.85      0.85      0.85       980
   macro avg       0.74      0.56      0.59       980
weighted avg       0.84      0.85      0.83       980

The RF model accuracy on Test data is 0.8469387755102041
# @hidden_cell
# relabel back: 0 means good, 1 means low, 2 means medium, for better visualization
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"
pred_rfc_re = list(pred_rfc)
for i in range(len(pred_rfc_re)):
    if pred_rfc_re[i] == 0:
        pred_rfc_re[i] = "good"
    if pred_rfc_re[i] == 1:
        pred_rfc_re[i] = "low"
    if pred_rfc_re[i] == 2:
        pred_rfc_re[i] = "medium"
y_actu = pd.Series(y_test_re, name='Actual')
y_pred = pd.Series(pred_rfc_re, name='Predicted')
rfc_confusion = pd.crosstab(y_actu, y_pred)
rfc_confusion
Predicted  good  low  medium
Actual
good        128    0      81
low           0    4      31
medium       35    3     698
Resampling
# undersample class 2 ("Medium") down to 1500;
# oversample classes 0 ("High") and 1 ("Low") up to 1500
smt = ClusterCentroids(ratio={2: 1500})
X_sm, y_sm = smt.fit_sample(X_train, y_train)
smt2 = SMOTE(ratio={0: 1500, 1: 1500})
X_sm2, y_sm2 = smt2.fit_sample(X_sm, y_sm)
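A quick sanity check with Counter (already imported) confirms what each resampling step did; a minimal sketch:
print(Counter(y_train))  # original training distribution
print(Counter(y_sm))     # after undersampling class 2 to 1500
print(Counter(y_sm2))    # after also oversampling classes 0 and 1 to 1500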
Random Forest
rfc_rs = RandomForestClassifier(random_state=2018)
param_dist = {"n_estimators": [50, 100, 150, 200, 250],
              'min_samples_leaf': [1, 2, 4]}
rfc_gs_rs = GridSearchCV(rfc_rs, param_grid=param_dist,
                         scoring='accuracy', cv=5)
rfc_gs_rs.fit(X_sm2, y_sm2)
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=2018, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [50, 100, 150, 200, 250],
                   'min_samples_leaf': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
rfc_gs_rs.best_score_
0.862
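One caveat: resampling was applied once, before cross-validation, so each CV fold contains synthetic/centroid points and the 0.862 score is likely optimistic. A minimal sketch of resampling inside each fold instead, using imblearn's sampler-aware pipeline (an alternative setup, not what was run above):
from imblearn.pipeline import Pipeline as ImbPipeline
pipe_rs = ImbPipeline([('under', ClusterCentroids(ratio={2: 1500})),
                       ('over', SMOTE(ratio={0: 1500, 1: 1500})),
                       ('clf', RandomForestClassifier(random_state=2018))])
gs_rs = GridSearchCV(pipe_rs,
                     param_grid={'clf__n_estimators': [50, 100, 150, 200, 250],
                                 'clf__min_samples_leaf': [1, 2, 4]},
                     scoring='accuracy', cv=5)
gs_rs.fit(X_train, y_train)  # samplers run only on each training fold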
importances = rfc_gs_rs.best_estimator_.feature_importances_
wine.columns[:-1]
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol'],
      dtype='object')
feature_importances = pd.DataFrame(importances, index=wine.columns[:-1],
                                   columns=['importance']
                                   ).sort_values('importance', ascending=False)
feature_importances.plot(kind='barh')
SVM_rs
# grid search after resampling
pipe_svm = Pipeline([('clf', svm.SVC())])
grid_params = dict(clf__C=[0.1, 0.3, 1, 3, 10],
                   clf__gamma=[0.1, 0.3, 1, 3, 10],
                   clf__kernel=['rbf', 'sigmoid'])
gs_svm_sm = GridSearchCV(estimator=pipe_svm,
                         param_grid=grid_params,
                         scoring='accuracy',
                         cv=skf)
gs_svm_sm.fit(X_sm2, y_sm2)
GridSearchCV(cv=StratifiedKFold(n_splits=4, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None,
                coef0=0.0, decision_function_shape='ovr', degree=3,
                gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__C': [0.1, 0.3, 1, 3, 10],
                   'clf__gamma': [0.1, 0.3, 1, 3, 10],
                   'clf__kernel': ['rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
gs_svm_sm.best_score_
0.822
DT_rs
clf = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(random_state=42)),
    ('clf', DecisionTreeClassifier(random_state=42))])
criterion = ['gini', 'entropy']
splitter = ['best']
max_depth = [8, 9, 10, 11, 15, 20, 25]
min_samples_leaf = [2, 3, 5]
class_weight = ['balanced', None]
param_grid = [{'clf__class_weight': class_weight,
               'clf__criterion': criterion,
               'clf__splitter': splitter,
               'clf__max_depth': max_depth,
               'clf__min_samples_leaf': min_samples_leaf}]
gs_dt_rs = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        scoring='accuracy',
                        cv=5,
                        verbose=1,
                        n_jobs=-1)
gs_dt_rs.fit(X_sm2, y_sm2)
Fitting 5 folds for each of 84 candidates, totalling 420 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 420 out of 420 | elapsed:    4.1s finished
GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('scl', StandardScaler(copy=True, with_mean=True,
                with_std=True)), ('pca', PCA(copy=True, iterated_power='auto',
                n_components=None, random_state=42, svd_solver='auto',
                tol=0.0, whiten=False)), ('clf',
                DecisionTreeClassifier(class_weight=None, criterion='gini',
                max_depth=None, ma... min_weight_fraction_leaf=0.0,
                presort=False, random_state=42, splitter='best'))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'clf__class_weight': ['balanced', None],
                    'clf__criterion': ['gini', 'entropy'],
                    'clf__splitter': ['best'],
                    'clf__max_depth': [8, 9, 10, 11, 15, 20, 25],
                    'clf__min_samples_leaf': [2, 3, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)
gs_dt_rs.best_score_
0.7455555555555555
KNN_rs
# knn with pca grid search
pipe_knn_pca = Pipeline([('pca', PCA()),
                         ('clf', KNeighborsClassifier())])
grid_params_knn_pca = dict(pca__n_components=range(1, 11),
                           clf__n_neighbors=range(1, 10))
gs_knn_pca_rs = GridSearchCV(estimator=pipe_knn_pca,
                             param_grid=grid_params_knn_pca,
                             scoring='accuracy')
gs_knn_pca_rs.fit(X_sm2, y_sm2)
/Users/tianqiluke/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2053:
FutureWarning: You should specify a value for 'cv' instead of relying on the default value.
The default value will change from 3 to 5 in version 0.22.
  warnings.warn(CV_WARNING, FutureWarning)
GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
            steps=[('pca', PCA(copy=True, iterated_power='auto',
                n_components=None, random_state=None, svd_solver='auto',
                tol=0.0, whiten=False)), ('clf',
                KNeighborsClassifier(algorithm='auto', leaf_size=30,
                metric='minkowski', metric_params=None, n_jobs=None,
                n_neighbors=5, p=2, weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'pca__n_components': range(1, 11),
                   'clf__n_neighbors': range(1, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)
gs_knn_pca_rs.best_score_
0.7715555555555556
" RF gives best result, but SVM also good as well, check both model under resample. "
" check their performance on test data "
SVM performance on test data
classification_report
# model after resampling
svm_rs_predicted = gs_svm_sm.predict(X_test)
print(classification_report(y_test, svm_rs_predicted))
print("---------------------------------------------------------")
print("The accuracy score of SVM (resampled) on test is %s" %
accuracy_score(y_test, svm_rs_predicted))
              precision    recall  f1-score   support

           0       0.68      0.51      0.58       209
           1       0.18      0.20      0.19        35
           2       0.84      0.90      0.87       736

   micro avg       0.79      0.79      0.79       980
   macro avg       0.57      0.53      0.55       980
weighted avg       0.78      0.79      0.78       980

---------------------------------------------------------
The accuracy score of SVM (resampled) on test is 0.7887755102040817
confusion matrix
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"
sm_pred_re = list(svm_rs_predicted)
for i in range(len(sm_pred_re)):
    if sm_pred_re[i] == 0:
        sm_pred_re[i] = "good"
    if sm_pred_re[i] == 1:
        sm_pred_re[i] = "low"
    if sm_pred_re[i] == 2:
        sm_pred_re[i] = "medium"
svm_rs_confusion = pd.crosstab(
    pd.Series(y_test_re, name='actual'),
    pd.Series(sm_pred_re, name='predicted'))
svm_rs_confusion
svm_rs_confusion
predicted  good  low  medium
actual
good        106    4      99
low           0    7      28
medium       49   27     660
Random Forest performance on test data
classification_report
pred_rfc_rs = rfc_gs_rs.predict(X_test)
print(classification_report(y_test, pred_rfc_rs))
print("The RF model(resampled) accuracy on test is %s" %
accuracy_score(y_test, pred_rfc_rs))
              precision    recall  f1-score   support

           0       0.52      0.84      0.64       209
           1       0.24      0.49      0.32        35
           2       0.92      0.71      0.80       736

   micro avg       0.73      0.73      0.73       980
   macro avg       0.56      0.68      0.59       980
weighted avg       0.81      0.73      0.75       980

The RF model(resampled) accuracy on test is 0.7326530612244898
y_test_re = list(y_test)
for i in range(len(y_test_re)):
    if y_test_re[i] == 0:
        y_test_re[i] = "good"
    if y_test_re[i] == 1:
        y_test_re[i] = "low"
    if y_test_re[i] == 2:
        y_test_re[i] = "medium"
pred_rfc_re = list(pred_rfc_rs)
for i in range(len(pred_rfc_re)):
    if pred_rfc_re[i] == 0:
        pred_rfc_re[i] = "good"
    if pred_rfc_re[i] == 1:
        pred_rfc_re[i] = "low"
    if pred_rfc_re[i] == 2:
        pred_rfc_re[i] = "medium"
y_actu = pd.Series(y_test_re, name='Actual')
y_pred = pd.Series(pred_rfc_re, name='Predicted')
rfc_rs_confusion = pd.crosstab(y_actu, y_pred)
confusion matrix
rfc_rs_confusion
Predicted  good  low  medium
Actual
good        174    0      35
low           2   11      22
medium      151   46     539
SVM (resampled) has higher accuracy on the test set than Random Forest (0.79 vs. 0.73), but RF (resampled) performs better on the good and low classes. Which model to use going forward is a trade-off that depends on the business purpose.
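To make that trade-off concrete, the per-class recall of the two fitted models can be compared directly; a minimal sketch (average=None returns one recall per class, in label order 0=good, 1=low, 2=medium):
from sklearn.metrics import recall_score
print("SVM recall per class:", recall_score(y_test, svm_rs_predicted, average=None))
print("RF  recall per class:", recall_score(y_test, pred_rfc_rs, average=None))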