import numpy as np
import pandas as pd
In [2]:
# Only execute this cell if the directory containing your dataset is
# different from the directory in which you are running the Jupyter Notebook
#import os
#os.chdir('C:\\Shripad\\Personal\\DataScience\\DSBA\\Curricumulum\\4 Data Mining\\3 Random Forest')
In [2]:
from sklearn.ensemble import RandomForestClassifier
In [3]:
bank_df = pd.read_csv("Banking Dataset.csv")
In [4]:
bank_df.head(10)
Out[4]:
  Cust_ID  Target  Age Gender    Balance Occupation  No_OF_CR_TXNS AGE_BKT  SCR  Holding_Period
0      C1       0   30      M  160378.60        SAL              2   26-30  826               9
1     C10       1   41      M   84370.59   SELF-EMP             14   41-45  843               9
2    C100       0   49      F   60849.26       PROF             49   46-50  328              26
3   C1000       0   49      M   10558.81        SAL             23   46-50  619              19
4  C10000       0   43      M   97100.48       SENP              3   41-45  397               8
5  C10001       0   30      M  160378.60        SAL              2   26-30  781              11
6  C10002       0   43      M   26275.55       PROF             23   41-45  354              12
7  C10003       0   53      M   33616.47        SAL             45     >50  239               5
8  C10004       0   45      M    1881.37       PROF              3   41-45  339              13
9  C10005       0   37      M    3274.37       PROF             33   36-40  535               9
In [5]:
bank_df.shape
Out[5]:
(20000, 10)
In [6]:
bank_df.info() # many columns are of type object i.e. strings; these need to be converted to numeric category codes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Cust_ID 20000 non-null object
1 Target 20000 non-null int64
2 Age 20000 non-null int64
3 Gender 20000 non-null object
4 Balance 20000 non-null float64
5 Occupation 20000 non-null object
6 No_OF_CR_TXNS 20000 non-null int64
7 AGE_BKT 20000 non-null object
8 SCR 20000 non-null int64
9 Holding_Period 20000 non-null int64
dtypes: float64(1), int64(5), object(4)
memory usage: 1.5+ MB
In [33]:
## For RandomForestClassifier, none of the columns can be of type object; every feature must be numeric
In [7]:
# A decision tree in Python can take only numerical / categorical columns. It
# cannot take string / object types.
# The following code loops through each column and, if the column type is
# object, converts that column into a categorical with each distinct value
# becoming a category code.
for feature in bank_df.columns:
if bank_df[feature].dtype == 'object':
bank_df[feature] = pd.Categorical(bank_df[feature]).codes
In [8]:
bank_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Cust_ID 20000 non-null int16
1 Target 20000 non-null int64
2 Age 20000 non-null int64
3 Gender 20000 non-null int8
4 Balance 20000 non-null float64
5 Occupation 20000 non-null int8
6 No_OF_CR_TXNS 20000 non-null int64
7 AGE_BKT 20000 non-null int8
8 SCR 20000 non-null int64
9 Holding_Period 20000 non-null int64
dtypes: float64(1), int16(1), int64(5), int8(3)
memory usage: 1.0 MB
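The .codes conversion above discards the original string labels. A minimal sketch (not in the original notebook; raw_df and mappings are illustrative names) that records the code-to-label mapping from the raw CSV, so encoded values can be traced back later:
In [ ]:
# Illustrative only: rebuild the lookup of integer code -> original label
# for each object column, assuming the raw CSV is still on disk.
raw_df = pd.read_csv("Banking Dataset.csv")
mappings = {}
for feature in raw_df.columns:
    if raw_df[feature].dtype == 'object':
        cats = pd.Categorical(raw_df[feature])
        # enumerate() pairs each integer code with its label in category order
        mappings[feature] = dict(enumerate(cats.categories))
print(mappings['Gender'])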
In [9]:
# separate the target column ("Target") from the features; Cust_ID is an
# identifier, not a predictor, so it is dropped as well
X = bank_df.drop(["Target","Cust_ID"] , axis=1)
y = bank_df.pop("Target")
In [10]:
# splitting data into training and test sets for the independent attributes
# X_train = independent variables for Train, X_test = independent variables for Test,
# train_labels = dependent variable for Train, test_labels = dependent variable for Test
from sklearn.model_selection import train_test_split
X_train, X_test, train_labels, test_labels = train_test_split(X, y,
test_size=.30, random_state=1)
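The Target column is heavily imbalanced (roughly 9% positives, as the class supports later in the notebook show), so a stratified split is worth considering. A hedged variant of the cell above, not used in the original run:
In [ ]:
# Optional: stratify on y so train and test preserve the same positive rate.
X_train, X_test, train_labels, test_labels = train_test_split(
    X, y, test_size=.30, random_state=1, stratify=y)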
Ensemble RandomForest Classifier
In [22]:
rfcl = RandomForestClassifier(n_estimators = 501,
oob_score=True,
max_depth=10,
max_features=5,
min_samples_leaf = 50,
min_samples_split = 110,
)
In [23]:
# n_estimators = 501 i.e. the number of trees to build within the Random
# Forest classifier
#rfcl = RandomForestClassifier(n_estimators = 501, oob_score=True, max_depth=10, max_features=3, min_samples_leaf = 50)
# rfcl = RandomForestClassifier(n_estimators = 501, oob_score=True)
rfcl = rfcl.fit(X_train, train_labels)
In [24]:
# out of bag (oob) score: the oob_score argument defaults to False, meaning
# the oob score is not computed or stored by the random forest classifier
rfcl.oob_score
Out[24]:
True
In [25]:
rfcl.oob_score_
Out[25]:
0.9155714285714286
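The OOB score is computed on the training rows each tree never saw during bagging, so it should roughly track held-out accuracy. A quick sanity-check sketch using the fitted rfcl and the test split from above:
In [ ]:
# Compare the out-of-bag estimate with accuracy on the untouched test set;
# the two should be close if the OOB estimate is behaving as expected.
print('OOB score :', rfcl.oob_score_)
print('Test score:', rfcl.score(X_test, test_labels))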
In [26]:
# max_features = out of the 8 independent features, this many are chosen at
# random as split candidates (the grid below tries 4 and 6)
# min_samples_split is approx. 3 times min_samples_leaf
from sklearn.model_selection import GridSearchCV
param_grid = {
'max_depth': [7, 10],
'max_features': [4, 6],
'min_samples_leaf': [50, 100],
'min_samples_split': [150, 300],
'n_estimators': [301, 501]
}
In [27]:
rfcl = RandomForestClassifier()
In [28]:
# cv = 3 i.e. 3-fold cross validation: the training data is split into 3
# folds, and every parameter combination (the first being 7, 4, 50, 150
# and 301) is scored across them
grid_search = GridSearchCV(estimator = rfcl, param_grid = param_grid, cv = 3)
In [29]:
grid_search.fit(X_train, train_labels)
Out[29]:
GridSearchCV(cv=3, estimator=RandomForestClassifier(),
param_grid={'max_depth': [7, 10], 'max_features': [4, 6],
'min_samples_leaf': [50, 100],
'min_samples_split': [150, 300],
'n_estimators': [301, 501]})
In [30]:
grid_search.best_params_
Out[30]:
{'max_depth': 7,
'max_features': 6,
'min_samples_leaf': 50,
'min_samples_split': 150,
'n_estimators': 501}
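best_params_ reports only the winning combination; cv_results_ holds the mean cross-validated score for all 32 combinations tried. A short sketch to rank them:
In [ ]:
# Rank every parameter combination (2 x 2 x 2 x 2 x 2 = 32) by its
# mean cross-validated score.
results = pd.DataFrame(grid_search.cv_results_)
cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(results[cols].sort_values('rank_test_score').head())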
In [ ]:
best_grid = grid_search.best_estimator_
In [ ]:
# predicted class probabilities (overwritten by the hard labels in the next cell)
ytrain_predict = best_grid.predict_proba(X_train)
ytest_predict = best_grid.predict_proba(X_test)
In [ ]:
# hard class predictions, used for the confusion matrices and reports below
ytrain_predict = best_grid.predict(X_train)
ytest_predict = best_grid.predict(X_test)
In [29]:
from sklearn.metrics import confusion_matrix,classification_report
In [30]:
confusion_matrix(train_labels,ytrain_predict)
Out[30]:
array([[12754, 28],
[ 1152, 66]], dtype=int64)
In [31]:
confusion_matrix(test_labels,ytest_predict)
Out[31]:
array([[5475, 10],
[ 490, 25]], dtype=int64)
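The classification reports below are read straight off these matrices; a quick arithmetic check for class 1 recall on the training set:
In [ ]:
# Sanity check (illustrative): recall for class 1 = TP / (TP + FN),
# taken from the training confusion matrix above.
print(66 / (66 + 1152))   # ~0.054, which rounds to the 0.05 in the report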
In [32]:
print(classification_report(train_labels,ytrain_predict))
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     12782
           1       0.70      0.05      0.10      1218

    accuracy                           0.92     14000
   macro avg       0.81      0.53      0.53     14000
weighted avg       0.90      0.92      0.88     14000
In [33]:
print(classification_report(test_labels,ytest_predict))
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      5485
           1       0.71      0.05      0.09       515

    accuracy                           0.92      6000
   macro avg       0.82      0.52      0.52      6000
weighted avg       0.90      0.92      0.88      6000
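Recall for class 1 is only 0.05 at the default 0.5 cut-off, so the model misses almost all responders. One common remedy (an illustrative sketch, not part of the original analysis; the 0.2 threshold is arbitrary) is to lower the decision threshold on the predicted probabilities:
In [ ]:
# Illustrative only: re-label test predictions with a lower probability
# threshold to trade precision for recall on the minority class.
test_probs = best_grid.predict_proba(X_test)[:, 1]
ytest_thresh = (test_probs >= 0.2).astype(int)
print(classification_report(test_labels, ytest_thresh))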
In [34]:
import matplotlib.pyplot as plt
In [35]:
# AUC and ROC for the training data
# predict probabilities
probs = best_grid.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(train_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.844
In [36]:
# AUC and ROC for the test data
# predict probabilities
probs = best_grid.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = probs[:, 1]
# calculate AUC
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(test_labels, probs)
print('AUC: %.3f' % auc)
# calculate roc curve
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.')
# show the plot
plt.show()
AUC: 0.777
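Beyond AUC, the fitted forest exposes feature_importances_, which shows which of the eight predictors drive the splits. A closing sketch (column names taken from X):
In [ ]:
# Rank the predictors by the forest's impurity-based importance scores.
imp = pd.Series(best_grid.feature_importances_, index=X.columns)
print(imp.sort_values(ascending=False))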