import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("breast-cancer.csv")
df.head()
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  compactness_mean  concavity_mean  ...
0    842302         M        17.99         10.38          122.80     1001.0          0.11840           0.27760          0.3001  ...
1    842517         M        20.57         17.77          132.90     1326.0          0.08474           0.07864          0.0869  ...
2  84300903         M        19.69         21.25          130.00     1203.0          0.10960           0.15990          0.1974  ...
3  84348301         M        11.42         20.38           77.58      386.1          0.14250           0.28390          0.2414  ...
4  84358402         M        20.29         14.34          135.10     1297.0          0.10030           0.13280          0.1980  ...

5 rows × 32 columns
df.shape
(569, 32)
df.dtypes
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 569 non-null int64
1 diagnosis 569 non-null object
2 radius_mean 569 non-null float64
3 texture_mean 569 non-null float64
4 perimeter_mean 569 non-null float64
5 area_mean 569 non-null float64
6 smoothness_mean 569 non-null float64
7 compactness_mean 569 non-null float64
8 concavity_mean 569 non-null float64
9 concave points_mean 569 non-null float64
10 symmetry_mean 569 non-null float64
11 fractal_dimension_mean 569 non-null float64
12 radius_se 569 non-null float64
13 texture_se 569 non-null float64
14 perimeter_se 569 non-null float64
15 area_se 569 non-null float64
16 smoothness_se 569 non-null float64
17 compactness_se 569 non-null float64
18 concavity_se 569 non-null float64
19 concave points_se 569 non-null float64
20 symmetry_se 569 non-null float64
21 fractal_dimension_se 569 non-null float64
22 radius_worst 569 non-null float64
23 texture_worst 569 non-null float64
24 perimeter_worst 569 non-null float64
25 area_worst 569 non-null float64
26 smoothness_worst 569 non-null float64
27 compactness_worst 569 non-null float64
28 concavity_worst 569 non-null float64
29 concave points_worst 569 non-null float64
30 symmetry_worst 569 non-null float64
31 fractal_dimension_worst 569 non-null float64
dtypes: float64(30), int64(1), object(1)
memory usage: 142.4+ KB
df.isnull().sum()
df.duplicated().sum()
np.int64(0)
df.describe()
                 id  radius_mean  texture_mean  perimeter_mean    area_mean  smoothness_mean  compactness_mean  concavity_mean  ...
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000       569.000000        569.000000      569.000000  ...
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104         0.096360          0.104341        0.088799  ...
std    1.250206e+08     3.524049      4.301036       24.298981   351.914129         0.014064          0.052813        0.079720  ...
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000         0.052630          0.019380        0.000000  ...
25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000         0.086370          0.064920        0.029560  ...
50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000         0.095870          0.092630        0.061540  ...
75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000         0.105300          0.130400        0.130700  ...
max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000         0.163400          0.345400        0.426800  ...

8 rows × 31 columns
!pip install missingno
import missingno as msno   # a Python library for visualizing missing values
msno.bar(df,color = 'pink')
<Axes: >
# There are no missing values.
# Encode the target: malignant (M) -> 1, benign (B) -> 0
df['diagnosis'] = df['diagnosis'].apply(lambda val: 1 if val == 'M' else 0)
plt.hist(df['diagnosis'])
plt.title('Diagnosis (M=1, B=0)')
plt.show()
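A quick numeric check to go with the histogram (a small addition; it just counts the labels encoded in the cell above):
print(df['diagnosis'].value_counts())   # 0 = benign, 1 = malignant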
# Density plot for every feature, laid out in a 5 × 6 grid (30 panels)
plt.figure(figsize=(20, 15))
plotnumber = 1
for column in df:
    if plotnumber <= 30:
        ax = plt.subplot(5, 6, plotnumber)
        sns.histplot(df[column], kde=True)   # distplot is deprecated/removed in recent seaborn
        plt.xlabel(column)
        plotnumber += 1
plt.tight_layout()
plt.show()
df.corr()
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean
id 1.000000 0.039769 0.074626 0.099770 0.073159 0.096893 -0.012968 0.000096
diagnosis 0.039769 1.000000 0.730029 0.415185 0.742636 0.708984 0.358560 0.596534
radius_mean 0.074626 0.730029 1.000000 0.323782 0.997855 0.987357 0.170581 0.506124
texture_mean 0.099770 0.415185 0.323782 1.000000 0.329533 0.321086 -0.023389 0.236702
perimeter_mean 0.073159 0.742636 0.997855 0.329533 1.000000 0.986507 0.207278 0.556936
area_mean 0.096893 0.708984 0.987357 0.321086 0.986507 1.000000 0.177028 0.498502
smoothness_mean -0.012968 0.358560 0.170581 -0.023389 0.207278 0.177028 1.000000 0.659123
compactness_mean 0.000096 0.596534 0.506124 0.236702 0.556936 0.498502 0.659123 1.000000
concavity_mean 0.050080 0.696360 0.676764 0.302418 0.716136 0.685983 0.521984 0.883121
concave points_mean 0.044158 0.776614 0.822529 0.293464 0.850977 0.823269 0.553695 0.831135
symmetry_mean -0.022114 0.330499 0.147741 0.071401 0.183027 0.151293 0.557775 0.602641
fractal_dimension_mean -0.052511 -0.012838 -0.311631 -0.076437 -0.261477 -0.283110 0.584792 0.565369
radius_se 0.143048 0.567134 0.679090 0.275869 0.691765 0.732562 0.301467 0.497473
texture_se -0.007526 -0.008303 -0.097317 0.386358 -0.086761 -0.066280 0.068406 0.046205
perimeter_se 0.137331 0.556141 0.674172 0.281673 0.693135 0.726628 0.296092 0.548905
area_se 0.177742 0.548236 0.735864 0.259845 0.744983 0.800086 0.246552 0.455653
smoothness_se 0.096781 -0.067016 -0.222600 0.006614 -0.202694 -0.166777 0.332375 0.135299
compactness_se 0.033961 0.292999 0.206000 0.191975 0.250744 0.212583 0.318943 0.738722
concavity_se 0.055239 0.253730 0.194204 0.143293 0.228082 0.207660 0.248396 0.570517
concave points_se 0.078768 0.408042 0.376169 0.163851 0.407217 0.372320 0.380676 0.642262
symmetry_se -0.017306 -0.006522 -0.104321 0.009127 -0.081629 -0.072497 0.200774 0.229977
fractal_dimension_se 0.025725 0.077972 -0.042641 0.054458 -0.005523 -0.019887 0.283607 0.507318
radius_worst 0.082405 0.776454 0.969539 0.352573 0.969476 0.962746 0.213120 0.535315
texture_worst 0.064720 0.456903 0.297008 0.912045 0.303038 0.287489 0.036072 0.248133
perimeter_worst 0.079986 0.782914 0.965137 0.358040 0.970387 0.959120 0.238853 0.590210
area_worst 0.107187 0.733825 0.941082 0.343546 0.941550 0.959213 0.206718 0.509604
smoothness_worst 0.010338 0.421465 0.119616 0.077503 0.150549 0.123523 0.805324 0.565541
compactness_worst -0.002968 0.590998 0.413463 0.277830 0.455774 0.390410 0.472468 0.865809
concavity_worst 0.023203 0.659610 0.526911 0.301025 0.563879 0.512606 0.434926 0.816275
concave points_worst 0.035174 0.793566 0.744214 0.295316 0.771241 0.722017 0.503053 0.815573
symmetry_worst -0.044224 0.416294 0.163953 0.105008 0.189115 0.143570 0.394309 0.510223
fractal_dimension_worst -0.029866 0.323872 0.007066 0.119205 0.051019 0.003738 0.499316 0.687382
32 rows × 32 columns
# Correlation heatmap
plt.figure(figsize=(20, 15))
sns.heatmap(df.corr(), annot=True, fmt=".1f", cmap='coolwarm')
plt.show()
# Same heatmap with the redundant upper triangle masked out
plt.figure(figsize=(20, 15))
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, linewidths=1, annot=True, fmt='.2f')
plt.show()
# Several features are highly correlated (multicollinearity), so prune them next
df.drop('id', axis=1, inplace=True)   # id is an identifier, not a feature
# Feature selection: drop one feature from every pair with |correlation| > 0.92
corr_matrix = df.corr().abs()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)   # keep only the lower triangle
to_drop = [x for x in tri_df.columns if any(tri_df[x] > 0.92)]
df = df.drop(to_drop, axis=1)
print(df.shape[1])
23
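The threshold filter above generalizes into a small reusable helper; a sketch, with the 0.92 cutoff from the cell above as a tunable default:
def drop_correlated(frame, threshold=0.92):
    """Drop one feature from every pair whose |correlation| exceeds threshold."""
    corr = frame.corr().abs()
    tri = corr.mask(np.triu(np.ones_like(corr, dtype=bool)))   # lower triangle only
    cols = [c for c in tri.columns if (tri[c] > threshold).any()]
    return frame.drop(columns=cols)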
df.head()
   diagnosis  texture_mean  smoothness_mean  compactness_mean  concave points_mean  symmetry_mean  fractal_dimension_mean  texture_se  ...
0          1         10.38          0.11840           0.27760              0.14710         0.2419                 0.07871      0.9053  ...
1          1         17.77          0.08474           0.07864              0.07017         0.1812                 0.05667      0.7339  ...
2          1         21.25          0.10960           0.15990              0.12790         0.2069                 0.05999      0.7869  ...
3          1         20.38          0.14250           0.28390              0.10520         0.2597                 0.09744      1.1560  ...
4          1         14.34          0.10030           0.13280              0.10430         0.1809                 0.05883      0.7813  ...

5 rows × 23 columns
# Feature count reduced from 32 to 23
x = df.drop('diagnosis',axis = 1)
y = df['diagnosis']
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2 ,random_state = 0)
# Scale the features (fit on train only, to avoid test-set leakage)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape
(455, 22)
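The manual scale-then-fit steps can also be bundled into a Pipeline, which refits the scaler inside every cross-validation fold and makes leakage harder; a sketch that mirrors (not replaces) the flow above:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([('scaler', StandardScaler()),
                 ('clf', LogisticRegression())])
# pipe.fit(...) expects *unscaled* features; x_train above is already scaled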
# ML Model
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(x_train,y_train)
LogisticRegression()
y_pred = log_reg.predict(x_test)
y_pred
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
0, 1, 1, 0])
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_train, log_reg.predict(x_train)))    # train accuracy
log_reg_acc = accuracy_score(y_test, log_reg.predict(x_test))   # test accuracy
print(log_reg_acc)
y_pred = log_reg.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
0.989010989010989
0.9649122807017544
[[66 1]
[ 3 44]]
precision recall f1-score support
0 0.96 0.99 0.97 67
1 0.98 0.94 0.96 47
accuracy 0.96 114
macro avg 0.97 0.96 0.96 114
weighted avg 0.97 0.96 0.96 114
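A single 80/20 split can flatter or punish a model by chance; 5-fold cross-validation on the training set gives a steadier estimate (a quick sketch on the already-scaled x_train):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(), x_train, y_train, cv=5)
print(cv_scores.mean(), cv_scores.std())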
# KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
KNeighborsClassifier()
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_train, knn.predict(x_train)))    # train accuracy
knn_acc = accuracy_score(y_test, knn.predict(x_test))   # test accuracy
print(knn_acc)
y_pred = knn.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
0.967032967032967
0.956140350877193
[[66 1]
[ 4 43]]
precision recall f1-score support
0 0.94 0.99 0.96 67
1 0.98 0.91 0.95 47
accuracy 0.96 114
macro avg 0.96 0.95 0.95 114
weighted avg 0.96 0.96 0.96 114
# SVC
# Hyperparameter tuning
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
svc = SVC(probability = True)
parameters = {
    'gamma': [0.0001, 0.001, 0.01, 0.1],
    'C': [0.01, 0.1, 0.05, 0.5, 10, 15]
}
grid_search = GridSearchCV(svc,parameters)
grid_search.fit(x_train,y_train)
grid_search.best_params_
{'C': 15, 'gamma': 0.01}
grid_search.best_score_
np.float64(0.9802197802197803)
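GridSearchCV refits the winning configuration on the whole training set by default (refit=True), so the tuned model is already available; the manual refit in the next cell is equivalent (best_svc is just an illustrative name):
best_svc = grid_search.best_estimator_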
svc = SVC(C = 15, gamma = 0.01,probability = True)
svc.fit(x_train,y_train)
SVC(C=15, gamma=0.01, probability=True)
y_pred = svc.predict(x_test)
y_pred
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
0, 1, 1, 0])
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_train, svc.predict(x_train)))    # train accuracy
svc_acc = accuracy_score(y_test, svc.predict(x_test))   # test accuracy
print(svc_acc)
y_pred = svc.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
0.989010989010989
0.9824561403508771
[[67 0]
[ 2 45]]
precision recall f1-score support
0 0.97 1.00 0.99 67
1 1.00 0.96 0.98 47
accuracy 0.98 114
macro avg 0.99 0.98 0.98 114
weighted avg 0.98 0.98 0.98 114
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
parameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2, 32),
    'min_samples_leaf': range(1, 10),
    'min_samples_split': range(2, 10),
    'splitter': ['best', 'random']
}
grid_search_dt = GridSearchCV(dtc, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search_dt.fit(x_train,y_train)
Fitting 5 folds for each of 8640 candidates, totalling 43200 fits
grid_search_dt.best_params_
{'criterion': 'entropy',
'max_depth': 12,
'min_samples_leaf': 1,
'min_samples_split': 2,
'splitter': 'random'}
grid_search_dt.best_score_
np.float64(0.9582417582417582)
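The exhaustive grid above costs 43,200 fits; RandomizedSearchCV over the same space is far cheaper and usually lands near the same optimum (a sketch; n_iter=200 is an arbitrary budget):
from sklearn.model_selection import RandomizedSearchCV
rand_search_dt = RandomizedSearchCV(DecisionTreeClassifier(), parameters,
                                    n_iter=200, cv=5, n_jobs=-1, random_state=0)
rand_search_dt.fit(x_train, y_train)
print(rand_search_dt.best_params_, rand_search_dt.best_score_)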
dtc = DecisionTreeClassifier(criterion='entropy', max_depth=12, min_samples_leaf=3, min_samples_split=9, splitter='random')   # slightly more regularized than the raw best_params_
dtc.fit(x_train,y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=12, min_samples_leaf=3,
min_samples_split=9, splitter='random')
y_pred = dtc.predict(x_test)
y_pred
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
0, 1, 1, 0])
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_train, dtc.predict(x_train)))    # train accuracy
dtc_acc = accuracy_score(y_test, dtc.predict(x_test))   # test accuracy
print(dtc_acc)
y_pred = dtc.predict(x_test)   # evaluate the tree itself, not svc
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
0.9714285714285714
0.956140350877193
[[67 0]
[ 2 45]]
precision recall f1-score support
0 0.97 1.00 0.99 67
1 1.00 0.96 0.98 47
accuracy 0.98 114
macro avg 0.99 0.98 0.98 114
weighted avg 0.98 0.98 0.98 114
# Random Forest Classifier (RFC)
from sklearn.ensemble import RandomForestClassifier
rand_clf = RandomForestClassifier(criterion='entropy', max_depth=10, max_features=0.5, min_samples_leaf=2, min_samples_split=3, n_estimators=130)
rand_clf.fit(x_train,y_train)
RandomForestClassifier(criterion='entropy', max_depth=10, max_features=0.5,
min_samples_leaf=2, min_samples_split=3,
n_estimators=130)
y_pred = rand_clf.predict(x_test)
y_pred
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
0, 1, 1, 0])
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_train, rand_clf.predict(x_train)))    # train accuracy
rand_clf_acc = accuracy_score(y_test, rand_clf.predict(x_test))   # test accuracy
print(rand_clf_acc)
y_pred = rand_clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
0.9956043956043956
0.9736842105263158
[[66 1]
[ 2 45]]
precision recall f1-score support
0 0.97 0.99 0.98 67
1 0.98 0.96 0.97 47
accuracy 0.97 114
macro avg 0.97 0.97 0.97 114
weighted avg 0.97 0.97 0.97 114
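Random forests also expose per-feature importances, a quick sanity check on what drives the predictions (a sketch; x still holds the unscaled DataFrame, so its column names line up with the fitted model):
importances = pd.Series(rand_clf.feature_importances_, index=x.columns).sort_values()
importances.plot(kind='barh', figsize=(8, 6))
plt.xlabel('importance')
plt.tight_layout()
plt.show()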
# Gradient Boosting Classifier (GBC)
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
parameters = {
    'loss': ['log_loss', 'exponential'],   # 'deviance' was renamed to 'log_loss' in scikit-learn 1.1
    'learning_rate': [0.001, 0.1],
    'n_estimators': [100, 150, 180]
}
grid_search_gbc = GridSearchCV(gbc, parameters, cv=2, n_jobs=-1, verbose=1)
grid_search_gbc.fit(x_train,y_train)
Fitting 2 folds for each of 12 candidates, totalling 24 fits
grid_search_gbc.best_params_
{'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 150}
grid_search_gbc.best_score_
np.float64(0.9604780122111447)
gbc = GradientBoostingClassifier(learning_rate=0.1, loss='exponential', n_estimators=100)   # note: the search above preferred n_estimators=150
gbc.fit(x_train,y_train)
GradientBoostingClassifier(loss='exponential')
y_pred = gbc.predict(x_test)
y_pred
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
0, 1, 1, 0])
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_train, gbc.predict(x_train)))    # train accuracy
gbc_acc = accuracy_score(y_test, gbc.predict(x_test))   # test accuracy
print(gbc_acc)
y_pred = gbc.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
1.0
0.9649122807017544
[[64 3]
[ 1 46]]
precision recall f1-score support
0 0.98 0.96 0.97 67
1 0.94 0.98 0.96 47
accuracy 0.96 114
macro avg 0.96 0.97 0.96 114
weighted avg 0.97 0.96 0.97 114
!pip install xgboost
# XGBoost (XGB)
from xgboost import XGBClassifier
xgb = XGBClassifier(objective='binary:logistic', learning_rate=0.01, max_depth=5, n_estimators=180)
xgb.fit(x_train,y_train)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.01, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None,
              n_estimators=180, n_jobs=None, ...)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_train, xgb.predict(x_train)))    # train accuracy
xgb_acc = accuracy_score(y_test, xgb.predict(x_test))   # test accuracy
print(xgb_acc)
y_pred = xgb.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
0.9934065934065934
0.956140350877193
[[65 2]
[ 3 44]]
precision recall f1-score support
0 0.96 0.97 0.96 67
1 0.96 0.94 0.95 47
accuracy 0.96 114
macro avg 0.96 0.95 0.95 114
weighted avg 0.96 0.96 0.96 114
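Rather than fixing n_estimators up front, XGBoost can stop adding trees once a held-out score stalls; a sketch assuming xgboost >= 1.6, where early_stopping_rounds is a constructor argument:
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=0)
xgb_es = XGBClassifier(objective='binary:logistic', learning_rate=0.01, max_depth=5,
                       n_estimators=1000, early_stopping_rounds=20, eval_metric='logloss')
xgb_es.fit(x_tr, y_tr, eval_set=[(x_val, y_val)], verbose=False)
print(xgb_es.best_iteration)   # boosting rounds actually kept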
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'KNN', 'SVM', 'Decision Tree Classifier',
              'Random Forest Classifier', 'Gradient Boosting Classifier', 'XgBoost'],
    'Score': [100*round(log_reg_acc, 4), 100*round(knn_acc, 4), 100*round(svc_acc, 4),
              100*round(dtc_acc, 4), 100*round(rand_clf_acc, 4),
              100*round(gbc_acc, 4), 100*round(xgb_acc, 4)]
})
models.sort_values(by='Score', ascending=False)
Model Score
2 SVM 98.25
4 Random Forest Classifier 97.37
0 Logistic Regression 96.49
5 Gradient Boosting Classifier 96.49
1 KNN 95.61
3 Decision Tree Classifier 95.61
6 XgBoost 95.61
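The same comparison reads well as a chart (a small sketch; it must run before models is reused as a list below):
plt.figure(figsize=(8, 4))
sns.barplot(data=models.sort_values('Score', ascending=False), x='Score', y='Model')
plt.xlabel('test accuracy (%)')
plt.xlim(90, 100)
plt.tight_layout()
plt.show()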
from sklearn import metrics
plt.figure(figsize=(8,5))
models = [
    {'label': 'LR',      'model': log_reg},
    {'label': 'DT',      'model': dtc},
    {'label': 'SVM',     'model': svc},
    {'label': 'KNN',     'model': knn},
    {'label': 'XGBoost', 'model': xgb},
    {'label': 'RF',      'model': rand_clf},
    {'label': 'GBDT',    'model': gbc},
]
for m in models:
    model = m['model']
    model.fit(x_train, y_train)   # models are already fitted above; the refit is redundant but harmless
    y_score = model.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
    auc = metrics.roc_auc_score(y_test, y_score)   # AUC from probabilities, not hard labels
    plt.plot(fpr, tpr, label='%s - ROC (area = %0.2f)' % (m['label'], auc))
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.01, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1 - Specificity (False Positive Rate)', fontsize=12)
plt.ylabel('Sensitivity (True Positive Rate)', fontsize=12)
plt.title('ROC - Breast Cancer Prediction', fontsize=12)
plt.legend(loc="lower right", fontsize=12)
plt.savefig("roc_breast_cancer.jpeg", format='jpeg', dpi=400, bbox_inches='tight')
plt.show()