# Print system version
!jupyter --version
import sys
print("Python version:", sys.version)
# importing necessary libraries
import pandas as pd # for data manipulation and analysis
import collections # for creating and manipulating Python's collections like OrderedDict, defaultdict, Counter, etc.
import numpy as np # for scientific computing with Python
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
import seaborn as sns # for advanced visualization
# Classifier Libraries
from sklearn.linear_model import LogisticRegression # for implementing logistic regression algorithm
from sklearn.tree import DecisionTreeClassifier # for implementing decision tree algorithm
from sklearn.ensemble import RandomForestClassifier # for implementing random forest algorithm
from sklearn.svm import SVC # for implementing Support Vector Machine (SVM) algorithm
from sklearn.naive_bayes import GaussianNB # for implementing Naive Bayes algorithm
from sklearn.neighbors import KNeighborsClassifier # for implementing K-Nearest Neighbors (KNN) algorithm
# For Statistical testing
from scipy.stats import ttest_ind # for computing t-test for two independent samples
import statsmodels.api as sm # for statistical models and tests
from scipy.stats import chi2_contingency # for computing chi-square statistic and p-value for a contingency table
import scipy.stats as stats # for implementing skewness and other stats
# Other Libraries
from sklearn.model_selection import train_test_split # for splitting data into training and testing sets
from sklearn.pipeline import make_pipeline # for building a pipeline of transforms with a final estimator
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline # for building a pipeline with imbalanced datasets
from imblearn.over_sampling import SMOTE # for oversampling imbalanced datasets using the Synthetic Minority Over-sampling Technique (SMOTE)
from imblearn.under_sampling import NearMiss # for undersampling imbalanced datasets using the NearMiss algorithm
from imblearn.metrics import classification_report_imbalanced # for generating a classification report for imbalanced datasets
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report # for computing various performance metrics for classification models
from collections import Counter # for counting the frequency of elements in a list
from sklearn.model_selection import KFold, StratifiedKFold # for k-fold cross-validation
from sklearn.model_selection import cross_val_score # for evaluating a model using cross-validation
from sklearn.metrics import cohen_kappa_score # for computing Cohen's kappa score for inter-rater agreement
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 5000) # for setting the maximum number of columns to display in pandas dataframes
# first read the data file
df= pd.read_csv('/content/drive/MyDrive/anemia.csv')
df.shape
df.head()
# Print summary statistics
df.describe()
df.shape
df.info()
#columns name
df.columns
#Checking Null
# Import numpy
import numpy as np
# Inspect missing values in the dataset
print(df.isnull().values.sum())
# Replace the ' 's with NaN
df = df.replace(" ", np.nan)
# Count the number of NaNs in the dataset to verify
print(df.isnull().values.sum())
# Create a copy of the DataFrame to avoid modifying the original data
df_copy = df.copy()
# Rename values in the 'Result' column just for the plot
df_copy['Result'] = df_copy['Result'].replace({0: 'Non Anemic', 1: 'Anemic'})
# Rename values in the 'Gender' column
df_copy['Gender'] = df_copy['Gender'].replace({0: 'Male', 1: 'Female'})
# Define custom hex colors
custom_colors = ['#B43757', '#a37b85']
custom_colors_gender = ['#90ADC6', '#C6A990']
print(df_copy)
result_counts = df_copy['Result'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors, shadow=True)
plt.title('Distribution of Anemia Result')
plt.show()
# Create a count plot of the anemia result
ax = sns.countplot(x='Result', data=df_copy, palette=custom_colors)
plt.title('Count of Anemia Result')
# Add labels to the bars
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
result_counts = df_copy['Result'].value_counts()
# Print the counts of the two categories
print(result_counts)
# Check if the two categories are balanced or not
if result_counts.iloc[0] == result_counts.iloc[1]:
    print('The two categories are balanced.')
else:
    print('The two categories are not balanced.')
print("-----")
# The classes are heavily skewed we need to solve this issue later.
print('Non Anemic', round(df['Result'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Anemic', round(df['Result'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
ax = sns.countplot(x='Gender', hue='Result', data=df_copy, palette=custom_colors)
plt.title('Number of Individuals with and without Anemia by Gender')
# Add labels to the bars
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
result_counts = df_copy['Gender'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors_gender, shadow=True)
plt.title('Gender distribution')
plt.show()
df.head()
df_copy.head()
# anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()
# ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=custom_colors_gender)
# ax.set_xticklabels(['Male', 'Female'])
# plt.title('Mean Anemia Rate by Gender')
# plt.xlabel('Gender')
# plt.ylabel('Mean Anemia Rate')
# plt.show()
color_gen = {0: '#90ADC6', 1: '#C6A990'}  # keys match the numeric Gender codes in df
anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()
# Create the bar plot
ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=color_gen)
# Add labels to the bars
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.2f}'.format(p.get_height()), ha='center')
ax.set_xticklabels(['Male', 'Female'])
# Add plot titles and labels
plt.title('Mean Anemia Rate by Gender | Which gender has the higher anemia rate?', fontsize=16, fontweight='bold')
plt.xlabel('Gender' , fontsize=12)
plt.ylabel('Mean Anemia Rate' , fontsize=12)
# Remove spines
sns.despine(left=True, bottom=True)
# Remove vertical lines from the grid
plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)
# Show the plot
plt.show()
anemia_rates = df.groupby('Gender')['Result'].mean().round(2)
anemia_rates
# Create separate subsets for males and females
male_data = df_copy[df_copy['Gender'] == 'Male']
female_data = df_copy[df_copy['Gender'] == 'Female']
# Plot horizontal violinplot using Seaborn
sns.violinplot(x='Hemoglobin', y='Gender', hue='Result', data=df_copy, palette=custom_colors, inner='quartile', scale='width', cut=0)
# Add mean and median lines
for i, group in enumerate([male_data, female_data]):
    median = group['Hemoglobin'].median()
    mean = group['Hemoglobin'].mean()
    plt.axhline(y=i, xmin=0.05, xmax=0.48, color='black', linewidth=2)
    plt.text(0.51, i+0.1, f'Median: {median:.2f}', ha='left', va='center')
    plt.text(0.51, i-0.1, f'Mean: {mean:.2f}', ha='left', va='center')
# Add IQR whiskers
q1_male, q3_male = male_data['Hemoglobin'].quantile([0.25, 0.75])
q1_female, q3_female = female_data['Hemoglobin'].quantile([0.25, 0.75])
plt.axhline(y=0, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.axhline(y=1, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.plot([q1_male, q1_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q3_male, q3_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q1_female, q1_female], [0.8, 1.2], color='black', linewidth=2)
plt.plot([q3_female, q3_female], [0.8, 1.2], color='black', linewidth=2)
plt.text((q1_male+q3_male)/2, -0.3, f'IQR: {q3_male-q1_male:.2f}', ha='center', va='center')
plt.text((q1_female+q3_female)/2, 1.3, f'IQR: {q3_female-q1_female:.2f}', ha='center', va='center')
# Add title and labels
plt.title('Distribution of Hemoglobin Levels by Gender')
plt.xlabel('Hemoglobin Level')
plt.ylabel('Gender')
# Show the plot
plt.show()
iqr = np.percentile(df['Hemoglobin'], 75) - np.percentile(df['Hemoglobin'], 25)
# Bin width using the Freedman-Diaconis rule
bin_width = 2 * iqr / (len(df)**(1/3))
sns.distplot(df['Hemoglobin'], hist=True, kde=True,
bins=int(round((df['Hemoglobin'].max() - df['Hemoglobin'].min()) / bin_width)),
color='#d60266',
hist_kws={'edgecolor':'black', 'alpha': 0.8},
kde_kws={'linewidth': 2})
# Add labels and adjust font sizes
#plt.title('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
plt.xlabel('Hemoglobin', fontsize=12)
plt.ylabel('Count', fontsize=12)
# # Add legend
# plt.legend(labels=['Hemoglobin'], loc='upper right')
# Remove spines
sns.despine(left=True, bottom=True)
# Remove vertical lines from the grid
plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)
# Show plot
plt.show()
# Calculate skewness using the skew() function
skewness = stats.skew(df['Hemoglobin'])
# Calculate kurtosis using the kurtosis() function (fisher=False returns Pearson's kurtosis, where a normal distribution scores 3)
kurtosis = stats.kurtosis(df['Hemoglobin'], fisher=False)
# Print the result
print("Skewness:", skewness)
# Print the result
print("Kurtosis:", kurtosis)
# Create a dictionary with the values
hemoglobin_data = {'Metric': ['Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
                   'Value': [df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}
# Create a pandas DataFrame from the dictionary
hemoglobin_table = pd.DataFrame(hemoglobin_data)
# Create the table using Seaborn styling
styled_table = (hemoglobin_table.style
.set_caption('Hemoglobin Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Filter the data by anemia status
anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='Hemoglobin', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='Hemoglobin', ax=ax2, color='green', binwidth=0.5)
# Set titles and axis labels
ax1.set_title('Hemoglobin Levels in Patients with Anemia', fontsize=14, fontweight='bold')
ax2.set_title('Hemoglobin Levels in Patients without Anemia', fontsize=14, fontweight='bold')
fig.suptitle('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Remove spines
sns.despine(left=True, bottom=True)
# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)
# Adjust plot size
plt.tight_layout()
# Show the plot
plt.show()
# Create a dictionary with the mean corpuscular hemoglobin (MCH) values
MCH_data = {'Metric': ['Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level'],
'Value': [df['MCH'].max(), df['MCH'].mean(), df['MCH'].min()]}
# Create a pandas DataFrame from the dictionary
MCH_table = pd.DataFrame(MCH_data)
# Create the table using Seaborn styling
styled_table = (MCH_table.style
.set_caption('MCH Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Filter the data by anemia status
anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='MCH', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='MCH', ax=ax2, color='green', binwidth=0.5)
# Set titles and axis labels
ax1.set_title('Mean Corpuscular Hemoglobin Levels in Patients with Anemia', fontsize=14,
fontweight='bold')
ax2.set_title('Mean Corpuscular Hemoglobin Levels in Patients without Anemia', fontsize=14,
fontweight='bold')
fig.suptitle('Distribution of Mean Corpuscular Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Remove spines
sns.despine(left=True, bottom=True)
# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)
# Adjust plot size
plt.tight_layout()
# Show the plot
plt.show()
# Create a dictionary with the mean corpuscular hemoglobin concentration (MCHC) values
MCHC_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min()]}
# Create a pandas DataFrame from the dictionary
MCHC_table = pd.DataFrame(MCHC_data)
# Create the table using Seaborn styling
styled_table = (MCHC_table.style
.set_caption('MCHC Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Filter data for anemia and non-anemia cases
anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='MCHC', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCHC', ax=ax2, color='green', bins=20)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Set titles and axis labels
ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia', fontweight='bold')
fig.suptitle('Mean Corpuscular Hemoglobin Concentration Levels', fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')
# Remove spines
sns.despine(left=True, bottom=True)
# Show the plot
plt.show()
# Filter data for anemia and non-anemia cases
anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='MCV', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCV', ax=ax2, color='green', bins=20)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Set titles and axis labels
ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia',fontweight='bold' )
fig.suptitle('Mean Corpuscular Volume Levels',fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Volume Level', fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Volume Level',fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')
# Remove spines
sns.despine(left=True, bottom=True)
# Show the plot
plt.show()
# Create a dictionary with the mean corpuscular volume (MCV) values
MCV_data = {'Metric': ['Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level'],
'Value': [df['MCV'].max(), df['MCV'].mean(), df['MCV'].min()]}
# Create a pandas DataFrame from the dictionary
MCV_table = pd.DataFrame(MCV_data)
# Create the table using Seaborn styling
styled_table = (MCV_table.style
.set_caption('MCV Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# dictionary with the values for MCHC, MCV, MCH, and hemoglobin
blood_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level',
'Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level',
'Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level',
'Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min(),
df['MCV'].max(), df['MCV'].mean(), df['MCV'].min(),
df['MCH'].max(), df['MCH'].mean(), df['MCH'].min(),
df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}
# Create a pandas df
blood_table = pd.DataFrame(blood_data)
# Create the table
styled_table = (blood_table.style
.set_caption('Blood Test Results')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption', 'props': [('font-size', '18px'),
('font-weight', 'bold'), ('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
sns.set_style("whitegrid")
sns.boxplot(x='Result', y='Hemoglobin', data=df_copy, palette=custom_colors)
plt.title('Distribution of Hemoglobin Levels by Anemia Result')
plt.xlabel('Anemia Result')
plt.ylabel('Hemoglobin Level')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
# Set plot style
# Create violin plot using Seaborn
ax = sns.violinplot(x='Result', y='Hemoglobin', hue='Gender', data=df_copy,
palette=custom_colors_gender, split=True)
# Set plot title and axis labels
ax.set_title('Distribution of Hemoglobin Levels by Gender and Anemic Condition', fontsize=14,
fontweight='bold')
ax.set_xlabel('Anemia Result', fontsize=12, fontweight='bold')
ax.set_ylabel('Hemoglobin Level', fontsize=12, fontweight='bold')
# Add legend and adjust its position
ax.legend(title='Gender', title_fontsize=12, fontsize=10, loc='upper right')
# Remove spines
sns.despine(left=True, bottom=True)
# Show the plot
plt.show()
df[['Gender','Hemoglobin','Result', 'MCH', 'MCV',
'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)
sns.pairplot(df,hue='Result')
sns.set(style="ticks")
RELATIONS_COLS = ["Hemoglobin", "MCH", "MCHC","MCV"]
g = sns.PairGrid(data=df, vars=RELATIONS_COLS, hue="Result", palette=custom_colors)
g.map_diag(sns.kdeplot, shade=True)
g.map_offdiag(sns.regplot, scatter_kws={'alpha':0.5})
g.add_legend(title="Result")
legend = g._legend
# set figure size
g.fig.set_size_inches(12, 12)
# update legend labels
new_labels = ['Non-anemic', 'Anemic']
for t, l in zip(g._legend.texts, new_labels): t.set_text(l)
# legend.texts[0].set_text('Non Anemic')
# legend.texts[1].set_text('Anemic')
# g.fig.suptitle("Relations in the Dataset", y=1.03)
**STATISTICAL TEST (T-TEST)**
A t-test is a statistical test used to determine whether there is a significant difference between the means of two groups. In our case, we use a t-test to determine whether there is a significant difference in mean hemoglobin levels between males and females.
As we saw, Hemoglobin has negative skewness, but the t-test assumes the data are approximately normally distributed. So before performing the t-test, we could take the logarithm of the data, which can help to reduce the skewness.
df_stat = df.copy()
df_stat.head()
male_hemoglobin = df_stat.loc[df_stat['Gender'] == 0, 'Hemoglobin']
female_hemoglobin = df_stat.loc[df_stat['Gender'] == 1, 'Hemoglobin']
# Compute the t-test statistic and p-value
t_statistic, p_value = ttest_ind(male_hemoglobin, female_hemoglobin)
# Print the results
print("T-Statistic: {:.2f}".format(t_statistic))
print("P-Value: {:.3f}".format(p_value))
# Compare the p-value with the significance level (0.05)
if p_value < 0.05:
    print("Reject null hypothesis: Gender has an impact on hemoglobin levels.")
else:
    print("Fail to reject null hypothesis: Gender has no impact on hemoglobin levels.")
**ODDS RATIO**
# Create binary variables for gender and anemia status
df_stat['is_female'] = np.where(df_stat['Gender'] == 1, 1, 0)
df_stat['is_anemic'] = np.where(df_stat['Result'] == 1, 1, 0)
# Fit a logistic regression model with gender and anemia status as predictors
logit_model = sm.Logit(df_stat['is_anemic'], sm.add_constant(df_stat['is_female']))
result = logit_model.fit()
# Print the odds ratio for gender
print("Odds Ratio for Gender: {:.2f}".format(np.exp(result.params[1])))
**CHI-SQUARE TEST**
# Create a contingency table of gender and anemia status
cont_table = pd.crosstab(df_stat['Gender'], df_stat['Result'])
# Perform the chi-square test of independence
chi2_statistic, p_value, dof, expected = chi2_contingency(cont_table)
# Print the results
print("Chi-Square Statistic: {:.2f}".format(chi2_statistic))
print("P-Value: {:.3f}".format(p_value))
# Compare the p-value with the significance level (0.05)
if p_value < 0.05:
    print("Reject null hypothesis: Gender and anemia status are dependent.")
else:
    print("Fail to reject null hypothesis: Gender and anemia status are independent.")
**FEATURE SELECTION**
CORRELATION (PEARSON CORRELATION)
df[['Gender','Hemoglobin','Result', 'MCH', 'MCV',
'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)
# create a correlation matrix
corr_matrix = df.corr().round(2)
# plot the correlation matrix using a heatmap from seaborn
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
#plt.title('Correlation Matrix', fontweight='bold')
plt.show()
**SELECTKBEST**
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X = df.iloc[:,0:5] # independent columns
y = df.iloc[:,5]
k_values = [2, 3, 4, 5] # different values of K to try
best_k = 0 # variable to keep track of best K value
best_score = 0 # variable to keep track of best score
for k in k_values:
    # apply SelectKBest class to extract top k best features
    bestfeatures = SelectKBest(score_func=chi2, k=k)
    fit = bestfeatures.fit(X, y)
    dfscores = pd.DataFrame(fit.scores_) # chi2 score for each feature
    dfcolumns = pd.DataFrame(X.columns)
    # concat two dataframes for better visualization
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score'] # naming the dataframe columns
    # get the best K value based on score
    # note: fit.scores_ covers all features regardless of k, so this sum is identical for every k
    if featureScores['Score'].sum() > best_score:
        best_score = featureScores['Score'].sum()
        best_k = k
print(f"The best value of K is {best_k} with score {best_score}.")
print("---")
print(featureScores)
print("---")
print(featureScores.nlargest(3,'Score'))
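As a small follow-up sketch (not part of the original flow), SelectKBest can also reduce X to the chosen columns directly via get_support() and fit_transform, for example keeping the top 3 chi2-scored features:
# Sketch: keep only the top-3 chi2-scored features for downstream modelling
selector = SelectKBest(score_func=chi2, k=3)
X_top3 = selector.fit_transform(X, y)
print("Selected features:", list(X.columns[selector.get_support()]))
print("Reduced shape:", X_top3.shape)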
**Extremely Randomized Trees.**
# Extremely Randomized Trees.
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(3).plot(kind='barh', color='#808080')
plt.xlabel('Importance')
# plt.title('Top 3 Features Importance', fontweight='bold', fontsize=12)
# Remove spines
sns.despine(left=False, bottom=True)
plt.savefig('Top2Feature.jpg')  # save before plt.show() so the saved figure is not blank
plt.show()
**SCALING FEATURES**
# Scale Hemoglobin by log
df['Hemoglobin_log'] = np.log(df.Hemoglobin + 0.01)
# Scale Hemoglobin by Standardization
from sklearn.preprocessing import StandardScaler # importing a class from a module of a library
ss = StandardScaler() # object of the class StandardScaler ()
df['Hemoglobin_scaled'] = ss.fit_transform(df['Hemoglobin'].values.reshape(-1,1))
# Scale Hemoglobin by normalization (min-max)
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler() # object of the class MinMaxScaler()
df['Hemoglobin_minmax'] = mm.fit_transform(df['Hemoglobin'].values.reshape(-1,1))
# Feature engineering for a better visualization of the values
# Let's explore Hemoglobin by Result and compare its distribution under each scaling
fig , axs = plt.subplots(nrows = 1 , ncols = 4 , figsize = (16,4))
sns.boxplot(x ="Result",y="Hemoglobin",data=df, ax = axs[0])
axs[0].set_title("Result vs Hemoglobin")
sns.boxplot(x ="Result",y="Hemoglobin_log",data=df, ax = axs[1])
axs[1].set_title("Result vs Log Hemoglobin")
sns.boxplot(x ="Result",y="Hemoglobin_scaled",data=df, ax = axs[2])
axs[2].set_title("Result vs Scaled Hemoglobin")
sns.boxplot(x ="Result",y="Hemoglobin_minmax",data=df, ax = axs[3])
axs[3].set_title("Result vs Min Max Hemoglobin")
# fig.suptitle('Amount by Class', fontsize=20)
plt.show()
**Splitting data into Training and Testing samples (70:30)**
df.columns
# Separate Target Variable and Predictor Variables
# Here I am keeping the selected feature only
X = df.drop(['MCHC','Hemoglobin_log', 'Hemoglobin_scaled', 'Hemoglobin_minmax', 'Result',
'MCH'],axis=1)
y = df['Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True,
random_state=101)
# Quick sanity check with the shapes of Training and testing datasets
print("X_train - ",X_train.shape)
print("y_train - ",y_train.shape)
print("X_test - ",X_test.shape)
print("y_test - ",y_test.shape)
**CLASSIFICATION MODELS**
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression # Importing Classifier Step
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Model Evaluation
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred , y_test)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred)))
# print('Confusion Matrix : \n', cnf_matrix)
print("\n")
# Predicted values counts for Anemic and Non Anemic of test dataset
pd.Series(y_pred).value_counts()
# Actual values counts for Anemic and Non Anemic of test dataset
pd.Series(y_test).value_counts()
183/181  # quick ratio check: predicted vs actual class counts from the two cells above
**MODEL EVALUATION: CONFUSION MATRIX**
# confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test,y_pred)
cnf_matrix
# Heatmap for Confusion Matrix
p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="autumn", fmt='g')
plt.title('Confusion matrix', y=1.1, fontsize = 22)
plt.ylabel('Actual',fontsize = 18)
plt.xlabel('Predicted',fontsize = 18)
# ax.xaxis.set_ticklabels(['Genuine', 'Fraud']);
# ax.yaxis.set_ticklabels(['Genuine', 'Fraud']);
plt.show()
181/181
**ROC**
metrics.roc_auc_score(y_test , y_pred)
y_pred_proba = logreg.predict_proba(X_test)
y_pred_proba
# plot ROC Curve (note: built from hard class labels; using y_pred_proba[:, 1] would give a smoother, more informative curve)
plt.figure(figsize=(8,6))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, y_pred)
print("AUC - ",auc,"\n")
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC curve for anemic cases classification', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred)
print('f1=%.3f' % (f1))
# create figure and axis objects with custom size and padding
fig, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95)
# plot no skill
ax.plot([0, 1], [0.5, 0.5], linestyle='--', color='gray', lw=1)
# plot the precision-recall curve
ax.plot(recall, precision, marker='.', markersize=5)
# set axis labels and title
ax.set_xlabel('Recall', fontsize=14)
ax.set_ylabel('Precision', fontsize=14)
ax.set_title('Precision-Recall Curve for anemic cases classification', fontsize=16)
# show F1 score in the plot
ax.text(0.05, 0.95, f'F1 Score = {f1:.3f}', transform=ax.transAxes, fontsize=14)
# show the plot
plt.show()
# As found in EDA, the response variable Result has an unequal class distribution.
# Imbalanced data typically refers to classification problems where the classes are not represented equally. If one applies classifiers to such a dataset, they are likely to predict everything as the majority class. This is often regarded as a problem in learning from highly imbalanced datasets.
# To tackle the imbalance, we will be focusing on:
# Random oversampling
# Random oversampling duplicates examples from the minority class in the training dataset and can result in overfitting for some models.
# Random undersampling
# Random undersampling deletes examples from the majority class and can result in losing information invaluable to a model.
# Synthetic Minority Over-sampling Technique (SMOTE)
# In this technique, instead of simply duplicating data from the minority class, we synthesize new data from the minority class. This type of data augmentation for tabular data can be very effective. This approach to synthesizing new data is called the Synthetic Minority Over-sampling Technique, or SMOTE for short.
# Adaptive Synthetic Sampling Method for Imbalanced Data (ADASYN)
# ADASYN (Adaptive Synthetic) is an algorithm that generates synthetic data; its greatest advantages are that it does not copy the same minority data and that it generates more data for "harder to learn" examples.
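Before fitting any models, here is a quick hedged sketch of how each of these resamplers changes the class counts on the training split created earlier (X_train, y_train); the full model comparisons follow below.
# Sketch: class counts produced by each resampling strategy on the training split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
for sampler_name, sampler in [('RandomUnderSampler', RandomUnderSampler(random_state=42)),
                              ('RandomOverSampler', RandomOverSampler(random_state=42)),
                              ('SMOTE', SMOTE(random_state=42)),
                              ('ADASYN', ADASYN(random_state=42))]:
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    print(sampler_name, Counter(y_res))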
# Import imbalace technique algorithims
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter # counter takes values returns value_counts dictionary
from sklearn.datasets import make_classification
print('Original dataset shape %s' % Counter(y_train))
# Undersampling only on train
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_rus))
# Undersampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_rus, y_train_rus)
y_pred_rus = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred_rus , y_test)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_rus)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_rus)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_rus)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_rus)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_rus)))
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_rus)
auc = metrics.roc_auc_score(y_test, y_pred_rus)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Undersampling', fontsize=12, fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_rus)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_rus)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Undersampling', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_rus)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="winter", fmt='g')
plt.title('Confusion matrix Random Undersampling', y=1.1, fontsize = 12, fontweight='bold')
plt.xlabel('Predicted',fontsize = 12, fontweight='bold')
plt.ylabel('Actual',fontsize = 12, fontweight='bold')
# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);
# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);
plt.show()
from imblearn.over_sampling import RandomOverSampler
print('Original dataset shape %s' % Counter(y_train))
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_ros))
# Oversampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_ros, y_train_ros)
y_pred_ros = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_ros)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_ros)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_ros)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_ros)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_ros)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_ros)))
With data leakage we get an F1 score of 0.94 on the test set, and 0.94 without it. Here, data leakage did not have a significant impact on the model's performance.
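One way to keep resampling strictly inside the training folds (and so avoid this kind of leakage during cross-validation) is the imbalanced_make_pipeline imported at the top of the notebook; a minimal sketch, assuming the SMOTE + LogisticRegression combination used elsewhere in this notebook:
# Sketch: SMOTE is applied only to the training portion of each CV fold
leak_free_pipeline = imbalanced_make_pipeline(SMOTE(random_state=42), LogisticRegression(max_iter=1000))
cv_f1 = cross_val_score(leak_free_pipeline, X_train, y_train, cv=5, scoring='f1')
print('Leak-free CV F1: {0:0.3f} +/- {1:0.3f}'.format(cv_f1.mean(), cv_f1.std()))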
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_ros)
auc = metrics.roc_auc_score(y_test, y_pred_ros)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Oversampling', fontsize=12, fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_ros)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_ros)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Oversampling', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_ros)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="summer", fmt='g')
plt.title('Confusion matrix Random Oversampling ', y=1.1, fontsize=12, fontweight='bold')
plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)
# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);
# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);
plt.show()
# Logistic Regression with SMOTE data
from imblearn.over_sampling import SMOTE, ADASYN
print('Original dataset shape %s' % Counter(y_train))
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_smote, y_train_smote)
y_pred_smote = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_smote)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_smote)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_smote)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_smote)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_smote)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_smote)))
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_smote)
auc = metrics.roc_auc_score(y_test, y_pred_smote)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12, fontweight='bold')
plt.title('ROC curve for LR SMOTE', fontsize=12, fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_smote)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_smote)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR SMOTE', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_smote)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Purples", fmt='g')
plt.title('Confusion matrix with SMOTE', y=1.1, fontsize = 12)
plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)
plt.show()
#Logistic Regression with ADASYN data
print('Original dataset shape %s' % Counter(y_train))
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_adasyn))
# ADASYN Sampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_adasyn)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_adasyn)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_adasyn)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_adasyn)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_adasyn)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_adasyn)))
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_adasyn)
auc = metrics.roc_auc_score(y_test, y_pred_adasyn)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12,fontweight='bold')
plt.title('ROC curve for LR ADASYN', fontsize=12,fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_adasyn)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_adasyn)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR ADASYN', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_adasyn)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Wistia", fmt='g')
plt.title('Confusion matrix with LR ADASYN', y=1.1, fontsize = 12,fontweight='bold')
plt.xlabel('Predicted',fontsize = 12,fontweight='bold')
plt.ylabel('Actual',fontsize = 12,fontweight='bold')
plt.show()
**DISTRIBUTION OF BALANCED DATA SET (BUILDING DIFFERENT MODELS)**
names_lst = []
# Empty lists to capture performance metrics for the train set
aucs_train_lst = []
accuracy_train_lst = []
precision_train_lst = []
recall_train_lst = []
f1_train_lst = []
# Empty lists to capture performance metrics for the test set
aucs_test_lst = []
accuracy_test_lst = []
precision_test_lst = []
recall_test_lst = []
f1_test_lst = []
kappa_lst = []
# Function for model building and performance measure
def build_measure_model(models):
    plt.figure(figsize=(12, 6))
    for name, model, X_train, y_train, X_test, y_test in models:
        names_lst.append(name)
        # Build model
        model.fit(X_train, y_train)
        # Predict
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        # calculate accuracy
        Accuracy_train = metrics.accuracy_score(y_train, y_train_pred)
        accuracy_train_lst.append(Accuracy_train)
        Accuracy_test = metrics.accuracy_score(y_test, y_test_pred)
        accuracy_test_lst.append(Accuracy_test)
        # calculate auc
        Aucs_train = metrics.roc_auc_score(y_train, y_train_pred)
        aucs_train_lst.append(Aucs_train)
        Aucs_test = metrics.roc_auc_score(y_test, y_test_pred)
        aucs_test_lst.append(Aucs_test)
        # calculate precision
        PrecisionScore_train = metrics.precision_score(y_train, y_train_pred)
        precision_train_lst.append(PrecisionScore_train)
        PrecisionScore_test = metrics.precision_score(y_test, y_test_pred)
        precision_test_lst.append(PrecisionScore_test)
        # calculate recall
        RecallScore_train = metrics.recall_score(y_train, y_train_pred)
        recall_train_lst.append(RecallScore_train)
        RecallScore_test = metrics.recall_score(y_test, y_test_pred)
        recall_test_lst.append(RecallScore_test)
        # calculate f1 score
        F1Score_train = metrics.f1_score(y_train, y_train_pred)
        f1_train_lst.append(F1Score_train)
        F1Score_test = metrics.f1_score(y_test, y_test_pred)
        f1_test_lst.append(F1Score_test)
        # print('F1 Score of ' + name + ' model : {0:0.5f}'.format(F1Score_test))
        # calculate kappa statistic
        kappa = cohen_kappa_score(y_test, y_test_pred)
        kappa_lst.append(kappa)
        # draw confusion matrix
        cnf_matrix = metrics.confusion_matrix(y_test, y_test_pred)
        print("Model Name :", name)
        print('Train Accuracy :{0:0.5f}'.format(Accuracy_train))
        print('Test Accuracy :{0:0.5f}'.format(Accuracy_test))
        print('Train AUC : {0:0.5f}'.format(Aucs_train))
        print('Test AUC : {0:0.5f}'.format(Aucs_test))
        print('Train Precision : {0:0.5f}'.format(PrecisionScore_train))
        print('Test Precision : {0:0.5f}'.format(PrecisionScore_test))
        print('Train Recall : {0:0.5f}'.format(RecallScore_train))
        print('Test Recall : {0:0.5f}'.format(RecallScore_test))
        print('Train F1 : {0:0.5f}'.format(F1Score_train))
        print('Test F1 : {0:0.5f}'.format(F1Score_test))
        print('Kappa Statistic : {0:0.5f}'.format(kappa))
        print('Confusion Matrix : \n', cnf_matrix)
        print("\n")
        # plot ROC Curve
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred)
        auc = metrics.roc_auc_score(y_test, y_test_pred)
        plt.plot(fpr, tpr, linewidth=2, label=name + ", auc=" + str(auc))
    # ---------- For loop ends here ----------
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.rcParams['font.size'] = 12
    # plt.title('ROC curve for predicting anemia cases')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.show()
**DECISION TREE**
DTmodels = []
dt = DecisionTreeClassifier()
DTmodels.append(('DT imbalance', dt,X_train,y_train,X_test,y_test))
DTmodels.append(('DT Undersampling', dt,X_train_rus,y_train_rus,X_test,y_test))
DTmodels.append(('DT Oversampling', dt,X_train_ros,y_train_ros,X_test,y_test))
DTmodels.append(('DT SMOTE', dt,X_train_smote,y_train_smote,X_test,y_test))
DTmodels.append(('DT ADASYN', dt,X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(DTmodels)
**RANDOM FOREST**
# Random Forest (RF)
RFmodels = []
RFmodels.append(('RF imbalance', RandomForestClassifier(),X_train,y_train,X_test,y_test))
RFmodels.append(('RF Undersampling',
RandomForestClassifier(),X_train_rus,y_train_rus,X_test,y_test))
RFmodels.append(('RF Oversampling',
RandomForestClassifier(),X_train_ros,y_train_ros,X_test,y_test))
RFmodels.append(('RF SMOTE',
RandomForestClassifier(),X_train_smote,y_train_smote,X_test,y_test))
RFmodels.append(('RF ADASYN',
RandomForestClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(RFmodels)
**KNN**
# K-Nearest Neighbors (KNN)
KNNmodels = []
KNNmodels.append(('KNN imbalance', KNeighborsClassifier(),X_train,y_train,X_test,y_test))
KNNmodels.append(('KNN Undersampling',
KNeighborsClassifier(),X_train_rus,y_train_rus,X_test,y_test))
KNNmodels.append(('KNN Oversampling',
KNeighborsClassifier(),X_train_ros,y_train_ros,X_test,y_test))
KNNmodels.append(('KNN SMOTE',
KNeighborsClassifier(),X_train_smote,y_train_smote,X_test,y_test))
KNNmodels.append(('KNN ADASYN',
KNeighborsClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(KNNmodels)
**SVM**
# Support Vector Machines (SVM)
SVMmodels = []
SVMmodels.append(('SVM imbalance', SVC(gamma='auto'),X_train,y_train,X_test,y_test))
SVMmodels.append(('SVM Undersampling',
SVC(gamma='auto'),X_train_rus,y_train_rus,X_test,y_test))
SVMmodels.append(('SVM Oversampling',
SVC(gamma='auto'),X_train_ros,y_train_ros,X_test,y_test))
SVMmodels.append(('SVM SMOTE',
SVC(gamma='auto'),X_train_smote,y_train_smote,X_test,y_test))
SVMmodels.append(('SVM ADASYN',
SVC(gamma='auto'),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(SVMmodels)
# Gaussian Naive Bayes (NB)
NBmodels = []
NBmodels.append(('NB imbalance', GaussianNB(),X_train,y_train,X_test,y_test))
NBmodels.append(('NB Undersampling', GaussianNB(),X_train_rus,y_train_rus,X_test,y_test))
NBmodels.append(('NB Oversampling', GaussianNB(),X_train_ros,y_train_ros,X_test,y_test))
NBmodels.append(('NB SMOTE', GaussianNB(),X_train_smote,y_train_smote,X_test,y_test))
NBmodels.append(('NB ADASYN', GaussianNB(),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(NBmodels)
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Random Forest model training
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
# Predict from the test set
y_pred = random_forest.predict(X_test)
# Model evaluation
print(metrics.classification_report(y_test, y_pred))
print('Accuracy: {0:0.5f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('AUC: {0:0.5f}'.format(metrics.roc_auc_score(y_test, y_pred)))
print('Precision: {0:0.5f}'.format(metrics.precision_score(y_test, y_pred)))
print('Recall: {0:0.5f}'.format(metrics.recall_score(y_test, y_pred)))
print('F1: {0:0.5f}'.format(metrics.f1_score(y_test, y_pred)))
**PERFORMANCE MEASURES OF CLASSIFIERS**
data = {'Model':names_lst,
#'Accuracy_Train':accuracy_train_lst,
'Accuracy_Test':accuracy_test_lst,
#'AUC_Train':aucs_train_lst,
'AUC_Test':aucs_test_lst,
#'PrecisionScore_Train':precision_train_lst,
'PrecisionScore_Test':precision_test_lst,
#'RecallScore_Train':recall_train_lst,
'RecallScore_Test':recall_test_lst,
#'F1Score_Train':f1_train_lst,
'F1Score_Test':f1_test_lst,
'Kappa Stat' : kappa_lst
}
print("Performance measures of various classifiers: \n")
performance_df = pd.DataFrame(data)
performance_df = performance_df.round(3)
finaltable = performance_df.sort_values(['F1Score_Test','RecallScore_Test','AUC_Test'], ascending=False)
finaltable
finaltable.to_excel('my_table.xlsx', index=False)
**HYPERPARAMETER TUNING**
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
# define the classifiers to be evaluated
classifiers_ = [DecisionTreeClassifier(),
RandomForestClassifier(),
SVC(),
GaussianNB(),
LogisticRegression(),
KNeighborsClassifier()]
# define the parameter grids for each classifier
param_grids = [{'max_depth': range(1, 10), 'criterion': ['gini', 'entropy']},  # decision tree
               {'n_estimators': [50, 100, 200], 'max_depth': range(1, 10)},  # random forest
               {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},  # SVM
               {},  # naive bayes - no hyperparameters to tune here
               # Naive Bayes is a probabilistic classifier based on Bayes' theorem and the "naive" assumption
               # that the presence or absence of a particular feature is independent of the presence or absence
               # of any other feature; GaussianNB has no key hyperparameters worth searching in this setup.
               {'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},  # logistic regression
               {"n_neighbors": list(range(2, 60, 1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]  # KNN
# perform GridSearchCV for each classifier
for clf, param_grid in zip(classifiers_, param_grids):
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    print(clf.__class__.__name__, "best params:", grid_search.best_params_, "best score:", grid_search.best_score_)
    print("F1 score:", f1_score(y_test, y_pred))
**CROSS VALIDATION**
# perform 5-fold cross-validation for each classifier
for clf in classifiers_:
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print(clf.__class__.__name__, "mean accuracy:", scores.mean(), "std deviation:", scores.std())
from sklearn.tree import export_graphviz
import graphviz
#hyperparametered DT
dt_tuning = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
# Train the decision tree based on selected parameter
dt_tuning.fit(X_train, y_train)
# Generate a DOT file representing the decision tree
dot_data = export_graphviz(dt_tuning, out_file=None, feature_names=X_train.columns,
class_names=['Non Anemic', 'Anemic'], filled=True, rounded=True)
# Visualize the decision path for the new data point
graph = graphviz.Source(dot_data)
graph.render('decision_tree') # save the decision tree as a PDF file
graph
# Testing data
new_data = {'Gender': 1, 'Hemoglobin': 11, 'MCV': 50}
new_X = pd.DataFrame([new_data])
prediction = dt_tuning.predict(new_X)  # use the tuned decision tree fitted above
print('Prediction:', prediction)
import matplotlib.pyplot as plt
# Define the labels, scores and colors for each model (scores must stay aligned one-to-one with labels)
labels = ['Decision Tree', 'Random Forest', 'SVM', 'Naive Bayes', 'Logistic Regression', 'KNN']
scores = [1.00, 1.00, 0.994, 0.914, 0.935, 0.988]
colors = ['#50BFE6','#9C51B6','#FF5470','#0066CC','#FF5050','#E97451']
# Sort the scores and labels in descending order
sorted_scores, sorted_labels = zip(*sorted(zip(scores, labels), reverse=True))
# Set up the plot
fig, ax = plt.subplots(figsize=(12,8))
ax.bar(sorted_labels, sorted_scores, color=colors)
# Set the title and axis labels
ax.set_title('Comparison of Model Performance Grid search', fontsize=12, fontweight='bold')
ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Accuracy Score', fontsize=12, fontweight='bold')
# Set the tick font size
ax.tick_params(axis='both', which='major', labelsize=12)
# Add the accuracy score as text above each bar
for i, score in enumerate(sorted_scores):
    ax.text(i, score+0.01, f'{score*100:.1f}%', fontsize=12, ha='center')
# Remove spines
sns.despine(left=True, bottom=True)
# Display the plot
plt.show()
**EXPORTING THE MODEL**
import pickle
from sklearn.ensemble import RandomForestClassifier
# Save the Random Forest model as a pickle file
filename = 'random_forest_model.pkl'
with open(filename, 'wb') as f:
    pickle.dump(random_forest, f)
print("Random Forest model exported as pickle file:", filename)