# Print system version
!jupyter --version
import sys
print("Python version:", sys.version)
# importing necessary libraries
import pandas as pd # for data manipulation and analysis
import collections # for creating and manipulating Python's collections like OrderedDict, defaultdict, Counter, etc.
import numpy as np # for scientific computing with Python
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
import seaborn as sns # for advanced visualization
# Classifier Libraries
from sklearn.linear_model import LogisticRegression # for implementing logistic regression algorithm
from sklearn.tree import DecisionTreeClassifier # for implementing decision tree algorithm
from sklearn.ensemble import RandomForestClassifier # for implementing random forest algorithm
from sklearn.svm import SVC # for implementing Support Vector Machine (SVM) algorithm
from sklearn.naive_bayes import GaussianNB # for implementing Naive Bayes algorithm
from sklearn.neighbors import KNeighborsClassifier # for implementing K-Nearest Neighbors (KNN) algorithm
# For Statistical testing
from scipy.stats import ttest_ind # for computing t-test for two independent samples
import statsmodels.api as sm # for statistical models and tests
from scipy.stats import chi2_contingency # for computing chi-square statistic and p-value for a contingency table
import scipy.stats as stats # for implementing skewness and other stats
# Other Libraries
from sklearn.model_selection import train_test_split # for splitting data into training and testing sets
from sklearn.pipeline import make_pipeline # for building a pipeline of transforms with a final estimator
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline # for building a pipeline with imbalanced datasets
from imblearn.over_sampling import SMOTE # for oversampling imbalanced datasets using the Synthetic Minority Over-sampling Technique (SMOTE)
from imblearn.under_sampling import NearMiss # for undersampling imbalanced datasets using the NearMiss algorithm
from imblearn.metrics import classification_report_imbalanced # for generating a classification report for imbalanced datasets
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report # for computing various performance metrics for classification models
from collections import Counter # for counting the frequency of elements in a list
from sklearn.model_selection import KFold, StratifiedKFold # for k-fold cross-validation
from sklearn.model_selection import cross_val_score # for evaluating a model using cross-validation
from sklearn.metrics import cohen_kappa_score # for computing Cohen's kappa score for inter-rater agreement
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 5000) # for setting the maximum number of columns to display in pandas dataframes
# first read the data file
df= pd.read_csv('/content/drive/MyDrive/anemia.csv')
df.shape
df.head()
# Print summary statistics
df.describe()
df.shape
df.info()
#columns name
df.columns
#Checking Null
# Import numpy
import numpy as np
# Inspect missing values in the dataset
print(df.isnull().values.sum())
# Replace the ' 's with NaN
df = df.replace(" ", np.nan)
# Count the number of NaNs in the dataset to verify
print(df.isnull().values.sum())
# Create a copy of the DataFrame to avoid modifying the original data
df_copy = df.copy()
# Rename values in the 'Result' column just for the plot
df_copy['Result'] = df_copy['Result'].replace({0: 'Non Anemic', 1: 'Anemic'})
# Rename values in the 'Gender' column
df_copy['Gender'] = df_copy['Gender'].replace({0: 'Male', 1: 'Female'})
# Define custom hex colors
custom_colors = ['#B43757', '#a37b85']
custom_colors_gender = ['#90ADC6', '#C6A990']
print(df_copy)
result_counts = df_copy['Result'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors, shadow=True)
plt.title('Distribution of Anemia Result')
plt.show()
# Create a count plot of the anemia result
ax = sns.countplot(x='Result', data=df_copy, palette=custom_colors)
plt.title('Count of Anemia Result')
# Add labels to the bars
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
result_counts = df_copy['Result'].value_counts()
# Print the counts of the two categories
print(result_counts)
# Check if the two categories are balanced or not
if result_counts.iloc[0] == result_counts.iloc[1]:
    print('The two categories are balanced.')
else:
    print('The two categories are not balanced.')
print("-----")
# The classes are heavily skewed we need to solve this issue later.
print('Non Anemic', round(df['Result'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Anemic', round(df['Result'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
ax = sns.countplot(x='Gender', hue='Result', data=df_copy, palette=custom_colors)
plt.title('Number of Individuals with and without Anemia by Gender')
# Add labels to the bars
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.0f}'.format(p.get_height()), ha='center')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
result_counts = df_copy['Gender'].value_counts()
plt.pie(result_counts, labels=result_counts.index, autopct='%1.1f%%', colors=custom_colors_gender, shadow=True)
plt.title('Gender distribution')
plt.show()
df.head()
df_copy.head()
# anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()
# ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=custom_colors_gender)
# ax.set_xticklabels(['Male', 'Female'])
# plt.title('Mean Anemia Rate by Gender')
# plt.xlabel('Gender')
# plt.ylabel('Mean Anemia Rate')
# plt.show()
color_gen = {0: '#90ADC6', 1: '#C6A990'}  # keys match the numeric Gender codes in df
anemia_rates = df.groupby('Gender')['Result'].mean().reset_index()
# Create the bar plot
ax = sns.barplot(x='Gender', y='Result', data=anemia_rates, palette=color_gen)
# Add labels to the bars
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), '{:.2f}'.format(p.get_height()), ha='center')
ax.set_xticklabels(['Male', 'Female'])
# Add plot titles and labels
plt.title('Mean Anemia Rate by Gender | Which gender has the higher anemia rate?', fontsize=16, fontweight='bold')
plt.xlabel('Gender' , fontsize=12)
plt.ylabel('Mean Anemia Rate' , fontsize=12)
# Remove spines
sns.despine(left=True, bottom=True)
# Remove vertical lines from the grid
plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)
# Show the plot
plt.show()
anemia_rates = df.groupby('Gender')['Result'].mean().round(2)
anemia_rates
# Create separate subsets for males and females
male_data = df_copy[df_copy['Gender'] == 'Male']
female_data = df_copy[df_copy['Gender'] == 'Female']
# Plot horizontal violinplot using Seaborn
sns.violinplot(x='Hemoglobin', y='Gender', hue='Result', data=df_copy, palette=custom_colors, inner='quartile', scale='width', cut=0)
# Add mean and median lines
for i, group in enumerate([male_data, female_data]):
    median = group['Hemoglobin'].median()
    mean = group['Hemoglobin'].mean()
    plt.axhline(y=i, xmin=0.05, xmax=0.48, color='black', linewidth=2)
    plt.text(0.51, i+0.1, f'Median: {median:.2f}', ha='left', va='center')
    plt.text(0.51, i-0.1, f'Mean: {mean:.2f}', ha='left', va='center')
# Add IQR whiskers
q1_male, q3_male = male_data['Hemoglobin'].quantile([0.25, 0.75])
q1_female, q3_female = female_data['Hemoglobin'].quantile([0.25, 0.75])
plt.axhline(y=0, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.axhline(y=1, xmin=0.25, xmax=0.75, color='black', linewidth=2)
plt.plot([q1_male, q1_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q3_male, q3_male], [-0.2, 0.2], color='black', linewidth=2)
plt.plot([q1_female, q1_female], [0.8, 1.2], color='black', linewidth=2)
plt.plot([q3_female, q3_female], [0.8, 1.2], color='black', linewidth=2)
plt.text((q1_male+q3_male)/2, -0.3, f'IQR: {q3_male-q1_male:.2f}', ha='center', va='center')
plt.text((q1_female+q3_female)/2, 1.3, f'IQR: {q3_female-q1_female:.2f}', ha='center', va='center')
# Add title and labels
plt.title('Distribution of Hemoglobin Levels by Gender')
plt.xlabel('Hemoglobin Level')
plt.ylabel('Gender')
# Show the plot
plt.show()
iqr = np.percentile(df['Hemoglobin'], 75) - np.percentile(df['Hemoglobin'], 25)
# Bin width using the Freedman-Diaconis rule
bin_width = 2 * iqr / (len(df)**(1/3))
sns.distplot(df['Hemoglobin'], hist=True, kde=True,
bins=int(round((df['Hemoglobin'].max() - df['Hemoglobin'].min()) / bin_width)),
color='#d60266',
hist_kws={'edgecolor':'black', 'alpha': 0.8},
kde_kws={'linewidth': 2})
# Add labels and adjust font sizes
#plt.title('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
plt.xlabel('Hemoglobin', fontsize=12)
plt.ylabel('Count', fontsize=12)
# # Add legend
# plt.legend(labels=['Hemoglobin'], loc='upper right')
# Remove spines
sns.despine(left=True, bottom=True)
# Remove vertical lines from the grid
plt.grid(axis='y', alpha=0.3)
plt.gca().xaxis.grid(False)
# Show plot
plt.show()
# Calculate skewness using the skew() function
skewness = stats.skew(df['Hemoglobin'])
# Calculate kurtosis using the kurtosis() function (fisher=False returns Pearson's kurtosis, where a normal distribution scores 3)
kurtosis = stats.kurtosis(df['Hemoglobin'], fisher=False)
# Print the result
print("Skewness:", skewness)
# Print the result
print("Kurtosis:", kurtosis)
# Create a dictionary with the values
hemoglobin_data = {'Metric': ['Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
                   'Value': [df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}
# Create a pandas DataFrame from the dictionary
hemoglobin_table = pd.DataFrame(hemoglobin_data)
# Create the table using Seaborn styling
styled_table = (hemoglobin_table.style
.set_caption('Hemoglobin Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Filter the data by anemia status
anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='Hemoglobin', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='Hemoglobin', ax=ax2, color='green', binwidth=0.5)
# Set titles and axis labels
ax1.set_title('Hemoglobin Levels in Patients with Anemia', fontsize=14, fontweight='bold')
ax2.set_title('Hemoglobin Levels in Patients without Anemia', fontsize=14, fontweight='bold')
fig.suptitle('Distribution of Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Remove spines
sns.despine(left=True, bottom=True)
# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)
# Adjust plot size
plt.tight_layout()
# Show the plot
plt.show()
# Create a dictionary with the mean corpuscular hemoglobin (MCH) values
MCH_data = {'Metric': ['Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level'],
'Value': [df['MCH'].max(), df['MCH'].mean(), df['MCH'].min()]}
# Create a pandas DataFrame from the dictionary
MCH_table = pd.DataFrame(MCH_data)
# Create the table using Seaborn styling
styled_table = (MCH_table.style
.set_caption('MCH Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Filter the data by anemia status
anemia_data = df[df['Result'] == 1]
no_anemia_data = df[df['Result'] == 0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='MCH', ax=ax1, color='red', binwidth=0.5)
sns.histplot(no_anemia_data, x='MCH', ax=ax2, color='green', binwidth=0.5)
# Set titles and axis labels
ax1.set_title('Mean Corpuscular Hemoglobin Levels in Patients with Anemia', fontsize=14,
fontweight='bold')
ax2.set_title('Mean Corpuscular Hemoglobin Levels in Patients without Anemia', fontsize=14,
fontweight='bold')
fig.suptitle('Distribution of Mean Corpuscular Hemoglobin Levels', fontsize=16, fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax2.set_xlabel('Mean Corpuscular Hemoglobin Level', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Remove spines
sns.despine(left=True, bottom=True)
# # Add legend
# ax1.legend(labels=['Patients with Anemia'], loc='upper right', fontsize=10)
# ax2.legend(labels=['Patients without Anemia'], loc='upper right', fontsize=10)
# Adjust plot size
plt.tight_layout()
# Show the plot
plt.show()
# Create a dictionary with the mean corpuscular hemoglobin concentration (MCHC) values
MCHC_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min()]}
# Create a pandas DataFrame from the dictionary
MCHC_table = pd.DataFrame(MCHC_data)
# Create the table using Seaborn styling
styled_table = (MCHC_table.style
.set_caption('MCHC Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# Filter data for anemia and non-anemia cases
anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='MCHC', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCHC', ax=ax2, color='green', bins=20)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Set titles and axis labels
ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia', fontweight='bold')
fig.suptitle('Mean Corpuscular Hemoglobin Concentration Levels', fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Hemoglobin Concentration Level', fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')
# Remove spines
sns.despine(left=True, bottom=True)
# Show the plot
plt.show()
# Filter data for anemia and non-anemia cases
anemia_data = df[df['Result']==1]
no_anemia_data = df[df['Result']==0]
# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13,5))
# Plot histograms with Seaborn
sns.histplot(anemia_data, x='MCV', ax=ax1, color='red', bins=20)
sns.histplot(no_anemia_data, x='MCV', ax=ax2, color='green', bins=20)
# Customize tick labels and grid
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax1.grid(axis='y', linestyle='--', alpha=0.7)
ax2.grid(axis='y', linestyle='--', alpha=0.7)
# Set titles and axis labels
ax1.set_title('Having Anemia', fontweight='bold')
ax2.set_title('Not Having Anemia',fontweight='bold' )
fig.suptitle('Mean Corpuscular Volume Levels',fontweight='bold')
ax1.set_xlabel('Mean Corpuscular Volume Level', fontweight='bold')
ax2.set_xlabel('Mean Corpuscular Volume Level',fontweight='bold')
ax1.set_ylabel('Count')
ax2.set_ylabel('Count')
# Remove spines
sns.despine(left=True, bottom=True)
# Show the plot
plt.show()
# Create a dictionary with the mean corpuscular volume (MCV) values
MCV_data = {'Metric': ['Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level'],
'Value': [df['MCV'].max(), df['MCV'].mean(), df['MCV'].min()]}
# Create a pandas DataFrame from the dictionary
MCV_table = pd.DataFrame(MCV_data)
# Create the table using Seaborn styling
styled_table = (MCV_table.style
.set_caption('MCV Levels')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption',
'props': [('font-size', '18px'),
('font-weight', 'bold'),
('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
# dictionary with the values for MCHC, MCV, MCH, and hemoglobin
blood_data = {'Metric': ['Highest MCHC Level', 'Average MCHC Level', 'Lowest MCHC Level',
'Highest MCV Level', 'Average MCV Level', 'Lowest MCV Level',
'Highest MCH Level', 'Average MCH Level', 'Lowest MCH Level',
'Highest Hemoglobin Level', 'Average Hemoglobin Level', 'Lowest Hemoglobin Level'],
'Value': [df['MCHC'].max(), df['MCHC'].mean(), df['MCHC'].min(),
df['MCV'].max(), df['MCV'].mean(), df['MCV'].min(),
df['MCH'].max(), df['MCH'].mean(), df['MCH'].min(),
df['Hemoglobin'].max(), df['Hemoglobin'].mean(), df['Hemoglobin'].min()]}
# Create a pandas df
blood_table = pd.DataFrame(blood_data)
# Create the table
styled_table = (blood_table.style
.set_caption('Blood Test Results')
.set_properties(**{'text-align': 'center'})
.set_table_styles([{'selector': 'caption', 'props': [('font-size', '18px'),
('font-weight', 'bold'), ('padding-bottom', '10px')]}])
.format({'Value': '{:.2f}'}))
# Display the table
display(styled_table)
sns.set_style("whitegrid")
sns.boxplot(x='Result', y='Hemoglobin', data=df_copy, palette=custom_colors)
plt.title('Distribution of Hemoglobin Levels by Anemia Result')
plt.xlabel('Anemia Result')
plt.ylabel('Hemoglobin Level')
# Remove spines
sns.despine(left=True, bottom=True)
plt.show()
# Set plot style
# Create violin plot using Seaborn
ax = sns.violinplot(x='Result', y='Hemoglobin', hue='Gender', data=df_copy,
palette=custom_colors_gender, split=True)
# Set plot title and axis labels
ax.set_title('Distribution of Hemoglobin Levels by Gender and Anemic Condition', fontsize=14,
fontweight='bold')
ax.set_xlabel('Anemia Result', fontsize=12, fontweight='bold')
ax.set_ylabel('Hemoglobin Level', fontsize=12, fontweight='bold')
# Add legend and adjust its position
ax.legend(title='Gender', title_fontsize=12, fontsize=10, loc='upper right')
# Remove spines
sns.despine(left=True, bottom=True)
# Show the plot
plt.show()
df[['Gender','Hemoglobin','Result', 'MCH', 'MCV',
'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)
sns.pairplot(df,hue='Result')
sns.set(style="ticks")
RELATIONS_COLS = ["Hemoglobin", "MCH", "MCHC","MCV"]
g = sns.PairGrid(data=df, vars=RELATIONS_COLS, hue="Result", palette=custom_colors)
g.map_diag(sns.kdeplot, shade=True)
g.map_offdiag(sns.regplot, scatter_kws={'alpha':0.5})
g.add_legend(title="Result")
legend = g._legend
# set figure size
g.fig.set_size_inches(12, 12)
# update legend labels
new_labels = ['Non-anemic', 'Anemic']
for t, l in zip(g._legend.texts, new_labels): t.set_text(l)
# legend.texts[0].set_text('Non Anemic')
# legend.texts[1].set_text('Anemic')
# g.fig.suptitle("Relations in the Dataset", y=1.03)
**STATISTICAL TEST (T-TEST)**
A t-test is a statistical test used to determine whether there is a significant difference between the means of two groups. In our case, we use a t-test to determine whether there is a significant difference in mean hemoglobin levels between males and females.
As we saw, Hemoglobin has negative skewness, but the t-test assumes the data are approximately normally distributed. So before performing the t-test, we could take the logarithm of the data, which can help to reduce the skewness.
df_stat = df.copy()
df_stat.head()
male_hemoglobin = df_stat.loc[df_stat['Gender'] == 0, 'Hemoglobin']
female_hemoglobin = df_stat.loc[df_stat['Gender'] == 1, 'Hemoglobin']
# Compute the t-test statistic and p-value
t_statistic, p_value = ttest_ind(male_hemoglobin, female_hemoglobin)
# Print the results
print("T-Statistic: {:.2f}".format(t_statistic))
print("P-Value: {:.3f}".format(p_value))
# Compare the p-value with the significance level (0.05)
if p_value < 0.05:
    print("Reject null hypothesis: Gender has an impact on hemoglobin levels.")
else:
    print("Fail to reject null hypothesis: Gender has no impact on hemoglobin levels.")
**ODDS RATIO**
# Create binary variables for gender and anemia status
df_stat['is_female'] = np.where(df_stat['Gender'] == 1, 1, 0)
df_stat['is_anemic'] = np.where(df_stat['Result'] == 1, 1, 0)
# Fit a logistic regression model with gender and anemia status as predictors
logit_model = sm.Logit(df_stat['is_anemic'], sm.add_constant(df_stat['is_female']))
result = logit_model.fit()
# Print the odds ratio for gender
print("Odds Ratio for Gender: {:.2f}".format(np.exp(result.params[1])))
**CHI-SQUARE TEST**
# Create a contingency table of gender and anemia status
cont_table = pd.crosstab(df_stat['Gender'], df_stat['Result'])
# Perform the chi-square test of independence
chi2_statistic, p_value, dof, expected = chi2_contingency(cont_table)
# Print the results
print("Chi-Square Statistic: {:.2f}".format(chi2_statistic))
print("P-Value: {:.3f}".format(p_value))
# Compare the p-value with the significance level (0.05)
if p_value < 0.05:
    print("Reject null hypothesis: Gender and anemia status are dependent.")
else:
    print("Fail to reject null hypothesis: Gender and anemia status are independent.")
**FEATURE SELECTION**
CORRELATION (PEARSON CORRELATION)
df[['Gender','Hemoglobin','Result', 'MCH', 'MCV',
'MCHC']].corr()['Result'].sort_values(ascending=False).head(10)
# create a correlation matrix
corr_matrix = df.corr().round(2)
# plot the correlation matrix using a heatmap from seaborn
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)
#plt.title('Correlation Matrix', fontweight='bold')
plt.show()
**SELECTKBEST**
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
X = df.iloc[:,0:5] # independent columns
y = df.iloc[:,5]
k_values = [2, 3, 4, 5] # different values of K to try
best_k = 0 # variable to keep track of best K value
best_score = 0 # variable to keep track of best score
for k in k_values:
    # apply SelectKBest class to extract top k best features
    bestfeatures = SelectKBest(score_func=chi2, k=k)
    fit = bestfeatures.fit(X, y)
    dfscores = pd.DataFrame(fit.scores_) # chi2 score for each feature
    dfcolumns = pd.DataFrame(X.columns)
    # concat two dataframes for better visualization
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score'] # naming the dataframe columns
    # get the best K value based on score
    # note: fit.scores_ covers all features regardless of k, so this sum is identical for every k
    if featureScores['Score'].sum() > best_score:
        best_score = featureScores['Score'].sum()
        best_k = k
print(f"The best value of K is {best_k} with score {best_score}.")
print("---")
print(featureScores)
print("---")
print(featureScores.nlargest(3,'Score'))
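As a small follow-up sketch (not part of the original flow), SelectKBest can also reduce X to the chosen columns directly via get_support() and fit_transform, for example keeping the top 3 chi2-scored features:
# Sketch: keep only the top-3 chi2-scored features for downstream modelling
selector = SelectKBest(score_func=chi2, k=3)
X_top3 = selector.fit_transform(X, y)
print("Selected features:", list(X.columns[selector.get_support()]))
print("Reduced shape:", X_top3.shape)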
**Extremely Randomized Trees.**
# Extremely Randomized Trees.
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_) #use inbuilt class feature_importances of tree based classifiers
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(3).plot(kind='barh', color='#808080')
plt.xlabel('Importance')
# plt.title('Top 3 Features Importance', fontweight='bold', fontsize=12)
# Remove spines
sns.despine(left=False, bottom=True)
plt.savefig('Top2Feature.jpg')  # save before plt.show() so the saved figure is not blank
plt.show()
**SCALING FEATURES**
# Scale Hemoglobin by log
df['Hemoglobin_log'] = np.log(df.Hemoglobin + 0.01)
# Scale Hemoglobin by Standardization
from sklearn.preprocessing import StandardScaler # importing a class from a module of a library
ss = StandardScaler() # object of the class StandardScaler ()
df['Hemoglobin_scaled'] = ss.fit_transform(df['Hemoglobin'].values.reshape(-1,1))
# Scale Hemoglobin by normalization (min-max)
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler() # object of the class MinMaxScaler()
df['Hemoglobin_minmax'] = mm.fit_transform(df['Hemoglobin'].values.reshape(-1,1))
# Feature engineering for a better visualization of the values
# Let's explore Hemoglobin by Result and compare its distribution under each scaling
fig , axs = plt.subplots(nrows = 1 , ncols = 4 , figsize = (16,4))
sns.boxplot(x ="Result",y="Hemoglobin",data=df, ax = axs[0])
axs[0].set_title("Result vs Hemoglobin")
sns.boxplot(x ="Result",y="Hemoglobin_log",data=df, ax = axs[1])
axs[1].set_title("Result vs Log Hemoglobin")
sns.boxplot(x ="Result",y="Hemoglobin_scaled",data=df, ax = axs[2])
axs[2].set_title("Result vs Scaled Hemoglobin")
sns.boxplot(x ="Result",y="Hemoglobin_minmax",data=df, ax = axs[3])
axs[3].set_title("Result vs Min Max Hemoglobin")
# fig.suptitle('Amount by Class', fontsize=20)
plt.show()
**Splitting data into Training and Testing samples (70:30)**
df.columns
# Separate Target Variable and Predictor Variables
# Here I am keeping the selected feature only
X = df.drop(['MCHC','Hemoglobin_log', 'Hemoglobin_scaled', 'Hemoglobin_minmax', 'Result',
'MCH'],axis=1)
y = df['Result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=True,
random_state=101)
# Quick sanity check with the shapes of Training and testing datasets
print("X_train - ",X_train.shape)
print("y_train - ",y_train.shape)
print("X_test - ",X_test.shape)
print("y_test - ",y_test.shape)
**CLASSIFICATION MODELS**
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression # Importing Classifier Step
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# Model Evaluation
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred , y_test)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred)))
# print('Confusion Matrix : \n', cnf_matrix)
print("\n")
# Predicted values counts for Anemic and Non Anemic of test dataset
pd.Series(y_pred).value_counts()
# Actual values counts for Anemic and Non Anemic of test dataset
pd.Series(y_test).value_counts()
183/181  # quick ratio check: predicted vs actual class counts from the two cells above
**MODEL EVALUATION: CONFUSION MATRIX**
# confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test,y_pred)
cnf_matrix
# Heatmap for Confusion Matrix
p = sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="autumn", fmt='g')
plt.title('Confusion matrix', y=1.1, fontsize = 22)
plt.ylabel('Actual',fontsize = 18)
plt.xlabel('Predicted',fontsize = 18)
# ax.xaxis.set_ticklabels(['Genuine', 'Fraud']);
# ax.yaxis.set_ticklabels(['Genuine', 'Fraud']);
plt.show()
181/181
**ROC**
metrics.roc_auc_score(y_test , y_pred)
y_pred_proba = logreg.predict_proba(X_test)
y_pred_proba
# plot ROC Curve (note: built from hard class labels; using y_pred_proba[:, 1] would give a smoother, more informative curve)
plt.figure(figsize=(8,6))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, y_pred)
print("AUC - ",auc,"\n")
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC curve for anemic cases classification', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred)
print('f1=%.3f' % (f1))
# create figure and axis objects with custom size and padding
fig, ax = plt.subplots(figsize=(8, 6))
plt.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95)
# plot no skill
ax.plot([0, 1], [0.5, 0.5], linestyle='--', color='gray', lw=1)
# plot the precision-recall curve
ax.plot(recall, precision, marker='.', markersize=5)
# set axis labels and title
ax.set_xlabel('Recall', fontsize=14)
ax.set_ylabel('Precision', fontsize=14)
ax.set_title('Precision-Recall Curve for anemic cases classification', fontsize=16)
# show F1 score in the plot
ax.text(0.05, 0.95, f'F1 Score = {f1:.3f}', transform=ax.transAxes, fontsize=14)
# show the plot
plt.show()
# As found in EDA, the response variable Result has an unequal class distribution.
# Imbalanced data typically refers to classification problems where the classes are not represented equally. If one applies classifiers to such a dataset, they are likely to predict everything as the majority class. This is often regarded as a problem in learning from highly imbalanced datasets.
# To tackle the imbalance, we will be focusing on:
# Random oversampling
# Random oversampling duplicates examples from the minority class in the training dataset and can result in overfitting for some models.
# Random undersampling
# Random undersampling deletes examples from the majority class and can result in losing information invaluable to a model.
# Synthetic Minority Over-sampling Technique (SMOTE)
# In this technique, instead of simply duplicating data from the minority class, we synthesize new data from the minority class. This type of data augmentation for tabular data can be very effective. This approach to synthesizing new data is called the Synthetic Minority Over-sampling Technique, or SMOTE for short.
# Adaptive Synthetic Sampling Method for Imbalanced Data (ADASYN)
# ADASYN (Adaptive Synthetic) is an algorithm that generates synthetic data; its greatest advantages are that it does not copy the same minority data and that it generates more data for "harder to learn" examples.
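Before fitting any models, here is a quick hedged sketch of how each of these resamplers changes the class counts on the training split created earlier (X_train, y_train); the full model comparisons follow below.
# Sketch: class counts produced by each resampling strategy on the training split
from collections import Counter
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
for sampler_name, sampler in [('RandomUnderSampler', RandomUnderSampler(random_state=42)),
                              ('RandomOverSampler', RandomOverSampler(random_state=42)),
                              ('SMOTE', SMOTE(random_state=42)),
                              ('ADASYN', ADASYN(random_state=42))]:
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    print(sampler_name, Counter(y_res))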
# Import imbalace technique algorithims
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter # counter takes values returns value_counts dictionary
from sklearn.datasets import make_classification
print('Original dataset shape %s' % Counter(y_train))
# Undersampling only on train
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_rus))
# Undersampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_rus, y_train_rus)
y_pred_rus = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_pred_rus , y_test)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_rus)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_rus)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_rus)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_rus)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_rus)))
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_rus)
auc = metrics.roc_auc_score(y_test, y_pred_rus)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Undersampling', fontsize=12, fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_rus)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_rus)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Undersampling', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_rus)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="winter", fmt='g')
plt.title('Confusion matrix Random Undersampling', y=1.1, fontsize = 12, fontweight='bold')
plt.xlabel('Predicted',fontsize = 12, fontweight='bold')
plt.ylabel('Actual',fontsize = 12, fontweight='bold')
# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);
# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);
plt.show()
from imblearn.over_sampling import RandomOverSampler
print('Original dataset shape %s' % Counter(y_train))
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_ros))
# Oversampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_ros, y_train_ros)
y_pred_ros = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_ros)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_ros)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_ros)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_ros)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_ros)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_ros)))
With data leakage we get an F1 score of 0.94 on the test set, and 0.94 without it. Here, data leakage did not have a significant impact on the model's performance.
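One way to keep resampling strictly inside the training folds (and so avoid this kind of leakage during cross-validation) is the imbalanced_make_pipeline imported at the top of the notebook; a minimal sketch, assuming the SMOTE + LogisticRegression combination used elsewhere in this notebook:
# Sketch: SMOTE is applied only to the training portion of each CV fold
leak_free_pipeline = imbalanced_make_pipeline(SMOTE(random_state=42), LogisticRegression(max_iter=1000))
cv_f1 = cross_val_score(leak_free_pipeline, X_train, y_train, cv=5, scoring='f1')
print('Leak-free CV F1: {0:0.3f} +/- {1:0.3f}'.format(cv_f1.mean(), cv_f1.std()))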
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_ros)
auc = metrics.roc_auc_score(y_test, y_pred_ros)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=14,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=14, fontweight='bold')
plt.title('ROC curve for LR Random Oversampling', fontsize=12, fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_ros)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_ros)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR Random Oversampling', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_ros)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="summer", fmt='g')
plt.title('Confusion matrix Random Oversampling ', y=1.1, fontsize=12, fontweight='bold')
plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)
# ax.xaxis.set_ticklabels(['non anemic', 'anemic']);
# ax.yaxis.set_ticklabels(['non anemic', 'anemic']);
plt.show()
# Logistic Regression with SMOTE data
from imblearn.over_sampling import SMOTE, ADASYN
print('Original dataset shape %s' % Counter(y_train))
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_smote))
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_smote, y_train_smote)
y_pred_smote = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_smote)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_smote)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_smote)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_smote)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_smote)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_smote)))
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_smote)
auc = metrics.roc_auc_score(y_test, y_pred_smote)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12, fontweight='bold')
plt.title('ROC curve for LR SMOTE', fontsize=12, fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_smote)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_smote)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR SMOTE', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_smote)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Purples", fmt='g')
plt.title('Confusion matrix with SMOTE', y=1.1, fontsize = 12)
plt.xlabel('Predicted',fontsize = 12)
plt.ylabel('Actual',fontsize = 12)
plt.show()
#Logistic Regression with ADASYN data
print('Original dataset shape %s' % Counter(y_train))
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train_adasyn))
# ADASYN Sampling with Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = logreg.predict(X_test)
print('Accuracy :{0:0.5f}'.format(metrics.accuracy_score(y_test , y_pred_adasyn)))
print('AUC : {0:0.5f}'.format(metrics.roc_auc_score(y_test , y_pred_adasyn)))
print('Precision : {0:0.5f}'.format(metrics.precision_score(y_test , y_pred_adasyn)))
print('Recall : {0:0.5f}'.format(metrics.recall_score(y_test , y_pred_adasyn)))
print('F1 : {0:0.5f}'.format(metrics.f1_score(y_test , y_pred_adasyn)))
print('Kappa Statistic : {0:0.5f}'.format(cohen_kappa_score(y_test, y_pred_adasyn)))
# plot ROC Curve
plt.figure(figsize=(10, 8))
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_adasyn)
auc = metrics.roc_auc_score(y_test, y_pred_adasyn)
print("AUC - ",auc,"\n")
# plot the ROC curve
plt.plot(fpr, tpr, linewidth=2, label="ROC Curve (AUC = {:.2f})".format(auc))
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label="Random Guess")
# set the x-axis and y-axis limits
plt.xlim([0, 1])
plt.ylim([0, 1.05])
# add labels and title
plt.xlabel('False Positive Rate (1 - Specificity)', fontsize=12,fontweight='bold')
plt.ylabel('True Positive Rate (Sensitivity)', fontsize=12,fontweight='bold')
plt.title('ROC curve for LR ADASYN', fontsize=12,fontweight='bold')
# add legend
plt.legend(loc="lower right")
plt.show()
# calculate precision-recall curve
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_pred_adasyn)
# calculate F1 score
f1 = metrics.f1_score(y_test, y_pred_adasyn)
print('f1=%.3f' % (f1))
# plot no skill
plt.plot([0, 1], [0.5, 0.5], linestyle='--')
# plot the roc curve for the model
plt.plot(recall, precision, marker='.')
# add labels and title
plt.xlabel('Recall', fontsize=14,fontweight='bold')
plt.ylabel('Precision', fontsize=14, fontweight='bold')
plt.title('Precision Recall Curve for LR ADASYN', fontsize=12, fontweight='bold')
# show the plot
plt.show()
# Heatmap for Confusion Matrix
cnf_matrix = metrics.confusion_matrix(y_test , y_pred_adasyn)
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, annot_kws={"size": 25}, cmap="Wistia", fmt='g')
plt.title('Confusion matrix with LR ADASYN', y=1.1, fontsize = 12,fontweight='bold')
plt.xlabel('Predicted',fontsize = 12,fontweight='bold')
plt.ylabel('Actual',fontsize = 12,fontweight='bold')
plt.show()
**DISTRIBUTION OF BALANCED DATA SET (BUILDING DIFFERENT MODELS)**
names_lst = []
# Empty lists to capture performance metrics for the train set
aucs_train_lst = []
accuracy_train_lst = []
precision_train_lst = []
recall_train_lst = []
f1_train_lst = []
# Empty lists to capture performance metrics for the test set
aucs_test_lst = []
accuracy_test_lst = []
precision_test_lst = []
recall_test_lst = []
f1_test_lst = []
kappa_lst = []
# Function for model building and performance measure
def build_measure_model(models):
    plt.figure(figsize=(12, 6))
    for name, model, X_train, y_train, X_test, y_test in models:
        names_lst.append(name)
        # Build model
        model.fit(X_train, y_train)
        # Predict
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        # calculate accuracy
        Accuracy_train = metrics.accuracy_score(y_train, y_train_pred)
        accuracy_train_lst.append(Accuracy_train)
        Accuracy_test = metrics.accuracy_score(y_test, y_test_pred)
        accuracy_test_lst.append(Accuracy_test)
        # calculate auc
        Aucs_train = metrics.roc_auc_score(y_train, y_train_pred)
        aucs_train_lst.append(Aucs_train)
        Aucs_test = metrics.roc_auc_score(y_test, y_test_pred)
        aucs_test_lst.append(Aucs_test)
        # calculate precision
        PrecisionScore_train = metrics.precision_score(y_train, y_train_pred)
        precision_train_lst.append(PrecisionScore_train)
        PrecisionScore_test = metrics.precision_score(y_test, y_test_pred)
        precision_test_lst.append(PrecisionScore_test)
        # calculate recall
        RecallScore_train = metrics.recall_score(y_train, y_train_pred)
        recall_train_lst.append(RecallScore_train)
        RecallScore_test = metrics.recall_score(y_test, y_test_pred)
        recall_test_lst.append(RecallScore_test)
        # calculate f1 score
        F1Score_train = metrics.f1_score(y_train, y_train_pred)
        f1_train_lst.append(F1Score_train)
        F1Score_test = metrics.f1_score(y_test, y_test_pred)
        f1_test_lst.append(F1Score_test)
        # print('F1 Score of ' + name + ' model : {0:0.5f}'.format(F1Score_test))
        # calculate kappa statistic
        kappa = cohen_kappa_score(y_test, y_test_pred)
        kappa_lst.append(kappa)
        # draw confusion matrix
        cnf_matrix = metrics.confusion_matrix(y_test, y_test_pred)
        print("Model Name :", name)
        print('Train Accuracy :{0:0.5f}'.format(Accuracy_train))
        print('Test Accuracy :{0:0.5f}'.format(Accuracy_test))
        print('Train AUC : {0:0.5f}'.format(Aucs_train))
        print('Test AUC : {0:0.5f}'.format(Aucs_test))
        print('Train Precision : {0:0.5f}'.format(PrecisionScore_train))
        print('Test Precision : {0:0.5f}'.format(PrecisionScore_test))
        print('Train Recall : {0:0.5f}'.format(RecallScore_train))
        print('Test Recall : {0:0.5f}'.format(RecallScore_test))
        print('Train F1 : {0:0.5f}'.format(F1Score_train))
        print('Test F1 : {0:0.5f}'.format(F1Score_test))
        print('Kappa Statistic : {0:0.5f}'.format(kappa))
        print('Confusion Matrix : \n', cnf_matrix)
        print("\n")
        # plot ROC Curve
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred)
        auc = metrics.roc_auc_score(y_test, y_test_pred)
        plt.plot(fpr, tpr, linewidth=2, label=name + ", auc=" + str(auc))
    # ---------- For loop ends here ----------
    plt.legend(loc=4)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.rcParams['font.size'] = 12
    # plt.title('ROC curve for predicting anemia cases')
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.show()
**DECISION TREE**
DTmodels = []
dt = DecisionTreeClassifier()
DTmodels.append(('DT imbalance', dt,X_train,y_train,X_test,y_test))
DTmodels.append(('DT Undersampling', dt,X_train_rus,y_train_rus,X_test,y_test))
DTmodels.append(('DT Oversampling', dt,X_train_ros,y_train_ros,X_test,y_test))
DTmodels.append(('DT SMOTE', dt,X_train_smote,y_train_smote,X_test,y_test))
DTmodels.append(('DT ADASYN', dt,X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(DTmodels)
**RANDOM FOREST**
# Random Forest (RF)
RFmodels = []
RFmodels.append(('RF imbalance', RandomForestClassifier(),X_train,y_train,X_test,y_test))
RFmodels.append(('RF Undersampling',
RandomForestClassifier(),X_train_rus,y_train_rus,X_test,y_test))
RFmodels.append(('RF Oversampling',
RandomForestClassifier(),X_train_ros,y_train_ros,X_test,y_test))
RFmodels.append(('RF SMOTE',
RandomForestClassifier(),X_train_smote,y_train_smote,X_test,y_test))
RFmodels.append(('RF ADASYN',
RandomForestClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(RFmodels)
**KNN**
# K-Nearest Neighbors (KNN)
KNNmodels = []
KNNmodels.append(('KNN imbalance', KNeighborsClassifier(),X_train,y_train,X_test,y_test))
KNNmodels.append(('KNN Undersampling',
KNeighborsClassifier(),X_train_rus,y_train_rus,X_test,y_test))
KNNmodels.append(('KNN Oversampling',
KNeighborsClassifier(),X_train_ros,y_train_ros,X_test,y_test))
KNNmodels.append(('KNN SMOTE',
KNeighborsClassifier(),X_train_smote,y_train_smote,X_test,y_test))
KNNmodels.append(('KNN ADASYN',
KNeighborsClassifier(),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(KNNmodels)
**SVM**
# Support Vector Machines (SVM)
SVMmodels = []
SVMmodels.append(('SVM imbalance', SVC(gamma='auto'),X_train,y_train,X_test,y_test))
SVMmodels.append(('SVM Undersampling',
SVC(gamma='auto'),X_train_rus,y_train_rus,X_test,y_test))
SVMmodels.append(('SVM Oversampling',
SVC(gamma='auto'),X_train_ros,y_train_ros,X_test,y_test))
SVMmodels.append(('SVM SMOTE',
SVC(gamma='auto'),X_train_smote,y_train_smote,X_test,y_test))
SVMmodels.append(('SVM ADASYN',
SVC(gamma='auto'),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(SVMmodels)
# Gaussian Naive Bayes (NB)
NBmodels = []
NBmodels.append(('NB imbalance', GaussianNB(),X_train,y_train,X_test,y_test))
NBmodels.append(('NB Undersampling', GaussianNB(),X_train_rus,y_train_rus,X_test,y_test))
NBmodels.append(('NB Oversampling', GaussianNB(),X_train_ros,y_train_ros,X_test,y_test))
NBmodels.append(('NB SMOTE', GaussianNB(),X_train_smote,y_train_smote,X_test,y_test))
NBmodels.append(('NB ADASYN', GaussianNB(),X_train_adasyn,y_train_adasyn,X_test,y_test))
# Call function to create model and measure its performance
build_measure_model(NBmodels)
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Random Forest model training
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
# Predict from the test set
y_pred = random_forest.predict(X_test)
# Model evaluation
print(metrics.classification_report(y_test, y_pred))
print('Accuracy: {0:0.5f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('AUC: {0:0.5f}'.format(metrics.roc_auc_score(y_test, y_pred)))
print('Precision: {0:0.5f}'.format(metrics.precision_score(y_test, y_pred)))
print('Recall: {0:0.5f}'.format(metrics.recall_score(y_test, y_pred)))
print('F1: {0:0.5f}'.format(metrics.f1_score(y_test, y_pred)))
**PERFORMANCE MEASURES OF CLASSIFIERS**
data = {'Model':names_lst,
#'Accuracy_Train':accuracy_train_lst,
'Accuracy_Test':accuracy_test_lst,
#'AUC_Train':aucs_train_lst,
'AUC_Test':aucs_test_lst,
#'PrecisionScore_Train':precision_train_lst,
'PrecisionScore_Test':precision_test_lst,
#'RecallScore_Train':recall_train_lst,
'RecallScore_Test':recall_test_lst,
#'F1Score_Train':f1_train_lst,
'F1Score_Test':f1_test_lst,
'Kappa Stat' : kappa_lst
}
print("Performance measures of various classifiers: \n")
performance_df = pd.DataFrame(data)
performance_df = performance_df.round(3)
finaltable = performance_df.sort_values(['F1Score_Test','RecallScore_Test','AUC_Test'], ascending=False)
finaltable
finaltable.to_excel('my_table.xlsx', index=False)
**HYPERPARAMETER TUNING**
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
# define the classifiers to be evaluated
classifiers_ = [DecisionTreeClassifier(),
RandomForestClassifier(),
SVC(),
GaussianNB(),
LogisticRegression(),
KNeighborsClassifier()]
# define the parameter grids for each classifier
param_grids = [{'max_depth': range(1, 10), 'criterion': ['gini', 'entropy']},  # decision tree
               {'n_estimators': [50, 100, 200], 'max_depth': range(1, 10)},  # random forest
               {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},  # SVM
               {},  # naive bayes - no hyperparameters to tune here
               # Naive Bayes is a probabilistic classifier based on Bayes' theorem and the "naive" assumption
               # that the presence or absence of a particular feature is independent of the presence or absence
               # of any other feature; GaussianNB has no key hyperparameters worth searching in this setup.
               {'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},  # logistic regression
               {"n_neighbors": list(range(2, 60, 1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}]  # KNN
# perform GridSearchCV for each classifier
for clf, param_grid in zip(classifiers_, param_grids):
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    y_pred = grid_search.predict(X_test)
    print(clf.__class__.__name__, "best params:", grid_search.best_params_, "best score:", grid_search.best_score_)
    print("F1 score:", f1_score(y_test, y_pred))
**CROSS VALIDATION**
# perform 5-fold cross-validation for each classifier
for clf in classifiers_:
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print(clf.__class__.__name__, "mean accuracy:", scores.mean(), "std deviation:", scores.std())
from sklearn.tree import export_graphviz
import graphviz
#hyperparametered DT
dt_tuning = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
# Train the decision tree based on selected parameter
dt_tuning.fit(X_train, y_train)
# Generate a DOT file representing the decision tree
dot_data = export_graphviz(dt_tuning, out_file=None, feature_names=X_train.columns,
class_names=['Non Anemic', 'Anemic'], filled=True, rounded=True)
# Visualize the decision path for the new data point
graph = graphviz.Source(dot_data)
graph.render('decision_tree') # save the decision tree as a PDF file
graph
# Testing data
new_data = {'Gender': 1, 'Hemoglobin': 11, 'MCV': 50}
new_X = pd.DataFrame([new_data])
prediction = dt_tuning.predict(new_X)  # use the tuned decision tree fitted above
print('Prediction:', prediction)
import matplotlib.pyplot as plt
# Define the labels, scores and colors for each model (scores must stay aligned one-to-one with labels)
labels = ['Decision Tree', 'Random Forest', 'SVM', 'Naive Bayes', 'Logistic Regression', 'KNN']
scores = [1.00, 1.00, 0.994, 0.914, 0.935, 0.988]
colors = ['#50BFE6','#9C51B6','#FF5470','#0066CC','#FF5050','#E97451']
# Sort the scores and labels in descending order
sorted_scores, sorted_labels = zip(*sorted(zip(scores, labels), reverse=True))
# Set up the plot
fig, ax = plt.subplots(figsize=(12,8))
ax.bar(sorted_labels, sorted_scores, color=colors)
# Set the title and axis labels
ax.set_title('Comparison of Model Performance Grid search', fontsize=12, fontweight='bold')
ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Accuracy Score', fontsize=12, fontweight='bold')
# Set the tick font size
ax.tick_params(axis='both', which='major', labelsize=12)
# Add the accuracy score as text above each bar
for i, score in enumerate(sorted_scores):
    ax.text(i, score+0.01, f'{score*100:.1f}%', fontsize=12, ha='center')
# Remove spines
sns.despine(left=True, bottom=True)
# Display the plot
plt.show()
**EXPORTING THE MODEL**
import pickle
from sklearn.ensemble import RandomForestClassifier
# Save the Random Forest model as a pickle file
filename = 'random_forest_model.pkl'
with open(filename, 'wb') as f:
    pickle.dump(random_forest, f)
print("Random Forest model exported as pickle file:", filename)