import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    precision_score,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Load the diabetes dataset from the Excel workbook into a DataFrame.
data = pd.read_excel(io='health care diabetes.xlsx')
# Preview the first rows and the summary statistics. As bare expressions
# these are only rendered by an interactive/notebook session.
data.head()
data.describe()
# Report the mean of each feature of interest, one value per line.
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'Pregnancies', 'BMI'):
    print(data[feature].mean())
# Count, per feature, the rows whose value is exactly 0 (this script treats
# a zero reading as a missing/"null" value).
for feature in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'Pregnancies', 'BMI'):
    zero_rows = int((data[feature] == 0).sum())
    print(f'{feature}-', zero_rows)
# Features whose zero entries are treated as missing ("null") values.
selected_columns = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'Pregnancies',
    'BMI',
]
# Fraction of zero-valued rows per feature, expressed as a percentage.
null_percentage = data[selected_columns].eq(0).mean().mul(100)
# Show the per-feature percentage.
print("Percentage of Null Values for Each Column:")
print(null_percentage)
# Replace the zero placeholders of every selected feature with that
# feature's mean.
# NOTE(review): the mean is computed over the whole column, zeros included,
# so the imputed value is pulled toward zero — confirm this is intended.
for feature in selected_columns:
    feature_mean = data[feature].mean()
    data[feature] = data[feature].replace([0], [feature_mean])
# Summary statistics after the replacement (rendered only interactively).
data.describe()
# Re-compute the share of zero-valued entries per feature after the
# treatment above, to confirm the zeros were replaced.
null_percentage_treated = data[selected_columns].eq(0).mean().mul(100)
# Show the per-feature percentage.
print("Percentage of Null Values for Each Column after the null value treatment:")
print(null_percentage_treated)
# Sub-frame holding only the features under inspection.
columns = data[selected_columns]
# Draw one boxplot per feature on a single 12x8 figure to visualize outliers.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=columns, ax=ax)
ax.set_title("Boxplots for Numeric Columns")
plt.show()
# Finding the outlier count in the selected columns:
def find_outliers_iqr(data, column_name, *, whisker=1.5):
    """Count the values of one column that fall outside the IQR fences.

    A value is an outlier when it lies strictly below
    ``Q1 - whisker * IQR`` or strictly above ``Q3 + whisker * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        data: DataFrame that contains the column.
        column_name: Label of the column to inspect.
        whisker: Multiplier applied to the IQR to place the fences.
            Defaults to 1.5.

    Returns:
        The number of outlying rows, as a plain ``int``.
    """
    values = data[column_name]

    # First and third quartiles and the interquartile range between them.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Fences beyond which a value counts as an outlier.
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    # Count straight from the boolean mask instead of materializing a
    # filtered copy of the whole frame just to take its length.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return int(is_outlier.sum())
# Report the IQR-based outlier count for every selected feature.
for column_name in selected_columns:
    print(
        f"Number of outliers in the '{column_name}' column: "
        f"{find_outliers_iqr(data, column_name)}"
    )