Untitled2.ipynb - Colab
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from google.colab import files
uploaded = files.upload()
import io
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Automatically get the uploaded filename
filename = list(uploaded.keys())[0]
# Read the file
df = pd.read_csv(io.BytesIO(uploaded[filename]))
# Show DataFrame details
print(df.head())
print(df.info())
print(df.isnull().sum())
# Plot the target column
sns.countplot(x='target', data=df)
plt.title("Heart Disease Count (0=No, 1=Yes)")
plt.show()
age sex chest pain type resting bp s cholesterol fasting blood sugar \
0 40 1 2 140 289 0
1 49 0 3 160 180 0
2 37 1 2 130 283 0
3 48 0 4 138 214 0
4 54 1 3 150 195 0
resting ecg max heart rate exercise angina oldpeak ST slope target
0 0 172 0 0.0 1 0
1 0 156 0 1.0 2 1
2 1 98 0 0.0 1 0
3 0 108 1 1.5 2 1
4 0 122 0 0.0 1 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1190 non-null int64
1 sex 1190 non-null int64
2 chest pain type 1190 non-null int64
3 resting bp s 1190 non-null int64
4 cholesterol 1190 non-null int64
5 fasting blood sugar 1190 non-null int64
6 resting ecg 1190 non-null int64
7 max heart rate 1190 non-null int64
8 exercise angina 1190 non-null int64
9 oldpeak 1190 non-null float64
10 ST slope 1190 non-null int64
11 target 1190 non-null int64
dtypes: float64(1), int64(11)
memory usage: 111.7 KB
None
age 0
sex 0
chest pain type 0
resting bp s 0
cholesterol 0
fasting blood sugar 0
resting ecg 0
max heart rate 0
exercise angina 0
oldpeak 0
ST slope 0
target 0
dtype: int64
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
sns.set(style="white")
fig, axes = plt.subplots(1, 3, figsize=(24, 8))  # Increase width and height for more space
# 1️⃣ Missing Values Heatmap
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=axes[0])
axes[0].set_title("Missing Values Heatmap")
# 2️⃣ Correlation Heatmap
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=axes[1], fmt='.2f', annot_kws={"size": 10})
axes[1].set_title("Correlation Between Features")
# 3️⃣ Categorical Pivot Heatmap: Sex vs Target Count
pivot_table = df.pivot_table(index='sex', columns='target', aggfunc='size', fill_value=0)
sns.heatmap(pivot_table, annot=True, fmt='d', cmap="YlGnBu", ax=axes[2])
axes[2].set_title("Target vs Sex Heatmap (Count)")
# Adjust layout to avoid overlap
plt.subplots_adjust(wspace=0.3)  # Increase space between subplots
plt.tight_layout()
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
# Subplots: 3 rows x 2 columns = 6 plots
fig, axes = plt.subplots(3, 2, figsize=(15, 14))
# Cholesterol
sns.histplot(df['cholesterol'], kde=True, color='blue', ax=axes[0, 0])
axes[0, 0].set_title("Cholesterol Distribution")
axes[0, 0].set_xlabel("Cholesterol Level")
axes[0, 0].set_ylabel("Frequency")
# Age
sns.histplot(df['age'], kde=True, color='purple', ax=axes[0, 1])
axes[0, 1].set_title("Age Distribution")
axes[0, 1].set_xlabel("Age")
axes[0, 1].set_ylabel("Frequency")
# Max Heart Rate
sns.histplot(df['max heart rate'], kde=True, color='red', ax=axes[1, 0])
axes[1, 0].set_title("Max Heart Rate Distribution")
axes[1, 0].set_xlabel("Max Heart Rate")
axes[1, 0].set_ylabel("Frequency")
# Oldpeak
sns.histplot(df['oldpeak'], kde=True, color='green', ax=axes[1, 1])
axes[1, 1].set_title("Oldpeak (ST Depression) Distribution")
axes[1, 1].set_xlabel("Oldpeak")
axes[1, 1].set_ylabel("Frequency")
# Resting Blood Pressure
sns.histplot(df['resting bp s'], kde=True, color='orange', ax=axes[2, 0])
axes[2, 0].set_title("Resting Blood Pressure Distribution")
axes[2, 0].set_xlabel("Resting BP")
axes[2, 0].set_ylabel("Frequency")
# Fasting Blood Sugar
sns.histplot(df['fasting blood sugar'], kde=False, color='teal', ax=axes[2, 1])
axes[2, 1].set_title("Fasting Blood Sugar Distribution")
axes[2, 1].set_xlabel("Fasting Blood Sugar")
axes[2, 1].set_ylabel("Count")
# Adjust layout
plt.tight_layout()
plt.show()
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical_columns = ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina',
'oldpeak', 'ST slope']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
print(df.head())
age sex chest pain type resting bp s cholesterol \
0 0.244898 1.0 0.333333 0.70 0.479270
1 0.428571 0.0 0.666667 0.80 0.298507
2 0.183673 1.0 0.333333 0.65 0.469320
3 0.408163 0.0 1.000000 0.69 0.354892
4 0.530612 1.0 0.666667 0.75 0.323383
fasting blood sugar resting ecg max heart rate exercise angina \
0 0.0 0.0 0.788732 0.0
1 0.0 0.0 0.676056 0.0
2 0.0 0.5 0.267606 0.0
3 0.0 0.0 0.338028 1.0
4 0.0 0.0 0.436620 0.0
oldpeak ST slope target
0 0.295455 0.333333 0
1 0.409091 0.666667 1
2 0.295455 0.333333 0
3 0.465909 0.666667 1
4 0.295455 0.333333 0
from sklearn.model_selection import train_test_split
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training data size:", X_train.shape)
print("Test data size:", X_test.shape)
Training data size: (952, 11)
Test data size: (238, 11)
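Note: the MinMaxScaler above was fit on the full dataset before splitting. A common alternative is to fit the scaler on the training split only and reuse its statistics for the test split; a minimal sketch under that assumption:
# Sketch: fit the scaler on the training data only, then apply it to both splits.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # learn min/max from the training split
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics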
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Assume that df is already loaded and preprocessed
# Splitting the data into features (X) and target (y)
X = df.drop('target', axis=1)  # Dropping target column for features
y = df['target'] # Target column
# Splitting data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Model 1: Logistic Regression
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)
# Model 2: Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
# Model 3: Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
# Model 4: Support Vector Machine (SVM)
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
# Evaluate models
# Logistic Regression
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, logreg_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, logreg_pred))
# Decision Tree
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))
print("Decision Tree Classification Report:\n", classification_report(y_test, dt_pred))
# Random Forest
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))
# SVM
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Confusion Matrix:\n", confusion_matrix(y_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))
Logistic Regression Accuracy: 0.8529411764705882
Logistic Regression Confusion Matrix:
[[ 90 17]
[ 18 113]]
Logistic Regression Classification Report:
precision recall f1-score support
0 0.83 0.84 0.84 107
1 0.87 0.86 0.87 131
accuracy 0.85 238
macro avg 0.85 0.85 0.85 238
weighted avg 0.85 0.85 0.85 238
Decision Tree Accuracy: 0.8991596638655462
Decision Tree Confusion Matrix:
[[ 99 8]
[ 16 115]]
Decision Tree Classification Report:
precision recall f1-score support
0 0.86 0.93 0.89 107
1 0.93 0.88 0.91 131
accuracy 0.90 238
macro avg 0.90 0.90 0.90 238
weighted avg 0.90 0.90 0.90 238
Random Forest Accuracy: 0.9453781512605042
Random Forest Confusion Matrix:
[[ 98 9]
[ 4 127]]
Random Forest Classification Report:
precision recall f1-score support
0 0.96 0.92 0.94 107
1 0.93 0.97 0.95 131
accuracy 0.95 238
macro avg 0.95 0.94 0.94 238
weighted avg 0.95 0.95 0.95 238
SVM Accuracy: 0.8571428571428571
SVM Confusion Matrix:
[[ 90 17]
[ 17 114]]
SVM Classification Report:
precision recall f1-score support
0 0.84 0.84 0.84 107
1 0.87 0.87 0.87 131
accuracy 0.86 238
macro avg 0.86 0.86 0.86 238
weighted avg 0.86 0.86 0.86 238
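A quick side-by-side comparison of the four classifiers can be built from the predictions above. This is a minimal sketch; it assumes the fitted models and predictions from the previous cell are still in memory.
# Sketch: collect the test-set accuracies of the four models into one table.
import pandas as pd
from sklearn.metrics import accuracy_score
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM'],
    'Accuracy': [accuracy_score(y_test, logreg_pred),
                 accuracy_score(y_test, dt_pred),
                 accuracy_score(y_test, rf_pred),
                 accuracy_score(y_test, svm_pred)]
}).sort_values('Accuracy', ascending=False)
print(results.to_string(index=False))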
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
# Model 4: Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
# Logistic Regression Evaluation
print("Logistic Regression Evaluation:")
logreg_accuracy = accuracy_score(y_test, logreg_pred)
print(f"Accuracy: {logreg_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, logreg_pred))
print("Classification Report:")
print(classification_report(y_test, logreg_pred))
# ROC Curve: Logistic Regression
fpr, tpr, thresholds = roc_curve(y_test, logreg_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
print("\n🔎 Logistic Regression Threshold Values (First 10):")
for i in range(min(10, len(thresholds))):
print(f"Threshold: {thresholds[i]:.4f}, TPR: {tpr[i]:.4f}, FPR: {fpr[i]:.4f}")
[Link](figsize=(10, 6))
[Link](fpr, tpr, color='blue', label=f'Logistic Regression (AUC = {roc_auc:.2f})')
[Link]([0, 1], [0, 1], color='gray', linestyle='--')
for i in range(0, len(thresholds), max(1, len(thresholds)//10)):
[Link](f'{thresholds[i]:.2f}', (fpr[i], tpr[i]), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
[Link]('ROC Curve with Thresholds (Logistic Regression)')
[Link]('False Positive Rate')
[Link]('True Positive Rate')
[Link](loc='lower right')
[Link](True)
[Link]()
# Decision Tree Evaluation
print("\nDecision Tree Evaluation:")
dt_accuracy = accuracy_score(y_test, dt_pred)
print(f"Accuracy: {dt_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_pred))
print("Classification Report:")
print(classification_report(y_test, dt_pred))
# ROC Curve: Decision Tree
fpr, tpr, thresholds = roc_curve(y_test, dt_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
print("\n🔎 Decision Tree Threshold Values (First 10):")
for i in range(min(10, len(thresholds))):
print(f"Threshold: {thresholds[i]:.4f}, TPR: {tpr[i]:.4f}, FPR: {fpr[i]:.4f}")
[Link](figsize=(10, 6))
[Link](fpr, tpr, color='red', label=f'Decision Tree (AUC = {roc_auc:.2f})')
[Link]([0, 1], [0, 1], color='gray', linestyle='--')
for i in range(0, len(thresholds), max(1, len(thresholds)//10)):
[Link](f'{thresholds[i]:.2f}', (fpr[i], tpr[i]), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
[Link]('ROC Curve with Thresholds (Decision Tree)')
[Link]('False Positive Rate')
[Link]('True Positive Rate')
[Link](loc='lower right')
[Link](True)
[Link]()
# Random Forest Evaluation
print("\nRandom Forest Evaluation:")
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Accuracy: {rf_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))
print("Classification Report:")
print(classification_report(y_test, rf_pred))
# ROC Curve: Random Forest
fpr, tpr, thresholds = roc_curve(y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
print("\n🔎 Random Forest Threshold Values (First 10):")
for i in range(min(10, len(thresholds))):
print(f"Threshold: {thresholds[i]:.4f}, TPR: {tpr[i]:.4f}, FPR: {fpr[i]:.4f}")
[Link](figsize=(10, 6))
[Link](fpr, tpr, color='green', label=f'Random Forest (AUC = {roc_auc:.2f})')
[Link]([0, 1], [0, 1], color='gray', linestyle='--')
for i in range(0, len(thresholds), max(1, len(thresholds)//10)):
[Link](f'{thresholds[i]:.2f}', (fpr[i], tpr[i]), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
[Link]('ROC Curve with Thresholds (Random Forest)')
[Link]('False Positive Rate')
[Link]('True Positive Rate')
[Link](loc='lower right')
[Link](True)
[Link]()
# SVM Evaluation
print("\nSVM Evaluation:")