Untitled2.ipynb - Colab

The document outlines a data analysis and machine learning workflow built around a heart disease dataset in Python, using pandas, seaborn, and scikit-learn. It covers data preprocessing, visualization of feature distributions and correlations, and the training of four classification models (Logistic Regression, Decision Tree, Random Forest, and SVM), each evaluated with accuracy scores, confusion matrices, classification reports, and ROC curves.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from google.colab import files

uploaded = files.upload()

import io
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Automatically get filename
filename = list(uploaded.keys())[0]

# Read the file
df = pd.read_csv(io.BytesIO(uploaded[filename]))

# Show DataFrame details
print(df.head())
print(df.info())
print(df.isnull().sum())

# Plot the target column
sns.countplot(x='target', data=df)
plt.title("Heart Disease Count (0=No, 1=Yes)")
plt.show()


[Link](text/csv) - 39689 bytes, last modified: 5/11/2025 - 100% done
Saving [Link] to [Link]
   age  sex  chest pain type  resting bp s  cholesterol  fasting blood sugar  \
0   40    1                2           140          289                    0
1   49    0                3           160          180                    0
2   37    1                2           130          283                    0
3   48    0                4           138          214                    0
4   54    1                3           150          195                    0

   resting ecg  max heart rate  exercise angina  oldpeak  ST slope  target
0            0             172                0      0.0         1       0
1            0             156                0      1.0         2       1
2            1              98                0      0.0         1       0
3            0             108                1      1.5         2       1
4            0             122                0      0.0         1       0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1190 non-null int64
1 sex 1190 non-null int64
2 chest pain type 1190 non-null int64
3 resting bp s 1190 non-null int64
4 cholesterol 1190 non-null int64
5 fasting blood sugar 1190 non-null int64
6 resting ecg 1190 non-null int64
7 max heart rate 1190 non-null int64
8 exercise angina 1190 non-null int64
9 oldpeak 1190 non-null float64
10 ST slope 1190 non-null int64
11 target 1190 non-null int64
dtypes: float64(1), int64(11)
memory usage: 111.7 KB
None
age 0
sex 0
chest pain type 0
resting bp s 0
cholesterol 0
fasting blood sugar 0
resting ecg 0
max heart rate 0
exercise angina 0
oldpeak 0
ST slope 0
target 0
dtype: int64
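
The countplot below gives a visual sense of class balance; it can also be checked numerically. A minimal sketch, assuming the df loaded above:

# Class balance as raw counts and as proportions
print(df['target'].value_counts())
print(df['target'].value_counts(normalize=True).round(3))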

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

sns.set(style="white")

fig, axes = plt.subplots(1, 3, figsize=(24, 8))  # Increase width and height for more space

# 1️⃣ Missing Values Heatmap
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=axes[0])
axes[0].set_title("Missing Values Heatmap")

# 2️⃣ Correlation Heatmap with increased spreading
corr = df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=axes[1], fmt='.2f', annot_kws={"size": 10})
axes[1].set_title("Correlation Between Features")

# 3️⃣ Categorical Pivot Heatmap: Sex vs Target Count
pivot_table = df.pivot_table(index='sex', columns='target', aggfunc='size', fill_value=0)
sns.heatmap(pivot_table, annot=True, fmt='d', cmap="YlGnBu", ax=axes[2])
axes[2].set_title("Target vs Sex Heatmap (Count)")

# Adjust layout to avoid overlap
plt.subplots_adjust(wspace=0.3)  # Increase space between subplots
plt.tight_layout()
plt.show()
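
A 12x12 annotated heatmap can be hard to scan; sorting the target column of the correlation matrix surfaces the features most associated with the outcome directly. A short sketch, assuming the numeric df above:

# Rank features by absolute correlation with the target
target_corr = df.corr()['target'].drop('target')
print(target_corr.reindex(target_corr.abs().sort_values(ascending=False).index))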

import seaborn as sns
import matplotlib.pyplot as plt

# Subplots: 3 rows, 2 columns = 6 plots
fig, axes = plt.subplots(3, 2, figsize=(15, 14))

# Cholesterol
sns.histplot(df['cholesterol'], kde=True, color='blue', ax=axes[0, 0])
axes[0, 0].set_title("Cholesterol Distribution")
axes[0, 0].set_xlabel("Cholesterol Level")
axes[0, 0].set_ylabel("Frequency")

# Age
sns.histplot(df['age'], kde=True, color='purple', ax=axes[0, 1])
axes[0, 1].set_title("Age Distribution")
axes[0, 1].set_xlabel("Age")
axes[0, 1].set_ylabel("Frequency")

# Max Heart Rate
sns.histplot(df['max heart rate'], kde=True, color='red', ax=axes[1, 0])
axes[1, 0].set_title("Max Heart Rate Distribution")
axes[1, 0].set_xlabel("Max Heart Rate")
axes[1, 0].set_ylabel("Frequency")

# Oldpeak
sns.histplot(df['oldpeak'], kde=True, color='green', ax=axes[1, 1])
axes[1, 1].set_title("Oldpeak (ST Depression) Distribution")
axes[1, 1].set_xlabel("Oldpeak")
axes[1, 1].set_ylabel("Frequency")

# Resting Blood Pressure
sns.histplot(df['resting bp s'], kde=True, color='orange', ax=axes[2, 0])
axes[2, 0].set_title("Resting Blood Pressure Distribution")
axes[2, 0].set_xlabel("Resting BP")
axes[2, 0].set_ylabel("Frequency")

# Fasting Blood Sugar
sns.histplot(df['fasting blood sugar'], kde=False, color='teal', ax=axes[2, 1])
axes[2, 1].set_title("Fasting Blood Sugar Distribution")
axes[2, 1].set_xlabel("Fasting Blood Sugar")
axes[2, 1].set_ylabel("Count")

# Adjust layout
plt.tight_layout()
plt.show()
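
The marginal distributions above pool both classes together; splitting a feature by outcome often shows more. A sketch of one such overlay, using only the column names shown above:

# Age distribution split by heart-disease status
sns.histplot(data=df, x='age', hue='target', kde=True, element='step')
plt.title("Age Distribution by Target (0=No, 1=Yes)")
plt.show()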

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

numerical_columns = ['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
                     'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina',
                     'oldpeak', 'ST slope']

df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

print(df.head())

        age  sex  chest pain type  resting bp s  cholesterol  \
0  0.244898  1.0         0.333333          0.70     0.479270
1  0.428571  0.0         0.666667          0.80     0.298507
2  0.183673  1.0         0.333333          0.65     0.469320
3  0.408163  0.0         1.000000          0.69     0.354892
4  0.530612  1.0         0.666667          0.75     0.323383

   fasting blood sugar  resting ecg  max heart rate  exercise angina  \
0                  0.0          0.0        0.788732              0.0
1                  0.0          0.0        0.676056              0.0
2                  0.0          0.5        0.267606              0.0
3                  0.0          0.0        0.338028              1.0
4                  0.0          0.0        0.436620              0.0

    oldpeak  ST slope  target
0  0.295455  0.333333       0
1  0.409091  0.666667       1
2  0.295455  0.333333       0
3  0.465909  0.666667       1
4  0.295455  0.333333       0
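
One caveat: the scaler above is fit on the full dataset before any train/test split, so the test rows influence the learned min/max (mild data leakage). A safer pattern, sketched here rather than taken from the notebook, splits first and fits the scaler on the training rows only (df here means the unscaled frame):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_raw = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)  # learn min/max from training rows only
X_test = scaler.transform(X_test)        # apply the same transform to test rows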

from sklearn.model_selection import train_test_split

X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training data size:", X_train.shape)


print("Test data size:", X_test.shape)

Training data size: (952, 11)


Test data size: (238, 11)
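
Because the split is random, the 0/1 ratio can drift between the two partitions; passing stratify=y preserves it. A one-line variant of the split above:

# Stratified split keeps the class ratio identical in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)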

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assume that df is already loaded and preprocessed

# Splitting the data into features (X) and target (y)
X = df.drop('target', axis=1)  # Dropping target column for features
y = df['target'] # Target column

# Splitting data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model 1: Logistic Regression
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# Model 2: Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Model 3: Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Model 4: Support Vector Machine (SVM)
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Evaluate models

# Logistic Regression
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print("Logistic Regression Confusion Matrix:\n", confusion_matrix(y_test, logreg_pred))

print("Logistic Regression Classification Report:\n", classification_report(y_test, logreg_pred))

# Decision Tree
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))
print("Decision Tree Classification Report:\n", classification_report(y_test, dt_pred))

# Random Forest
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))

# SVM
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Confusion Matrix:\n", confusion_matrix(y_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

Logistic Regression Accuracy: 0.8529411764705882
Logistic Regression Confusion Matrix:
 [[ 90  17]
 [ 18 113]]
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.84      0.84       107
           1       0.87      0.86      0.87       131

    accuracy                           0.85       238
   macro avg       0.85      0.85      0.85       238
weighted avg       0.85      0.85      0.85       238

Decision Tree Accuracy: 0.8991596638655462
Decision Tree Confusion Matrix:
 [[ 99   8]
 [ 16 115]]
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.93      0.89       107
           1       0.93      0.88      0.91       131

    accuracy                           0.90       238
   macro avg       0.90      0.90      0.90       238
weighted avg       0.90      0.90      0.90       238

Random Forest Accuracy: 0.9453781512605042
Random Forest Confusion Matrix:
 [[ 98   9]
 [  4 127]]
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.92      0.94       107
           1       0.93      0.97      0.95       131

    accuracy                           0.95       238
   macro avg       0.95      0.94      0.94       238
weighted avg       0.95      0.95      0.95       238

SVM Accuracy: 0.8571428571428571
SVM Confusion Matrix:
 [[ 90  17]
 [ 17 114]]
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84       107
           1       0.87      0.87      0.87       131

    accuracy                           0.86       238
   macro avg       0.86      0.86      0.86       238
weighted avg       0.86      0.86      0.86       238
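
A single 80/20 split can flatter or punish a model by chance, so the ranking above (Random Forest ahead of Decision Tree, SVM, and Logistic Regression) is worth re-checking with cross-validation. A short sketch using the X and y defined above (cross_val_score refits a fresh clone of each model per fold):

from sklearn.model_selection import cross_val_score

models = {
    "Logistic Regression": logreg_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
    "SVM": svm_model,
}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"{name}: mean accuracy {scores.mean():.3f} (+/- {scores.std():.3f})")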

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Model 4: Support Vector Machine (SVM), refit with probability=True so predict_proba is available for ROC curves
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Logistic Regression Evaluation
print("Logistic Regression Evaluation:")
logreg_accuracy = accuracy_score(y_test, logreg_pred)
print(f"Accuracy: {logreg_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, logreg_pred))
print("Classification Report:")
print(classification_report(y_test, logreg_pred))

# ROC Curve: Logistic Regression
fpr, tpr, thresholds = roc_curve(y_test, logreg_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
print("\n🔎 Logistic Regression Threshold Values (First 10):")
for i in range(min(10, len(thresholds))):
    print(f"Threshold: {thresholds[i]:.4f}, TPR: {tpr[i]:.4f}, FPR: {fpr[i]:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='blue', label=f'Logistic Regression (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
for i in range(0, len(thresholds), max(1, len(thresholds)//10)):
    plt.annotate(f'{thresholds[i]:.2f}', (fpr[i], tpr[i]), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
plt.title('ROC Curve with Thresholds (Logistic Regression)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Decision Tree Evaluation
print("\nDecision Tree Evaluation:")
dt_accuracy = accuracy_score(y_test, dt_pred)
print(f"Accuracy: {dt_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_pred))
print("Classification Report:")
print(classification_report(y_test, dt_pred))

# ROC Curve: Decision Tree
fpr, tpr, thresholds = roc_curve(y_test, dt_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
print("\n🔎 Decision Tree Threshold Values (First 10):")
for i in range(min(10, len(thresholds))):
    print(f"Threshold: {thresholds[i]:.4f}, TPR: {tpr[i]:.4f}, FPR: {fpr[i]:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='red', label=f'Decision Tree (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
for i in range(0, len(thresholds), max(1, len(thresholds)//10)):
    plt.annotate(f'{thresholds[i]:.2f}', (fpr[i], tpr[i]), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
plt.title('ROC Curve with Thresholds (Decision Tree)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# Random Forest Evaluation
print("\nRandom Forest Evaluation:")
rf_accuracy = accuracy_score(y_test, rf_pred)
print(f"Accuracy: {rf_accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, rf_pred))
print("Classification Report:")
print(classification_report(y_test, rf_pred))

# ROC Curve: Random Forest
fpr, tpr, thresholds = roc_curve(y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
print("\n🔎 Random Forest Threshold Values (First 10):")
for i in range(min(10, len(thresholds))):
    print(f"Threshold: {thresholds[i]:.4f}, TPR: {tpr[i]:.4f}, FPR: {fpr[i]:.4f}")

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='green', label=f'Random Forest (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
for i in range(0, len(thresholds), max(1, len(thresholds)//10)):
    plt.annotate(f'{thresholds[i]:.2f}', (fpr[i], tpr[i]), textcoords="offset points", xytext=(5, -10), ha='left', fontsize=8)
plt.title('ROC Curve with Thresholds (Random Forest)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# SVM Evaluation
print("\nSVM Evaluation:")