Untitled5.ipynb - Colab

The document outlines a data preprocessing and modeling workflow for a stroke prediction task using Python libraries such as pandas, scikit-learn, and imbalanced-learn. It includes steps for handling missing values, capping outliers, encoding categorical variables, standardizing numerical features, balancing classes with SMOTE, and training a logistic regression model. The final evaluation metrics indicate strong performance, with an AUC of approximately 0.89 and an F-beta score (β=10) of approximately 0.997.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

from google.colab import files


uploaded = files.upload()

Choose Files  2 files

train.csv (text/csv) - 152301 bytes, last modified: 2/21/2025 - 100% done
test.csv (text/csv) - 146010 bytes, last modified: 2/21/2025 - 100% done
Saving train.csv to train.csv
Saving test.csv to test.csv
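Since files.upload() returns a dict keyed by filename, a minimal optional check of what was actually saved to the runtime is:

# List the names of the files just uploaded to the Colab runtime
print(list(uploaded.keys()))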

train_path = "/content/[Link]"
test_path = "/content/[Link]"
import os
print([Link](train_path))
print([Link](test_path))
train_path = "[Link]"
test_path = "[Link]"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

True
True

# 1️⃣ Handling Missing Values


num_features = ["age", "avg_glucose_level", "bmi"]
num_imputer = SimpleImputer(strategy="median")
train_df[num_features] = num_imputer.fit_transform(train_df[num_features])
test_df[num_features] = num_imputer.transform(test_df[num_features])

# Handling categorical missing values separately


cat_features = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
cat_imputer = SimpleImputer(strategy="most_frequent")
train_df[cat_features] = cat_imputer.fit_transform(train_df[cat_features])
test_df[cat_features] = cat_imputer.transform(test_df[cat_features])
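An optional sanity check after imputation, assuming both imputers ran as above, is that the remaining missing-value counts are all zero:

# Verify that imputation left no missing values in the processed columns
print(train_df[num_features + cat_features].isna().sum())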

# Dropping missing target values (after checking distribution)


train_df = train_df.dropna(subset=["stroke"])
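The distribution check mentioned in the comment above can be done with value_counts; a minimal sketch:

# Inspect the target distribution, including missing labels, before dropping them
print(train_df["stroke"].value_counts(dropna=False))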

# 2️⃣ Fixing Outlier Issues


# Fixing Unrealistic Age Values (Remove Ages > 120)
train_df = train_df[train_df["age"] <= 120]

def cap_outliers(df, feature, lower_quantile=0.01, upper_quantile=0.99):
    # Winsorize: clamp values outside the 1st-99th percentile range
    lower_cap = df[feature].quantile(lower_quantile)
    upper_cap = df[feature].quantile(upper_quantile)
    df[feature] = np.clip(df[feature], lower_cap, upper_cap)
    return df

# Apply capping to BMI separately for train and test


test_df = cap_outliers(test_df, "bmi")
train_df = cap_outliers(train_df, "bmi")
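A minimal optional check that the capping took effect: the BMI extremes should now sit at the 1st/99th-percentile caps rather than at the raw outliers.

# After capping, min/max should equal the 1st/99th percentile caps
print(train_df["bmi"].min(), train_df["bmi"].max())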

# Log transform avg_glucose_level separately per set


train_df['avg_glucose_level'] = np.log1p(train_df['avg_glucose_level'])
test_df['avg_glucose_level'] = np.log1p(test_df['avg_glucose_level'])
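log1p compresses the long right tail of glucose values; one optional way to confirm this is pandas' skew(), which should be much closer to 0 after the transform:

# Skewness near 0 indicates a roughly symmetric distribution after log1p
print(train_df['avg_glucose_level'].skew())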

# 3️⃣ Handling "Unknown" in smoking_status


train_df['smoking_status_unknown'] = train_df['smoking_status'].eq('Unknown').astype(int) if 'smoking_status' in train_df.columns else 0
test_df['smoking_status_unknown'] = test_df['smoking_status'].eq('Unknown').astype(int) if 'smoking_status' in test_df.columns else 0
train_df.drop(columns=["smoking_status"], errors='ignore', inplace=True)
test_df.drop(columns=["smoking_status"], errors='ignore', inplace=True)
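Optionally, assuming the smoking_status column was present so the flag is a real column, its mean gives the fraction of records whose status was "Unknown":

# Share of rows where smoking_status was "Unknown"
print(train_df['smoking_status_unknown'].mean())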


# Update cat_features after dropping smoking_status


cat_features = ["gender", "ever_married", "work_type", "Residence_type"] # Removed smoking_status

# 4️⃣ Encoding Categorical Variables


encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_train = pd.DataFrame(encoder.fit_transform(train_df[cat_features]), columns=encoder.get_feature_names_out(cat_features))

# Transform test data with the encoder fitted on train
encoded_test = pd.DataFrame(encoder.transform(test_df[cat_features]), columns=encoder.get_feature_names_out(cat_features))

# Reset index so the encoded frames align on position when concatenated later
encoded_train.reset_index(drop=True, inplace=True)
encoded_test.reset_index(drop=True, inplace=True)
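To see exactly which dummy columns the encoder produced (the names depend on the categories present in the training data):

# Column names follow the pattern <feature>_<category>, with each first category dropped
print(encoded_train.columns.tolist())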

# 5️⃣ Standardizing Numerical Features


scaler = StandardScaler()
scaled_train = pd.DataFrame(scaler.fit_transform(train_df[num_features]), columns=num_features)
scaled_test = pd.DataFrame(scaler.transform(test_df[num_features]), columns=num_features)
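Because the scaler was fitted on the training split, the scaled training features should have mean ≈ 0 and standard deviation ≈ 1; a minimal check:

# Means should be ~0 and standard deviations ~1 on the training split
print(scaled_train.mean().round(3))
print(scaled_train.std().round(3))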

# 6️⃣ Combining Processed Features


# Reset indices on the raw columns too, so pd.concat aligns rows by position rather than by the original (filtered) index
X_train_final = pd.concat([scaled_train, encoded_train, train_df[["hypertension", "heart_disease", "smoking_status_unknown"]].reset_index(drop=True)], axis=1)
X_test_final = pd.concat([scaled_test, encoded_test, test_df[["hypertension", "heart_disease", "smoking_status_unknown"]].reset_index(drop=True)], axis=1)

# Handle potential NaN values in 'stroke' column before conversion


y_train_final = train_df["stroke"].map({'Yes': 1, 'No': 0}).fillna(0).astype(int).reset_index(drop=True)  # Unmapped or missing labels default to 0 (no stroke)

# 7️⃣ Handling Class Imbalance Using SMOTE


smote = SMOTE(random_state=42, k_neighbors=min(5, y_train_final.value_counts().min() - 1))
# Cap k_neighbors at (minority class size - 1) so SMOTE never asks for more neighbors than exist
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_final, y_train_final)
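SMOTE oversamples the minority class until the two classes match; a minimal before/after check:

from collections import Counter

# Class counts before and after oversampling; the balanced counts should be equal
print("before:", Counter(y_train_final))
print("after: ", Counter(y_train_balanced))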

# 8️⃣ Train-Test Split (Only Once!)


X_train, X_val, y_train, y_val = train_test_split(X_train_balanced, y_train_balanced, test_size=0.2, random_state=42, stratify=y_train_balanced)  # stratified to preserve the 50/50 class balance in the validation split

# 9️⃣ Train Logistic Regression Model


model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', max_iter=1000)

# Predictions
y_pred = model.predict(X_val)
y_probs = model.predict_proba(X_val)[:, 1]

# Evaluation Metrics
auc_score = roc_auc_score(y_val, y_probs)
f_beta = fbeta_score(y_val, y_pred, beta=10)
class_report = classification_report(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Display Metrics
print(f"AUC Score: {auc_score}")
print(f"F-beta Score (β=10): {f_beta}")
print("Classification Report:")
print(class_report)
print("Confusion Matrix:")
print(conf_matrix)

AUC Score: 0.8933009746138141
F-beta Score (β=10): 0.9974195801482286
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.74      0.85       510
           1       0.79      1.00      0.88       509

    accuracy                           0.87      1019
   macro avg       0.90      0.87      0.87      1019
weighted avg       0.90      0.87      0.87      1019

Confusion Matrix:
[[377 133]
 [  0 509]]
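With β=10 the F-beta score weights recall roughly 100× more heavily than precision, which is why it is near 1.0 despite the 133 false positives: the confusion matrix shows zero missed stroke cases in validation. A quick check of the reported score against the confusion matrix:

# F_beta = (1 + beta^2) * P * R / (beta^2 * P + R), computed for the stroke class
beta = 10
precision = 509 / (509 + 133)  # TP / (TP + FP) from the confusion matrix
recall = 509 / (509 + 0)       # TP / (TP + FN)
print((1 + beta**2) * precision * recall / (beta**2 * precision + recall))  # ≈ 0.9974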

# 1️⃣0️⃣ Prepare Test Data for Submission


test_probs = model.predict_proba(X_test_final)[:, 1]
test_preds = model.predict(X_test_final)

# Save Processed Data


train_df.to_csv("train_cleaned.csv", index=False)
test_df.to_csv("test_cleaned.csv", index=False)
submission_df = pd.DataFrame({"id": test_df["id"], "stroke": test_preds})
submission_df.to_csv("submission.csv", index=False)

print("Preprocessing and training complete! 🚀")


Preprocessing and training complete! 🚀
