Untitled5.ipynb - Colab
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from google.colab import files
uploaded = files.upload()
(Upload widget output: two CSV files, the train and test sets, about 150 KB each, uploaded successfully.)
train_path = "/content/[Link]"
test_path = "/content/[Link]"
import os
print([Link](train_path))
print([Link](test_path))
train_path = "[Link]"
test_path = "[Link]"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
True
True
# 1️⃣ Handling Missing Values
num_features = ["age", "avg_glucose_level", "bmi"]
num_imputer = SimpleImputer(strategy="median")
train_df[num_features] = num_imputer.fit_transform(train_df[num_features])
test_df[num_features] = num_imputer.transform(test_df[num_features])
# Handling categorical missing values separately
cat_features = ["gender", "ever_married", "work_type", "Residence_type", "smoking_status"]
cat_imputer = SimpleImputer(strategy="most_frequent")
train_df[cat_features] = cat_imputer.fit_transform(train_df[cat_features])
test_df[cat_features] = cat_imputer.transform(test_df[cat_features])
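# Sanity check (added here, not in the original notebook): the imputers should
# leave no missing values in the columns they cover.
print(train_df[num_features + cat_features].isna().sum().sum())
print(test_df[num_features + cat_features].isna().sum().sum())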
# Dropping missing target values (after checking distribution)
train_df = train_df.dropna(subset=["stroke"])
# 2️⃣ Fixing Outlier Issues
# Fixing Unrealistic Age Values (Remove Ages > 120)
train_df = train_df[train_df["age"] <= 120]
def cap_outliers(df, feature, lower_quantile=0.01, upper_quantile=0.99):
    lower_cap = df[feature].quantile(lower_quantile)
    upper_cap = df[feature].quantile(upper_quantile)
    df[feature] = np.clip(df[feature], lower_cap, upper_cap)
    return df
# Apply capping to BMI separately for train and test
train_df = cap_outliers(train_df, "bmi")
test_df = cap_outliers(test_df, "bmi")
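# Note: cap_outliers computes quantiles on whichever frame it receives, so the
# test set is capped by its own quantiles. A leakage-free alternative (a sketch,
# not the original approach) would derive the caps from train and reuse them:
# lower_cap = train_df["bmi"].quantile(0.01)
# upper_cap = train_df["bmi"].quantile(0.99)
# test_df["bmi"] = np.clip(test_df["bmi"], lower_cap, upper_cap)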
# Log transform avg_glucose_level separately per set
train_df['avg_glucose_level'] = np.log1p(train_df['avg_glucose_level'])
test_df['avg_glucose_level'] = np.log1p(test_df['avg_glucose_level'])
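# Optional check (added as a sketch): log1p should tame the long right tail of
# avg_glucose_level; skewness near 0 means the transform worked as intended.
print(train_df['avg_glucose_level'].skew())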
# 3️⃣ Handling "Unknown" in smoking_status
train_df['smoking_status_unknown'] = train_df['smoking_status'].eq('Unknown').astype(int) if 'smoking_status' in train
test_df['smoking_status_unknown'] = test_df['smoking_status'].eq('Unknown').astype(int) if 'smoking_status' in test_df
train_df.drop(columns=["smoking_status"], errors='ignore', inplace=True)
test_df.drop(columns=["smoking_status"], errors='ignore', inplace=True)
# 3️⃣ Handling "Unknown" in smoking_status
train_df['smoking_status_unknown'] = train_df['smoking_status'].eq('Unknown').astype(int) if 'smoking_status' in train
test_df['smoking_status_unknown'] = test_df['smoking_status'].eq('Unknown').astype(int) if 'smoking_status' in test_df
train_df.drop(columns=["smoking_status"], errors='ignore', inplace=True)
test_df.drop(columns=["smoking_status"], errors='ignore', inplace=True)
# Update cat_features after dropping smoking_status
cat_features = ["gender", "ever_married", "work_type", "Residence_type"] # Removed smoking_status
# 4️⃣ Encoding Categorical Variables
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
encoded_train = pd.DataFrame(encoder.fit_transform(train_df[cat_features]), columns=encoder.get_feature_names_out(cat_features))
encoded_test = pd.DataFrame(encoder.transform(test_df[cat_features]), columns=encoder.get_feature_names_out(cat_features))
# Reset index
encoded_train.reset_index(drop=True, inplace=True)
encoded_test.reset_index(drop=True, inplace=True)
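# The encoder is fit on train only, so both encoded frames should expose the
# same columns (added check).
assert list(encoded_train.columns) == list(encoded_test.columns)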
# 5️⃣ Standardizing Numerical Features
scaler = StandardScaler()
scaled_train = pd.DataFrame(scaler.fit_transform(train_df[num_features]), columns=num_features)
scaled_test = pd.DataFrame(scaler.transform(test_df[num_features]), columns=num_features)
# 6️⃣ Combining Processed Features
X_train_final = pd.concat([scaled_train, encoded_train, train_df[["hypertension", "heart_disease", "smoking_status_unknown"]].reset_index(drop=True)], axis=1)
X_test_final = pd.concat([scaled_test, encoded_test, test_df[["hypertension", "heart_disease", "smoking_status_unknown"]].reset_index(drop=True)], axis=1)
# Map the 'stroke' labels to 0/1 and fill any remaining NaN with 0 before casting
y_train_final = train_df["stroke"].map({'Yes': 1, 'No': 0}).fillna(0).astype(int).reset_index(drop=True)
# 7️⃣ Handling Class Imbalance Using SMOTE
smote = SMOTE(random_state=42, k_neighbors=min(5, y_train_final.value_counts().min() - 1))
# Cap k_neighbors at one less than the size of the smallest class so SMOTE always has enough neighbors
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_final, y_train_final)
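# Class balance before and after SMOTE (added check): resampling should leave
# both classes equally represented.
print(y_train_final.value_counts())
print(pd.Series(y_train_balanced).value_counts())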
# 8️⃣ Train-Test Split (Only Once!)
X_train, X_val, y_train, y_val = train_test_split(X_train_balanced, y_train_balanced, test_size=0.2, random_state=42, stratify=y_train_balanced)
# 9️⃣ Train Logistic Regression Model
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)
LogisticRegression(class_weight='balanced', max_iter=1000)
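# Inspecting the fitted model (a sketch, not in the original notebook): with
# standardized numeric features, coefficient magnitude gives a rough sense of
# each feature's influence on the log-odds of stroke.
coef_series = pd.Series(model.coef_[0], index=X_train_final.columns).sort_values()
print(coef_series)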
# Predictions
y_pred = model.predict(X_val)
y_probs = model.predict_proba(X_val)[:, 1]
# Evaluation Metrics
auc_score = roc_auc_score(y_val, y_probs)
f_beta = fbeta_score(y_val, y_pred, beta=10)
class_report = classification_report(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
# Display Metrics
print(f"AUC Score: {auc_score}")
print(f"F-beta Score (β=10): {f_beta}")
print("Classification Report:")
print(class_report)
print("Confusion Matrix:")
print(conf_matrix)
AUC Score: 0.8933009746138141
F-beta Score (β=10): 0.9974195801482286
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.74      0.85       510
           1       0.79      1.00      0.88       509

    accuracy                           0.87      1019
   macro avg       0.90      0.87      0.87      1019
weighted avg       0.90      0.87      0.87      1019

Confusion Matrix:
[[377 133]
 [  0 509]]
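# The beta=10 choice weights recall far above precision, so the default 0.5
# decision threshold is unlikely to be optimal for it. A minimal threshold sweep
# (a sketch added here; the grid is illustrative, not from the original notebook):
best_t, best_f = 0.5, f_beta
for t in np.arange(0.05, 0.95, 0.05):
    preds_t = (y_probs >= t).astype(int)
    score_t = fbeta_score(y_val, preds_t, beta=10)
    if score_t > best_f:
        best_t, best_f = t, score_t
print(f"Best threshold: {best_t:.2f}, F-beta: {best_f:.4f}")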
# 1️⃣0️⃣ Prepare Test Data for Submission
test_probs = model.predict_proba(X_test_final)[:, 1]
test_preds = model.predict(X_test_final)
# Save Processed Data
train_df.to_csv("train_cleaned.csv", index=False)
test_df.to_csv("test_cleaned.csv", index=False)
submission_df = pd.DataFrame({"id": test_df["id"], "stroke": test_preds})
submission_df.to_csv("[Link]", index=False)
print("Preprocessing and training complete! 🚀")
Preprocessing and training complete! 🚀