1stTask.ipynb
Group Members
1. 21/04905 James Wainaina Githirwa
2. 21/04883 Fabian Ndung'u
3. 21/06700 Peter Kamau
4. 21/04956 Oliver Samwel
5. 21/05462 Purity Njenga
6. 21/05041 Caleb Sirma
7. 21/05119 Bramwel Wanyoike
8. 19/02645 Ian Karanja
Import Libraries
This step imports the necessary libraries for data manipulation, modeling, and visualization.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, precision_recall_curve,
                             PrecisionRecallDisplay)
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)
Load and Preprocess Data
This function loads the dataset and performs initial preprocessing, including removing non-predictive columns and handling missing values.
# Load and preprocess data
def load_and_preprocess(filepath):
    data = pd.read_csv(filepath)

    # Remove non-predictive columns
    non_predictive = [
        "rapid_flu_results", "rapid_strep_results",
        "cxr_findings", "cxr_impression", "cxr_label", "cxr_link",
        "batch_date", "test_name", "swab_type"
    ]
    data.drop(columns=[col for col in non_predictive if col in data.columns], inplace=True)

    # Handle missing values: drop rows with no target label
    data.dropna(subset=["covid19_test_results"], inplace=True)

    # Numerical imputation with column means
    num_cols = data.select_dtypes(include=['number']).columns
    data[num_cols] = data[num_cols].fillna(data[num_cols].mean())

    # Categorical imputation with column modes
    cat_cols = data.select_dtypes(exclude=['number']).columns.drop('covid19_test_results')
    for col in cat_cols:
        data[col] = data[col].fillna(data[col].mode()[0])

    return data
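Mean and mode imputation are reasonable defaults, but it is worth knowing how much of each column is actually being filled before trusting them. A minimal audit sketch (the CSV filename here is hypothetical; substitute the actual dataset path):

# Fraction of missing values per column, before any imputation (sketch)
raw = pd.read_csv('covid_data.csv')   # hypothetical filename for illustration
missing = raw.isna().mean().sort_values(ascending=False)
print(missing[missing > 0])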
Handle Class Imbalance
This function balances the dataset by down-sampling the majority class (negative cases) to a fixed 5,000 rows.
# Handle class imbalance
def balance_dataset(data):
    majority = data[data['covid19_test_results'] == 'Negative']
    minority = data[data['covid19_test_results'] == 'Positive']
    # Downsample majority class without replacement
    majority_down = resample(majority,
                             replace=False,
                             n_samples=5000,
                             random_state=42)
    return pd.concat([majority_down, minority], ignore_index=True)
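As a sanity check, the class counts can be printed before and after downsampling; a minimal sketch, assuming data returned by load_and_preprocess:

# Class distribution before and after downsampling (sketch)
print(data['covid19_test_results'].value_counts())
print(balance_dataset(data)['covid19_test_results'].value_counts())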
Main Execution
This section executes the main workflow: loading and balancing the dataset, then encoding features and labels for modeling.
# Main execution
data = load_and_preprocess('[Link]')
balanced_data = balance_dataset(data)
# Preprocess for modeling
X = pd.get_dummies(balanced_data.drop('covid19_test_results', axis=1), drop_first=True)
y = balanced_data['covid19_test_results'].map({'Negative': 0, 'Positive': 1})
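One-hot encoding can multiply the column count considerably, so a quick inspection of the encoded design matrix and the binarized target is worthwhile:

# Inspect the encoded features and target (sketch)
print(X.shape)            # rows x one-hot encoded feature columns
print(y.value_counts())   # 0 = Negative, 1 = Positive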
Split Data
This step splits the data into training and testing sets, stratifying on the target variable so both sets preserve the class ratio.
# Split data before resampling (SMOTE must see only training data)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)
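To confirm the stratified split worked, both splits should show the same Positive fraction; a one-line check per split:

# Verify the stratified split preserved the class ratio (sketch)
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))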
Apply SMOTE
This step applies SMOTE to the training data only, synthesizing minority-class samples until both classes are equally represented; the test set is left untouched to avoid data leakage.
# Apply SMOTE only to training data
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

print("\nClass distribution after resampling:")
print(pd.Series(y_res).value_counts())
Class distribution after resampling:
covid19_test_results
0 3500
1 3500
Name: count, dtype: int64
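Fitting SMOTE after the split, as above, keeps synthetic samples out of the test set. If cross-validation were added later, the same guarantee can be kept by placing SMOTE inside an imblearn pipeline, so resampling happens independently within each fold. A sketch of that alternative (not part of the original workflow):

from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import cross_val_score

# SMOTE runs inside each CV fold; validation folds stay untouched
pipe = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=42))
])
print(cross_val_score(pipe, X_train, y_train, cv=5, scoring='f1').mean())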
Model Training and Evaluation
This section defines the models, trains them, and evaluates their performance using accuracy, classification reports, confusion matrices, and precision-recall curves.
# Model training and evaluation
models = {
    "Random Forest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(class_weight='balanced', random_state=42)
}

for name, model in models.items():
    # Training
    model.fit(X_res, y_res)

    # Prediction
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else [0]*len(y_test)

    # Evaluation
    print(f"\n{name} Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
[Link](figsize=(6,4))
[Link](cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['Predicted Negative', 'Predicted Positive'],
yticklabels=['Actual Negative', 'Actual Positive'])
[Link](f"{name} Confusion Matrix")
[Link]()
# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_proba)
disp = PrecisionRecallDisplay(precision=precision, recall=recall)
[Link]()
[Link](f"{name} Precision-Recall Curve")
[Link]()
# Feature Importance (if available)
if hasattr(model, 'feature_importances_'):
importances = [Link](model.feature_importances_, index=[Link])
top_features = importances.sort_values(ascending=False).head(10)
[Link](figsize=(10,6))
top_features.sort_values().[Link](color='darkgreen')
[Link](f"{name} - Top 10 Features")
[Link]("Importance Score")
plt.tight_layout()
[Link]()
Random Forest Evaluation:
Accuracy: 0.9714854111405835
              precision    recall  f1-score   support

    Negative       1.00      0.98      0.99      1500
    Positive       0.05      0.25      0.09         8

    accuracy                           0.97      1508
   macro avg       0.52      0.61      0.54      1508
weighted avg       0.99      0.97      0.98      1508
Gradient Boosting Evaluation:
Accuracy: 0.9602122015915119
              precision    recall  f1-score   support

    Negative       1.00      0.96      0.98      1500
    Positive       0.07      0.50      0.12         8

    accuracy                           0.96      1508
   macro avg       0.53      0.73      0.55      1508
weighted avg       0.99      0.96      0.98      1508
Decision Tree Evaluation:
Accuracy: 0.9602122015915119
              precision    recall  f1-score   support

    Negative       1.00      0.96      0.98      1500
    Positive       0.02      0.12      0.03         8

    accuracy                           0.96      1508
   macro avg       0.51      0.54      0.51      1508
weighted avg       0.99      0.96      0.97      1508
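Across all three models the support column tells the real story: only 8 positive cases remain in the test set, so the positive-class precision and recall are extremely noisy, and the high accuracy mostly reflects the 1500 negatives. A threshold-independent summary such as average precision (area under the precision-recall curve) is a fairer headline number; a minimal sketch, reusing y_test and the y_proba left over from the last model in the loop above:

from sklearn.metrics import average_precision_score

# Area under the PR curve; with only 8 positives it will still be high-variance
print("Average precision:", average_precision_score(y_test, y_proba))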