0% found this document useful (0 votes)
8 views · 4 pages

CODE

The document outlines a machine learning workflow for disease classification using various models including Decision Tree, Random Forest, SVM, and Naive Bayes. It includes data preprocessing steps such as label encoding and oversampling to address class imbalance, followed by model training and evaluation using cross-validation and performance metrics like accuracy, precision, recall, and ROC-AUC. Visualizations such as confusion matrices and bar charts for model comparison are also included to present the results.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
0% found this document useful (0 votes)
8 views · 4 pages

CODE

The document outlines a machine learning workflow for disease classification using various models including Decision Tree, Random Forest, SVM, and Naive Bayes. It includes data preprocessing steps such as label encoding and oversampling to address class imbalance, followed by model training and evaluation using cross-validation and performance metrics like accuracy, precision, recall, and ROC-AUC. Visualizations such as confusion matrices and bar charts for model comparison are also included to present the results.
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Load the dataset and replace the "disease" column with integer class codes.
data = pd.read_csv('/content/improved_disease_dataset.csv')

encoder = LabelEncoder()
disease_codes = encoder.fit_transform(data["disease"])
data["disease"] = disease_codes

# Features are every column except the last one; the target is the last column.
# NOTE(review): presumably "disease" is the final column of the CSV — confirm,
# otherwise the encoded label would end up among the features.
last_col = data.shape[1] - 1
X = data.iloc[:, :last_col]
y = data.iloc[:, last_col]

# Count plot of the target column before any resampling is applied.
dist_fig = plt.figure(figsize=(18, 8))
dist_ax = dist_fig.gca()
sns.countplot(x=y, ax=dist_ax)
dist_ax.set_title("Disease Class Distribution Before Resampling")
plt.xticks(rotation=90)  # rotate the class tick labels by 90 degrees
dist_fig.tight_layout()
plt.show()

# Randomly oversample (fixed seed) and report the per-class counts afterwards.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

resampled_counts = pd.Series(y_resampled).value_counts()
print("Resampled Class Distribution:\n", resampled_counts)

# Post-process the resampled data so every estimator below can consume it.

# Encode the 'gender' column as integers when the dataset has one.
if 'gender' in X_resampled.columns:
    le = LabelEncoder()
    X_resampled['gender'] = le.fit_transform(X_resampled['gender'])

# Replace any remaining missing feature values with 0.
X_resampled = X_resampled.fillna(0)

# Flatten the target to 1-D if it came back with more than one dimension.
if len(y_resampled.shape) > 1:
    y_resampled = y_resampled.values.ravel()

# Tree-based candidates compared with stratified 5-fold cross-validation.
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
}

# Other scorers that work for multi-class targets: 'f1_weighted', 'roc_auc_ovr'.
cv_scoring = 'accuracy'
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): the folds are drawn from data that was already oversampled, so
# copies of the same row can land in both the training and validation part of a
# fold; the reported scores are therefore likely optimistic.
for model_name, model in models.items():
    try:
        # error_score='raise' makes a failing fit raise here instead of being
        # recorded as NaN, so the failure is reported below.
        scores = cross_val_score(
            model,
            X_resampled,
            y_resampled,
            cv=stratified_kfold,
            scoring=cv_scoring,
            n_jobs=-1,
            error_score='raise',
        )
    except Exception as e:
        # Best-effort comparison: report the failure and continue with the
        # next model rather than aborting the whole run.
        print("=" * 50)
        print(f"Model: {model_name} failed with error:")
        print(e)
    else:
        print("=" * 50)
        print(f"Model: {model_name}")
        print(f"Scores: {scores}")
        print(f"Mean Accuracy: {scores.mean():.4f}")

# Fit an SVM on the full resampled set and predict that same set, so the
# figures below describe training-set performance.
svm_model = SVC()
svm_model.fit(X_resampled, y_resampled)
svm_preds = svm_model.predict(X_resampled)

cf_matrix_svm = confusion_matrix(y_resampled, svm_preds)
svm_accuracy = accuracy_score(y_resampled, svm_preds)

# Heatmap of true vs. predicted class counts.
plt.figure(figsize=(12, 8))
sns.heatmap(cf_matrix_svm, annot=True, fmt="d")
plt.title("Confusion Matrix for SVM Classifier")
plt.show()

print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")

# Fit Gaussian Naive Bayes on the full resampled set and predict that same
# set, so the figures below describe training-set performance.
nb_model = GaussianNB()
nb_model.fit(X_resampled, y_resampled)
nb_preds = nb_model.predict(X_resampled)

cf_matrix_nb = confusion_matrix(y_resampled, nb_preds)
nb_accuracy = accuracy_score(y_resampled, nb_preds)

# Heatmap of true vs. predicted class counts.
plt.figure(figsize=(12, 8))
sns.heatmap(cf_matrix_nb, annot=True, fmt="d")
plt.title("Confusion Matrix for Naive Bayes Classifier")
plt.show()

print(f"Naive Bayes Accuracy: {nb_accuracy * 100:.2f}%")

# Fit a seeded Random Forest on the full resampled set and predict that same
# set, so the figures below describe training-set performance.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_resampled, y_resampled)
rf_preds = rf_model.predict(X_resampled)

cf_matrix_rf = confusion_matrix(y_resampled, rf_preds)

# Heatmap of true vs. predicted class counts.
plt.figure(figsize=(12, 8))
sns.heatmap(cf_matrix_rf, annot=True, fmt="d")
plt.title("Confusion Matrix for Random Forest Classifier")
plt.show()

# The f-string must stay on one line; it was previously split mid-literal.
print(f"Random Forest Accuracy: {accuracy_score(y_resampled, rf_preds) * 100:.2f}%")

# Combine the three classifiers by a per-row majority vote.
# This import rebinds `mode`, replacing any `mode` imported earlier in the file.
from statistics import mode

# On Python 3.8+ statistics.mode returns the first of several equally common
# values, so when all three models disagree the SVM prediction is used.
final_preds = [mode(votes) for votes in zip(svm_preds, nb_preds, rf_preds)]

cf_matrix_combined = confusion_matrix(y_resampled, final_preds)

# Heatmap of true vs. voted class counts.
plt.figure(figsize=(12, 8))
sns.heatmap(cf_matrix_combined, annot=True, fmt="d")
plt.title("Confusion Matrix for Combined Model")
plt.show()

# The f-string must stay on one line; it was previously split mid-expression.
print(f"Combined Model Accuracy: {accuracy_score(y_resampled, final_preds) * 100:.2f}%")

# Hold out 20% of the resampled data, stratified on the target, with a fixed seed.
from sklearn.model_selection import train_test_split

# NOTE(review): the split is made after oversampling, so copies of the same row
# can land in both the training and the test part.
split_parts = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    stratify=y_resampled,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Models to evaluate on the held-out part; the SVM is built with
# probability=True so that it exposes predict_proba.
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True),
}

# Binarize the test labels (one indicator column per class) for multi-class ROC-AUC.
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)

# Metric values of each model, keyed by model name.
results = {}

for model_name, model in models.items():
    print("=" * 50)
    print(f"Model: {model_name}")

    # Train on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    # Weighted averages; zero_division=0 scores a class with no predictions as 0.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # ROC-AUC needs class probabilities and more than one indicator column.
    # The result is named `roc_auc` so it does not overwrite the `auc`
    # function imported from sklearn.metrics.
    if y_prob is not None and y_test_bin.shape[1] > 1:
        roc_auc = roc_auc_score(y_test_bin, y_prob, multi_class='ovr')
    else:
        roc_auc = None

    # A missing ROC-AUC is stored as 0 so every model has a value per metric.
    results[model_name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1,
        "ROC-AUC": roc_auc if roc_auc is not None else 0,
    }

    # Print the results.
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")
    # Compare against None so that a genuine score of 0.0 is still printed.
    if roc_auc is not None:
        print(f"ROC-AUC: {roc_auc:.4f}")

# Grouped bar chart: one group per metric, one bar per model.
metrics = ["Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"]
model_names = list(results)

# Bar geometry: x holds the left-most bar position of each metric group.
bar_width = 0.15
x = np.arange(len(metrics))
fig, ax = plt.subplots(figsize=(12, 6))

# Draw the bars of each model, shifted by one bar width per model.
for i, name in enumerate(model_names):
    scores = [results[name][metric] for metric in metrics]
    bars = ax.bar(x + i * bar_width, scores, width=bar_width, label=name)
    # Write the value just above each bar.
    for rect in bars:
        height = rect.get_height()
        ax.text(
            rect.get_x() + rect.get_width() / 2.,
            height + 0.005,
            f'{height:.2f}',
            ha='center',
            va='bottom',
            fontsize=8,
        )

# Axis labels, title and ticks.
ax.set_xlabel('Metrics', fontsize=12)
ax.set_ylabel('Scores', fontsize=12)
ax.set_title('So sánh các mô hình theo các chỉ số đánh giá', fontsize=14)
# Centre each tick under its group for any number of models
# (for four models this equals the former fixed offset of 1.5 bar widths).
ax.set_xticks(x + bar_width * (len(model_names) - 1) / 2)
ax.set_xticklabels(metrics)
ax.legend()
plt.ylim(0, 1.1)  # headroom above 1.0 for the value labels
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

You might also like