import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Read the dataset and swap the textual disease names for integer class ids.
data = pd.read_csv('/content/improved_disease_dataset.csv')
encoder = LabelEncoder().fit(data["disease"])
data["disease"] = encoder.transform(data["disease"])

# All columns but the last are features; the last column is the target.
# NOTE(review): presumes "disease" is the final column -- confirm against the CSV.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Count plot of samples per encoded class, drawn before any resampling.
plt.figure(figsize=(18, 8))
sns.countplot(x=y)
plt.xticks(rotation=90)
plt.title("Disease Class Distribution Before Resampling")
plt.tight_layout()
plt.show()
# Balance the classes by randomly re-drawing rows of the under-represented
# classes; the seed is fixed so the same rows are duplicated on every run.
# NOTE(review): oversampling happens on the full dataset, so any later
# train/test split or cross-validation over X_resampled can place copies of
# the same row on both sides and report optimistic scores.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)
print("Resampled Class Distribution:\n", pd.Series(y_resampled).value_counts())

# Encode the optional 'gender' column as integers.
if 'gender' in X_resampled.columns:
    le = LabelEncoder()
    X_resampled['gender'] = le.fit_transform(X_resampled['gender'])

# Replace missing feature values with 0.
X_resampled = X_resampled.fillna(0)

# Flatten the target to one dimension if it came back 2-D.
if y_resampled.ndim > 1:
    y_resampled = y_resampled.values.ravel()
# Estimators compared with stratified k-fold cross-validation. They are
# seeded (like the splitter below) so repeated runs report the same scores.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Alternatives for multi-class problems: 'f1_weighted', 'roc_auc_ovr'.
cv_scoring = 'accuracy'
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models.items():
    try:
        # error_score='raise' surfaces fit/score failures as exceptions
        # instead of silently recording NaN for the failing fold.
        scores = cross_val_score(
            model,
            X_resampled,
            y_resampled,
            cv=stratified_kfold,
            scoring=cv_scoring,
            n_jobs=-1,
            error_score='raise',
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the next model.
        print("=" * 50)
        print(f"Model: {model_name} failed with error:")
        print(e)
    else:
        print("=" * 50)
        print(f"Model: {model_name}")
        print(f"Scores: {scores}")
        print(f"Mean Accuracy: {scores.mean():.4f}")
# From here on `mode` is statistics.mode; it replaces the scipy.stats.mode
# imported at the top of the file.
from statistics import mode


def _show_confusion_and_accuracy(y_true, y_pred, title, label):
    """Plot the confusion matrix for *y_pred* and print its accuracy.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        title: Title placed above the heat map.
        label: Name printed in front of the accuracy percentage.

    Returns:
        The confusion matrix computed by ``confusion_matrix``.
    """
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=True, fmt="d")
    plt.title(title)
    plt.show()
    print(f"{label} Accuracy: {accuracy_score(y_true, y_pred) * 100:.2f}%")
    return matrix


# NOTE: each model below is fitted and scored on the same data, so the
# printed figures are training-set accuracies, not generalisation estimates.
svm_model = SVC()
svm_model.fit(X_resampled, y_resampled)
svm_preds = svm_model.predict(X_resampled)
cf_matrix_svm = _show_confusion_and_accuracy(
    y_resampled, svm_preds, "Confusion Matrix for SVM Classifier", "SVM"
)

nb_model = GaussianNB()
nb_model.fit(X_resampled, y_resampled)
nb_preds = nb_model.predict(X_resampled)
cf_matrix_nb = _show_confusion_and_accuracy(
    y_resampled,
    nb_preds,
    "Confusion Matrix for Naive Bayes Classifier",
    "Naive Bayes",
)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_resampled, y_resampled)
rf_preds = rf_model.predict(X_resampled)
cf_matrix_rf = _show_confusion_and_accuracy(
    y_resampled,
    rf_preds,
    "Confusion Matrix for Random Forest Classifier",
    "Random Forest",
)

# Majority vote across the three classifiers, one sample at a time. When all
# three disagree, statistics.mode (Python 3.8+) returns the first value it
# saw, i.e. the SVM prediction.
final_preds = [
    mode([svm_vote, nb_vote, rf_vote])
    for svm_vote, nb_vote, rf_vote in zip(svm_preds, nb_preds, rf_preds)
]
cf_matrix_combined = _show_confusion_and_accuracy(
    y_resampled,
    final_preds,
    "Confusion Matrix for Combined Model",
    "Combined Model",
)
# Hold out 20% of the data, stratified by class, for evaluation.
# NOTE(review): the split is taken from the already-oversampled data, so
# duplicated rows can appear in both the train and the test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    stratify=y_resampled,
    random_state=42,
)

# Seeded where the estimator supports it so the metrics are reproducible.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True, random_state=42),
}

# Binarise the test labels for multi-class ROC-AUC. LabelBinarizer yields a
# single column for a two-class target, in which case ROC-AUC is skipped.
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)

# Metrics of each model, keyed by model name.
results = {}
for model_name, model in models.items():
    print("=" * 50)
    print(f"Model: {model_name}")

    # Train the model and predict on the held-out partition.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = (
        model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    )

    # Compute the metrics (weighted averages across classes).
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Stored as `roc_auc` rather than `auc` so the `auc` function imported
    # from sklearn.metrics is not overwritten.
    if y_prob is not None and y_test_bin.shape[1] > 1:
        roc_auc = roc_auc_score(y_test_bin, y_prob, multi_class='ovr')
    else:
        roc_auc = None

    # A missing ROC-AUC is recorded as 0.
    results[model_name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1,
        "ROC-AUC": roc_auc if roc_auc is not None else 0,
    }

    # Print the results.
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-score: {f1:.4f}")
    # Explicit None check so a genuine score of 0.0 is still reported.
    if roc_auc is not None:
        print(f"ROC-AUC: {roc_auc:.4f}")
# Grouped bar chart: one group per metric, one bar per model.
metrics = ["Accuracy", "Precision", "Recall", "F1-score", "ROC-AUC"]
model_names = list(results)

# Bar geometry: x holds the left-most bar position of each metric group.
bar_width = 0.15
x = np.arange(len(metrics))
fig, ax = plt.subplots(figsize=(12, 6))

# Draw the bars of each model, shifted right by one bar width per model.
for i, name in enumerate(model_names):
    scores = [results[name][metric] for metric in metrics]
    bar = ax.bar(x + i * bar_width, scores, width=bar_width, label=name)
    # Write the value just above each bar.
    for rect in bar:
        height = rect.get_height()
        ax.text(
            rect.get_x() + rect.get_width() / 2.,
            height + 0.005,
            f'{height:.2f}',
            ha='center',
            va='bottom',
            fontsize=8,
        )

# Axis labels, title and ticks.
ax.set_xlabel('Metrics', fontsize=12)
ax.set_ylabel('Scores', fontsize=12)
ax.set_title('So sánh các mô hình theo các chỉ số đánh giá', fontsize=14)
# Centre each tick under its group for any number of models
# (this equals bar_width * 1.5 when there are four models).
ax.set_xticks(x + bar_width * (len(model_names) - 1) / 2)
ax.set_xticklabels(metrics)
ax.legend()
plt.ylim(0, 1.1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()