Use the Naive Bayes, K-Nearest Neighbors, and Decision Tree classification algorithms to build classifiers.
Divide the data set into a training set and a test set, and compare the accuracy of the different
classifiers under the following situations:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Load the Iris dataset bundled with scikit-learn.
iris = load_iris()

# Assemble a DataFrame from the four measurement columns.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Attach the integer-encoded species label as an extra column.
iris_df['species'] = iris.target

# Feature matrix and label vector shared by every experiment below.
X = iris.data
y = iris.target

# Show the first five rows as a sanity check.
print(iris_df.head(5))
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  species
0                5.1               3.5               1.4               0.2        0
1                4.9               3.0               1.4               0.2        0
2                4.7               3.2               1.3               0.2        0
3                4.6               3.1               1.5               0.2        0
4                5.0               3.6               1.4               0.2        0
Splitting the data into training and test sets (75% training, 25% test)
# Define the three classifiers compared throughout this exercise.
nb_classifier = GaussianNB()
knn_classifier = KNeighborsClassifier()
dt_classifier = DecisionTreeClassifier()

# Splitting the data into training and test sets (75% training, 25% test).
# NOTE: the original text had this comment hard-wrapped so that "test)"
# became a stray statement (SyntaxError); fixed here.
# random_state=42 fixes the shuffle so results are reproducible.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(
    X, y, test_size=0.25, random_state=42)

# Train each classifier on the training set.
nb_classifier.fit(X_train_a, y_train_a)
knn_classifier.fit(X_train_a, y_train_a)
dt_classifier.fit(X_train_a, y_train_a)

# Make predictions on the held-out test set.
nb_pred_a = nb_classifier.predict(X_test_a)
knn_pred_a = knn_classifier.predict(X_test_a)
dt_pred_a = dt_classifier.predict(X_test_a)

# Calculate accuracy scores for each classifier.
nb_accuracy_a = accuracy_score(y_test_a, nb_pred_a)
knn_accuracy_a = accuracy_score(y_test_a, knn_pred_a)
dt_accuracy_a = accuracy_score(y_test_a, dt_pred_a)

print("Accuracy using 75-25 split:")
print("Naive Bayes Classifier Accuracy:", nb_accuracy_a)
print("K-Nearest Neighbors Classifier Accuracy:", knn_accuracy_a)
print("Decision Tree Classifier Accuracy:", dt_accuracy_a)
Accuracy using 75-25 split:
Naive Bayes Classifier Accuracy: 1.0
K-Nearest Neighbors Classifier Accuracy: 1.0
Decision Tree Classifier Accuracy: 1.0
Splitting the data into training and test sets (2/3rd training, 1/3rd test)
# Re-split the data: two-thirds training, one-third test (same seed).
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Refit all three classifiers on the new, smaller training set.
for _clf in (nb_classifier, knn_classifier, dt_classifier):
    _clf.fit(X_train_b, y_train_b)

# Predict on the corresponding test set.
nb_pred_b = nb_classifier.predict(X_test_b)
knn_pred_b = knn_classifier.predict(X_test_b)
dt_pred_b = dt_classifier.predict(X_test_b)

# Score each model's predictions against the true labels.
nb_accuracy_b = accuracy_score(y_test_b, nb_pred_b)
knn_accuracy_b = accuracy_score(y_test_b, knn_pred_b)
dt_accuracy_b = accuracy_score(y_test_b, dt_pred_b)

print("\nAccuracy using 66.6-33.3 split:")
print("Naive Bayes Classifier Accuracy:", nb_accuracy_b)
print("K-Nearest Neighbors Classifier Accuracy:", knn_accuracy_b)
print("Decision Tree Classifier Accuracy:", dt_accuracy_b)
Accuracy using 66.6-33.3 split:
Naive Bayes Classifier Accuracy: 0.96
K-Nearest Neighbors Classifier Accuracy: 0.98
Decision Tree Classifier Accuracy: 1.0
5.2 (i) Hold-out Method:
# Hold-out method: a single fixed 80/20 train/test split.
# NOTE: the original text broke this assignment across two lines right
# after the "=" (SyntaxError); rejoined here with an explicit
# parenthesized continuation.
X_train_holdout, X_test_holdout, y_train_holdout, y_test_holdout = (
    train_test_split(X, y, test_size=0.2, random_state=42))

# Train each classifier on the hold-out training set.
nb_classifier.fit(X_train_holdout, y_train_holdout)
knn_classifier.fit(X_train_holdout, y_train_holdout)
dt_classifier.fit(X_train_holdout, y_train_holdout)

# Make predictions on the hold-out test set.
nb_pred_holdout = nb_classifier.predict(X_test_holdout)
knn_pred_holdout = knn_classifier.predict(X_test_holdout)
dt_pred_holdout = dt_classifier.predict(X_test_holdout)

# Calculate accuracy scores on the hold-out test set.
nb_accuracy_holdout = accuracy_score(y_test_holdout, nb_pred_holdout)
knn_accuracy_holdout = accuracy_score(y_test_holdout, knn_pred_holdout)
dt_accuracy_holdout = accuracy_score(y_test_holdout, dt_pred_holdout)

print("\nAccuracy using Hold-out Method:")
print("Naive Bayes Classifier Accuracy:", nb_accuracy_holdout)
print("K-Nearest Neighbors Classifier Accuracy:", knn_accuracy_holdout)
print("Decision Tree Classifier Accuracy:", dt_accuracy_holdout)
Accuracy using Hold-out Method:
Naive Bayes Classifier Accuracy: 1.0
K-Nearest Neighbors Classifier Accuracy: 1.0
Decision Tree Classifier Accuracy: 1.0
(ii) Random subsampling
# Random Subsampling: repeat a random 80/20 split several times and
# average the test accuracies. No random_state is passed, so each
# iteration draws a different split (results vary run to run).
# NOTE: the original text had the loop body flush-left (indentation lost
# in extraction) and the train_test_split assignment broken after "=";
# both repaired here.
accuracies_nb = []
accuracies_knn = []
accuracies_dt = []

for _ in range(10):  # Perform 10 random subsampling iterations
    X_train_sub, X_test_sub, y_train_sub, y_test_sub = (
        train_test_split(X, y, test_size=0.2))

    # Train classifiers on this iteration's training set.
    nb_classifier.fit(X_train_sub, y_train_sub)
    knn_classifier.fit(X_train_sub, y_train_sub)
    dt_classifier.fit(X_train_sub, y_train_sub)

    # Make predictions on this iteration's test set.
    nb_pred_sub = nb_classifier.predict(X_test_sub)
    knn_pred_sub = knn_classifier.predict(X_test_sub)
    dt_pred_sub = dt_classifier.predict(X_test_sub)

    # Record the accuracy of each classifier for this split.
    accuracies_nb.append(accuracy_score(y_test_sub, nb_pred_sub))
    accuracies_knn.append(accuracy_score(y_test_sub, knn_pred_sub))
    accuracies_dt.append(accuracy_score(y_test_sub, dt_pred_sub))

# Average accuracy over all iterations.
avg_accuracy_nb = sum(accuracies_nb) / len(accuracies_nb)
avg_accuracy_knn = sum(accuracies_knn) / len(accuracies_knn)
avg_accuracy_dt = sum(accuracies_dt) / len(accuracies_dt)

print("\nAverage accuracy using Random Subsampling:")
print("Naive Bayes Classifier Accuracy:", avg_accuracy_nb)
print("K-Nearest Neighbors Classifier Accuracy:", avg_accuracy_knn)
print("Decision Tree Classifier Accuracy:", avg_accuracy_dt)
Average accuracy using Random Subsampling:
Naive Bayes Classifier Accuracy: 0.9366666666666668
K-Nearest Neighbors Classifier Accuracy: 0.9566666666666667
Decision Tree Classifier Accuracy: 0.9400000000000001
(iii) Cross Validation
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation: each estimator is fit and scored on five
# train/validation folds over the full dataset.
cv_scores_nb = cross_val_score(nb_classifier, X, y, cv=5)
cv_scores_knn = cross_val_score(knn_classifier, X, y, cv=5)
cv_scores_dt = cross_val_score(dt_classifier, X, y, cv=5)

# Report the mean fold accuracy per classifier.
print("\nCross-validation scores:")
print("Naive Bayes Classifier Accuracy:", cv_scores_nb.mean())
print("K-Nearest Neighbors Classifier Accuracy:", cv_scores_knn.mean())
print("Decision Tree Classifier Accuracy:", cv_scores_dt.mean())
Cross-validation scores:
Naive Bayes Classifier Accuracy: 0.9533333333333334
K-Nearest Neighbors Classifier Accuracy: 0.9733333333333334
Decision Tree Classifier Accuracy: 0.9600000000000002
5.3 Data is scaled to standard format.
from sklearn.preprocessing import StandardScaler

# Standardize features to zero mean / unit variance.
# NOTE(review): fit_transform is applied to the FULL dataset before the
# split, so test-set statistics leak into the scaler — acceptable for a
# teaching exercise, but in practice fit the scaler on the training set
# only and transform the test set with it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the scaled data into training and test sets (80/20).
# NOTE: the original text broke this assignment right after "=" across
# two lines (SyntaxError); rejoined here.
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = (
    train_test_split(X_scaled, y, test_size=0.2, random_state=42))

# Train classifiers on the scaled training set.
nb_classifier.fit(X_train_scaled, y_train_scaled)
knn_classifier.fit(X_train_scaled, y_train_scaled)
dt_classifier.fit(X_train_scaled, y_train_scaled)

# Make predictions on the scaled test set.
nb_pred_scaled = nb_classifier.predict(X_test_scaled)
knn_pred_scaled = knn_classifier.predict(X_test_scaled)
dt_pred_scaled = dt_classifier.predict(X_test_scaled)

# Calculate accuracy scores on the scaled data.
nb_accuracy_scaled = accuracy_score(y_test_scaled, nb_pred_scaled)
knn_accuracy_scaled = accuracy_score(y_test_scaled, knn_pred_scaled)
dt_accuracy_scaled = accuracy_score(y_test_scaled, dt_pred_scaled)

print("\nAccuracy after scaling the data:")
print("Naive Bayes Classifier Accuracy:", nb_accuracy_scaled)
print("K-Nearest Neighbors Classifier Accuracy:", knn_accuracy_scaled)
print("Decision Tree Classifier Accuracy:", dt_accuracy_scaled)
Accuracy after scaling the data:
Naive Bayes Classifier Accuracy: 1.0
K-Nearest Neighbors Classifier Accuracy: 1.0
Decision Tree Classifier Accuracy: 1.0