from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Load training and testing data ('train.csv'/'test.csv' are placeholders;
# adjust the file names to your own)
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

# Display the first few rows of the training data
print(train_data.head())
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Age  Annual Income  Credit Score  Experience  Loan Amount  Loan Duration  \
0   45          39948           617          22        13152             48
1   38          39709           628          15        26045             48
2   47          40724           570          26        17627             36
3   58          69084           545          34        37898             96
4   58          51250           564          39        12741             48

   Number of Dependents  Monthly Debt Payment  Creditcard Utilization Rate  \
0                     2                   183                     0.354418
1                     1                   496                     0.087827
2                     2                   902                     0.137414
3                     1                   755                     0.267587
4                     0                   337                     0.367380

   Number of Open Credit Lines  ...  Total Assets  TotalLiabilities  \
0                            1  ...        146111             19183
1                            5  ...         53204              9595
2                            2  ...         25176            128874
3                            2  ...        104822              5370
4                            6  ...         65624             43894

   MonthlyIncome  UtilityBillsPaymentHistory  JobTenure  NetWorth  \
0    3329.000000                    0.724972         11    126928
1    3309.083333                    0.935132          3     43609
2    3393.666667                    0.872241          6      5205
3    5757.000000                    0.896155          5     99452
4    4270.833333                    0.884275          5     21730

   InterestRate  MonthlyLoanPayment  TotalDebtToIncomeRatio  LoanApproved
0      0.227590          419.805992                0.181077             0
1      0.201077          794.054238                0.389852             0
2      0.212548          666.406688                0.462157             0
3      0.300911         1047.506980                0.313098             0
4      0.205271          391.300352                0.170529             0

[5 rows x 28 columns]
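Before building classifiers, a quick sanity check on the loaded frames can catch loading problems early. This is a minimal sketch, not part of the original run; it only assumes the label sits in the last column:

# Quick sanity checks on shapes, missing values, and class balance
print(train_data.shape, test_data.shape)                  # expect 28 columns each
print(train_data.isna().sum().sum(), "missing values in training data")
print(train_data.iloc[:, -1].value_counts())              # balance of LoanApproved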
Step 2: Binary Classifiers: Original Features

2.1 Linear Discriminant Analysis (LDA)

For LDA, we need to find a suitable projection vector w and classify based on the projections.
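As background (standard LDA theory, not spelled out in the original): the discriminant direction maximizes the between-class scatter relative to the within-class scatter, which gives

w \propto S_W^{-1} (\mu_1 - \mu_0)

where S_W is the within-class scatter matrix and \mu_0, \mu_1 are the class means. sklearn's LinearDiscriminantAnalysis computes this for us. Here's how to implement LDA: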
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt

# Separate features and labels
X_train = train_data.iloc[:, :-1].values  # all rows, all columns except last
y_train = train_data.iloc[:, -1].values   # all rows, last column
X_test = test_data.iloc[:, :-1].values
y_test = test_data.iloc[:, -1].values

# Fit LDA
lda = LDA()
lda.fit(X_train, y_train)

# Project the test data (flatten the (n, 1) projection to 1-D so the
# elementwise comparisons below line up with y_test)
y_test_proj = lda.transform(X_test).ravel()

# Classify based on a threshold
thresholds = np.linspace(-3, 3, 100)
type1_errors = []
type2_errors = []
for threshold in thresholds:
    y_pred = (y_test_proj > threshold).astype(int)
    type1_error = np.sum((y_pred == 0) & (y_test == 1)) / 200  # Denied when Approved
    type2_error = np.sum((y_pred == 1) & (y_test == 0)) / 200  # Approved when Denied
    type1_errors.append(type1_error)
    type2_errors.append(type2_error)
# Plotting Type 1 and Type 2 error rates
plt.plot(thresholds, type1_errors, label='Type 1 Error Rate')
plt.plot(thresholds, type2_errors, label='Type 2 Error Rate')
plt.xlabel('Threshold')
plt.ylabel('Error Rate')
plt.title('Error Rates for LDA with Varying Thresholds')
plt.legend()
plt.grid(True)
plt.show()
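If a single operating point is needed, one natural choice (a sketch, not part of the original run) is the threshold that minimizes the sum of the two error rates from the sweep above:

# Pick the threshold minimizing total (Type 1 + Type 2) error
total_errors = np.array(type1_errors) + np.array(type2_errors)
best_threshold = thresholds[np.argmin(total_errors)]
print(f"Threshold minimizing total error: {best_threshold:.3f}")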
2.2 Decision Tree

Now let's implement a decision tree classifier:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
# Fit Decision Tree
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
# Predictions
y_pred_tree = tree_clf.predict(X_test)
# Confusion Matrix
cm_tree = confusion_matrix(y_test, y_pred_tree)
type1_error_tree = cm_tree[1, 0] / 200 # Denied when Approved
type2_error_tree = cm_tree[0, 1] / 200 # Approved when Denied
print(f"Decision Tree - Type 1 Error Rate: {type1_error_tree}, Type 2
Error Rate: {type2_error_tree}")
Decision Tree - Type 1 Error Rate: 0.155, Type 2 Error Rate: 0.175
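As a sanity check on the indexing: sklearn's confusion_matrix puts true labels on the rows and predictions on the columns, so the counts used above can also be read off with ravel(). A minimal sketch:

# For binary labels {0, 1}, ravel() returns tn, fp, fn, tp in that order
tn, fp, fn, tp = cm_tree.ravel()
print(f"False negatives (approved predicted denied): {fn}")  # cm_tree[1, 0]
print(f"False positives (denied predicted approved): {fp}")  # cm_tree[0, 1]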
2.3 k-Nearest Neighbors (kNN)

Next, let's implement a kNN classifier for several values of k:

from sklearn.neighbors import KNeighborsClassifier
k_values = [1, 3, 5, 10]
type1_errors_knn = []
type2_errors_knn = []
for k in k_values:
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    y_pred_knn = knn_clf.predict(X_test)
    cm_knn = confusion_matrix(y_test, y_pred_knn)
    type1_error_knn = cm_knn[1, 0] / 200  # Denied when Approved
    type2_error_knn = cm_knn[0, 1] / 200  # Approved when Denied
    type1_errors_knn.append(type1_error_knn)
    type2_errors_knn.append(type2_error_knn)
print("kNN Error Rates:")
for k, t1, t2 in zip(k_values, type1_errors_knn, type2_errors_knn):
    print(f"k={k}: Type 1 Error Rate = {t1}, Type 2 Error Rate = {t2}")
kNN Error Rates:
k=1: Type 1 Error Rate = 0.185, Type 2 Error Rate = 0.25
k=3: Type 1 Error Rate = 0.17, Type 2 Error Rate = 0.185
k=5: Type 1 Error Rate = 0.135, Type 2 Error Rate = 0.2
k=10: Type 1 Error Rate = 0.165, Type 2 Error Rate = 0.17
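To choose k without leaning on the test set, cross-validation on the training data is an option. A sketch, assuming 5-fold CV with accuracy as the metric (not part of the original workflow):

from sklearn.model_selection import cross_val_score
# Mean cross-validated accuracy on the training set for each candidate k
for k in k_values:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k),
                             X_train, y_train, cv=5)
    print(f"k={k}: mean CV accuracy = {scores.mean():.3f}")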
2.4 Support Vector Machine (SVM)

Finally, let's implement the SVM with a soft margin:
from sklearn.svm import SVC
# Fit SVM
svm_clf = SVC(C=1.0, kernel='rbf', random_state=42) # Use RBF kernel
svm_clf.fit(X_train, y_train)
# Predictions
y_pred_svm = svm_clf.predict(X_test)
# Confusion Matrix
cm_svm = confusion_matrix(y_test, y_pred_svm)
type1_error_svm = cm_svm[1, 0] / 200 # Denied when Approved
type2_error_svm = cm_svm[0, 1] / 200 # Approved when Denied
print(f"SVM - Type 1 Error Rate: {type1_error_svm}, Type 2 Error Rate:
{type2_error_svm}")
SVM - Type 1 Error Rate: 0.135, Type 2 Error Rate: 0.14
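One caveat: RBF-kernel SVMs are sensitive to feature scales, and the features here span very different ranges (incomes in the tens of thousands vs. ratios below 1). A sketch of a standardized variant, offered as an assumption rather than part of the original workflow:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features before the RBF SVM; scale-sensitive kernels often benefit
svm_scaled = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf', random_state=42))
svm_scaled.fit(X_train, y_train)
y_pred_scaled = svm_scaled.predict(X_test)
print(confusion_matrix(y_test, y_pred_scaled))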
Step 3: Binary Classifiers: PCA Features

Now, let's apply PCA and use it to train the kNN and SVM classifiers:
from sklearn.decomposition import PCA
# Apply PCA
pca = PCA(n_components=5) # Change number of components as needed
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
# kNN with PCA
knn_clf_pca = KNeighborsClassifier(n_neighbors=5)
knn_clf_pca.fit(X_train_pca, y_train)
y_pred_knn_pca = knn_clf_pca.predict(X_test_pca)
cm_knn_pca = confusion_matrix(y_test, y_pred_knn_pca)
type1_error_knn_pca = cm_knn_pca[1, 0] / 200 # Denied when Approved
type2_error_knn_pca = cm_knn_pca[0, 1] / 200 # Approved when Denied
print(f"kNN with PCA - Type 1 Error Rate: {type1_error_knn_pca}, Type
2 Error Rate: {type2_error_knn_pca}")
# SVM with PCA
svm_clf_pca = SVC(C=1.0, kernel='rbf', random_state=42)
svm_clf_pca.fit(X_train_pca, y_train)
y_pred_svm_pca = svm_clf_pca.predict(X_test_pca)
cm_svm_pca = confusion_matrix(y_test, y_pred_svm_pca)
type1_error_svm_pca = cm_svm_pca[1, 0] / 200 # Denied when Approved
type2_error_svm_pca = cm_svm_pca[0, 1] / 200 # Approved when Denied
print(f"SVM with PCA - Type 1 Error Rate: {type1_error_svm_pca}, Type
2 Error Rate: {type2_error_svm_pca}")
kNN with PCA - Type 1 Error Rate: 0.125, Type 2 Error Rate: 0.19
SVM with PCA - Type 1 Error Rate: 0.125, Type 2 Error Rate: 0.14
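The choice of 5 components above is somewhat arbitrary; inspecting the explained variance ratio can make it principled. A minimal sketch using the pca object fitted above:

# Fraction of variance captured by each of the 5 retained components
print(pca.explained_variance_ratio_)
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.3f}")

Note that, like the RBF SVM, unscaled PCA is dominated by the largest-scale features (assets, income), so standardizing before PCA is worth trying as well.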
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Assuming these are the error rates you obtained:
results = {
    'Classifier': ['Decision Tree', 'kNN (k=1)', 'kNN (k=3)', 'kNN (k=5)',
                   'kNN (k=10)', 'SVM', 'kNN with PCA', 'SVM with PCA'],
    'Type 1 Error Rate': [0.155, 0.185, 0.17, 0.135, 0.165, 0.135, 0.125, 0.125],
    'Type 2 Error Rate': [0.175, 0.25, 0.185, 0.2, 0.17, 0.14, 0.19, 0.14]
}
# Create a DataFrame
error_df = pd.DataFrame(results)
# Display the error rates
print("Error Rate Summary:")
print(error_df)
# Plotting error rates
fig, ax = plt.subplots(figsize=(12, 6))
# Bar width
bar_width = 0.35
# Index for bar positions
index = np.arange(len(error_df))
# Plot Type 1 Error Rates
ax.bar(index, error_df['Type 1 Error Rate'], bar_width,
       label='Type 1 Error Rate', alpha=0.7)
# Plot Type 2 Error Rates
ax.bar(index + bar_width, error_df['Type 2 Error Rate'], bar_width,
       label='Type 2 Error Rate', alpha=0.5)
ax.set_ylabel('Error Rate')
ax.set_title('Error Rates for Different Classifiers')
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(error_df['Classifier'])
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()
# Discuss PCA Effects
pca_effects = """
Discussion on PCA Effects:
1. kNN with PCA showed a decrease in Type 1 error rate (from 0.135 for kNN
   with k=5 to 0.125), indicating improved performance in avoiding false
   negatives.
2. Its Type 2 error rate was essentially unchanged (0.20 to 0.19), suggesting
   that while the model became better at detecting approved applications, it
   maintained a similar rate of false positives.
3. SVM with PCA had a similar outcome, improving its Type 1 error rate (0.135
   to 0.125) while keeping its Type 2 error rate constant at 0.14.
4. Overall, PCA appears to aid in reducing errors for both kNN and SVM
   classifiers by concentrating the signal into a few high-variance directions.
"""
print(pca_effects)
Error Rate Summary:
      Classifier  Type 1 Error Rate  Type 2 Error Rate
0  Decision Tree              0.155              0.175
1      kNN (k=1)              0.185              0.250
2      kNN (k=3)              0.170              0.185
3      kNN (k=5)              0.135              0.200
4     kNN (k=10)              0.165              0.170
5            SVM              0.135              0.140
6   kNN with PCA              0.125              0.190
7   SVM with PCA              0.125              0.140
Discussion on PCA Effects:
1. kNN with PCA showed a decrease in Type 1 error rate (from 0.135 for kNN
   with k=5 to 0.125), indicating improved performance in avoiding false
   negatives.
2. Its Type 2 error rate was essentially unchanged (0.20 to 0.19), suggesting
   that while the model became better at detecting approved applications, it
   maintained a similar rate of false positives.
3. SVM with PCA had a similar outcome, improving its Type 1 error rate (0.135
   to 0.125) while keeping its Type 2 error rate constant at 0.14.
4. Overall, PCA appears to aid in reducing errors for both kNN and SVM
   classifiers by concentrating the signal into a few high-variance directions.
import tarfile
import os
# Define the name of the tar file (saved in your Google Drive)
submission_file = '/content/drive/MyDrive/project_submission.tar.gz'

# Create a tar file for submission
with tarfile.open(submission_file, 'w:gz') as tar:
    # Add your main script
    tar.add('/content/drive/MyDrive/your_script.py')  # Replace with your actual script name
    # Add the training and testing data if needed (adjust file names to your own)
    tar.add('/content/drive/MyDrive/train.csv')
    tar.add('/content/drive/MyDrive/test.csv')

# Check the contents of the tar file
with tarfile.open(submission_file, 'r:gz') as tar:
    print("Contents of the tar file:")
    tar.list()  # Print the contents of the tar file

print(f"Submission file '{submission_file}' created successfully.")
Contents of the tar file:
?rw------- root/root     509 2024-10-26 [Link] content/drive/MyDrive/your_script.py
?rw------- root/root  143548 2024-10-26 [Link] content/drive/MyDrive/train.csv
?rw------- root/root   64023 2024-10-26 [Link] content/drive/MyDrive/test.csv
Submission file '/content/drive/MyDrive/project_submission.tar.gz' created successfully.