QUESTION 1:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Parenthesised so the import may legally span several lines; a bare trailing
# comma followed by a line break is a SyntaxError.
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# Synthetic, heavily imbalanced binary problem: roughly 95% class 0 and
# 5% class 1, with 1% of the labels assigned at random as noise.
X, y = make_classification(n_samples=5000, n_features=20, n_classes=2,
                           weights=[0.95, 0.05], flip_y=0.01, random_state=42)

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Logistic regression with default settings (no class re-weighting).
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# For binary targets these scorers report on the positive class (label 1),
# which is the minority class here.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Precision: 0.72
Recall: 0.32142857142857145
F1-score: 0.4444444444444444
Classification Report:
precision recall f1-score support
0 0.96 0.99 0.98 944
1 0.72 0.32 0.44 56
accuracy 0.95 1000
macro avg 0.84 0.66 0.71 1000
weighted avg 0.95 0.95 0.95 1000
# Confusion matrix of the held-out predictions (rows: true label,
# columns: predicted label), printed and then drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)
import seaborn as sns

# The same class names label both axes of the heatmap.
class_names = ['Not Fraud', 'Fraud']

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()
Confusion Matrix:
[[937 7]
[ 38 18]]
QUESTION 2:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_curve, classification_report
# Synthetic imbalanced binary problem: roughly 90% class 0 and 10% class 1,
# with 1% of the labels assigned at random as noise.
X, y = make_classification(n_samples=5000, n_features=20, n_classes=2,
                           weights=[0.9, 0.1], flip_y=0.01, random_state=42)

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# probability=True enables predict_proba through an internal, randomised
# calibration step; random_state pins it so the scores below (and everything
# derived from them) are reproducible from run to run.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Estimated probability of the positive class (label 1) for each test sample.
y_scores = svm_model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Precision-recall trade-off across all candidate thresholds.
plt.figure(figsize=(8,6))
plt.plot(recall, precision, marker='.', label="Precision-Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve for SVM")
plt.legend()
plt.grid()
plt.show()

# Hard predictions at a fixed probability cut-off.
threshold = 0.5
y_pred = (y_scores >= threshold).astype(int)
print(f"\nClassification Report at threshold={threshold}:\n")
print(classification_report(y_test, y_pred))

# precision and recall hold one more entry than thresholds (the final point
# has no associated threshold), so the last element is dropped to align them.
plt.figure(figsize=(8,6))
plt.plot(thresholds, precision[:-1], label="Precision")
plt.plot(thresholds, recall[:-1], label="Recall")
plt.xlabel("Decision Threshold")
plt.ylabel("Score")
plt.title("Precision and Recall vs Threshold")
plt.legend()
plt.grid()
plt.show()
Classification Report at threshold=0.5:
precision recall f1-score support
0 0.95 0.98 0.96 908
1 0.66 0.46 0.54 92
accuracy 0.93 1000
macro avg 0.80 0.72 0.75 1000
weighted avg 0.92 0.93 0.92 1000
QUESTION 3:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Parenthesised so the import may legally span several lines; a bare trailing
# comma followed by a line break is a SyntaxError.
from sklearn.metrics import (
    precision_recall_curve,
    classification_report,
    accuracy_score,
)

# Synthetic imbalanced binary problem: roughly 90% class 0 and 10% class 1,
# with 1% of the labels assigned at random as noise.
X, y = make_classification(n_samples=5000, n_features=20, n_classes=2,
                           weights=[0.9, 0.1], flip_y=0.01, random_state=42)

# Hold out 20% of the samples for evaluation. The split is done once: the
# repeated identical call produced the same partition and has been dropped.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Baseline: logistic regression without any class re-weighting.
logreg_baseline = LogisticRegression(random_state=42)
logreg_baseline.fit(X_train, y_train)
y_pred_baseline = logreg_baseline.predict(X_test)

# Probability of the positive class (label 1), used to trace the PR curve.
y_prob_baseline = logreg_baseline.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_prob_baseline)

print("BASELINE MODEL (Logistic Regression):")
print(f"Accuracy: {accuracy_score(y_test, y_pred_baseline):.4f}")
print(classification_report(y_test, y_pred_baseline))

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label="Baseline Model")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve for Logistic Regression")
plt.legend()
plt.grid()
plt.show()
BASELINE MODEL (Logistic Regression):
Accuracy: 0.9280
precision recall f1-score support
0 0.95 0.97 0.96 908
1 0.64 0.49 0.56 92
accuracy 0.93 1000
macro avg 0.80 0.73 0.76 1000
weighted avg 0.92 0.93 0.92 1000
# Class-weighted variant: same model and data as the baseline, but with
# class_weight='balanced' so the minority class is not swamped in training.
logreg_weighted = LogisticRegression(
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)  # fit() returns the estimator itself

# Hard labels for the report, positive-class probabilities for the PR curve.
y_pred_weighted = logreg_weighted.predict(X_test)
weighted_proba = logreg_weighted.predict_proba(X_test)
y_prob_weighted = weighted_proba[:, 1]

# Precision-recall curve of the weighted model (thresholds are not needed).
precision_w, recall_w, _ = precision_recall_curve(y_test, y_prob_weighted)

# Report accuracy and per-class metrics for the weighted model.
weighted_accuracy = accuracy_score(y_test, y_pred_weighted)
weighted_report = classification_report(y_test, y_pred_weighted)
print("MODEL WITH CLASS WEIGHTING:")
print(f"Accuracy: {weighted_accuracy:.4f}")
print(weighted_report)
MODEL WITH CLASS WEIGHTING:
Accuracy: 0.8410
precision recall f1-score support
0 0.98 0.84 0.91 908
1 0.35 0.85 0.50 92
accuracy 0.84 1000
macro avg 0.67 0.84 0.70 1000
weighted avg 0.92 0.84 0.87 1000
# Overlay the baseline and class-weighted precision-recall curves on a
# single figure so the two models can be compared directly.
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label="Baseline Model")
# The legend label must sit on one line: a plain string literal cannot
# contain a raw line break.
plt.plot(recall_w, precision_w, marker='.', linestyle='dashed',
         label="Class Weighted Model")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Comparison of Precision-Recall Curves")
plt.legend()
plt.grid()
plt.show()
QUESTION 4:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Parenthesised so the import may legally span several lines; a bare trailing
# comma followed by a line break is a SyntaxError.
from sklearn.metrics import (
    precision_recall_curve,
    classification_report,
    accuracy_score,
)

# Synthetic imbalanced binary problem: roughly 90% class 0 and 10% class 1,
# with 1% of the labels assigned at random as noise.
X, y = make_classification(n_samples=5000, n_features=20, n_classes=2,
                           weights=[0.9, 0.1], flip_y=0.01, random_state=42)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Probability of the positive class (label 1) for each test sample.
y_scores = model.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# precision has one more entry than thresholds: its final element has no
# matching threshold. Searching only precision[:-1] guarantees that best_idx
# is a valid index into thresholds (the unrestricted argmax could land on
# that extra element and raise IndexError on the lookup below).
# NOTE(review): maximising precision alone tends to select a very high
# threshold with low recall; confirm this is the intended criterion.
best_idx = np.argmax(precision[:-1])  # Index of maximum precision
best_threshold = thresholds[best_idx]
print(f"Optimal Decision Threshold for Maximum Precision: {best_threshold:.4f}")

# Re-label the test set using the selected threshold instead of the default.
y_pred_optimized = (y_scores >= best_threshold).astype(int)
print("Model Evaluation with Optimized Threshold:")
print(classification_report(y_test, y_pred_optimized))
Optimal Decision Threshold for Maximum Precision: 0.9439
Model Evaluation with Optimized Threshold:
precision recall f1-score support
0 0.92 1.00 0.96 908
1 1.00 0.09 0.16 92
accuracy 0.92 1000
macro avg 0.96 0.54 0.56 1000
weighted avg 0.92 0.92 0.88 1000
# Draw the precision-recall curve and mark the operating point that
# corresponds to the selected threshold index.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', label="Precision-Recall Curve")

# Coordinates of the selected point on the curve.
chosen_recall = recall[best_idx]
chosen_precision = precision[best_idx]
ax.scatter(chosen_recall, chosen_precision, marker='o', color='red',
           label="Optimal Point")

ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve with Optimal Threshold")
ax.legend()
ax.grid()
plt.show()