Hybrid Regression-Classification Model for Credit Default Risk:
Detailed Implementation Guide
I'll walk you through a comprehensive approach to this project, from data preparation to model evaluation.
Step 1: Project Setup and Data Acquisition
1. Download the dataset:
o Use the LendingClub accepted loans dataset from
Kaggle: accepted_2007_to_2018Q4.csv.gz
o For initial exploration, you can start with a smaller
sample (first 100,000 rows)
2. Set up your environment:
o Python 3.7+ with Jupyter Notebook or VS Code
o Required packages: pandas, numpy, scikit-learn, matplotlib, seaborn, xgboost (optional) - a quick setup sketch follows below
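A minimal setup sketch (the package list is the one above; the nrows load is one way to take the 100,000-row sample suggested in step 1):
python
# Install the required packages first, e.g.:
#   pip install pandas numpy scikit-learn matplotlib seaborn xgboost
import pandas as pd

# Optional: load only the first 100,000 rows for faster initial exploration
df_sample = pd.read_csv('accepted_2007_to_2018Q4.csv.gz',
                        compression='gzip', nrows=100_000)
print(df_sample.shape)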
Step 2: Data Preprocessing
2.1 Initial Data Exploration
python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data (adjust path as needed)
df = pd.read_csv('accepted_2007_to_2018Q4.csv.gz', compression='gzip', low_memory=True)

# Initial exploration
print(f"Dataset shape: {df.shape}")
print("Columns:\n", df.columns.tolist())
print("Data types:\n", df.dtypes.value_counts())

# Check target variable options (loan status)
print("Loan status value counts:\n", df['loan_status'].value_counts())
2.2 Define Target Variables
python
# Classification target: Default (1) vs Non-Default (0)
df['default'] = df['loan_status'].apply(lambda x: 1 if x in ['Charged Off', 'Default'] else 0)

# Regression target: default amount (for defaulted loans only).
# We'll use 'total_rec_prncp' (principal received) to calculate the loss amount.
df['default_amount'] = df.apply(
    lambda row: row['loan_amnt'] - row['total_rec_prncp'] if row['default'] == 1 else 0,
    axis=1
)
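Before going further, it's worth checking how imbalanced the new target is; this motivates the class_weight='balanced' setting used later:
python
# Share of defaulted vs non-defaulted loans
print(df['default'].value_counts(normalize=True))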
2.3 Feature Selection and Engineering
python
# Select relevant features (expand based on your analysis)
features = [
    'loan_amnt', 'term', 'int_rate', 'grade', 'sub_grade',
    'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
    'purpose', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high',
    'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
    'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
    'initial_list_status', 'collections_12_mths_ex_med', 'mths_since_last_major_derog',
    'application_type', 'mort_acc', 'pub_rec_bankruptcies'
]

# Create feature subset
df = df[features + ['default', 'default_amount']]

# Handle missing values: keep only columns that are at least 70% non-null
# (i.e. drop columns with more than 30% missing values)
df = df.dropna(thresh=len(df)*0.7, axis=1)
df = df.dropna()  # Drop remaining rows with missing values (consider imputation for better results)

# Convert categorical variables
cat_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
Step 3: Modeling Approach
3.1 Train-Test Split
python
from sklearn.model_selection import train_test_split

X = df.drop(['default', 'default_amount'], axis=1)
y_class = df['default']
y_reg = df['default_amount']

# Split for classification
X_train, X_test, y_class_train, y_class_test = train_test_split(
    X, y_class, test_size=0.3, random_state=42, stratify=y_class)

# Regression data: only the defaulted loans from the training split
default_train_idx = y_class_train[y_class_train == 1].index
X_reg_train = X_train.loc[default_train_idx]
y_reg_train = y_reg.loc[default_train_idx]
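A quick sanity check that the stratified split preserved the default rate in both partitions:
python
# Default rates should be nearly identical thanks to stratify=y_class
print(f"Train default rate: {y_class_train.mean():.4f}")
print(f"Test default rate: {y_class_test.mean():.4f}")
print(f"Regression training rows (defaulted loans): {len(X_reg_train)}")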
3.2 Classification Model (Probability of Default)
python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Initialize and train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_class_train)

# Predict probabilities
y_class_pred_proba = clf.predict_proba(X_test)[:, 1]

# Evaluate at the default 0.5 threshold
y_class_pred = (y_class_pred_proba > 0.5).astype(int)
print("Classification Metrics:")
print(f"Accuracy: {accuracy_score(y_class_test, y_class_pred):.4f}")
print(f"F1 Score: {f1_score(y_class_test, y_class_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_class_test, y_class_pred_proba):.4f}")
3.3 Regression Model (Loss Given Default)
python
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train regressor
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_reg_train, y_reg_train)

# Predict on all test data (we'll filter later)
y_reg_pred = reg.predict(X_test)

# Evaluate on defaulted loans only
default_mask = y_class_test == 1
y_reg_test = y_reg.loc[X_test.index]  # align regression targets with the test split

if default_mask.sum() > 0:  # Check if there are any defaulted loans in the test set
    print("\nRegression Metrics (on defaulted loans only):")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_reg_test[default_mask], y_reg_pred[default_mask])):.2f}")
    print(f"R²: {r2_score(y_reg_test[default_mask], y_reg_pred[default_mask]):.4f}")
else:
    print("No defaulted loans in test set for regression evaluation")
3.4 Calculate Expected Loss
Expected loss combines the two models: EL = P(default) × predicted loss amount, where the regressor's output plays the role of LGD × EAD in the standard credit-risk decomposition.
python
# Expected loss = P(default) × predicted loss amount, for all test samples
expected_loss = y_class_pred_proba * y_reg_pred

# Create results dataframe
results = X_test.copy()
results['P(Default)'] = y_class_pred_proba
results['Predicted_Loss_Amount'] = y_reg_pred
results['Expected_Loss'] = expected_loss
results['Actual_Default'] = y_class_test
results['Actual_Loss_Amount'] = y_reg.loc[X_test.index]

# Display sample results
print("\nSample Expected Loss Calculations:")
print(results[['P(Default)', 'Predicted_Loss_Amount', 'Expected_Loss',
               'Actual_Default', 'Actual_Loss_Amount']].head(10))
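As a quick sanity check on the combined model, expected loss should be markedly higher for the loans that actually defaulted:
python
# Mean expected loss, grouped by actual outcome
print(results.groupby('Actual_Default')['Expected_Loss'].mean())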
Step 4: Model Evaluation and Refinement
4.1 Classification Model Improvement
python
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for classifier
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_class_train)

# Get best classifier
best_clf = grid_search.best_estimator_
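It's worth confirming the tuned model on the held-out test set before adopting it:
python
# Compare tuned performance against the baseline on the test set
print("Best params:", grid_search.best_params_)
best_proba = best_clf.predict_proba(X_test)[:, 1]
print(f"Tuned AUC-ROC: {roc_auc_score(y_class_test, best_proba):.4f}")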
4.2 Feature Importance Analysis
python
# Plot feature importance for classification
feature_importance = pd.DataFrame({
'Feature': X.columns,
'Importance': best_clf.feature_importances_
}).sort_values('Importance', ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature',
data=feature_importance.head(20))
plt.title('Top 20 Important Features for Default Prediction')
plt.show()
4.3 Expected Loss Analysis
python
# Analyze expected loss distribution
plt.figure(figsize=(10, 6))
sns.histplot(results['Expected_Loss'], bins=50, kde=True)
plt.title('Distribution of Expected Loss')
plt.xlabel('Expected Loss ($)')
plt.show()

# Compare with actual losses for defaulted loans
if default_mask.sum() > 0:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=results.loc[default_mask, 'Predicted_Loss_Amount'],
                    y=results.loc[default_mask, 'Actual_Loss_Amount'])
    max_loss = results.loc[default_mask, 'Actual_Loss_Amount'].max()
    plt.plot([0, max_loss], [0, max_loss], 'r--')  # 45° reference line
    plt.title('Predicted vs Actual Loss Amount for Defaulted Loans')
    plt.xlabel('Predicted Loss Amount ($)')
    plt.ylabel('Actual Loss Amount ($)')
    plt.show()
Step 5: Advanced Extensions (Optional)
1. Use rejected loans data:
o Build a separate model to predict probability of loan
approval
o Combine with your default model for more
comprehensive risk assessment
2. Alternative models:
o Try XGBoost or LightGBM for both classification and
regression
o Experiment with neural networks
3. Economic analysis:
o Calculate portfolio-level expected loss
o Simulate different economic scenarios
4. Deployment:
o Create a Flask/Django API for your model
o Build a simple dashboard with Streamlit (a minimal sketch follows below)
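As a sketch of the Streamlit option, a minimal dashboard over the Step 3.4 results table might look like this. The CSV file name is illustrative; it assumes the table was first saved with results.to_csv('expected_loss_results.csv', index=False):
python
# app.py - run with: streamlit run app.py
import pandas as pd
import streamlit as st

st.title("Credit Default Risk - Expected Loss")

# Illustrative file name; save the Step 3.4 results table first
results = pd.read_csv('expected_loss_results.csv')

threshold = st.slider("Flag loans with P(Default) above:", 0.0, 1.0, 0.5)
flagged = results[results['P(Default)'] > threshold]

st.metric("Flagged loans", len(flagged))
st.metric("Total expected loss ($)", f"{flagged['Expected_Loss'].sum():,.0f}")
st.dataframe(flagged.sort_values('Expected_Loss', ascending=False).head(50))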
Key Considerations
1. Class imbalance: The dataset likely has few defaults. Consider:
o Oversampling/undersampling techniques
o Different evaluation metrics (precision-recall curve)
o Threshold adjustment for classification (see the sketch after this list)
2. Feature interpretation: Ensure your models are explainable for financial decision-making
3. Data leakage: Be careful not to use future information (e.g., features that wouldn't be available at loan origination)
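For the threshold-adjustment point, a minimal sketch that picks the F1-maximising cutoff from the precision-recall curve (using y_class_test and y_class_pred_proba from Step 3.2):
python
import numpy as np
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_class_test, y_class_pred_proba)

# F1 at each candidate threshold (the last precision/recall pair has no threshold)
f1_scores = 2 * precision * recall / np.maximum(precision + recall, 1e-12)
best = np.argmax(f1_scores[:-1])
print(f"Best threshold: {thresholds[best]:.3f} (F1 = {f1_scores[best]:.3f})")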
This comprehensive approach should give you a robust hybrid
model for credit default risk assessment. Would you like me to
elaborate on any specific part of this implementation?
Step-by-Step Guide to Training the Credit Risk Prediction
Algorithm
This guide will walk you through data preparation, feature
engineering, model training, and evaluation for the hybrid
regression-classification model used in the Streamlit app.
Step 1: Data Collection & Preparation
1.1 Download & Load the Dataset
Use the LendingClub Loan Data (or a similar credit risk dataset), then load it:
python
import pandas as pd
# Load the dataset (adjust path as needed)
df = pd.read_csv('accepted_2007_to_2018Q4.csv.gz', compression='gzip',
low_memory=True)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
1.2 Define Target Variables
Classification Target (default) → Binary (1 if loan defaulted, else
0)
Regression Target (default_amount) → Loss amount if default
occurs.
python
# Define default (1 if Charged Off/Default, else 0)
df['default'] = df['loan_status'].apply(lambda x: 1 if x in ['Charged Off', 'Default'] else 0)

# Calculate loss amount (loan amount - principal received)
df['default_amount'] = df.apply(
    lambda row: row['loan_amnt'] - row['total_rec_prncp'] if row['default'] == 1 else 0,
    axis=1
)
1.3 Select Relevant Features
Choose features that impact credit risk:
python
features = [
    'loan_amnt', 'term', 'int_rate', 'grade', 'sub_grade',
    'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
    'purpose', 'dti', 'fico_range_low', 'fico_range_high',
    'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
    'total_acc', 'application_type', 'mort_acc', 'pub_rec_bankruptcies'
]

# Subset the data
df = df[features + ['default', 'default_amount']]
Step 2: Data Cleaning & Preprocessing
2.1 Handle Missing Data
Drop columns that are mostly missing, then impute or drop the remaining missing values.
python
# Drop columns with more than 30% missing values
# (thresh keeps columns that are at least 70% non-null)
df = df.dropna(thresh=len(df)*0.7, axis=1)

# Drop rows with missing values (or use imputation - see below)
df = df.dropna()
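As an alternative to dropping rows, a simple imputation sketch (the median/most-frequent strategies here are illustrative choices):
python
from sklearn.impute import SimpleImputer

num_cols = df.select_dtypes(include='number').columns
obj_cols = df.select_dtypes(include='object').columns

df[num_cols] = SimpleImputer(strategy='median').fit_transform(df[num_cols])
df[obj_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df[obj_cols])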
2.2 Encode Categorical Variables
Convert categorical features (term, home_ownership, etc.) into
numerical format.
python
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
2.3 Normalize Numerical Features
Scale numerical features (loan_amnt, annual_inc, etc.) for better
model performance.
python
from sklearn.preprocessing import StandardScaler

numerical_cols = ['loan_amnt', 'annual_inc', 'dti', 'fico_range_low']
scaler = StandardScaler()
# Note: to avoid leakage, a production workflow would fit the scaler on the
# training split only and reuse it on the test split; fitting on the full
# data is a simplification here.
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
Step 3: Train-Test Split
3.1 Separate Features & Targets
python
X = df.drop(['default', 'default_amount'], axis=1)
y_class = df['default'] # Classification target
y_reg = df['default_amount'] # Regression target
3.2 Split Data for Classification & Regression
python
from sklearn.model_selection import train_test_split

# Split for classification
X_train, X_test, y_class_train, y_class_test = train_test_split(
    X, y_class, test_size=0.3, random_state=42, stratify=y_class
)

# Regression data: only the defaulted loans from the training split
default_train_idx = y_class_train[y_class_train == 1].index
X_reg_train = X_train.loc[default_train_idx]
y_reg_train = y_reg.loc[default_train_idx]
Step 4: Model Training
4.1 Train the Classification Model (Default Probability)
python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Initialize & train
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_class_train)

# Predict probabilities
y_class_pred_proba = clf.predict_proba(X_test)[:, 1]

# Evaluate
print("Classification Metrics:")
print(f"Accuracy: {accuracy_score(y_class_test, (y_class_pred_proba > 0.5).astype(int)):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_class_test, y_class_pred_proba):.4f}")
4.2 Train the Regression Model (Loss Given Default)
python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize & train
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_reg_train, y_reg_train)

# Evaluate on the defaulted loans in the test set
default_mask = y_class_test == 1
y_reg_test = y_reg.loc[X_test.index]  # align regression targets with the test split

if default_mask.sum() > 0:
    y_reg_pred = reg.predict(X_test[default_mask])
    print("\nRegression Metrics:")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_reg_test[default_mask], y_reg_pred)):.2f}")
    print(f"R²: {r2_score(y_reg_test[default_mask], y_reg_pred):.4f}")
else:
    print("No defaulted loans in test set")
Step 5: Save Models for Deployment
python
import joblib
# Save models
joblib.dump(clf, 'default_classifier.pkl')
joblib.dump(reg, 'loss_regressor.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
# Later, load them in the Streamlit app:
# clf = joblib.load('default_classifier.pkl')
# reg = joblib.load('loss_regressor.pkl')
# scaler = joblib.load('feature_scaler.pkl')
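To sanity-check the saved artifacts, a minimal scoring sketch for a single loan. It reuses a row of X_test as a stand-in for user input; in the app this row would come from the input form (and the features here are already scaled, since scaling was applied before the split):
python
import joblib

clf = joblib.load('default_classifier.pkl')
reg = joblib.load('loss_regressor.pkl')

sample = X_test.iloc[[0]]                # one loan as a 1-row DataFrame
p = clf.predict_proba(sample)[0, 1]      # probability of default
loss = reg.predict(sample)[0]            # predicted loss if it defaults

print(f"P(default) = {p:.3f}")
print(f"Predicted loss = ${loss:,.0f}")
print(f"Expected loss = ${p * loss:,.0f}")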
Step 6: Expected Loss Calculation
python
# Predict probability of default
p_default = clf.predict_proba(X_test)[:, 1]

# Predict loss amount for all test samples (an extrapolation for loans that
# would not default - it is weighted down by a low P(Default) below)
loss_amount = reg.predict(X_test)

# Expected loss = P(default) × predicted loss amount
expected_loss = p_default * loss_amount

# Create results table
results = X_test.copy()
results['P(Default)'] = p_default.round(4)
results['Predicted_Loss'] = loss_amount.round(2)
results['Expected_Loss'] = expected_loss.round(2)
print(results.head())
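Finally, a quick portfolio-level summary sketch built on the results table above:
python
# Aggregate expected loss across the test portfolio
print(f"Total expected loss: ${results['Expected_Loss'].sum():,.2f}")
print(f"Mean expected loss per loan: ${results['Expected_Loss'].mean():,.2f}")

# Ten riskiest loans by expected loss
print(results.nlargest(10, 'Expected_Loss')[['P(Default)', 'Expected_Loss']])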