You must use this data to build a model that, given details for any loan described above, will predict the probability that the borrower will default (also known as PD: the probability of default). Use the provided data to train a function that will estimate the probability of default for a borrower. Assuming a recovery rate of 10%, this can be used to give the expected loss on a loan.
- You should produce a function that can take in the properties of a loan and output the expected loss.
- You can explore any technique ranging from a simple regression or a decision tree to something more advanced. You can also use multiple methods and provide a comparative analysis.
编写函数,预测违约概率。
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
# Data preprocessing
def preprocess_data(data, target_col='default'):
    """
    Preprocess loan data: fill missing values, encode categorical variables,
    and scale the features.

    Parameters:
    - data (pd.DataFrame): The original dataframe, including the target column.
    - target_col (str): The name of the target variable column.

    Returns:
    A 4-tuple (X, y, scaler, label_encoders):
    - X (pd.DataFrame): The imputed, encoded and scaled feature dataframe.
    - y (pd.Series): The target variable series.
    - scaler (StandardScaler): The scaler fitted on the float64/int64 columns
      of X (for later use).
    - label_encoders (dict): Maps each object-typed column name to the
      LabelEncoder fitted on it.
    If target_col is not a column of data, a message is printed and
    (None, None, None, None) is returned.
    """
    # Check if the target column exists
    if target_col not in data.columns:
        print(f"The target column '{target_col}' does not exist in the data.")
        # Same arity as the success path, so callers can always unpack
        # four values and then test X / y for None.
        return None, None, None, None

    # Separate features and target variable (drop returns a new frame, so
    # the caller's dataframe is not modified below)
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Handle missing values (simply fill with mean or mode).
    # Assign the result back instead of calling fillna(inplace=True) on
    # X[col]: the chained in-place form is deprecated in pandas and does not
    # update X when Copy-on-Write is active.
    for col in X.select_dtypes(include=['float64', 'int64']).columns:
        X[col] = X[col].fillna(X[col].mean())
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = X[col].fillna(X[col].mode()[0])

    # Encode categorical variables, keeping each fitted encoder per column
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

    # Feature scaling. The numeric columns are selected after encoding, so
    # any encoded column whose dtype is int64 is scaled as well.
    scaler = StandardScaler()
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    print("Data preprocessing completed.")
    return X, y, scaler, label_encoders
# Build logistic regression model
def build_logistic_model(X_train, y_train):
    """
    Fit a logistic regression classifier on the training split.

    Parameters:
    - X_train (pd.DataFrame): The training feature data.
    - y_train (pd.Series): The training target variable.

    Returns:
    - LogisticRegression: The fitted estimator, created with random_state=42.
    """
    # fit() returns the estimator itself, so construction and training chain.
    classifier = LogisticRegression(random_state=42).fit(X_train, y_train)
    print("Logistic regression model training completed.")
    return classifier
# Build decision tree model
def build_decision_tree_model(X_train, y_train, max_depth=5):
    """
    Fit a depth-limited decision tree classifier on the training split.

    Parameters:
    - X_train (pd.DataFrame): The training feature data.
    - y_train (pd.Series): The training target variable.
    - max_depth (int): The maximum depth passed to the tree (default 5).

    Returns:
    - DecisionTreeClassifier: The fitted estimator, created with
      random_state=42.
    """
    tree_settings = {"max_depth": max_depth, "random_state": 42}
    # fit() returns the estimator itself, so construction and training chain.
    tree = DecisionTreeClassifier(**tree_settings).fit(X_train, y_train)
    print(f"Decision tree model (max_depth={max_depth}) training completed.")
    return tree
# Evaluate model performance
def evaluate_model(model, X_test, y_test, model_type='Logistic Regression'):
    """
    Score a trained model on the test split and print accuracy and AUC.

    Parameters:
    - model: The trained model. It must provide predict() and either
      predict_proba() or decision_function().
    - X_test (pd.DataFrame): The test feature data.
    - y_test (pd.Series): The test target variable.
    - model_type (str): Label used in the printed report.

    Returns:
    - accuracy (float): The accuracy score of the predicted labels.
    - auc (float): The ROC AUC of the predicted scores.
    """
    predicted_labels = model.predict(X_test)

    # Prefer the second column of predict_proba as the ranking score; fall
    # back to decision_function for models without probability output.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    accuracy = accuracy_score(y_test, predicted_labels)
    auc = roc_auc_score(y_test, scores)

    print(
        f"{model_type} Model Performance Evaluation:",
        f"- Accuracy: {accuracy:.4f}",
        f"- AUC Score: {auc:.4f}",
        sep="\n",
    )
    return accuracy, auc
# Calculate expected loss
def calculate_expected_loss(prob_default, recovery_rate=0.10, exposure=1.0):
    """
    Calculate the expected loss on a loan:
    probability of default x (1 - recovery rate) x exposure.

    Parameters:
    - prob_default (float): The probability of default.
    - recovery_rate (float): The fraction of the exposure recovered after a
      default, default is 10%.
    - exposure (float): The amount at risk (e.g. the outstanding loan
      balance). With the default of 1.0 the result is the expected loss per
      unit of exposure, i.e. a loss rate rather than a monetary amount.

    Returns:
    - expected_loss (float): The expected loss, in the units of exposure.
    """
    loss_given_default = 1 - recovery_rate
    # Exposure is applied last so the default (1.0) leaves the loss rate
    # numerically unchanged.
    expected_loss = prob_default * loss_given_default * exposure
    return expected_loss
# Main function
def main(file_path=None, target_col='default'):
    """
    Run the end-to-end example: load the loan data, train and evaluate a
    logistic regression and a decision tree, and print the predicted
    probability of default and expected loss for one held-out loan.

    Parameters:
    - file_path (str | None): Path of the CSV file to load. When None, the
      script's original hard-coded path is used.
    - target_col (str): Name of the column indicating default status
      (1 for default, 0 otherwise).

    Returns:
    - None. The function stops early, after printing a message, when the
      file cannot be loaded or the target column is missing.
    """
    if file_path is None:
        # Default location used by the original script; pass file_path to
        # run against a different CSV.
        file_path = 'C:\\Users\\冰火人\\Desktop\\项目\\Task 3 and 4_Loan_Data.csv'

    # 1. Load data
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"The file was not found: {file_path}")
        return
    except Exception as e:
        print(f"An error occurred while loading the data: {e}")
        return

    # Validate the target column up front so the run stops cleanly before
    # any preprocessing is attempted.
    if target_col not in data.columns:
        print(f"The target column '{target_col}' does not exist in the data.")
        return

    # 2. Data preprocessing
    # The fitted scaler and encoders are not needed below because the example
    # loan is taken from the already-preprocessed test split; they would be
    # needed to score raw, unseen loans.
    X, y, _scaler, _label_encoders = preprocess_data(data, target_col=target_col)
    if X is None or y is None:
        return

    # 3. Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

    # 4. Build logistic regression model
    log_model = build_logistic_model(X_train, y_train)

    # 5. Build decision tree model (optional)
    dt_model = build_decision_tree_model(X_train, y_train, max_depth=5)

    # 6. Evaluate model performance
    evaluate_model(log_model, X_test, y_test, model_type='Logistic Regression')
    evaluate_model(dt_model, X_test, y_test, model_type='Decision Tree')

    # 7. Example: Calculate expected loss for a single loan
    # X_test is already encoded and scaled, so the first test row is passed
    # to the models as-is. Inverse-transforming it or scaling it a second
    # time would feed the models features they were not trained on.
    # iloc[[0]] keeps a one-row DataFrame with the training column names.
    sample_df = X_test.iloc[[0]]

    # Predict default probability
    prob_default_log = log_model.predict_proba(sample_df)[:, 1]
    prob_default_dt = dt_model.predict_proba(sample_df)[:, 1]

    # Calculate expected loss. No loan amount is supplied here, so the
    # figures printed below are the loss per unit of loan amount, not a
    # total for the loan.
    expected_loss_log = calculate_expected_loss(prob_default_log[0])
    expected_loss_dt = calculate_expected_loss(prob_default_dt[0])

    print("Expected loss for the example loan:")
    print(f"- Predicted probability of default using Logistic Regression: {prob_default_log[0]:.4f}")
    print(f" Expected Loss: ${expected_loss_log:.2f}")
    print(f"- Predicted probability of default using Decision Tree: {prob_default_dt[0]:.4f}")
    print(f" Expected Loss: ${expected_loss_dt:.2f}")


# Run the main function only when executed as a script
if __name__ == "__main__":
    main()