【Forage】JPMorgan 免费量化项目速通 Part 3（已附 Python 源代码）

最新推荐文章于 2025-04-11 17:56:41 发布

佛系快乐水

最新推荐文章于 2025-04-11 17:56:41 发布

阅读量287

点赞数 5

文章标签：python、开发语言、机器学习

本文链接：https://blog.youkuaiyun.com/yiny37/article/details/147009711

版权

You must use this data to build a model that, given details for any loan described above, will predict the probability that the borrower will default (also known as PD: the probability of default). Use the provided data to train a function that will estimate the probability of default for a borrower. Assuming a recovery rate of 10%, this can be used to give the expected loss on a loan.

You should produce a function that can take in the properties of a loan and output the expected loss.
You can explore any technique ranging from a simple regression or a decision tree to something more advanced. You can also use multiple methods and provide a comparative analysis.

编写函数，预测贷款的违约概率（PD），并据此计算预期损失。

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score


# Data preprocessing
def preprocess_data(data, target_col='default'):
    """
    Preprocess loan data: impute missing values, encode categorical variables, and scale features.

    Parameters:
    ◦ data (pd.DataFrame): The original dataframe.
    ◦ target_col (str): The name of the target variable column.

    Returns:
    ◦ X (pd.DataFrame): The feature dataframe (imputed, encoded and scaled).
    ◦ y (pd.Series): The target variable series.
    ◦ scaler (StandardScaler): The object used for feature scaling (for later use).
    ◦ label_encoders (dict): Maps each categorical column name to the LabelEncoder fitted on it.

    If target_col is not a column of data, a message is printed and
    (None, None, None, None) is returned, so callers can always unpack four values.
    """
    # Check if the target column exists. The failure value has the same
    # arity as the success value so a 4-way unpack at the call site never raises.
    if target_col not in data.columns:
        print(f"The target column '{target_col}' does not exist in the data.")
        return None, None, None, None

    # Separate features and target variable
    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Handle missing values (mean for numeric columns, mode for object columns).
    # Assign the result back instead of calling fillna(inplace=True) on X[col]:
    # the chained in-place form is deprecated in pandas 2.x and does not update
    # X when Copy-on-Write is enabled.
    for col in X.select_dtypes(include=['float64', 'int64']).columns:
        X[col] = X[col].fillna(X[col].mean())
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = X[col].fillna(X[col].mode()[0])

    # Encode categorical variables, keeping each fitted encoder for later use
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le

    # Feature scaling.
    # NOTE: the numeric columns are selected after encoding, so label-encoded
    # columns that now hold int64 codes are scaled together with the
    # originally numeric ones. The scaler is fit on every row passed in.
    scaler = StandardScaler()
    numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    print("Data preprocessing completed.")
    return X, y, scaler, label_encoders

# Build logistic regression model
def build_logistic_model(X_train, y_train):
    """
    Fit a logistic regression classifier on the training data.

    Parameters:
    ◦ X_train (pd.DataFrame): The training feature data.
    ◦ y_train (pd.Series): The training target variable.

    Returns:
    ◦ model (LogisticRegression): The fitted classifier (created with random_state=42).
    """
    # fit() returns the estimator itself, so construction and training chain.
    classifier = LogisticRegression(random_state=42).fit(X_train, y_train)
    print("Logistic regression model training completed.")
    return classifier

# Build decision tree model
def build_decision_tree_model(X_train, y_train, max_depth=5):
    """
    Fit a depth-limited decision tree classifier on the training data.

    Parameters:
    ◦ X_train (pd.DataFrame): The training feature data.
    ◦ y_train (pd.Series): The training target variable.
    ◦ max_depth (int): The maximum depth of the decision tree.

    Returns:
    ◦ model (DecisionTreeClassifier): The fitted classifier (created with random_state=42).
    """
    tree_params = {"max_depth": max_depth, "random_state": 42}
    # fit() returns the estimator itself, so construction and training chain.
    tree = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)
    print(f"Decision tree model (max_depth={max_depth}) training completed.")
    return tree

# Evaluate model performance
def evaluate_model(model, X_test, y_test, model_type='Logistic Regression'):
    """
    Score a fitted classifier on held-out data and print the results.

    Parameters:
    ◦ model: The trained model.
    ◦ X_test (pd.DataFrame): The test feature data.
    ◦ y_test (pd.Series): The test target variable.
    ◦ model_type (str): The name of the model type, for output purposes.

    Returns:
    ◦ accuracy (float): The accuracy score.
    ◦ auc (float): The AUC score.
    """
    predicted_labels = model.predict(X_test)

    # Prefer the second predict_proba column as the ranking score; models
    # without predict_proba fall back to their decision_function output.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    accuracy = accuracy_score(y_test, predicted_labels)
    auc = roc_auc_score(y_test, scores)

    print(f"{model_type} Model Performance Evaluation:")
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- AUC Score: {auc:.4f}")

    return accuracy, auc

# Calculate expected loss
def calculate_expected_loss(prob_default, recovery_rate=0.10, exposure=1.0):
    """
    Calculate the expected loss as PD x (1 - recovery rate) x exposure.

    Parameters:
    ◦ prob_default (float): The probability of default.
    ◦ recovery_rate (float): The recovery rate, default is 10%.
    ◦ exposure (float): The amount at risk on the loan. The default of 1.0
      returns the loss per unit of exposure, which is what the
      two-argument form of this function has always returned.

    Returns:
    ◦ expected_loss (float): The expected loss, in the same units as exposure.
    """
    # Share of the exposure that is not recovered once the borrower defaults.
    loss_given_default = 1 - recovery_rate
    expected_loss = prob_default * loss_given_default * exposure
    return expected_loss

# Main function
def main(file_path='C:\\Users\\冰火人\\Desktop\\项目\\Task 3 and 4_Loan_Data.csv'):
    """
    Run the whole workflow: load the CSV, preprocess it, train and evaluate both
    models, then print the expected loss for one example loan from the test set.

    Parameters:
    ◦ file_path (str): Path to the loan CSV file. The default is the author's
      local path; pass your own path to run the script elsewhere.

    Returns:
    ◦ None. Results are printed. The function returns early (after printing a
      message) if the file cannot be loaded or the target column is missing.
    """
    # 1. Load data
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"The file was not found: {file_path}")
        return
    except Exception as e:
        # Top-level boundary: report any other load/parse problem and stop.
        print(f"An error occurred while loading the data: {e}")
        return

    # Assume the target column is named 'default'. Modify if necessary.
    target_col = 'default'  # Ensure this column exists in your CSV and indicates default status (1 for default, 0 otherwise)

    # 2. Data preprocessing
    # Check the failure case before unpacking, so an early "target column
    # missing" return never turns into an unpacking error here.
    preprocessed = preprocess_data(data, target_col=target_col)
    X, y = preprocessed[0], preprocessed[1]
    if X is None or y is None:
        return

    # 3. Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Training set size: {X_train.shape[0]}, Testing set size: {X_test.shape[0]}")

    # 4. Build logistic regression model
    log_model = build_logistic_model(X_train, y_train)

    # 5. Build decision tree model (optional)
    dt_model = build_decision_tree_model(X_train, y_train, max_depth=5)

    # 6. Evaluate model performance
    evaluate_model(log_model, X_test, y_test, model_type='Logistic Regression')
    evaluate_model(dt_model, X_test, y_test, model_type='Decision Tree')

    # 7. Example: Calculate expected loss for a single loan
    # Take the first test row as a one-row DataFrame. X_test comes from the
    # already encoded and scaled X, so it is passed to the models as-is:
    # decoding the label codes or applying the scaler a second time would feed
    # the models features on a different scale from the one they were trained on.
    sample_df = X_test.iloc[[0]]

    # Predict default probability
    prob_default_log = log_model.predict_proba(sample_df)[:, 1]
    prob_default_dt = dt_model.predict_proba(sample_df)[:, 1]

    # Calculate expected loss
    # NOTE: with only the probability passed in, the result is
    # PD x (1 - recovery rate), i.e. the loss per unit of exposure; multiply by
    # the loan's outstanding amount to obtain a currency figure.
    expected_loss_log = calculate_expected_loss(prob_default_log[0])
    expected_loss_dt = calculate_expected_loss(prob_default_dt[0])

    print("Expected loss for the example loan:")
    print(f"- Predicted probability of default using Logistic Regression: {prob_default_log[0]:.4f}")
    print(f"  Expected Loss: ${expected_loss_log:.2f}")
    print(f"- Predicted probability of default using Decision Tree: {prob_default_dt[0]:.4f}")
    print(f"  Expected Loss: ${expected_loss_dt:.2f}")

# Script entry point: run the workflow only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()