Python打卡训练营学习记录Day23-CSDN博客

本文链接：https://blog.csdn.net/FanfanPyt/article/details/147904160
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

# 1. 数据加载与初步探索
def load_data(file_path):
    """Load a CSV file into a DataFrame and print a short overview of it.

    The overview consists of ``DataFrame.info()``, the shape, and the
    column names.

    Args:
        file_path: Path of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing or any
        other error occurs while loading or summarising it (the error is
        printed rather than raised).
    """
    try:
        df = pd.read_csv(file_path)
        print("数据基本信息：")
        df.info()
        print(f"\n数据集行数和列数：{df.shape}")
        # This line used to repeat the "rows and columns" label although it
        # prints the column names.
        print(f"\n数据集列名：{df.columns}")
        return df
    except FileNotFoundError:
        print(f"错误：未找到文件 '{file_path}'")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and let the caller check
        # for None instead of propagating the exception.
        print(f"错误：加载数据时发生异常: {e}")
        return None

# 2. 数据预处理
def preprocess_data(df, target_column):
    """Split features from the target and build an (unfitted) preprocessor.

    Numeric columns are mean-imputed and standardised; categorical columns
    are imputed with the most frequent value and one-hot encoded.

    Args:
        df: Input DataFrame containing both features and the target.
        target_column: Name of the target column in ``df``.

    Returns:
        A tuple ``(X, y, preprocessor, numerical_features,
        categorical_features)`` where ``X`` is ``df`` without the target
        column, ``y`` is the target column, ``preprocessor`` is an unfitted
        ``ColumnTransformer`` and the two lists hold the column names routed
        to the numeric and categorical pipelines respectively.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Separate the features from the target.
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Select by dtype family rather than by exact dtype so that columns such
    # as int32 / float32 / category are not left out. Timedelta columns are
    # excluded explicitly because NumPy's timedelta64 type derives from the
    # integer hierarchy and could otherwise be matched by 'number'.
    numerical_features = X.select_dtypes(
        include=['number'], exclude=['timedelta']
    ).columns.tolist()
    categorical_features = X.select_dtypes(
        include=['object', 'category']
    ).columns.tolist()

    # ColumnTransformer drops every column it is not told about (its default
    # is remainder='drop'), so make that visible instead of silent.
    routed = set(numerical_features) | set(categorical_features)
    unrouted = [col for col in X.columns if col not in routed]
    if unrouted:
        print(f"警告：以下列既不是数值型也不是分类型，预处理时将被丢弃: {unrouted}")

    # Numeric pipeline: fill gaps with the column mean, then standardise.
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical pipeline: fill gaps with the mode, then one-hot encode.
    # handle_unknown='ignore' keeps prediction from failing on categories
    # that were not seen during fitting.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    return X, y, preprocessor, numerical_features, categorical_features

# 3. 数据划分
def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and target into a training part and a test part.

    Args:
        X: Feature data.
        y: Target data aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    train_features, test_features, train_target, test_target = parts
    return train_features, test_features, train_target, test_target

# 4. 模型训练与优化
def train_model(X_train, y_train, preprocessor, param_grid=None):
    """Fit a preprocessing + random-forest pipeline, optionally grid-searched.

    Args:
        X_train: Training features.
        y_train: Training target.
        preprocessor: Transformer placed as the first pipeline step.
        param_grid: Optional parameter grid for ``GridSearchCV``. When it is
            ``None`` or empty the pipeline is fitted once as-is.

    Returns:
        The fitted pipeline; with a grid this is the search's
        ``best_estimator_``.
    """
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

    # Without a grid there is nothing to search: fit once and return.
    if not param_grid:
        pipeline.fit(X_train, y_train)
        return pipeline

    # 5-fold cross-validated search scored by accuracy, using all cores.
    search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy',
    )
    search.fit(X_train, y_train)
    print(f"最佳参数: {search.best_params_}")
    print(f"最佳交叉验证分数: {search.best_score_:.4f}")
    return search.best_estimator_

# 5. 模型评估
def evaluate_model(model, X_test, y_test):
    """Print accuracy, classification report and confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy of the model's predictions on the test data.
    """
    predicted = model.predict(X_test)

    score = accuracy_score(y_test, predicted)
    print(f"模型准确率: {score:.4f}")

    # Each remaining report is "print a heading, then print the metric".
    sections = (
        ("\n分类报告:", classification_report),
        ("\n混淆矩阵:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predicted))

    return score

# 6. 特征重要性分析
def analyze_feature_importance(model, numerical_features, categorical_features):
    """Print the classifier's feature importances, most important first.

    Feature names are assembled as the numeric column names followed by the
    one-hot encoded names of the categorical columns. This assumes the
    pipeline's preprocessor emits its outputs in that same order (numeric
    block first, then the ``'cat'`` one-hot block).

    Args:
        model: Fitted pipeline with ``'preprocessor'`` and ``'classifier'``
            entries in ``named_steps``.
        numerical_features: Names of the numeric input columns.
        categorical_features: Names of the categorical input columns; may be
            empty.

    Returns:
        ``(feature_names, importances, indices)`` where ``indices`` orders
        the features by descending importance, or ``(None, None, None)``
        when the importances cannot be obtained (the reason is printed).
    """
    try:
        preprocessor = model.named_steps['preprocessor']

        # When there are no categorical columns the 'cat' transformer is
        # never fitted, and asking its encoder for output names would fail.
        # Only consult the encoder when it actually received columns.
        if categorical_features:
            ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
            categorical_names = list(
                ohe.get_feature_names_out(categorical_features)
            )
        else:
            categorical_names = []
        feature_names = list(numerical_features) + categorical_names

        importances = model.named_steps['classifier'].feature_importances_
        if len(importances) != len(feature_names):
            raise ValueError(
                f"expected {len(feature_names)} importances, "
                f"got {len(importances)}"
            )
        # argsort is ascending; reverse it for most-important-first.
        indices = np.argsort(importances)[::-1]
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # Typical causes: the classifier has no feature_importances_, the
        # pipeline steps are named differently, or the model is not fitted.
        print(f"无法获取特征重要性，请确保使用的模型支持此功能 ({e})")
        return None, None, None

    print("\n特征重要性排序:")
    for rank, idx in enumerate(indices, start=1):
        print(f"{rank}. {feature_names[idx]} ({importances[idx]:.4f})")

    return feature_names, importances, indices

# 7. 模型保存
def save_model(model, model_path='model.pkl'):
    """Persist a model to disk with joblib.

    Failures are reported on stdout instead of being raised.

    Args:
        model: Object to serialise.
        model_path: Destination file path.
    """
    try:
        joblib.dump(model, model_path)
        saved_message = f"\n模型已保存至: {model_path}"
        print(saved_message)
    except Exception as err:
        failure_message = f"错误：保存模型时发生异常: {err}"
        print(failure_message)

# 8. 模型加载与预测
def load_model_and_predict(model_path, new_data):
    """Load a persisted model and run it on new data.

    Args:
        model_path: Path of a file previously written with joblib.
        new_data: Input passed to the loaded model's ``predict``.

    Returns:
        The predictions, or ``None`` if loading or predicting fails (the
        error is printed rather than raised).
    """
    try:
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code -- only load model files from a trusted source.
        restored = joblib.load(model_path)
        result = restored.predict(new_data)

        print("\n预测结果:")
        print(result)
    except Exception as err:
        print(f"错误：加载模型或预测时发生异常: {err}")
        return None
    else:
        return result

# 主函数
def main():
    """Run the whole example workflow end to end on the Iris dataset.

    Steps: write Iris to ``iris.csv`` and load it back, build the
    preprocessor, split, grid-search a random forest, evaluate, report
    feature importances, save the model, then reload it and predict on the
    first rows of the test set.
    """
    print("=== 通用机器学习Pipeline示例 ===")

    # Step 1: turn scikit-learn's bundled Iris data into a CSV file so the
    # CSV-based loader can be exercised.
    print("\n1. 加载数据...")
    from sklearn.datasets import load_iris
    iris = load_iris()
    csv_path = 'iris.csv'
    iris_frame = pd.DataFrame(
        data=np.c_[iris['data'], iris['target']],
        columns=iris['feature_names'] + ['target'],
    )
    iris_frame.to_csv(csv_path, index=False)
    df = load_data(csv_path)

    # load_data signals failure with None; nothing else can run without data.
    if df is None:
        return

    # Step 2: separate the target and build the preprocessing transformer.
    print("\n2. 数据预处理...")
    prepared = preprocess_data(df, 'target')
    X, y, preprocessor, numerical_features, categorical_features = prepared

    # Step 3: train/test split.
    print("\n3. 数据划分...")
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Step 4: train with a small hyper-parameter grid.
    print("\n4. 模型训练...")
    search_space = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
    }
    model = train_model(X_train, y_train, preprocessor, search_space)

    # Step 5: evaluate on the held-out data.
    print("\n5. 模型评估...")
    evaluate_model(model, X_test, y_test)

    # Step 6: feature importances.
    print("\n6. 特征重要性分析...")
    analyze_feature_importance(model, numerical_features, categorical_features)

    # Step 7: persist the fitted model.
    print("\n7. 模型保存...")
    model_path = 'iris_model.pkl'
    save_model(model, model_path)

    # Step 8: reload it and predict on the first test rows.
    print("\n8. 模型加载与预测示例...")
    load_model_and_predict(model_path, X_test.head())

# Run the example workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
@浙大疏锦行