# 尝试针对之前的心脏病项目 ipynb,将它按照今天的示例项目整理成规范的形式,并思考哪些部分未来可以复用。
# src/data/data_loader.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
def load_data(file_path: str) -> "pd.DataFrame | None":
    """Load the heart-disease dataset from a CSV file.

    Args:
        file_path: Location of the CSV file; passed unchanged to
            ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or ``None`` when the file does not exist
        (an error message is printed in that case).
    """
    try:
        # Keep the try body to the single call that can raise.
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # Best-effort contract: report the problem and return None
        # instead of propagating, so callers must check for None.
        print(f"错误: 文件 {file_path} 未找到")
        return None
    return df
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the heart-disease dataset.

    Currently the only step is dropping every row that contains a
    missing value. The input frame is not modified; ``dropna`` returns
    a new object.

    Args:
        df: Raw dataset.

    Returns:
        A DataFrame with all rows containing missing values removed.
    """
    # Handle missing values by discarding incomplete rows.
    df = df.dropna()
    # Placeholder: no scaling/normalization is performed in this function.
    return df
def split_data(df: pd.DataFrame, target_col: str, test_size: float = 0.2, random_state: int = 42):
    """Split the dataset into training and test portions.

    Args:
        df: Dataset containing both the feature columns and the target.
        target_col: Name of the label column; every other column is
            treated as a feature.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        Whatever ``train_test_split(X, y, ...)`` returns for the feature
        frame and the target column (per scikit-learn:
        ``X_train, X_test, y_train, y_test``).
    """
    # Features are all columns except the target.
    X = df.drop(columns=target_col)
    y = df[target_col]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)
# src/features/feature_engineering.py
from sklearn.preprocessing import StandardScaler
def build_features(X_train, X_test):
    """Build and transform the features.

    A ``StandardScaler`` is fit on the training features only and the
    same fitted scaler is then applied to the test features, so no
    statistics from the test set influence the transformation.

    Args:
        X_train: Training features.
        X_test: Test features.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled, scaler)`` where
        ``scaler`` is the fitted ``StandardScaler`` instance.
    """
    # Feature standardization.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Placeholder: no feature selection/extraction is performed here.
    return X_train_scaled, X_test_scaled, scaler
# src/models/model_training.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
def train_model(X_train, y_train):
    """Train a random-forest classifier.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier`` (100 estimators,
        ``random_state=42``).
    """
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model
def evaluate_model(model, X_test, y_test):
    """Evaluate the model on the test set.

    Prints the classification report and the confusion matrix computed
    from the model's predictions.

    Args:
        model: Object exposing a ``predict`` method.
        X_test: Test features passed to ``model.predict``.
        y_test: True labels compared against the predictions.

    Returns:
        The predictions returned by ``model.predict(X_test)``.
    """
    y_pred = model.predict(X_test)
    print("分类报告:")
    print(classification_report(y_test, y_pred))
    print("混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))
    return y_pred
def save_model(model, model_path: str):
    """Persist the model to a file with ``joblib.dump``.

    Args:
        model: Object to serialize.
        model_path: Destination passed to ``joblib.dump``.
    """
    joblib.dump(model, model_path)
def load_model(model_path: str):
    """Load a model from a file with ``joblib.load``.

    Args:
        model_path: Source passed to ``joblib.load``.

    Returns:
        The object deserialized from ``model_path``.
    """
    # NOTE(security): joblib files are pickle-based, so loading one can
    # execute arbitrary code. Only load model files from trusted sources.
    return joblib.load(model_path)