import os
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, Concatenate, Multiply, GlobalAveragePooling1D, Conv1D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# ---------------------------
# 1. 数据加载与完整性验证 (支持Excel)
# ---------------------------
def find_gait_data_files(data_dir='gait_data'):
"""自动发现所有步态数据文件(支持Excel和CSV)"""
file_map = {}
patterns = [
r'koa_?(\w+)\.(xlsx|xls|csv|npy)', # koa_t25.xlsx 或 koat25.csv
r'KOA_?(\w+)\.(xlsx|xls|csv|npy)', # KOA_T25.xlsx
r'patient_?(\w+)\.(xlsx|xls|csv|npy)', # patient_t25.xlsx
r'(\w+)_gait\.(xlsx|xls|csv|npy)' # t25_gait.xlsx
]
for filename in os.listdir(data_dir):
for pattern in patterns:
match = re.match(pattern, filename, re.IGNORECASE)
if match:
patient_id = match.group(1)
file_ext = match.group(2).lower()
file_map[patient_id.lower()] = os.path.join(data_dir, filename)
break
return file_map
def load_gait_data(file_path):
"""从文件加载步态数据(支持Excel、CSV和NPY)"""
if file_path.endswith('.npy'):
return np.load(file_path)
# 处理Excel和CSV文件
try:
if file_path.endswith(('.xlsx', '.xls')):
df = pd.read_excel(file_path, header=None)
elif file_path.endswith('.csv'):
df = pd.read_csv(file_path, header=None)
else:
raise ValueError(f"不支持的步态文件格式: {file_path}")
# 转换为NumPy数组并确保数值类型
gait_data = df.values.astype(np.float32)
# 验证数据维度(至少10列)
if gait_data.shape[1] < 10:
raise ValueError(f"步态数据列数不足(需要至少10列): {file_path}")
return gait_data
except Exception as e:
print(f"加载步态数据错误: {file_path} - {str(e)}")
return None
def load_and_validate_data(metadata_path='patient_metadata.csv', data_dir='gait_data'):
"""加载并验证数据完整性(支持Excel步态数据)"""
# 加载元数据
metadata = pd.read_csv(metadata_path, encoding='utf-8-sig')
# 清理患者ID
metadata['clean_id'] = metadata['patient_id'].apply(
lambda x: re.sub(r'[^\w]', '', str(x)).lower()
)
# 获取步态文件映射
gait_file_map = find_gait_data_files(data_dir)
# 添加文件路径到元数据
metadata['gait_path'] = metadata['clean_id'].map(gait_file_map)
# 标记缺失数据
metadata['data_complete'] = metadata['gait_path'].notnull()
# 分离完整和不完整数据
complete_data = metadata[metadata['data_complete']].copy()
incomplete_data = metadata[~metadata['data_complete']].copy()
print(f"总患者数: {len(metadata)}")
print(f"完整数据患者: {len(complete_data)}")
print(f"缺失步态数据患者: {len(incomplete_data)}")
if not incomplete_data.empty:
print("\n缺失步态数据的患者:")
print(incomplete_data[['patient_id', 'koa_grade']].to_string(index=False))
incomplete_data.to_csv('incomplete_records.csv', index=False)
return complete_data
# ---------------------------
# 2. 特征工程 (支持Excel步态数据)
# ---------------------------
def preprocess_static_features(metadata):
"""预处理静态元数据特征"""
# 定义特征类型
numeric_features = ['age', 'height_cm', 'weight_kg', 'bmi']
categorical_features = ['gender', 'dominant_leg', 'affected_side']
# 确保所有列存在
for col in numeric_features + categorical_features:
if col not in metadata.columns:
metadata[col] = np.nan
# 填充缺失值
metadata[numeric_features] = metadata[numeric_features].fillna(metadata[numeric_features].median())
metadata[categorical_features] = metadata[categorical_features].fillna('unknown')
# 创建预处理管道
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
],
remainder='drop' # 只处理指定特征
)
# 应用预处理
static_features = preprocessor.fit_transform(metadata)
feature_names = (
numeric_features +
list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
)
return static_features, feature_names
def extract_gait_features(gait_path):
"""从步态数据中提取特征(支持Excel)"""
gait_data = load_gait_data(gait_path)
if gait_data is None:
raise ValueError(f"无法加载步态数据: {gait_path}")
# 1. 时域特征
mean_values = np.mean(gait_data, axis=0)
std_values = np.std(gait_data, axis=0)
max_values = np.max(gait_data, axis=0)
min_values = np.min(gait_data, axis=0)
# 2. 频域特征 (FFT)
fft_values = np.abs(np.fft.rfft(gait_data, axis=0))
dominant_freq = np.argmax(fft_values, axis=0)
spectral_energy = np.sum(fft_values ** 2, axis=0)
# 3. 生物力学特征
# 左右腿不对称性 (假设前5列为右腿,后5列为左腿)
num_columns = gait_data.shape[1]
if num_columns >= 10:
# 标准情况:10列数据
asymmetry_index = np.mean(np.abs(gait_data[:, :5] - gait_data[:, 5:10]), axis=0)
else:
# 非标准情况:使用可用列
half_cols = num_columns // 2
asymmetry_index = np.mean(np.abs(gait_data[:, :half_cols] - gait_data[:, half_cols:half_cols * 2]), axis=0)
# 关节协调性 (髋-膝相位差)
if num_columns >= 8:
hip_knee_phase_diff = np.arctan2(gait_data[:, 2], gait_data[:, 1]) - np.arctan2(gait_data[:, 7],
gait_data[:, 6])
phase_diff_mean = np.mean(hip_knee_phase_diff)
phase_diff_std = np.std(hip_knee_phase_diff)
else:
phase_diff_mean = 0.0
phase_diff_std = 0.0
# 步态周期对称性
if num_columns >= 10:
gait_cycle_symmetry = np.corrcoef(gait_data[:, :5].flatten(), gait_data[:, 5:10].flatten())[0, 1]
elif num_columns >= 4:
gait_cycle_symmetry = np.corrcoef(gait_data[:, :2].flatten(), gait_data[:, 2:4].flatten())[0, 1]
else:
gait_cycle_symmetry = 0.0
# 组合所有特征
gait_features = np.concatenate([
mean_values, std_values, max_values, min_values,
dominant_freq, spectral_energy,
asymmetry_index,
[phase_diff_mean, phase_diff_std, gait_cycle_symmetry]
])
# 特征名称
base_names = [f'Ch_{i}' for i in range(num_columns)]
feature_names = []
for prefix in ['mean_', 'std_', 'max_', 'min_']:
feature_names.extend([prefix + name for name in base_names])
feature_names.extend(['dom_freq_' + name for name in base_names])
feature_names.extend(['spec_energy_' + name for name in base_names])
if num_columns >= 10:
asym_names = ['R.Hip_AP', 'R.Knee_Moment', 'R.KneeFlex', 'R.Gluteus', 'R.Vastus',
'L.Hip_AP', 'L.Knee_Moment', 'L.KneeFlex', 'L.Gluteus', 'L.Vastus']
feature_names.extend(['asym_' + name for name in asym_names])
else:
feature_names.extend([f'asym_{i}' for i in range(len(asymmetry_index))])
feature_names.extend(['phase_diff_mean', 'phase_diff_std', 'gait_symmetry'])
return gait_features, feature_names
def biomechanical_feature_enhancement(static_df, gait_features):
"""基于临床知识的特征增强"""
enhanced_features = gait_features.copy()
# 获取BMI值(如果存在)
if 'bmi' in static_df.columns:
bmi = static_df['bmi'].values[0]
bmi_factor = 1 + (bmi - 25) * 0.01 # 每增加1 BMI单位增加1%肌肉力量
else:
bmi_factor = 1.0
# 肌肉力量通道索引
num_features = len(gait_features)
if num_features > 50: # 标准特征数量
muscle_indices = [3, 4, 8, 9] # 肌肉相关特征索引
else:
# 非标准特征数量,使用启发式方法
muscle_indices = [i for i in range(40, 50) if i < num_features]
for idx in muscle_indices:
# 应用BMI调整
enhanced_features[idx] *= bmi_factor
# 获取年龄(如果存在)
if 'age' in static_df.columns:
age = static_df['age'].values[0]
stability_factor = max(0.8, 1 - (age - 60) * 0.005) # 60岁以上每岁减少0.5%稳定性
else:
stability_factor = 1.0
# 稳定性相关特征
stability_indices = []
if num_features > 50:
stability_indices = [1, 6, 40, 45, 50, 55, num_features - 3, num_features - 2, num_features - 1]
else:
# 尝试找到稳定性相关特征
stability_indices = [i for i in range(num_features - 5, num_features)]
for idx in stability_indices:
enhanced_features[idx] *= stability_factor
return enhanced_features
# ---------------------------
# 3. 特征融合模型 (保持不变)
# ---------------------------
def create_fusion_model(static_dim, gait_dim, num_classes):
"""创建特征融合模型"""
# 输入层
static_input = Input(shape=(static_dim,), name='static_input')
gait_input = Input(shape=(gait_dim,), name='gait_input')
# 静态特征处理
static_stream = Dense(64, activation='relu')(static_input)
static_stream = Dense(32, activation='relu')(static_stream)
# 步态特征处理
gait_stream = Dense(128, activation='relu')(gait_input)
gait_stream = Dense(64, activation='relu')(gait_stream)
# 注意力融合机制
attention_scores = Dense(64, activation='sigmoid')(gait_stream)
attended_static = Multiply()([static_stream, attention_scores])
# 特征融合
fused = Concatenate()([attended_static, gait_stream])
# 分类层
x = Dense(128, activation='relu')(fused)
x = Dense(64, activation='relu')(x)
output = Dense(num_classes, activation='softmax', name='koa_grade')(x)
return Model(inputs=[static_input, gait_input], outputs=output)
def create_1d_cnn_fusion_model(static_dim, gait_timesteps, gait_channels, num_classes):
"""创建包含原始时间序列的融合模型"""
# 输入层
static_input = Input(shape=(static_dim,), name='static_input')
gait_input = Input(shape=(gait_timesteps, gait_channels), name='gait_input')
# 静态特征处理
static_stream = Dense(64, activation='relu')(static_input)
static_stream = Dense(32, activation='relu')(static_stream)
# 时间序列特征提取
gait_stream = Conv1D(64, 7, activation='relu', padding='same')(gait_input)
gait_stream = Conv1D(128, 5, activation='relu', padding='same')(gait_stream)
gait_stream = GlobalAveragePooling1D()(gait_stream)
# 特征融合
fused = Concatenate()([static_stream, gait_stream])
# 分类层
x = Dense(128, activation='relu')(fused)
x = Dense(64, activation='relu')(x)
output = Dense(num_classes, activation='softmax', name='koa_grade')(x)
return Model(inputs=[static_input, gait_input], outputs=output)
# ---------------------------
# 4. 训练与评估 (支持Excel步态数据)
# ---------------------------
def train_model(metadata, model_type='features'):
"""训练特征融合模型(支持Excel步态数据)"""
# 预处理静态特征
static_features, static_feature_names = preprocess_static_features(metadata)
# 提取步态特征
gait_features_list = []
gait_feature_names = None
gait_data_list = [] # 原始时间序列数据
for idx, row in metadata.iterrows():
gait_path = row['gait_path']
try:
gait_features, gait_feature_names = extract_gait_features(gait_path)
# 应用生物力学增强
enhanced_features = biomechanical_feature_enhancement(row.to_frame().T, gait_features)
gait_features_list.append(enhanced_features)
# 保存原始时间序列数据(用于CNN模型)
gait_data = load_gait_data(gait_path)
gait_data_list.append(gait_data)
except Exception as e:
print(f"处理患者 {row['patient_id']} 的步态数据时出错: {str(e)}")
continue
if not gait_features_list:
raise ValueError("无法提取任何步态特征,请检查数据格式")
gait_features = np.array(gait_features_list)
# 目标变量
y = metadata['koa_grade'].values
num_classes = len(np.unique(y))
# 划分训练测试集
X_static_train, X_static_test, X_gait_train, X_gait_test, y_train, y_test = train_test_split(
static_features, gait_features, y, test_size=0.2, random_state=42, stratify=y
)
# 创建模型
if model_type == 'features':
model = create_fusion_model(
static_dim=static_features.shape[1],
gait_dim=gait_features.shape[1],
num_classes=num_classes
)
train_input = [X_static_train, X_gait_train]
test_input = [X_static_test, X_gait_test]
else: # 使用原始时间序列
# 填充时间序列到相同长度
max_timesteps = max(len(data) for data in gait_data_list)
X_gait_padded = np.array([np.pad(data, ((0, max_timesteps - len(data)), (0, 0)), 'constant')
for data in gait_data_list])
# 重新划分数据集
X_static_train, X_static_test, X_gait_train, X_gait_test, y_train, y_test = train_test_split(
static_features, X_gait_padded, y, test_size=0.2, random_state=42, stratify=y
)
model = create_1d_cnn_fusion_model(
static_dim=static_features.shape[1],
gait_timesteps=X_gait_padded.shape[1],
gait_channels=X_gait_padded.shape[2],
num_classes=num_classes
)
train_input = [X_static_train, X_gait_train]
test_input = [X_static_test, X_gait_test]
# 编译模型
model.compile(
optimizer='adam',
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
# 训练模型
history = model.fit(
train_input, y_train,
validation_split=0.1,
epochs=100,
batch_size=16,
callbacks=[
EarlyStopping(patience=20, restore_best_weights=True),
ReduceLROnPlateau(factor=0.5, patience=10)
]
)
# 评估模型
test_loss, test_acc = model.evaluate(test_input, y_test)
print(f"\n测试准确率: {test_acc:.4f}")
# 特征重要性分析
if model_type == 'features':
analyze_feature_importance(model, X_static_test, X_gait_test, y_test,
static_feature_names, gait_feature_names)
return model, history
def analyze_feature_importance(model, X_static, X_gait, y_true, static_names, gait_names):
"""分析特征重要性"""
# 获取模型预测
y_pred = model.predict([X_static, X_gait])
y_pred_classes = np.argmax(y_pred, axis=1)
# 计算混淆矩阵
cm = confusion_matrix(y_true, y_pred_classes)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=["G0", "G1", "G2", "G3", "G4"],
yticklabels=["G0", "G1", "G2", "G3", "G4"])
plt.xlabel('预测分级')
plt.ylabel('真实分级')
plt.title('KL分级混淆矩阵')
plt.savefig('confusion_matrix.png', dpi=300)
plt.close()
# SHAP特征重要性分析
try:
import shap
# 创建解释器
background = [X_static[:50], X_gait[:50]]
explainer = shap.KernelExplainer(model.predict, background)
# 计算SHAP值
shap_values = explainer.shap_values(background)
# 组合特征名称
all_feature_names = static_names + gait_names
# 可视化
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, features=np.hstack(background),
feature_names=all_feature_names, plot_type='bar')
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300)
plt.close()
except ImportError:
print("SHAP库未安装,跳过特征重要性分析")
# ---------------------------
# 5. 主工作流程 (支持Excel步态数据)
# ---------------------------
def main():
print("=" * 50)
print("KOA分级诊疗平台 - 特征融合系统 (支持Excel步态数据)")
print("=" * 50)
# 步骤1: 加载并验证数据
print("\n[步骤1] 加载数据并验证完整性...")
complete_data = load_and_validate_data()
if len(complete_data) < 20:
print("\n错误: 完整数据样本不足,无法训练模型")
return
# 步骤2: 训练特征融合模型
print("\n[步骤2] 训练特征融合模型...")
print("选项1: 使用提取的特征 (更快)")
print("选项2: 使用原始时间序列 (更准确)")
choice = input("请选择模型类型 (1/2): ").strip()
model_type = 'features' if choice == '1' else 'time_series'
model, history = train_model(complete_data, model_type)
# 保存模型
model.save(f'koa_fusion_model_{model_type}.h5')
print(f"模型已保存为 koa_fusion_model_{model_type}.h5")
# 绘制训练历史
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='训练准确率')
plt.plot(history.history['val_accuracy'], label='验证准确率')
plt.title('模型准确率')
plt.ylabel('准确率')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='训练损失')
plt.plot(history.history['val_loss'], label='验证损失')
plt.title('模型损失')
plt.ylabel('损失')
plt.xlabel('Epoch')
plt.legend()
plt.tight_layout()
plt.savefig('training_history.png', dpi=300)
print("训练历史图已保存为 training_history.png")
print("\n特征融合模型训练完成!")
if __name__ == "__main__":
main()
上面是源代码,下面是报错,我是小白,你尽可能简单的手把手教我在原代码上怎么修改File "C:\Users\14816\Desktop\untitled1\yuchuli.py", line 524, in <module>
main()
File "C:\Users\14816\Desktop\untitled1\yuchuli.py", line 479, in main
complete_data = load_and_validate_data()
^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\14816\Desktop\untitled1\yuchuli.py", line 80, in load_and_validate_data
gait_file_map = find_gait_data_files(data_dir)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\14816\Desktop\untitled1\yuchuli.py", line 31, in find_gait_data_files
for filename in os.listdir(data_dir):
^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'gait_data'
==================================================
KOA分级诊疗平台 - 特征融合系统 (支持Excel步态数据)
==================================================