import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Dense, Concatenate, Multiply, GlobalAveragePooling1D, Conv1D
from tensorflow.keras.models import Model
# ---------------------------
# 1. 数据加载与完整性验证
# ---------------------------
def find_gait_data_files(data_dir='gait_data'):
    """Auto-discover gait data files and map patient IDs to file paths.

    Generalized beyond .npy: gait recordings may also be Excel/CSV
    exports (.xlsx/.xls/.csv), since the clinical data is delivered as
    Excel workbooks.

    Recognized filename shapes (case-insensitive):
        koa_t25.npy / koat25.xlsx      -> id "t25"
        patient_t25.csv                -> id "t25"
        t25_gait.xls                   -> id "t25"

    Returns:
        dict mapping lowercase patient id -> full file path.
    """
    supported_exts = ('.npy', '.xlsx', '.xls', '.csv')
    # One extension alternation shared by all patterns; anchored at the
    # end so the id group cannot swallow part of the extension.
    ext_re = r'\.(?:npy|xlsx|xls|csv)$'
    patterns = [
        r'koa_?(\w+)' + ext_re,      # koa_t25.npy / koat25.xlsx (IGNORECASE covers KOA_*)
        r'patient_?(\w+)' + ext_re,  # patient_t25.csv
        r'(\w+)_gait' + ext_re,      # t25_gait.xls
    ]
    file_map = {}
    for filename in os.listdir(data_dir):
        if not filename.lower().endswith(supported_exts):
            continue
        for pattern in patterns:
            match = re.match(pattern, filename, re.IGNORECASE)
            if match:
                patient_id = match.group(1)
                file_map[patient_id.lower()] = os.path.join(data_dir, filename)
                break
    return file_map
def load_and_validate_data(metadata_path='patient_metadata.csv', data_dir='gait_data'):
    """Load patient metadata and verify each patient has a gait recording.

    Rows without a matching gait file are reported on stdout and dumped
    to 'incomplete_records.csv'; only complete rows are returned.

    Args:
        metadata_path: CSV with at least 'patient_id' and 'koa_grade'
            columns ('utf-8-sig' handles an Excel BOM).
        data_dir: directory scanned by find_gait_data_files().

    Returns:
        DataFrame of complete rows, with added 'clean_id', 'gait_path'
        and 'data_complete' columns.
    """
    metadata = pd.read_csv(metadata_path, encoding='utf-8-sig')

    # Normalize patient IDs: drop non-word characters, lowercase.
    # BUG FIX: original called `.str.lower()` (a pandas Series accessor)
    # on the plain str inside apply -> AttributeError; it was also
    # missing the closing parenthesis of the apply() call.
    metadata['clean_id'] = metadata['patient_id'].apply(
        lambda x: re.sub(r'[^\w]', '', str(x)).lower()
    )

    # Map normalized IDs onto discovered gait files.
    gait_file_map = find_gait_data_files(data_dir)
    metadata['gait_path'] = metadata['clean_id'].map(gait_file_map)
    metadata['data_complete'] = metadata['gait_path'].notnull()

    complete_data = metadata[metadata['data_complete']].copy()
    incomplete_data = metadata[~metadata['data_complete']].copy()

    print(f"总患者数: {len(metadata)}")
    print(f"完整数据患者: {len(complete_data)}")
    print(f"缺失步态数据患者: {len(incomplete_data)}")
    if not incomplete_data.empty:
        print("\n缺失步态数据的患者:")
        print(incomplete_data[['patient_id', 'koa_grade']])
        incomplete_data.to_csv('incomplete_records.csv', index=False)
    return complete_data
# ---------------------------
# 2. 特征工程
# ---------------------------
def preprocess_static_features(metadata):
    """Preprocess static (per-patient) metadata into a feature matrix.

    Numeric columns are median-imputed then standardized; categorical
    columns are 'unknown'-imputed then one-hot encoded. Any expected
    column absent from the input is created as all-NaN first so the
    pipeline never raises KeyError.

    NOTE: mutates `metadata` in place (imputation / added columns).

    Returns:
        (feature_matrix, feature_names) — matrix rows align with
        `metadata` rows; names list numeric columns followed by the
        one-hot encoded category names.
    """
    numeric_features = ['age', 'height_cm', 'weight_kg', 'bmi']
    categorical_features = ['gender', 'dominant_leg', 'affected_side']

    # Guarantee every expected column exists.
    for col in numeric_features + categorical_features:
        if col not in metadata.columns:
            metadata[col] = np.nan

    # Impute missing values.
    metadata[numeric_features] = metadata[numeric_features].fillna(metadata[numeric_features].median())
    metadata[categorical_features] = metadata[categorical_features].fillna('unknown')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='drop'  # only the listed features are used
    )

    static_features = preprocessor.fit_transform(metadata)
    # BUG FIX: the closing parenthesis of this expression was missing
    # in the original (syntax error).
    feature_names = (
        numeric_features +
        list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
    )
    return static_features, feature_names
def extract_gait_features(gait_path):
"""从步态数据中提取特征"""
gait_data = np.load(gait_path)
# 1. 时域特征
mean_values = np.mean(gait_data, axis=0)
std_values = np.std(gait_data, axis=0)
max_values = np.max(gait_data, axis=0)
min_values = np.min(gait_data, axis=0)
# 2. 频域特征 (FFT)
fft_values = np.abs(np.fft.rfft(gait_data, axis=0))
dominant_freq = np.argmax(fft_values, axis=0)
spectral_energy = np.sum(fft_values**2, axis=0)
# 3. 生物力学特征
# 左右腿不对称性
asymmetry_index = np.mean(np.abs(gait_data[:, :5] - gait_data[:, 5:10]), axis=0)
# 关节协调性 (髋-膝相位差)
hip_knee_phase_diff = np.arctan2(gait_data[:, 2], gait_data[:, 1]) - np.arctan2(gait_data[:, 7], gait_data[:, 6])
phase_diff_mean = np.mean(hip_knee_phase_diff)
phase_diff_std = np.std(hip_knee_phase_diff)
# 步态周期对称性
gait_cycle_symmetry = np.corrcoef(gait_data[:, :5].flatten(), gait_data[:, 5:10].flatten())[0, 1]
# 组合所有特征
gait_features = np.concatenate([
mean_values, std_values, max_values, min_values,
dominant_freq, spectral_energy,
asymmetry_index,
[phase_diff_mean, phase_diff_std, gait_cycle_symmetry]
])
# 特征名称
base_names = [
'R.Hip_AP', 'R.Knee_Moment', 'R.KneeFlex', 'R.Gluteus', 'R.Vastus',
'L.Hip_AP', 'L.Knee_Moment', 'L.KneeFlex', 'L.Gluteus', 'L.Vastus'
]
feature_names = []
for prefix in ['mean_', 'std_', 'max_', 'min_']:
feature_names.extend([prefix + name for name in base_names])
feature_names.extend(['dom_freq_' + name for name in base_names])
feature_names.extend(['spec_energy_' + name for name in base_names])
feature_names.extend(['asym_' + name for name in base_names])
feature_names.extend(['phase_diff_mean', 'phase_diff_std', 'gait_symmetry'])
return gait_features, feature_names
def biomechanical_feature_enhancement(static_df, gait_features):
    """Clinically-informed scaling of the extracted gait features.

    Selected entries are multiplied by factors derived from the
    patient's BMI and age. The input array is not modified; a scaled
    copy is returned.

    Args:
        static_df: single-row DataFrame with 'bmi' and 'age' columns.
        gait_features: 1-D feature vector from extract_gait_features().
    """
    adjusted = gait_features.copy()

    # BMI adjustment of muscle-force channels: +1% per BMI unit above 25.
    bmi = static_df['bmi'].values[0]
    bmi_factor = 1 + (bmi - 25) * 0.01
    # Channels 3/4 (R.Gluteus, R.Vastus) and 8/9 (L.Gluteus, L.Vastus);
    # the offsets pick each channel's mean/std/max/min/spectral-energy slot.
    for channel in (3, 4, 8, 9):
        for offset in (0, 10, 20, 30, 50):
            adjusted[channel + offset] *= bmi_factor

    # Age-based stability attenuation: -0.5% per year past 60, floored at 0.8.
    age = static_df['age'].values[0]
    stability_factor = max(0.8, 1 - (age - 60) * 0.005)
    n = len(gait_features)
    # NOTE(review): indices 40/45/50/55 land in the dominant-frequency and
    # spectral-energy slots of the hip channels, not in the asymmetry /
    # phase-difference features as the original comment claimed — confirm
    # the intended feature layout before relying on these slots.
    stability_slots = [
        1, 6,               # knee moments
        2, 7,               # knee flexion
        40, 45, 50, 55,     # see review note above
        n - 3, n - 2, n - 1 # phase-diff mean/std and gait symmetry
    ]
    for slot in stability_slots:
        adjusted[slot] *= stability_factor
    return adjusted
# ---------------------------
# 3. 特征融合模型
# ---------------------------
def create_fusion_model(static_dim, gait_dim, num_classes):
    """Feature-level fusion network for KOA (KL) grade classification.

    Two dense streams (static metadata, engineered gait features) are
    fused after a sigmoid attention gate — computed from the gait
    stream — modulates the static embedding.

    Args:
        static_dim: width of the static feature vector.
        gait_dim: width of the gait feature vector.
        num_classes: number of KL grades (softmax output size).
    """
    static_input = Input(shape=(static_dim,), name='static_input')
    gait_input = Input(shape=(gait_dim,), name='gait_input')

    # Static metadata stream: 64 -> 32.
    static_stream = Dense(64, activation='relu')(static_input)
    static_stream = Dense(32, activation='relu')(static_stream)

    # Gait feature stream: 128 -> 64.
    gait_stream = Dense(128, activation='relu')(gait_input)
    gait_stream = Dense(64, activation='relu')(gait_stream)

    # Attention gate over the static embedding.
    # BUG FIX: the gate width must equal the static stream width (32);
    # the original Dense(64) made Multiply() fail with a
    # (None, 32) x (None, 64) shape mismatch.
    attention_scores = Dense(32, activation='sigmoid')(gait_stream)
    attended_static = Multiply()([static_stream, attention_scores])

    # Fuse the gated static embedding with the gait embedding.
    fused = Concatenate()([attended_static, gait_stream])

    # Classification head.
    x = Dense(128, activation='relu')(fused)
    x = Dense(64, activation='relu')(x)
    output = Dense(num_classes, activation='softmax', name='koa_grade')(x)
    return Model(inputs=[static_input, gait_input], outputs=output)
def create_1d_cnn_fusion_model(static_dim, gait_timesteps, gait_channels, num_classes):
    """Fusion network operating on the raw gait time series.

    A small 1-D CNN summarizes the (timesteps, channels) gait signal via
    global average pooling; the result is concatenated with a dense
    embedding of the static metadata and classified with softmax.
    """
    static_input = Input(shape=(static_dim,), name='static_input')
    gait_input = Input(shape=(gait_timesteps, gait_channels), name='gait_input')

    # Metadata branch: 64 -> 32 dense embedding.
    meta = Dense(64, activation='relu')(static_input)
    meta = Dense(32, activation='relu')(meta)

    # Temporal branch: two same-padded convolutions, then pooled over time.
    seq = Conv1D(64, 7, activation='relu', padding='same')(gait_input)
    seq = Conv1D(128, 5, activation='relu', padding='same')(seq)
    seq = GlobalAveragePooling1D()(seq)

    # Concatenate the two embeddings and classify.
    merged = Concatenate()([meta, seq])
    hidden = Dense(128, activation='relu')(merged)
    hidden = Dense(64, activation='relu')(hidden)
    probs = Dense(num_classes, activation='softmax', name='koa_grade')(hidden)
    return Model(inputs=[static_input, gait_input], outputs=probs)
# ---------------------------
# 4. 训练与评估
# ---------------------------
def train_model(metadata, model_type='features'):
    """Train a fusion model from static metadata plus gait recordings.

    Args:
        metadata: DataFrame with one row per patient; must carry
            'gait_path' and 'koa_grade' columns.
        model_type: 'features' trains the engineered-feature model;
            any other value trains the raw-time-series CNN model.

    Returns:
        (model, history) — the trained Keras model and its fit history.
    """
    # Preprocess static metadata into a feature matrix.
    static_features, static_feature_names = preprocess_static_features(metadata)

    # Per-patient gait feature extraction.
    gait_features_list = []
    gait_feature_names = None
    gait_data_list = []  # raw time series, kept for the CNN variant
    for idx, row in metadata.iterrows():
        gait_path = row['gait_path']
        gait_features, gait_feature_names = extract_gait_features(gait_path)
        # Clinically-informed scaling using this patient's metadata.
        enhanced_features = biomechanical_feature_enhancement(row.to_frame().T, gait_features)
        gait_features_list.append(enhanced_features)
        # Load the raw series too. BUG FIX: np.load only handles .npy;
        # the gait data is delivered as Excel, so support .xlsx/.xls/.csv.
        ext = os.path.splitext(gait_path)[1].lower()
        if ext == '.npy':
            raw = np.load(gait_path)
        else:
            reader = pd.read_excel if ext in ('.xlsx', '.xls') else pd.read_csv
            frame = reader(gait_path, header=None).apply(pd.to_numeric, errors='coerce')
            if len(frame) and frame.iloc[0].isna().all():
                frame = frame.iloc[1:]  # first row was a text header
            raw = frame.to_numpy(dtype=float)
        gait_data_list.append(raw)
    gait_features = np.array(gait_features_list)

    # Encode labels as contiguous 0..k-1 integers.
    # BUG FIX: sparse_categorical_crossentropy requires 0-based integer
    # class indices; raw grades may be strings or a non-zero-based range.
    # (No-op when grades are already 0..k-1 ints.)
    classes, y = np.unique(metadata['koa_grade'].values, return_inverse=True)
    num_classes = len(classes)

    # Stratified train/test split on the engineered features.
    X_static_train, X_static_test, X_gait_train, X_gait_test, y_train, y_test = train_test_split(
        static_features, gait_features, y, test_size=0.2, random_state=42, stratify=y
    )

    if model_type == 'features':
        model = create_fusion_model(
            static_dim=static_features.shape[1],
            gait_dim=gait_features.shape[1],
            num_classes=num_classes
        )
        train_input = [X_static_train, X_gait_train]
        test_input = [X_static_test, X_gait_test]
    else:  # raw time-series variant
        # Zero-pad every recording to the longest one.
        max_timesteps = max(len(data) for data in gait_data_list)
        X_gait_padded = np.array([
            np.pad(data, ((0, max_timesteps - len(data)), (0, 0)), 'constant')
            for data in gait_data_list
        ])
        # Re-split so the gait inputs are the padded raw series
        # (same random_state keeps the split consistent).
        X_static_train, X_static_test, X_gait_train, X_gait_test, y_train, y_test = train_test_split(
            static_features, X_gait_padded, y, test_size=0.2, random_state=42, stratify=y
        )
        model = create_1d_cnn_fusion_model(
            static_dim=static_features.shape[1],
            gait_timesteps=X_gait_padded.shape[1],
            gait_channels=X_gait_padded.shape[2],
            num_classes=num_classes
        )
        train_input = [X_static_train, X_gait_train]
        test_input = [X_static_test, X_gait_test]

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        train_input, y_train,
        validation_split=0.1,
        epochs=100,
        batch_size=16,
        callbacks=[
            EarlyStopping(patience=20, restore_best_weights=True),
            ReduceLROnPlateau(factor=0.5, patience=10)
        ]
    )

    test_loss, test_acc = model.evaluate(test_input, y_test)
    print(f"\n测试准确率: {test_acc:.4f}")

    # Feature importance only makes sense for the engineered-feature model.
    if model_type == 'features':
        analyze_feature_importance(model, X_static_test, X_gait_test, y_test,
                                   static_feature_names, gait_feature_names)
    return model, history
def analyze_feature_importance(model, X_static, X_gait, y_true, static_names, gait_names):
    """Plot a confusion matrix and (optionally) SHAP feature importances.

    Saves 'confusion_matrix.png' always and 'feature_importance.png'
    when the shap package is available. `confusion_matrix` comes from
    sklearn.metrics (the original forgot the import).
    """
    # Model predictions -> hard class indices.
    y_pred = model.predict([X_static, X_gait])
    y_pred_classes = np.argmax(y_pred, axis=1)

    # Confusion matrix.
    # BUG FIX: tick labels were hardcoded to G0..G4, which makes seaborn
    # fail whenever the dataset contains a different number of classes;
    # derive them from the observed labels instead.
    class_ids = np.unique(np.concatenate([np.asarray(y_true), y_pred_classes]))
    tick_labels = [f"G{c}" for c in class_ids]
    cm = confusion_matrix(y_true, y_pred_classes, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.xlabel('预测分级')
    plt.ylabel('真实分级')
    plt.title('KL分级混淆矩阵')
    plt.savefig('confusion_matrix.png', dpi=300)
    plt.close()

    # SHAP feature importance (best effort — shap is optional).
    try:
        import shap
        # NOTE(review): KernelExplainer with a list-of-arrays background
        # on a multi-input Keras model is fragile — verify against the
        # installed shap version.
        explainer = shap.KernelExplainer(model.predict, [X_static[:50], X_gait[:50]])
        shap_values = explainer.shap_values([X_static[:50], X_gait[:50]])
        all_feature_names = static_names + gait_names
        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, feature_names=all_feature_names, plot_type='bar')
        plt.tight_layout()
        plt.savefig('feature_importance.png', dpi=300)
        plt.close()
    except ImportError:
        print("SHAP库未安装,跳过特征重要性分析")
# ---------------------------
# 5. 主工作流程
# ---------------------------
def main():
    """Entry point: load data, train the chosen fusion model, save artifacts."""
    print("="*50)
    print("KOA分级诊疗平台 - 特征融合系统")
    print("="*50)

    # Step 1: load metadata and keep only patients with gait recordings.
    print("\n[步骤1] 加载数据并验证完整性...")
    complete_data = load_and_validate_data()
    if len(complete_data) < 20:
        print("\n错误: 完整数据样本不足,无法训练模型")
        return

    # Step 2: let the user pick a model variant and train it.
    print("\n[步骤2] 训练特征融合模型...")
    print("选项1: 使用提取的特征 (更快)")
    print("选项2: 使用原始时间序列 (更准确)")
    choice = input("请选择模型类型 (1/2): ").strip()
    model_type = 'features' if choice == '1' else 'time_series'
    model, history = train_model(complete_data, model_type)

    # Persist the trained model.
    model.save(f'koa_fusion_model_{model_type}.h5')
    print(f"模型已保存为 koa_fusion_model_{model_type}.h5")

    # Plot accuracy and loss curves side by side.
    plt.figure(figsize=(12, 6))
    panels = [
        ('accuracy', 'val_accuracy', '训练准确率', '验证准确率', '模型准确率', '准确率'),
        ('loss', 'val_loss', '训练损失', '验证损失', '模型损失', '损失'),
    ]
    for position, (train_key, val_key, train_label, val_label, title, ylabel) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        plt.legend()
    plt.tight_layout()
    plt.savefig('training_history.png', dpi=300)
    print("训练历史图已保存为 training_history.png")
    print("\n特征融合模型训练完成!")
if __name__ == "__main__":
    # NOTE(review): the gait data files are supplied as Excel (.xlsx),
    # not .npy — file discovery and loading must handle that format.
    # (This note was originally pasted as stray prose fused onto the
    # main() call, which made the file a syntax error.)
    main()