import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report, roc_curve, auc,
roc_auc_score)
from sklearn.preprocessing import StandardScaler, LabelBinarizer
import os
import re
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# Font configuration (only needed if any rendered text contains CJK characters)
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render CJK labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
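# SimHei is usually only present on Windows; when it is missing, matplotlib
# silently falls back and CJK text renders as boxes. Listing several fallback
# candidates is a common workaround (a sketch — which fonts exist depends on the OS):
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS', 'sans-serif']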
def load_data(file_path):
    """Load the STR profile data from an Excel file."""
    # Bug fix: the original ignored file_path and read from a hard-coded
    # absolute C:\ path, so the existence check in main() and the actual
    # read could point at two different files
    df = pd.read_excel(file_path)
    print(f"Data loaded, shape: {df.shape}")
    return df
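# A minimal sanity check, assuming the column layout the rest of this script
# relies on ('Sample File', 'Marker', plus numbered 'Allele i'/'Size i'/'Height i'
# columns); defined here but only called if needed:
def check_expected_columns(df):
    """Warn about columns the downstream feature extraction expects but lacks."""
    expected = ['Sample File', 'Marker', 'Allele 1', 'Size 1', 'Height 1']
    missing = [c for c in expected if c not in df.columns]
    if missing:
        print(f"Warning: missing expected columns: {missing}")
    return missing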
def extract_contributor_info(df):
"""从样本文件名中提取贡献者信息"""
pattern = r'RD14-0003-([0-9_]+)-([0-9;]+)-'
# 提取信息并创建新列
df['sample_id'] = range(len(df)) # 添加样本序号
# 初始化列
df['contributor_count'] = np.nan
df['contributor_ids'] = None
df['mixing_ratio'] = None
for idx, row in df.iterrows():
sample_file = row['Sample File']
if pd.isna(sample_file):
continue
match = re.search(pattern, str(sample_file))
if match:
contributor_ids = match.group(1).split('_')
mixing_ratios = match.group(2).split(';')
df.at[idx, 'contributor_count'] = len(contributor_ids)
df.at[idx, 'contributor_ids'] = ','.join(contributor_ids)
df.at[idx, 'mixing_ratio'] = ';'.join(mixing_ratios)
    # Convert contributor_count to a nullable integer type
    df['contributor_count'] = df['contributor_count'].astype('Int64')
return df
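# A quick illustration of what the regex above extracts, using a made-up file
# name that follows the assumed naming convention (hypothetical data, defined
# but not called):
def _demo_contributor_parsing():
    example = 'RD14-0003-40_41-1;4-M2a10-0.5IP.fsa'  # hypothetical file name
    m = re.search(r'RD14-0003-([0-9_]+)-([0-9;]+)-', example)
    if m:
        print('contributor ids:', m.group(1).split('_'))  # -> ['40', '41']
        print('mixing ratios  :', m.group(2).split(';'))  # -> ['1', '4']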
def create_visualizations(df):
"""创建数据可视化"""
import re
save_dir = r"E:\数模\深圳杯\老哥D题\2025深圳杯&东三省数学建模D题思路解析\2025深圳杯D题【完整代码】\图0"
os.makedirs(save_dir, exist_ok=True)
plt.figure(figsize=(15, 12))
# 1. 贡献者数量分布
plt.subplot(2, 2, 1)
contributor_counts = df['contributor_count'].value_counts().sort_index()
ax = sns.barplot(x=contributor_counts.index, y=contributor_counts.values, palette='viridis')
for i, v in enumerate(contributor_counts.values):
ax.text(i, v + 5, str(v), ha='center')
    title1 = 'Distribution of contributor counts'
    plt.title(title1, fontsize=14)
    plt.xlabel('Number of contributors', fontsize=12)
    plt.ylabel('Number of samples', fontsize=12)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    # Save the figure (note: savefig captures the whole current canvas,
    # i.e. every subplot drawn so far, not just this panel)
    safe_title = re.sub(r'[\\/:*?"<>|\s]', '_', title1)
    plt.savefig(os.path.join(save_dir, f"{safe_title}.png"), bbox_inches='tight', dpi=300)
    # 2. Allele-count distribution by number of contributors
plt.subplot(2, 2, 2)
allele_counts = []
for _, group in df.groupby(['Sample File', 'Marker']):
count = 0
for i in range(1, 21):
col = f'Allele {i}'
if col in group.columns and not group[col].isnull().all():
allele_val = group[col].iloc[0]
if pd.notna(allele_val) and str(allele_val) != 'nan':
count += 1
if count > 0 and 'contributor_count' in group.columns:
contributor_count = group['contributor_count'].iloc[0]
if pd.notna(contributor_count):
allele_counts.append((int(contributor_count), count))
allele_df = pd.DataFrame(allele_counts, columns=['contributor_count', 'allele_count'])
sns.boxplot(x='contributor_count', y='allele_count', data=allele_df, palette='viridis')
sns.stripplot(x='contributor_count', y='allele_count', data=allele_df, size=4, color='.3', alpha=0.5)
    title2 = 'Allele count by number of contributors'
    plt.title(title2, fontsize=14)
    plt.xlabel('Number of contributors', fontsize=12)
    plt.ylabel('Number of alleles', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
safe_title = re.sub(r'[\\/:*?"<>|\s]', '_', title2)
plt.savefig(os.path.join(save_dir, f"{safe_title}.png"), bbox_inches='tight', dpi=300)
    # 3. Peak-height distribution
plt.subplot(2, 2, 3)
heights = []
for i in range(1, 21):
height_col = f'Height {i}'
if height_col in df.columns:
valid_heights = df[height_col].dropna()
heights.extend(valid_heights[valid_heights > 0])
sns.histplot(heights, bins=50, kde=True, color='darkblue', alpha=0.7)
    title3 = 'Peak-height distribution'
    plt.title(title3, fontsize=14)
    plt.xlabel('Peak height', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
plt.xscale('log')
plt.grid(axis='y', linestyle='--', alpha=0.7)
safe_title = re.sub(r'[\\/:*?"<>|\s]', '_', title3)
plt.savefig(os.path.join(save_dir, f"{safe_title}.png"), bbox_inches='tight', dpi=300)
    # 4. Contributor count vs. peak-height ratio
plt.subplot(2, 2, 4)
ratio_data = []
for _, group in df.groupby(['Sample File', 'Marker']):
heights = []
for i in range(1, 21):
height_col = f'Height {i}'
if height_col in group.columns and not group[height_col].isnull().all():
height_val = group[height_col].iloc[0]
if pd.notna(height_val) and height_val > 0:
heights.append(height_val)
if len(heights) >= 2 and 'contributor_count' in group.columns:
contributor_count = group['contributor_count'].iloc[0]
if pd.notna(contributor_count):
heights.sort(reverse=True)
ratio = heights[0] / heights[1]
ratio_data.append((int(contributor_count), ratio))
ratio_df = pd.DataFrame(ratio_data, columns=['contributor_count', 'peak_ratio'])
sns.boxplot(x='contributor_count', y='peak_ratio', data=ratio_df, palette='viridis')
sns.stripplot(x='contributor_count', y='peak_ratio', data=ratio_df, size=4, color='.3', alpha=0.5)
    title4 = 'Contributor count vs. peak-height ratio'
    plt.title(title4, fontsize=14)
    plt.xlabel('Number of contributors', fontsize=12)
    plt.ylabel('Peak-height ratio (highest / second highest)', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
safe_title = re.sub(r'[\\/:*?"<>|\s]', '_', title4)
plt.savefig(os.path.join(save_dir, f"{safe_title}.png"), bbox_inches='tight', dpi=300)
plt.tight_layout()
plt.show()
print("数据可视化完成")
def extract_features(df):
"""提取特征用于机器学习模型"""
print("开始提取特征...")
# 按样本文件和标记分组处理
features_list = []
for (sample_file, marker), group in df.groupby(['Sample File', 'Marker']):
        # Skip groups with a missing contributor count
if 'contributor_count' not in group.columns or group['contributor_count'].isnull().all():
continue
contributor_count = int(group['contributor_count'].iloc[0])
        # Basic identifying information
feature = {
'sample_file': sample_file,
'marker': marker,
'contributor_count': contributor_count,
            'sample_id': group['sample_id'].iloc[0]  # running index used as an identifier
}
        # Collect allele and peak-height data
alleles = []
heights = []
non_ol_alleles = 0
sizes = []
        for i in range(1, 21):  # assume at most 20 allele columns
allele_col = f'Allele {i}'
height_col = f'Height {i}'
size_col = f'Size {i}'
if (allele_col in group.columns and height_col in group.columns and
not group[allele_col].isnull().all() and not group[height_col].isnull().all()):
allele = group[allele_col].iloc[0]
height = group[height_col].iloc[0]
if pd.notna(allele) and pd.notna(height) and height > 0:
alleles.append(str(allele))
heights.append(height)
                    # Record the fragment size if available
if size_col in group.columns and not group[size_col].isnull().all():
size = group[size_col].iloc[0]
if pd.notna(size):
sizes.append(size)
                    # Count alleles that are not off-ladder ('OL')
if str(allele) != 'OL':
non_ol_alleles += 1
        # Allele-count features
feature['allele_count'] = len(alleles)
feature['non_ol_allele_count'] = non_ol_alleles
        # In theory each contributor adds two alleles per marker
expected_alleles = 2 * contributor_count
feature['expected_allele_ratio'] = len(alleles) / expected_alleles if expected_alleles > 0 else 0
        # Threshold-based indicator features
feature['exceeds_2_alleles'] = 1 if len(alleles) > 2 else 0
feature['exceeds_4_alleles'] = 1 if len(alleles) > 4 else 0
feature['exceeds_6_alleles'] = 1 if len(alleles) > 6 else 0
feature['exceeds_8_alleles'] = 1 if len(alleles) > 8 else 0
        # Proportion of off-ladder (OL) alleles
feature['ol_allele_ratio'] = (len(alleles) - non_ol_alleles) / len(alleles) if len(alleles) > 0 else 0
        # Peak-height features
if heights:
feature['height_count'] = len(heights)
feature['height_mean'] = np.mean(heights)
feature['height_std'] = np.std(heights) if len(heights) > 1 else 0
feature['height_min'] = np.min(heights)
feature['height_max'] = np.max(heights)
feature['height_range'] = feature['height_max'] - feature['height_min']
            # Coefficient of variation
feature['height_cv'] = feature['height_std'] / feature['height_mean'] if feature['height_mean'] > 0 else 0
            # Peak-height distribution features
sorted_heights = sorted(heights, reverse=True)
feature['height_top1'] = sorted_heights[0] if len(sorted_heights) > 0 else 0
feature['height_top2'] = sorted_heights[1] if len(sorted_heights) > 1 else 0
feature['height_top3'] = sorted_heights[2] if len(sorted_heights) > 2 else 0
feature['height_top4'] = sorted_heights[3] if len(sorted_heights) > 3 else 0
            # Total peak height
feature['height_sum'] = sum(heights)
            # Peak-height ratio features
if len(heights) > 1:
                # Ratios between adjacent sorted peaks
ratios = []
for i in range(len(sorted_heights) - 1):
if sorted_heights[i + 1] > 0:
ratios.append(sorted_heights[i] / sorted_heights[i + 1])
if ratios:
feature['height_ratio_mean'] = np.mean(ratios)
feature['height_ratio_std'] = np.std(ratios) if len(ratios) > 1 else 0
feature['height_ratio_max'] = np.max(ratios)
feature['height_ratio_min'] = np.min(ratios)
                # Specific pairwise peak ratios
feature['top1_top2_ratio'] = sorted_heights[0] / sorted_heights[1] if len(sorted_heights) > 1 and \
sorted_heights[1] > 0 else 0
feature['top2_top3_ratio'] = sorted_heights[1] / sorted_heights[2] if len(sorted_heights) > 2 and \
sorted_heights[2] > 0 else 0
feature['top3_top4_ratio'] = sorted_heights[2] / sorted_heights[3] if len(sorted_heights) > 3 and \
sorted_heights[3] > 0 else 0
                # Highest peak relative to the mean peak height
feature['top_to_mean_ratio'] = sorted_heights[0] / np.mean(sorted_heights) if np.mean(
sorted_heights) > 0 else 0
        # Fragment-size features
if sizes:
feature['size_mean'] = np.mean(sizes)
feature['size_std'] = np.std(sizes) if len(sizes) > 1 else 0
feature['size_range'] = np.max(sizes) - np.min(sizes) if len(sizes) > 1 else 0
        # Marker indicator features for the main STR loci
for known_marker in ['D8S1179', 'D21S11', 'D7S820', 'CSF1PO', 'D3S1358',
'TH01', 'D13S317', 'D16S539', 'D2S1338', 'D19S433',
'vWA', 'TPOX', 'D18S51', 'AMEL', 'D5S818', 'FGA']:
feature[f'marker_is_{known_marker}'] = 1 if marker == known_marker else 0
features_list.append(feature)
    # Build the feature DataFrame
    if not features_list:
        print("Warning: no valid features were generated")
return None
features_df = pd.DataFrame(features_list)
    # Fill missing values
features_df = features_df.fillna(0)
    # Print feature statistics
    print(f"Feature extraction finished: {len(features_df)} rows, {len(features_df.columns)} columns")
    print("Row counts per contributor count:")
    print(features_df['contributor_count'].value_counts().sort_index())
return features_df
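# A tiny worked example of the peak-ratio features built above, using
# hypothetical peak heights rather than values from the data (defined but
# not called):
def _demo_peak_ratios():
    heights = [1200, 800, 300]  # already sorted descending
    print(heights[0] / heights[1])        # top1/top2 ratio -> 1.5
    print(heights[0] / np.mean(heights))  # top-to-mean ratio -> ~1.57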
def plot_roc_curve(y_test, y_scores, n_classes):
"""绘制ROC曲线"""
import re
save_dir = r"E:\数模\深圳杯\老哥D题\2025深圳杯&东三省数学建模D题思路解析\2025深圳杯D题【完整代码】\图0"
os.makedirs(save_dir, exist_ok=True)
plt.figure(figsize=(12, 8))
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_scores[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
        # Label by column index in the binarized labels; the original "i + 2"
        # assumed classes always start at 2 contributors, which is wrong for
        # the binary per-count models that also call this function
        plt.plot(fpr[i], tpr[i], lw=2, label=f'Class {i} ROC curve (AUC = {roc_auc[i]:.2f})')
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes
mean_auc = auc(all_fpr, mean_tpr)
    plt.plot(all_fpr, mean_tpr, 'k--', lw=2, label=f'Macro-average ROC curve (AUC = {mean_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k:', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
    plt.xlabel('False positive rate (FPR)', fontsize=12)
    plt.ylabel('True positive rate (TPR)', fontsize=12)
    title = 'Multi-class ROC curves'
    plt.title(title, fontsize=14)
plt.legend(loc="lower right", fontsize=10)
plt.grid(linestyle='--', alpha=0.7)
safe_title = re.sub(r'[\\/:*?"<>|\s]', '_', title)
plt.savefig(os.path.join(save_dir, f"{safe_title}.png"), bbox_inches='tight', dpi=300)
return plt
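# Caveat: for a two-class problem sklearn's LabelBinarizer returns a single
# column, so indexing y_test[:, 1] above would raise IndexError. The usual
# workaround (applied inline in evaluate_model below) is to widen the output
# to one complementary column per class — factored out here as a sketch:
def binarize_labels(y, class_labels):
    """Binarize labels, expanding the binary case to two columns."""
    lb = LabelBinarizer()
    lb.fit(class_labels)
    y_bin = lb.transform(y)
    if y_bin.shape[1] == 1:  # binary case: one column per class
        y_bin = np.hstack([1 - y_bin, y_bin])
    return y_bin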
def plot_confusion_matrix(cm, class_names):
"""绘制美观的混淆矩阵"""
import re
save_dir = r"E:\数模\深圳杯\老哥D题\2025深圳杯&东三省数学建模D题思路解析\2025深圳杯D题【完整代码】\图0"
os.makedirs(save_dir, exist_ok=True)
plt.figure(figsize=(10, 8))
accuracy = np.trace(cm) / float(np.sum(cm))
misclass = 1 - accuracy
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    title = f'confusion_matrix_accuracy_{accuracy:.4f}_error_{misclass:.4f}'
    plt.title(f'Confusion matrix\naccuracy: {accuracy:.4f}, error rate: {misclass:.4f}', fontsize=14)
    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks + 0.5, class_names)
plt.yticks(tick_marks + 0.5, class_names)
safe_title = re.sub(r'[\\/:*?"<>|\s]', '_', title)
plt.savefig(os.path.join(save_dir, f"{safe_title}.png"), bbox_inches='tight', dpi=300)
return plt
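# Row-normalising the matrix often reads better when classes are imbalanced,
# since the diagonal then shows per-class recall; a minimal sketch of that
# variant (not part of the original pipeline):
def plot_confusion_matrix_normalized(cm, class_names):
    """Plot a row-normalised confusion matrix (per-class recall on the diagonal)."""
    cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues', cbar=False,
                xticklabels=class_names, yticklabels=class_names)
    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)
    return plt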
def evaluate_model(model, X_test_scaled, y_test, model_name, class_labels=None):
"""评估模型性能并打印详细指标"""
# 预测
y_pred = model.predict(X_test_scaled)
    # Overall metrics
accuracy = accuracy_score(y_test, y_pred)
precision_weighted = precision_score(y_test, y_pred, average='weighted')
recall_weighted = recall_score(y_test, y_pred, average='weighted')
f1_weighted = f1_score(y_test, y_pred, average='weighted')
    # Print the metrics
    print(f"\n{model_name} evaluation metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted precision: {precision_weighted:.4f}")
    print(f"Weighted recall: {recall_weighted:.4f}")
    print(f"Weighted F1: {f1_weighted:.4f}")
    # Per-class metrics
    if class_labels is None:
        class_labels = sorted(set(y_test) | set(y_pred))
    print("\nPer-class metrics:")
    # Pass labels= explicitly so the arrays line up with class_labels even
    # when a class is absent from this test split
    precision_per_class = precision_score(y_test, y_pred, average=None, labels=class_labels)
    recall_per_class = recall_score(y_test, y_pred, average=None, labels=class_labels)
    f1_per_class = f1_score(y_test, y_pred, average=None, labels=class_labels)
    for i, label in enumerate(class_labels):
        print(f"Class {label}:")
        print(f"  precision: {precision_per_class[i]:.4f}")
        print(f"  recall: {recall_per_class[i]:.4f}")
        print(f"  F1: {f1_per_class[i]:.4f}")
    # Classification report: use generic class names, since the labels are
    # contributor counts for the main model but 0/1 for the binary models
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, labels=class_labels,
                                target_names=[str(n) for n in class_labels]))
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    cm_plt = plot_confusion_matrix(cm, [str(n) for n in class_labels])
    cm_plt.show()
    # Prepare data for the ROC curves
    try:
        y_score = model.predict_proba(X_test_scaled)
        # Binarize the labels
        lb = LabelBinarizer()
        lb.fit(class_labels)
        y_test_bin = lb.transform(y_test)
        if y_test_bin.shape[1] == 1:
            # Two-class case: LabelBinarizer yields one column, but the ROC
            # plot indexes one column per class, so expand to two
            y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
        # Plot the ROC curves
        n_classes = len(class_labels)
        roc_plt = plot_roc_curve(y_test_bin, y_score, n_classes)
        roc_plt.show()
    except (AttributeError, ValueError, IndexError) as e:
        print(f"Note: could not plot ROC curves - {e}")
return {
'accuracy': accuracy,
'precision': precision_weighted,
'recall': recall_weighted,
'f1': f1_weighted,
'confusion_matrix': cm
}
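# cross_val_score and StratifiedKFold are imported at the top but never used.
# A minimal sketch of how they could give a less split-dependent performance
# estimate than the single train/test split below (X, y assumed to be a
# feature matrix and label vector as built in train_models_by_count):
def cv_f1_estimate(model, X, y, n_splits=5):
    """Cross-validated weighted-F1 estimate for a candidate model."""
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring='f1_weighted')
    print(f"CV weighted F1: {scores.mean():.4f} +/- {scores.std():.4f}")
    return scores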
def train_models_by_count(features_df):
"""按贡献者人数拆分数据集,为每个人数单独训练最佳模型"""
print("开始按贡献者人数拆分并训练模型...")
# 获取所有贡献者人数类别
count_classes = sorted(features_df['contributor_count'].unique())
count_classes = [c for c in count_classes if pd.notna(c)]
print(f"检测到的贡献者人数类别: {count_classes}")
# 存储每个类别的最佳模型、标准化器和结果
count_models = {} # 格式: {人数: (最佳模型, 标准化器)}
count_results = {} # 格式: {人数: 评估结果}
feature_cols = [col for col in features_df.columns
if col not in ['sample_file', 'marker', 'contributor_count', 'sample_id']]
    # Train a dedicated model for each contributor count
for count in count_classes:
print(f"\n{'=' * 50}")
print(f"开始处理 {count}人混合样本的模型训练")
print(f"{'=' * 50}")
# 1. 筛选当前人数类别的数据(并包含少量其他类别作为负样本)
current_data = features_df[features_df['contributor_count'] == count].copy()
other_data = features_df[features_df['contributor_count'] != count].copy()
        # Cap the negatives at the number of positives to avoid class imbalance
        neg_sample_size = min(len(current_data), len(other_data))
        other_data_sampled = other_data.sample(n=neg_sample_size, random_state=42)
        # Merge and create a binary label (1 = this count, 0 = any other count)
combined_data = pd.concat([current_data, other_data_sampled], ignore_index=True)
combined_data['binary_label'] = combined_data['contributor_count'].apply(
lambda x: 1 if x == count else 0
)
print(f"训练数据分布: {count}人样本 {len(current_data)}个, 其他样本 {len(other_data_sampled)}个")
# 2. 准备训练数据
X = combined_data[feature_cols]
y = combined_data['binary_label']
        # Train/test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25, random_state=42, stratify=y
)
        # Standardise the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
        # 3. Candidate models (tested separately for each contributor count)
        models = {
            "Random Forest": RandomForestClassifier(
                n_estimators=200, class_weight='balanced', random_state=42, n_jobs=-1
            ),
            "Gradient Boosting": GradientBoostingClassifier(
                n_estimators=100, learning_rate=0.1, random_state=42
            ),
            "MLP": MLPClassifier(
                hidden_layer_sizes=(100, 50), max_iter=300, early_stopping=True, random_state=42
            ),
            "SVM": SVC(
                kernel='rbf', probability=True, class_weight='balanced', random_state=42
            )
        }
        # 4. Train and pick the best model for this count
        best_model = None
        best_score = 0
        best_metrics = None
        for name, model in models.items():
            print(f"\nTraining {name} model...")
model.fit(X_train_scaled, y_train)
            # Evaluate (binary classification)
            metrics = evaluate_model(
                model, X_test_scaled, y_test,
                model_name=f"{count}-contributor samples - {name}",
                class_labels=[0, 1]  # 0 = other counts, 1 = this count
            )
            # Track the best model by F1 score
if metrics['f1'] > best_score:
best_score = metrics['f1']
best_model = model
best_metrics = metrics
        # 5. Store this count's best model and scaler
        count_models[count] = (best_model, scaler)
        count_results[count] = best_metrics
        print(f"\nBest model for {count}-contributor samples: {best_model.__class__.__name__}, F1 = {best_score:.4f}")
# 6. 训练一个"人数识别主模型"(用于先判断样本属于哪个人数类别)
print(f"\n{'=' * 50}")
print("训练人数识别主模型(用于初步分类)")
print(f"{'=' * 50}")
# 准备主模型数据
X_main = features_df[feature_cols]
y_main = features_df['contributor_count']
X_train_main, X_test_main, y_train_main, y_test_main = train_test_split(
X_main, y_main, test_size=0.25, random_state=42, stratify=y_main
)
scaler_main = StandardScaler()
X_train_main_scaled = scaler_main.fit_transform(X_train_main)
X_test_main_scaled = scaler_main.transform(X_test_main)
    # Main model (multi-class, predicts the contributor count)
main_model = RandomForestClassifier(
n_estimators=200, class_weight='balanced', random_state=42, n_jobs=-1
)
main_model.fit(X_train_main_scaled, y_train_main)
    # Evaluate the main model
    main_metrics = evaluate_model(
        main_model, X_test_main_scaled, y_test_main,
        model_name="Main count-recognition model",
        class_labels=count_classes
    )
    print(f"\nMain model accuracy: {main_metrics['accuracy']:.4f}")
return count_models, count_results, main_model, scaler_main, count_classes, feature_cols
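# The trained models live only in memory and are retrained on every run;
# persisting them avoids that. A minimal sketch using joblib (installed as a
# scikit-learn dependency; the file names here are illustrative):
def save_models(count_models, main_model, scaler_main):
    """Persist the main model, its scaler, and every per-count model."""
    import joblib
    joblib.dump((main_model, scaler_main), 'main_model.joblib')
    for count, (model, scaler) in count_models.items():
        joblib.dump((model, scaler), f'count_{count}_model.joblib')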
def aggregate_predictions(features_df, count_models, main_model, scaler_main, count_classes, feature_cols):
"""使用两级模型进行预测:先主模型判断人数,再专用模型验证"""
print("\n开始两级模型预测...")
# 1. 准备数据
X = features_df[feature_cols]
X_main_scaled = scaler_main.transform(X)
# 2. 第一步:主模型预测人数
features_df['main_prediction'] = main_model.predict(X_main_scaled)
# 3. 第二步:使用对应人数的专用模型验证
features_df['final_prediction'] = None
for idx, row in features_df.iterrows():
# 获取主模型预测的人数
pred_count = row['main_prediction']
# 如果预测人数不在已知类别中,默认使用主模型结果
if pred_count not in count_models:
features_df.at[idx, 'final_prediction'] = pred_count
continue
        # Dedicated model and scaler for the proposed count
        model, scaler = count_models[pred_count]
        # Standardise this row's features
        sample_features = row[feature_cols].values.reshape(1, -1)
        sample_scaled = scaler.transform(sample_features)
        # Per-count model prediction (1 = consistent with that count, 0 = not)
        bin_pred = model.predict(sample_scaled)[0]
        # Decide the final prediction
        if bin_pred == 1:
            # Verification passed: keep the main model's count
            features_df.at[idx, 'final_prediction'] = pred_count
        else:
            # Verification failed: fall back to the next most probable count
proba = main_model.predict_proba(X_main_scaled[idx].reshape(1, -1))[0]
proba_df = pd.DataFrame({
'count': count_classes,
'prob': proba
}).sort_values('prob', ascending=False)
            # Pick the most probable count other than the rejected one
for _, p_row in proba_df.iterrows():
if p_row['count'] != pred_count:
features_df.at[idx, 'final_prediction'] = p_row['count']
break
    # 4. Aggregate the per-marker predictions to one prediction per sample
    agg_results = features_df.groupby('sample_file').agg({
        'contributor_count': 'first',  # true count
        'final_prediction': lambda x: x.value_counts().index[0]  # most frequent prediction
}).reset_index()
    # 5. Evaluate the sample-level results
    agg_results['correct'] = agg_results['contributor_count'] == agg_results['final_prediction']
    sample_accuracy = agg_results['correct'].mean()
    print(f"\nTwo-stage model sample-level accuracy: {sample_accuracy:.4f}")
    # Final confusion matrix (labels passed explicitly to fix the class order)
    cm = confusion_matrix(agg_results['contributor_count'], agg_results['final_prediction'],
                          labels=count_classes)
    cm_plt = plot_confusion_matrix(cm, [str(n) for n in count_classes])
    cm_plt.show()
    # Classification report
    print("\nFinal sample-level classification report:")
    print(classification_report(
        agg_results['contributor_count'],
        agg_results['final_prediction'],
        labels=count_classes,
        target_names=[f"{n} contributors" for n in count_classes]
    ))
    # Save the results
    agg_results.to_csv('two_stage_sample_predictions.csv', index=False, encoding='utf-8-sig')
    print("\nPredictions saved to 'two_stage_sample_predictions.csv'")
return agg_results
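# Note on the aggregation above: value_counts() sorts by frequency, so
# .index[0] is a majority vote over a sample's per-marker predictions, with
# ties broken arbitrarily. A tiny illustration with hypothetical predictions
# (defined but not called):
def _demo_majority_vote():
    preds = pd.Series([2, 3, 2, 2, 4])  # hypothetical per-marker predictions
    print(preds.value_counts().index[0])  # -> 2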
def main():
"""主函数(使用按人数拆分的模型)"""
print("=" * 80)
print("法医物证多人身份鉴定 - 按人数专用模型识别分析".center(60))
print("=" * 80)
# 步骤1:加载数据
file_path = "附件1:不同人数的STR图谱数据.xlsx"
if not os.path.exists(file_path):
print(f"错误: 文件 '{file_path}' 不存在")
return
print("\n第1步: 数据加载")
print("-" * 50)
df = load_data(file_path)
    # Step 2: extract contributor information
    print("\nStep 2: extracting contributor information")
print("-" * 50)
df = extract_contributor_info(df)
    # Show the contributor-count distribution
    print("\nContributor-count distribution:")
    contributor_counts = df['contributor_count'].value_counts().sort_index()
    for count, freq in contributor_counts.items():
        if pd.notna(count):
            print(f"  {int(count)}-contributor mixtures: {freq} rows")
    # Step 3: visualisation
    print("\nStep 3: visualisation")
    print("-" * 50)
create_visualizations(df)
    # Step 4: feature extraction
    print("\nStep 4: feature extraction")
    print("-" * 50)
features_df = extract_features(df)
    if features_df is None:
        print("Error: could not extract valid features")
return
    # Step 5: train the per-count dedicated models
    print("\nStep 5: training per-count dedicated models")
    print("-" * 50)
count_models, count_results, main_model, scaler_main, count_classes, feature_cols = train_models_by_count(
features_df)
    # Step 6: two-stage sample-level prediction
    print("\nStep 6: two-stage sample-level prediction")
    print("-" * 50)
agg_results = aggregate_predictions(features_df, count_models, main_model, scaler_main, count_classes,
feature_cols)
    # Step 7: model comparison
    print("\nStep 7: model comparison")
    print("-" * 50)
    # A comparison between the old and new models could be added here
    print("\nDone!")
print("=" * 80)
if __name__ == "__main__":
main()这个怎么什么都没生成出来
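Two likely culprits for the silent run: if the xlsx file is not in the working
directory, main() prints one error line and returns immediately; and an
uncaught exception anywhere (plotting included) ends the run before anything
is saved. A minimal debugging sketch, standard library only, that could
temporarily replace the guard above to show where execution stops:

import traceback

if __name__ == "__main__":
    try:
        print("working directory:", os.getcwd())  # is the xlsx actually here?
        main()
    except Exception:
        traceback.print_exc()  # surface errors that would otherwise end the run silently
    input("Press Enter to exit...")  # keep a double-clicked console window open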