%% 女胎异常检测数学模型(优化版)
% 作者:AI作业模型机器人
% 功能:基于NIPT数据构建女胎异常判定模型,含误差与灵敏度分析
% 修改说明:
% 1. 修复变量命名问题(使用列名索引代替点操作)
% 2. 增加混淆矩阵可视化
% 3. 优化ROC曲线分析
% 4. 添加三维灵敏度分析
% 5. 修复“运算符的使用无效”错误,确保读取所有有效样本
clc;
clear;
close all;
%% 1. 数据读取与字段提取(修复变量命名问题)
try
filename = '附件_2.xlsx';
sheetname = '女胎检测数据';
% 使用preserve保留原始列名
data = readtable(filename, 'Sheet', sheetname, 'VariableNamingRule', 'preserve');
varNames = data.Properties.VariableNames;
% 显示列名用于调试
fprintf('检测到列名:\n');
disp(varNames);
catch ME
error('文件读取错误: %s\n请确认文件名和工作表名是否正确!', ME.message);
end
% 列名映射(根据实际数据中的准确列名)
colMap = struct();
colMap.Z13 = '13号染色体的Z值';
colMap.Z18 = '18号染色体的Z值';
colMap.Z21 = '21号染色体的Z值';
colMap.ZX = 'X染色体的Z值';
colMap.GC13 = '13号染色体的GC含量';
colMap.GC18 = '18号染色体的GC含量';
colMap.GC21 = '21号染色体的GC含量';
colMap.totalGC = 'GC含量';
colMap.L = '原始读段数';
colMap.M = '在参考基因组上比对的比例';
colMap.O = '唯一比对的读段数';
colMap.N = '重复读段的比例';
colMap.AA = '被过滤掉读段数的比例';
colMap.BMI = '孕妇BMI';
colMap.AB = '染色体的非整倍体';
colMap.fetalHealth = '胎儿是否健康';
% 安全提取函数(处理缺失列)
safeExtract = @(colName) ...
(any(strcmp(varNames, colName)) ? data.(colName) : NaN(size(data,1),1));
% 提取关键变量
Z13 = safeExtract(colMap.Z13);
Z18 = safeExtract(colMap.Z18);
Z21 = safeExtract(colMap.Z21);
ZX = safeExtract(colMap.ZX);
GC_13 = safeExtract(colMap.GC13);
GC_18 = safeExtract(colMap.GC18);
GC_21 = safeExtract(colMap.GC21);
total_GC = safeExtract(colMap.totalGC);
L = safeExtract(colMap.L);
M = safeExtract(colMap.M);
O = safeExtract(colMap.O);
N = safeExtract(colMap.N);
AA = safeExtract(colMap.AA);
BMI = safeExtract(colMap.BMI);
AB = safeExtract(colMap.AB);
fetalHealthStr = safeExtract(colMap.fetalHealth);
% 构建完整样本索引(排除完全缺失Z值的行)
valid_idx = ~isnan(Z21) | ~isnan(Z18) | ~isnan(Z13) | ~isnan(ZX);
fprintf('原始样本数:%d\n', sum(valid_idx));
% 更新所有变量为有效样本子集
idx = valid_idx;
Z13 = Z13(idx); Z18 = Z18(idx); Z21 = Z21(idx); ZX = ZX(idx);
GC_13 = GC_13(idx); GC_18 = GC_18(idx); GC_21 = GC_21(idx);
total_GC = total_GC(idx); L = L(idx); M = M(idx); O = O(idx);
N = N(idx); AA = AA(idx); BMI = BMI(idx); AB = AB(idx);
% 处理胎儿健康状态(字符串比较)
if ~isempty(fetalHealthStr) && iscell(fetalHealthStr)
AE = strcmp(fetalHealthStr(idx), '否'); % '否'表示异常
elseif ~isempty(fetalHealthStr) && isstring(fetalHealthStr)
AE = (fetalHealthStr(idx) == "否");
else
warning('未找到胎儿健康状态,使用染色体非整倍体作为标签');
AE = ~ismissing(AB) & ~cellfun('isempty', AB); % 非空即异常
end
fprintf('共加载 %d 个有效女胎样本。\n', length(AE));
%% 2. 测序质量过滤
quality_filter = (L > 1e6) & (M > 0.75) & (O./L > 0.65) & (AA < 0.3) & (N < 0.35);
quality_filter = quality_filter & ~isnan(L) & ~isnan(M) & ~isnan(O) & ~isnan(AA) & ~isnan(N);
fprintf('通过质量过滤的样本数:%d/%d\n', sum(quality_filter), length(AE));
%% 3. GC含量校正Z值(优化处理)
adjustZ = @(z, gc) z - 0.8 .* (gc - median(gc(quality_filter)));
Z21_adj = adjustZ(Z21, GC_21);
Z18_adj = adjustZ(Z18, GC_18);
Z13_adj = adjustZ(Z13, GC_13);
ZX_adj = adjustZ(ZX, total_GC); % 使用总GC校正X染色体
% 应用质量过滤
Z21_adj(~quality_filter) = NaN;
Z18_adj(~quality_filter) = NaN;
Z13_adj(~quality_filter) = NaN;
ZX_adj(~quality_filter) = NaN;
%% 4. 异常判定模型(阈值法)
threshold = 3;
pred = (abs(Z21_adj) > threshold) | ...
(abs(Z18_adj) > threshold) | ...
(abs(Z13_adj) > threshold) | ...
(abs(ZX_adj) > threshold);
pred(isnan(pred)) = false; % NaN视为正常
%% 5. 误差分析与可视化
% 计算性能指标
TP = sum(pred & AE);
FP = sum(pred & ~AE);
TN = sum(~pred & ~AE);
FN = sum(~pred & AE);
confusion = [TP, FP; FN, TN];
accuracy = (TP + TN) / sum(confusion(:));
sensitivity = TP / (TP + FN + eps);
specificity = TN / (TN + FP + eps);
precision = TP / (TP + FP + eps);
f1_score = 2 * (precision * sensitivity) / (precision + sensitivity + eps);
fprintf('\n=== 模型性能指标 ===\n');
fprintf('准确率: %.2f%%\n', accuracy * 100);
fprintf('灵敏度: %.2f%%\n', sensitivity * 100);
fprintf('特异度: %.2f%%\n', specificity * 100);
fprintf('精确率: %.2f%%\n', precision * 100);
fprintf('F1得分: %.2f\n', f1_score);
% 混淆矩阵可视化
figure('Name', '混淆矩阵', 'Position', [100,100,500,400]);
confusionchart(confusion, {'异常', '正常'}, ...
'Title', '模型分类结果', ...
'RowSummary', 'row-normalized', ...
'ColumnSummary', 'column-normalized');
set(gca, 'FontSize', 12);
% Z值分布可视化
figure('Name', 'Z值分布对比', 'Position', [100,100,1000,600]);
subplot(2,2,1);
boxplot([Z21_adj(AE), Z21_adj(~AE)], 'Labels', {'异常', '正常'});
title('21号染色体Z值分布');
ylabel('校正后Z值');
subplot(2,2,2);
histogram(Z21_adj(AE), 'BinWidth', 0.5, 'FaceAlpha', 0.7);
hold on;
histogram(Z21_adj(~AE), 'BinWidth', 0.5, 'FaceAlpha', 0.7);
legend('异常', '正常');
title('21号染色体Z值直方图');
subplot(2,2,3);
scatter(GC_21, Z21, 30, AE, 'filled');
colormap(jet(2)); colorbar('Ticks',[0,1],'TickLabels',{'正常','异常'});
xlabel('GC含量'); ylabel('原始Z值');
title('GC含量 vs Z值');
subplot(2,2,4);
scatter(GC_21, Z21_adj, 30, AE, 'filled');
colormap(jet(2)); colorbar('Ticks',[0,1],'TickLabels',{'正常','异常'});
xlabel('GC含量'); ylabel('校正后Z值');
title('校正后关系');
%% 6. 灵敏度分析 - 阈值变化
th_range = 2.0:0.1:4.0;
metrics = struct('acc', [], 'tpr', [], 'fpr', [], 'f1', []);
for i = 1:length(th_range)
t = th_range(i);
p = (abs(Z21_adj) > t) | (abs(Z18_adj) > t) | ...
(abs(Z13_adj) > t) | (abs(ZX_adj) > t);
p(isnan(p)) = false;
TP_i = sum(p & AE);
FP_i = sum(p & ~AE);
TN_i = sum(~p & ~AE);
FN_i = sum(~p & AE);
metrics.acc(i) = (TP_i + TN_i) / length(AE);
metrics.tpr(i) = TP_i / (TP_i + FN_i + eps);
metrics.fpr(i) = FP_i / (FP_i + TN_i + eps);
prec_i = TP_i / (TP_i + FP_i + eps);
metrics.f1(i) = 2 * (prec_i * metrics.tpr(i)) / (prec_i + metrics.tpr(i) + eps);
end
% 阈值变化影响图
figure('Name', '阈值灵敏度分析', 'Position', [100,100,1000,600]);
subplot(2,2,1);
plot(th_range, metrics.acc, 'b-o', 'LineWidth', 1.5);
xlabel('判定阈值 |Z| > T');
ylabel('准确率');
title('准确率 vs 阈值');
grid on;
subplot(2,2,2);
plot(th_range, metrics.tpr, 'r-s', 'LineWidth', 1.5);
hold on;
plot(th_range, metrics.fpr, 'g--d', 'LineWidth', 1.5);
xlabel('判定阈值 |Z| > T');
ylabel('比率');
title('灵敏度和假阳性率');
legend('灵敏度 (TPR)', '假阳性率 (FPR)', 'Location', 'best');
grid on;
subplot(2,2,3);
plot(metrics.fpr, metrics.tpr, 'm-*', 'LineWidth', 2);
xlabel('假阳性率 (FPR)');
ylabel('真阳性率 (TPR)');
title('ROC曲线');
grid on;
axis equal;
auc = trapz(metrics.fpr(end:-1:1), metrics.tpr(end:-1:1));
text(0.6, 0.3, sprintf('AUC = %.3f', auc), 'FontSize', 12);
subplot(2,2,4);
plot(th_range, metrics.f1, 'c-p', 'LineWidth', 1.5);
xlabel('判定阈值 |Z| > T');
ylabel('F1得分');
title('F1得分 vs 阈值');
grid on;
%% 7. 三维灵敏度分析(阈值+GC系数)
[TH, ALPHA] = meshgrid(2.5:0.1:3.5, 0.5:0.1:1.2);
ACC = zeros(size(TH));
for i = 1:size(TH,1)
for j = 1:size(TH,2)
% 动态GC校正
adjZ21 = Z21 - ALPHA(i,j) * (GC_21 - median(GC_21(quality_filter)));
adjZ21(~quality_filter) = NaN;
% 应用阈值
p = abs(adjZ21) > TH(i,j);
p(isnan(p)) = false;
% 计算准确率
TP = sum(p & AE);
TN = sum(~p & ~AE);
ACC(i,j) = (TP + TN) / length(AE);
end
end
% 三维可视化
figure('Name', '三维灵敏度分析', 'Position', [100,100,800,600]);
surf(TH, ALPHA, ACC, 'EdgeColor', 'none');
xlabel('阈值 T');
ylabel('校正系数 \alpha');
zlabel('准确率');
title('阈值与GC校正系数联合优化');
colormap(jet);
colorbar;
view(-45, 30);
% 优化点标记
[max_acc, idx] = max(ACC(:));
[opt_i, opt_j] = ind2sub(size(ACC), idx);
hold on;
plot3(TH(opt_i,opt_j), ALPHA(opt_i,opt_j), max_acc, ...
'ro', 'MarkerSize', 12, 'LineWidth', 2);
text(TH(opt_i,opt_j), ALPHA(opt_i,opt_j), max_acc+0.02, ...
sprintf('最优点: T=%.1f, α=%.1f\n准确率=%.3f', ...
TH(opt_i,opt_j), ALPHA(opt_i,opt_j), max_acc), ...
'FontSize', 10, 'BackgroundColor', 'w');
%% 8. 输出典型异常案例
fprintf('\n=== 典型异常案例 ===\n');
abnormal_cases = find(AE == 1);
if ~isempty(abnormal_cases)
fprintf('%-8s %-8s %-8s %-8s %-8s\n', '样本ID', 'Z21', 'Z18', 'Z13', 'ZX');
for k = 1:min(5, length(abnormal_cases))
idx = abnormal_cases(k);
fprintf('%-8d %-8.2f %-8.2f %-8.2f %-8.2f\n', ...
idx, Z21(idx), Z18(idx), Z13(idx), ZX(idx));
end
else
fprintf('未发现真实异常样本\n');
end
fprintf('\n✅ 分析完成!最优阈值=%.1f,最优GC系数=%.1f\n', ...
TH(opt_i,opt_j), ALPHA(opt_i,opt_j));
文件: untitled2.m 行: 52 列: 34
运算符的使用无效。
修改错误代码修缮完整代码并进行优化提升