% 导入数据
data = readtable("D:\附件-男胎检测数据2.xlsx", 'VariableNamingRule', 'preserve');
% 保留关键列
coreFields = {'孕妇代码', '孕妇BMI', '检测孕周', 'Y染色体浓度', '检测抽血次数'};
data = data(:, coreFields);
% 清洗缺失值
data = rmmissing(data, 'DataVariables', coreFields);
% 确保“检测孕周”是字符串
data.('检测孕周') = string(data.('检测孕周'));
% 转换检测孕周为连续型GA
gaCell = arrayfun(@parseGA, data.('检测孕周'), 'UniformOutput', false);
data.GA = cell2mat(gaCell);
% 过滤GA不在[10,25]的样本
data = data(data.GA >= 10 & data.GA <= 25, :);
% 重命名列为英文
data.Properties.VariableNames = {'pID', 'BMI', 'GA_str', 'Y_concentration', 'test_count', 'GA'};
% 按孕妇代码分组并按检测抽血次数排序
grp = findgroups(data.pID);
groupedData = splitapply(@(x) {sortrows(x, 'test_count')}, data, grp);
%% 2. 计算每个孕妇的最早达标时间T_early,p
[pIDs, earlyT] = computeEarlyT(groupedData);
%% 3. 有序聚类分组(按BMI)
% 提取每个孕妇的BMI(组内一致,取第一个)
bmi = zeros(length(groupedData), 1);
for i = 1:length(groupedData)
bmi(i) = groupedData{i}.BMI(1);
end
% 按BMI升序排序
[sortedBMI, sortIdx] = sort(bmi);
sortedT = earlyT(sortIdx);
% 肘部法则确定最优分组数(最大尝试5组)
maxM = 5;
bestM = elbowMethod(sortedBMI, sortedT, maxM);
% 执行有序聚类
[groups, bmiBounds] = optimalPartition(sortedBMI, sortedT, bestM);
%% 4. 计算每组最佳NIPT时点及风险
nGroups = bestM;
bestGA = zeros(nGroups, 1);
groupRisk = zeros(nGroups, 1);
gaRange = [10, 25];
for i = 1:nGroups
idx = find(groups == i);
groupT = sortedT(idx);
minT = min(groupT); % 组内最早达标时间
[bestGA(i), groupRisk(i)] = computeBestGA(groupT, minT, gaRange);
end
totalRisk = sum(groupRisk); % 总风险
%% 5. 检测误差影响分析(模拟σ=0.5%,1%,2%)
sigmas = [0.5, 1, 2];
errResults = cell(length(sigmas), 4);
for s = 1:length(sigmas)
sigma = sigmas(s);
[errT, errG, errGA, errR] = analyzeError(data, groupedData, sortedBMI, sortedT, bestM, sigma);
errResults{s,1} = errT;
errResults{s,2} = errG;
errResults{s,3} = errGA;
errResults{s,4} = errR;
end
%% 输出结果
disp('===== BMI分组区间 =====');
for i = 1:nGroups
if i == nGroups
fprintf('组%d: [%.2f, %.2f]\n', i, bmiBounds(i), bmiBounds(i+1));
else
fprintf('组%d: [%.2f, %.2f)\n', i, bmiBounds(i), bmiBounds(i+1));
end
end
disp(' ');
disp('===== 每组最佳NIPT时点(周) =====');
for i = 1:nGroups
fprintf('组%d: %.4极f\n', i, bestGA(i));
end
disp(' ');
disp('===== 每组潜在风险值 =====');
for i = 1:nGroups
fprintf('组%d: %.0f\n', i, groupRisk(i));
end
fprintf('总风险值: %.0f\n', totalRisk);
disp(' ');
disp('===== 检测误差影响分析 =====');
for s = 1:length(sigmas)
sigma = sigmas(s);
errT = errResults{s,1};
errGA = errResults{s,3};
errR = errResults{s,4};
fprintf('\nσ=%.1f%%时:\n', sigma);
fprintf('最早达标时间均值变化: %.4f → %.4f\n', ...
mean(earlyT(~isinf(earlyT))), mean(errT(~isinf(errT))));
fprintf('最佳时点均值变化: %.4f → %.4f\n', ...
mean(bestGA(~isnan(bestGA))), mean(errGA(~isnan(errGA))));
fprintf('总风险变化: %.0f → %.0f\n', totalRisk, errR);
end
data_clean = data_clean(valid_idx, :);
错误使用 splitapply (第 61 行)
组编号必须为正整数向量,并且不能为稀疏向量。
>> 根据报错提示,修改代码