% 主程序:孕周数据解析、BMI分组和风险评估
% 1. 导入并清洗数据
data = readtable("D:\附件-男胎检测数据2.xlsx", 'VariableNamingRule', 'preserve');
% 保留关键列
coreFields = {'孕妇代码', '孕妇BMI', '检测孕周', 'Y染色体浓度', '检测抽血次数'};
data = data(:, coreFields);
disp(['原始数据行数:', num2str(height(data))]);
% 清洗缺失值
cleanFields = {'孕妇BMI', '检测孕周', 'Y染色体浓度', '检测抽血次数'};
data = rmmissing(data, 'DataVariables', cleanFields);
disp(['清洗后行数:', num2str(height(data))]);
% 重命名列为英文
data.Properties.VariableNames = {'pID', 'BMI', 'GA_str', 'Y_concentration', 'test_count'};
% 修复错误:确保正确处理字符串类型
if iscellstr(data.GA_str)
data.GA_str = string(data.GA_str);
end
% 2. 解析孕周数据
gaValues = nan(height(data), 1);
for i = 1:height(data)
% 修复错误:使用圆括号()索引代替花括号{}
input = data.GA_str(i);
% 处理数值输入
if isnumeric(input)
ga = double(input);
if ~isnan(ga) && ga >= 5 && ga <= 40
gaValues(i) = ga;
end
continue;
end
% 确保输入为字符串
if ~isstring(input)
input = string(input);
end
str = input(1); % 提取第一个元素
str = lower(strtrim(str));
% 去除非法字符
str = regexprep(str, '[^\w\d\.\+\-]', '', 'once');
% 包含"周"和"天"的格式
if contains(str, '周') && (contains(str, '天') || contains(str, '+'))
weekPart = extractBefore(str, '周');
if contains(str, '+')
dayPart = extractAfter(str, '+');
else
dayPart = extractAfter(str, '周');
end
% 提取数字部分
weekPart = regexprep(weekPart, '[^0-9]', '', 'once');
dayPart = regexprep(dayPart, '[^0-9]', '', 'once');
week = str2double(weekPart);
day = str2double(dayPart);
if ~isnan(week) && ~isnan(day)
gaValues(i) = week + day / 7;
end
% 只包含"周"的格式
elseif contains(str, '周')
weekPart = regexprep(extractBefore(str, '周'), '[^0-9.]', '', 'once');
gaValues(i) = str2double(weekPart);
% 只包含"天"的格式
elseif contains(str, '天')
dayPart = regexprep(str, '[^0-9]', '', 'once');
days = str2double(dayPart);
gaValues(i) = days / 7;
% 纯数字格式
else
gaValues(i) = str2double(str);
end
end
% 添加孕周列并验证范围
data.GA = gaValues;
validGA = (data.GA >= 10 & data.GA <= 25) & ~isnan(data.GA);
data = data(validGA, :);
disp(['有效孕周数据:', num2str(height(data))]);
% 3. 按孕妇分组数据
[uniqueIDs, ~, grpID] = unique(data.pID);
groupedData = cell(numel(uniqueIDs), 1);
for i = 1:numel(uniqueIDs)
idx = (grpID == i);
groupedData{i} = sortrows(data(idx, :), 'test_count');
end
% 4. 计算最早达标时间
threshold = 0.04; % Y染色体浓度阈值
earlyT = nan(numel(uniqueIDs), 1);
bmiVals = nan(numel(uniqueIDs), 1);
for i = 1:numel(uniqueIDs)
patientData = groupedData{i};
bmiVals(i) = patientData.BMI(1); % 存储BMI值
aboveIdx = find(patientData.Y_concentration >= threshold, 1);
if ~isempty(aboveIdx)
earlyT(i) = patientData.GA(aboveIdx);
else
earlyT(i) = Inf; % 未达标
end
end
% 5. 提取有效数据并排序
validIdx = ~isnan(bmiVals) & ~isinf(earlyT);
sortedBMI = bmiVals(validIdx);
sortedT = earlyT(validIdx);
[sortedBMI, sortIdx] = sort(sortedBMI);
sortedT = sortedT(sortIdx);
n = numel(sortedBMI);
% 6. 肘部法则确定分组数
maxM = min(5, n);
sse = zeros(maxM, 1);
for m = 1:maxM
% 等间距分组
edges = linspace(min(sortedBMI), max(sortedBMI), m+1);
groups = discretize(sortedBMI, edges);
for j = 1:m
groupIdx = (groups == j);
if any(groupIdx)
groupMean = mean(sortedT(groupIdx), 'omitnan');
sse(m) = sse(m) + sum((sortedT(groupIdx) - groupMean).^2, 'omitnan');
end
end
end
% 计算肘点
if maxM > 1
diffs = diff(sse);
relDiffs = diffs(1:end-1) ./ diffs(2:end);
[~, bestM] = max(relDiffs);
bestM = bestM + 1; % 补偿索引
bestM = min(bestM, maxM);
else
bestM = 1;
end
% 7. 最终分组
edges = linspace(min(sortedBMI), max(sortedBMI), bestM + 1);
groups = discretize(sortedBMI, edges);
% 8. 计算每组最佳时间和风险
bestGA = zeros(bestM, 1);
groupRisk = zeros(bestM, 1);
candidateGA = linspace(10, 25, 101); % 10-25周之间评估
for i = 1:bestM
groupIdx = (groups == i);
groupT = sortedT(groupIdx);
validT = groupT(~isinf(groupT));
if isempty(validT)
bestGA(i) = NaN;
groupRisk(i) = 0;
continue;
end
% 计算每个候选时间的风险
risks = arrayfun(@(ga) sum(validT > ga), candidateGA);
[minRisk, idx] = min(risks);
bestGA(i) = candidateGA(idx);
groupRisk(i) = minRisk;
end
totalRisk = sum(groupRisk);
% 9. 输出结果
fprintf('\n===== BMI分组区间 =====\n');
for i = 1:bestM
if i < bestM
fprintf('组%d: [%.2f, %.2f)\n', i, edges(i), edges(i+1));
else
fprintf('组%d: [%.2f, %.2f]\n', i, edges(i), edges(i+1));
end
end
fprintf('\n===== 最佳NIPT时点 =====\n');
for i = 1:bestM
fprintf('组%d: %.2f周\n', i, bestGA(i));
end
fprintf('\n===== 潜在风险值 =====\n');
for i = 1:bestM
fprintf('组%d: %.0f\n', i, groupRisk(i));
end
fprintf('总风险值: %.0f\n', totalRisk);
原始数据行数:1082
清洗后行数:1082
有效孕周数据:1069
===== BMI分组区间 =====
组1: [20.70, 32.84)
组2: [32.84, 44.98]
===== 最佳NIPT时点 =====
组1: 23.65周
组2: 24.10周
===== 潜在风险值 =====
组1: 0
组2: 0
总风险值: 0
>> 这代码运行出来的结果有点扯,给我修改一下代码