代码
import pandas as pd
import numpy as np
from scipy.optimize import minimize_scalar
import matplotlib.pyplot as plt
from pygam import LinearGAM, s
from sklearn.exceptions import ConvergenceWarning
import warnings
# 忽略特定警告
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# 读取数据(仅保留必要列)
df = pd.read_excel('男胎检测数据_预处理.xlsx',
usecols=[
'孕妇BMI', '检测孕周', 'Y染色体浓度',
'检测抽血次数', 'GC含量', '胎儿是否健康'
])
# 数据预处理
df = df.dropna(subset=['Y染色体浓度', '检测孕周', '孕妇BMI'])
df['检测孕周'] = df['检测孕周'].astype(float) # 确保孕周为浮点数
df['BMI_group'] = pd.cut(df['孕妇BMI'],
bins=[20, 28, 32, 36, 40],
labels=['正常低值', '正常高值', '超重', '肥胖'])
# 定义风险计算函数(修复abs()错误)
def calculate_risk(week, group_data, weight_accuracy=0.6, weight_window=0.4):
"""
计算综合风险:
- 准确性风险:未达标浓度比例
- 治疗窗口风险:孕周偏离理想窗口的程度
"""
# 准确性风险:浓度<4%的比例
accuracy_risk = group_data['Y染色体浓度'].apply(lambda x: 1 if x < 4 else 0).mean()
# 治疗窗口风险:孕周与理想窗口的偏离度(假设理想窗口为13-20周)
ideal_window = (12, 22)
window_risk = (np.clip(week, ideal_window[0], ideal_window[1]) - week).abs().mean() # 修复此处
return (accuracy_risk * weight_accuracy) + (window_risk * weight_window)
# 动态规划优化函数(增加收敛性检查)
def optimize_nipt_timing(group_data):
weeks = np.arange(10, 28, 0.5) # 探索孕周范围10-28周,步长0.5周
# 初始化动态规划表
dp = {}
for week in weeks:
try:
risk = calculate_risk(week, group_data)
dp[week] = risk
except Exception as e:
dp[week] = np.inf # 无法计算时设为无穷大
prev = {week: None for week in weeks}
# 逆序动态规划
for week in reversed(weeks):
if week not in dp:
continue
candidates = []
for step in [0.5, 1.0]:
next_week = week + step
if next_week in dp:
candidates.append(dp[next_week])
if not candidates:
continue
min_prev_risk = min(candidates)
dp[week] += min_prev_risk
prev[week] = weeks[np.argmin(candidates)]
# 寻找全局最优解(增加有效性检查)
valid_weeks = [wk for wk in dp if not np.isinf(dp[wk])]
if not valid_weeks:
return np.nan, np.nan # 返回无效值防止后续错误
optimal_week = min(valid_weeks, key=lambda k: dp[k])
return optimal_week, dp[optimal_week]
# 分组优化(增加异常处理)
results = {}
for group_name, group_data in df.groupby('BMI_group'):
try:
# 使用GAM模型预测达标时间
X = group_data[['检测孕周', '检测抽血次数', 'GC含量']]
y = (group_data['Y染色体浓度'] >= 4).astype(int)
# 训练GAM模型(增加收敛性参数)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
gam = LinearGAM(s(0) + s(1) + s(2), max_iter=500, tol=1e-4).fit(X, y)
# 预测各孕周达标概率
weeks_to_predict = np.arange(10, 28, 0.1)
pred_probs = gam.predict(np.column_stack([
weeks_to_predict,
np.full_like(weeks_to_predict, group_data['检测抽血次数'].mean()),
np.full_like(weeks_to_predict, group_data['GC含量'].mean())
]))
# 确定达标时间(首次概率≥0.9的孕周)
try:
threshold_idx = np.where(pred_probs >= 0.9)[0][0]
threshold_week = weeks_to_predict[threshold_idx]
except IndexError:
threshold_week = np.nan
# 如果无法通过GAM确定,则使用动态规划
if np.isnan(threshold_week):
threshold_week, _ = optimize_nipt_timing(group_data)
# 计算最佳NIPT时点
best_week, min_risk = optimize_nipt_timing(group_data)
results[group_name] = {
'BMI分组': group_name,
'GAM达标周': threshold_week,
'推荐NIPT周': best_week,
'综合风险': min_risk,
'数据量': len(group_data)
}
except Exception as e:
print(f"处理分组 {group_name} 时发生错误:{str(e)}")
results[group_name] = {
'BMI分组': group_name,
'GAM达标周': np.nan,
'推荐NIPT周': np.nan,
'综合风险': np.nan,
'数据量': len(group_data)
}
# 过滤无效分组数据
valid_results = {k:v for k,v in results.items() if not np.isnan(v['推荐NIPT周'])}
# 结果可视化(增加异常处理)
if valid_results:
plt.figure(figsize=(12, 6))
for i, (group_name, data) in enumerate(valid_results.items()):
plt.bar(i, data['推荐NIPT周'],
color=plt.cm.tab10(i), alpha=0.7)
plt.text(i, data['推荐NIPT周'] + 0.5,
f"{group_name}\n风险:{data['综合风险']:.3f}",
ha='center', va='bottom', fontsize=8)
plt.title('不同BMI分组的最佳NIPT时点推荐')
plt.xticks(range(len(valid_results)), valid_results.keys())
plt.xlabel('BMI分组')
plt.ylabel('推荐NIPT孕周')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
else:
print("没有有效分组数据可供可视化")
# 检测误差敏感性分析
error_factors = ['检测抽血次数', 'GC含量']
for factor in error_factors:
plt.figure(figsize=(8, 6))
for group_name, group_data in df.groupby('BMI_group'):
X = group_data[[factor]]
y = group_data['Y染色体浓度']
# 计算误差范围(±1标准差)
error_margin = X.std() * 0.5
# 预测达标概率曲线
weeks = np.arange(10, 28, 0.5)
preds = []
for w in weeks:
X_test = X.copy()
X_test[factor] += error_margin # 模拟误差
pred = gam.predict(np.column_stack([X_test,
np.full_like(X_test[factor], group_data['检测抽血次数'].mean()),
np.full_like(X_test[factor], group_data['GC含量'].mean())]))
preds.append(pred.mean())
plt.plot(weeks, preds, label=f'{group_name} ({factor})')
plt.title(f'检测误差对{factor}的影响分析')
plt.xlabel('检测孕周')
plt.ylabel('Y染色体浓度达标概率')
plt.legend()
plt.grid(True)
plt.show()
出现
did not converge
did not converge
did not converge
did not converge
没有有效分组数据可供可视化
C:\Users\刘涵\AppData\Local\Temp\ipykernel_39940\1732474364.py:159: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
for group_name, group_data in df.groupby('BMI_group'):
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[10], line 172
170 X_test = X.copy()
171 X_test[factor] += error_margin # 模拟误差
--> 172 pred = gam.predict(np.column_stack([X_test,
173 np.full_like(X_test[factor], group_data['检测抽血次数'].mean()),
174 np.full_like(X_test[factor], group_data['GC含量'].mean())]))
175 preds.append(pred.mean())
177 plt.plot(weeks, preds, label=f'{group_name} ({factor})')
File ~\AppData\Roaming\Python\Python312\site-packages\pygam\pygam.py:465, in GAM.predict(self, X)
450 def predict(self, X):
451 """
452 Preduct expected value of target given model and input X
453 often this is done via expected value of GAM given input X.
(...)
463 containing predicted values under the model
464 """
--> 465 return self.predict_mu(X)
File ~\AppData\Roaming\Python\Python312\site-packages\pygam\pygam.py:438, in GAM.predict_mu(self, X)
435 if not self._is_fitted:
436 raise AttributeError("GAM has not been fitted. Call fit first.")
--> 438 X = check_X(
439 X,
440 n_feats=self.statistics_["m_features"],
441 edge_knots=self.edge_knots_,
442 dtypes=self.dtype,
443 features=self.feature,
444 verbose=self.verbose,
445 )
447 lp = self._linear_predictor(X)
448 return self.link.mu(lp, self.distribution)
File ~\AppData\Roaming\Python\Python312\site-packages\pygam\utils.py:296, in check_X(X, n_feats, min_samples, edge_knots, dtypes, features, verbose)
293 n_feats = max(n_feats, max_feat)
295 # basic diagnostics
--> 296 X = check_array(
297 X,
298 force_2d=True,
299 n_feats=n_feats,
300 min_samples=min_samples,
301 name="X data",
302 verbose=verbose,
303 )
305 # check our categorical data has no new categories
306 if (edge_knots is not None) and (dtypes is not None) and (features is not None):
307 # get a flattened list of tuples
File ~\AppData\Roaming\Python\Python312\site-packages\pygam\utils.py:177, in check_array(array, force_2d, n_feats, ndim, min_samples, name, verbose)
175 # check finite
176 if not (np.isfinite(array).all()):
--> 177 raise ValueError(f"{name} must not contain Inf nor NaN")
179 # check ndim
180 if ndim is not None:
ValueError: X data must not contain Inf nor NaN
注意总样本量为933