import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import logging
from scipy.optimize import minimize
from scipy.stats import norm
import seaborn as sns
from datetime import datetime
from concurrent.futures import ProcessPoolExecutor
import warnings
from scipy.optimize import differential_evolution
import traceback
from scipy import stats
from scipy.interpolate import CubicSpline
warnings.filterwarnings('ignore', category=RuntimeWarning)
# ====================== 全局配置 ======================
ATOMIC_MASS = {'C': 12.0107, 'H': 1.00794, 'Cl': 35.453}
CONFIG = {
'max_rf_ratio': 6.0, # RF极差限制
'rsd_threshold': 0.1, # RSD阈值
'rsd_weight': 300000, # RSD惩罚权重
'outlier_sigma': 1, # 异常值检测阈值
'min_samples_for_rsd': 2 # RSD计算最小样本数
}
# ====================== 主优化类 ======================
class CPOptimizer:
def __init__(self, std_file='Standard_Sample.csv', unknown_file=None, result_path='result'):
self.std_file = std_file
self.unknown_file = unknown_file
self.result_path = result_path
self.standard_samples = None
self.unknown_samples = None
self.chain_optimizers = {}
self.rf_database = {}
os.makedirs(self.result_path, exist_ok=True)
# 配置日志系统
log_file = os.path.join(result_path, 'optimization.log')
logging.basicConfig(
filename=log_file,
level=logging.INFO,
format='[%(asctime)s] %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
self.logger = logging.getLogger('CPOptimizer')
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s', '%Y-%m-%d %H:%M:%S'))
self.logger.addHandler(console)
self.logger.info("CPOptimizer initialized")
self.logger.setLevel(logging.DEBUG) # 确保 DEBUG 日志可见
def load_data(self):
"""加载标准样品和未知样品数据"""
try:
# 加载标准样品
self.standard_samples = pd.read_csv(self.std_file)
self.logger.info(f"Loaded standard samples: {len(self.standard_samples)} records")
# 加载未知样品(如果存在)
if self.unknown_file and os.path.exists(self.unknown_file):
self.unknown_samples = pd.read_csv(self.unknown_file)
self.logger.info(f"Loaded unknown samples: {len(self.unknown_samples)} records")
else:
self.logger.info("No unknown samples file found")
return True
except Exception as e:
self.logger.error(f"Error loading data: {str(e)}")
return False
def preprocess_standard_samples(self):
"""预处理标准样品数据"""
self.logger.info("Starting standard samples preprocessing...")
if self.standard_samples is None:
self.logger.error("Standard samples not loaded")
return False
try:
df = self.standard_samples.copy()
# 1. 计算分子量
df['Base_MW'] = df['Carbon Number'].apply(self.calculate_base_mw)
# 2. 计算氯原子数
df['Cls'] = df.apply(self.calculate_cls, axis=1)
# 3. 计算平均分子量
df['Avg_MW'] = df.apply(lambda row: self.calculate_molar_mass(row['Carbon Number'], row['Cls']), axis=1)
# 4. 计算总摩尔量 (nmol)
df['Total_Molar_nmol'] = (df['Original Mass_ng'] * 1e-9) / df['Avg_MW'] * 1e9
# 5. 标准化峰面积
cl_columns = [f'Cl{i}' for i in range(1, 13)]
for col in cl_columns:
if col in df.columns:
df[f'Norm_{col}'] = df[col] / df['Internal Standard']
self.standard_samples = df
# 6. 按碳数分组创建优化器
for carbon_num, group in df.groupby('Carbon Number'):
self.chain_optimizers[carbon_num] = ChainOptimizer(
carbon_num, group, self.result_path, self.logger
)
self.logger.info("Standard samples preprocessed")
return True
except Exception as e:
self.logger.error(f"Error preprocessing standard samples: {str(e)}")
self.logger.error(traceback.format_exc())
return False
def run_optimization(self):
"""运行优化流程"""
self.logger.info("Starting optimization process...")
if not self.chain_optimizers:
self.logger.error("No chain optimizers available")
return False
results = {}
total_chains = len(self.chain_optimizers)
self.logger.info(f"Optimizing {total_chains} carbon chains using parallel processing")
# 使用多进程并行优化
with ProcessPoolExecutor() as executor:
futures = {carbon_num: executor.submit(optimizer.optimize) for carbon_num, optimizer in
self.chain_optimizers.items()}
# 收集结果
successful_chains = 0
for carbon_num, future in futures.items():
try:
result = future.result()
results[carbon_num] = result
if result:
self.logger.info(f"Optimization completed for C{carbon_num}")
successful_chains += 1
else:
self.logger.error(f"Optimization failed for C{carbon_num}")
except Exception as e:
self.logger.error(f"Error optimizing C{carbon_num}: {str(e)}")
results[carbon_num] = False
self.logger.info(f"Successfully optimized {successful_chains}/{total_chains} carbon chains")
# 构建RF数据库
self.build_rf_database()
# 处理未知样品(商业CP)
if self.unknown_samples is not None:
self.logger.info("Processing unknown samples...")
self.process_unknown_samples()
# 生成最终报告
self.logger.info("Generating final report...")
self.generate_final_report(results)
self.logger.info("Optimization process completed")
return successful_chains > 0 # 只要至少有一个碳链优化成功就返回True
def build_rf_database(self):
"""构建响应因子(RF)数据库"""
self.logger.info("Building RF database...")
successful_chains = 0
for carbon_num, optimizer in self.chain_optimizers.items():
rf_values = optimizer.get_rf_values()
if rf_values is not None and len(rf_values) > 0:
self.rf_database[carbon_num] = rf_values
self.logger.info(f"Added RF data for C{carbon_num} with {len(rf_values)} entries")
successful_chains += 1
else:
self.logger.warning(f"No valid RF data for C{carbon_num}")
self.logger.info(f"RF database built with {successful_chains} entries")
def process_unknown_samples(self):
"""处理未知样品"""
self.logger.info("Processing unknown samples...")
try:
df = self.unknown_samples.copy()
results = []
for _, row in df.iterrows():
carbon_num = row['Carbon Number']
# 检查是否有RF数据
if carbon_num not in self.rf_database:
self.logger.warning(f"No RF data for C{carbon_num}, skipping sample")
continue
rf_values = self.rf_database[carbon_num]
# 计算内标摩尔量
is_molar = (row['Internal Standard Mass_ng'] * 1e-9) / row['Internal Standard MW_g/mol'] * 1e9
molar_results = []
for m in range(1, 13):
cl_col = f'Cl{m}'
peak_area = row[cl_col] if cl_col in row else 0
# 获取RF几何平均值
rf_key = f'RF_geo_mean_{m}'
rf_geo_mean = rf_values.get(rf_key, 0.0)
# 计算摩尔量
denominator = row['Internal Standard'] * rf_geo_mean
if denominator > 1e-10 and rf_geo_mean > 0:
molar = (peak_area * is_molar) / denominator
else:
molar = 0.0
molar_results.append(molar)
# 计算总量
total_molar = sum(molar_results)
molar_masses = [self.calculate_molar_mass(carbon_num, m) for m in range(1, 13)]
total_mass = sum([molar * mass for molar, mass in zip(molar_results, molar_masses)])
# 存储结果
result = {'StanID': row.get('StanID', 'Unknown'), 'Carbon_Number': carbon_num,
'Total_Molar_nmol': total_molar, 'Total_Mass_ng': total_mass}
# 添加各氯代数据
for m in range(1, 13):
result[f'Molar_Cl{m}'] = molar_results[m - 1]
result[f'Mass_Cl{m}'] = molar_results[m - 1] * molar_masses[m - 1]
results.append(result)
# 保存结果
result_df = pd.DataFrame(results)
unknown_file = os.path.join(self.result_path, 'Unknown_Samples_Results.csv')
result_df.to_csv(unknown_file, index=False)
self.logger.info(f"Unknown samples processed. Results saved to {unknown_file}")
return True
except Exception as e:
self.logger.error(f"Error processing unknown samples: {str(e)}")
self.logger.error(traceback.format_exc())
return False
def generate_final_report(self, results):
"""生成最终报告"""
self.logger.info("Generating final congener distribution report...")
try:
all_reports = []
for carbon_num, optimizer in self.chain_optimizers.items():
report = optimizer.get_final_report()
if report is not None:
all_reports.append(report)
self.logger.info(f"Added report for C{carbon_num}")
if not all_reports:
self.logger.error("No valid reports to generate final report")
return False
# 合并所有报告
final_report = pd.concat(all_reports, ignore_index=True)
report_file = os.path.join(self.result_path, 'Congener_Distribution_Report.csv')
final_report.to_csv(report_file, index=False)
self.logger.info(f"Final report generated: {report_file}")
return True
except Exception as e:
self.logger.error(f"Error generating final report: {str(e)}")
self.logger.error(traceback.format_exc())
return False
# ====================== 实用方法 ======================
@staticmethod
def calculate_base_mw(n):
"""计算基础分子量 (无氯)"""
return ATOMIC_MASS['C'] * n + ATOMIC_MASS['H'] * (2 * n + 2)
@staticmethod
def calculate_molar_mass(n, m):
"""计算含氯分子量"""
base = ATOMIC_MASS['C'] * n + ATOMIC_MASS['H'] * (2 * n + 2 - m)
return base + ATOMIC_MASS['Cl'] * m
def calculate_cls(self, row):
"""计算氯原子数 (浮点数)"""
n = row['Carbon Number']
target_cl_content = row['Chlorine Content']
base_mw = self.calculate_base_mw(n)
chlorine_increment = ATOMIC_MASS['Cl'] - ATOMIC_MASS['H']
# 初始化最佳值
best_m, min_error = 0, float('inf')
target_error = 0.01
# 遍历1-12氯原子数
for m in range(1, 13):
molar_mass = base_mw + chlorine_increment * m
calculated_cl = (ATOMIC_MASS['Cl'] * m / molar_mass) * 100
error = abs(calculated_cl - target_cl_content)
# 更新最佳值
if error < min_error:
min_error, best_m = error, m
if error <= target_error:
return float(m)
# 浮点优化-完整遍历1-12氯原子数后再确定最优解
m_float, step = float(best_m), 0.1
current_error = min_error # 初始化当前误差为最小误差
for _ in range(50):
if current_error <= target_error:
break
improved = False
for direction in [-1, 1]:
test_m = m_float + direction * step
if test_m < 1 or test_m > 12:
continue
molar_mass = base_mw + chlorine_increment * test_m
calculated_cl = (ATOMIC_MASS['Cl'] * test_m / molar_mass) * 100
error = abs(calculated_cl - target_cl_content)
if error < current_error:
current_error, m_float, improved = error, test_m, True
if not improved:
step /= 2
return m_float
# ====================== 碳链优化类 ======================
class ChainOptimizer:
def __init__(self, carbon_num, samples, result_path, logger):
self.carbon_num = carbon_num
self.samples = samples
self.result_path = result_path
self.logger = logger
# 结果存储
self.result_df = None
self.rf_values = None
self.raw_rf_matrix = {m: [] for m in range(1, 13)}
self.rf_matrix = {m: [] for m in range(1, 13)}
# RF数据存储
self.chain_path = os.path.join(result_path, f'C{carbon_num}')
os.makedirs(self.chain_path, exist_ok=True)
# 配置参数
self.cfg = CONFIG.copy()
self.logger.info(f"Initialized optimizer for C{carbon_num} with {len(samples)} samples")
def _safe_obj(self, func, params, *args):
"""安全的目标函数计算 (防止数值错误)"""
if not np.all(np.isfinite(params)):
return 1e9
try:
return func(params, *args)
except Exception:
return 1e9
# ====================== 优化阶段1 ======================
def _phase1_obj(self, params, samples):
"""第一阶段目标函数 (总量匹配)"""
cod = 0.0 # 目标函数值
# 遍历所有样品
for i, (_, sample) in enumerate(samples.iterrows()):
mu, sigma = params[i * 2], max(0.5, params[i * 2 + 1]) # σ下界保护
# 计算同系物分布
ratios, molars, masses, _ = self.calculate_congener_distribution(mu, sigma,
sample['Total_Molar_nmol'], sample)
# RSR
molar_sum = sum(molars)
if sample['Total_Molar_nmol'] > 0:
cod += ((sample['Total_Molar_nmol'] - molar_sum) ** 2) / sample['Total_Molar_nmol']
# RSM
mass_sum = sum(masses)
if sample['Original Mass_ng'] > 0:
cod += ((sample['Original Mass_ng'] - mass_sum) ** 2) / sample['Original Mass_ng']
return cod
# ====================== 优化阶段2 ======================
def _phase2_obj(self, params, samples):
"""专注最小化RF的RSD,移除总量误差干扰"""
# 1. 重置数据结构
rf_matrix = {m: [] for m in range(1, 13)} # 按氯原子数分组存储RF值
all_sample_rfs = [] # 按样品存储RF值(每个样品一个12元素的列表)
# 2. 计算当前参数下的RF值
for i, (_, sample) in enumerate(samples.iterrows()):
mu = params[i * 2]
sigma = max(0.5, params[i * 2 + 1])
# 计算RF值
_, _, _, rfs = self.calculate_congener_distribution(
mu, sigma, sample['Total_Molar_nmol'], sample
)
# 存储RF值
sample_rfs = []
for m, rf in enumerate(rfs, start=1):
if rf > 1e-6: # 过滤极小值
rf_matrix[m].append(rf)
sample_rfs.append(rf)
else:
sample_rfs.append(0.0) # 保持12个元素
all_sample_rfs.append(sample_rfs) # 添加当前样品的RF序列
# 3. 计算总RSD惩罚
rsd_penalty = 0.0
valid_congeners = 0
for m in range(1, 13):
rf_vals = rf_matrix[m]
n_vals = len(rf_vals)
# 有效性检查
if n_vals < self.cfg['min_samples_for_rsd']:
# 样本不足惩罚 (按缺失比例加权)
rsd_penalty += self.cfg['rsd_weight'] * (
1.0 - n_vals / self.cfg['min_samples_for_rsd']
)
continue
# 计算几何RSD (更稳健)
log_rf = np.log(rf_vals)
log_mean = np.mean(log_rf)
log_std = np.std(log_rf)
geo_rsd = np.sqrt(np.exp(log_std ** 2) - 1) # 几何RSD公式
# 累加惩罚项
rsd_penalty += self.cfg['rsd_weight'] * geo_rsd
valid_congeners += 1
# 4. 添加物理约束 (轻量级)
phys_penalty = 0.0
for i, (_, sample) in enumerate(samples.iterrows()):
mu = params[i * 2]
# μ偏离自然值的惩罚
mu_dev = abs(mu - sample['Cls'])
if mu_dev > 0.4:
phys_penalty += 100 * (mu_dev - 0.3) ** 2
# 5. 添加RF约束惩罚
rf_penalty = 0.0
# a) 低氯代物递增约束 (Cl5-Cl8)
for sample_rfs in all_sample_rfs:
# 检查Cl5-Cl8是否满足递增:Cl5 < Cl6 < Cl7 < Cl8
for m in range(5, 8): # m=5,6,7
rf_current = sample_rfs[m - 1] # Cl(m)
rf_next = sample_rfs[m] # Cl(m+1)
# 跳过无效值(RF=0表示该同族体不存在)
if rf_current <= 1e-6 or rf_next <= 1e-6:
continue
# 检查递增约束
if rf_current >= rf_next:
rf_penalty += 10000 # 重大惩罚
# b) 高氯代物极差约束 (Cl9-Cl12)
for m in range(9, 13): # m=9,10,11,12
rf_vals = rf_matrix[m]
rf_vals = [rf for rf in rf_vals if rf > 1e-6] # 过滤无效值
if len(rf_vals) < 2: # 至少需要2个有效值
continue
max_rf = max(rf_vals)
min_rf = min(rf_vals)
ratio = max_rf / min_rf
# 检查极差约束(最大/最小 ≤ 10)
if ratio > 10.0:
rf_penalty += 3000 * (ratio - 10.0) # 按超出比例惩罚
return rsd_penalty + phys_penalty + rf_penalty
def get_initial_params(self):
"""基于RF一致性生成初始参数"""
rf_matrix = {m: [] for m in range(1, 13)}
# 使用自然μ计算初始RF
for _, sample in self.samples.iterrows():
_, _, _, rfs = self.calculate_congener_distribution(
sample['Cls'],
1.0, # 初始σ=1.0
sample['Total_Molar_nmol'],
sample
)
for m, rf in enumerate(rfs, start=1):
if rf > 1e-6:
rf_matrix[m].append(rf)
# 寻找RF最稳定的μ作为初始点
best_mu, best_rsd = None, float('inf')
for test_mu in np.linspace(3, 9, 7): # 测试3-9之间的μ
total_rsd = 0
valid = 0
for m in range(1, 13):
if rf_matrix[m]:
# 计算调整因子
adj_factors = [norm.pdf(m, test_mu, 1.0) / max(norm.pdf(m, sample['Cls'], 1.0), 1e-6)
for sample in self.samples]
adj_rf = [rf * factor for rf, factor in zip(rf_matrix[m], adj_factors)]
# 计算几何RSD
log_rf = np.log(adj_rf)
log_std = np.std(log_rf)
total_rsd += np.sqrt(np.exp(log_std ** 2) - 1)
valid += 1
avg_rsd = total_rsd / valid if valid else 1e6
if avg_rsd < best_rsd:
best_rsd, best_mu = avg_rsd, test_mu
# 生成初始参数
initial_params = []
for _ in self.samples.iterrows():
initial_params.extend([best_mu, 1.0]) # 统一使用最优μ
return initial_params
# ====================== 主优化函数 ======================
def optimize(self):
"""执行优化过程"""
self.logger.info(f"Starting optimization for C{self.carbon_num}")
start_time = datetime.now()
try:
# ===== 第一阶段:差分进化 =====
self.logger.info(f"Phase 1: Differential Evolution for C{self.carbon_num}")
# 定义第一阶段边界
phase1_bounds = []
for _, sample in self.samples.iterrows():
w = sample['Cls']
phase1_bounds.extend([
(max(1.0, w - 1.5), min(12.0, w + 1.5)), # μ边界
(0.5, 3.0) # σ边界
])
# 动态设置优化参数
n_samples = len(self.samples)
chain_factor = 1.0 + 0.04 * abs(self.carbon_num - 10)
cl_discrete = np.std(self.samples['Chlorine Content'])
maxiter = int(300 * chain_factor + 100 * n_samples * (1 + cl_discrete / 10))
popsize = int(15 * chain_factor + 8 * n_samples * (1 + cl_discrete / 20))
convergence_thresh = 1e-3 if n_samples <= 3 else 1e-4 * np.log10(n_samples)
# 早停机制历史记录
self.best_history = []
# 执行差分进化
res1 = differential_evolution(
self._phase1_obj, phase1_bounds, args=(self.samples,),
maxiter=maxiter,
popsize=popsize,
tol=convergence_thresh,
init='latinhypercube',
callback=self._early_stopping,
workers=1,
updating='immediate',
polish=True, # 确保最后进行局部优化
strategy='best1bin', # 更有效的策略
mutation=(0.5, 1.0), # 动态变异率
recombination=0.9 # 较高的重组率
)
# 添加重启机制
if res1.fun > 1.0 and res1.nit < maxiter // 2: # 结果不理想且提前停止
self.logger.warning(f"C{self.carbon_num} phase-1 restarted (previous best: {res1.fun:.2e})")
# 使用更大种群重启
res1 = differential_evolution(
self._phase1_obj, phase1_bounds, args=(self.samples,),
maxiter=maxiter,
popsize=int(popsize * 1.5), # 增加种群规模
tol=convergence_thresh,
init='latinhypercube',
polish=True,
mutation=(0.7, 1.3), # 更大的变异范围
recombination=0.85
)
# 记录结果
stop_reason = "converged" if res1.success else f"maxiter reached ({res1.nit}/{maxiter})"
self.logger.warning(f"C{self.carbon_num} phase-1 stopped: {stop_reason},Best COD: {res1.fun:.4e}")
best_total = res1.fun
x0 = res1.x
# ===== 第二阶段:L-BFGS-B =====
self.logger.info(f"Phase 2: L-BFGS-B Optimization for C{self.carbon_num}")
# 定义更严格的边界
phase2_bounds = []
for _, sample in self.samples.iterrows():
w = sample['Cls']
phase2_bounds.extend([
(max(1.0, w - 0.5), min(12.0, w + 0.5)), # μ严格边界
(0.5, 3.0) # σ边界
])
# 确保初始值在严格边界内
x0_strict = []
for j, (low, high) in enumerate(phase2_bounds):
x0_strict.append(np.clip(x0[j], low, high))
# 重置历史记录
self.best_history = []
# 执行L-BFGS-B优化 - 修正参数传递
res2 = minimize(
self._phase2_obj,
x0_strict,
args=(self.samples,), # 只传递一个参数
method='L-BFGS-B',
bounds=phase2_bounds,
options={
'maxiter': 1000, # 增加迭代次数
'ftol': 1e-10, # 更严格的收敛阈值
}
)
# 记录结果
stop_reason = ("converged" if res2.success else
f"maxiter reached ({res2.nit}/300)" if res2.nit >= 300 else
"early stopped")
self.logger.info(f"C{self.carbon_num} phase-2 stopped: {stop_reason},Final COD: {res2.fun:.4e}")
# 检查优化结果是否有效
if not np.all(np.isfinite(res2.x)):
self.logger.error(f"Optimization for C{self.carbon_num} produced invalid parameters")
return False
# ===== 处理结果 =====
opt_params = res2.x
results = []
# 计算每个样品的结果
for i, (_, sample) in enumerate(self.samples.iterrows()):
mu = opt_params[i * 2]
sigma = max(0.5, opt_params[i * 2 + 1]) # σ下界保护
result = self.calculate_sample_result(sample, mu, sigma)
if result is not None:
results.append(result)
# 检查是否有有效结果
if not results:
self.logger.error(f"No valid results for C{self.carbon_num}")
return False
# 保存结果
self.result_df = pd.DataFrame(results)
self.calculate_rf_stats()
self.save_results()
self.generate_plots()
duration = (datetime.now() - start_time).total_seconds()
self.logger.info(f"Optimization for C{self.carbon_num} completed in {duration:.2f} seconds")
return True
except Exception as e:
self.logger.error(f"Error optimizing C{self.carbon_num}: {e}")
self.logger.error(traceback.format_exc())
return False
# ====================== 优化辅助方法 ======================
def _early_stopping(self, xk, convergence):
"""稳健的差分进化早停回调函数"""
current_obj = self._phase1_obj(xk, self.samples)
self.best_history.append(current_obj)
n_iter = len(self.best_history)
min_iter = max(50, len(self.samples) * 5) # 基于样本数的动态最小迭代次数
# 1. 强制最小迭代次数
if n_iter < min_iter:
return False
# 2. 目标函数值接近零时直接停止
if current_obj < 1e-6: # 足够小的目标值
self.logger.debug(f"C{self.carbon_num} early stop: target reached ({current_obj:.2e})")
return True
# 3. 改进量检测
window = min(20, n_iter // 3) # 更大的窗口
if n_iter > window:
recent_values = self.best_history[-window:]
improvements = np.abs(np.diff(recent_values))
avg_improve = np.mean(improvements)
# 3.1 基于绝对值和相对值的复合条件
if avg_improve < max(1e-5, current_obj * 0.001): # 绝对阈值或相对0.1%
self.logger.debug(
f"C{self.carbon_num} early stop: avg_improve={avg_improve:.2e} (current={current_obj:.2e})")
return True
# 4. 平台期检测(更稳健的计算)
plateau_window = min(25, n_iter // 4)
if n_iter > plateau_window:
plateau_values = self.best_history[-plateau_window:]
start_val = plateau_values[0]
end_val = plateau_values[-1]
# 避免除以零错误
if abs(start_val) > 1e-10:
relative_improve = abs(start_val - end_val) / abs(start_val)
else:
relative_improve = 0.0
absolute_improve = abs(start_val - end_val)
# 复合平台检测条件
if (relative_improve < 0.001) and (absolute_improve < 0.1 * current_obj):
self.logger.debug(f"C{self.carbon_num} early stop: plateau detected "
f"(rel={relative_improve:.2%}, abs={absolute_improve:.2e}, current={current_obj:.2e})")
return True
return False
def _lbfgsb_callback(self, xk):
"""L-BFGS-B早停回调函数"""
current_obj = self._phase2_obj(xk, self.samples, self.best_history[0] if self.best_history else 1e9)
self.best_history.append(current_obj)
# 检查最近3次迭代改进
if len(self.best_history) > 3:
recent_improve = abs(np.diff(self.best_history[-3:])).mean()
if recent_improve < 5e-5: # 更严格的收敛阈值
# 通过抛出特殊异常停止优化
raise StopIteration("Convergence achieved")
def calculate_sample_result(self, sample, mu_opt, sigma_opt):
"""计算单个样品结果"""
try:
total_molar = sample['Total_Molar_nmol']
# 计算同系物分布
ratios, molars, masses, rfs = self.calculate_congener_distribution(
mu_opt, max(sigma_opt, 0.3), total_molar, sample
)
# 检查计算结果是否有效
if not all(np.isfinite(ratios)) or not all(np.isfinite(molars)) or not all(np.isfinite(masses)):
self.logger.warning(f"Invalid calculation results for sample {sample['StanID']}")
return None
# 存储RF值
for m, rf in enumerate(rfs, start=1):
if rf > 0:
self.raw_rf_matrix[m].append(rf)
self.rf_matrix[m].append(rf)
# 计算残差
original_mass = sample['Original Mass_ng']
calculated_mass = sum(masses)
rsr = ((total_molar - sum(molars)) ** 2) / total_molar if total_molar > 0 else 0
rsm = ((original_mass - calculated_mass) ** 2) / original_mass if original_mass > 0 else 0
# 构建结果字典
result_data = {
'StanID': sample['StanID'],
'Carbon_Number': self.carbon_num,
'Chlorine_Content': sample['Chlorine Content'],
'Total_Molar_nmol': total_molar,
'Total_Mass_ng': original_mass,
'Cls': sample['Cls'],
'mu_natural': sample['Cls'],
'mu_optimized': mu_opt,
'mu_dev': abs(mu_opt - sample['Cls']),
'sigma': sigma_opt,
'RSR': rsr,
'RSM': rsm
}
# 添加各氯代数据
for m in range(1, 13):
result_data[f'Ratio_Cl{m}'] = ratios[m - 1] if len(ratios) > m - 1 else 0.0
result_data[f'Molar_Cl{m}'] = molars[m - 1] if len(molars) > m - 1 else 0.0
result_data[f'Mass_Cl{m}'] = masses[m - 1] if len(masses) > m - 1 else 0.0
result_data[f'RF_Cl{m}'] = rfs[m - 1] if len(rfs) > m - 1 else 0.0
norm_col = f'Norm_Cl{m}'
result_data[norm_col] = sample[norm_col] if norm_col in sample else 0.0
return result_data
except Exception as e:
self.logger.error(f"Error calculating sample result for {sample['StanID']}: {e}")
return None
def calculate_congener_distribution(self, mu, sigma, total_molar, sample):
"""计算同系物分布和RF值"""
try:
n = self.carbon_num
m_values = np.arange(1, 13)
# 计算正态分布PDF
pdf_values = norm.pdf(m_values, mu, max(sigma, 0.3)) # σ下界保护调整为0.5
total_pdf = np.sum(pdf_values)
if total_pdf > 0:
ratios = pdf_values / total_pdf
else:
ratios = np.zeros(12)
# 检查比率是否有效
if not all(np.isfinite(ratios)):
self.logger.warning(f"Invalid PDF ratios for mu={mu}, sigma={sigma}")
ratios = np.zeros(12)
# 计算摩尔量、质量和RF
molars, masses, rfs = [], [], []
is_molar = (sample['Internal Standard Mass_ng'] * 1e-9) / sample['Internal Standard MW_g/mol'] * 1e9
for m, ratio in zip(range(1, 13), ratios):
# 摩尔量
molar = total_molar * ratio
mw = CPOptimizer.calculate_molar_mass(n, m)
mass = molar * mw
molars.append(molar)
masses.append(mass)
# 响应因子(RF)
cl_col = f'Cl{m}'
peak_area = sample[cl_col] if cl_col in sample else 0
if molar > 1e-10 and peak_area > 0:
denominator = max(sample['Internal Standard'] * molar, 1e-10) # 防止除以0
rf = (peak_area * is_molar) / denominator
else:
rf = 0.0
rfs.append(rf)
return ratios, molars, masses, rfs
except Exception as e:
self.logger.error(f"Error calculating congener distribution: {e}")
return np.zeros(12), np.zeros(12), np.zeros(12), np.zeros(12)
# ====================== RF统计方法 ======================
def calculate_rf_stats(self):
"""使用优化后的参数重新计算RF,避免中间状态污染"""
self.rf_values = {}
rf_matrix = {m: [] for m in range(1, 13)}
# 检查是否有有效的结果数据
if self.result_df is None or self.result_df.empty:
self.logger.warning(f"No result data available for C{self.carbon_num}")
return
# 使用最终参数重新计算RF
for _, row in self.result_df.iterrows():
# 查找对应的原始样品数据
sample_match = self.samples[self.samples['StanID'] == row['StanID']]
if sample_match.empty:
self.logger.warning(f"No matching sample found for StanID: {row['StanID']}")
continue
sample = sample_match.iloc[0]
# 使用优化后的参数计算RF
_, _, _, rfs = self.calculate_congener_distribution(
row['mu_optimized'],
row['sigma'],
row['Total_Molar_nmol'],
sample
)
for m, rf in enumerate(rfs, start=1):
if rf > 1e-6:
rf_matrix[m].append(rf)
# 计算几何平均和RSD
for m in range(1, 13):
rf_vals = rf_matrix[m]
if len(rf_vals) > 0:
log_vals = np.log(rf_vals)
geo_mean = np.exp(np.mean(log_vals))
log_std = np.std(log_vals)
geo_rsd = np.sqrt(np.exp(log_std ** 2) - 1)
else:
geo_mean, geo_rsd = 0.0, np.nan
self.rf_values[f'RF_geo_mean_{m}'] = geo_mean
self.rf_values[f'RF_geo_rsd_{m}'] = geo_rsd * 100 # 转换为百分比
def _calc_robust_rsd(self, rf_vals):
"""计算基于MAD的稳健RSD"""
if len(rf_vals) < self.cfg['min_samples_for_rsd']:
return np.nan # 样本不足返回NaN
# 稳健RSD (MAD-based)
median = np.median(rf_vals)
mad = stats.median_abs_deviation(rf_vals, scale='normal')
return mad / median if median > 0 else np.nan
# ====================== 结果处理 ======================
def save_results(self):
"""保存结果到CSV"""
if self.result_df is not None:
result_file = os.path.join(self.chain_path, f'C{self.carbon_num}_Results.csv')
self.result_df.to_csv(result_file, index=False)
self.logger.info(f"Results for C{self.carbon_num} saved to {result_file}")
def generate_plots(self):
"""生成所有可视化图表"""
if self.result_df is None or self.result_df.empty:
return
# 生成各样品拟合曲线
for _, sample in self.samples.iterrows():
sample_id = sample['StanID']
sample_results = self.result_df[self.result_df['StanID'] == sample_id]
if sample_results.empty:
continue
result = sample_results.iloc[0]
self.plot_fit_curve(result, sample['Chlorine Content'])
# 生成RF聚类图
self.plot_rf_cluster()
# 生成偏差图
self.plot_deviation()
# 生成RSD热力图
self.plot_rsd_heatmap()
def plot_fit_curve(self, result, chlorine_content):
"""绘制拟合曲线图"""
try:
plt.figure(figsize=(10, 6))
# 获取优化参数
mu_opt = result['mu_optimized']
sigma = max(0.3, result['sigma'])
mu_natural = result['mu_natural']
total_molar = result['Total_Molar_nmol']
n = self.carbon_num
# 计算理论分布
m_values = np.arange(1, 13)
dense_m = np.linspace(1, 12, 200)
pdf_values = norm.pdf(m_values, mu_opt, sigma)
total_pdf = pdf_values.sum()
if total_pdf > 0:
pdf_values = pdf_values / total_pdf
dense_pdf = norm.pdf(dense_m, mu_opt, sigma)
if total_pdf > 0:
dense_pdf = dense_pdf / total_pdf
# 实际分布
actual_molars = [result[f'Molar_Cl{m}'] for m in range(1, 13)]
actual_ratios = [m / total_molar for m in actual_molars] if total_molar > 0 else [0] * 12
# 绘制曲线
plt.plot(dense_m, dense_pdf, 'r-', linewidth=2, label=f'Fit: μ={mu_opt:.2f}, σ={sigma:.2f}')
plt.plot(m_values, pdf_values, 'rx', markersize=10, label='Predicted Ratios')
plt.plot(m_values, actual_ratios, 'bo', markersize=8, label='Actual Ratios')
# 添加双x轴(氯含量)
ax1 = plt.gca()
ax2 = ax1.twiny()
ax2.set_xlim(ax1.get_xlim())
cl_contents = [(ATOMIC_MASS['Cl'] * m) / CPOptimizer.calculate_molar_mass(n, m) * 100 for m in m_values]
ax2.set_xticks(m_values)
ax2.set_xticklabels([f'{c:.1f}%' for c in cl_contents], fontsize=8, color='#888888')
ax2.set_xlabel('Chlorine Content', fontsize=10, color='#888888')
# 添加参考线
plt.axvline(mu_natural, color='#888888', linestyle='--', label=f'Natural μ: {mu_natural:.2f}')
plt.axvline(mu_opt, color='red', linestyle='--', label=f'Optimized μ: {mu_opt:.2f}')
# 图表装饰
plt.title(f'C{self.carbon_num} - {chlorine_content}% Chlorine', fontsize=14)
plt.xlabel('Chlorine Substitution (m)', fontsize=12)
plt.ylabel('Normalized Distribution', fontsize=12)
plt.xticks(range(1, 13))
plt.ylim(bottom=-0.05)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(loc='upper right')
# 保存图表
plot_file = os.path.join(self.chain_path, f'Fit_Curve_C{self.carbon_num}_{chlorine_content}.png')
plt.tight_layout()
plt.savefig(plot_file, dpi=300)
plt.close()
self.logger.info(f"Fit curve plot saved: {plot_file}")
except Exception as e:
self.logger.error(f"Error generating fit curve plot: {str(e)}")
def plot_rf_cluster(self):
"""绘制RF聚类图"""
try:
plt.figure(figsize=(12, 8))
rf_data = []
# 收集RF数据
for _, row in self.result_df.iterrows():
sample_id = row['StanID']
chlorine_content = row['Chlorine_Content']
rfs = [row[f'RF_Cl{m}'] for m in range(1, 13)]
rf_data.append({'Sample': f'{sample_id} ({chlorine_content}%)', 'RF': rfs})
# 创建DataFrame
df_rf = pd.DataFrame({
'Chlorine Substitution': np.tile(range(1, 13), len(rf_data)),
'RF': np.concatenate([d['RF'] for d in rf_data]),
'Sample': np.repeat([d['Sample'] for d in rf_data], 12)
})
# 绘制条形图
sns.barplot(x='Chlorine Substitution', y='RF', hue='Sample', data=df_rf, palette='viridis', errorbar=None)
# 图表装饰
plt.title(f'Response Factor (RF) Cluster - C{self.carbon_num}', fontsize=16)
plt.xlabel('Chlorine Substitution', fontsize=12)
plt.ylabel('Response Factor (RF)', fontsize=12)
plt.yscale('log')
#plt.yscale('linear')
#plt.ylim(0, 3)
#plt.ylim(1e-3, 1e4)
plt.legend(title='Sample', loc='upper right', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.5, which='both')
# 保存图表
plot_file = os.path.join(self.chain_path, f'RF_Cluster_C{self.carbon_num}.png')
plt.tight_layout()
plt.savefig(plot_file, dpi=300)
plt.close()
self.logger.info(f"RF cluster plot saved: {plot_file}")
except Exception as e:
self.logger.error(f"Error generating RF cluster plot: {str(e)}")
def plot_deviation(self):
"""绘制偏差图"""
try:
fig, ax = plt.subplots(figsize=(10, 6))
# 准备数据
rsr_values = self.result_df['RSR'].values
rsm_values = self.result_df['RSM'].values
sample_ids = [f"{row['StanID']} ({row['Chlorine_Content']}%)"
for _, row in self.result_df.iterrows()]
# 绘制柱状图
x = np.arange(len(sample_ids))
width = 0.35
rsr_bars = ax.bar(x - width / 2, rsr_values, width, label='RSR (Molar)', color='skyblue', alpha=0.8)
rsm_bars = ax.bar(x + width / 2, rsm_values, width, label='RSM (Mass)', color='salmon', alpha=0.8)
# 添加数值标签
for bar in rsr_bars:
height = bar.get_height()
ax.annotate(f'{height:.2e}', xy=(bar.get_x() + bar.get_width() / 2, height),
xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)
for bar in rsm_bars:
height = bar.get_height()
ax.annotate(f'{height:.2e}', xy=(bar.get_x() + bar.get_width() / 2, height),
xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)
# 图表装饰
ax.set_xlabel('Sample', fontsize=12)
ax.set_ylabel('Residual Value', fontsize=12)
ax.set_title(f'Calculation Deviation - C{self.carbon_num}', fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(sample_ids, rotation=45, ha='right', fontsize=10)
ax.legend(loc='upper right')
ax.grid(True, linestyle='--', alpha=0.3, axis='y')
# 保存图表
plot_file = os.path.join(self.chain_path, f'Deviation_C{self.carbon_num}.png')
plt.tight_layout()
plt.savefig(plot_file, dpi=300)
plt.close()
self.logger.info(f"Deviation plot saved: {plot_file}")
except Exception as e:
self.logger.error(f"Error generating deviation plot: {str(e)}")
def plot_rsd_heatmap(self):
"""单行热力图:显示优化后的几何RSD"""
try:
# 1. 准备数据(使用几何RSD)
rsd_row = []
for m in range(1, 13):
# 直接从RF值中获取几何RSD
rsd = self.rf_values.get(f'RF_geo_rsd_{m}', np.nan)
rsd_row.append(rsd)
# 2. DataFrame 形式:一行、列名 Cl1…Cl12
heatmap_df = pd.DataFrame([rsd_row],
columns=[f'Cl{i}' for i in range(1, 13)])
# 3. 绘图
plt.figure(figsize=(12, 1.8))
sns.heatmap(
heatmap_df,
annot=True,
fmt='.1f',
cmap='RdYlGn_r',
linewidths=0.5,
linecolor='#888888',
vmin=0,
vmax=20,
cbar_kws={'label': 'RSD (%)'},
annot_kws={'size': 10}
)
# 4. 装饰
plt.title(f'Optimized RF RSD - C{self.carbon_num}', fontsize=13)
plt.xlabel('Congener', fontsize=11)
plt.ylabel('') # 去掉纵轴标签
plt.yticks([]) # 去掉纵轴刻度
plt.xticks(rotation=0)
# 5. 保存
plot_file = os.path.join(self.chain_path,
f'Optimized_RSD_Row_Heatmap_C{self.carbon_num}.png')
plt.tight_layout()
plt.savefig(plot_file, dpi=300)
plt.close()
self.logger.info(f"Row-wise RSD heatmap saved: {plot_file}")
except Exception as e:
self.logger.error(f"Error generating row RSD heatmap: {str(e)}\n{traceback.format_exc()}")
# ====================== 接口方法 ======================
def get_final_report(self):
"""获取最终报告"""
return self.result_df.copy() if self.result_df is not None else None
def get_rf_values(self):
"""获取RF值 - 确保始终返回有效字典"""
if self.rf_values is None or len(self.rf_values) == 0:
# 如果没有RF值,尝试重新计算
self.logger.warning(f"No RF values found for C{self.carbon_num}, attempting to recalculate...")
self.calculate_rf_stats()
# 如果仍然没有RF值,返回一个包含默认值的字典
if self.rf_values is None or len(self.rf_values) == 0:
self.logger.error(f"Failed to calculate RF values for C{self.carbon_num}, returning default values")
default_rf = {}
for m in range(1, 13):
default_rf[f'RF_geo_mean_{m}'] = 1.0
default_rf[f'RF_geo_rsd_{m}'] = 100.0 # 高RSD表示不可靠
return default_rf
return self.rf_values
# ====================== 主函数 ======================
def main():
"""主执行函数"""
# 初始化优化器
optimizer = CPOptimizer(
std_file='Standard_Sample.csv',
unknown_file='Unknown_Samples.csv',
result_path='result'
)
# 执行优化流程
if optimizer.load_data():
if optimizer.preprocess_standard_samples():
optimizer.run_optimization()
optimizer.logger.info("Optimization completed successfully")
else:
optimizer.logger.error("Standard sample preprocessing failed")
else:
optimizer.logger.error("Data loading failed")
if __name__ == "__main__":
main()
最新发布