Line Plots and Curves (plt.annotate)

This article walks through data visualization with Python's Matplotlib library, covering basic line plotting, customizing colors and styles, adjusting line widths and markers, and annotating and drawing shapes on subplots. Worked examples show how to read data from a file and plot it, how to plot pandas Series and DataFrame objects, and how to create and adjust subplots.

In [1]: import matplotlib.pyplot as plt

In [2]: X = range(100)

           list(X)[:5]

Out[2]: [0, 1, 2, 3, 4]

 

In [3]: Y=[value **2 for value in X]

           Y[:5]

Out[3]: [0, 1, 4, 9, 16]

 

In [15]: plt.plot(X,Y)

Out[15]:  # the Line2D repr is shown when plt.show() is not called

[<matplotlib.lines.Line2D at 0x5219b00>]

https://i-blog.csdnimg.cn/blog_migrate/3d656a941a3676d68a797ebe15e7027c.png

In [13]: plt.plot(X,Y)

             plt.show()

Out[13]:

https://i-blog.csdnimg.cn/blog_migrate/3f18bd603aac8d2f4e87c7fa93de951b.png

In [29]: import numpy as np

             import matplotlib.pyplot as plt

In [30]: XList = np.linspace(-3,2,200)

             YList = XList**2 - 2*XList +1     #plotting Y = X^2 - 2X + 1

In [31]: plt.plot(XList, YList)

             plt.show()

https://i-blog.csdnimg.cn/blog_migrate/f289d1c4eb4b0d5b27619a8ddfd5b0f2.png

In [49]: import matplotlib.pyplot as plt

In [54]: XList, YList=[], []

      for line in open('my_data.txt', 'r'):

         values =[float(v) for v in line.split()]

         XList.append(values[0]) #x-axis

         YList.append(values[1]) #y-axis

In [55]: plt.plot(XList, YList)

             plt.show()

https://i-blog.csdnimg.cn/blog_migrate/6e14f7af24b91cb6488f7fa209e79385.png

In [56]: import matplotlib.pyplot as plt

In [57]: with open('my_data.txt', 'r') as f:

      X,Y= zip(*[

                          [ float(s) for s in line.split() ] for line in f

                      ])#2D-list, each element is a list in this 2D-list

In [58]: plt.plot(X,Y)

             plt.show()

https://i-blog.csdnimg.cn/blog_migrate/5990a2d9e293566a354805a711ee8724.png

In [59]: import matplotlib.pyplot as plt

             import numpy as np

In [60]: data = np.loadtxt('my_data.txt') #returns the file's contents as a 2D array (a full matrix)

In [61]: plt.plot(data[:,0], data[:,1])   #column 0 --> X-axis, column 1 --> Y-axis

             plt.show()

https://i-blog.csdnimg.cn/blog_migrate/4d5ac14b844d6880e091f4cc45a69254.png

##################################

'my_data.txt'

0 0 6
1 1 5
2 4 4
4 16 3
5 25 2
6 36 1

##################################

In [62]: import numpy as np

             import matplotlib.pyplot as plt

In [63]: data = np.loadtxt('my_data.txt')

            data.T

Out[63]: array([[ 0., 1., 2., 4., 5., 6.],

              [ 0., 1., 4., 16., 25., 36.],

              [ 6., 5., 4., 3., 2., 1.]])

In [67]: for columnList in data.T[1:]:            #excluding data.T[0], the x column

                   plt.plot(data[:,0], columnList)       #each remaining column is drawn as one curve

             plt.show()

https://i-blog.csdnimg.cn/blog_migrate/ca481849c4cc4fd71d4c1e0708b3e23e.png

In [68]: data

Out[68]: array([[ 0., 0., 6.], [ 1., 1., 5.], [ 2., 4., 4.], [ 4., 16., 3.], [ 5., 25., 2.], [ 6., 36., 1.]])

#######################################################################

cp2: Customizing the Colors and Styles

1 Alias Colors

  1. b Blue
  2. g Green
  3. r Red
  4. c Cyan
  5. m Magenta
  6. y Yellow
  7. k Black
  8. w White

1.0.1 Gray-level strings: matplotlib will interpret a string representation of a floating-point value as a shade of gray, such as '0.75' for a medium light gray.
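A minimal sketch, on arbitrary data, combining a few of the alias colors above with a gray-level string:

import numpy as np
import matplotlib.pyplot as plt

X = np.linspace(0, 2 * np.pi, 100)
plt.plot(X, np.sin(X), color='r')          # red (alias color)
plt.plot(X, np.cos(X), color='g')          # green (alias color)
plt.plot(X, np.sin(X) + 1, color='0.5')    # medium gray (gray-level string)
plt.show()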

In [1]: import numpy as np

           import matplotlib.pyplot as plt

In [2]: #Gaussian Distribution

           #Probability density function

           def pdf(X, mu, sigma):

                  a = 1./(sigma*np.sqrt(2.*np.pi))

                  b =-1./(2* sigma**2)

                  return a * np.exp(b* (X-mu)**2 ) #f(x)

In [3]: X = np.linspace(-6,6,1000)

           for i in range(5):

                 #generates five sets of 50 samples from a normal distribution

                  samples = np.random.standard_normal(50)

                  mu, sigma = np.mean(samples), np.std(samples)

                  #For each of the 5 sets, we plot the estimated probability density in light gray

                  plt.plot(X, pdf(X, mu, sigma), color='0.75')

           plt.plot(X, pdf(X,0., 1.), color='k')# standard normal distribution(mean=0,stdev=1)

           plt.show()

In [6]: samples=np.random.standard_normal(50)

           samples.shape

Out[6]: (50L,)

In [7]: mu, sigma = np.mean(samples), np.std(samples)

           mu,sigma

Out[7]: (-0.20688694794049467, 0.8455170647061793)

Controlling a line pattern and thickness

# In[20]:


import numpy as np
import matplotlib.pyplot as plt


# ### Normal Distribution~(mean,Variance)=(0,1)
# #### Gaussians Probability Density Functions~(mean,Variance)=(u,sigma**2 )

# In[21]:


#Gaussians PDF
def pdf(X, mu, sigma):
    a=1./( sigma * np.sqrt(2. * np.pi))
    b=-1./( 2. * sigma **2)
    return a*np.exp(b*(X-mu)**2)


# In[22]:


X = np.linspace(-6,6,1024)


# In[23]:


plt.plot(X, pdf(X, 0., 1.), color='k', linestyle='solid')
plt.plot(X, pdf(X, 0., .5), color='k', linestyle='dashed')
plt.plot(X, pdf(X, 0., .25), color='k', linestyle='dashdot')

plt.show()

The line width

# In[62]:
import numpy as np
import matplotlib.pyplot as plt


# In[65]:
def pdf(X, mu, sigma):
    a = 1./(sigma*np.sqrt(2. * np.pi))
    b= -1./(2.* sigma**2)
    return a*np.exp(b* (X-mu)**2)


# In[69]:
X = np.linspace(-6,6,1024)
for i in range(64):
    samples = np.random.standard_normal(50)
    mu, sigma = np.mean(samples), np.std(samples)
    plt.plot(X, pdf(X, mu, sigma), color='.75', linewidth=0.5)

plt.plot(X, pdf(X,0,1), color='k', linewidth=3.)           #u =0, std.dev=1  ~ standard normal distribution
plt.show()

markevery=32 plots a marker at every 32nd data point, starting from the first one

# In[104]:
import numpy as np
import matplotlib.pyplot as plt


# In[100]:
X = np.linspace(-6,6,1024)
Y1 = np.sinc(X)               #The sinc function is sin(pi x)/(pi x).
Y2= np.sinc(X)+1
Y2


# In[103]:
plt.plot(X,Y1, marker='o', color='.75')
#markevery=32 plots a marker at every 32nd data point, starting from the first one
plt.plot(X,Y2, marker='o', color='k', markevery=32)  

plt.show()

# In[117]:
import numpy as np
import matplotlib.pyplot as plt


# In[118]:
X = np.linspace(-6,6,1024)
Y = np.sinc(X)


# In[120]:
plt.plot(X,Y,
        linewidth=3.,
        color='k',
        markersize=9,
        markeredgewidth=3.5,
        markerfacecolor='.75',
        markeredgecolor='k',
        marker='o',
        markevery=32)

plt.show()

 

# In[121]:
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt


# In[173]:
mpl.rc('lines', linewidth=5)
mpl.rc('axes', facecolor='y', edgecolor='w')
mpl.rc('xtick', color='m')
mpl.rc('ytick', color='m')
mpl.rc('text', color='m')
mpl.rc('figure', facecolor='r', edgecolor='y')
#mpl.rc('axes', color_cycle = ('w', '.5', '.75'))  # 'color_cycle' is deprecated; newer matplotlib uses 'prop_cycle'


# In[172]:
X = np.linspace(0,7,1024)


# In[170]:
plt.plot(X, np.sin(X))
plt.plot(X, np.cos(X))

plt.show()

If a matplotlibrc file is found in your current directory (that is, the directory from which you launched your script), it overrides matplotlib's default settings. You can also save a matplotlibrc file in a specific location to make your own default settings. In the interactive Python shell, run: import matplotlib; matplotlib.get_configdir(). This displays the location where you can place your matplotlibrc file so that those settings become your own defaults.
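A minimal sketch for locating the active configuration; matplotlib.get_configdir(), matplotlib.matplotlib_fname(), and the rcParams dictionary are standard matplotlib APIs:

import matplotlib as mpl

print(mpl.get_configdir())              # directory where a user-level matplotlibrc can be placed
print(mpl.matplotlib_fname())           # the matplotlibrc file currently in effect
print(mpl.rcParams['lines.linewidth'])  # one of the resulting default settings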

Line Plots

The Series object’s index is passed to matplotlib for plotting on the x-axis, though you can disable this by passing use_index=False.

The x-axis ticks and limits can be adjusted with the xticks and xlim options, and y-axis respectively with yticks and ylim.

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

                        #Y-axis                     #X-axis

s = pd.Series( np.random.randn(10).cumsum(), index=np.arange(0,100,10) )

s.plot()

 

plt.show()
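A minimal sketch of the options mentioned above (use_index, xticks, xlim, ylim), reusing the Series s defined in the previous block; the tick and limit values here are arbitrary:

s.plot(use_index=False)     # plot against 0..N-1 instead of the Series index
plt.show()

s.plot(xticks=[0, 25, 50, 75, 100], xlim=[0, 100], ylim=[-5, 5])
plt.show()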

 

 

DataFrame’s plot method plots each of its columns as a different line on the same subplot, creating a legend automatically. The plot attribute contains a “family” of methods for different plot types; for example, df.plot() is equivalent to df.plot.line().

 

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

 

df = pd.DataFrame(np.random.randn(10,4).cumsum(0),

                 columns=['A','B','C','D'],

                 index = np.arange(0,100,10))

df.plot()

plt.show()

Figures and Subplots

Plots in matplotlib reside within a Figure object.

 

import matplotlib.pyplot as plt

from numpy.random import randn

fig = plt.figure()

ax = fig.add_subplot(1,1,1)

#pass no label or label='_nolegend_'

ax.plot(randn(1000).cumsum(), color='k', label='one')

ax.plot(randn(1000).cumsum(), color='k', linestyle='--', label='two')

ax.plot(randn(1000).cumsum(), color='k', linestyle='dotted',label='three')

 

ticks = ax.set_xticks([0,250,500,750,1000])

labels = ax.set_xticklabels(['one','two', 'three', 'four', 'five'], rotation=30, fontsize='small')

 

ax.set_title('My first matplotlib plot')

ax.set_xlabel('Stages')

////////////////////////////////////////  OR

props = {

        'title': 'My first matplotlib plot',

        'xlabel': 'Stages'

}

ax.set(**props)

////////////////////////////////////////

ax.legend(loc='best')

plt.show()

When a plotting command such as plt.plot is issued without referencing a specific Axes, matplotlib draws on the most recently used figure and subplot (creating them if necessary), hiding the explicit figure and subplot creation.

plt.plot(np.random.randn(50).cumsum(), color='black', ls='--')

 

matplotlib includes a convenience method, plt.subplots, that creates a new figure and returns a NumPy array containing the created subplot objects. The axes array can be indexed like a two-dimensional array; for example, axes[0, 1].
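A minimal sketch of plt.subplots and of indexing the returned axes array; the data is arbitrary:

import numpy as np
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 3)                             # a Figure plus a 2x3 array of Axes
axes[0, 1].plot(np.random.randn(50).cumsum(), color='k')   # row 0, column 1
axes[1, 2].hist(np.random.randn(100), bins=10, color='k', alpha=0.5)
plt.show()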

Adjusting the spacing around subplots

plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)

 

# 1

fig, axes = plt.subplots(2,2, sharex=True, sharey= True)

for i in range(2):

    for j in range(2):

        axes[i,j].hist(np.random.randn(500), bins =5, color='k', alpha=0.5) #hist: frequency

plt.subplots_adjust(wspace=0.05, hspace=0.05)

# 2

from numpy.random import randn

arr=randn(30)

arrCumSum=arr.cumsum()

plt.plot(arrCumSum, color='k', linestyle='dashed', drawstyle='steps-post', label='steps-post', marker='o')

plt.legend(loc='best')     #label='steps-post'

plt.show()

Annotations and Drawing on a Subplot

spx.csv file

......

import numpy as np

import pandas as pd

from datetime import datetime

                                       #index_col : int or sequence or False, default None

data = pd.read_csv('../examples/spx.csv',parse_dates=True, index_col=0)

spx = data['SPX']  #'SPX' column  # spx: pandas.core.series.Series

 

crisis_data=[

    (datetime(2007, 10, 11), 'Peak of bull market'),  #tuple

    (datetime(2008,  3, 12), 'Bear Stearns Fails'),

    (datetime(2008,  9, 15), 'Lehman Bankruptcy')

]

# // matplotlib Configuration

plt.rc('figure', figsize=(10,10))

font_options={

    'family': 'monospace',

    'weight': 'bold',

    'size': 16

}

plt.rc('font', **font_options)

 

fig = plt.figure()

ax = fig.add_subplot(1,1,1)

# spx:     date: X-axis          #   SPX: y-axis

spx.plot(ax=ax, color='green', linestyle='-')

for date, label in crisis_data:

    ax.annotate(  label,

                ha='left',

                va='top',

                xytext=(date, spx.asof(date) + 225), #The xytext parameter specifies the text position               

                xy=(date, spx.asof(date) + 75),     #The xy parameter specifies the arrow's destination 

                arrowprops=dict(facecolor='blue', headwidth=10, headlength=4, width=2 ),

                #arrowprops={'facecolor':'blue', 'headwidth':10, 'headlength':4, 'width':2} #OR

               )

#Zoom in on 2007-2010

ax.set_xlim(['1/1/2007', '1/1/2011'])

ax.set_ylim([600,1800])

ax.set_title('Important dates in the 2008-2009 financial crisis')

plt.show()

Adding arrows

The aspect of the arrow is controlled by a dictionary passed to the arrowprops parameter:

'arrowstyle': The values '<-', '<', '-', 'wedge', 'simple', and 'fancy' control the style of the arrow

'facecolor': This is the color used for the arrow. It will be used to set the background and the edge color

'edgecolor': This is the color used for the edges of the arrow's shape

'alpha': This is used to set the transparency level so that the arrow blends with the background

The shrink parameter controls the gap between the arrow's endpoints and the arrow itself.
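A minimal sketch of both forms of arrowprops on arbitrary data. Note that shrink belongs to the simple (width/headwidth) form; when 'arrowstyle' is given, the remaining keys are passed to FancyArrowPatch instead.

import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.plot([0, 1, 2, 3], [0, 1, 4, 9], color='k')

# 'arrowstyle' form: the arrow shape is chosen from styles such as '->', '<-', 'wedge', 'fancy'
ax.annotate('arrowstyle form', xy=(2, 4), xytext=(0.2, 6),
            arrowprops=dict(arrowstyle='->', color='k'))

# simple form: facecolor/edgecolor/alpha style the arrow; shrink leaves a gap at both ends
ax.annotate('simple form', xy=(3, 9), xytext=(1.0, 8),
            arrowprops=dict(facecolor='gray', edgecolor='k', alpha=0.7, shrink=0.05))

plt.show()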

 

import matplotlib.pyplot as plt

fig = plt.figure()

ax = fig.add_subplot(1,1,1)  # returns the Axes of the single subplot

                                      #Left-Bottom corner position

#Width, Height

rect = plt.Rectangle( (0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3 )

                             #center point position

circ = plt.Circle( (0.7, 0.2), radius=0.15, color='b', alpha=0.3 )

polygon = plt.Polygon(    [  [0.15, 0.15],

                                        [0.35, 0.4],

                                       [0.2,0.6]

                       ],

                       color='g',

                       alpha=0.5

                     )

 

ax.add_patch(rect)

ax.add_patch(circ)

ax.add_patch(polygon)

 

plt.show()  #  plt.savefig('figpath.png',dpi=400, bbox_inches='tight')

#dpi, which controls the dots-per-inch resolution

#bbox_inches, which can trim the whitespace around the actual figure.

 

 

from io import BytesIO

buffer = BytesIO()

plt.savefig(buffer)

plot_data = buffer.getvalue()

 

Adding shapes

Internally, a shape is described by a path, called a patch in the matplotlib API. Paths for several kinds of shapes are available in the matplotlib.patches module. As is the case for lines, creating a path is not enough to render it; you have to signal that you want it rendered. This is done with pyplot.gca().add_patch().

A lot of path constructors are available. Let's review those used in the example: 

Circle: This takes the coordinates of its center and the radius as the parameters

Rectangle: This takes the coordinates of its lower-left corner and its size as the parameters 

Ellipse: This takes the coordinates of its center and the full lengths (width and height) of its two axes as the parameters 

FancyBox: This is like a rectangle but takes an additional boxstyle parameter (either 'larrow', 'rarrow', 'round', 'round4', 'roundtooth', 'sawtooth', or 'square')

 

 

import matplotlib.patches as patches

import matplotlib.pyplot as plt

 

#Circle             #center point position

shape = patches.Circle((0,0), radius=1., color='.75')

plt.gca().add_patch(shape)

 

#Rectangle             #Left-Bottom corner position

shape = patches.Rectangle((2.5,-.5), 2.,      1., color='.75')

plt.gca().add_patch(shape)        #width=2, height=1

 

#Ellipse             #center position

shape = patches.Ellipse((0,-2.),  2.,              1.,  angle=45., color='.75')

plt.gca().add_patch(shape)    #width=2 (major axis), height=1 (minor axis), rotated 45 degrees

 

#Fancy box                   #inner-rectangle Left-Bottom

shape = patches.FancyBboxPatch((2.5, -2.5), 2., 1.,  boxstyle='sawtooth', color='.75')

plt.gca().add_patch(shape)

 

#display all

plt.grid(True)

plt.axis('scaled')

plt.show()

Working with polygons

import numpy as np

import matplotlib.patches as patches

import matplotlib.pyplot as plt

 

theta = np.linspace(0,2*np.pi, 9) # 9 samples; the first and last coincide, giving 8 distinct vertices

points = np.vstack((np.cos(theta), np.sin(theta))).transpose()

 

plt.gca().add_patch(patches.Polygon(points, color='.75'))

 

plt.grid(True)

plt.axis('scaled')

plt.show()

Working with path attributes

import numpy as np

import matplotlib.patches as patches

import matplotlib.pyplot as plt

 

theta = np.linspace(0,2*np.pi, 6)# 6 samples; the first and last coincide, giving 5 distinct vertices

points = np.vstack(( np.cos(theta), np.sin(theta) )).transpose()

 

plt.gca().add_patch( plt.Circle( (0,0), radius=1., color='.75' ) )

plt.gca().add_patch( plt.Polygon( points, closed=None, fill=None, lw=3., ls='dashed', edgecolor='k') )

 

plt.grid(True)

plt.axis('scaled')

plt.show()

 

 

 

添加RF约束惩罚 rf_penalty = 0.0 # a) 低氯代物递增约束 (Cl5-Cl8) for sample_rfs in all_sample_rfs: # 检查Cl5-Cl8是否满足递增:Cl5 < Cl6 < Cl7 < Cl8 for m in range(5, 8): # m=5,6,7 rf_current = sample_rfs[m - 1] # Cl(m) rf_next = sample_rfs[m] # Cl(m+1) # 跳过无效值(RF=0表示该同族体不存在) if rf_current <= 1e-6 or rf_next <= 1e-6: continue # 检查递增约束 if rf_current >= rf_next: rf_penalty += 10000 # 重大惩罚 # b) 高氯代物极差约束 (Cl9-Cl12) for m in range(9, 13): # m=9,10,11,12 rf_vals = rf_matrix[m] rf_vals = [rf for rf in rf_vals if rf > 1e-6] # 过滤无效值 if len(rf_vals) < 2: # 至少需要2个有效值 continue max_rf = max(rf_vals) min_rf = min(rf_vals) ratio = max_rf / min_rf # 检查极差约束(最大/最小 ≤ 10) if ratio > 10.0: rf_penalty += 3000 * (ratio - 10.0) # 按超出比例惩罚 return rsd_penalty + phys_penalty + rf_penalty def get_initial_params(self): """基于RF一致性生成初始参数""" rf_matrix = {m: [] for m in range(1, 13)} # 使用自然μ计算初始RF for _, sample in self.samples.iterrows(): _, _, _, rfs = self.calculate_congener_distribution( sample['Cls'], 1.0, # 初始σ=1.0 sample['Total_Molar_nmol'], sample ) for m, rf in enumerate(rfs, start=1): if rf > 1e-6: rf_matrix[m].append(rf) # 寻找RF最稳定的μ作为初始点 best_mu, best_rsd = None, float('inf') for test_mu in np.linspace(3, 9, 7): # 测试3-9之间的μ total_rsd = 0 valid = 0 for m in range(1, 13): if rf_matrix[m]: # 计算调整因子 adj_factors = [norm.pdf(m, test_mu, 1.0) / max(norm.pdf(m, sample['Cls'], 1.0), 1e-6) for sample in self.samples] adj_rf = [rf * factor for rf, factor in zip(rf_matrix[m], adj_factors)] # 计算几何RSD log_rf = np.log(adj_rf) log_std = np.std(log_rf) total_rsd += np.sqrt(np.exp(log_std ** 2) - 1) valid += 1 avg_rsd = total_rsd / valid if valid else 1e6 if avg_rsd < best_rsd: best_rsd, best_mu = avg_rsd, test_mu # 生成初始参数 initial_params = [] for _ in self.samples.iterrows(): initial_params.extend([best_mu, 1.0]) # 统一使用最优μ return initial_params # ====================== 主优化函数 ====================== def optimize(self): """执行优化过程""" self.logger.info(f"Starting optimization for C{self.carbon_num}") start_time = datetime.now() try: # ===== 第一阶段:差分进化 ===== self.logger.info(f"Phase 1: Differential Evolution for C{self.carbon_num}") # 定义第一阶段边界 phase1_bounds = [] for _, sample in self.samples.iterrows(): w = sample['Cls'] phase1_bounds.extend([ (max(1.0, w - 1.5), min(12.0, w + 1.5)), # μ边界 (0.5, 3.0) # σ边界 ]) # 动态设置优化参数 n_samples = len(self.samples) chain_factor = 1.0 + 0.04 * abs(self.carbon_num - 10) cl_discrete = np.std(self.samples['Chlorine Content']) maxiter = int(300 * chain_factor + 100 * n_samples * (1 + cl_discrete / 10)) popsize = int(15 * chain_factor + 8 * n_samples * (1 + cl_discrete / 20)) convergence_thresh = 1e-3 if n_samples <= 3 else 1e-4 * np.log10(n_samples) # 早停机制历史记录 self.best_history = [] # 执行差分进化 res1 = differential_evolution( self._phase1_obj, phase1_bounds, args=(self.samples,), maxiter=maxiter, popsize=popsize, tol=convergence_thresh, init='latinhypercube', callback=self._early_stopping, workers=1, updating='immediate', polish=True, # 确保最后进行局部优化 strategy='best1bin', # 更有效的策略 mutation=(0.5, 1.0), # 动态变异率 recombination=0.9 # 较高的重组率 ) # 添加重启机制 if res1.fun > 1.0 and res1.nit < maxiter // 2: # 结果不理想且提前停止 self.logger.warning(f"C{self.carbon_num} phase-1 restarted (previous best: {res1.fun:.2e})") # 使用更大种群重启 res1 = differential_evolution( self._phase1_obj, phase1_bounds, args=(self.samples,), maxiter=maxiter, popsize=int(popsize * 1.5), # 增加种群规模 tol=convergence_thresh, init='latinhypercube', polish=True, mutation=(0.7, 1.3), # 更大的变异范围 recombination=0.85 ) # 记录结果 stop_reason = "converged" if res1.success else f"maxiter reached 
({res1.nit}/{maxiter})" self.logger.warning(f"C{self.carbon_num} phase-1 stopped: {stop_reason},Best COD: {res1.fun:.4e}") best_total = res1.fun x0 = res1.x # ===== 第二阶段:L-BFGS-B ===== self.logger.info(f"Phase 2: L-BFGS-B Optimization for C{self.carbon_num}") # 定义更严格的边界 phase2_bounds = [] for _, sample in self.samples.iterrows(): w = sample['Cls'] phase2_bounds.extend([ (max(1.0, w - 0.5), min(12.0, w + 0.5)), # μ严格边界 (0.5, 3.0) # σ边界 ]) # 确保初始值在严格边界内 x0_strict = [] for j, (low, high) in enumerate(phase2_bounds): x0_strict.append(np.clip(x0[j], low, high)) # 重置历史记录 self.best_history = [] # 执行L-BFGS-B优化 - 修正参数传递 res2 = minimize( self._phase2_obj, x0_strict, args=(self.samples,), # 只传递一个参数 method='L-BFGS-B', bounds=phase2_bounds, options={ 'maxiter': 1000, # 增加迭代次数 'ftol': 1e-10, # 更严格的收敛阈值 } ) # 记录结果 stop_reason = ("converged" if res2.success else f"maxiter reached ({res2.nit}/300)" if res2.nit >= 300 else "early stopped") self.logger.info(f"C{self.carbon_num} phase-2 stopped: {stop_reason},Final COD: {res2.fun:.4e}") # 检查优化结果是否有效 if not np.all(np.isfinite(res2.x)): self.logger.error(f"Optimization for C{self.carbon_num} produced invalid parameters") return False # ===== 处理结果 ===== opt_params = res2.x results = [] # 计算每个样品的结果 for i, (_, sample) in enumerate(self.samples.iterrows()): mu = opt_params[i * 2] sigma = max(0.5, opt_params[i * 2 + 1]) # σ下界保护 result = self.calculate_sample_result(sample, mu, sigma) if result is not None: results.append(result) # 检查是否有有效结果 if not results: self.logger.error(f"No valid results for C{self.carbon_num}") return False # 保存结果 self.result_df = pd.DataFrame(results) self.calculate_rf_stats() self.save_results() self.generate_plots() duration = (datetime.now() - start_time).total_seconds() self.logger.info(f"Optimization for C{self.carbon_num} completed in {duration:.2f} seconds") return True except Exception as e: self.logger.error(f"Error optimizing C{self.carbon_num}: {e}") self.logger.error(traceback.format_exc()) return False # ====================== 优化辅助方法 ====================== def _early_stopping(self, xk, convergence): """稳健的差分进化早停回调函数""" current_obj = self._phase1_obj(xk, self.samples) self.best_history.append(current_obj) n_iter = len(self.best_history) min_iter = max(50, len(self.samples) * 5) # 基于样本数的动态最小迭代次数 # 1. 强制最小迭代次数 if n_iter < min_iter: return False # 2. 目标函数值接近零时直接停止 if current_obj < 1e-6: # 足够小的目标值 self.logger.debug(f"C{self.carbon_num} early stop: target reached ({current_obj:.2e})") return True # 3. 改进量检测 window = min(20, n_iter // 3) # 更大的窗口 if n_iter > window: recent_values = self.best_history[-window:] improvements = np.abs(np.diff(recent_values)) avg_improve = np.mean(improvements) # 3.1 基于绝对值和相对值的复合条件 if avg_improve < max(1e-5, current_obj * 0.001): # 绝对阈值或相对0.1% self.logger.debug( f"C{self.carbon_num} early stop: avg_improve={avg_improve:.2e} (current={current_obj:.2e})") return True # 4. 
平台期检测(更稳健的计算) plateau_window = min(25, n_iter // 4) if n_iter > plateau_window: plateau_values = self.best_history[-plateau_window:] start_val = plateau_values[0] end_val = plateau_values[-1] # 避免除以零错误 if abs(start_val) > 1e-10: relative_improve = abs(start_val - end_val) / abs(start_val) else: relative_improve = 0.0 absolute_improve = abs(start_val - end_val) # 复合平台检测条件 if (relative_improve < 0.001) and (absolute_improve < 0.1 * current_obj): self.logger.debug(f"C{self.carbon_num} early stop: plateau detected " f"(rel={relative_improve:.2%}, abs={absolute_improve:.2e}, current={current_obj:.2e})") return True return False def _lbfgsb_callback(self, xk): """L-BFGS-B早停回调函数""" current_obj = self._phase2_obj(xk, self.samples, self.best_history[0] if self.best_history else 1e9) self.best_history.append(current_obj) # 检查最近3次迭代改进 if len(self.best_history) > 3: recent_improve = abs(np.diff(self.best_history[-3:])).mean() if recent_improve < 5e-5: # 更严格的收敛阈值 # 通过抛出特殊异常停止优化 raise StopIteration("Convergence achieved") def calculate_sample_result(self, sample, mu_opt, sigma_opt): """计算单个样品结果""" try: total_molar = sample['Total_Molar_nmol'] # 计算同系物分布 ratios, molars, masses, rfs = self.calculate_congener_distribution( mu_opt, max(sigma_opt, 0.3), total_molar, sample ) # 检查计算结果是否有效 if not all(np.isfinite(ratios)) or not all(np.isfinite(molars)) or not all(np.isfinite(masses)): self.logger.warning(f"Invalid calculation results for sample {sample['StanID']}") return None # 存储RF值 for m, rf in enumerate(rfs, start=1): if rf > 0: self.raw_rf_matrix[m].append(rf) self.rf_matrix[m].append(rf) # 计算残差 original_mass = sample['Original Mass_ng'] calculated_mass = sum(masses) rsr = ((total_molar - sum(molars)) ** 2) / total_molar if total_molar > 0 else 0 rsm = ((original_mass - calculated_mass) ** 2) / original_mass if original_mass > 0 else 0 # 构建结果字典 result_data = { 'StanID': sample['StanID'], 'Carbon_Number': self.carbon_num, 'Chlorine_Content': sample['Chlorine Content'], 'Total_Molar_nmol': total_molar, 'Total_Mass_ng': original_mass, 'Cls': sample['Cls'], 'mu_natural': sample['Cls'], 'mu_optimized': mu_opt, 'mu_dev': abs(mu_opt - sample['Cls']), 'sigma': sigma_opt, 'RSR': rsr, 'RSM': rsm } # 添加各氯代数据 for m in range(1, 13): result_data[f'Ratio_Cl{m}'] = ratios[m - 1] if len(ratios) > m - 1 else 0.0 result_data[f'Molar_Cl{m}'] = molars[m - 1] if len(molars) > m - 1 else 0.0 result_data[f'Mass_Cl{m}'] = masses[m - 1] if len(masses) > m - 1 else 0.0 result_data[f'RF_Cl{m}'] = rfs[m - 1] if len(rfs) > m - 1 else 0.0 norm_col = f'Norm_Cl{m}' result_data[norm_col] = sample[norm_col] if norm_col in sample else 0.0 return result_data except Exception as e: self.logger.error(f"Error calculating sample result for {sample['StanID']}: {e}") return None def calculate_congener_distribution(self, mu, sigma, total_molar, sample): """计算同系物分布和RF值""" try: n = self.carbon_num m_values = np.arange(1, 13) # 计算正态分布PDF pdf_values = norm.pdf(m_values, mu, max(sigma, 0.3)) # σ下界保护调整为0.5 total_pdf = np.sum(pdf_values) if total_pdf > 0: ratios = pdf_values / total_pdf else: ratios = np.zeros(12) # 检查比率是否有效 if not all(np.isfinite(ratios)): self.logger.warning(f"Invalid PDF ratios for mu={mu}, sigma={sigma}") ratios = np.zeros(12) # 计算摩尔量、质量和RF molars, masses, rfs = [], [], [] is_molar = (sample['Internal Standard Mass_ng'] * 1e-9) / sample['Internal Standard MW_g/mol'] * 1e9 for m, ratio in zip(range(1, 13), ratios): # 摩尔量 molar = total_molar * ratio mw = CPOptimizer.calculate_molar_mass(n, m) mass = molar * mw molars.append(molar) 
masses.append(mass) # 响应因子(RF) cl_col = f'Cl{m}' peak_area = sample[cl_col] if cl_col in sample else 0 if molar > 1e-10 and peak_area > 0: denominator = max(sample['Internal Standard'] * molar, 1e-10) # 防止除以0 rf = (peak_area * is_molar) / denominator else: rf = 0.0 rfs.append(rf) return ratios, molars, masses, rfs except Exception as e: self.logger.error(f"Error calculating congener distribution: {e}") return np.zeros(12), np.zeros(12), np.zeros(12), np.zeros(12) # ====================== RF统计方法 ====================== def calculate_rf_stats(self): """使用优化后的参数重新计算RF,避免中间状态污染""" self.rf_values = {} rf_matrix = {m: [] for m in range(1, 13)} # 检查是否有有效的结果数据 if self.result_df is None or self.result_df.empty: self.logger.warning(f"No result data available for C{self.carbon_num}") return # 使用最终参数重新计算RF for _, row in self.result_df.iterrows(): # 查找对应的原始样品数据 sample_match = self.samples[self.samples['StanID'] == row['StanID']] if sample_match.empty: self.logger.warning(f"No matching sample found for StanID: {row['StanID']}") continue sample = sample_match.iloc[0] # 使用优化后的参数计算RF _, _, _, rfs = self.calculate_congener_distribution( row['mu_optimized'], row['sigma'], row['Total_Molar_nmol'], sample ) for m, rf in enumerate(rfs, start=1): if rf > 1e-6: rf_matrix[m].append(rf) # 计算几何平均和RSD for m in range(1, 13): rf_vals = rf_matrix[m] if len(rf_vals) > 0: log_vals = np.log(rf_vals) geo_mean = np.exp(np.mean(log_vals)) log_std = np.std(log_vals) geo_rsd = np.sqrt(np.exp(log_std ** 2) - 1) else: geo_mean, geo_rsd = 0.0, np.nan self.rf_values[f'RF_geo_mean_{m}'] = geo_mean self.rf_values[f'RF_geo_rsd_{m}'] = geo_rsd * 100 # 转换为百分比 def _calc_robust_rsd(self, rf_vals): """计算基于MAD的稳健RSD""" if len(rf_vals) < self.cfg['min_samples_for_rsd']: return np.nan # 样本不足返回NaN # 稳健RSD (MAD-based) median = np.median(rf_vals) mad = stats.median_abs_deviation(rf_vals, scale='normal') return mad / median if median > 0 else np.nan # ====================== 结果处理 ====================== def save_results(self): """保存结果到CSV""" if self.result_df is not None: result_file = os.path.join(self.chain_path, f'C{self.carbon_num}_Results.csv') self.result_df.to_csv(result_file, index=False) self.logger.info(f"Results for C{self.carbon_num} saved to {result_file}") def generate_plots(self): """生成所有可视化图表""" if self.result_df is None or self.result_df.empty: return # 生成各样品拟合曲线 for _, sample in self.samples.iterrows(): sample_id = sample['StanID'] sample_results = self.result_df[self.result_df['StanID'] == sample_id] if sample_results.empty: continue result = sample_results.iloc[0] self.plot_fit_curve(result, sample['Chlorine Content']) # 生成RF聚类图 self.plot_rf_cluster() # 生成偏差图 self.plot_deviation() # 生成RSD热力图 self.plot_rsd_heatmap() def plot_fit_curve(self, result, chlorine_content): """绘制拟合曲线图""" try: plt.figure(figsize=(10, 6)) # 获取优化参数 mu_opt = result['mu_optimized'] sigma = max(0.3, result['sigma']) mu_natural = result['mu_natural'] total_molar = result['Total_Molar_nmol'] n = self.carbon_num # 计算理论分布 m_values = np.arange(1, 13) dense_m = np.linspace(1, 12, 200) pdf_values = norm.pdf(m_values, mu_opt, sigma) total_pdf = pdf_values.sum() if total_pdf > 0: pdf_values = pdf_values / total_pdf dense_pdf = norm.pdf(dense_m, mu_opt, sigma) if total_pdf > 0: dense_pdf = dense_pdf / total_pdf # 实际分布 actual_molars = [result[f'Molar_Cl{m}'] for m in range(1, 13)] actual_ratios = [m / total_molar for m in actual_molars] if total_molar > 0 else [0] * 12 # 绘制曲线 plt.plot(dense_m, dense_pdf, 'r-', linewidth=2, label=f'Fit: μ={mu_opt:.2f}, σ={sigma:.2f}') 
plt.plot(m_values, pdf_values, 'rx', markersize=10, label='Predicted Ratios') plt.plot(m_values, actual_ratios, 'bo', markersize=8, label='Actual Ratios') # 添加双x轴(氯含量) ax1 = plt.gca() ax2 = ax1.twiny() ax2.set_xlim(ax1.get_xlim()) cl_contents = [(ATOMIC_MASS['Cl'] * m) / CPOptimizer.calculate_molar_mass(n, m) * 100 for m in m_values] ax2.set_xticks(m_values) ax2.set_xticklabels([f'{c:.1f}%' for c in cl_contents], fontsize=8, color='#888888') ax2.set_xlabel('Chlorine Content', fontsize=10, color='#888888') # 添加参考线 plt.axvline(mu_natural, color='#888888', linestyle='--', label=f'Natural μ: {mu_natural:.2f}') plt.axvline(mu_opt, color='red', linestyle='--', label=f'Optimized μ: {mu_opt:.2f}') # 图表装饰 plt.title(f'C{self.carbon_num} - {chlorine_content}% Chlorine', fontsize=14) plt.xlabel('Chlorine Substitution (m)', fontsize=12) plt.ylabel('Normalized Distribution', fontsize=12) plt.xticks(range(1, 13)) plt.ylim(bottom=-0.05) plt.grid(True, linestyle='--', alpha=0.7) plt.legend(loc='upper right') # 保存图表 plot_file = os.path.join(self.chain_path, f'Fit_Curve_C{self.carbon_num}_{chlorine_content}.png') plt.tight_layout() plt.savefig(plot_file, dpi=300) plt.close() self.logger.info(f"Fit curve plot saved: {plot_file}") except Exception as e: self.logger.error(f"Error generating fit curve plot: {str(e)}") def plot_rf_cluster(self): """绘制RF聚类图""" try: plt.figure(figsize=(12, 8)) rf_data = [] # 收集RF数据 for _, row in self.result_df.iterrows(): sample_id = row['StanID'] chlorine_content = row['Chlorine_Content'] rfs = [row[f'RF_Cl{m}'] for m in range(1, 13)] rf_data.append({'Sample': f'{sample_id} ({chlorine_content}%)', 'RF': rfs}) # 创建DataFrame df_rf = pd.DataFrame({ 'Chlorine Substitution': np.tile(range(1, 13), len(rf_data)), 'RF': np.concatenate([d['RF'] for d in rf_data]), 'Sample': np.repeat([d['Sample'] for d in rf_data], 12) }) # 绘制条形图 sns.barplot(x='Chlorine Substitution', y='RF', hue='Sample', data=df_rf, palette='viridis', errorbar=None) # 图表装饰 plt.title(f'Response Factor (RF) Cluster - C{self.carbon_num}', fontsize=16) plt.xlabel('Chlorine Substitution', fontsize=12) plt.ylabel('Response Factor (RF)', fontsize=12) plt.yscale('log') #plt.yscale('linear') #plt.ylim(0, 3) #plt.ylim(1e-3, 1e4) plt.legend(title='Sample', loc='upper right', fontsize=10) plt.grid(True, linestyle='--', alpha=0.5, which='both') # 保存图表 plot_file = os.path.join(self.chain_path, f'RF_Cluster_C{self.carbon_num}.png') plt.tight_layout() plt.savefig(plot_file, dpi=300) plt.close() self.logger.info(f"RF cluster plot saved: {plot_file}") except Exception as e: self.logger.error(f"Error generating RF cluster plot: {str(e)}") def plot_deviation(self): """绘制偏差图""" try: fig, ax = plt.subplots(figsize=(10, 6)) # 准备数据 rsr_values = self.result_df['RSR'].values rsm_values = self.result_df['RSM'].values sample_ids = [f"{row['StanID']} ({row['Chlorine_Content']}%)" for _, row in self.result_df.iterrows()] # 绘制柱状图 x = np.arange(len(sample_ids)) width = 0.35 rsr_bars = ax.bar(x - width / 2, rsr_values, width, label='RSR (Molar)', color='skyblue', alpha=0.8) rsm_bars = ax.bar(x + width / 2, rsm_values, width, label='RSM (Mass)', color='salmon', alpha=0.8) # 添加数值标签 for bar in rsr_bars: height = bar.get_height() ax.annotate(f'{height:.2e}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8) for bar in rsm_bars: height = bar.get_height() ax.annotate(f'{height:.2e}', xy=(bar.get_x() + bar.get_width() / 2, height), xytext=(0, 3), textcoords="offset points", 
ha='center', va='bottom', fontsize=8) # 图表装饰 ax.set_xlabel('Sample', fontsize=12) ax.set_ylabel('Residual Value', fontsize=12) ax.set_title(f'Calculation Deviation - C{self.carbon_num}', fontsize=14) ax.set_xticks(x) ax.set_xticklabels(sample_ids, rotation=45, ha='right', fontsize=10) ax.legend(loc='upper right') ax.grid(True, linestyle='--', alpha=0.3, axis='y') # 保存图表 plot_file = os.path.join(self.chain_path, f'Deviation_C{self.carbon_num}.png') plt.tight_layout() plt.savefig(plot_file, dpi=300) plt.close() self.logger.info(f"Deviation plot saved: {plot_file}") except Exception as e: self.logger.error(f"Error generating deviation plot: {str(e)}") def plot_rsd_heatmap(self): """单行热力图:显示优化后的几何RSD""" try: # 1. 准备数据(使用几何RSD) rsd_row = [] for m in range(1, 13): # 直接从RF值中获取几何RSD rsd = self.rf_values.get(f'RF_geo_rsd_{m}', np.nan) rsd_row.append(rsd) # 2. DataFrame 形式:一行、列名 Cl1…Cl12 heatmap_df = pd.DataFrame([rsd_row], columns=[f'Cl{i}' for i in range(1, 13)]) # 3. 绘图 plt.figure(figsize=(12, 1.8)) sns.heatmap( heatmap_df, annot=True, fmt='.1f', cmap='RdYlGn_r', linewidths=0.5, linecolor='#888888', vmin=0, vmax=20, cbar_kws={'label': 'RSD (%)'}, annot_kws={'size': 10} ) # 4. 装饰 plt.title(f'Optimized RF RSD - C{self.carbon_num}', fontsize=13) plt.xlabel('Congener', fontsize=11) plt.ylabel('') # 去掉纵轴标签 plt.yticks([]) # 去掉纵轴刻度 plt.xticks(rotation=0) # 5. 保存 plot_file = os.path.join(self.chain_path, f'Optimized_RSD_Row_Heatmap_C{self.carbon_num}.png') plt.tight_layout() plt.savefig(plot_file, dpi=300) plt.close() self.logger.info(f"Row-wise RSD heatmap saved: {plot_file}") except Exception as e: self.logger.error(f"Error generating row RSD heatmap: {str(e)}\n{traceback.format_exc()}") # ====================== 接口方法 ====================== def get_final_report(self): """获取最终报告""" return self.result_df.copy() if self.result_df is not None else None def get_rf_values(self): """获取RF值 - 确保始终返回有效字典""" if self.rf_values is None or len(self.rf_values) == 0: # 如果没有RF值,尝试重新计算 self.logger.warning(f"No RF values found for C{self.carbon_num}, attempting to recalculate...") self.calculate_rf_stats() # 如果仍然没有RF值,返回一个包含默认值的字典 if self.rf_values is None or len(self.rf_values) == 0: self.logger.error(f"Failed to calculate RF values for C{self.carbon_num}, returning default values") default_rf = {} for m in range(1, 13): default_rf[f'RF_geo_mean_{m}'] = 1.0 default_rf[f'RF_geo_rsd_{m}'] = 100.0 # 高RSD表示不可靠 return default_rf return self.rf_values # ====================== 主函数 ====================== def main(): """主执行函数""" # 初始化优化器 optimizer = CPOptimizer( std_file='Standard_Sample.csv', unknown_file='Unknown_Samples.csv', result_path='result' ) # 执行优化流程 if optimizer.load_data(): if optimizer.preprocess_standard_samples(): optimizer.run_optimization() optimizer.logger.info("Optimization completed successfully") else: optimizer.logger.error("Standard sample preprocessing failed") else: optimizer.logger.error("Data loading failed") if __name__ == "__main__": main()