DataFrame's to_csv and from_csv keep the stored data unchanged


When doing big-data analysis, a DataFrame written with to_csv and read back with from_csv comes back unchanged, whereas writing with to_csv and reading with read_csv adds one extra index column on every save/load cycle. So from_csv is the safer choice here.

Using from_csv:

a = [("bababa",6.9),("cacaca",5),("aaaaaa",3)]
b = DataFrame(a)

b.to_csv('D:\python\\test1\\result\\init_result.csv')
print b
c = DataFrame.from_csv('D:\python\\test1\\result\\init_result.csv')
print c

c.to_csv('D:\python\\test1\\result\\init_result.csv')
d = DataFrame.from_csv('D:\python\\test1\\result\\init_result.csv')
print d

d.to_csv('D:\python\\test1\\result\\init_result.csv')
e = DataFrame.from_csv('D:\python\\test1\\result\\init_result.csv')
print e
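The reason from_csv round-trips cleanly lies in its defaults: unlike read_csv, it treats the first column of the file as the index (index_col=0, with parse_dates=True), so the index column that to_csv wrote is restored as the index rather than becoming a data column. A minimal sketch of the equivalence, assuming a pandas version old enough to still have from_csv:

import pandas
from pandas import DataFrame

# On pandas < 1.0 these two reads produce the same DataFrame:
c1 = DataFrame.from_csv(r'D:\python\test1\result\init_result.csv')
c2 = pandas.read_csv(r'D:\python\test1\result\init_result.csv',
                     index_col=0, parse_dates=True)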

Using read_csv:
a = [("bababa",6.9),("cacaca",5),("aaaaaa",3)]
b = DataFrame(a)

b.to_csv('D:\python\\test1\\result\\init_result.csv')
print b
c = pandas.read_csv('D:\python\\test1\\result\\init_result.csv')
print c

c.to_csv('D:\python\\test1\\result\\init_result.csv')
d = pandas.read_csv('D:\python\\test1\\result\\init_result.csv')
print d

d.to_csv('D:\python\\test1\\result\\init_result.csv')
e = pandas.read_csv('D:\python\\test1\\result\\init_result.csv')
print e
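Note that DataFrame.from_csv was deprecated in pandas 0.21 and removed in pandas 1.0, so on a current pandas the stable round trip has to be expressed with read_csv itself. A minimal sketch of the two usual options (the column names here are illustrative, not from the original example):

import pandas as pd

b = pd.DataFrame([("bababa", 6.9), ("cacaca", 5), ("aaaaaa", 3)],
                 columns=["name", "score"])

# Option 1: keep writing the index, and tell read_csv to use it on the way back.
b.to_csv(r'D:\python\test1\result\init_result.csv')
c = pd.read_csv(r'D:\python\test1\result\init_result.csv', index_col=0)

# Option 2: do not write the index at all.
b.to_csv(r'D:\python\test1\result\init_result.csv', index=False)
d = pd.read_csv(r'D:\python\test1\result\init_result.csv')

print(c.equals(b), d.equals(b))  # both round trips preserve the data

Either way, no stray index column accumulates across repeated saves and loads.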


