Distances to Zero (an application of lower_bound and upper_bound)

Shortest distance to the nearest zero element
This post describes an efficient way to find, for each element of an array, the distance to the nearest zero: store the positions of the zero elements in advance, then use lower_bound and upper_bound to quickly locate the zeros on either side of each non-zero element and take the smaller of the two distances.
B. Distances to Zero
time limit per test: 2 seconds
memory limit per test: 256 megabytes
input: standard input
output: standard output

You are given an array of integers a_0, a_1, ..., a_{n-1}. For each element find the distance to the nearest zero (to the element which equals zero). There is at least one zero element in the given array.

Input

The first line contains an integer n (1 ≤ n ≤ 2·10^5) — the length of the array a. The second line contains the integer elements of the array separated by single spaces (−10^9 ≤ a_i ≤ 10^9).

Output

Print the sequence d_0, d_1, ..., d_{n-1}, where d_i is the distance (difference of indices) between i and the nearest j such that a_j = 0. It is possible that i = j.

Examples
Input
9
2 1 0 3 0 0 3 2 4
Output
2 1 0 1 0 0 1 2 3 
Input
5
0 1 2 3 4
Output
0 1 2 3 4 
Input
7
5 6 0 1 -2 3 4
Output
2 1 0 1 2 3 4 
Problem: Straightforward to understand: for every non-zero element, find the shortest distance to a zero in the array. There may be several zeros; take the nearest one, compute the distance, and print all the distances at the end.
Approach: Seeing the 2000 ms time limit, my first idea was plain brute force: for each non-zero element, scan left and right until a zero is found. That timed out. (Apparently some brute-force solutions did pass; mine was just the most naive version.) A far more efficient method, explained by a senior classmate, is to store the positions of the zeros and the positions of the non-zero elements, then for each non-zero position binary-search the zero-position array with lower_bound and upper_bound to find the zero on its left and the zero on its right, subtract, and keep the smaller distance. A small illustration of that lookup follows, then the full code.
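Before the full solution, here is a minimal standalone illustration (not part of the original code) of what the binary searches return on the zero-position vector, using the zeros of the first sample (indices 2, 4 and 5) plus the two sentinels:

#include <algorithm>
#include <cstdio>
#include <vector>
using namespace std;

int main(){
    // zero positions in the first sample "2 1 0 3 0 0 3 2 4" are 2, 4 and 5;
    // the sentinels guarantee that every query finds a neighbour on both sides
    vector<int> zero = {-1000000000, 2, 4, 5, 1000000000};
    int idx = 3; // a non-zero position to query
    // idx itself is never stored, so lower_bound lands on the nearest zero to the
    // right, and the element just before it is the nearest zero to the left
    vector<int>::iterator it = lower_bound(zero.begin(), zero.end(), idx);
    int left = *(it - 1), right = *it;
    printf("left %d, right %d, distance %d\n", left, right, min(idx - left, right - idx));
    return 0;
}

Running it prints "left 2, right 4, distance 1", which is exactly d_3 in the first sample.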
The code is below:
#include <iostream>
#include <cstdio>
#include <cstring>
#include <algorithm>
#include <vector>
using namespace std;
#define INF 0x3f3f3f3f
int n;
int a[200100];
int dis[200100];
vector<int>zero,number;
int main(){
    int i;
    zero.push_back(-INF);//put a very small sentinel at the front of the zero-position array
    scanf("%d",&n);
    for(i = 0; i < n; i++){
        scanf("%d",&a[i]);
        if(a[i]==0){
            dis[i] = 0;//a zero is at distance 0 from itself, store it directly
            zero.push_back(i);//record the positions of the zeros in order
        }
        else number.push_back(i);//record the positions of the non-zero elements in another array
    }
    zero.push_back(INF);//put a very large sentinel at the end of the zero-position array
    vector<int>::iterator it;
    for(i = 0,it = number.begin(); it != number.end(); it++,i++){//iterate over the non-zero positions
        int pos1,pos2;
        pos1 = *(lower_bound(zero.begin(),zero.end(),number[i])-1);//lower_bound finds the first zero position >= number[i]; step back one to get the zero on the left
        pos2 = *upper_bound(zero.begin(),zero.end(),number[i]);//upper_bound finds the first zero position > number[i], i.e. the zero on the right
        int dis1,dis2;                                         //both functions return iterators, so dereference them to get the zero's index in the original array
        dis1 = number[i]-pos1;
        dis2 = pos2-number[i];//distances to the zero on the left and the zero on the right
        dis[number[i]] = min(dis1,dis2);//take the minimum
    }
    for(i = 0; i < n; i++){//output
        if(i==0)printf("%d",dis[i]);
        else printf(" %d",dis[i]);
    }
    puts("");
    return 0;
}
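One small design observation: a non-zero index never equals any stored zero position, so lower_bound and upper_bound return the same iterator here and a single binary search would suffice. A minimal sketch of that variant, with an assumed helper name (not from the original post):

#include <algorithm>
#include <vector>
using namespace std;

// Hypothetical helper: zeros must be sorted and padded with the -INF/INF sentinels.
// Because idx is never stored in zeros, lower_bound gives the zero to the right,
// and the element just before it is the zero to the left.
int nearestZeroDistance(const vector<int>& zeros, int idx){
    vector<int>::const_iterator it = lower_bound(zeros.begin(), zeros.end(), idx);
    return min(idx - *(it - 1), *it - idx);
}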

