【The Orange Box】卡掉unordered_map

Ubuntu环境下unordered_map性能优化与防hack策略
本文在Ubuntu 18.04.1 LTS环境下,针对Intel i5-4590处理器的unordered_map插入性能问题进行了测试。首次插入耗时较长,但后续插入有所改善。为防止性能被hack,建议手动编写hash函数,提高效率并增强安全性。经测试,该方法有效。

测试环境:
Ubuntu 18.04.1 LTS
Intel® Core™ i5-4590 CPU @ 3.30GHz × 4

使用如下代码:

#include <bits/stdc++.h>
using namespace std;

#define MAXN 100000

void insert_numbers(long long x)
{
    clock_t timestart=clock();
    unordered_map<long long,int> numbers;
    for(int i=1;i<=MAXN;i++)
        numbers[i*x]=i;
    long long sum=0;
    for(auto element:numbers)
        sum+=(element.first/x)*element.second;
    printf("x = %lld : %.3lfseconds, sum = %lld\n",x,(double)(clock()-timestart)/CLOCKS_PER_SEC,sum);
}

int main()
{
    insert_numbers(107897);
    insert_numbers(126271);
}

编译参数:

g++ hack.cpp -o hack

运行结果:
hacked

可以看到,第一次插入需要大量时间(第二次插入供对比)。

如何防hack?

将unordered_map的hash方法改为手写的hash方法。

这里提供一种参考:

#include <bits/stdc++.h>
using namespace std;

#define MAXN 100000

struct custom_hash
{
    unsigned long long operator()(unsigned long long x)const
    {
        x+=0x9e3779b97f4a7c15;
        x=(x^(x>>30))*0xbf58476d1ce4e5b9;
        x=(x^(x>>27))*0x94d049bb133111eb;
        return x^(x>>31);
    }
};

void insert_numbers(long long x)
{
    clock_t timestart=clock();
    unordered_map<long long,int,custom_hash> numbers;
    for(int i=1;i<=MAXN;i++)
        numbers[i*x]=i;
    long long sum=0;
    for(auto element:numbers)
        sum+=(element.first/x)*element.second;
    printf("x = %lld : %.3lfseconds, sum = %lld\n",x,(double)(clock()-timestart)/CLOCKS_PER_SEC,sum);
}

int main()
{
    insert_numbers(107897);
    insert_numbers(126271);
}

编译参数:

g++ hack.cpp -o hack

运行结果:
accepted

防hack成功!

import numpy as np import matplotlib.pyplot as plt from scipy.integrate import cumtrapz from scipy.ndimage import gaussian_filter1d from scipy.spatial import cKDTree from tqdm import tqdm import multiprocessing from functools import partial import time import os import argparse import warnings from collections import defaultdict import dill import re from pymatgen.core import Structure, Lattice # 忽略可能的警告 warnings.filterwarnings("ignore", category=UserWarning) # 专业绘图设置 - 符合Journal of Chemical Physics要求 plt.style.use('seaborn-v0_8-whitegrid') plt.rcParams.update({ 'font.family': 'serif', 'font.serif': ['Times New Roman', 'DejaVu Serif'], 'font.size': 12, 'axes.labelsize': 14, 'axes.titlesize': 16, 'xtick.labelsize': 12, 'ytick.labelsize': 12, 'figure.dpi': 600, 'savefig.dpi': 600, 'figure.figsize': (8, 6), 'lines.linewidth': 2.0, 'legend.fontsize': 10, 'legend.framealpha': 0.8, 'mathtext.default': 'regular', 'axes.linewidth': 1.5, 'xtick.major.width': 1.5, 'ytick.major.width': 1.5, 'xtick.major.size': 5, 'ytick.major.size': 5, }) # 原子类型识别函数 def identify_atom_types(struct): """优化的原子类型识别函数,不再区分P=O和P-O""" # 初始化数据结构 atom_types = { "phosphate_oxygens": {"P-O/P=O": [], "P-OH": []}, "phosphate_hydrogens": [], "water_oxygens": [], "water_hydrogens": [], "hydronium_oxygens": [], "hydronium_hydrogens": [], "fluoride_atoms": [i for i, site in enumerate(struct) if site.species_string == "F"], "aluminum_atoms": [i for i, site in enumerate(struct) if site.species_string == "Al"] } # 构建全局KDTree all_coords = np.array([site.coords for site in struct]) kdtree = cKDTree(all_coords, boxsize=struct.lattice.abc) # 识别磷酸基团 p_atoms = [i for i, site in enumerate(struct) if site.species_string == "P"] phosphate_oxygens = [] for p_idx in p_atoms: # 查找P周围的O原子 (距离 < 1.6Å) neighbors = kdtree.query_ball_point(all_coords[p_idx], r=1.6) p_o_indices = [idx for idx in neighbors if idx != p_idx and struct[idx].species_string == "O"] phosphate_oxygens.extend(p_o_indices) # 识别所有H原子并确定归属 hydrogen_owners = {} h_atoms = [i for i, site in enumerate(struct) if site.species_string == "H"] for h_idx in h_atoms: neighbors = kdtree.query_ball_point(all_coords[h_idx], r=1.2) candidate_os = [idx for idx in neighbors if idx != h_idx and struct[idx].species_string == "O"] if not candidate_os: continue min_dist = float('inf') owner_o = None for o_idx in candidate_os: dist = struct.get_distance(h_idx, o_idx) if dist < min_dist: min_dist = dist owner_o = o_idx hydrogen_owners[h_idx] = owner_o # 分类磷酸氧:带H的为P-OH,不带H的为P-O/P=O for o_idx in phosphate_oxygens: has_hydrogen = any(owner_o == o_idx for h_idx, owner_o in hydrogen_owners.items()) if has_hydrogen: atom_types["phosphate_oxygens"]["P-OH"].append(o_idx) else: atom_types["phosphate_oxygens"]["P-O/P=O"].append(o_idx) # 识别水和水合氢离子 all_o_indices = [i for i, site in enumerate(struct) if site.species_string == "O"] non_phosphate_os = [o_idx for o_idx in all_o_indices if o_idx not in phosphate_oxygens] o_h_count = defaultdict(int) for h_idx, owner_o in hydrogen_owners.items(): o_h_count[owner_o] += 1 for o_idx in non_phosphate_os: h_count = o_h_count.get(o_idx, 0) attached_hs = [h_idx for h_idx, owner_o in hydrogen_owners.items() if owner_o == o_idx] if h_count == 2: atom_types["water_oxygens"].append(o_idx) atom_types["water_hydrogens"].extend(attached_hs) elif h_count == 3: atom_types["hydronium_oxygens"].append(o_idx) atom_types["hydronium_hydrogens"].extend(attached_hs) # 识别磷酸基团的H原子 for o_idx in atom_types["phosphate_oxygens"]["P-OH"]: attached_hs = [h_idx for h_idx, owner_o in hydrogen_owners.items() if owner_o == o_idx] atom_types["phosphate_hydrogens"].extend(attached_hs) return atom_types # 氢键配置 def get_hbond_angle_config(): """返回九类氢键的氢键分析配置""" return [ { "name": "P-O/P=O···Hw", "donor_type": "water_oxygens", # 供体氧 (水中的O) "acceptor_type": "P-O/P=O", # 受体氧 (磷酸中的非羟基氧) "h_type": "water_hydrogens", # 氢原子 (水中的H) "threshold": 2.375, # 氢键距离阈值 (Å) "color": "blue" }, { "name": "P-O/P=O···Hh", "donor_type": "hydronium_oxygens", # 供体氧 (水合氢离子中的O) "acceptor_type": "P-O/P=O", "h_type": "hydronium_hydrogens", # 氢原子 (水合氢离子中的H) "threshold": 2.275, "color": "green" }, { "name": "P-O/P=O···Hp", "donor_type": "P-OH", # 供体氧 (磷酸中的羟基氧) "acceptor_type": "P-O/P=O", "h_type": "phosphate_hydrogens", # 氢原子 (磷酸中的H) "threshold": 2.175, "color": "red" }, { "name": "P-OH···Ow", "donor_type": "P-OH", # 供体氧 (磷酸中的羟基氧) "acceptor_type": "water_oxygens", # 受体氧 (水中的O) "h_type": "phosphate_hydrogens", # 氢原子 (磷酸中的H) "threshold": 2.275, "color": "orange" }, { "name": "Hw···Ow", "donor_type": "water_oxygens", # 供体氧 (水中的O) "acceptor_type": "water_oxygens", # 受体氧 (另一个水中的O) "h_type": "water_hydrogens", # 氢原子 (水中的H) "threshold": 2.375, "color": "purple" }, { "name": "Hh···Ow", "donor_type": "hydronium_oxygens", # 供体氧 (水合氢离子中的O) "acceptor_type": "water_oxygens", # 受体氧 (水中的O) "h_type": "hydronium_hydrogens", # 氢原子 (水合氢离子中的H) "threshold": 2.225, "color": "cyan" }, { "name": "Hw···F", "donor_type": "water_oxygens", # 供体氧 (水中的O) "acceptor_type": "fluoride_atoms", # 受体 (氟离子) "h_type": "water_hydrogens", # 氢原子 (水中的H) "threshold": 2.225, "color": "magenta" }, { "name": "Hh···F", "donor_type": "hydronium_oxygens", # 供体氧 (水合氢离子中的O) "acceptor_type": "fluoride_atoms", "h_type": "hydronium_hydrogens", # 氢原子 (水合氢离子中的H) "threshold": 2.175, "color": "brown" }, { "name": "Hp···F", "donor_type": "P-OH", # 供体氧 (磷酸中的羟基氧) "acceptor_type": "fluoride_atoms", "h_type": "phosphate_hydrogens", # 氢原子 (磷酸中的H) "threshold": 2.275, "color": "pink" } ] # 氢键存在性检查函数 def check_hbond_exists(struct, atom_types, donor_idx, acceptor_idx, hbond_config, bond_threshold=1.3): """检查给定供体-受体对之间是否存在氢键""" # 获取所有原子坐标 all_coords = np.array([site.coords for site in struct]) # 获取氢键配置 hbond_types = get_hbond_angle_config() # 确定氢键类型 hbond_type = None for hb in hbond_types: # 确定供体类型 if hb["donor_type"] == "P-OH": donors = atom_types["phosphate_oxygens"]["P-OH"] else: donors = atom_types[hb["donor_type"]] # 确定受体类型 if hb["acceptor_type"] == "P-O/P=O": acceptors = atom_types["phosphate_oxygens"]["P-O/P=O"] elif hb["acceptor_type"] == "P-OH": acceptors = atom_types["phosphate_oxygens"]["P-OH"] else: acceptors = atom_types[hb["acceptor_type"]] # 检查是否匹配 if donor_idx in donors and acceptor_idx in acceptors: hbond_type = hb break if hbond_type is None: return False # 获取供体坐标 donor_coords = all_coords[donor_idx] # 查找与供体成键的氢原子 (距离 < bond_threshold) kdtree = cKDTree(all_coords, boxsize=struct.lattice.abc) neighbors = kdtree.query_ball_point(donor_coords, r=bond_threshold) hydrogen_candidates = [i for i in neighbors if struct[i].species_string == "H"] # 检查每个候选氢原子 for h_idx in hydrogen_candidates: h_coords = all_coords[h_idx] # 计算H-受体距离 acceptor_coords = all_coords[acceptor_idx] h_acceptor_dist = np.linalg.norm(h_coords - acceptor_coords) # 检查距离阈值 if h_acceptor_dist > hbond_type["threshold"]: continue # 计算键角 vec_dh = h_coords - donor_coords vec_ha = acceptor_coords - h_coords dot_product = np.dot(vec_dh, vec_ha) norm_dh = np.linalg.norm(vec_dh) norm_ha = np.linalg.norm(vec_ha) if norm_dh < 1e-6 or norm_ha < 1e-6: continue cos_theta = dot_product / (norm_dh * norm_ha) cos_theta = np.clip(cos_theta, -1.0, 1.0) angle_deg = np.degrees(np.arccos(cos_theta)) # 检查角度阈值 if angle_deg >= 120.0: return True return False def identify_hbond_pairs(struct, atom_types, hbond_config): """识别单帧中所有可能的氢键对""" hbond_pairs = [] # 获取所有原子坐标 all_coords = np.array([site.coords for site in struct]) kdtree = cKDTree(all_coords, boxsize=struct.lattice.abc) # 处理每种氢键类型 for hb in hbond_config: # 获取供体原子列表 if hb["donor_type"] == "P-OH": donors = atom_types["phosphate_oxygens"]["P-OH"] else: donors = atom_types[hb["donor_type"]] # 获取受体原子列表 if hb["acceptor_type"] == "P-O/P=O": acceptors = atom_types["phosphate_oxygens"]["P-O/P=O"] elif hb["acceptor_type"] == "P-OH": acceptors = atom_types["phosphate_oxygens"]["P-OH"] else: acceptors = atom_types[hb["acceptor_type"]] # 如果没有供体或受体,跳过 if not donors or not acceptors: continue # 为受体构建KDTree acceptor_coords = np.array([all_coords[i] for i in acceptors]) acceptor_kdtree = cKDTree(acceptor_coords, boxsize=struct.lattice.abc) # 遍历所有供体 for donor_idx in donors: donor_coords = all_coords[donor_idx] # 查找在阈值内的受体 acceptor_indices = acceptor_kdtree.query_ball_point( donor_coords, r=hb["threshold"] + 1.0 # 稍微放宽阈值 ) for a_idx_offset in acceptor_indices: acceptor_idx = acceptors[a_idx_offset] # 排除自相互作用 if donor_idx == acceptor_idx: continue # 添加到氢键对列表 hbond_pairs.append((donor_idx, acceptor_idx)) # 去重 hbond_pairs = list(set(hbond_pairs)) return hbond_pairs def process_hbond_frame(struct, atom_types, hbond_pairs, hbond_config): """处理单帧氢键存在性""" hbond_exists = [] for donor_idx, acceptor_idx in hbond_pairs: exists = check_hbond_exists(struct, atom_types, donor_idx, acceptor_idx, hbond_config) hbond_exists.append(exists) return np.array(hbond_exists, dtype=bool) def calculate_scf_worker(args): """工作进程函数:计算单个时间原点的存活相关函数""" t0, structures, hbond_pairs_t0, hbond_config, t_max = args n_frames = len(structures) n_hbonds = len(hbond_pairs_t0) # 初始化SCF数组 scf = np.zeros(t_max + 1) # 获取t0帧的原子类型 atom_types_t0 = identify_atom_types(structures[t0]) # 计算t0时刻的氢键存在性 exists_t0 = process_hbond_frame( structures[t0], atom_types_t0, hbond_pairs_t0, hbond_config ) # 对于每个时间延迟t for t in range(0, min(t_max + 1, n_frames - t0)): t_frame = t0 + t # 获取当前帧的原子类型 atom_types_t = identify_atom_types(structures[t_frame]) # 计算当前帧的氢键存在性 exists_t = process_hbond_frame( structures[t_frame], atom_types_t, hbond_pairs_t0, hbond_config ) # 计算存活相关函数 scf[t] = np.sum(exists_t0 & exists_t) / max(1, np.sum(exists_t0)) return scf def calculate_hbond_lifetime(structures, workers=1, delta_t0=10, t_max=5000): """计算氢键寿命(存活相关函数)""" start_time = time.time() n_frames = len(structures) hbond_config = get_hbond_angle_config() print(f"Calculating hydrogen bond lifetime for {n_frames} frames") print(f"Workers: {workers}, Delta t0: {delta_t0}, Max t: {t_max} frames") # 步骤1: 识别参考帧(t0)的氢键对 t0_ref = 0 # 使用第一帧作为参考 atom_types_ref = identify_atom_types(structures[t0_ref]) hbond_pairs_ref = identify_hbond_pairs( structures[t0_ref], atom_types_ref, hbond_config ) n_hbonds = len(hbond_pairs_ref) print(f"Identified {n_hbonds} potential hydrogen bond pairs") # 步骤2: 准备时间原点列表 t0_list = list(range(0, n_frames - t_max, delta_t0)) if not t0_list: t0_list = [0] print(f"Using {len(t0_list)} time origins") # 准备参数列表 args_list = [ (t0, structures, hbond_pairs_ref, hbond_config, t_max) for t0 in t0_list ] # 步骤3: 并行计算SCF scf_total = np.zeros(t_max + 1) scf_count = np.zeros(t_max + 1) with multiprocessing.Pool(processes=workers) as pool: results = list(tqdm( pool.imap_unordered(calculate_scf_worker, args_list), total=len(t0_list), desc="Calculating SCF" )) # 合并结果 for scf in results: valid_length = min(len(scf), t_max + 1) scf_total[:valid_length] += scf[:valid_length] scf_count[:valid_length] += 1 # 计算平均SCF scf_avg = np.zeros(t_max + 1) for t in range(t_max + 1): if scf_count[t] > 0: scf_avg[t] = scf_total[t] / scf_count[t] # 步骤4: 计算弛豫时间(积分法) # 假设时间步长为0.1 fs (0.0001 ps) time_ps = np.arange(t_max + 1) * 0.0001 integral = cumtrapz(scf_avg, time_ps, initial=0) tau = integral[-1] # 积分弛豫时间 # 平滑SCF曲线 if t_max > 100: scf_smoothed = gaussian_filter1d(scf_avg, sigma=3) else: scf_smoothed = scf_avg # 输出结果 elapsed = time.time() - start_time print(f"Hydrogen bond lifetime calculation completed in {elapsed:.2f} seconds") print(f"Relaxation time (tau): {tau:.4f} ps") return time_ps, scf_avg, scf_smoothed, tau def plot_lifetime_results(time_ps, scf_avg, scf_smoothed, tau, system_name): """绘制氢键寿命结果(符合期刊要求)""" os.makedirs("Lifetime_Plots", exist_ok=True) plt.figure(figsize=(8, 6)) # 绘制原始数据和平滑曲线 plt.plot(time_ps, scf_avg, 'b-', alpha=0.3, label="Raw SCF") plt.plot(time_ps, scf_smoothed, 'r-', linewidth=2.5, label="Smoothed SCF") # 标记弛豫时间 plt.axvline(x=tau, color='g', linestyle='--', linewidth=1.5, label=f"τ = {tau:.4f} ps") # 设置标题和标签 plt.title(f"{system_name}: Hydrogen Bond Survival Correlation Function", fontsize=16) plt.xlabel("Time (ps)", fontsize=14) plt.ylabel("SCF(t)", fontsize=14) plt.yscale('log') # 对数坐标更好展示衰减 plt.grid(True, linestyle='--', alpha=0.7) # 设置坐标轴范围 plt.xlim(0, min(10, time_ps[-1])) # 最多显示10ps plt.ylim(0.001, 1.1) # 添加图例 plt.legend(loc='upper right', framealpha=0.8) # 添加文本信息 plt.text(0.7, 0.95, f"τ = {tau:.4f} ps", transform=plt.gca().transAxes, fontsize=12, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8)) # 保存图像 plt.tight_layout() filename = os.path.join("Lifetime_Plots", f"{system_name}_lifetime.tiff") plt.savefig(filename, dpi=600, bbox_inches='tight') plt.close() print(f"Saved lifetime plot: {filename}") # 保存数据 data_file = os.path.join("Lifetime_Plots", f"{system_name}_lifetime_data.csv") with open(data_file, 'w') as f: f.write("Time (ps),SCF(t),Smoothed SCF(t)\n") for i in range(len(time_ps)): f.write(f"{time_ps[i]},{scf_avg[i]},{scf_smoothed[i]}\n") print(f"Saved lifetime data: {data_file}") def parse_lammps_dump(dump_file): """解析LAMMPS转储文件""" structures = [] current_frame = None timestep = None num_atoms = 0 box_bounds = None atoms = [] atom_types = [] species_map = {} with open(dump_file, 'r') as f: for line in f: line = line.strip() if not line: continue if "ITEM: TIMESTEP" in line: if current_frame is not None and atoms: # 创建Structure对象 lattice = Lattice.from_parameters( box_bounds[0][1] - box_bounds[0][0], box_bounds[1][1] - box_bounds[1][0], box_bounds[2][1] - box_bounds[2][0], 90, 90, 90 ) species = [species_map[atype] for atype in atom_types] coords = np.array(atoms) struct = Structure(lattice, species, coords) structures.append(struct) # 重置当前帧数据 current_frame = {"timestep": None, "box": None, "atoms": []} atoms = [] atom_types = [] elif "ITEM: NUMBER OF ATOMS" in line: num_atoms = int(next(f).strip()) elif "ITEM: BOX BOUNDS" in line: box_bounds = [] for _ in range(3): bounds = list(map(float, next(f).split())) box_bounds.append(bounds) elif "ITEM: ATOMS" in line: headers = line.split()[2:] id_idx = headers.index("id") if "id" in headers else 0 type_idx = headers.index("type") if "type" in headers else 1 x_idx = headers.index("x") if "x" in headers else 2 y_idx = headers.index("y") if "y" in headers else 3 z_idx = headers.index("z") if "z" in headers else 4 # 确定是否有元素类型信息 if "element" in headers: element_idx = headers.index("element") else: element_idx = None for _ in range(num_atoms): atom_data = next(f).split() atom_id = int(atom_data[id_idx]) atom_type = int(atom_data[type_idx]) # 获取元素符号 if element_idx is not None: element = atom_data[element_idx] else: # 如果没有元素信息,使用类型映射 if atom_type not in species_map: # 这里需要根据您的体系设置元素映射 # 示例映射:1=O, 2=H, 3=P, 4=Al, 5=F type_to_element = { 1: "O", 2: "H", 3: "P", 4: "Al", 5: "F" } species_map[atom_type] = type_to_element.get(atom_type, "X") element = species_map[atom_type] x = float(atom_data[x_idx]) y = float(atom_data[y_idx]) z = float(atom_data[z_idx]) atoms.append([x, y, z]) atom_types.append(element) # 添加最后一帧 if atoms: lattice = Lattice.from_parameters( box_bounds[0][1] - box_bounds[0][0], box_bounds[1][1] - box_bounds[1][0], box_bounds[2][1] - box_bounds[2][0], 90, 90, 90 ) species = [species_map[atype] for atype in atom_types] coords = np.array(atoms) struct = Structure(lattice, species, coords) structures.append(struct) print(f"Loaded {len(structures)} frames from LAMMPS dump file") return structures def load_lammps_trajectory(dump_file, data_file=None): """加载LAMMPS轨迹""" return parse_lammps_dump(dump_file) def main(workers=1): """主函数:加载轨迹并计算氢键寿命""" # 定义要处理的体系 lammps_files = { "MySystem": { "dump_file": "trajectory.lammpstrj", "data_file": "system.data" } } for system_name, files in lammps_files.items(): print(f"\n{'='*50}") print(f"Processing {system_name}") print(f"{'='*50}") try: # 加载LAMMPS轨迹 structures = load_lammps_trajectory( files["dump_file"], files["data_file"] ) # 计算氢键寿命 time_ps, scf_avg, scf_smoothed, tau = calculate_hbond_lifetime( structures, workers=workers, delta_t0=50, # 时间原点间隔 (帧) t_max=20000 # 最大时间延迟 (帧) - 对应2ps (0.1fs/帧) ) # 绘制结果 plot_lifetime_results(time_ps, scf_avg, scf_smoothed, tau, system_name) print(f"Completed processing for {system_name}") except Exception as e: print(f"Error processing {system_name}: {str(e)}") import traceback traceback.print_exc() print("\nHydrogen bond lifetime analysis completed successfully!") if __name__ == "__main__": # 设置命令行参数 parser = argparse.ArgumentParser(description='Calculate hydrogen bond lifetime from LAMMPS trajectory') parser.add_argument('--workers', type=int, default=multiprocessing.cpu_count(), help=f'Number of parallel workers (default: {multiprocessing.cpu_count()})') parser.add_argument('--delta_t0', type=int, default=50, help='Time origin spacing in frames (default: 50)') parser.add_argument('--t_max', type=int, default=20000, help='Maximum time delay in frames (default: 20000)') args = parser.parse_args() print(f"Starting hydrogen bond lifetime analysis with {args.workers} workers...") print(f"Delta t0: {args.delta_t0} frames, Max time delay: {args.t_max} frames") # 设置全局参数 def calculate_with_params(structures): return calculate_hbond_lifetime( structures, workers=args.workers, delta_t0=args.delta_t0, t_max=args.t_max ) main(workers=args.workers)其中报错Error processing MySystem: 'H' Traceback (most recent call last): File "hydrogen_bond_analysis7.py", line 618, in main structures = load_lammps_trajectory( File "hydrogen_bond_analysis7.py", line 599, in load_lammps_trajectory return parse_lammps_dump(dump_file) File "hydrogen_bond_analysis7.py", line 521, in parse_lammps_dump species = [species_map[atype] for atype in atom_types] File "hydrogen_bond_analysis7.py", line 521, in <listcomp> species = [species_map[atype] for atype in atom_types] KeyError: 'H'
08-02
按照以上逻辑能否修改我以下的代码来执行?因为我以下的代码已经能够在anaconda promt中执行,所以我想以这个框架去把内容替换来实现这个粗略的计算。代码如下:import numpy as np import matplotlib.pyplot as plt from pymatgen.io.vasp import Vasprun from pymatgen.core.structure import Structure from scipy.signal import savgol_filter from scipy.spatial import cKDTree from tqdm import tqdm import matplotlib as mpl import warnings from collections import defaultdict import os import csv import argparse import multiprocessing from functools import partial import time import dill # 忽略可能的警告 warnings.filterwarnings("ignore", category=UserWarning) # 专业绘图设置 - 符合Journal of Chemical Physics要求 plt.style.use('seaborn-v0_8-whitegrid') mpl.rcParams.update({ 'font.family': 'serif', 'font.serif': ['Times New Roman', 'DejaVu Serif'], 'font.size': 12, 'axes.labelsize': 14, 'axes.titlesize': 16, 'xtick.labelsize': 12, 'ytick.labelsize': 12, 'figure.dpi': 600, # 提高分辨率 'savefig.dpi': 600, 'figure.figsize': (8, 6), # 期刊常用尺寸 'lines.linewidth': 2.0, 'legend.fontsize': 10, 'legend.framealpha': 0.8, 'mathtext.default': 'regular', 'axes.linewidth': 1.5, # 加粗坐标轴线 'xtick.major.width': 1.5, 'ytick.major.width': 1.5, 'xtick.major.size': 5, 'ytick.major.size': 5, }) # 1. 增强的原子类型识别函数 - 逐帧识别 def identify_atom_types(struct): """识别所有关键原子类型并排除自身化学键""" # 磷酸氧分类 p_oxygens = {"P=O": [], "P-O": [], "P-OH": []} phosphate_hydrogens = [] # 仅P-OH基团中的H原子 # 水合氢离子识别 hydronium_oxygens = [] hydronium_hydrogens = [] # H₃O⁺中的H原子 # 普通水分子 water_oxygens = [] water_hydrogens = [] # 普通水中的H原子 # 氟离子 fluoride_atoms = [i for i, site in enumerate(struct) if site.species_string == "F"] # 铝离子 aluminum_atoms = [i for i, site in enumerate(struct) if site.species_string == "Al"] # 创建快速邻居查找表 neighbor_cache = defaultdict(list) for i, site in enumerate(struct): if site.species_string == "O": neighbors = struct.get_neighbors(site, r=1.3) h_neighbors = [n[0] for n in neighbors if n[0].species_string == "H"] neighbor_cache[i] = h_neighbors # 识别水合氢离子 (H₃O⁺) if len(h_neighbors) == 3: hydronium_oxygens.append(i) for h_site in h_neighbors: hydronium_hydrogens.append(h_site.index) # 识别磷酸基团 for site in struct: if site.species_string == "P": neighbors = struct.get_neighbors(site, r=2.0) # 扩大搜索半径 # 筛选氧原子邻居 o_neighbors = [(n[0], n[1]) for n in neighbors if n[0].species_string == "O"] if len(o_neighbors) < 4: # 如果找不到4个氧原子,使用旧方法 for neighbor in o_neighbors: nn_site = neighbor[0] if neighbor[1] < 1.55: p_oxygens["P=O"].append(nn_site.index) else: if any(n[0].species_string == "H" for n in struct.get_neighbors(nn_site, r=1.3)): p_oxygens["P-OH"].append(nn_site.index) else: p_oxygens["P-O"].append(nn_site.index) continue # 按距离排序 o_neighbors.sort(key=lambda x: x[1]) # 最近的氧原子为P=O p_double_o = o_neighbors[0][0] p_oxygens["P=O"].append(p_double_o.index) # 其他三个氧原子 for i in range(1, 4): o_site = o_neighbors[i][0] # 检查氧原子上是否有氢 if neighbor_cache.get(o_site.index, []): p_oxygens["P-OH"].append(o_site.index) else: p_oxygens["P-O"].append(o_site.index) # 识别P-OH基团中的H原子 (磷酸中的H) for o_idx in p_oxygens["P-OH"]: # 获取与P-OH氧相连的H原子 h_neighbors = neighbor_cache.get(o_idx, []) for h_site in h_neighbors: if h_site.species_string == "H": phosphate_hydrogens.append(h_site.index) # 识别普通水分子 (排除磷酸氧和水合氢离子) for i, site in enumerate(struct): if site.species_string == "O" and i not in hydronium_oxygens: is_phosphate_oxygen = False for cat in p_oxygens.values(): if i in cat: is_phosphate_oxygen = True break if not is_phosphate_oxygen: water_oxygens.append(i) # 识别普通水分子中的H原子 (水中的H) for o_idx in water_oxygens: h_neighbors = neighbor_cache.get(o_idx, []) for h_site in h_neighbors: if h_site.species_string == "H": water_hydrogens.append(h_site.index) return { "phosphate_oxygens": p_oxygens, "phosphate_hydrogens": phosphate_hydrogens, "water_oxygens": water_oxygens, "water_hydrogens": water_hydrogens, "hydronium_oxygens": hydronium_oxygens, "hydronium_hydrogens": hydronium_hydrogens, "fluoride_atoms": fluoride_atoms, "aluminum_atoms": aluminum_atoms } # 2. RDF计算函数 - 修复负值问题和序列化问题 def process_frame(struct, center_sel, target_sel, r_max, exclude_bonds, bond_threshold): """处理单帧结构计算,完全处理空原子类型情况""" # 每帧重新识别原子类型(关键!) atom_types = identify_atom_types(struct) # 获取中心原子和目标原子 centers = center_sel(atom_types) targets = target_sel(atom_types) # 处理空原子类型情况 - 第一重保护 if len(centers) == 0 or len(targets) == 0: return { "distances": np.array([], dtype=np.float64), "n_centers": 0, "n_targets": 0, "volume": struct.volume } center_coords = np.array([struct[i].coords for i in centers]) target_coords = np.array([struct[i].coords for i in targets]) lattice = struct.lattice kdtree = cKDTree(target_coords, boxsize=lattice.abc) # 动态确定邻居数量 - 不超过目标原子数 k_val = min(50, len(targets)) # 处理目标原子数量为0的情况 - 第二重保护 if k_val == 0: return { "distances": np.array([], dtype=np.float64), "n_centers": len(centers), "n_targets": len(targets), "volume": struct.volume } # 执行查询并确保结果统一格式 try: query_result = kdtree.query(center_coords, k=k_val, distance_upper_bound=r_max) except Exception as e: # 异常处理 - 返回空结果 print(f"KDTree query error: {str(e)}") return { "distances": np.array([], dtype=np.float64), "n_centers": len(centers), "n_targets": len(targets), "volume": struct.volume } # 统一处理不同维度的返回结果 if k_val == 1: # 处理单邻居情况 if isinstance(query_result, tuple): distances, indices = query_result else: distances = query_result indices = np.zeros_like(distances, dtype=int) # 确保数组格式 distances = np.atleast_1d(distances) indices = np.atleast_1d(indices) else: # 多邻居情况 distances, indices = query_result # 确保二维数组格式 if distances.ndim == 1: distances = distances.reshape(-1, 1) indices = indices.reshape(-1, 1) valid_distances = [] for i in range(distances.shape[0]): center_idx = centers[i] for j in range(distances.shape[1]): dist = distances[i, j] # 跳过超出范围的距离 if dist > r_max or np.isinf(dist): continue target_idx = targets[indices[i, j]] # 排除化学键 if exclude_bonds: actual_dist = struct.get_distance(center_idx, target_idx) if actual_dist < bond_threshold: continue valid_distances.append(dist) return { "distances": np.array(valid_distances, dtype=np.float64), "n_centers": len(centers), "n_targets": len(targets), "volume": struct.volume } def calculate_rdf_parallel(structures, center_sel, target_sel, r_max=8.0, bin_width=0.05, exclude_bonds=True, bond_threshold=1.3, workers=1): """ 并行计算径向分布函数 :param workers: 并行工作进程数 """ bins = np.arange(0, r_max, bin_width) hist = np.zeros(len(bins) - 1) total_centers = 0 total_targets = 0 total_volume = 0 # 准备参数 - 使用dill解决序列化问题 dill.settings['recurse'] = True func = partial(process_frame, center_sel=center_sel, target_sel=target_sel, r_max=r_max, exclude_bonds=exclude_bonds, bond_threshold=bond_threshold) # 使用多进程池 with multiprocessing.Pool(processes=workers) as pool: results = [] # 使用imap_unordered提高效率 for res in tqdm(pool.imap_unordered(func, structures), total=len(structures), desc="Calculating RDF"): results.append(res) # 处理结果 - 特别注意空结果处理 n_frames = 0 for res in results: if res is None: continue n_frames += 1 valid_distances = res["distances"] n_centers = res["n_centers"] n_targets = res["n_targets"] volume = res["volume"] # 累加计数 if len(valid_distances) > 0: hist += np.histogram(valid_distances, bins=bins)[0] total_centers += n_centers total_targets += n_targets total_volume += volume # 修正归一化 - 解决负值问题 if n_frames == 0: # 没有有效帧时返回空结果 r = bins[:-1] + bin_width/2 return r, np.zeros_like(r), {"position": None, "value": None} avg_density = total_targets / total_volume if total_volume > 0 else 0 r = bins[:-1] + bin_width/2 rdf = np.zeros_like(r) for i in range(len(hist)): r_lower = bins[i] r_upper = bins[i+1] shell_vol = 4/3 * np.pi * (r_upper**3 - r_lower**3) expected_count = shell_vol * avg_density * total_centers # 避免除以零 if expected_count > 1e-10: rdf[i] = hist[i] / expected_count else: rdf[i] = 0 # 更稳健的平滑处理 - 避免边界效应 if len(rdf) > 10: window_length = min(15, len(rdf)//2*2+1) polyorder = min(5, window_length-1) rdf_smoothed = savgol_filter(rdf, window_length=window_length, polyorder=polyorder, mode='mirror') else: rdf_smoothed = rdf # 计算主要峰值 peak_info = {} mask = (r >= 1.5) & (r <= 3.0) if np.any(mask) and np.any(rdf_smoothed[mask] > 0): peak_idx = np.argmax(rdf_smoothed[mask]) peak_pos = r[mask][peak_idx] peak_val = rdf_smoothed[mask][peak_idx] peak_info = {"position": peak_pos, "value": peak_val} else: peak_info = {"position": None, "value": None} return r, rdf_smoothed, peak_info # 3. 定义精细化的选择器函数(避免lambda序列化问题) def selector_phosphate_P_double_O(atom_types): return atom_types["phosphate_oxygens"]["P=O"] def selector_phosphate_P_OH(atom_types): return atom_types["phosphate_oxygens"]["P-OH"] def selector_phosphate_P_O(atom_types): return atom_types["phosphate_oxygens"]["P-O"] def selector_phosphate_hydrogens(atom_types): return atom_types["phosphate_hydrogens"] def selector_water_only_hydrogens(atom_types): """仅选择水分子中的氢原子""" return atom_types["water_hydrogens"] def selector_hydronium_only_hydrogens(atom_types): """仅选择水合氢离子中的氢原子""" return atom_types["hydronium_hydrogens"] def selector_water_only_oxygens(atom_types): """仅选择水分子中的氧原子""" return atom_types["water_oxygens"] def selector_hydronium_only_oxygens(atom_types): """仅选择水合氢离子中的氧原子""" return atom_types["hydronium_oxygens"] def selector_fluoride_atoms(atom_types): return atom_types["fluoride_atoms"] def selector_aluminum_atoms(atom_types): return atom_types["aluminum_atoms"] def selector_all_phosphate_oxygens(atom_types): return (atom_types["phosphate_oxygens"]["P=O"] + atom_types["phosphate_oxygens"]["P-O"] + atom_types["phosphate_oxygens"]["P-OH"]) # 4. 根据您的要求定义六张图的RDF分组配置 def get_rdf_groups(): """返回六张图的RDF分组配置(完全符合您的需求)""" return { # 图1: Al的配位情况 "Al_Coordination": [ (selector_aluminum_atoms, selector_fluoride_atoms, "Al-F", "blue"), (selector_aluminum_atoms, selector_water_only_oxygens, "Al-Ow", "green"), (selector_aluminum_atoms, selector_all_phosphate_oxygens, "Al-Op", "red") ], # 图2: F与H形成的氢键 "F_Hydrogen_Bonding": [ (selector_fluoride_atoms, selector_water_only_hydrogens, "F-Hw", "lightblue"), (selector_fluoride_atoms, selector_hydronium_only_hydrogens, "F-Hh", "blue"), (selector_fluoride_atoms, selector_phosphate_hydrogens, "F-Hp", "darkblue") ], # 图3: 磷酸作为受体与周围环境的氢键(区分氧类型) "Phosphate_Acceptor": [ (selector_phosphate_P_double_O, selector_water_only_hydrogens, "P=O···Hw", "orange"), (selector_phosphate_P_double_O, selector_hydronium_only_hydrogens, "P=O···Hh", "red"), (selector_phosphate_P_O, selector_water_only_hydrogens, "P-O···Hw", "lightgreen"), (selector_phosphate_P_O, selector_hydronium_only_hydrogens, "P-O···Hh", "green"), (selector_phosphate_P_OH, selector_water_only_hydrogens, "P-OH···Hw", "lightblue"), (selector_phosphate_P_OH, selector_hydronium_only_hydrogens, "P-OH···Hh", "blue") ], # 图4: 磷酸-水-水合氢离子交叉氢键(排除同种类型) "Cross_Species_HBonding": [ (selector_phosphate_hydrogens, selector_water_only_oxygens, "Hp···Ow", "pink"), (selector_phosphate_hydrogens, selector_hydronium_only_oxygens, "Hp···Oh", "purple"), (selector_water_only_hydrogens, selector_all_phosphate_oxygens, "Hw···Op", "lightgreen"), (selector_water_only_hydrogens, selector_hydronium_only_oxygens, "Hw···Oh", "green"), (selector_hydronium_only_hydrogens, selector_water_only_oxygens, "Hh···Ow", "lightblue"), (selector_hydronium_only_hydrogens, selector_all_phosphate_oxygens, "Hh···Op", "blue") ], # 图5: 同类型分子内/间氢键(区分磷酸氧类型) "Same_Species_HBonding": [ (selector_phosphate_hydrogens, selector_phosphate_P_double_O, "Hp···P=O", "red"), (selector_phosphate_hydrogens, selector_phosphate_P_O, "Hp···P-O", "orange"), (selector_phosphate_hydrogens, selector_phosphate_P_OH, "Hp···P-OH", "yellow"), (selector_water_only_hydrogens, selector_water_only_oxygens, "Hw···Ow", "lightblue"), (selector_hydronium_only_hydrogens, selector_hydronium_only_oxygens, "Hh···Oh", "blue") ], # 图6: O-O聚集分析(Op不区分类型) "O_O_Aggregation": [ (selector_all_phosphate_oxygens, selector_water_only_oxygens, "Op-Ow", "blue"), (selector_all_phosphate_oxygens, selector_hydronium_only_oxygens, "Op-Oh", "green"), (selector_all_phosphate_oxygens, selector_all_phosphate_oxygens, "Op-Op", "red"), (selector_water_only_oxygens, selector_hydronium_only_oxygens, "Ow-Oh", "purple"), (selector_water_only_oxygens, selector_water_only_oxygens, "Ow-Ow", "cyan"), (selector_hydronium_only_oxygens, selector_hydronium_only_oxygens, "Oh-Oh", "magenta") ] } # 5. 主程序 - 优化并行处理 def main(workers=1): # 定义要处理的体系 vasprun_files = { "System1": "vasprun1.xml", "System2": "vasprun2.xml", "System3": "vasprun3.xml", "System4": "vasprun4.xml" } # 获取RDF分组配置 rdf_groups = get_rdf_groups() # 标题映射(根据您的要求) title_map = { "Al_Coordination": "Al Coordination Environment", "F_Hydrogen_Bonding": "F-H Hydrogen Bonding", "Phosphate_Acceptor": "Phosphate as H-bond Acceptor", "Cross_Species_HBonding": "Cross H-bonding between Different Species", "Same_Species_HBonding": "Intra- and Inter-molecular H-bonding", "O_O_Aggregation": "O-O Aggregation Analysis" } # 存储所有数据 all_system_data = {} group_y_max = {group_name: 0 for group_name in list(rdf_groups.keys())} group_x_max = { "Al_Coordination": (1.5, 3.5), "F_Hydrogen_Bonding": (1.0, 3.0), "Phosphate_Acceptor": (1.0, 3.0), "Cross_Species_HBonding": (1.0, 3.0), "Same_Species_HBonding": (1.0, 3.0), "O_O_Aggregation": (2.0, 6.0) } # 创建输出目录 os.makedirs("RDF_Plots", exist_ok=True) # 计算所有体系的所有RDF数据 for system_name, vasprun_file in vasprun_files.items(): print(f"\n{'='*50}") print(f"Processing {system_name}: {vasprun_file} with {workers} workers") print(f"{'='*50}") start_time = time.time() try: # 加载VASP结果 vr = Vasprun(vasprun_file, ionic_step_skip=5) structures = vr.structures print(f"Loaded {len(structures)} frames") # 存储体系数据 system_data = { "rdf_results": {}, "peak_infos": {} } # 计算所有RDF分组 for group_name, pairs in rdf_groups.items(): system_data["rdf_results"][group_name] = {} system_data["peak_infos"][group_name] = {} group_y_max_current = 0 for center_sel, target_sel, label, color in pairs: print(f"\nCalculating RDF for: {label}") try: r, rdf, peak_info = calculate_rdf_parallel( structures, center_sel, target_sel, r_max=10.0, exclude_bonds=True, bond_threshold=1.3, workers=workers ) system_data["rdf_results"][group_name][label] = (r, rdf, color) system_data["peak_infos"][group_name][label] = peak_info if len(rdf) > 0: current_max = np.max(rdf) if current_max > group_y_max_current: group_y_max_current = current_max if peak_info["position"] is not None: print(f" Peak for {label}: {peak_info['position']:.3f} Å (g(r) = {peak_info['value']:.2f})") else: print(f" No significant peak found for {label} in 1.5-3.0 Å range") except Exception as e: print(f"Error calculating RDF for {label}: {str(e)}") system_data["rdf_results"][group_name][label] = (np.array([]), np.array([]), color) system_data["peak_infos"][group_name][label] = {"position": None, "value": None} if group_y_max_current > group_y_max[group_name]: group_y_max[group_name] = group_y_max_current all_system_data[system_name] = system_data elapsed = time.time() - start_time print(f"\nCompleted processing for {system_name} in {elapsed:.2f} seconds") except Exception as e: print(f"Error processing {system_name}: {str(e)}") # 为每个分组添加余量 for group_name in group_y_max: group_y_max[group_name] = max(group_y_max[group_name] * 1.15, 3.0) # 确保最小值 # 第二步:生成符合期刊要求的图表 for system_name, system_data in all_system_data.items(): print(f"\nGenerating publication-quality plots for {system_name}") for group_name, group_data in system_data["rdf_results"].items(): fig, ax = plt.subplots(figsize=(8, 6)) # 设置坐标轴范围 xlim = group_x_max.get(group_name, (0, 6.0)) ylim = (0, group_y_max[group_name]) for label, (r, rdf, color) in group_data.items(): if len(r) > 0 and len(rdf) > 0: ax.plot(r, rdf, color=color, label=label, linewidth=2.0) ax.set_xlim(xlim) ax.set_ylim(ylim) # 期刊格式标签 ax.set_xlabel('Radial Distance (Å)', fontweight='bold') ax.set_ylabel('g(r)', fontweight='bold') # 添加体系名称到标题 ax.set_title(f"{system_name}: {title_map[group_name]}", fontsize=16, pad=15) # 精简图例 ncol = 3 if group_name == "Same_Species_HBonding" else 1 # 图5使用三列图例 ax.legend(ncol=ncol, loc='best', framealpha=0.8, fontsize=10) # 添加氢键区域标记(除O-O聚集图外) if group_name != "O_O_Aggregation": ax.axvspan(1.5, 2.5, alpha=0.1, color='green', zorder=0) ax.text(1.7, ylim[1]*0.85, 'H-bond Region', fontsize=10) # 添加网格 ax.grid(True, linestyle='--', alpha=0.5) # 保存高分辨率图片 plt.tight_layout() filename = os.path.join("RDF_Plots", f"RDF_{system_name}_{group_name}.tiff") plt.savefig(filename, bbox_inches='tight', dpi=600, format='tiff') print(f"Saved publication plot: {filename}") plt.close() # 保存Origin兼容数据 save_origin_data(system_name, system_data) print("\nAll RDF analysis completed successfully!") def save_origin_data(system_name, system_data): """保存Origin兼容格式数据""" os.makedirs("Origin_Data", exist_ok=True) system_dir = os.path.join("Origin_Data", system_name) os.makedirs(system_dir, exist_ok=True) # 保存峰值信息 peak_info_path = os.path.join(system_dir, f"Peak_Positions_{system_name}.csv") with open(peak_info_path, 'w', newline='') as csvfile: writer = csv.writer(csvfile) writer.writerow(["Group", "Interaction", "Peak Position (A)", "g(r) Value"]) for group_name, peaks in system_data["peak_infos"].items(): for label, info in peaks.items(): if info["position"] is not None: writer.writerow([group_name, label, f"{info['position']:.3f}", f"{info['value']:.3f}"]) else: writer.writerow([group_name, label, "N/A", "N/A"]) print(f"Saved peak positions: {peak_info_path}") # 保存RDF数据 for group_name, group_results in system_data["rdf_results"].items(): group_dir = os.path.join(system_dir, group_name) os.makedirs(group_dir, exist_ok=True) for label, (r, rdf, color) in group_results.items(): if len(r) > 0 and len(rdf) > 0: safe_label = label.replace(" ", "_").replace("/", "_").replace("=", "_") safe_label = safe_label.replace("(", "").replace(")", "").replace("$", "") filename = f"RDF_{system_name}_{group_name}_{safe_label}.csv" filepath = os.path.join(group_dir, filename) with open(filepath, 'w', newline='') as csvfile: writer = csv.writer(csvfile) writer.writerow(["Distance (A)", "g(r)"]) for i in range(len(r)): writer.writerow([f"{r[i]:.6f}", f"{rdf[i]:.6f}"]) print(f"Saved Origin data: {filename}") if __name__ == "__main__": # 设置命令行参数 parser = argparse.ArgumentParser(description='Calculate RDF for VASP simulations') parser.add_argument('--workers', type=int, default=multiprocessing.cpu_count(), help=f'Number of parallel workers (default: {multiprocessing.cpu_count()})') args = parser.parse_args() print(f"Starting RDF analysis with {args.workers} workers...") main(workers=args.workers) 你要按照我们现在的粗略计算(为准备后续精细化计算)来替换该代码中的逻辑内容
07-12
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值