import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from collections import defaultdict
def read_xyz_file(file_path):
    """
    Read a molecular structure in XYZ format.
    Returns the list of element symbols and an (N, 3) coordinate array.
    """
    try:
        with open(file_path, 'r') as f:
            # Strip whitespace but keep blank lines: dropping them (as the
            # original did) shifts the atom indices when the XYZ comment
            # line (line 1) happens to be empty.
            lines = [line.strip() for line in f.readlines()]
        if len(lines) < 2:
            raise ValueError(f"File {file_path} is too short")
        num_atoms = int(lines[0])
        atoms = []
        coords = []
        # Atom records start at line 2 (line 1 is the XYZ comment line)
        for i in range(2, 2 + num_atoms):
            if i >= len(lines):
                raise ValueError(f"File {file_path} declares more atoms than it contains")
            parts = lines[i].split()
            if len(parts) < 4:
                continue
            atom = parts[0]
            try:
                x, y, z = map(float, parts[1:4])
                atoms.append(atom)
                coords.append([x, y, z])
            except ValueError:
                continue
        return atoms, np.array(coords)
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return [], np.array([])
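
# A minimal usage sketch (not part of the original pipeline; the file name
# and two-atom geometry below are assumptions made for illustration):
def _smoke_test_read_xyz(tmp_path='sample.xyz'):
    """Round-trip a tiny two-atom XYZ file through read_xyz_file."""
    with open(tmp_path, 'w') as f:
        f.write("2\nOH fragment\nO 0.000 0.000 0.000\nH 0.957 0.000 0.000\n")
    atoms, coords = read_xyz_file(tmp_path)
    assert atoms == ['O', 'H'] and coords.shape == (2, 3)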
def extract_basic_features(atoms, coords):
    """
    Extract basic geometric features from a molecular structure.
    """
    if len(atoms) == 0:
        return {}
    # Centroid (unweighted mean of atomic positions)
    centroid = np.mean(coords, axis=0)
    # Pairwise distance matrix
    n_atoms = len(atoms)
    dist_matrix = np.zeros((n_atoms, n_atoms))
    for i in range(n_atoms):
        for j in range(i + 1, n_atoms):
            dist = np.linalg.norm(coords[i] - coords[j])
            dist_matrix[i, j] = dist
            dist_matrix[j, i] = dist
    # Simple summary statistics
    features = {
        'num_atoms': n_atoms,
        'centroid': centroid.tolist(),
        'max_distance': np.max(dist_matrix),
        'min_distance': np.min(dist_matrix[np.nonzero(dist_matrix)]) if n_atoms > 1 else 0,
        'element_counts': {elem: atoms.count(elem) for elem in set(atoms)}
    }
    return features
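
# The O(N^2) Python loop above can be vectorized with NumPy broadcasting; a
# sketch that produces the same distance matrix (assuming coords is (N, 3)):
def pairwise_distances(coords):
    """Vectorized pairwise distance matrix."""
    diff = coords[:, None, :] - coords[None, :, :]
    return np.sqrt((diff ** 2).sum(axis=-1))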
def process_reaction_folder(reaction_folder):
    """
    Process the data in a single reaction folder.
    """
    # Find the XYZ files in the folder
    xyz_files = [f for f in os.listdir(reaction_folder) if f.endswith('.xyz')]
    # Identify each file's role from its name
    reactant_file = None
    ts_file = None
    product_file = None
    for f in xyz_files:
        if 'reactant' in f.lower() or 'r_' in f.lower():
            reactant_file = os.path.join(reaction_folder, f)
        elif 'ts' in f.lower() or 'transition' in f.lower():
            ts_file = os.path.join(reaction_folder, f)
        elif 'product' in f.lower() or 'p_' in f.lower():
            product_file = os.path.join(reaction_folder, f)
    # If the names are not informative, fall back to sorted order
    if not all([reactant_file, ts_file, product_file]) and len(xyz_files) >= 3:
        xyz_files.sort()
        reactant_file, ts_file, product_file = [os.path.join(reaction_folder, f) for f in xyz_files[:3]]
    # Read the structure files
    reactant_atoms, reactant_coords = read_xyz_file(reactant_file) if reactant_file else ([], np.array([]))
    ts_atoms, ts_coords = read_xyz_file(ts_file) if ts_file else ([], np.array([]))
    product_atoms, product_coords = read_xyz_file(product_file) if product_file else ([], np.array([]))
    # Validity check
    if len(reactant_atoms) == 0 or len(ts_atoms) == 0 or len(product_atoms) == 0:
        return None
    # Extract features
    reactant_features = extract_basic_features(reactant_atoms, reactant_coords)
    ts_features = extract_basic_features(ts_atoms, ts_coords)
    product_features = extract_basic_features(product_atoms, product_coords)
    # Displacement from reactant to transition state.
    # Note: per the task statement, atom ordering is already aligned.
    displacement = ts_coords - reactant_coords
    return {
        'reaction_id': os.path.basename(reaction_folder),
        'reactant': {
            'atoms': reactant_atoms,
            'coords': reactant_coords,
            'features': reactant_features
        },
        'ts': {
            'atoms': ts_atoms,
            'coords': ts_coords,
            'features': ts_features
        },
        'product': {
            'atoms': product_atoms,
            'coords': product_coords,
            'features': product_features
        },
        'displacement': displacement
    }
def explore_dataset(data_root):
    """
    Explore the dataset and collect basic statistics.
    """
    # Collect all reaction folders
    reaction_folders = [os.path.join(data_root, d) for d in os.listdir(data_root)
                        if os.path.isdir(os.path.join(data_root, d))]
    print(f"Found {len(reaction_folders)} reaction folders")
    # Process each reaction
    reactions_data = []
    element_counts = defaultdict(int)
    atom_counts = []
    failed_reactions = []
    for folder in tqdm(reaction_folders, desc="Processing reactions"):
        try:
            reaction_data = process_reaction_folder(folder)
            if reaction_data:
                reactions_data.append(reaction_data)
                # Count element occurrences
                for atom in reaction_data['reactant']['atoms']:
                    element_counts[atom] += 1
                # Track atom counts
                atom_counts.append(len(reaction_data['reactant']['atoms']))
            else:
                failed_reactions.append(folder)
        except Exception as e:
            print(f"Error processing reaction {folder}: {e}")
            failed_reactions.append(folder)
    # Dataset statistics
    dataset_info = {
        'total_reactions': len(reactions_data),
        'failed_reactions': len(failed_reactions),
        'elements': dict(element_counts),
        'avg_atoms': np.mean(atom_counts) if atom_counts else 0,
        'min_atoms': min(atom_counts) if atom_counts else 0,
        'max_atoms': max(atom_counts) if atom_counts else 0,
        'sample_reactions': reactions_data[:3] if reactions_data else []  # keep the first 3 reactions as samples
    }
    return dataset_info, reactions_data
def save_dataset_info(dataset_info, output_path):
    """Save the dataset statistics to a JSON file."""
    # Convert numpy types to built-ins so they are JSON-serializable
    def convert_numpy(obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        return obj
    with open(output_path, 'w') as f:
        json.dump(dataset_info, f, indent=2, default=convert_numpy)
def main():
    # Data path - adjust to your actual layout
    train_data_path = "train_data"
    # Make sure the path exists
    if not os.path.exists(train_data_path):
        print(f"Error: data path '{train_data_path}' does not exist")
        return
    # Explore the dataset
    print("Exploring the dataset...")
    dataset_info, reactions_data = explore_dataset(train_data_path)
    # Print the statistics
    print("\nDataset statistics:")
    print(f"Reactions processed: {dataset_info['total_reactions']}")
    print(f"Failed reactions: {dataset_info['failed_reactions']}")
    print(f"Element distribution: {dataset_info['elements']}")
    print(f"Average atom count: {dataset_info['avg_atoms']:.2f}")
    print(f"Minimum atom count: {dataset_info['min_atoms']}")
    print(f"Maximum atom count: {dataset_info['max_atoms']}")
    # Save the dataset statistics
    save_dataset_info(dataset_info, 'dataset_info.json')
    print("Dataset statistics saved to dataset_info.json")
    # Optionally save the processed data for later stages
    if reactions_data:
        np.save('processed_reactions.npy', reactions_data)
        print("Processed reaction data saved to processed_reactions.npy")


if __name__ == "__main__":
    main()
In [ ]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import json

# Chinese font support for matplotlib (if needed)
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels
plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly

# Load the dataset statistics saved earlier
with open('dataset_info.json', 'r') as f:
    dataset_info = json.load(f)
print("数据集详细信息:")print(f"总反应数: {dataset_info['total_reactions']}")print(f"元素分布: O(氧):{dataset_info['elements']['O']}, C(碳):{dataset_info['elements']['C']}, H(氢):{dataset_info['elements']['H']}, N(氮):{dataset_info['elements']['N']}")print(f"平均原子数: {dataset_info['avg_atoms']:.2f}")print(f"原子数范围: {dataset_info['min_atoms']} - {dataset_info['max_atoms']}")
corrected_elements = {
'O': dataset_info['elements']['O'],
'C': dataset_info['elements']['C'],
'H': dataset_info['elements']['H'],
'N': dataset_info['elements']['N']
}
# Visualize the element distribution
plt.figure(figsize=(10, 6))
plt.bar(corrected_elements.keys(), corrected_elements.values())
plt.title('Element distribution')
plt.xlabel('Element')
plt.ylabel('Occurrences')
plt.tight_layout()
plt.savefig('element_distribution.png')
plt.show()
# Load the processed reaction data
try:
    reactions_data = np.load('processed_reactions.npy', allow_pickle=True)
    print(f"Loaded data for {len(reactions_data)} reactions")
    # Analyze reaction types
    reaction_types = []
    for reaction in reactions_data[:100]:  # only the first 100 reactions, to save time
        reactant_atoms = reaction['reactant']['atoms']
        product_atoms = reaction['product']['atoms']
        # Crude reaction-type heuristic based on the change in atom count
        atom_diff = len(product_atoms) - len(reactant_atoms)
        if atom_diff > 0:
            reaction_types.append('addition')
        elif atom_diff < 0:
            reaction_types.append('elimination')
        else:
            reaction_types.append('rearrangement')
    # Tally the reaction types
    type_counts = Counter(reaction_types)
    print("Reaction type distribution (by atom-count change):")
    for rtype, count in type_counts.items():
        print(f"{rtype}: {count}")
    # Visualize the distribution
    plt.figure(figsize=(8, 6))
    plt.pie(type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%')
    plt.title('Reaction type distribution')
    plt.savefig('reaction_type_distribution.png')
    plt.show()
except FileNotFoundError:
    print("processed_reactions.npy not found")
    reactions_data = []
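
# Caveat: the task statement says atom ordering (and hence atom count) is
# aligned between reactant and product, so atom_diff is typically 0 and the
# heuristic above labels nearly everything a rearrangement. A hedged sketch
# of a connectivity-based alternative (the 1.6 Å cutoff is a rough assumed
# bond length, not from the original):
def bond_count(coords, cutoff=1.6):
    """Count atom pairs closer than a distance cutoff."""
    diff = coords[:, None, :] - coords[None, :, :]
    dist = np.sqrt((diff ** 2).sum(axis=-1))
    return int(((dist > 0) & (dist < cutoff)).sum() // 2)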
# Molecular descriptor function
def create_molecular_descriptors(atoms, coords):
    """Compute simple molecular descriptors."""
    if len(atoms) == 0:
        return {}
    # Atomic masses (amu)
    atomic_masses = {
        'H': 1.008, 'C': 12.011, 'N': 14.007, 'O': 15.999,
        'F': 18.998, 'P': 30.974, 'S': 32.065, 'Cl': 35.453
    }
    # Molecular mass
    mass = sum(atomic_masses.get(atom, 12.0) for atom in atoms)
    # Centroid
    centroid = np.mean(coords, axis=0)
    # Inertia tensor about the centroid
    centered_coords = coords - centroid
    inertia_tensor = np.zeros((3, 3))
    for i in range(len(atoms)):
        x, y, z = centered_coords[i]
        mass_i = atomic_masses.get(atoms[i], 12.0)
        inertia_tensor[0, 0] += mass_i * (y**2 + z**2)
        inertia_tensor[1, 1] += mass_i * (x**2 + z**2)
        inertia_tensor[2, 2] += mass_i * (x**2 + y**2)
        inertia_tensor[0, 1] -= mass_i * x * y
        inertia_tensor[0, 2] -= mass_i * x * z
        inertia_tensor[1, 2] -= mass_i * y * z
    # Symmetrize the off-diagonal terms
    inertia_tensor[1, 0] = inertia_tensor[0, 1]
    inertia_tensor[2, 0] = inertia_tensor[0, 2]
    inertia_tensor[2, 1] = inertia_tensor[1, 2]
    # Principal moments of inertia
    principal_moments = np.linalg.eigvalsh(inertia_tensor)
    # Gyration-radius-like quantity derived from the inertia trace
    gyration_radius = np.sqrt(np.sum(principal_moments) / mass) if mass > 0 else 0
    return {
        'molecular_mass': mass,
        'centroid': centroid.tolist(),
        'principal_moments': principal_moments.tolist(),
        'gyration_radius': gyration_radius
    }
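
# Note: trace(I) = 2 * sum_i m_i * r_i^2, so the quantity above is sqrt(2)
# times the conventional mass-weighted radius of gyration. A direct version,
# kept separate so the original descriptor stays unchanged (hedged sketch):
def radius_of_gyration(atoms, coords, atomic_masses):
    """Mass-weighted radius of gyration about the center of mass."""
    masses = np.array([atomic_masses.get(a, 12.0) for a in atoms])
    centered = coords - np.average(coords, axis=0, weights=masses)
    return np.sqrt((masses * (centered ** 2).sum(axis=1)).sum() / masses.sum())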
# Build the feature dataset
if len(reactions_data) > 0:
    features_list = []
    for i, reaction in enumerate(reactions_data[:500]):  # only the first 500 reactions, to save time
        if i % 100 == 0:
            print(f"Processing reaction {i}...")
        reaction_id = reaction['reaction_id']
        # Reactant descriptors
        reactant_features = create_molecular_descriptors(
            reaction['reactant']['atoms'],
            reaction['reactant']['coords']
        )
        # Product descriptors
        product_features = create_molecular_descriptors(
            reaction['product']['atoms'],
            reaction['product']['coords']
        )
        # Transition-state descriptors
        ts_features = create_molecular_descriptors(
            reaction['ts']['atoms'],
            reaction['ts']['coords']
        )
        # Differences between product and reactant
        mass_change = product_features['molecular_mass'] - reactant_features['molecular_mass']
        gyration_change = product_features['gyration_radius'] - reactant_features['gyration_radius']
        features = {
            'reaction_id': reaction_id,
            'num_atoms': len(reaction['reactant']['atoms']),
            'reactant_mass': reactant_features['molecular_mass'],
            'product_mass': product_features['molecular_mass'],
            'ts_mass': ts_features['molecular_mass'],
            'mass_change': mass_change,
            'reactant_gyration': reactant_features['gyration_radius'],
            'product_gyration': product_features['gyration_radius'],
            'ts_gyration': ts_features['gyration_radius'],
            'gyration_change': gyration_change
        }
        features_list.append(features)
    # Build a DataFrame and save it
    feature_df = pd.DataFrame(features_list)
    feature_df.to_csv('molecular_features.csv', index=False)
    print("Molecular features saved to molecular_features.csv")
    # Visualize the feature distributions
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes[0, 0].hist(feature_df['num_atoms'], bins=20, alpha=0.7, color='skyblue')
    axes[0, 0].set_title('Atom count distribution')
    axes[0, 0].set_xlabel('Atom count')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 1].hist(feature_df['reactant_mass'], bins=20, alpha=0.7, color='lightgreen')
    axes[0, 1].set_title('Reactant molecular mass distribution')
    axes[0, 1].set_xlabel('Molecular mass')
    axes[0, 1].set_ylabel('Frequency')
    axes[1, 0].hist(feature_df['mass_change'], bins=20, alpha=0.7, color='lightcoral')
    axes[1, 0].set_title('Mass change distribution')
    axes[1, 0].set_xlabel('Mass change')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 1].hist(feature_df['gyration_change'], bins=20, alpha=0.7, color='gold')
    axes[1, 1].set_title('Gyration radius change distribution')
    axes[1, 1].set_xlabel('Gyration radius change')
    axes[1, 1].set_ylabel('Frequency')
    plt.tight_layout()
    plt.savefig('feature_distributions.png')
    plt.show()
    # Feature correlations
    correlation_matrix = feature_df[['num_atoms', 'reactant_mass', 'product_mass',
                                     'mass_change', 'reactant_gyration', 'product_gyration',
                                     'gyration_change']].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Feature correlation matrix')
    plt.tight_layout()
    plt.savefig('feature_correlation.png')
    plt.show()

print("\nNext steps:")
print("1. Explore reaction patterns with the feature dataset")
print("2. Start building machine learning models")
print("3. Consider graph neural networks (GNNs) for molecular structure")
In [ ]
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# Load the feature data
try:
    feature_df = pd.read_csv('molecular_features.csv')
    print(f"Loaded features for {len(feature_df)} reactions")
    # Prepare features and target.
    # Here we use simple descriptors to predict the transition state's
    # gyration radius; the real task is to predict atomic coordinates,
    # which needs a far richer model.
    X = feature_df[['num_atoms', 'reactant_mass', 'product_mass',
                    'reactant_gyration', 'product_gyration']].values
    y = feature_df['ts_gyration'].values
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )
    # Train a random forest
    print("Training the random forest model...")
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    # Evaluate
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    print("Model performance:")
    print(f"Root mean squared error (RMSE): {rmse:.4f}")
    print(f"Mean gyration radius: {np.mean(y):.4f}")
    print(f"RMSE / mean: {rmse / np.mean(y) * 100:.2f}%")
    # Feature importances
    feature_importance = pd.DataFrame({
        'feature': ['num_atoms', 'reactant_mass', 'product_mass',
                    'reactant_gyration', 'product_gyration'],
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nFeature importances:")
    print(feature_importance)
except FileNotFoundError:
    print("molecular_features.csv not found; run the data analysis script first")

print("\nNext steps:")
print("1. This baseline only predicts a single scalar quantity")
print("2. The real task requires predicting the full 3D molecular structure")
print("3. Consider a graph neural network (GNN) or a generative model for that task")
In [ ]
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import json

# Fix random seeds for reproducibility
paddle.seed(42)
np.random.seed(42)

# Map atom types to indices
ATOM_MAPPING = {'H': 0, 'C': 1, 'N': 2, 'O': 3}
NUM_ATOM_TYPES = len(ATOM_MAPPING)
class MolecularFeatureExtractor(nn.Layer):
    """Molecular feature extractor - no graph structure."""
    def __init__(self, input_dim, hidden_dim=64):
        super(MolecularFeatureExtractor, self).__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn = nn.BatchNorm1D(hidden_dim)

    def forward(self, x):
        # x: [batch, input_dim], a flattened per-molecule feature vector
        x = F.relu(self.linear1(x))
        x = self.bn(x)
        x = F.relu(self.linear2(x))
        # Bug fix: the original applied mean pooling over axis=1 here, but x
        # is already [batch, hidden_dim], so pooling would collapse the hidden
        # dimension and break the concat in the predictor below. Return the
        # hidden vector as-is.
        return x
class TransitionStatePredictor(nn.Layer):
    """PaddlePaddle transition-state predictor - no graph structure."""
    def __init__(self, feature_dim, hidden_dim=64, output_dim=3, max_atoms=23):
        super(TransitionStatePredictor, self).__init__()
        self.max_atoms = max_atoms
        # Separate encoders for reactant and product
        self.reactant_encoder = MolecularFeatureExtractor(feature_dim, hidden_dim)
        self.product_encoder = MolecularFeatureExtractor(feature_dim, hidden_dim)
        # Fusion layers
        self.fusion_linear1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fusion_linear2 = nn.Linear(hidden_dim, hidden_dim)
        # Coordinate prediction head
        self.coord_predictor = nn.Linear(hidden_dim, output_dim * max_atoms)

    def forward(self, reactant_feat, product_feat):
        # Encode the reactant
        rx = self.reactant_encoder(reactant_feat)
        # Encode the product
        px = self.product_encoder(product_feat)
        # Fuse reactant and product information
        # (paddle.concat, not pandas.concat, along the feature axis)
        fused = paddle.concat([rx, px], axis=1)
        fused = F.relu(self.fusion_linear1(fused))
        fused = F.relu(self.fusion_linear2(fused))
        # Predict the coordinate displacement
        coord_shift = self.coord_predictor(fused)
        coord_shift = coord_shift.reshape([-1, self.max_atoms, 3])
        return coord_shift
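
# A shape smoke test (hedged; not in the original). With max_atoms=23 and the
# 4-type atom mapping above, feature_dim = 23 * (4 + 3) = 161:
def _check_predictor_shapes(max_atoms=23):
    dim = max_atoms * (NUM_ATOM_TYPES + 3)
    m = TransitionStatePredictor(dim, hidden_dim=64, output_dim=3, max_atoms=max_atoms)
    m.eval()  # avoid BatchNorm statistics updates on the dummy batch
    out = m(paddle.zeros([2, dim]), paddle.zeros([2, dim]))
    assert out.shape == [2, max_atoms, 3]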
def create_molecular_features(atoms, coords, max_atoms=23):
    """Build a fixed-length molecular feature vector."""
    num_nodes = min(len(atoms), max_atoms)
    # Per-atom features: one-hot atom type + coordinates
    node_features = []
    for i in range(num_nodes):
        atom = atoms[i]
        coord = coords[i]
        # One-hot encode the atom type
        atom_idx = ATOM_MAPPING.get(atom, 0)
        atom_one_hot = [0] * NUM_ATOM_TYPES
        atom_one_hot[atom_idx] = 1
        # Concatenate: atom type + coordinates
        node_feature = atom_one_hot + coord.tolist()
        node_features.append(node_feature)
    # Zero-pad up to max_atoms
    if num_nodes < max_atoms:
        for _ in range(max_atoms - num_nodes):
            node_features.append([0] * (NUM_ATOM_TYPES + 3))
    # Flatten to a single vector
    flattened_features = np.array(node_features).flatten()
    return flattened_features
def prepare_paddle_data(reactions_data, max_atoms=23):
    """Prepare tensors for PaddlePaddle."""
    reactant_features = []
    product_features = []
    displacements = []
    for reaction in tqdm(reactions_data, desc="Preparing molecular features"):
        # Reactant
        reactant_atoms = reaction['reactant']['atoms']
        reactant_coords = reaction['reactant']['coords']
        reactant_feat = create_molecular_features(reactant_atoms, reactant_coords, max_atoms)
        # Product
        product_atoms = reaction['product']['atoms']
        product_coords = reaction['product']['coords']
        product_feat = create_molecular_features(product_atoms, product_coords, max_atoms)
        # Displacement (reactant to transition state)
        displacement = reaction['displacement']
        # Zero-pad up to max_atoms
        if displacement.shape[0] < max_atoms:
            padding = np.zeros((max_atoms - displacement.shape[0], 3))
            displacement = np.vstack([displacement, padding])
        reactant_features.append(reactant_feat)
        product_features.append(product_feat)
        displacements.append(displacement.flatten().astype('float32'))
    # Convert to Paddle tensors
    reactant_features = paddle.to_tensor(np.array(reactant_features, dtype='float32'))
    product_features = paddle.to_tensor(np.array(product_features, dtype='float32'))
    displacements = paddle.to_tensor(np.array(displacements, dtype='float32'))
    return reactant_features, product_features, displacements
def train_paddle_model(model, data, num_epochs=50, learning_rate=0.001):
    """Train the PaddlePaddle model."""
    reactant_features, product_features, displacements = data
    # Optimizer and loss
    optimizer = paddle.optimizer.Adam(parameters=model.parameters(), learning_rate=learning_rate)
    criterion = nn.MSELoss()
    train_losses = []
    # Build a simple in-memory dataset
    dataset = [(reactant_features[i], product_features[i], displacements[i])
               for i in range(len(reactant_features))]
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        # Note: single-sample batches make BatchNorm1D statistics degenerate;
        # see the batched-loader sketch after this function.
        for i in tqdm(range(len(dataset)), desc=f"Epoch {epoch+1}/{num_epochs}"):
            rx, px, disp = dataset[i]
            # Forward pass (batch of one)
            pred_displacement = model(rx.unsqueeze(0), px.unsqueeze(0))
            # Loss
            loss = criterion(pred_displacement, disp.reshape(pred_displacement.shape))
            # Backward pass
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # float(loss) works for both 0-d and 1-element loss tensors,
            # unlike the original loss.numpy()[0]
            total_loss += float(loss)
        avg_loss = total_loss / len(dataset)
        train_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}")
    return model, train_losses
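
# A hedged alternative to the per-sample loop above (assumes the Paddle 2.x
# paddle.io API): mini-batches also let BatchNorm1D see more than one sample.
def make_batched_loader(data, batch_size=32):
    reactant_features, product_features, displacements = data
    dataset = paddle.io.TensorDataset([reactant_features, product_features, displacements])
    return paddle.io.DataLoader(dataset, batch_size=batch_size, shuffle=True)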
# 主程序if __name__ == "__main__":
# 加载处理后的反应数据
try:
reactions_data = np.load('processed_reactions.npy', allow_pickle=True)
print(f"成功加载 {len(reactions_data)} 个反应的数据")
# 只使用前100个反应进行演示(减少计算时间)
sample_reactions = reactions_data[:100]
# 准备数据
max_atoms = 23
data = prepare_paddle_data(sample_reactions, max_atoms)
# 初始化模型
feature_dim = max_atoms * (NUM_ATOM_TYPES + 3) # 每个分子的特征维度
model = TransitionStatePredictor(feature_dim, hidden_dim=64, output_dim=3, max_atoms=max_atoms)
# 训练模型
print("开始训练PaddlePaddle模型...")
trained_model, train_losses = train_paddle_model(
model, data, num_epochs=10, learning_rate=0.001
)
# 保存模型
paddle.save(trained_model.state_dict(), 'ts_predictor_paddle.pdparams')
print("模型已保存到 ts_predictor_paddle.pdparams")
# 绘制训练损失曲线
plt.figure(figsize=(10, 6))
plt.plot(train_losses)
plt.title('训练损失曲线')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.savefig('training_loss_paddle.png')
plt.show()
except FileNotFoundError:
print("未找到 processed_reactions.npy 文件,请先运行数据预处理脚本")
In [ ]
import paddle
import paddle.nn as nn
import numpy as np
import os
import json

# Simple linear interpolation model
class LinearInterpolationModel:
    """Simple linear interpolation model - the baseline."""
    def predict(self, reactant_coords, product_coords):
        """
        Predict the transition state by linear interpolation,
        assuming it lies midway between reactant and product.
        """
        return (reactant_coords + product_coords) / 2
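
# A hedged generalization (not in the original): interpolate at an arbitrary
# fraction t along the reactant-to-product path; t=0.5 reproduces predict().
def interpolate_structures(reactant_coords, product_coords, t=0.5):
    """Linear interpolation between aligned reactant and product coordinates."""
    return (1.0 - t) * reactant_coords + t * product_coords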
# 主程序if __name__ == "__main__":
# 加载处理后的反应数据
try:
reactions_data = np.load('processed_reactions.npy', allow_pickle=True)
print(f"成功加载 {len(reactions_data)} 个反应的数据")
# 使用简单的线性插值模型
model = LinearInterpolationModel()
# 评估模型
total_error = 0
count = 0
for reaction in reactions_data[:100]: # 只评估前100个反应
reactant_coords = reaction['reactant']['coords']
product_coords = reaction['product']['coords']
true_ts_coords = reaction['ts']['coords']
# 预测过渡态
pred_ts_coords = model.predict(reactant_coords, product_coords)
# 计算误差
error = np.sqrt(np.mean((true_ts_coords - pred_ts_coords)**2))
total_error += error
count += 1
if count % 10 == 0:
print(f"反应 {count}, RMSE: {error:.4f} Å")
avg_error = total_error / count
print(f"平均RMSE: {avg_error:.4f} Å")
# 保存简单的预测结果
predictions = []
for reaction in reactions_data[:100]:
reactant_coords = reaction['reactant']['coords']
product_coords = reaction['product']['coords']
pred_ts_coords = model.predict(reactant_coords, product_coords)
# 格式化预测结果
coords_list = []
for i, atom in enumerate(reaction['reactant']['atoms']):
coords_list.append([atom, float(pred_ts_coords[i, 0]),
float(pred_ts_coords[i, 1]), float(pred_ts_coords[i, 2])])
predictions.append({
"reaction_id": reaction['reaction_id'],
"coordinates": json.dumps(coords_list)
})
# 保存为CSV文件
import csv
with open('simple_predictions.csv', 'w', newline='') as csvfile:
fieldnames = ['reaction_id', 'coordinates']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for pred in predictions:
writer.writerow(pred)
print("简单预测结果已保存到 simple_predictions.csv")
# 生成XYZ文件
os.makedirs('ts_predictions', exist_ok=True)
for reaction in reactions_data[:100]:
reactant_coords = reaction['reactant']['coords']
product_coords = reaction['product']['coords']
pred_ts_coords = model.predict(reactant_coords, product_coords)
# 保存为XYZ格式
xyz_file = os.path.join('ts_predictions', f"{reaction['reaction_id']}.xyz")
with open(xyz_file, 'w') as f:
f.write(f"{len(reaction['reactant']['atoms'])}\n")
f.write(f"Predicted transition state for {reaction['reaction_id']}\n")
for i, atom in enumerate(reaction['reactant']['atoms']):
f.write(f"{atom} {pred_ts_coords[i, 0]:.6f} {pred_ts_coords[i, 1]:.6f} {pred_ts_coords[i, 2]:.6f}\n")
# 压缩XYZ文件
import zipfile
with zipfile.ZipFile("ts_predictions.zip", 'w') as zipf:
for root, dirs, files in os.walk('ts_predictions'):
for file in files:
if file.endswith('.xyz'):
zipf.write(os.path.join(root, file), file)
print("XYZ文件已压缩为 ts_predictions.zip")
except FileNotFoundError:
print("未找到 processed_reactions.npy 文件,请先运行数据预处理脚本")
In [12]
import os
import subprocess
import pandas as pd

# Check whether the evaluation script exists
if os.path.exists('get_score.py'):
    try:
        # Run the evaluation script
        result = subprocess.run(['python', 'get_score.py', 'simple_predictions.csv'],
                                capture_output=True, text=True)
        print("Evaluation script output:")
        print("STDOUT:", result.stdout)
        print("STDERR:", result.stderr)
        print("Return code:", result.returncode)
        # Check whether result.txt was produced
        if os.path.exists('result.txt'):
            with open('result.txt', 'r') as f:
                result_content = f.read()
            print("Contents of result.txt:")
            print(result_content)
    except Exception as e:
        print(f"Error running the evaluation script: {e}")
else:
    print("get_score.py not found")
Evaluation script output:
STDOUT: Evaluation results saved to: /home/aistudio/result.txt
STDERR:
Return code: 0
Contents of result.txt:
Evaluation result: evaluation failed: submission file does not exist: simple_predictions.csv
Status code: 1
==================================================
Raw JSON data (verbatim evaluator output):
{
  "score": 0.0,
  "errorMsg": "评测失败: 提交文件不存在: simple_predictions.csv",
  "code": 1,
  "data": [
    {
      "score": 0
    }
  ]
}
In [14]
import os
import numpy as np
import json
import csv
import zipfile
from tqdm import tqdm
# Re-define the XYZ reader so this cell is self-contained
def read_xyz_file(file_path):
    """Read a molecular structure in XYZ format."""
    try:
        with open(file_path, 'r') as f:
            # Strip whitespace but keep blank lines so a blank comment line
            # (line 1) does not shift the atom indices
            lines = [line.strip() for line in f.readlines()]
        if len(lines) < 2:
            raise ValueError(f"File {file_path} is too short")
        num_atoms = int(lines[0])
        atoms = []
        coords = []
        # Atom records start at line 2 (line 1 is the XYZ comment line)
        for i in range(2, 2 + num_atoms):
            if i >= len(lines):
                raise ValueError(f"File {file_path} declares more atoms than it contains")
            parts = lines[i].split()
            if len(parts) < 4:
                continue
            atom = parts[0]
            try:
                x, y, z = map(float, parts[1:4])
                atoms.append(atom)
                coords.append([x, y, z])
            except ValueError:
                continue
        return atoms, np.array(coords)
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return [], np.array([])
# Linear interpolation model
class LinearInterpolationModel:
    """Simple linear interpolation model - the baseline."""
    def predict(self, reactant_coords, product_coords):
        """
        Predict the transition state by linear interpolation,
        assuming it lies midway between reactant and product.
        """
        return (reactant_coords + product_coords) / 2
def generate_predictions_for_test_set(test_data_path, output_csv="pre.csv"):
    """Generate predictions for the test set."""
    # Check that the test-set path exists
    if not os.path.exists(test_data_path):
        print(f"Error: test-set path '{test_data_path}' does not exist")
        return None
    # Collect all test reaction folders
    test_folders = [f for f in os.listdir(test_data_path)
                    if os.path.isdir(os.path.join(test_data_path, f))]
    # The test set is expected to contain 500 reactions
    if len(test_folders) != 500:
        print(f"Warning: the test set contains {len(test_folders)} reactions, but 500 were expected")
    print(f"Found {len(test_folders)} test reactions")
    # Use the linear interpolation model
    model = LinearInterpolationModel()
    predictions = []
    # Process each test reaction
    for folder in tqdm(test_folders, desc="Generating test-set predictions"):
        reaction_id = folder  # use the folder name as the reaction ID
        try:
            # Try several file-naming conventions
            reactant_files = [
                os.path.join(test_data_path, folder, "reactant.xyz"),
                os.path.join(test_data_path, folder, "r.xyz"),
                os.path.join(test_data_path, folder, "R.xyz")
            ]
            product_files = [
                os.path.join(test_data_path, folder, "product.xyz"),
                os.path.join(test_data_path, folder, "p.xyz"),
                os.path.join(test_data_path, folder, "P.xyz")
            ]
            # Pick the reactant file that actually exists
            reactant_file = None
            for f in reactant_files:
                if os.path.exists(f):
                    reactant_file = f
                    break
            # Pick the product file that actually exists
            product_file = None
            for f in product_files:
                if os.path.exists(f):
                    product_file = f
                    break
            if not reactant_file or not product_file:
                print(f"Warning: reaction {reaction_id} is missing a reactant or product file")
                # Record an empty prediction
                predictions.append({
                    "reaction_id": reaction_id,
                    "coordinates": "[]"
                })
                continue
            # Read the reactant and product structures
            reactant_atoms, reactant_coords = read_xyz_file(reactant_file)
            product_atoms, product_coords = read_xyz_file(product_file)
            # Make sure reactant and product have the same atom count
            if len(reactant_atoms) != len(product_atoms):
                print(f"Warning: reaction {reaction_id} has different reactant and product atom counts")
                # Truncate to the smaller count
                min_atoms = min(len(reactant_atoms), len(product_atoms))
                reactant_atoms = reactant_atoms[:min_atoms]
                reactant_coords = reactant_coords[:min_atoms]
                product_atoms = product_atoms[:min_atoms]
                product_coords = product_coords[:min_atoms]
            # Predict the transition state
            pred_ts_coords = model.predict(reactant_coords, product_coords)
            # Format the coordinates as [element, x, y, z] rows - fixed JSON format
            coords_list = []
            for i, atom in enumerate(reactant_atoms):
                coords_list.append([
                    atom,
                    float(pred_ts_coords[i, 0]),
                    float(pred_ts_coords[i, 1]),
                    float(pred_ts_coords[i, 2])
                ])
            # Serialize the coordinate list to a JSON string
            coords_json = json.dumps(coords_list)
            # Record the prediction
            predictions.append({
                "reaction_id": reaction_id,
                "coordinates": coords_json
            })
        except Exception as e:
            print(f"Error processing reaction {reaction_id}: {e}")
            # Record an empty prediction
            predictions.append({
                "reaction_id": reaction_id,
                "coordinates": "[]"
            })
    # Sort the predictions by reaction ID
    predictions.sort(key=lambda x: x["reaction_id"])
    # Save as a CSV file
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['reaction_id', 'coordinates']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for pred in predictions:
            writer.writerow(pred)
    print(f"Predictions saved to {output_csv}")
    print(f"Generated {len(predictions)} predictions")
    return output_csv
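
# A hedged sanity check before submitting (assumption: the evaluator expects
# a reaction_id column plus a JSON list of [element, x, y, z] rows).
# Re-parsing every coordinates cell catches malformed JSON early:
def validate_submission_csv(path):
    with open(path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            rows = json.loads(row['coordinates'])
            assert all(len(r) == 4 for r in rows), f"bad row in {row['reaction_id']}"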
def generate_xyz_files_for_test_set(test_data_path, output_dir="ts_predictions"):
    """Generate XYZ files for the test set."""
    # Check that the test-set path exists
    if not os.path.exists(test_data_path):
        print(f"Error: test-set path '{test_data_path}' does not exist")
        return None
    # Collect all test reaction folders
    test_folders = [f for f in os.listdir(test_data_path)
                    if os.path.isdir(os.path.join(test_data_path, f))]
    print(f"Found {len(test_folders)} test reactions")
    # Use the linear interpolation model
    model = LinearInterpolationModel()
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)
    # Process each test reaction
    for folder in tqdm(test_folders, desc="Generating XYZ files"):
        reaction_id = folder  # use the folder name as the reaction ID
        try:
            # Try several file-naming conventions
            reactant_files = [
                os.path.join(test_data_path, folder, "reactant.xyz"),
                os.path.join(test_data_path, folder, "r.xyz"),
                os.path.join(test_data_path, folder, "R.xyz")
            ]
            product_files = [
                os.path.join(test_data_path, folder, "product.xyz"),
                os.path.join(test_data_path, folder, "p.xyz"),
                os.path.join(test_data_path, folder, "P.xyz")
            ]
            # Pick the reactant file that actually exists
            reactant_file = None
            for f in reactant_files:
                if os.path.exists(f):
                    reactant_file = f
                    break
            # Pick the product file that actually exists
            product_file = None
            for f in product_files:
                if os.path.exists(f):
                    product_file = f
                    break
            if not reactant_file or not product_file:
                print(f"Warning: reaction {reaction_id} is missing a reactant or product file")
                continue
            # Read the reactant and product structures
            reactant_atoms, reactant_coords = read_xyz_file(reactant_file)
            product_atoms, product_coords = read_xyz_file(product_file)
            # Make sure reactant and product have the same atom count
            if len(reactant_atoms) != len(product_atoms):
                print(f"Warning: reaction {reaction_id} has different reactant and product atom counts")
                # Truncate to the smaller count
                min_atoms = min(len(reactant_atoms), len(product_atoms))
                reactant_atoms = reactant_atoms[:min_atoms]
                reactant_coords = reactant_coords[:min_atoms]
                product_atoms = product_atoms[:min_atoms]
                product_coords = product_coords[:min_atoms]
            # Predict the transition state
            pred_ts_coords = model.predict(reactant_coords, product_coords)
            # One subfolder per reaction
            reaction_dir = os.path.join(output_dir, reaction_id)
            os.makedirs(reaction_dir, exist_ok=True)
            # Write in XYZ format
            xyz_file = os.path.join(reaction_dir, "ts.xyz")
            with open(xyz_file, 'w') as f:
                f.write(f"{len(reactant_atoms)}\n")
                f.write(f"Predicted transition state for {reaction_id}\n")
                for i, atom in enumerate(reactant_atoms):
                    f.write(f"{atom} {pred_ts_coords[i, 0]:.6f} {pred_ts_coords[i, 1]:.6f} {pred_ts_coords[i, 2]:.6f}\n")
        except Exception as e:
            print(f"Error processing reaction {reaction_id}: {e}")
    # Zip the XYZ files
    zip_path = "ts_predictions.zip"
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for root, dirs, files in os.walk(output_dir):
            for file in files:
                if file.endswith('.xyz'):
                    file_path = os.path.join(root, file)
                    # Preserve the directory structure inside the ZIP
                    arcname = os.path.relpath(file_path, output_dir)
                    zipf.write(file_path, arcname)
    print(f"XYZ files zipped into {zip_path}")
    return zip_path
# 主程序if __name__ == "__main__":
# 设置测试集路径
test_data_path = "test_data" # 根据实际路径修改
# 生成CSV预测文件
csv_file = generate_predictions_for_test_set(test_data_path)
# 生成XYZ文件
zip_file = generate_xyz_files_for_test_set(test_data_path)
# 评估预测结果
if csv_file and os.path.exists('get_score.py'):
import subprocess
print("\n运行评估脚本...")
result = subprocess.run(['python', 'get_score.py', csv_file],
capture_output=True, text=True)
print("评估脚本输出:")
print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)
print("返回码:", result.returncode)
# 检查是否生成了result.txt
if os.path.exists('result.txt'):
with open('result.txt', 'r') as f:
result_content = f.read()
print("result.txt内容:")
print(result_content)
print("\n下一步:")
print("1. 检查预测文件格式是否正确")
print("2. 如果评估通过,提交 CSV 文件和 ZIP 文件")
print("3. 如果评估失败,检查错误信息并修正")
Write code similar to this that runs successfully on PaddlePaddle AI Studio and can be submitted successfully to the competition, preserving the JSON handling from the code provided.