In the previous article, we explored applications of reinforcement learning to robot control. This article takes a close look at graph generative models and their use in molecular design, an interdisciplinary field that combines deep learning with chemistry. We will implement a graph-neural-network-based molecular generative model with PyTorch Geometric; for clarity, the walkthrough below uses a small toy set of SMILES strings, and the same pipeline extends directly to molecule generation experiments on datasets such as ZINC250k.
I. Fundamentals of Graph Generative Models
1. Molecular Representations

| Representation | Advantages | Disadvantages |
| --- | --- | --- |
| SMILES string | Compact, easy to store | Strict syntactic constraints |
| Molecular graph | Preserves structural information | Requires specialized processing |
| 3D conformation | Captures spatial information | High computational cost |
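To make the table concrete, here is a small RDKit sketch that walks the same molecule (ethanol) through all three representations; the `randomSeed` value is arbitrary and chosen only for reproducibility.

```python
from rdkit import Chem
from rdkit.Chem import AllChem

smiles = "CCO"                               # 1) SMILES string: a compact text encoding
mol = Chem.MolFromSmiles(smiles)             # 2) molecular graph: atoms as nodes, bonds as edges
print(mol.GetNumAtoms(), mol.GetNumBonds())  # 3 heavy atoms, 2 bonds

mol3d = Chem.AddHs(mol)                      # 3) 3D conformation: embed explicit coordinates
AllChem.EmbedMolecule(mol3d, randomSeed=42)
print(mol3d.GetConformer().GetPositions().shape)  # (9, 3): xyz per atom, hydrogens included
```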
2. A Taxonomy of Graph Generative Models

```python
class GraphGenModels:
    def __init__(self):
        self.autoregressive = ["GraphRNN", "MolGPT"]  # generate the graph step by step
        self.one_shot = ["VAE", "GAN"]                # generate the whole graph at once
        self.flow_based = ["GraphNVP", "MoFlow"]      # invertible transformations
```
3. Evaluation Metrics for Molecule Generation

```python
from rdkit import Chem

def calculate_metrics(generated_mols, train_smiles):
    # All three metrics are reported as fractions of the total number of generated molecules.
    n = len(generated_mols)
    valid_smiles = [Chem.MolToSmiles(m) for m in generated_mols if m is not None]
    validity = len(valid_smiles) / n
    uniqueness = len(set(valid_smiles)) / n
    novelty = len([s for s in valid_smiles if s not in train_smiles]) / n
    return {"validity": validity, "uniqueness": uniqueness, "novelty": novelty}
```
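A quick, hypothetical usage example of the metric function; both the sampled molecules and the training set below are made up for illustration.

```python
# Toy example: two duplicates of ethanol plus phenol, scored against a tiny training set.
mols = [Chem.MolFromSmiles(s) for s in ["CCO", "CCO", "c1ccccc1O"]]
print(calculate_metrics(mols, train_smiles={"CCO", "CCN"}))
# -> validity 1.0, uniqueness ~0.67 (one duplicate), novelty ~0.33 (only phenol is new)
```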
II. Molecular Design with a Graph Variational Autoencoder
1. Environment Setup

```bash
pip install torch torch-geometric rdkit networkx matplotlib
```
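If the install succeeds, a quick import check confirms the environment is ready; the printed versions will vary by setup.

```python
# Optional sanity check that the key packages import correctly.
import torch
import torch_geometric
from rdkit import rdBase

print(torch.__version__, torch_geometric.__version__, rdBase.rdkitVersion)
```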
2. Molecular Graph Data Processing

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data, Dataset
from torch_geometric.nn import GCNConv, global_add_pool
from torch_geometric.loader import DataLoader
from rdkit import Chem

# 1. Dataset definition
class MoleculeDataset(Dataset):
    def __init__(self, smiles_list):
        super().__init__()
        self.smiles_list = smiles_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        smiles = self.smiles_list[idx]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES at index {idx}: {smiles}")
        # Per-atom features: atomic number, degree, formal charge, aromaticity flag
        atom_features = []
        for atom in mol.GetAtoms():
            atom_features.append([
                float(atom.GetAtomicNum()),
                float(atom.GetDegree()),
                float(atom.GetFormalCharge()),
                float(atom.GetIsAromatic()),
            ])
        # Edge index: add both directions so the graph is undirected
        edge_index = []
        for bond in mol.GetBonds():
            i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            edge_index.append([i, j])
            edge_index.append([j, i])
        # Convert to the PyG Data format; guard against bond-free molecules
        x = torch.tensor(atom_features, dtype=torch.float)
        if edge_index:
            edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        else:
            edge_index = torch.empty((2, 0), dtype=torch.long)
        return Data(x=x, edge_index=edge_index, smiles=smiles)
```
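A quick smoke test of the dataset class, assuming the code above; ethanol has 3 heavy atoms and 2 bonds, so we expect 4 directed edges.

```python
dataset = MoleculeDataset(["CCO", "c1ccccc1"])
data = dataset[0]
print(data)    # Data(x=[3, 4], edge_index=[2, 4], smiles='CCO')
print(data.x)  # raw atom features: atomic number, degree, charge, aromaticity
```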
```python
# 2. Feature-normalized dataset
class NormalizedMoleculeDataset(MoleculeDataset):
    def __init__(self, smiles_list):
        super().__init__(smiles_list)
        self.feat_means = None
        self.feat_stds = None

    def _init_stats(self):
        # Lazily estimate per-feature mean/std from up to the first 100 molecules
        if self.feat_means is None:
            temp_data = [MoleculeDataset.__getitem__(self, i)
                         for i in range(min(100, len(self)))]
            all_features = torch.cat([d.x for d in temp_data], dim=0)
            self.feat_means = torch.mean(all_features, dim=0, keepdim=True)
            self.feat_stds = torch.std(all_features, dim=0, keepdim=True) + 1e-6

    def __getitem__(self, idx):
        self._init_stats()
        data = super().__getitem__(idx)
        data.x = (data.x - self.feat_means) / self.feat_stds
        return data
```
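The statistics are computed lazily on first access, from at most the first 100 molecules, so construction stays cheap even for large SMILES lists. A quick check of the effect, assuming the class above:

```python
# Constant feature columns (e.g., formal charge here) normalize to zero.
norm_ds = NormalizedMoleculeDataset(["CCO", "CCN", "CC(=O)O"])
print(norm_ds[0].x)  # roughly zero-mean columns instead of raw values
```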
3. Implementing the Graph Variational Autoencoder

```python
# 3. GVAE model
class StableGVAE(nn.Module):
    def __init__(self, node_dim, latent_dim):
        super().__init__()
        self.node_dim = node_dim
        self.latent_dim = latent_dim
        self.training_step = 0  # incremented by the training loop, drives KL annealing
        # Encoder: three GCN layers with LayerNorm
        self.conv1 = GCNConv(node_dim, 128)
        self.norm1 = nn.LayerNorm(128)
        self.conv2 = GCNConv(128, 256)
        self.norm2 = nn.LayerNorm(256)
        self.conv3 = GCNConv(256, 512)
        self.norm3 = nn.LayerNorm(512)
        # Latent-space heads
        self.mean = nn.Linear(512, latent_dim)
        self.logstd = nn.Linear(512, latent_dim)
        self.logstd_max = 2  # clamp log-std to keep sampling numerically stable
        # Decoder: maps a graph-level latent back to node features
        self.node_decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Linear(128, node_dim),
        )
        # Weight initialization
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def encode(self, x, edge_index, batch):
        x = F.relu(self.norm1(self.conv1(x, edge_index)))
        x = F.relu(self.norm2(self.conv2(x, edge_index)))
        x = F.relu(self.norm3(self.conv3(x, edge_index)))
        x = global_add_pool(x, batch)  # one embedding per graph
        mean = self.mean(x)
        logstd = torch.clamp(self.logstd(x), max=self.logstd_max)
        return mean, logstd

    def reparameterize(self, mean, logstd):
        std = torch.exp(logstd)
        eps = torch.randn_like(std)
        return mean + eps * std

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # Encode
        mean, logstd = self.encode(x, edge_index, batch)
        z = self.reparameterize(mean, logstd)
        # Decode: broadcast each graph's latent to its nodes
        z_nodes = z[batch]
        recon = self.node_decoder(z_nodes)
        # Losses; note the factor of 2 in the KL term, since the head predicts log-std,
        # not log-variance: KL = -0.5 * (1 + log σ² - μ² - σ²) with log σ² = 2 * logstd
        recon_loss = F.mse_loss(recon, x)
        kl_loss = -0.5 * torch.mean(1 + 2 * logstd - mean.pow(2) - (2 * logstd).exp())
        # Anneal the KL weight from 0 up to 0.1 over the first 1000 training steps
        kl_weight = min(0.1, 0.01 * (self.training_step / 100))
        total_loss = recon_loss + kl_weight * kl_loss
        return total_loss, mean, logstd
```
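Before launching full training, a one-batch forward pass is a cheap way to catch shape mismatches. This sketch reuses the `MoleculeDataset` and `DataLoader` defined earlier and runs fine on CPU.

```python
# One-batch smoke test of the model.
ds = MoleculeDataset(["CCO", "c1ccccc1"])
batch = next(iter(DataLoader(ds, batch_size=2)))
model = StableGVAE(node_dim=4, latent_dim=32)
loss, mean, logstd = model(batch)
print(loss.item(), mean.shape)  # scalar loss, latent means of shape [2, 32]
```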
4. Training the Model

```python
# 4. Training function
def train(model, loader, optimizer, device, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            loss, mean, logstd = model(batch)
            loss.backward()
            # Gradient clipping guards against exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
            model.training_step += 1  # advance the KL-weight annealing schedule
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(loader):.4f}')
        print(f'  Mean |z|: {torch.mean(torch.abs(mean)).item():.4f}')
        print(f'  Std exp(logstd): {torch.mean(torch.exp(logstd)).item():.4f}')

# 5. Main program
if __name__ == "__main__":
    # Sample data: a handful of small molecules (the last one is caffeine)
    smiles_list = ["CCO", "CCN", "CC(=O)O", "c1ccccc1", "C1CCCCC1",
                   "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"]
    # Build the dataset and loader
    dataset = NormalizedMoleculeDataset(smiles_list)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    # Device setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    # Initialize model and optimizer
    model = StableGVAE(node_dim=4, latent_dim=32).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    # Train
    train(model, loader, optimizer, device, epochs=20)
```
Output from a sample run (exact numbers will vary with random seed and hardware):

```
Using device: cuda
Epoch 1, Loss: 1.8490
  Mean |z|: 4.5453
  Std exp(logstd): 3.1631
Epoch 2, Loss: 1.4654
  Mean |z|: 4.6283
  Std exp(logstd): 3.1690
Epoch 3, Loss: 1.3454
  Mean |z|: 4.7163
  Std exp(logstd): 3.1893
Epoch 4, Loss: 1.1472
  Mean |z|: 4.8204
  Std exp(logstd): 3.1949
Epoch 5, Loss: 1.2276
  Mean |z|: 4.9333
  Std exp(logstd): 3.1850
Epoch 6, Loss: 1.2820
  Mean |z|: 5.0495
  Std exp(logstd): 3.1673
Epoch 7, Loss: 1.0549
  Mean |z|: 5.1596
  Std exp(logstd): 3.1532
Epoch 8, Loss: 1.1106
  Mean |z|: 5.2716
  Std exp(logstd): 3.1439
Epoch 9, Loss: 0.9983
  Mean |z|: 5.3749
  Std exp(logstd): 3.1421
Epoch 10, Loss: 0.8377
  Mean |z|: 5.4713
  Std exp(logstd): 3.1421
Epoch 11, Loss: 0.9254
  Mean |z|: 5.5570
  Std exp(logstd): 3.1369
Epoch 12, Loss: 0.8516
  Mean |z|: 5.6426
  Std exp(logstd): 3.1326
Epoch 13, Loss: 0.8242
  Mean |z|: 5.7219
  Std exp(logstd): 3.1306
Epoch 14, Loss: 0.8309
  Mean |z|: 5.7898
  Std exp(logstd): 3.1227
Epoch 15, Loss: 0.8721
  Mean |z|: 5.8507
  Std exp(logstd): 3.1251
Epoch 16, Loss: 0.7623
  Mean |z|: 5.9046
  Std exp(logstd): 3.1181
Epoch 17, Loss: 0.9805
  Mean |z|: 5.9619
  Std exp(logstd): 3.1088
Epoch 18, Loss: 0.6974
  Mean |z|: 6.0277
  Std exp(logstd): 3.0938
Epoch 19, Loss: 0.7069
  Mean |z|: 6.0858
  Std exp(logstd): 3.0828
Epoch 20, Loss: 0.7618
  Mean |z|: 6.1408
  Std exp(logstd): 3.0777
```
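Note that the reconstruction loss drops steadily while the latent statistics drift away from the prior (|z| keeps growing, the standard deviation stays near 3): with only 20 optimization steps, the annealed KL weight is still tiny, so the latent space is barely regularized. Meaningful sampling from N(0, I) therefore requires longer training. Still, the following is a minimal sketch of the sampling mechanics, assuming the trained `model` and `device` from the code above; turning decoded node features back into a valid molecule would require an additional graph-decoding step (e.g., predicting adjacency and bond types) that this toy decoder does not implement.

```python
# Minimal sampling sketch: draw latents from the prior and decode node features.
model.eval()
with torch.no_grad():
    z = torch.randn(5, model.latent_dim, device=device)  # 5 latents from the N(0, I) prior
    node_feats = model.node_decoder(z)                   # decode each latent to node features
print(node_feats.shape)  # torch.Size([5, 4])
# Mapping these continuous features back to discrete atoms and bonds is left
# to a dedicated graph decoder, which is beyond this toy model.
```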
III. Summary and Outlook
This article implemented a molecule generation system based on a graph variational autoencoder. The main technical highlights include:

- A complete molecular graph processing pipeline, from SMILES strings to the PyG data format
- A graph generative model architecture that combines the strengths of GNNs and VAEs

In the next article, we will turn to mixed-precision training and gradient scaling, and show how FP16 can accelerate training while keeping the model numerically stable.
在下一篇文章中,我们将探讨混合精度训练与梯度缩放技术,介绍如何利用FP16加速训练同时保持模型稳定性。