经典图神经网络学习
本文主要参考DataWhale图神经网络组队学习
首先本文主要是利用图网络模型进行节点级别的任务,如节点分类。
Cora数据集介绍
Cora是一个论文引用网络,节点代表论文,共有2708篇论文,如果两篇论文存在引用关系,则对应的两个节点之间存在边,各节点的属性都是一个1433维的词包特征向量。
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
# Download (if needed) and load the Cora citation graph; NormalizeFeatures
# row-normalizes each node's bag-of-words feature vector.
dataset = Planetoid(root='dataset/Cora', name='Cora',
                    transform=NormalizeFeatures())
查看特征矩阵
# The dataset contains a single graph; data.x is the node-feature matrix.
data = dataset[0]
data.x.shape
查看数据集标签
# Per-node class labels (one label per paper).
data.y # shape ---> torch.Size([2708])
构建MLP对Cora数据集进行节点分类
我们的MLP由两个线性层、一个ReLU非线性层和一个dropout操作组成。第一个线性层将1433维的节点表征嵌入(embedding)到低维空间中(hidden_channels=16),第二个线性层将节点表征嵌入到类别空间中(num_classes=7)。
import torch
from torch.nn import Linear
import torch.nn.functional as F
class MLP(torch.nn.Module):
    """Two-layer perceptron: node features -> hidden -> class log-probabilities."""

    def __init__(self, hidden_channels):
        super(MLP, self).__init__()
        # Embed the raw node features into a low-dimensional hidden space,
        # then project the hidden representation onto the class space.
        self.lin1 = Linear(dataset.num_features, hidden_channels)
        self.lin2 = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x):
        hidden = self.lin1(x)
        hidden = F.relu(hidden)
        # Dropout is only active while self.training is True.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        logits = self.lin2(hidden)
        return F.log_softmax(logits, dim=1)
# Fix: the MLP only needs to be constructed once — the original instantiated
# it twice, throwing away the first instance's weights.
model = MLP(hidden_channels=16)
print(model)  # inspect the layer structure
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
开始训练,此处是半监督学习,因为只用到了140个训练标签进行训练
# Only 140 of the 2708 nodes carry a training label (semi-supervised setup).
data.train_mask.sum().item() # 140
def train():
    """Run one optimization step on the labelled training nodes."""
    model.train()
    optimizer.zero_grad()
    log_probs = model(data.x)
    # Semi-supervised: only the masked training nodes contribute to the loss.
    step_loss = F.nll_loss(log_probs[data.train_mask], data.y[data.train_mask])
    step_loss.backward()
    optimizer.step()
    return step_loss
# Train the MLP for 200 epochs, logging the loss each epoch.
for epoch in range(1,201):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
开始测试分类准确率:
def test():
    """Evaluate classification accuracy on the held-out test nodes."""
    model.eval()
    predictions = model(data.x).argmax(dim=1)
    mask = data.test_mask
    hits = (predictions[mask] == data.y[mask]).sum()
    return int(hits) / int(mask.sum())
# Report the trained MLP's accuracy on the test split.
test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')
构建GCN用于Cora数据集节点分类
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification."""

    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        # First convolution embeds the raw features, second maps to classes.
        self.conv_1 = GCNConv(dataset.num_features, hidden_channels)
        self.conv_2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        hidden = F.relu(self.conv_1(x, edge_index))
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        logits = self.conv_2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)
开始训练
# Fresh GCN and Adam optimizer (weight_decay adds L2 regularization).
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
def train():
    """Perform a single training step of the GCN."""
    model.train()
    optimizer.zero_grad()
    log_probs = model(data.x, data.edge_index)
    # Restrict the loss to the labelled training nodes (semi-supervised).
    nll = F.nll_loss(log_probs[data.train_mask], data.y[data.train_mask])
    nll.backward()
    optimizer.step()
    return nll
# Train the GCN for 200 epochs, logging the loss each epoch.
for epoch in range(1,201):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
开始测试:
def test():
    """Evaluate GCN accuracy on the held-out test nodes.

    Returns a plain Python float.  Fix: the original divided by a tensor
    (`data.test_mask.sum()`), making the result a 0-dim tensor; converting
    both operands to int keeps it consistent with the MLP test() above.
    """
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())
    return test_acc
# Report GCN accuracy on the test split.
test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')
构建GAT用于Cora数据集节点分类
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
class GAT(torch.nn.Module):
    """Two-layer graph attention network (8 heads, then 1 averaging head)."""

    def __init__(self, hidden_channels):
        super(GAT, self).__init__()
        # The first layer concatenates its 8 heads, so the second layer
        # consumes 8 * hidden_channels features and averages a single head.
        self.gatconv1 = GATConv(dataset.num_features, hidden_channels,
                                heads=8, dropout=0.6)
        self.gatconv2 = GATConv(8 * hidden_channels, dataset.num_classes,
                                heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        h = F.dropout(x, p=0.6, training=self.training)
        h = self.gatconv1(h, edge_index)
        h = F.dropout(h, p=0.6, training=self.training)
        h = self.gatconv2(h, edge_index)
        return F.log_softmax(h, dim=1)
开始训练
# GAT with 8 hidden channels per head; Adam with a lower learning rate.
model = GAT(hidden_channels=8)
optimizer = torch.optim.Adam(model.parameters(),lr=0.005, weight_decay=5e-4)
def train():
    """Run one optimizer step for the GAT model."""
    model.train()
    optimizer.zero_grad()
    predictions = model(data.x, data.edge_index)
    # Loss over labelled training nodes only (semi-supervised).
    loss_value = F.nll_loss(predictions[data.train_mask], data.y[data.train_mask])
    loss_value.backward()
    optimizer.step()
    return loss_value
# Train the GAT for 800 epochs, logging the loss each epoch.
for epoch in range(1,801):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
开始测试:
def test():
    """Evaluate GAT accuracy on the held-out test nodes.

    Returns a plain Python float.  Fix: convert the mask sum to int so the
    division yields a float rather than a 0-dim tensor (consistent with the
    MLP test() above).
    """
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    return int(test_correct.sum()) / int(data.test_mask.sum())
# Report GAT accuracy on the test split.
test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')
可视化
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def visualize(h, color):
    """Project node embeddings to 2-D with t-SNE and scatter-plot them.

    h     : (num_nodes, dim) tensor of node representations.
    color : per-node labels used to colour the points.

    Fix: move `color` to CPU/NumPy as well, so the function also works when
    the tensors live on a GPU (consistent with the PubMed section below).
    """
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())
    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])
    plt.scatter(z[:, 0], z[:, 1], s=70, c=color.detach().cpu().numpy())
    plt.show()
将训练好的GAT模型学习到的节点表征进行可视化(仅对测试节点进行可视化)
# Embed the trained GAT's output for the test nodes and plot it.
out = model(data.x, data.edge_index)
visualize(out[data.test_mask],data.y[data.test_mask])
作业
使用PyG中不同的图卷积模块在PyG的不同数据集上实现节点分类或回归任务。
获取CiteSeer数据集
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
# Load the CiteSeer citation graph with row-normalized features.
dataset = Planetoid(root='dataset/CiteSeer', name='CiteSeer', transform=NormalizeFeatures())
data = dataset[0]
data.x.shape
构建GCN对CiteSeer数据集进行节点分类
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
    """Two-layer GCN used for CiteSeer node classification."""

    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index)
        h = F.relu(h)
        h = F.dropout(h, p=0.5, training=self.training)
        return F.log_softmax(self.conv2(h, edge_index), dim=1)
# GCN and optimizer for CiteSeer.
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(),lr=0.01,weight_decay=5e-4)
def train():
    """One optimization step on the labelled CiteSeer training nodes."""
    model.train()
    optimizer.zero_grad()
    scores = model(data.x, data.edge_index)
    batch_loss = F.nll_loss(scores[data.train_mask], data.y[data.train_mask])
    batch_loss.backward()
    optimizer.step()
    return batch_loss
# Train for 200 epochs, logging the loss each epoch.
for epoch in range(1,201):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
def test():
    """Evaluate CiteSeer test-split accuracy; returns a Python float.

    Fix: the original divided by a tensor and then needed `.item()` at the
    call site; int/int division yields a float directly.
    """
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    return int(test_correct.sum()) / int(data.test_mask.sum())

print(f'Test Accuracy: {test():.4f}')
可视化
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def visualize(h, color):
    """Project node embeddings to 2-D with t-SNE and scatter-plot them.

    Fix: move `color` to CPU/NumPy as well, so the function also works when
    the tensors live on a GPU (consistent with the PubMed section below).
    """
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())
    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])
    plt.scatter(z[:, 0], z[:, 1], s=70, c=color.detach().cpu().numpy())
    plt.show()
# Visualize the trained model's embeddings for the test nodes.
out = model(data.x, data.edge_index)
visualize(out[data.test_mask], data.y[data.test_mask])
构建GAT对CiteSeer数据集进行节点分类(利用GPU加速)
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GATConv
class GAT(torch.nn.Module):
    """Two-layer GAT for CiteSeer (8 concatenated heads, then 1 averaged)."""

    def __init__(self, hidden_channels):
        super(GAT, self).__init__()
        # Second layer input is 8 * hidden_channels because the first layer
        # concatenates its 8 attention heads.
        self.gatconv1 = GATConv(dataset.num_features, hidden_channels,
                                heads=8, dropout=0.6)
        self.gatconv2 = GATConv(8 * hidden_channels, dataset.num_classes,
                                heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        out = F.dropout(x, p=0.6, training=self.training)
        out = self.gatconv1(out, edge_index)
        out = F.dropout(out, p=0.6, training=self.training)
        out = self.gatconv2(out, edge_index)
        return F.log_softmax(out, dim=1)
# BUG FIX: torch.cuda.is_available is a function; without the call
# parentheses the function object itself is always truthy, so 'cuda' was
# selected even on CPU-only machines and .to(device) would then crash.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAT(hidden_channels=8).to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
开始训练:
def train():
    """One optimization step of the GAT on the labelled CiteSeer nodes."""
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # Semi-supervised: loss only over the masked training nodes.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss

# 800 training epochs, logging the loss each epoch.
for epoch in range(1,801):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
def test():
    """Evaluate GAT accuracy on the CiteSeer test split; returns a float.

    Fix: convert the mask sum to int so the division yields a Python float
    rather than a 0-dim tensor.
    """
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    return int(test_correct.sum()) / int(data.test_mask.sum())

test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')
获取PubMed数据集
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
# Fix: use the same relative dataset root as the Cora/CiteSeer sections
# instead of an absolute Windows-only path, so the script stays portable.
dataset = Planetoid(root='dataset/PubMed', name='PubMed',
                    transform=NormalizeFeatures())
查看特征矩阵:
# The single PubMed graph; data.x is the node-feature matrix.
data = dataset[0]
data.x.shape
构建GCN对PubMed数据集进行节点分类
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
    """Two-layer GCN used for PubMed node classification."""

    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index):
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)
开始训练
# BUG FIX: torch.cuda.is_available must be CALLED; the bare function object
# is always truthy, which forced device='cuda' even without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(hidden_channels=16).to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
def train():
    """One optimization step of the GCN on the labelled PubMed nodes."""
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # Semi-supervised: loss only over the masked training nodes.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss

# 200 training epochs, logging the loss each epoch.
for epoch in range(1,201):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
def test():
    """Evaluate GCN accuracy on the PubMed test split; returns a float.

    Fix: convert the mask sum to int so the division yields a Python float
    rather than a 0-dim tensor.
    """
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    return int(test_correct.sum()) / int(data.test_mask.sum())

test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')
可视化
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
def visualize(h, color):
    """Reduce embeddings to two t-SNE dimensions and draw a scatter plot."""
    embedded = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())
    # Labels may live on the GPU; bring them to host memory for matplotlib.
    labels = color.cpu().detach().numpy()
    plt.figure(figsize=(10, 10))
    plt.xticks([])
    plt.yticks([])
    plt.scatter(embedded[:, 0], embedded[:, 1], s=70, c=labels)
    plt.show()
# Visualize the trained model's embeddings for the test nodes.
out = model(data.x, data.edge_index)
visualize(out[data.test_mask], data.y[data.test_mask])
构建GAT对PubMed数据集进行节点分类
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GATConv
class GAT(torch.nn.Module):
    """Two-layer GAT for PubMed (8 concatenated heads, then 1 averaged)."""

    def __init__(self, hidden_channels):
        super(GAT, self).__init__()
        # 8 heads are concatenated by the first layer, hence the
        # 8 * hidden_channels input width of the second layer.
        self.gatconv1 = GATConv(dataset.num_features, hidden_channels,
                                heads=8, dropout=0.6)
        self.gatconv2 = GATConv(8 * hidden_channels, dataset.num_classes,
                                heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        rep = F.dropout(x, p=0.6, training=self.training)
        rep = self.gatconv1(rep, edge_index)
        rep = F.dropout(rep, p=0.6, training=self.training)
        rep = self.gatconv2(rep, edge_index)
        return F.log_softmax(rep, dim=1)
开始训练:
# BUG FIX: torch.cuda.is_available must be CALLED; the bare function object
# is always truthy, which forced device='cuda' even without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GAT(hidden_channels=8).to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.001)
def train():
    """One optimization step of the GAT on the labelled PubMed nodes."""
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    # Semi-supervised: loss only over the masked training nodes.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss

# 200 training epochs, logging the loss each epoch.
for epoch in range(1,201):
    loss = train()
    print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}')
开始测试:
def test():
    """Evaluate GAT accuracy on the PubMed test split; returns a float.

    Fix: convert the mask sum to int so the division yields a Python float
    rather than a 0-dim tensor.
    """
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)
    test_correct = pred[data.test_mask] == data.y[data.test_mask]
    return int(test_correct.sum()) / int(data.test_mask.sum())

test_acc = test()
print(f'Test Accuracy:{test_acc:.4f}')