import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
zip_file = keras.utils.get_file(
    fname="cora.tgz",  # name to save the archive under
    origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz",  # download URL
    extract=True,  # unpack the archive after downloading
)
data_dir = os.path.join(os.path.dirname(zip_file), "cora")  # directory of the extracted files
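# Note (assumption, not from the original post): with TF 2.x Keras, get_file(extract=True)
# returns the path of the downloaded archive, so the extracted "cora" folder sits next to it,
# as above. Keras 3 instead returns the path of the extracted content, in which case
# something like data_dir = os.path.join(zip_file, "cora") may be needed.
print("data_dir exists:", os.path.isdir(data_dir))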
# Load the citation records; these are the edges of the graph.
citations = pd.read_csv(
    os.path.join(data_dir, "cora.cites"),
    sep="\t",  # tab-separated
    header=None,  # no header row
    names=["target", "source"],
)
print("Citations shape:", citations.shape)
citations.sample(frac=1).head()  # shuffle all rows, then show the first five
# Column names for the paper features.
column_names = ["paper_id"] + [f"term_{idx}" for idx in range(1433)] + ["subject"]
# Load the paper records; these are the nodes of the graph.
papers = pd.read_csv(
    os.path.join(data_dir, "cora.content"),
    sep="\t",
    header=None,
    names=column_names,
)
print("Papers shape:", papers.shape)
print(papers.sample(5).T)
print(papers.subject.value_counts())
class_values = sorted(papers["subject"].unique())  # the distinct subject classes
class_idx = {name: id for id, name in enumerate(class_values)}  # subject -> class index
paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))}  # paper id -> node index
# Remap ids and subjects to these zero-based indices.
papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name])
citations["source"] = citations["source"].apply(lambda name: paper_idx[name])
citations["target"] = citations["target"].apply(lambda name: paper_idx[name])
papers["subject"] = papers["subject"].apply(lambda value: class_idx[value])
plt.figure(figsize=(10, 10))
colors = papers["subject"].tolist()
cora_graph = nx.from_pandas_edgelist(citations.sample(n=1500))  # build a graph from 1500 random citations
subjects = list(papers[papers["paper_id"].isin(list(cora_graph.nodes))]["subject"])  # subjects of the sampled nodes
nx.draw_spring(cora_graph, node_size=15, node_color=subjects)
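# Caveat: the isin() lookup above returns subjects in DataFrame order, which is not
# guaranteed to match the iteration order of cora_graph.nodes, so colors can land on the
# wrong nodes. A sketch of an order-safe alternative (subject_of and node_colors are
# names introduced here, not from the original):
plt.figure(figsize=(10, 10))
subject_of = dict(zip(papers["paper_id"], papers["subject"]))
node_colors = [subject_of[node] for node in cora_graph.nodes]
nx.draw_spring(cora_graph, node_size=15, node_color=node_colors)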
train_data, test_data = [], []
# Group the papers by subject so that roughly half of each class goes to the
# training set and the other half to the test set.
for _, group_data in papers.groupby("subject"):
    random_selection = np.random.rand(len(group_data.index)) <= 0.5
    train_data.append(group_data[random_selection])
    test_data.append(group_data[~random_selection])
# Concatenate the per-class splits and shuffle each of them.
train_data = pd.concat(train_data).sample(frac=1)
test_data = pd.concat(test_data).sample(frac=1)
print("Train data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
hidden_units = [32, 32]  # hidden layer widths used for neighbour aggregation and message passing
learning_rate = 0.01
dropout_rate = 0.5
num_epochs = 300
batch_size = 256
# Train and evaluate a model.
def run_experiment(model, x_train, y_train):
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),  # optimizer
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # loss on raw logits
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],  # classification accuracy
    )
    # Early stopping to limit overfitting: stop once validation accuracy
    # stops improving.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_acc",  # watch validation accuracy
        patience=50,  # max number of epochs without improvement
        restore_best_weights=True,  # roll back to the best weights
    )
    # Fit the model.
    history = model.fit(
        x=x_train,  # training inputs
        y=y_train,  # training labels
        epochs=num_epochs,
        batch_size=batch_size,
        validation_split=0.15,  # hold out 15% of the training set for validation
        callbacks=[early_stopping],
    )
    return history
# Plot the learning curves.
def display_learning_curves(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    ax1.plot(history.history["loss"])
    ax1.plot(history.history["val_loss"])
    ax1.legend(["train", "val"], loc="upper right")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax2.plot(history.history["acc"])
    ax2.plot(history.history["val_acc"])
    ax2.legend(["train", "val"], loc="upper right")
    ax2.set_xlabel("Epochs")
    ax2.set_ylabel("Accuracy")
    plt.show()
# Feedforward block: input -> hidden layers -> output.
def create_ffn(hidden_units, dropout_rate, name=None):
    fnn_layers = []
    # One BatchNorm -> Dropout -> Dense unit per hidden layer.
    for units in hidden_units:
        fnn_layers.append(layers.BatchNormalization())
        fnn_layers.append(layers.Dropout(dropout_rate))
        fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu))
    return keras.Sequential(fnn_layers, name=name)
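# A quick smoke test (not in the original): the output width of the block equals the
# last entry of hidden_units.
demo_ffn = create_ffn(hidden_units=[32, 32], dropout_rate=0.5, name="demo_ffn")
print(demo_ffn(tf.zeros((4, 1433))).shape)  # expected: (4, 32)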
feature_names = sorted(set(papers.columns) - {"paper_id", "subject"})  # feature column names (a list, since pandas no longer accepts sets as column indexers)
num_features = len(feature_names)
num_classes = len(class_idx)
# Convert the training and test features to NumPy arrays.
x_train = train_data[feature_names].to_numpy()
x_test = test_data[feature_names].to_numpy()
# Training and test labels.
y_train = train_data["subject"]
y_test = test_data["subject"]
# Baseline: a feedforward network with skip connections.
def create_baseline_model(hidden_units, num_classes, dropout_rate=0.2):
    inputs = layers.Input(shape=(num_features,), name="input_features")
    x = create_ffn(hidden_units, dropout_rate, name="ffn_block1")(inputs)  # nonlinear feature extraction
    # Skip connections mitigate vanishing gradients and unstable training
    # in deeper feedforward stacks.
    for block_idx in range(4):
        # A fresh FFN block...
        x1 = create_ffn(hidden_units, dropout_rate, name=f"ffn_block{block_idx + 2}")(x)
        # ...added back onto its own input.
        x = layers.Add(name=f"skip_connection{block_idx + 2}")([x, x1])
    # Output logits.
    logits = layers.Dense(num_classes, name="logits")(x)
    return keras.Model(inputs=inputs, outputs=logits, name="baseline")
# Build the model.
baseline_model = create_baseline_model(hidden_units, num_classes, dropout_rate)
baseline_model.summary()
# Train the baseline.
history = run_experiment(baseline_model, x_train, y_train)
# Plot the learning curves.
display_learning_curves(history)
# Evaluate on the test set.
_, test_accuracy = baseline_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")
# Generate random instances by sampling each term from its empirical frequency.
def generate_random_instances(num_instances):
    token_probability = x_train.mean(axis=0)  # global frequency of each term
    instances = []
    for _ in range(num_instances):
        probabilities = np.random.uniform(size=len(token_probability))  # one random draw per term
        instance = (probabilities <= token_probability).astype(int)
        instances.append(instance)
    return np.array(instances)
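# Since each term is kept with its training-set frequency, the random instances should
# be about as dense as a real paper. A quick check (not in the original):
demo_instances = generate_random_instances(3)
print("Average terms per random instance:", demo_instances.sum(axis=1).mean())
print("Average terms per training paper:", x_train.sum(axis=1).mean())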
def display_class_probabilities(probabilities):
    for instance_idx, probs in enumerate(probabilities):
        print(f"Instance {instance_idx + 1}:")
        for class_id, prob in enumerate(probs):  # renamed so it does not shadow the class_idx dict
            print(f"- {class_values[class_id]}: {round(prob * 100, 2)}%")
new_instances = generate_random_instances(num_classes)
logits = baseline_model.predict(new_instances)  # raw model outputs
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()  # logits -> probabilities
display_class_probabilities(probabilities)
# Edges, as a [2, num_edges] array; GraphConvLayer treats edges[0] as the receiving
# nodes and edges[1] as their neighbours.
edges = citations[["source", "target"]].to_numpy().T
# Edge weights (all ones here).
edge_weights = tf.ones(shape=edges.shape[1])
# Node features, sorted so that row i belongs to paper_id i.
node_features = tf.cast(
    papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32
)
# The full graph.
graph_info = (node_features, edges, edge_weights)
print("Edges shape:", edges.shape)
print("Nodes shape:", node_features.shape)
class GraphConvLayer(layers.Layer):
    def __init__(
        self,
        hidden_units,
        dropout_rate=0.2,
        aggregation_type="mean",
        combination_type="concat",  # how node and aggregated features are combined: concatenation by default
        normalize=False,
        *args,
        **kwargs,
    ):
        super(GraphConvLayer, self).__init__(*args, **kwargs)
        self.aggregation_type = aggregation_type
        self.combination_type = combination_type
        self.normalize = normalize
        self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
        if self.combination_type == "gru":  # was "gated", which never matched the "gru" checks in update()
            # Gated recurrent unit (GRU); units must be an int, so use the first width.
            self.update_fn = layers.GRU(
                units=hidden_units[0],
                activation="tanh",
                recurrent_activation="sigmoid",
                dropout=dropout_rate,
                return_state=True,
                recurrent_dropout=dropout_rate,
            )
        else:
            self.update_fn = create_ffn(hidden_units, dropout_rate)
    # Prepare the messages.
    def prepare(self, node_representations, weights=None):
        # Nonlinear transformation of the incoming representations.
        messages = self.ffn_prepare(node_representations)
        # Apply per-edge weights when given.
        if weights is not None:
            messages = messages * tf.expand_dims(weights, -1)
        return messages
    # The original sum/mean/max message aggregation, kept here for reference:
    """
    def aggregate(self, node_indices, neighbour_messages, node_representations):
        # node_indices are the target nodes that receive the aggregated messages.
        num_nodes = node_representations.shape[0]
        if self.aggregation_type == "sum":
            aggregated_message = tf.math.unsorted_segment_sum(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "mean":
            aggregated_message = tf.math.unsorted_segment_mean(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        elif self.aggregation_type == "max":
            aggregated_message = tf.math.unsorted_segment_max(
                neighbour_messages, node_indices, num_segments=num_nodes
            )
        else:
            raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")
        return aggregated_message
    """
    # Message aggregation with symmetric degree normalisation, D^-1/2 A D^-1/2.
    # The previous version multiplied the per-node D_inv_sqrt (shape [num_nodes])
    # directly by the per-edge weights (shape [num_edges]) and read edge_weights
    # from a global, which caused the shape error reported at the end of this post.
    def aggregate(
        self, node_indices, neighbour_indices, neighbour_messages,
        node_representations, edge_weights,
    ):
        num_nodes = node_representations.shape[0]
        # Degree of each receiving node (assumes edge_weights already include self-loops).
        degrees = tf.math.unsorted_segment_sum(
            tf.ones_like(edge_weights), node_indices, num_segments=num_nodes
        )
        d_inv_sqrt = tf.pow(degrees + 1e-7, -0.5)  # avoid division by zero
        # Gather the degree factor of *both* endpoints of every edge, giving one
        # normalised weight per edge.
        norm_weights = (
            tf.gather(d_inv_sqrt, node_indices)
            * edge_weights
            * tf.gather(d_inv_sqrt, neighbour_indices)
        )
        # Weighted sum of the incoming messages per receiving node.
        aggregated_message = tf.math.unsorted_segment_sum(
            neighbour_messages * tf.expand_dims(norm_weights, -1),
            node_indices,
            num_segments=num_nodes,
        )
        return aggregated_message
    # Update the node representations.
    def update(self, node_representations, aggregated_messages):
        if self.combination_type == "gru":
            h = tf.stack([node_representations, aggregated_messages], axis=1)
        elif self.combination_type == "concat":
            h = tf.concat([node_representations, aggregated_messages], axis=1)
        elif self.combination_type == "add":
            h = node_representations + aggregated_messages
        else:
            raise ValueError(f"Invalid combination type: {self.combination_type}.")
        # Transform the combined features.
        node_embeddings = self.update_fn(h)
        if self.combination_type == "gru":
            node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]
        # Optional L2 normalisation.
        if self.normalize:
            node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
        return node_embeddings
    # One round of message passing.
    def call(self, inputs):
        node_representations, edges, edge_weights = inputs
        edges = tf.cast(edges, dtype=tf.int32)
        edge_weights = tf.cast(edge_weights, dtype=tf.float32)
        # edges[0] are the receiving nodes, edges[1] their neighbours.
        node_indices, neighbour_indices = edges[0], edges[1]
        # Look up the neighbour representations.
        neighbour_representations = tf.gather(node_representations, neighbour_indices)
        # Prepare the messages; the edge weights are applied once, during aggregation,
        # so they are not passed here as well.
        neighbour_messages = self.prepare(neighbour_representations)
        # Aggregate, using the layer inputs rather than the global edge_weights so the
        # layer keeps working when the graph is updated later.
        aggregated_messages = self.aggregate(
            node_indices, neighbour_indices, neighbour_messages,
            node_representations, edge_weights,
        )
        # Update the node representations.
        return self.update(node_representations, aggregated_messages)
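# To see what unsorted_segment_sum does during aggregation, a tiny standalone example
# (not part of the model): messages from edges with the same target index are summed.
demo_messages = tf.constant([[1.0], [2.0], [4.0]])
demo_targets = tf.constant([0, 0, 1])
print(tf.math.unsorted_segment_sum(demo_messages, demo_targets, num_segments=3))
# -> [[3.], [4.], [0.]]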
class GNNNodeClassifier(tf.keras.Model):
    def __init__(
        self,
        graph_info,
        num_classes,
        hidden_units,
        aggregation_type="sum",
        combination_type="concat",
        dropout_rate=0.2,
        normalize=True,
        *args,
        **kwargs,
    ):
        super(GNNNodeClassifier, self).__init__(*args, **kwargs)
        node_features, edges, edge_weights = graph_info
        # Default to unit weights when none are supplied (this check must run
        # before the tensor conversion, which would never yield None).
        if edge_weights is None:
            edge_weights = tf.ones(shape=edges.shape[1])
        self.node_features = tf.convert_to_tensor(node_features, dtype=tf.float32)
        self.edges = tf.convert_to_tensor(edges, dtype=tf.int32)
        self.edge_weights = tf.convert_to_tensor(edge_weights, dtype=tf.float32)
        # Scale the weights to sum to one.
        self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights)
        # Preprocessing: a nonlinear transform of the raw node features.
        self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
        # First graph convolution layer.
        self.conv1 = GraphConvLayer(
            hidden_units,
            dropout_rate,
            aggregation_type,
            combination_type,
            normalize,
            name="graph_conv1",
        )
        # Second graph convolution layer.
        self.conv2 = GraphConvLayer(
            hidden_units,
            dropout_rate,
            aggregation_type,
            combination_type,
            normalize,
            name="graph_conv2",
        )
        # Postprocessing block.
        self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
        # Output layer producing the class logits.
        self.compute_logits = layers.Dense(units=num_classes, name="logits")
    def call(self, input_node_indices):
        # Preprocess all node features.
        x = self.preprocess(self.node_features)
        # First graph convolution, with a skip connection.
        x1 = self.conv1((x, self.edges, self.edge_weights))
        x = x1 + x
        # Second graph convolution, with a skip connection.
        x2 = self.conv2((x, self.edges, self.edge_weights))
        x = x2 + x
        # Postprocess, then select the embeddings of the requested nodes.
        x = self.postprocess(x)
        node_embeddings = tf.gather(x, input_node_indices)
        return self.compute_logits(node_embeddings)
gnn_model = GNNNodeClassifier(
    graph_info=graph_info,
    num_classes=num_classes,
    hidden_units=hidden_units,
    dropout_rate=dropout_rate,
    name="gnn_model",
)
print("GNN output shape:", gnn_model([1, 10, 100]).shape)
gnn_model.summary()
x_train = train_data.paper_id.to_numpy()
history = run_experiment(gnn_model, x_train, y_train)
display_learning_curves(history)
x_test = test_data.paper_id.to_numpy()
_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0)
print(f"Test accuracy: {round(test_accuracy * 100, 2)}%")
# Add new nodes to the graph.
num_nodes = node_features.shape[0]
new_node_features = np.concatenate([node_features, new_instances])
# Indices of the newly added nodes.
new_node_indices = [i + num_nodes for i in range(num_classes)]
new_citations = []
# Build citation edges for the new nodes.
for subject_idx, group in papers.groupby("subject"):
    subject_papers = list(group.paper_id)
    # Pick five papers from the current subject...
    selected_paper_indices1 = np.random.choice(subject_papers, 5)
    # ...and two papers from the whole corpus.
    selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2)
    # Combine the selected ids.
    selected_paper_indices = np.concatenate(
        [selected_paper_indices1, selected_paper_indices2], axis=0
    )
    # Add an edge from the new node to each selected paper.
    citing_paper_idx = new_node_indices[subject_idx]
    for cited_paper_idx in selected_paper_indices:
        new_citations.append([citing_paper_idx, cited_paper_idx])
new_citations = np.array(new_citations).T
new_edges = np.concatenate([edges, new_citations], axis=1)  # merge old and new edges
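# Optional check (not in the original): every edge endpoint must be a valid row of
# new_node_features before the model's graph is swapped out below.
assert new_edges.max() < new_node_features.shape[0]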
print("Original node_features shape:", gnn_model.node_features.shape)
print("Original edges shape:", gnn_model.edges.shape)
# Update the model's graph in place.
gnn_model.node_features = new_node_features
gnn_model.edges = new_edges
gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1])
print("New node_features shape:", gnn_model.node_features.shape)
print("New edges shape:", gnn_model.edges.shape)
logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices))  # logits for the new nodes
probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy()  # logits -> probabilities
display_class_probabilities(probabilities)
# The error reported with the original code:
#   tensorflow.python.framework.errors_impl.InvalidArgumentError: Exception encountered
#   when calling layer 'graph_conv1' (type GraphConvLayer).
#   Incompatible shapes: [2708] vs. [5429] [Op:Mul]
#   Call arguments received by layer 'graph_conv1' (type GraphConvLayer):
#     inputs=('tf.Tensor(shape=(2708, 32), dtype=float32)', 'tf.Tensor(shape=(2, 5429), dtype=int32)', 'tf.Tensor(shape=(5429,), dtype=float32)')
# Cause: aggregate() multiplied the per-node factor D_inv_sqrt (one entry per node, 2708)
# directly by the per-edge weights (one entry per edge, 5429). The version above fixes
# this by gathering the degree factors per edge with tf.gather, and by reading
# edge_weights from the layer inputs instead of the global variable.