Node2Vec实战
数据结构
每行给出一条边,即两个相互连接的节点(节点之间用空格分隔)
1 2
2 3
4 5
主程序构建
# Load the edge list into a directed graph. Node labels are kept as read
# (nodetype=None); any edge-data column is parsed as an integer 'weight'.
G = nx.read_edgelist(
    '../data/text.txt',
    create_using=nx.DiGraph(),
    nodetype=None,
    data=[('weight', int)],
)

# Build the model.
model = Node2Vec(
    G,
    walk_length=10,
    num_walks=80,
    p=0.25,
    q=4,
    workers=1,
    use_rejection_sampling=0,
)

# Train, then fetch and print the learned embeddings.
model.train(embed_size=4, window_size=5, iter=3)
embeddings = model.get_embeddings()
print(embeddings)
初始生成节点到节点的概率
def preprocess_transition_probs(self):
    '''
    Preprocessing of transition probabilities for guiding the random walks.

    Fills two lookup tables on the instance:

    * ``self.alias_nodes`` maps each node to the alias table (built by
      ``alias_setup``) over its sorted neighbours, with probabilities
      proportional to the edge weights.
    * ``self.alias_edges`` maps each edge ``(src, dst)`` to the alias table
      returned by ``self.get_alias_edge(src, dst)``. When
      ``self.is_directed`` is false the reversed edge ``(dst, src)`` is
      added as well.

    An edge that carries no 'weight' attribute is treated as having
    weight 1.0 instead of raising ``KeyError``.
    '''
    G = self.G
    is_directed = self.is_directed

    alias_nodes = {}
    for node in G.nodes():
        # Weight of every outgoing edge, in sorted-neighbour order.
        unnormalized_probs = [
            G[node][nbr].get('weight', 1.0)
            for nbr in sorted(G.neighbors(node))
        ]
        # The sum of the weights is the normalising constant.
        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob) / norm_const for u_prob in unnormalized_probs
        ]
        alias_nodes[node] = alias_setup(normalized_probs)

    alias_edges = {}
    for src, dst in G.edges():
        alias_edges[(src, dst)] = self.get_alias_edge(src, dst)
        if not is_directed:
            # An undirected edge can be walked in either direction.
            alias_edges[(dst, src)] = self.get_alias_edge(dst, src)

    self.alias_nodes = alias_nodes
    self.alias_edges = alias_edges
    return
get_alias_edge
是得到节点到节点的概率
def get_alias_edge(self, src, dst):
    '''
    Get the alias edge setup lists for a given edge.

    ``src`` is the previous node of the walk and ``dst`` the current one.
    Every neighbour of ``dst`` (in sorted order) receives an unnormalised
    probability derived from the weight ``w`` of the edge leading to it:

    * ``w / p`` when the neighbour is ``src`` itself;
    * ``w`` when the neighbour has an edge to ``src``;
    * ``w / q`` otherwise.

    The values are normalised to sum to one and handed to ``alias_setup``,
    whose result is returned. An edge that carries no 'weight' attribute is
    treated as having weight 1.0 instead of raising ``KeyError``.
    '''
    G = self.G
    p = self.p
    q = self.q

    unnormalized_probs = []
    for dst_nbr in sorted(G.neighbors(dst)):
        weight = G[dst][dst_nbr].get('weight', 1.0)
        if dst_nbr == src:
            unnormalized_probs.append(weight / p)
        elif G.has_edge(dst_nbr, src):
            unnormalized_probs.append(weight)
        else:
            unnormalized_probs.append(weight / q)

    norm_const = sum(unnormalized_probs)
    normalized_probs = [
        float(u_prob) / norm_const for u_prob in unnormalized_probs
    ]
    return alias_setup(normalized_probs)
alias_setup
输入一组概率,得到对应的两个数组(别名表 J 和概率表 q),方便后面抽样时调用
def alias_setup(probs):
    '''
    Build the two alias-sampling tables for a discrete distribution.

    ``probs`` is a sequence of K probabilities. Returns ``(J, q)``: ``q[k]``
    is the scaled probability ``K * probs[k]`` after redistribution, and
    ``J[k]`` is the index that receives the remainder of slot ``k``. Both
    arrays are consumed by ``alias_draw``.
    '''
    K = len(probs)
    q = np.zeros(K)
    # The builtin int replaces the np.int alias, which NumPy has removed.
    J = np.zeros(K, dtype=int)

    # Split the indices by whether their scaled probability is below 1.
    smaller = []
    larger = []
    for kk, prob in enumerate(probs):
        q[kk] = K * prob
        if q[kk] < 1.0:
            smaller.append(kk)
        else:
            larger.append(kk)

    # Pair each under-full slot with an over-full one until either runs out.
    while len(smaller) > 0 and len(larger) > 0:
        small = smaller.pop()
        large = larger.pop()

        J[small] = large
        # The over-full slot gives up whatever the under-full slot lacked.
        q[large] = q[large] + q[small] - 1.0
        if q[large] < 1.0:
            smaller.append(large)
        else:
            larger.append(large)

    return J, q
alias_draw
抽样函数
def alias_draw(J, q):
    '''
    Draw one index from a discrete distribution using its alias tables.

    A slot is picked uniformly from ``range(len(J))``; a second uniform
    draw then keeps that slot when it falls below ``q[slot]`` and otherwise
    returns ``J[slot]``.
    '''
    slot = int(np.floor(np.random.rand() * len(J)))
    keep_slot = np.random.rand() < q[slot]
    return slot if keep_slot else J[slot]
node2vec_walk:给定游走长度和起始节点,模拟从该节点出发的一条随机游走路径;其中用到的函数均已在上文介绍
def node2vec_walk(self, walk_length, start_node):
    '''
    Simulate a random walk starting from start node.

    The walk grows until it holds ``walk_length`` nodes or reaches a node
    with no neighbours. The first step samples from the node's own alias
    table in ``self.alias_nodes``; every later step samples from the table
    stored in ``self.alias_edges`` under ``(previous, current)``. The drawn
    index selects from the sorted neighbours of the current node.
    '''
    graph = self.G
    node_tables = self.alias_nodes
    edge_tables = self.alias_edges

    path = [start_node]
    while len(path) < walk_length:
        current = path[-1]
        neighbours = sorted(graph.neighbors(current))
        if not neighbours:
            # Dead end: nothing to step to, so the walk stops early.
            break

        if len(path) == 1:
            # No previous node yet, so use the node-level table.
            table = node_tables[current]
        else:
            table = edge_tables[(path[-2], current)]
        path.append(neighbours[alias_draw(table[0], table[1])])

    return path