https://github.com/sunxiangguo/NRL_deepwalk
1. Analysis of the original code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# import logging
from io import open
from os import path
# from time import time
from multiprocessing import cpu_count
import random
from concurrent.futures import ProcessPoolExecutor
from collections import Counter
from six.moves import zip
from deepwalk.myanalysis import graph
# logger = logging.getLogger("deepwalk")
__current_graph = None
# speed up the string encoding (note: this variable never appears again in the code)
# __vertex2str = None
"""
count_words(file)和
count_textfiles(files, workers=1)的作用是
计算语料库中各个词的词频
"""
def count_words(file):
"""
    Counts the word frequencies in a list of sentences.
    `file` contains a series of walks, one walk per line; the function returns
    how often each word occurs in the file.
    Note:
      This is a helper function for parallel execution of `Vocabulary.from_text`
      method.
    example (a runnable sketch also follows the function):
        file='./output.walks.0'
        print(count_words(file))
    Output:
Counter({'34': 1446, '1': 1414, '33': 920, '3': 914, '2': 752, '4': 537,
'32': 520, '9': 443, '14': 432, '24': 420, '6': 408, '7': 393,
'28': 377, '31': 355, '8': 346, '30': 320, '25': 295, '11': 292,
'5': 284, '29': 281, '20': 271, '26': 265, '17': 196, '10': 190,
'18': 181, '15': 173, '21': 168, '13': 167, '19': 159, '22': 155,
'27': 153, '16': 152, '23': 134, '12': 87})
"""
c = Counter()
with open(file, 'r') as f:
for l in f:
words = l.strip().split()
c.update(words)
return c
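"""
A minimal runnable sketch (not part of the original file; 'demo.walks' is a
hypothetical two-line walk file):
    with open('demo.walks', 'w') as f:
        f.write(u'1 2 3 1\n2 1 2\n')
    print(count_words('demo.walks'))
    # on Python 3.7+ this prints: Counter({'1': 3, '2': 3, '3': 1})
"""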
def count_textfiles(files, workers=1):
"""
    This function uses multiprocessing (not multithreading), so we first need to
    understand ProcessPoolExecutor.
c = Counter()
with ProcessPoolExecutor(max_workers=workers) as executor:
for c_ in executor.map(count_words, files):
c.update(c_)
return c
    This is the standard usage pattern. It means:
    start `workers` worker processes and refer to the pool as a whole as
    `executor`; in other words, `executor` is a ProcessPoolExecutor instance.
    executor.map(called_function, input_array)
    accomplishes, in parallel, the same thing as:
    result=[]
    for item in input_array:
        re=called_function(item)
        result.append(re)
    The difference is that this plain loop runs serially, while
    executor.map(called_function, input_array) runs the calls in parallel
    and is clearly more concise (see the sketch after this function).
    count_words returns a Counter, so
    c = Counter()
    for c_ in executor.map(count_words, files):
        c.update(c_)
    means: each time a Counter (c_ in the code) arrives, use it to update
    the global Counter (c in the code).
    More on `from collections import Counter`:
    Counter is a subclass of dict, so it is essentially still a dictionary.
Dict subclass for counting hashable items. Sometimes called a bag
or multiset. Elements are stored as dictionary keys and their counts
are stored as dictionary values.
    >>> c = Counter('abcdeabcdabcaba')  # count elements from a string
    >>> c.most_common(3)                # three most common elements
    [('a', 5), ('b', 4), ('c', 3)]
    >>> sorted(c)                       # list all unique elements
    ['a', 'b', 'c', 'd', 'e']
    >>> ''.join(sorted(c.elements()))   # list elements with repetitions
    'aaaaabbbbcccdde'
    >>> sum(c.values())                 # total of all counts
    15
    >>> c['a']                          # count of letter 'a'
    5
    >>> for elem in 'shazam':           # update counts from an iterable
    ...     c[elem] += 1                # by adding 1 to each element's count
    >>> c['a']                          # now there are seven 'a'
    7
    >>> del c['b']                      # remove all 'b'
    >>> c['b']                          # now there are zero 'b'
    0
    >>> d = Counter('simsalabim')       # make another counter
    >>> c.update(d)                     # add in the second counter
    >>> c['a']                          # now there are nine 'a'
    9
    >>> c.clear()                       # empty the counter
    >>> c
    Counter()
    Note: If a count is set to zero or reduced to zero, it will remain
    in the counter until the entry is deleted or the counter is cleared:
    >>> c = Counter('aaabbc')
    >>> c['b'] -= 2                     # reduce the count of 'b' by two
    >>> c.most_common()                 # 'b' is still in, but its count is zero
    [('a', 3), ('c', 1), ('b', 0)]
"""
c = Counter()
with ProcessPoolExecutor(max_workers=workers) as executor:
for c_ in executor.map(count_words, files):
c.update(c_)
return c
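"""
A minimal sketch (file names are hypothetical) of the serial/parallel
equivalence described in the docstring above:
    files = ['./output.walks.0', './output.walks.1']
    # serial version
    c = Counter()
    for f in files:
        c.update(count_words(f))
    # parallel version: same result, but count_words runs in worker processes
    c = Counter()
    with ProcessPoolExecutor(max_workers=2) as executor:
        for c_ in executor.map(count_words, files):
            c.update(c_)
"""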
def count_lines(f):
if path.isfile(f): # Test whether a path is a regular file
num_lines = sum(1 for line in open(f))
"""
上面这行代码相当于:
a=[1 for line in open(f)] # a=[1,1,1,1,1,...,1]
num_lines = sum(a)
"""
return num_lines
else:
return 0
def _write_walks_to_disk(args):
"""
    :param args: (ppw, path_length, alpha, random.Random(rand.randint(0, 2 ** 31)), file_)
    ppw: number of walks assigned to a single worker
    path_length: length of each random walk
    alpha: probability of stopping the walk and restarting from the start node of the path
    random.Random(rand.randint(0, 2 ** 31)): a random generator seeded from the outer seed `rand`
    file_: './output.walks.0', the file the walks are written to
    (an example argument tuple follows the function)
:return:
"""
num_paths, path_length, alpha, rand, f = args
G = __current_graph
# t_0 = time()
with open(f, 'w') as fout:
for walk in graph.build_deepwalk_corpus_iter(G=G, num_paths=num_paths, path_length=path_length,
alpha=alpha, rand=rand):
fout.write(u"{}\n".format(u" ".join(v for v in walk)))
# logger.debug("Generated new file {}, it took {} seconds".format(f, time() - t_0))
return f
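"""
A hedged example of one such argument tuple (the concrete values are
hypothetical; note that write_walks_to_disk below sets the module-level
__current_graph before the workers call this function):
    args = (3, 40, 0, random.Random(random.Random(0).randint(0, 2 ** 31)), './output.walks.0')
    _write_walks_to_disk(args)  # writes 3 * len(G) walks, returns './output.walks.0'
"""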
def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
always_rebuild=False):
"""
:param G:
:param filebase: './output.walks'(实际的文件是'./output.walks.x')
:param num_paths: 每一个起点走几轮
:param path_length: 随机游走路径长度
:param alpha:alpha=0, 指的是以概率1-alpha从当前节点继续走下去(可以回到上一个节点),或者以alpha的概率停止继续走,回到路径的起点重新走。
请注意:这里的随机游走路径未必是连续的,有可能是走着走着突然回到起点接着走
即便alpha=0,也不能保证以后的路径不会出现起点,因为有可能从起点开始走到第二个点时,这第二个点以1的概率继续走下去,
这时候它可能会回到起点。
但是如果alpha=1,那么就意味着它不可能继续走下去,只能每次都从起点开始重新走,所以这个时候打印出来的路径序列是一连串的起点。
:param rand: 随机数种子,确保随机过程可以复现
:param num_workers: 进程处理器个数
:param always_rebuild:
:return:
"""
global __current_graph
"""
关于global变量的使用,请参考博客
[python基础 - global关键字及全局变量的用法](https://blog.youkuaiyun.com/diaoxuesong/article/details/42552943)
"""
__current_graph = G
# files_list = ["{}.{}".format(filebase, str(x)) for x in list(range(num_paths))]
"""从代码分析来看,上面这句是写错了,应该这样写:"""
files_list = ["{}.{}".format(filebase, str(x)) for x in list(range(num_workers))]
print("num_paths is: ", num_paths)
print([str(x) for x in list(range(num_paths))])
print("file_lists:", files_list)
"""
'./output.walks.0'
'./output.walks.1'
'./output.walks.2'
...
"""
    expected_size = len(G)  # number of nodes
    args_list = []  # argument tuples, one per walk file still to be generated (filled below)
    files = []  # files_list is just file names; `files` holds the files that actually contain walks
if num_paths <= num_workers:
paths_per_worker = [1 for x in range(num_paths)]
"""
如果采样轮数小于处理器个数,
那么其中一些处理器中,每一个处理器负责其中一轮采样,
剩下一些处理器则没有任务
"""
else:
paths_per_worker = [len(list(filter(lambda z: z != None, [y for y in x])))
for x in graph.grouper(int(num_paths / num_workers) + 1, range(1, num_paths + 1))]
"""
grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')
第一个参数是整数int,第二个参数是可迭代对象,例如range(1,7+1) (返回1,2,3,4,5,6,7的迭代对象)
如果第三个参数不写,则default为None,这样
grouper(3, range(1,7+1))-->(1,2,3),(4,5,6),(7,None,None)
假如现在有7轮,有3个workers那么每个worker至少需要处理7/3=2.3333...轮,为了保证所有轮数都被处理
向上取整得3轮.即int(7/3)+1,
轮数序列号为:1,2,3,4,5,6,7,所以每3个轮数被分成一组,得到:
(1,2,3)(4,5,6)(7,None,None)
filter(function, seq)可以从seq中筛选出是的function为True的元素
filter(lambda z: z != None, [y for y in x])
意思就是说,对于刚才得到的(1,2,3)(4,5,6)(7,None,None)中的任何一个进行筛选,例如
若x=(7,None,None)则
seq=[y for y in x]=[7,None,None]
function相当于:
fun(z):
if z!=None:
return True
这里的z是[7,None,None]中的每一个元素。
这样通过filter之后,得到的结果为:
[7]
再将其进行len(list([7]))得到1,即这一个worker处理的轮数为1轮
存在的问题:如果有6轮,3个处理器,那么按照它的方法,分配的结果是
处理器1:1,2,3
处理器2:4,5,6
处理器3:None
"""
"""
        In fact, the if-else above can be merged into a single expression:
paths_per_worker = [len(list(filter(lambda z: z != None, [y for y in x])))
for x in graph.grouper(int(num_paths / num_workers)
if num_paths % num_workers == 0
else int(num_paths / num_workers) + 1,
range(1, num_paths + 1))]
"""
with ProcessPoolExecutor(max_workers=num_workers) as executor:
for size, file_, ppw in zip(executor.map(count_lines, files_list), files_list, paths_per_worker):
"""
            With 7 rounds and 3 workers, the original code gives paths_per_worker=[3,3,1]:
size file_ ppw
0 './output.walks.0' 3
0 './output.walks.1' 3
0 './output.walks.2' 1
"""
if always_rebuild or size != (ppw * expected_size):
args_list.append((ppw, path_length, alpha, random.Random(rand.randint(0, 2 ** 31)), file_))
"""
            random.Random(rand.randint(0, 2 ** 31)) means:
            here `rand` is the seed object passed in from outside: rand=random.Random(0).
            That generator is used to draw a random integer between 0 and 2^31,
            i.e. random.Random(0).randint(0, 2 ** 31), and this integer then
            seeds a fresh generator:
            random.Random(random.Random(0).randint(0, 2 ** 31))
"""
else:
files.append(file_)
"""
这里的if-else语句的意思是:
假如我们现在有20个节点,7轮,3个处理器(各个处理器处理的轮数应为:3,3,1),那么最终要生成三个文件应为:
'./output.walks.0' 3*20=60行
'./output.walks.1' 3*20=60行
'./output.walks.2' 1*20=20行
从中可以看出ppw * expected_size表示当前文件理应存入获取的行数
假如我们因为某些原因,已经生成好了'./output.walks.0',但是半道停了下来,再次开始时,我们希望可以接着做,而不是从新生成
'./output.walks.0'文件,即我们希望always_rebuild=False
那么这个时候我们就要判断一下已有的'./output.walks.0'文件是不是完成了,
如若还没有完成,即size != (ppw * expected_size),那么这个文件就重新开始。
如果已经写完了,那么直接把这个文件添加到files里面就可以了。即files.append(file_)
args_list实际生成了多少个参数组,就意味着我们有多少个文件需要完成任务,每一个参数组对应一个处理器因此后面的代码写的是:
for file_ in executor.map(_write_walks_to_disk, args_list):
files.append(file_)
"""
with ProcessPoolExecutor(max_workers=num_workers) as executor:
for file_ in executor.map(_write_walks_to_disk, args_list):
files.append(file_)
return files
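"""
A hedged usage sketch (the adjacency-list loader and file names follow the
repo's examples and are assumptions, not part of this file):
    G = graph.load_adjacencylist('karate.adjlist')
    files = write_walks_to_disk(G, './output.walks', num_paths=7, path_length=40,
                                alpha=0, rand=random.Random(0), num_workers=3)
    print(files)  # ['./output.walks.0', './output.walks.1', './output.walks.2']
"""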
class WalksCorpus(object):
def __init__(self, file_list):
"""
        :param file_list: the list of walk files that write_walks_to_disk wrote to disk
"""
self.file_list = file_list
def __iter__(self):
"""
Python 中的顺序类型,都是可迭代的(list, tuple, string)。
其余包括 dict, set, file 也是可迭代的。
对于用户自己实现的类型,如果提供了 __iter__() 或者 __getitem__() 方法,
那么该类的对象也是可迭代的。
假如file_list=['output.walks.0','output.walks.1']其中
'output.walks.0':
8 2 4 8 2 31 34 28 24 28 25 28 3 10 3
2 1 22 1 18 2 8 2 4 2 18 1 8 3 2 1
'output.walks.1':
32 25 26 32 29 3 33 31 9 33 16 34
6 11 1 20 34 30 24 30 24 28 3 1 14
那么这个函数返回后得到的就是:
[ [8, 2, 4, 8, 2, 31, 34, 28, 24, 28, 25, 28, 3, 10, 3],
[2, 1, 22, 1, 18, 2, 8, 2, 4, 2, 18, 1, 8, 3, 2, 1],
[32, 25, 26, 32, 29, 3, 33, 31, 9, 33, 16, 34 ],
[6, 11, 1, 20, 34, 30, 24, 30, 24, 28, 3, 1, 14]
]
更多关于迭代和yield的内容,可以参考博文
[Python 中的黑暗角落(一):理解 yield 关键字](https://liam0205.me/2017/06/30/understanding-yield-in-python/)
:return:
"""
for file in self.file_list:
with open(file, 'r') as f:
for line in f:
yield line.split()
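"""
A hedged sketch of how WalksCorpus is typically consumed (as in the deepwalk
driver script; the gensim parameter values here are assumptions, and gensim 4+
renames size to vector_size):
    from gensim.models import Word2Vec
    walks = WalksCorpus(files)  # restartable iterable of token lists
    model = Word2Vec(walks, size=64, window=5, min_count=0, sg=1, workers=4)
"""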
"""
The function below was scratch code the author forgot to delete; it is never
used anywhere else.
"""
def combine_files_iter(file_list):
for file in file_list:
with open(file, 'r') as f:
for line in f:
yield line.split()
2. The simplified code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@project:deepwalk-master
@author:xiangguosun
@contact:sunxiangguodut@qq.com
@website:http://blog.youkuaiyun.com/github_36326955
@file: simple_walks.py
@platform: macOS High Sierra 10.13.1 Pycharm pro 2017.1
@time: 2018/09/14
"""
from io import open
from os import path
from multiprocessing import cpu_count
import random
from concurrent.futures import ProcessPoolExecutor
from collections import Counter
from six.moves import zip
from deepwalk.myanalysis import graph
__current_graph = None
def count_words(file):
c = Counter()
with open(file, 'r') as f:
for l in f:
words = l.strip().split()
c.update(words)
return c
def count_textfiles(files, workers=1):
c = Counter()
with ProcessPoolExecutor(max_workers=workers) as executor:
for c_ in executor.map(count_words, files):
c.update(c_)
return c
def count_lines(f):
if path.isfile(f): # Test whether a path is a regular file
num_lines = sum(1 for line in open(f))
return num_lines
else:
return 0
def _write_walks_to_disk(args):
num_paths, path_length, alpha, rand, f = args
G = __current_graph
with open(f, 'w') as fout:
for walk in graph.build_deepwalk_corpus_iter(G=G, num_paths=num_paths, path_length=path_length,
alpha=alpha, rand=rand):
fout.write(u"{}\n".format(u" ".join(v for v in walk)))
return f
def write_walks_to_disk(G, filebase, num_paths, path_length, alpha=0, rand=random.Random(0), num_workers=cpu_count(),
always_rebuild=False):
global __current_graph
__current_graph = G
files_list = ["{}.{}".format(filebase, str(x)) for x in list(range(num_workers))]
expected_size = len(G)
args_list = []
files = []
paths_per_worker = [len(list(filter(lambda z: z != None, [y for y in x])))
for x in graph.grouper(int(num_paths / num_workers)
if num_paths % num_workers == 0
else int(num_paths / num_workers) + 1,
range(1, num_paths + 1))]
with ProcessPoolExecutor(max_workers=num_workers) as executor:
for size, file_, ppw in zip(executor.map(count_lines, files_list), files_list, paths_per_worker):
if always_rebuild or size != (ppw * expected_size):
args_list.append((ppw, path_length, alpha, random.Random(rand.randint(0, 2 ** 31)), file_))
else:
files.append(file_)
with ProcessPoolExecutor(max_workers=num_workers) as executor:
for file_ in executor.map(_write_walks_to_disk, args_list):
files.append(file_)
return files
class WalksCorpus(object):
def __init__(self, file_list):
self.file_list = file_list
def __iter__(self):
for file in self.file_list:
with open(file, 'r') as f:
for line in f:
yield line.split()