python实现基因组组装算法 De Bruijn Graph_python de brujin graph-优快云博客

本文链接：https://blog.youkuaiyun.com/dujidan/article/details/145720349

python实现基因组组装算法 De Bruijn Graph

代码实现
参考：

在这里插入图片描述

代码实现


class Node:
    """ Class Node to represent a vertex in the de bruijn graph """
    def __init__(self, lab):
        self.label = lab
        self.indegree = 0
        self.outdegree = 0

class Edge:
    def __init__(self, lab):
        self.label = lab

def read_reads(fname):
    """ Read short reads in FASTA format. It is assumed that one line in the input file correspond to one read. """
    f = open(fname, 'r')
    lines = f.readlines()
    f.close()
    reads = []

    for line in lines:
        if line[0] != '>':
            reads = reads + [line.rstrip()]

    return reads

def construct_graph(reads, k):
    """ Construct de bruijn graph from sets of short reads with k length word"""
    edges = dict()
    vertices = dict()

    for read in reads:
        i = 0
        while i+k < len(read):
            v1 = read[i:i+k]
            v2 = read[i+1:i+k+1]
            if v1 in edges.keys():
                vertices[v1].outdegree += 1
                edges[v1] += [Edge(v2)]
            else:
                vertices[v1] = Node(v1)
                vertices[v1].outdegree += 1
                edges[v1] = [Edge(v2)]
            if v2 in edges.keys():
                vertices[v2].indegree += 1
            else:
                vertices[v2] = Node(v2)
                vertices[v2].indegree += 1
                edges[v2] = []
            i += 1

    return (vertices, edges)

def output_contigs(g):
    """ Perform searching for Eulerian path in the graph to output genome assembly"""
    V = g[0]
    E = g[1]
    # Pick starting node (the vertex with zero in degree)
    start = list(V.keys())[0]
    for k in list(V.keys()):
        if V[k].indegree < V[start].indegree:
            start = k

    contig = start
    current = start
    while len(E[current]) > 0:
        # Pick the next node to be traversed (for now, at random)
        next = E[current][0]
        del E[current][0]
        contig += next.label[-1]
        current = next.label

    return contig

def print_graph(g):
    """ Print the information in the graph to be (somewhat) presentable """
    V = g[0]
    E = g[1]
    for k in V.keys():
        print ("name: ", V[k].label, ". indegree: ", V[k].indegree, ". outdegree: ", V[k].outdegree)
        print ("Edges: ")
        for e in E[k]:
            print (e.label)
        print()

if __name__ == "__main__":
    # Main script
    fname = '/g200reads.fa'
    reads = read_reads(fname)
    print (reads)
    g = construct_graph(reads, 11)
    contig = output_contigs(g)
    print (contig)
    exit()
    
    test = ['bcdefg', 'defghi', 'abcd']
    g = construct_graph(test, 3)
    # print_graph(g)
    # for k in g.keys():
    #   print( k, g[k])
    contig = output_contigs(g)

    print (contig)

参考：

https://zhuanlan.zhihu.com/p/57177938
https://data-science-sequencing.github.io/Win2018/lectures/lecture7/

https://github.com/mitbal/py-debruijn
https://github.com/stutiagrawal/Python-DeBruijn/blob/master/deBruijn.py
https://github.com/10XGenomics/rust-debruijn