Needleman-Wunsch Algorithm

本文详细介绍了如何利用C++编程语言实现Needleman-Wunsch算法,对两个不同个体的基因序列进行相似性比对。通过优化的全局序列对齐方法,即使在进化过程中发生了插入、删除或核苷酸变化,也能得到良好的匹配结果。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

Let's say you have two genetic sequences from two different individuals, and you want to know whether they represent the same gene. How would you go about it? You could let your computer do a simple string comparison, but then you would almost never get a match, even if the two nucleotide sequences do come from the same gene. That's because there is so much genetic variation, especially if the individuals come from two different species. What you need is a more sophisticated approach, one that will give you a good match even if the sequences have diverged in the course of evolution through mutations such as insertions, deletions or nucleotide changes.

This is what the Needleman-Wunsch algorithm does. It optimizes the alignment between two similar sequences, and assumes that the sequences are similar throughout. In other words, it is a global alignment algorithm. The Smith-Watermann algorithm is even more useful, because it doesn't assume that the sequences should be similar everywhere, but instead tries to line up sections of the sequences that give a good match. I implemented the Needleman-Wunsch Algorithm in C++ based on its description in Biological Sequence Analysis, published in 1998. The book is an excellent introduction to biological sequence analysis, including other dynamic programming algorithms, such as hidden Markov models.

C++ Source Code:

  • main.cpp
  • /*---------------------------------------------------------------
     *
     *   main.cpp for nw program.
     *
     *   Implements the Needleman-Wunsch algorithm
     *   for global nucleotide sequence alignment.
     *
     *   Rolf Muertter,  rolf@dslextreme.com
     *   9/5/2006
     *
     ---------------------------------------------------------------*/
    
    
    #include "nw.h"
    
    using namespace std;
    
    
    int  main( int argc, char ** argv )
    {
            char *  program = *argv ;
            bool    prm = false;
    
    
            if( argc < 3 )
            {
                    cerr << "\n   Usage:   " << program << " seq_1 seq_2 [-p]\n";
                    cerr << "\n   -p:       Print matrices\n";
                    cerr << "\n   Output:   alignment\n\n";
    
                    exit( 1 ) ;
            }                       
            
            // Sequences to be aligned
            string  seq_1   =  argv[ 1 ] ;
            string  seq_2   =  argv[ 2 ] ;
    
            if( argc == 4 )
            {
                    string  pr_arg  =  argv[ 3 ] ;
                    if( pr_arg == "-p" )  prm = true;   // Print matrices
            }                       
    
            #if DEBUG
                cout << "seq_1: " << seq_1 << endl;
                cout << "seq_2: " << seq_2 << endl;
                cout << "-p: " << prm << endl;
            #endif
    
            // Aligned sequences
            string  seq_1_al;
            string  seq_2_al;
    
            // Get alignment
            nw( seq_1, seq_2, seq_1_al, seq_2_al, prm ) ;   
    
            // Print alignment
            print_al( seq_1_al, seq_2_al ) ;        
    
            return  0 ;
    }
    


  • nw.cpp
  • /*-------------------------------------------
     * 
     *        nw.cpp for program nw
     * 
     -------------------------------------------*/
    
    #include "nw.h"
    
    using namespace std;
    
    
    int nw(                                                          
            string       seq_1,          /*  Needleman-Wunsch   */
            string       seq_2,          /*  algorithm for      */
            string&      seq_1_al,       /*  global alignment   */
            string&      seq_2_al,       /*  of nt sequence.    */
            bool         prm
          )
    {
            int  d = 2 ;                 /* gap penalty */
    
            int  L1 = seq_1.length();
            int  L2 = seq_2.length();
    
            // Dynamic programming matrix
            int ** F = new int * [ L2+1 ];
            for( int i = 0; i <= L2; i++ )  F[ i ] = new int [ L1 ];
    
            // Traceback matrix
            char ** traceback = new char * [ L2+1 ];
            for( int i = 0; i <= L2; i++ )  traceback[ i ] = new char [ L1 ];
    
            // Initialize traceback and F matrix (fill in first row and column)
            dpm_init( F, traceback, L1, L2, d );
    
            // Create alignment
            nw_align( F, traceback, seq_1, seq_2, seq_1_al, seq_2_al, d );
    
            #if DEBUG
                int  L_al = seq_1_al.length();
                cout << "Length after alignment: " << L_al << endl;
            #endif
    
            if( prm )
            {
                    cout << "\nDynamic programming matrix: " << "\n\n";
                    print_matrix( F, seq_1, seq_2 );
    
                    cout << "\nTraceback matrix: " << "\n\n";
                    print_traceback( traceback, seq_1, seq_2 );
    
                    cout << endl;
            }
    
            for( int i = 0; i <= L2; i++ )  delete F[ i ];
            delete[] F;
            for( int i = 0; i <= L2; i++ )  delete traceback[ i ];
            delete[] traceback;
    
            return  0 ;
    }
    
    
    void  dpm_init( int ** F, char ** traceback, int L1, int L2, int d )
    {
            F[ 0 ][ 0 ] =  0 ;
            traceback[ 0 ][ 0 ] = 'n' ;
    
            int i=0, j=0;
    
            for( j = 1; j <= L1; j++ )
            {
                    F[ 0 ][ j ] =  -j * d ;
                    traceback[ 0 ][ j ] =  '-' ;
            }
            for( i = 1; i <= L2; i++ )
            {
                    F[ i ][ 0 ] =  -i * d ;
                    traceback[ i ][ 0 ] =  '|' ;
            }
    }
    
    
    int nw_align(                  // Needleman-Wunsch algorithm
                  int **     F,
                  char **    traceback,
                  string     seq_1,
                  string     seq_2,
                  string&    seq_1_al,
                  string&    seq_2_al,
                  int        d         // Gap penalty
                )
    {
            int        k = 0, x = 0, y = 0;
            int        fU, fD, fL ;
            char       ptr, nuc ;
            int        i = 0, j = 0;
    
            const int  a =  2;   // Match
            const int  b = -1;   // Mismatch
    
            const int  s[ 4 ][ 4 ] = { { a, b, b, b },    /* substitution matrix */
                                       { b, a, b, b },
                                       { b, b, a, b },
                                       { b, b, b, a } } ;
    
            int  L1 = seq_1.length();
            int  L2 = seq_2.length();
    
            for( i = 1; i <= L2; i++ )
            {
                    for( j = 1; j <= L1; j++ )
                    {
                            nuc = seq_1[ j-1 ] ;
    
                            switch( nuc )
                            {
                                    case 'A':  x = 0 ;  break ;
                                    case 'C':  x = 1 ;  break ;
                                    case 'G':  x = 2 ;  break ;
                                    case 'T':  x = 3 ;
                            }
    
                            nuc = seq_2[ i-1 ] ;
    
                            switch( nuc )
                            {
                                    case 'A':  y = 0 ;  break ;
                                    case 'C':  y = 1 ;  break ;
                                    case 'G':  y = 2 ;  break ;
                                    case 'T':  y = 3 ;
                            }
    
                            fU = F[ i-1 ][ j ] - d ;
                            fD = F[ i-1 ][ j-1 ] + s[ x ][ y ] ;
                            fL = F[ i ][ j-1 ] - d ;
    
                            F[ i ][ j ] = max( fU, fD, fL, &ptr ) ;
    
                            traceback[ i ][ j ] =  ptr ;
                    }
            }
            i-- ; j-- ;
    
            while( i > 0 || j > 0 )
            {
                    switch( traceback[ i ][ j ] )
                    {
                            case '|' :      seq_1_al += '-' ; 
                                            seq_2_al += seq_2[ i-1 ] ; 
                                            i-- ;
                                            break ;
    
                            case '\\':      seq_1_al += seq_1[ j-1 ] ; 
                                            seq_2_al += seq_2[ i-1 ] ; 
                                            i-- ;  j-- ;
                                            break ;
    
                            case '-' :      seq_1_al += seq_1[ j-1 ] ; 
                                            seq_2_al += '-' ; 
                                            j-- ;
                    }
                    k++ ;
            }
            
            reverse( seq_1_al.begin(), seq_1_al.end() );
            reverse( seq_2_al.begin(), seq_2_al.end() );
    
            return  0 ;
    }
    
    
    int  max( int f1, int f2, int f3, char * ptr )
    {
            int  max = 0 ;
    
            if( f1 >= f2 && f1 >= f3 )  
            {
                    max = f1 ;
                    *ptr = '|' ;
            }
            else if( f2 > f3 )              
            {
                    max = f2 ;
                    *ptr = '\\' ;
            }
            else
            {
                    max = f3 ;
                    *ptr = '-' ;
            }
            
            return  max ;   
    }
    
    
    void  print_matrix( int ** F, string seq_1, string seq_2 )
    {
            int  L1 = seq_1.length();
            int  L2 = seq_2.length();
    
            cout << "        ";
            for( int j = 0; j < L1; j++ )
            {
                    cout << seq_1[ j ] << "   ";
            }
            cout << "\n  ";
    
            for( int i = 0; i <= L2; i++ )
            {
                    if( i > 0 )
                    {
                            cout << seq_2[ i-1 ] << " ";
                    }
                    for( int j = 0; j <= L1; j++ )
                    {
                            cout.width( 3 );
                            cout << F[ i ][ j ] << " ";
                    }
                    cout << endl;
            }
    }
    
    
    void  print_traceback( char ** traceback, string seq_1, string seq_2 )
    {
            int  L1 = seq_1.length();
            int  L2 = seq_2.length();
    
            cout << "    ";
            for( int j = 0; j < L1; j++ )
            {
                    cout << seq_1[ j ] << " ";
            }
            cout << "\n  ";
    
            for( int i = 0; i <= L2; i++ )
            {
                    if( i > 0 )
                    {
                            cout << seq_2[ i-1 ] << " ";
                    }
                    for( int j = 0; j <= L1; j++ )
                    {
                            cout << traceback[ i ][ j ] << " ";
                    }
                    cout << endl;
            }
    }
    
    
    void  print_al( string& seq_1_al, string& seq_2_al )
    {
            cout << seq_1_al << endl;
            cout << seq_2_al << endl;
    }
    
    


  • nw.h
  • /*
     * nw.h for program nw.
     *
     */
    
    #include <iostream>
    #include <string>
    #include <algorithm>
    #include <stdlib.h>
    #include <stdio.h>
    
    #define DEBUG 0
    
    using namespace std;
    
    
    extern int  nw( 
                     string, string, 
                     string&, string&,
                     bool
                  );
    
    extern int  nw_align( 
                          int **, char **,
                          string, string, 
                          string&, string&,
                          int 
                        );
    
    extern void  dpm_init        ( int **, char **, int, int, int );
    extern void  print_al        ( string&, string& );
    extern void  print_matrix    ( int ** const, string, string );
    extern void  print_traceback ( char ** const, string, string );
    extern int   max             ( int, int, int, char * );
    




评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值