Boyer Moore Pattern Matching Algorithm

最新推荐文章于 2023-05-22 16:12:33 发布

Allen

最新推荐文章于 2023-05-22 16:12:33 发布

阅读量2.9k

点赞数

分类专栏：编程技巧文章标签： algorithm 算法 iostream struct pair null

本文链接：https://blog.youkuaiyun.com/daineng/article/details/693866

版权

编程技巧专栏收录该内容

13 篇文章

订阅专栏

本文介绍了Boyer-Moore算法的工作原理，通过实例展示了算法如何通过跳过部分比较来提高效率。文章包含算法的简单实现及一个演示程序，探讨了在特定情况下的效率问题，并提出快速匹配的概念作为增强算法的可能途径。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

/*****************************************************
* Boyer Moore Pattern Matching Algorithm
* 20060427 by daineng@nj.cpsecure
*****************************************************/

这个算法的特点在于从pattern的最后一位开始比较，一旦不
符合则把pattern提前到当前比较位置上的元素和pattern中某
个一致为止。

Boyer_Moore的算法说明有很多，但不是很明白，结合下面的
数据将会很容易理解这个算法。（由于字符位置比较重要，
下面的例子需要在等字符宽度的字体下显示才有效果）

抱歉我没有太多的时间说明这个问题。

-------------------------------------------------
         1         2         3         4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
This is a test of the Boyer Moore algorithm
algorithm
87654321
        >> (sf['a'] - 0) = 8
        algorithm
                 >> (sf['f'] - 0) = 9
                 algorithm
                          >> (sf['e'] - 0) = 9
                          algorithm
                                  >> (sf['a'] - 0) = 8
                                  algorithm

          |
    xxxxBooooxxxx
      Boooo
          |

-------------------------------------------------
         1         2         3         4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
aaaaaaabcc.....
aaabbbbccc
777333311
>> (sf['b'] - 2) = 1
aaabbbbccc
aaaaaaabccabaaabbbbccc....
        >> (sf['a'] - 0) = 7
        aaabbbbccc
           >> (sf['b'] - 0) = 3
           aaabbbbccc
            >> (sf['b'] - 2) = 1
            aaabbbbccc

CooooBooooo
Booo
>> (sf[C] - 3) = 1
Booo
>> (sf[o] - 3) = -2

=============================================================
相信到这里你已经明白这个算法了，下面给出一个只匹配字符串的
简单例子
=============================================================

//////////////////////////////////////////////////
// matcher_bm.h
// Boyer Moore
// 20060427 by daineng@nj.cpsecure

#ifndef _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_
#define _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_

struct st_chpat;

/**
 * Single-pattern string matcher using a Boyer-Moore style
 * bad-character shift table.
 *
 * Usage: construct with the pattern, call init() to build the shift
 * table, then call scan() on each target string.
 *
 * The implementation state lives in st_chpat, which is only
 * forward-declared here and defined in matcher_bm.cpp.
 */
class matcher_bm
{
    public:
        /// @param pattern NUL-terminated pattern. The pointer itself is
        ///        stored, not a copy, so the string must outlive the matcher.
        matcher_bm(const char*);
        ~matcher_bm();

    public:
        /// (Re)builds the shift table for the pattern; call before scan().
        void init();

        /// Searches the given NUL-terminated target for the pattern.
        /// @return pointer to the first occurrence inside the target,
        ///         or NULL when the pattern does not occur.
        const char* scan(const char*);

    private:
        // Not copyable: m_chpat is released in the destructor, so a
        // member-wise copy would free the same state twice. Declared
        // private and intentionally left undefined.
        matcher_bm(const matcher_bm&);
        matcher_bm& operator=(const matcher_bm&);

    private:
        struct st_chpat *m_chpat;   // owned implementation state
};

#endif

//////////////////////////////////////////////////
// matcher_bm.cpp
// Boyer Moore
// 20060427 by daineng@nj.cpsecure

#include <iostream>
#include <map>
#include <string.h>

#include "matcher_bm.h"

using namespace std;

typedef map<char, int>::iterator sf_itor;

/**
 * Shift table: maps a character to the number of positions the pattern
 * may be advanced when that character is met in the target.
 *
 * Lookups through operator[] never insert; a character without an entry
 * yields the fallback value stored in `dvaule`.
 */
class chpat_sf : public std::map<char, int> {
    public:
        /// Starts with an empty table and a zero fallback, so a lookup made
        /// before the owner assigns `dvaule` reads a defined value.
        chpat_sf() : dvaule(0) {}

        /**
         * Read-only lookup; hides std::map::operator[] so that a miss
         * does not create an entry.
         * @param key character to look up.
         * @return the stored shift for `key`, or `dvaule` when absent.
         */
        int operator[](std::map<char, int>::key_type key) const {
            const std::map<char, int>::const_iterator hit = find(key);
            return hit == end() ? dvaule : hit->second;
        }

        /// Fallback shift returned for characters that have no entry.
        /// (Spelling kept as-is: other code refers to this member by name.)
        int dvaule;
};

// Implementation state behind matcher_bm; the header only forward-declares it.
struct st_chpat {
    const char *_pat;    // pattern text; the constructor stores the caller's pointer, no copy is made
    int pat_len;         // strlen() of _pat, computed once in the constructor
    chpat_sf *_shift;    // heap-allocated shift table; deleted in ~matcher_bm
};

/**
 * Creates a matcher for `_pattern` and builds its shift table.
 *
 * @param _pattern NUL-terminated pattern. Only the pointer is stored, so
 *        the string must stay alive for the lifetime of the matcher.
 *        NULL is treated as the empty pattern.
 *
 * If an allocation fails, everything allocated so far is released before
 * the exception propagates.
 */
matcher_bm::matcher_bm(const char *_pattern) {
    // Never hand a null pointer to strlen().
    const char *pat = (_pattern != NULL) ? _pattern : "";

    m_chpat = new struct st_chpat;
    m_chpat->_shift = NULL;
    m_chpat->_pat = pat;
    m_chpat->pat_len = static_cast<int>(strlen(pat));

    try {
        m_chpat->_shift = new chpat_sf;
        // Build the table right away so scan() is usable even when the
        // caller does not call init(); a later init() just rebuilds it.
        init();
    } catch (...) {
        // The destructor does not run for a half-built object.
        delete m_chpat->_shift;
        delete m_chpat;
        throw;
    }
}

/// Releases the shift table first, then the state struct that points to it.
matcher_bm::~matcher_bm()
{
    st_chpat *const state = m_chpat;
    delete state->_shift;
    delete state;
}

typedef pair <char, int> sf_pair;

void matcher_bm::init()
{
    m_chpat->_shift->dvaule = m_chpat->pat_len;
    m_chpat->_shift->clear();
    int size = m_chpat->pat_len - 1;
    pair<sf_itor, bool> pr;
    for (int iLoc(0); iLoc < size; iLoc++) {
        int shift = size - iLoc;
        pr = m_chpat->_shift->insert(sf_pair(*(m_chpat->_pat + iLoc), shift));
        if (!pr.second)
            pr.first->second = shift;
    }
}

/**
 * Searches `_target` for the stored pattern, comparing each alignment
 * from the pattern's last character backwards.
 *
 * On a mismatch after `i` matched characters, the alignment advances by
 * (shift of the mismatching target character) - i, and by at least 1.
 *
 * A trace (ruler, target, pattern, and the remaining target text at every
 * alignment tried) is written to standard output.
 *
 * @param _target NUL-terminated text to search; NULL yields NULL.
 * @return pointer to the first occurrence inside `_target`, or NULL when
 *         there is none. An empty pattern matches at `_target` itself.
 */
const char* matcher_bm::scan(const char *_target) {
    if (_target == NULL)
        return NULL;

    const char *const pat = m_chpat->_pat;
    const size_t pat_len = static_cast<size_t>(m_chpat->pat_len);
    const size_t target_len = strlen(_target);

    cout << "          01234567890123456789012345678901234567890123456789" << endl;
    cout << "target : " << _target << endl;
    cout << "pattern : " << pat << endl;

    // `pos` is the offset of the current alignment. Working with bounded
    // offsets avoids ever forming a pointer outside [_target, _target + len].
    size_t pos = 0;
    while (pos <= target_len && pat_len <= target_len - pos) {
        const char *const window = _target + pos;
        cout << "match   : " << window << endl;

        // Count how many trailing characters of the window match.
        size_t i = 0;
        while (i < pat_len && pat[pat_len - 1 - i] == window[pat_len - 1 - i])
            ++i;
        if (i == pat_len)
            return window;

        const char bad = window[pat_len - 1 - i];
        int step = m_chpat->_shift->operator[](bad) - static_cast<int>(i);
        if (step < 1) step = 1;   // the table can suggest a non-positive move
        //cout << ">> (sf['" << bad << "'] - "
            //<< i << ") = " << step << endl;
        pos += static_cast<size_t>(step);
    }
    return NULL;
}

//////////////////////////////////////////////////
// demo.cpp 这是一个演示的程序
// Boyer Moore
// 20060427 by daineng

#include <iostream>
#include "matcher_bm.h"

using namespace std;

// One demo case: the text to search and the pattern to look for in it.
struct st_target_pat {
    const char *target;
    const char *pattern;
};

// Demo cases, terminated by an entry whose members are both NULL.
st_target_pat target_pat[] = {
    { "This is a test of the Boyer Moore algorithm", "algorithm" },
    { "CooBooooBooooo",                              "Booooo" },
    { "aaaaaaabccabaaabbbbccc.",                     "aaabbbbccc" },
    { "CooooBooooo",                                 "Booo" },
    { "11111111",                                    "0" },
    { NULL,                                          NULL }
};

int main(void)
{
    for (int itp(0); target_pat[itp].target != NULL; itp++) {
        cout << endl << "Target-Pattern[" << itp << "] :" << endl;
        matcher_bm matcher(target_pat[itp].pattern);
        matcher.init();
        const char *p;
        if (NULL != (p = matcher.scan(target_pat[itp].target))) {
            cout << "Match @ " << (int)(p - target_pat[itp].target)
                << " Zero-Based" << endl;
        }
    }
    return 0;
}

=============================================================
如果以上对这个算法的理解没有错的话，那么从下面的例子可以看出
这个算法在某些情况下效果非常不好，记得大学的数据结构上介绍过
快速匹配，当时没有仔细研究，依稀记得好像是根据已经匹配过的记
录来决定步进量，不需要再匹配已经匹配的部分。按这样的原理就需
要初始化的时候用pattern匹配一次pattern，然后记录某种结果。

结合到BM算法中应该就可以加强BM的匹配算法，在BM步进为1的时候
如果快速匹配的步进大于1就选择后者。快速匹配好像就是用来处理
这种重复比较多的情况。

按BM的算法有时候步进会小于1，这个时候把步进量设置成1
=============================================================

010000000000
110 :1=1
>> (sf[0] - 2) = 1
110
>> (sf[0] - 1) = 2
110

000000000000
100 :1=2,0=1
>> (sf[0] - 2) = -1

00100000
1010 :1=1, 0=2
>> (sf[0] - 3) = -1

00110000
1010 :1=1, 0=2
>> (sf[1] - 0) = 1