/*****************************************************
* Boyer Moore Pattern Matching Algorithm
* 20060427 by daineng@nj.cpsecure
*****************************************************/
这个算法的特点在于从pattern的最后一位开始向前比较,一旦不
符合,就把pattern向后移动,直到当前比较位置上的目标字符与
pattern中某个相同的字符对齐为止。
Boyer_Moore的算法说明有很多,但不是很明白,结合下面的
数据将会很容易理解这个算法。(由于字符位置比较重要,
下面的例子需要在等字符宽度的字体下显示才有效果)
抱歉我没有太多的时间说明这个问题。
-------------------------------------------------
1 2 3 4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
This is a test of the Boyer Moore algorithm
algorithm
87654321
>> (sf['a'] - 0) = 8
algorithm
>> (sf['f'] - 0) = 9
algorithm
>> (sf['e'] - 0) = 9
algorithm
>> (sf['a'] - 0) = 8
algorithm
-------------------------------------------------
1 2 3 4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
CooBooooBooooo
Booooo
51111
>> (sf['B'] - 2) = 3
Booooo
>> (sf['B'] - 0) = 5
Booooo
|
xxxxBooooxxxx
Boooo
|
-------------------------------------------------
1 2 3 4
1234567890123456789012345678901234567890123456789
-------------------------------------------------
aaaaaaabcc.....
aaabbbbccc
777333311
>> (sf['b'] - 2) = 1
aaabbbbccc
aaaaaaabccabaaabbbbccc....
>> (sf['a'] - 0) = 7
aaabbbbccc
>> (sf['b'] - 0) = 3
aaabbbbccc
>> (sf['b'] - 2) = 1
aaabbbbccc
CooooBooooo
Booo
>> (sf[C] - 3) = 1
Booo
>> (sf[o] - 3) = -2
=============================================================
相信到这里你已经明白这个算法了,下面给出一个只匹配字符串的
简单例子
=============================================================
//////////////////////////////////////////////////////////////
// matcher_bm.h
// Boyer Moore
// 20060427 by daineng@nj.cpsecure
#ifndef _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_
#define _MATCHER_BM_H_20060427_BY_DAINENG_AT_NJ_CPSECURE_
struct st_chpat;
/**
 * Single-pattern string matcher using a Boyer-Moore style shift table.
 *
 * Usage (as in the demo program): construct with the pattern, call init()
 * to build the shift table, then call scan() on each target string.
 *
 * The object owns its internal state through a raw pointer that the
 * destructor deletes, so it must not be copied.
 */
class matcher_bm
{
public:
    // Takes the NUL-terminated pattern to search for.
    matcher_bm(const char*);
    ~matcher_bm();
public:
    // (Re)builds the character shift table from the pattern.
    void init();
    // Searches the given NUL-terminated target; returns a pointer to the
    // first match inside the target, or NULL when there is none.
    const char* scan(const char*);
private:
    // Not copyable: a copy would share m_chpat and both destructors would
    // delete it. Declared private and intentionally left undefined.
    matcher_bm(const matcher_bm&);
    matcher_bm& operator=(const matcher_bm&);
private:
    struct st_chpat *m_chpat;
};
#endif
//////////////////////////////////////////////////////////////
// matcher_bm.cpp
// Boyer Moore
// 20060427 by daineng@nj.cpsecure
#include <iostream>
#include <map>
#include <string.h>
#include "matcher_bm.h"
using namespace std;
typedef map<char, int>::iterator sf_itor;
/**
 * Shift table: maps a character to a shift distance.
 *
 * Lookups through operator[] never insert; a character without an entry
 * yields the default value stored in `dvaule`.
 */
class chpat_sf : public std::map<char, int> {
public:
    // Give the default shift a defined value so that a lookup performed
    // before anyone assigns dvaule does not read an indeterminate int.
    chpat_sf() : dvaule(0) {}

    // Read-only lookup (hides map::operator[]): returns the stored shift
    // for `key`, or dvaule when `key` has no entry.
    int operator[](std::map<char, int>::key_type key) const {
        const std::map<char, int>::const_iterator itor = find(key);
        return (itor == end()) ? dvaule : itor->second;
    }

    // Default shift for characters without an entry. The spelling is kept
    // as-is because other code assigns to this member by name.
    int dvaule;
};
// Internal state of matcher_bm, reached through matcher_bm::m_chpat.
struct st_chpat {
// Pattern text. The constructor stores the caller's pointer without
// copying the characters.
const char *_pat;
// Length of the pattern, computed with strlen() in the constructor.
int pat_len;
// Shift table; allocated in the constructor, filled by init(), and
// deleted in the destructor.
chpat_sf *_shift;
};
/**
 * Creates a matcher for `_pattern`.
 *
 * Only the pointer is stored, not a copy of the characters, so the pattern
 * string must outlive the matcher. A NULL pattern is treated as the empty
 * pattern. The shift table is built right away, so scan() is usable
 * immediately; calling init() again afterwards is harmless.
 *
 * @param _pattern NUL-terminated pattern to search for (may be NULL).
 */
matcher_bm::matcher_bm(const char *_pattern) {
    // strlen(NULL) is undefined behaviour; fall back to the empty pattern.
    const char *pattern = (_pattern != NULL) ? _pattern : "";

    m_chpat = new struct st_chpat;
    m_chpat->_pat = pattern;
    m_chpat->pat_len = static_cast<int>(strlen(pattern));
    m_chpat->_shift = NULL;
    try {
        m_chpat->_shift = new chpat_sf;
        init();
    } catch (...) {
        // The destructor does not run for a half-built object, so release
        // whatever was allocated before propagating the failure.
        delete m_chpat->_shift;
        delete m_chpat;
        throw;
    }
}
// Frees the shift table first, then the state object that points to it.
matcher_bm::~matcher_bm() {
    st_chpat *const state = m_chpat;
    chpat_sf *const table = state->_shift;
    delete table;
    delete state;
}
typedef pair <char, int> sf_pair;
void matcher_bm::init()
{
m_chpat->_shift->dvaule = m_chpat->pat_len;
m_chpat->_shift->clear();
int size = m_chpat->pat_len - 1;
pair<sf_itor, bool> pr;
for (int iLoc(0); iLoc < size; iLoc++) {
int shift = size - iLoc;
pr = m_chpat->_shift->insert(sf_pair(*(m_chpat->_pat + iLoc), shift));
if (!pr.second)
pr.first->second = shift;
}
}
/**
 * Searches `_target` for the pattern, printing a trace of every alignment
 * that is tried to std::cout.
 *
 * The pattern is compared right-to-left against the current window. On a
 * mismatch the window advances by (shift of the mismatching target
 * character) - (number of characters already matched), but by at least 1.
 *
 * Positions are tracked as indices rather than pointers, so no pointer is
 * ever formed outside the target or pattern arrays (which would happen for
 * a target shorter than the pattern, for an empty pattern, or after a large
 * final shift).
 *
 * @param _target NUL-terminated text to search (NULL yields NULL).
 * @return pointer to the first match inside `_target`, or NULL if none.
 *         An empty pattern matches at the start of the target.
 */
const char* matcher_bm::scan(const char *_target) {
    if (_target == NULL)
        return NULL;

    const char *const pat = m_chpat->_pat;
    const size_t pat_len = static_cast<size_t>(m_chpat->pat_len);
    const size_t target_len = strlen(_target);

    std::cout << " 01234567890123456789012345678901234567890123456789" << std::endl;
    std::cout << "target : " << _target << std::endl;
    std::cout << "pattern : " << pat << std::endl;

    // `start` is the index in the target where the current window begins.
    size_t start = 0;
    while (start + pat_len <= target_len) {
        std::cout << "match : " << (_target + start) << std::endl;

        // Count how many characters agree, comparing from the window tail.
        size_t matched = 0;
        while (matched < pat_len &&
               pat[pat_len - 1 - matched] == _target[start + pat_len - 1 - matched]) {
            ++matched;
        }
        if (matched == pat_len)
            return _target + start;

        // Shift according to the target character that caused the mismatch.
        const char bad = _target[start + pat_len - 1 - matched];
        int step = m_chpat->_shift->operator[](bad) - static_cast<int>(matched);
        if (step < 1)
            step = 1;  // the table can suggest a backward move; always advance
        //std::cout << ">> (sf['" << bad << "'] - "
        //          << matched << ") = " << step << std::endl;
        start += static_cast<size_t>(step);
    }
    return NULL;
}
//////////////////////////////////////////////////////////////
// demo.cpp 这是一个演示的程序
// Boyer Moore
// 20060427 by daineng
#include <iostream>
#include "matcher_bm.h"
using namespace std;
// One demo case: a text to search and the pattern to look for in it.
struct st_target_pat {
    const char *target;
    const char *pattern;
};

// Demo cases walked by main(); the {NULL, NULL} entry marks the end.
struct st_target_pat target_pat[] = {
    { "This is a test of the Boyer Moore algorithm", "algorithm" },
    { "CooBooooBooooo",                              "Booooo" },
    { "aaaaaaabccabaaabbbbccc.",                     "aaabbbbccc" },
    { "CooooBooooo",                                 "Booo" },
    { "11111111",                                    "0" },
    { NULL,                                          NULL }
};
int main(void)
{
for (int itp(0); target_pat[itp].target != NULL; itp++) {
cout << endl << "Target-Pattern[" << itp << "] :" << endl;
matcher_bm matcher(target_pat[itp].pattern);
matcher.init();
const char *p;
if (NULL != (p = matcher.scan(target_pat[itp].target))) {
cout << "Match @ " << (int)(p - target_pat[itp].target)
<< " Zero-Based" << endl;
}
}
return 0;
}
=============================================================
如果以上对这个算法的理解没有错的话,那么从下面的例子可以看出
这个算法在某些情况下效果非常不好。记得大学的数据结构课上介绍过
快速匹配,当时没有仔细研究,依稀记得好像是根据已经匹配过的记
录来决定步进量,不需要再匹配已经匹配的部分。按这样的原理就需
要在初始化的时候用pattern匹配一次pattern,然后记录某种结果。
结合到BM算法中应该就可以加强BM的匹配算法:在BM步进为1的时候,
如果快速匹配的步进大于1就选择后者。快速匹配好像就是用来处理
这种重复比较多的情况。
按BM的算法有时候算出的步进会小于1,这个时候把步进量设置成1。
=============================================================
010000000000
110 :1=1
>> (sf[0] - 2) = 1
110
>> (sf[0] - 1) = 2
110
000000000000
100 :1=2,0=1
>> (sf[0] - 2) = -1
00100000
1010 :1=1, 0=2
>> (sf[0] - 3) = -1
00110000
1010 :1=1, 0=2
>> (sf[1] - 0) = 1