字符串匹配算法之总结

问题提出:

给定字符串 text 和 pattern,判断 pattern 是否为 text 的子串;若是,返回首次匹配的位置,否则返回 -1。


问题解决:

       1.暴力搜索:从 text 的每个起始位置出发,逐字符与 pattern 比较;

       2.KMP 算法:先求 pattern 的部分匹配表(前缀函数,即每个前缀的最长相等真前后缀长度),失配时据此确定有效偏移,避免重复比较,加快搜索;

       3.Boyer-Moore算法:详述请参见维基百科;

       4.Karp-Rabin 算法:详述请参见维基百科;


实现代码:

#ifndef  _PATTERN_SEARCH_H_
#define  _PATTERN_SEARCH_H_

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/*
* Brute-force substring search.
*
* Tries every possible start offset in text and compares pattern against it
* character by character.
*
* text    - NUL-terminated string to search in; must not be NULL
* pattern - NUL-terminated string to search for; must not be NULL
*
* Returns the zero-based index of the first occurrence of pattern in text,
* or -1 when pattern does not occur. An empty pattern matches at index 0.
*/
int PatternMatchBrute( const char* text, const char* pattern )
{
	assert( text );
	assert( pattern );

	size_t textLen = strlen( text );
	size_t patternLen = strlen( pattern );

	// size_t is unsigned: without this guard textLen - patternLen would wrap
	// around to a huge value and the scan would read past the end of text.
	if( patternLen > textLen )
	{
		return -1;
	}

	size_t lastStart = textLen - patternLen;
	for( size_t i = 0; i <= lastStart; i++ )
	{
		// count how many leading pattern characters agree at offset i
		size_t j = 0;
		while( j < patternLen && text[i + j] == pattern[j] )
		{
			j++;
		}

		if( j == patternLen )
		{
			return (int)i;
		}
	}

	return -1;
}


/*
* Helper for the KMP search: builds the failure table (prefix function) of pattern.
*
* pattern - the pattern characters; the first len characters are read
* len     - number of characters of pattern to process
*
* Returns a new[]-allocated array of len + 1 ints. For i < len, entry i is the
* length of the longest proper prefix of pattern[0..i] that is also a suffix
* of it; entry len is always 0. The caller owns the array and must release it
* with delete [].
*/
int* CalcSuffix( const char* pattern, size_t len )
{
	// value-initialisation zero-fills all len + 1 entries
	int* suffix = new int[ len + 1 ]();

	size_t j = 0;
	for( size_t i = 1; i < len; i++ )
	{
		// fall back to the next shorter border until it can be extended
		while( j > 0 && pattern[i] != pattern[j] )
		{
			j = (size_t)suffix[j - 1];
		}

		if( pattern[i] == pattern[j] )
		{
			suffix[i] = (int)( ++j );
		}
	}

	return suffix;
}

/*
* KMP substring search.
*
* Scans text once from left to right while tracking how many leading pattern
* characters currently match. On a mismatch the failure table from CalcSuffix
* tells how much of the match can be kept, so no text character is re-read
* and no candidate position is skipped.
*
* text    - NUL-terminated string to search in; must not be NULL
* pattern - NUL-terminated string to search for; must not be NULL
*
* Returns the zero-based index of the first occurrence of pattern in text,
* or -1 when pattern does not occur. An empty pattern matches at index 0.
*/
int PatternMatchKMP( const char* text, const char* pattern )
{
	assert( text );
	assert( pattern );

	size_t textLen = strlen( text );
	size_t patternLen = strlen( pattern );

	if( patternLen == 0 )
	{
		return 0;
	}

	// a longer pattern can never match; also avoids unsigned wrap-around
	if( patternLen > textLen )
	{
		return -1;
	}

	int* suffix = CalcSuffix( pattern, patternLen );
	int result = -1;

	size_t matched = 0;   // number of pattern characters matched so far
	for( size_t i = 0; i < textLen; i++ )
	{
		// shrink the current partial match until text[i] can extend it
		while( matched > 0 && text[i] != pattern[matched] )
		{
			matched = (size_t)suffix[matched - 1];
		}

		if( text[i] == pattern[matched] )
		{
			matched++;
		}

		if( matched == patternLen )
		{
			// the match ends at i, so it starts patternLen - 1 characters earlier
			result = (int)( i + 1 - patternLen );
			break;
		}
	}

	delete [] suffix;
	return result;
}

/*
* Boyer-Moore substring search using the bad-character rule only.
*
* The pattern is compared right to left at each alignment. On a mismatch the
* alignment is shifted so that the rightmost occurrence of the offending text
* byte in pattern lines up with it (always by at least one position).
*
* text    - NUL-terminated string to search in; must not be NULL
* pattern - NUL-terminated string to search for; must not be NULL
*
* Returns the zero-based index of the first occurrence of pattern in text,
* or -1 when pattern does not occur. An empty pattern matches at index 0.
*/
int PatternMatchBME( const char* text, const char* pattern )
{
	assert( text );
	assert( pattern );

	// right[c] = index of the rightmost occurrence of byte c in pattern,
	// or -1 when c does not occur in pattern at all
	int right[256];
	for( int c = 0; c < 256; c++ )
	{
		right[c] = -1;
	}

	int len = (int)strlen( pattern );
	for( int i = 0; i < len; i++ )
	{
		// plain char may be signed; index by the unsigned byte value so that
		// bytes >= 0x80 do not produce a negative array index
		right[(unsigned char)pattern[i]] = i;
	}

	int strLen = (int)strlen( text );
	int skip = 0;
	for( int i = 0; i <= strLen - len; i += skip )
	{
		skip = 0;   // stays 0 only when every pattern character matched
		for( int j = len - 1; j >= 0; j-- )
		{
			if( text[i + j] != pattern[j] )
			{
				skip = j - right[(unsigned char)text[i + j]]; // key point
				if( skip < 1 )
				{
					// rightmost occurrence lies to the right of j: just step by one
					skip = 1;
				}

				break;
			}
		}

		if( 0 == skip )
		{
			return i;
		}
	}

	return -1;
}

/*
* Helper for the Rabin-Karp search: computes base^(len - 1) mod prime by
* repeated multiplication, reducing modulo prime after every step.
*
* len   - window length; the result is 1 whenever len <= 1
* base  - radix of the polynomial hash
* prime - modulus
*
* Returns the reduced power.
*/
int hashValue( int len, int base, int prime )
{
	int power = 1;
	int remaining = len - 1;

	while( remaining > 0 )
	{
		power = ( power * base ) % prime;
		--remaining;
	}

	return power;
}

/*
* Rabin-Karp substring search.
*
* Keeps a rolling polynomial hash (radix 256, modulo 101) of the current
* window of str and compares it with the hash of pat; the characters are
* compared only when the two hashes agree.
*
* str - NUL-terminated string to search in; must not be NULL
* pat - NUL-terminated string to search for; must not be NULL
*
* Returns the zero-based index of the first occurrence of pat in str,
* or -1 when pat does not occur. An empty pat matches at index 0.
*/
int PatternMatchKRB( const char* str, const char* pat )
{
	assert( str );
	assert( pat );

	int base = 256;
	int prime = 101;

	int len = (int)strlen( pat );
	int strLen = (int)strlen( str );

	// the initial window hash reads str[0..len-1], so a pattern longer than
	// the text must be rejected before touching str
	if( len > strLen )
	{
		return -1;
	}

	// h = base^(len-1) mod prime: the weight of the window's leading byte
	// (the same value hashValue( len, base, prime ) yields)
	int h = 1;
	for( int i = 0; i < len - 1; i++ )
	{
		h = ( h * base ) % prime;
	}

	// Bytes are hashed as unsigned char. With a signed plain char the initial
	// hashes could be negative while the rolling hash below is normalised to
	// be non-negative, so a genuine match could be missed.
	int hashStr = 0;
	int hashPat = 0;
	for( int i = 0; i < len; i++ )
	{
		hashStr = ( hashStr * base + (unsigned char)str[i] ) % prime;
		hashPat = ( hashPat * base + (unsigned char)pat[i] ) % prime;
	}

	for( int i = 0; i <= strLen - len; i++ )
	{
		if( hashStr == hashPat )
		{
			// equal hashes may be a collision: confirm character by character
			int j = 0;
			while( j < len && str[i + j] == pat[j] )
			{
				j++;
			}

			if( j == len )
			{
				return i;
			}
		}

		if( i < strLen - len )
		{
			// slide the window: drop str[i], append str[i + len]
			hashStr = ( base * ( hashStr - (unsigned char)str[i] * h ) + (unsigned char)str[i + len] ) % prime;
			if( hashStr < 0 )
			{
				hashStr += prime;
			}
		}
	}

	return -1;
}


/*
* Test interface
*
* Runs all four matchers on one fixed text/pattern pair and asserts that each
* of them reports the same position as the C library's strstr. Before
* returning it waits for one character on stdin (presumably to keep a console
* window open - confirm whether callers still need this).
*/
void TestPatternSearch()
{
	const char* text = "acccwdocccwwhowccsiowiowwwccwweioewchcccandccswwveaoiewddddiweoicccchacccwwsfchadchanddsoisndochandischurcccchandchinawitnessbrchandeakoutmiters";
	const char* pattern = "cccwws";

	// reference answer; strstr returns NULL when there is no match, which
	// corresponds to the matchers' -1
	const char* substr = strstr( text, pattern );
	int pos = substr ? (int)( substr - text ) : -1;

	int newPos = PatternMatchBrute( text, pattern );
	assert( newPos == pos );

	int kmpPos = PatternMatchKMP( text, pattern );
	assert( kmpPos == pos );

	int boyerPos = PatternMatchBME( text, pattern );
	assert( boyerPos == pos );

	int krbPos = PatternMatchKRB( text, pattern );
	assert( krbPos == pos );

	getchar();
}




#endif 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值