提取Python stringlib中的"BMHBNFS"字符串查找算法

最新推荐文章于 2022-01-20 20:30:00 发布

功名半纸

最新推荐文章于 2022-01-20 20:30:00 发布

阅读量2.5k

点赞数

CC 4.0 BY-SA版权

分类专栏：技术文摘文章标签： python character search 算法测试 cache

本文链接：https://blog.youkuaiyun.com/force_eagle/article/details/7193650

技术文摘专栏收录该内容

62 篇文章

订阅专栏

本文介绍Python中stringlib库实现的字符串查找算法，包括Boyer-Moore、Horspool等算法，并提供源代码示例及测试结果。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Python中的stringlib字符串查找算法是Boyer-Moore, Horspool, Sunday, Bloom Filter几种算法的合成体, 大概的原理如下:

def find(s, p):
    # find first occurrence of p in s
    n = len(s)
    m = len(p)
    skip = delta1(p)[p[m-1]]
    i = 0
    while i <= n-m:
        if s[i+m-1] == p[m-1]: # (boyer-moore)
            # potential match
            if s[i:i+m-1] == p[:m-1]:
                return i
            if s[i+m] not in p:
                i = i + m + 1 # (sunday)
            else:
                i = i + skip # (horspool)
        else:
            # skip
            if s[i+m] not in p:
                i = i + m + 1 # (sunday)
            else:
                i = i + 1
    return -1 # not found

以下是具体实现:

/* stringlib: fastsearch implementation */

#ifndef STRINGLIB_FASTSEARCH_H
#define STRINGLIB_FASTSEARCH_H

#include <stdint.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* fast search/count implementation, based on a mix between boyer-
   moore and horspool, with a few more bells and whistles on the top.
   for some more background, see: http://effbot.org/zone/stringlib.htm */

/* note: fastsearch may access s[n], which isn't a problem when using
   Python's ordinary string types, but may cause problems if you're
   using this code in other contexts.  also, the count mode returns -1
   if there cannot possible be a match in the target string, and 0 if
   it has actually checked for matches, but didn't find any.  callers
   beware! */

#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2

#ifndef LONG_BIT
#define LONG_BIT 32
#endif

#if LONG_BIT >= 128
#define STRINGLIB_BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define STRINGLIB_BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define STRINGLIB_BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define STRINGLIB_BLOOM_ADD(mask, ch) \
	((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
#define STRINGLIB_BLOOM(mask, ch)     \
	((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))

ssize_t fastsearch(const char *s, ssize_t n,
		   const char *p, ssize_t m,
		   ssize_t maxcount, int mode)
{
	unsigned long mask;
	ssize_t skip, count = 0;
	ssize_t i, j, mlast, w;

	w = n - m;

	if (w < 0 || (mode == FAST_COUNT && maxcount == 0)) {
		return -1;
	}

	/* look for special cases */
	if (m <= 1) {
		if (m <= 0) {
			return -1;
		}
		/* use special case for 1-character strings */
		if (mode == FAST_COUNT) {
			for (i = 0; i < n; i++)
				if (s[i] == p[0]) {
					count++;
					if (count == maxcount) {
						return maxcount;
					}
				}
			return count;
		}
		else if (mode == FAST_SEARCH) {
			for (i = 0; i < n; i++)
				if (s[i] == p[0]) {
					return i;
				}
		}
		else {      /* FAST_RSEARCH */
			for (i = n - 1; i > -1; i--)
				if (s[i] == p[0]) {
					return i;
				}
		}
		return -1;
	}

	mlast = m - 1;
	skip = mlast - 1;
	mask = 0;

	if (mode != FAST_RSEARCH) {

		/* create compressed boyer-moore delta 1 table */

		/* process pattern[:-1] */
		for (i = 0; i < mlast; i++) {
			STRINGLIB_BLOOM_ADD(mask, p[i]);
			if (p[i] == p[mlast]) {
				skip = mlast - i - 1;
			}
		}
		/* process pattern[-1] outside the loop */
		STRINGLIB_BLOOM_ADD(mask, p[mlast]);

		for (i = 0; i <= w; i++) {
			/* note: using mlast in the skip path slows things down on x86 */
			if (s[i + m - 1] == p[m - 1]) {
				/* candidate match */
				for (j = 0; j < mlast; j++)
					if (s[i + j] != p[j]) {
						break;
					}
				if (j == mlast) {
					/* got a match! */
					if (mode != FAST_COUNT) {
						return i;
					}
					count++;
					if (count == maxcount) {
						return maxcount;
					}
					i = i + mlast;
					continue;
				}
				/* miss: check if next character is part of pattern */
				if (!STRINGLIB_BLOOM(mask, s[i + m])) {
					i = i + m;
				}
				else {
					i = i + skip;
				}
			}
			else {
				/* skip: check if next character is part of pattern */
				if (!STRINGLIB_BLOOM(mask, s[i + m])) {
					i = i + m;
				}
			}
		}
	}
	else {      /* FAST_RSEARCH */

		/* create compressed boyer-moore delta 1 table */

		/* process pattern[0] outside the loop */
		STRINGLIB_BLOOM_ADD(mask, p[0]);
		/* process pattern[:0:-1] */
		for (i = mlast; i > 0; i--) {
			STRINGLIB_BLOOM_ADD(mask, p[i]);
			if (p[i] == p[0]) {
				skip = i - 1;
			}
		}

		for (i = w; i >= 0; i--) {
			if (s[i] == p[0]) {
				/* candidate match */
				for (j = mlast; j > 0; j--)
					if (s[i + j] != p[j]) {
						break;
					}
				if (j == 0)
					/* got a match! */
				{
					return i;
				}
				/* miss: check if previous character is part of pattern */
				if (!STRINGLIB_BLOOM(mask, s[i - 1])) {
					i = i - m;
				}
				else {
					i = i - skip;
				}
			}
			else {
				/* skip: check if previous character is part of pattern */
				if (!STRINGLIB_BLOOM(mask, s[i - 1])) {
					i = i - m;
				}
			}
		}
	}

	if (mode != FAST_COUNT) {
		return -1;
	}
	return count;
}

#endif

测试代码

#include <arch/cycle.h>

int main(int argc, char **argv)
{
	char *str = "GET / HTTP 1.0\r\nHost: www.xxx.com\r\nCache: \r\nCache:\r\n Length:\r\n";
	ssize_t rc = 0;
	uint64_t start, end;

	start = get_cycle_count();
	rc = fastsearch(str, strlen(str), "Cache:", 6, 2, FAST_SEARCH);
	end = get_cycle_count();
	printf("fastsearch return %u cost %llu \n", rc, end - start);
	printf("result = %s\n", str + rc);
	rc = fastsearch(str, strlen(str), "Cache:", 6, -1, FAST_COUNT);
	printf("result = %u\n", rc);
	return 0;
}

看stringlib测试数据, 还是蛮可以的.

我在tile平台上测试发现还没有snort中的BMH算法速度快.

不过这个只是单一测试, 没有考虑到cache的情况, 仅供参考.

原文参考: