/*
file:Aho-Corasick automaton.c
brief:该算法在1975年产生于贝尔实验室,是著名的多模匹配算法之一,
一个常见的使用场景就是给出n个单词,再给出一段包含m个字符的文章,让你找出有多少个单词在文章里出现过。
这个Aho就是Alfred V.Aho,龙书的第一作者。。。。
author:yejing
date:2014.08.26
ver:1(create the file 8/26)
test pc:ubuntu 12.14
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* Number of child slots per trie node: one for each letter 'a'..'z'. */
#define ALPHABET_NUM 26

/*
 * One state of the pattern trie / Aho-Corasick automaton.
 *
 * The links are declared with the struct's own tag (NODE) so that they can be
 * dereferenced and assigned from node_t pointers; the previous `struct node`
 * spelling named a different, incomplete type.
 */
typedef struct NODE{
    struct NODE* fail;                /* failure link of this state */
    struct NODE* next;                /* link to the following node while the node sits in a queue */
    struct NODE* child[ALPHABET_NUM]; /* child[c - 'a'] is the state reached on letter c, or NULL */
    int words_num;                    /* count attached to this state (patterns ending here) */
}node_t;
/*
 * FIFO queue of trie nodes.
 *
 * The queue is intrusive: queued nodes are chained through node_t.next, so no
 * per-element allocation is needed. The queue never owns the nodes it holds.
 * Invariant: the queue is empty exactly when front == NULL (rear is then NULL
 * as well); otherwise rear is the last node and rear->next == NULL.
 */
typedef struct {
    node_t* front; /* oldest queued node, NULL when the queue is empty */
    node_t* rear;  /* newest queued node, NULL when the queue is empty */
}queue_t;

/* Evaluates to 1 when the queue is NULL or holds no node, 0 otherwise. */
#define is_queue_empty(queue) (!(queue) || (queue)->front == NULL)

/*
 * Allocate an empty queue.
 * Returns NULL when the allocation fails; release with des_queue().
 */
static queue_t* create_queue(void){
    queue_t* queue = (queue_t*)malloc(sizeof(queue_t));
    if(!queue)
        return NULL;
    queue->front = NULL;
    queue->rear = NULL;
    return queue;
}

/*
 * Destroy the queue object itself.
 *
 * Nodes still queued are only unlinked, never freed: they are trie nodes and
 * are released by des_trie(), so freeing them here would be a double free.
 */
static void des_queue(queue_t* queue){
    if(!queue)
        return;
    while(queue->front){
        node_t* tmp = queue->front;
        queue->front = tmp->next;
        tmp->next = NULL;
    }
    free(queue);
}

/*
 * Remove and return the oldest node, or NULL when the queue is NULL or empty.
 * The returned node's next link is reset to NULL.
 */
static node_t* deque(queue_t* queue){
    node_t* tmp;
    if(!queue || !queue->front)
        return NULL;
    tmp = queue->front;
    queue->front = tmp->next;
    if(!queue->front)
        queue->rear = NULL; /* last element removed: keep the invariant */
    tmp->next = NULL;
    return tmp;
}

/*
 * Append node at the tail. A NULL queue or NULL node is ignored.
 * A node must not be queued twice at the same time, since it carries the link.
 */
static void enque(queue_t* queue, node_t* node){
    if(!queue || !node)
        return;
    node->next = NULL;
    if(queue->rear)
        queue->rear->next = node;
    else
        queue->front = node; /* first element of an empty queue */
    queue->rear = node;
}
/*
 * Compute the failure link (node_t.fail) of every node of the trie rooted at
 * trie_root, visiting the nodes breadth-first.
 *
 * The root gets fail == NULL. Every other node gets the deepest node that is
 * reachable by following its parent's failure chain and then the same letter,
 * or the root when no such node exists; this also makes every first-layer
 * node fail to the root.
 *
 * Does nothing when trie_root is NULL or the work queue cannot be allocated.
 */
void make_fail_by_bfs(node_t* trie_root){
    int i;
    node_t* cur;
    queue_t* queue;

    if(!trie_root)
        return;
    queue = create_queue();
    if(!queue)
        return;

    trie_root->fail = NULL;
    enque(queue, trie_root);
    while(!is_queue_empty(queue)){
        cur = deque(queue);
        for(i = 0; i < ALPHABET_NUM; ++i){
            node_t* kid = cur->child[i];
            node_t* via;
            if(!kid)
                continue; /* no edge for this letter */

            /* Climb the parent's failure chain until some state has an edge
               for letter i; the chain ends at the root, whose fail is NULL. */
            via = cur->fail;
            while(via && !via->child[i])
                via = via->fail;
            kid->fail = via ? via->child[i] : trie_root;

            /* Parents are processed before children, so kid's own chain is
               complete by the time kid is dequeued. */
            enque(queue, kid);
        }
    }
    des_queue(queue);
}
/*
 * Reset a node to the all-zero state: every byte of *p_node is cleared, which
 * clears the links, the child table and the word counter in one go.
 * p_node must point to a valid node_t.
 */
static void init_single_node(node_t *p_node){
    memset(p_node, 0, sizeof *p_node);
}
/*
 * Insert the pattern buf[0..len-1] into the trie rooted at trie_root and
 * increment words_num on the node where the pattern ends.
 *
 * Only the letters 'a'..'z' have a child slot. A pattern that contains any
 * other character, an empty pattern, or NULL arguments are ignored and leave
 * the trie untouched. If a node allocation fails midway, the nodes created so
 * far stay in the trie (without a word count) and are released by des_trie().
 */
static void trie_insert(char* buf, int len, node_t* trie_root){
    int i;
    int index;
    node_t* tmp = trie_root;

    if(!buf || !trie_root || len <= 0)
        return;

    /* Validate first so that a bad pattern does not leave a partial path. */
    for(i = 0; i < len; ++i){
        index = buf[i] - 'a';
        if(index < 0 || index >= ALPHABET_NUM)
            return;
    }

    for(i = 0; i < len; ++i){
        index = buf[i] - 'a';
        if(!tmp->child[index]){
            node_t* fresh = (node_t *)malloc(sizeof(node_t));
            if(!fresh)
                return;
            init_single_node(fresh);
            tmp->child[index] = fresh;
        }
        tmp = tmp->child[index];
    }
    /* The count belongs to the terminal node of the pattern, not to the root. */
    tmp->words_num++;
}
/*
 * Free the whole subtree rooted at trie_root, children before their parent.
 * A NULL argument is accepted and does nothing, which is also how empty child
 * slots are handled during the recursion.
 */
static void des_trie(node_t* trie_root){
    int slot;

    if(trie_root == NULL)
        return;
    for(slot = 0; slot != ALPHABET_NUM; slot++)
        des_trie(trie_root->child[slot]);
    free(trie_root);
}
/*
 * Interactively build the pattern trie from standard input.
 *
 * Reads the number of patterns, then that many whitespace-delimited patterns,
 * inserting each one with trie_insert(). A pattern is limited to 255
 * characters; longer input is split at that boundary by scanf.
 *
 * Returns the root of the new trie (possibly with no patterns if input ends
 * early or the count is not positive), or NULL when the count cannot be read
 * or the root cannot be allocated. The caller releases it with des_trie().
 */
static node_t* build_trie(void){
    enum { PATTERN_BUF_SIZE = 256 }; /* must stay in sync with "%255s" below */
    int pattern_num = 0;
    char tmp[PATTERN_BUF_SIZE];
    node_t* trie_root;

    memset(tmp, 0, sizeof tmp);
    printf("please input the total pattern number \n");
    if(scanf("%d", &pattern_num) != 1)
        return NULL;
    getchar(); /* consume the character following the number (the newline) */

    trie_root = (node_t *)malloc(sizeof(node_t));
    if(!trie_root)
        return NULL;
    init_single_node(trie_root);

    /* "> 0" keeps a negative count from looping (almost) forever. */
    while(pattern_num-- > 0){
        printf("please input a pattern \n");
        if(scanf("%255s", tmp) != 1)
            break; /* end of input or read error */
        getchar();
        trie_insert(tmp, (int)strlen(tmp), trie_root);
    }
    return trie_root;
}
/*
 * Scan buf[0..len-1] with the automaton rooted at trie_root and return how
 * many inserted patterns occur in it.
 *
 * The failure links must already have been built (make_fail_by_bfs). For each
 * text position the automaton moves to the deepest matching state, then walks
 * that state's failure chain and adds every words_num found on it. Each count
 * is reset to 0 once added, so a pattern is counted once no matter how often
 * it occurs; as a consequence the trie is consumed by the scan and a second
 * scan will not report the same patterns again.
 *
 * Characters outside 'a'..'z' have no transition: they reset the automaton to
 * the root. Returns 0 when buf or trie_root is NULL.
 */
static int aho_corasick_main(char* buf, int len, node_t* trie_root){
    int i;
    int match_num = 0;
    node_t* state = trie_root;

    if(!buf || !trie_root)
        return 0;

    for(i = 0; i < len; ++i){
        int idx = buf[i] - 'a';
        node_t* out;

        if(idx < 0 || idx >= ALPHABET_NUM){
            state = trie_root;
            continue;
        }

        /* Fall back along failure links until a transition exists. A missing
           link (links not built) is treated as a link to the root. */
        while(state != trie_root && !state->child[idx])
            state = state->fail ? state->fail : trie_root;
        if(state->child[idx])
            state = state->child[idx];

        /* Every state on the failure chain is a suffix of the current match,
           so the whole chain must be visited, not just its counted prefix. */
        for(out = state; out && out != trie_root; out = out->fail){
            match_num += out->words_num;
            out->words_num = 0;
        }
    }
    return match_num;
}

/*
 * Program entry: read the patterns, build the automaton, read one line of
 * text from standard input and print how many patterns occur in it.
 * Returns EXIT_SUCCESS when a text line was processed, EXIT_FAILURE otherwise.
 */
int main(int argc, char* argv[])
{
    enum { TEXT_BUF_SIZE = 9527 };
    int status = EXIT_FAILURE;
    node_t* trie_node;
    char *tmp_buf = (char *)malloc(sizeof(char) * TEXT_BUF_SIZE);

    (void)argc;
    (void)argv;
    if(!tmp_buf)
        return EXIT_FAILURE;

    trie_node = build_trie();
    if(!trie_node){
        free(tmp_buf);
        return EXIT_FAILURE;
    }
    make_fail_by_bfs(trie_node);

    printf("please input the text \n");
    if(fgets(tmp_buf, TEXT_BUF_SIZE, stdin)){
        int found;
        tmp_buf[strcspn(tmp_buf, "\n")] = '\0'; /* drop the trailing newline */
        found = aho_corasick_main(tmp_buf, (int)strlen(tmp_buf), trie_node);
        printf("%d pattern(s) found \n", found);
        status = EXIT_SUCCESS;
    }

    des_trie(trie_node);
    free(tmp_buf);
    return status;
}
/*
 * Web-page residue kept as a comment so that the file remains valid C:
 * 字符串匹配之AC自动机
 * 最新推荐文章于 2024-06-04 08:36:20 发布
 */
