AC自动机(Aho-Corasick Automaton)是一种革命性的多模式串匹配算法,它将字符串匹配的效率提升到了新的高度。本文将从理论基础到实践实现,全面深入地解析AC自动机。
1. 算法背景与历史
1.1 问题起源
在1975年,Alfred V. Aho和Margaret J. Corasick为了解决贝尔实验室在文本处理中遇到的多模式匹配效率问题,提出了这一算法。当时,传统的逐个模式串匹配方法在处理大量模式串时效率极低。
1.2 算法演进
AC自动机是KMP算法的多模式扩展:
- KMP算法:单模式串的线性时间匹配
- AC自动机:多模式串的线性时间匹配
- 核心突破:通过共享前缀结构和失败指针机制,避免了重复计算
2. 理论基础
2.1 Trie树结构详解
Trie树(前缀树)是AC自动机的基础数据结构,具有以下特性:
// One state of the pattern trie that the Aho-Corasick automaton is built on.
struct TrieNode {
    // Outgoing edges, keyed by the next character.
    std::unordered_map<char, TrieNode*> children;
    // Failure link: the state for the longest proper suffix of this node's
    // string that is also a prefix of some pattern. Null until links are built.
    TrieNode* fail;
    // Indices of the patterns reported when the automaton reaches this node.
    std::vector<int> output;
    // Index of the pattern that ends exactly here, or -1 when none does.
    int patternIndex;
    // Number of edges between the root and this node.
    int depth;
    // True when some pattern ends exactly at this node.
    bool isEnd;
    // Statistics slot intended to count matches ending at this node.
    // NOTE(review): nothing in the visible code updates it - confirm usage.
    int matchCount;

    // Creates a detached, non-terminal node with no edges.
    TrieNode()
        : children{},
          fail{nullptr},
          output{},
          patternIndex{-1},
          depth{0},
          isEnd{false},
          matchCount{0} {}
};
Trie树的构建过程:
- 从根节点开始
- 对每个模式串的每个字符,创建或找到对应的子节点
- 标记模式串结束位置
- 构建完整的前缀共享结构
2.2 失败指针的数学原理
失败指针的本质是最长真后缀匹配。对于节点N代表的字符串S,其失败指针指向的节点代表S的最长真后缀,且该后缀必须是某个模式串的前缀。
形式化定义:
设节点u对应字符串P,其失败指针指向节点v对应字符串Q,则:
- Q是P的真后缀(即Q是P的后缀且Q ≠ P;Q可以是空串,此时v为根节点)
- Q是某个模式串的前缀
- |Q|最大
2.3 输出指针的传递性
输出指针具有传递性:如果节点u的失败指针指向v,且v是某个模式串的结束节点,则u也能匹配该模式串。
输出集合的构建:
Output(u) = End(u) ∪ Output(fail(u)),其中当u是某个模式串的结束节点时End(u) = {patternIndex(u)},否则End(u) = ∅;递归的边界为Output(root) = ∅
3. 算法详细步骤
3.1 Trie树构建算法
void ACAutomaton::insert(const std::string& pattern, int index) {
TrieNode* current = root;
for (int i = 0; i < pattern.length(); i++) {
char c = pattern[i];
// 如果不存在对应子节点,则创建
if (current->children.find(c) == current->children.end()) {
current->children[c] = new TrieNode();
current->children[c]->depth = current->depth + 1;
}
current = current->children[c];
}
// 标记模式串结束
current->isEnd = true;
current->patternIndex = index;
current->output.push_back(index);
}
构建过程示例:
模式串:{"he", "she", "his", "hers"}
构建过程:
1. 插入"he":root -> 'h' -> 'e'(end)
2. 插入"she":root -> 's' -> 'h' -> 'e'(end)
3. 插入"his":root -> 'h' -> 'i' -> 's'(end)
4. 插入"hers":root -> 'h' -> 'e' -> 'r' -> 's'(end)
3.2 失败指针构建算法(BFS)
void ACAutomaton::buildFailurePointers() {
std::queue<TrieNode*> q;
// 初始化根节点
root->fail = root;
// 将根节点的所有子节点入队
for (auto& child : root->children) {
child.second->fail = root;
q.push(child.second);
}
// BFS遍历
while (!q.empty()) {
TrieNode* current = q.front();
q.pop();
// 处理当前节点的所有子节点
for (auto& child : current->children) {
char c = child.first;
TrieNode* childNode = child.second;
// 寻找失败指针的目标节点
TrieNode* failNode = current->fail;
// 沿着失败指针链向上查找
while (failNode != root &&
failNode->children.find(c) == failNode->children.end()) {
failNode = failNode->fail;
}
// 找到合适的失败指针目标
if (failNode->children.find(c) != failNode->children.end() &&
failNode->children[c] != childNode) {
childNode->fail = failNode->children[c];
} else {
childNode->fail = root;
}
// 继承输出
if (childNode->fail->isEnd) {
childNode->output.push_back(childNode->fail->patternIndex);
}
// 将子节点加入队列
q.push(childNode);
}
}
}
失败指针构建示例:
节点'h':fail -> root
节点's':fail -> root
节点'e'(he的e):fail -> root
节点'h'(she的h):fail -> 'h'(he的h)
节点'e'(she的e):fail -> 'e'(he的e)
节点'i':fail -> root
节点's'(his的s):fail -> 's'(she的s)("his"的真后缀"s"是模式串"she"的前缀)
节点'r':fail -> root
节点's'(hers的s):fail -> 's'(she的s)("hers"的真后缀中只有"s"是某个模式串的前缀)
3.3 搜索匹配算法
std::vector<std::pair<int, int>> ACAutomaton::search(const std::string& text) {
std::vector<std::pair<int, int>> matches;
TrieNode* current = root;
for (int i = 0; i < text.length(); i++) {
char c = text[i];
// 状态转移:沿着失败指针链找到合适的节点
while (current != root &&
current->children.find(c) == current->children.end()) {
current = current->fail;
}
// 转移到下一个状态
if (current->children.find(c) != current->children.end()) {
current = current->children[c];
}
// 输出所有匹配结果
if (current->isEnd) {
int startPos = i - patterns[current->patternIndex].length() + 1;
matches.push_back({startPos, current->patternIndex});
}
// 输出通过失败指针继承的匹配
TrieNode* temp = current->fail;
while (temp != root && temp->isEnd) {
int startPos = i - patterns[temp->patternIndex].length() + 1;
matches.push_back({startPos, temp->patternIndex});
temp = temp->fail;
}
}
return matches;
}
4. 高级优化技术
4.1 字符集优化
对于固定且较小的字符集(例如下面假设的26个小写字母;完整ASCII则需要128个槽位),可以使用数组代替哈希表:
// Sketch of an automaton whose nodes index their children through a
// fixed-size array instead of a hash map.
class OptimizedACAutomaton {
private:
    // Number of child slots per node; the layout assumes lowercase letters only.
    static const int ALPHABET_SIZE = 26;

    struct OptimizedNode {
        // One slot per letter; a null slot means there is no such edge.
        OptimizedNode* children[ALPHABET_SIZE];
        // Failure link, null until assigned.
        OptimizedNode* fail;
        // Pattern indices associated with this node.
        std::vector<int> output;
        // Index of the pattern ending here, or -1 when none does.
        int patternIndex;

        // Value-initialising the array sets every child slot to nullptr.
        OptimizedNode()
            : children{}, fail{nullptr}, output{}, patternIndex{-1} {}
    };
};
4.2 预计算转移函数
// 预计算每个状态在所有字符下的转移目标
void precomputeTransitions() {
std::queue<TrieNode*> q;
q.push(root);
while (!q.empty()) {
TrieNode* current = q.front();
q.pop();
for (char c = 'a'; c <= 'z'; c++) {
TrieNode* next = getTransition(current, c);
transitionTable[current][c] = next;
}
for (auto& child : current->children) {
q.push(child.second);
}
}
}
// Resolves the state reached from `node` on character `c` by following
// failure links. This is the uncached computation that precomputeTransitions
// stores in transitionTable.
// Returns the first child on `c` found along the failure chain, or the root
// when even the root has no such edge.
TrieNode* getTransition(TrieNode* node, char c) {
    for (TrieNode* state = node;; state = state->fail) {
        const auto edge = state->children.find(c);
        if (edge != state->children.end()) {
            return edge->second;
        }
        if (state == root) {
            return root;
        }
    }
}
4.3 内存池优化
class MemoryPool {
private:
std::vector<TrieNode*> pool;
std::queue<TrieNode*> freeList;
public:
TrieNode* allocate() {
if (freeList.empty()) {
pool.push_back(new TrieNode());
return pool.back();
}
TrieNode* node = freeList.front();
freeList.pop();
return node;
}
void deallocate(TrieNode* node) {
freeList.push(node);
}
~MemoryPool() {
for (auto node : pool) {
delete node;
}
}
};
5. 完整的生产级实现
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iostream>
#include <queue>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
// Case-insensitive Aho-Corasick automaton for multi-pattern substring search.
//
// Usage: call addPattern() for every pattern, then build() (search() builds on
// demand), then search(). Patterns and text are lower-cased with std::tolower
// before matching; reported positions and matched text refer to the original
// text.
//
// The automaton owns its node tree through raw pointers, so it is move-only.
class ProductionACAutomaton {
private:
    struct Node {
        std::unordered_map<char, Node*> children;
        // Longest proper suffix of this node's string that is also a trie
        // path. Null until build() has run.
        Node* fail;
        // Nearest terminal node, other than the root, on the failure chain
        // above this node, or null. Lets search() visit only real matches.
        Node* dictLink;
        // Index into originalPatterns of the pattern ending here, -1 if none.
        int patternIndex;
        bool isEnd;
        Node() : fail(nullptr), dictLink(nullptr), patternIndex(-1), isEnd(false) {}
    };

    Node* root;                                // never null
    std::vector<std::string> patterns;         // unused; kept from the original layout
    std::vector<std::string> originalPatterns; // patterns exactly as passed to addPattern
    bool isBuilt;

    // Returns the child of `node` on `c`, or nullptr when there is no such edge.
    static Node* childOf(const Node* node, char c) {
        const auto it = node->children.find(c);
        return it == node->children.end() ? nullptr : it->second;
    }

    // Lower-cases a copy of `input`. Each char goes through unsigned char,
    // because passing a negative char value to std::tolower is undefined.
    static std::string toLowerCopy(const std::string& input) {
        std::string result = input;
        std::transform(result.begin(), result.end(), result.begin(),
                       [](unsigned char ch) {
                           return static_cast<char>(std::tolower(ch));
                       });
        return result;
    }

    void swap(ProductionACAutomaton& other) noexcept {
        std::swap(root, other.root);
        patterns.swap(other.patterns);
        originalPatterns.swap(other.originalPatterns);
        std::swap(isBuilt, other.isBuilt);
    }

public:
    // One occurrence of a pattern inside the searched text.
    struct MatchResult {
        int start;               // index of the first matched character
        int end;                 // index of the last matched character (inclusive)
        int patternIndex;        // index of the pattern, as accepted by getPattern()
        std::string matchedText; // the matched slice of the original text
        MatchResult(int s, int e, int p, const std::string& text)
            : start(s), end(e), patternIndex(p), matchedText(text.substr(s, e - s + 1)) {}
    };

    ProductionACAutomaton() : root(new Node()), isBuilt(false) {}

    // Copying would make two objects delete the same nodes, so it is disabled.
    ProductionACAutomaton(const ProductionACAutomaton&) = delete;
    ProductionACAutomaton& operator=(const ProductionACAutomaton&) = delete;

    // Moving swaps contents, so the source stays a valid automaton. After move
    // construction the source is empty; after move assignment it holds the
    // previous contents of the target.
    ProductionACAutomaton(ProductionACAutomaton&& other) : ProductionACAutomaton() {
        swap(other);
    }
    ProductionACAutomaton& operator=(ProductionACAutomaton&& other) noexcept {
        swap(other);
        return *this;
    }

    ~ProductionACAutomaton() {
        clear(root);
    }

    // Deletes `node` and everything below it. Uses an explicit stack instead
    // of recursion so that very long patterns cannot overflow the call stack.
    void clear(Node* node) {
        if (node == nullptr) {
            return;
        }
        std::vector<Node*> pendingNodes;
        pendingNodes.push_back(node);
        while (!pendingNodes.empty()) {
            Node* doomed = pendingNodes.back();
            pendingNodes.pop_back();
            for (const auto& edge : doomed->children) {
                pendingNodes.push_back(edge.second);
            }
            delete doomed;
        }
    }

    // Registers a pattern. Its index is the number of patterns added before it.
    // An empty pattern is accepted but never reported by search().
    // Throws std::runtime_error once the automaton has been built.
    void addPattern(const std::string& pattern) {
        if (isBuilt) {
            throw std::runtime_error("Cannot add pattern after building");
        }
        originalPatterns.push_back(pattern);
        insert(normalizePattern(pattern),
               static_cast<int>(originalPatterns.size() - 1));
    }

    // Returns the lower-cased form of `pattern` that is stored in the trie.
    std::string normalizePattern(const std::string& pattern) const {
        return toLowerCopy(pattern);
    }

    // Returns the lower-cased form of `text` that the automaton is run over.
    std::string normalizeText(const std::string& text) const {
        return toLowerCopy(text);
    }

    // Adds an already-normalised pattern to the trie and tags its last node
    // with `index`. If the same pattern is inserted twice the later index wins.
    void insert(const std::string& pattern, int index) {
        Node* current = root;
        for (const char c : pattern) {
            Node* next = childOf(current, c);
            if (next == nullptr) {
                next = new Node();
                try {
                    current->children.emplace(c, next);
                } catch (...) {
                    // Not yet linked into the trie, so release it here.
                    delete next;
                    throw;
                }
            }
            current = next;
        }
        current->isEnd = true;
        current->patternIndex = index;
    }

    // Computes failure and dictionary links breadth-first. Later calls are
    // no-ops.
    void build() {
        if (isBuilt) return;
        std::queue<Node*> pending;

        // The root and its direct children fall back to the root.
        root->fail = root;
        root->dictLink = nullptr;
        for (auto& edge : root->children) {
            edge.second->fail = root;
            edge.second->dictLink = nullptr;
            pending.push(edge.second);
        }

        while (!pending.empty()) {
            Node* current = pending.front();
            pending.pop();
            for (auto& edge : current->children) {
                const char c = edge.first;
                Node* childNode = edge.second;

                // Climb the parent's failure chain until a state has an edge on c.
                Node* failNode = current->fail;
                Node* target = childOf(failNode, c);
                while (target == nullptr && failNode != root) {
                    failNode = failNode->fail;
                    target = childOf(failNode, c);
                }
                childNode->fail =
                    (target != nullptr && target != childNode) ? target : root;

                // The failure target is shallower, so its dictionary link is
                // already final at this point of the breadth-first traversal.
                Node* fallback = childNode->fail;
                childNode->dictLink = (fallback != root && fallback->isEnd)
                                          ? fallback
                                          : fallback->dictLink;

                pending.push(childNode);
            }
        }
        isBuilt = true;
    }

    // Finds every occurrence of every pattern in `text`, ignoring case.
    // Builds the automaton first if needed. Results are ordered by end
    // position; matches ending at the same position are listed longest first.
    std::vector<MatchResult> search(const std::string& text) {
        if (!isBuilt) {
            build();
        }
        std::vector<MatchResult> results;
        const std::string normalizedText = normalizeText(text);
        Node* current = root;

        for (std::size_t pos = 0; pos < normalizedText.size(); ++pos) {
            const char c = normalizedText[pos];

            // Fall back along failure links until a state accepts c.
            Node* next = childOf(current, c);
            while (next == nullptr && current != root) {
                current = current->fail;
                next = childOf(current, c);
            }
            if (next != nullptr) {
                current = next;
            }
            if (current == root) {
                continue; // the root never represents a match
            }

            // Dictionary links visit only terminal nodes, so the work done
            // here is proportional to the number of matches reported.
            const int endPos = static_cast<int>(pos);
            for (const Node* hit = current->isEnd ? current : current->dictLink;
                 hit != nullptr; hit = hit->dictLink) {
                const int length =
                    static_cast<int>(originalPatterns[hit->patternIndex].length());
                results.emplace_back(endPos - length + 1, endPos,
                                     hit->patternIndex, text);
            }
        }
        return results;
    }

    // Runs search() on each text and returns the results in the same order.
    std::vector<std::vector<MatchResult>> batchSearch(const std::vector<std::string>& texts) {
        std::vector<std::vector<MatchResult>> results;
        results.reserve(texts.size());
        for (const auto& text : texts) {
            results.push_back(search(text));
        }
        return results;
    }

    // Number of patterns registered through addPattern().
    int patternCount() const {
        return static_cast<int>(originalPatterns.size());
    }

    // Returns the pattern with the given index exactly as it was added.
    // Throws std::out_of_range for an invalid index.
    const std::string& getPattern(int index) const {
        if (index < 0 ||
            static_cast<std::size_t>(index) >= originalPatterns.size()) {
            throw std::out_of_range("Pattern index out of range");
        }
        return originalPatterns[index];
    }

    // Prints one line per node, breadth-first, to std::cout (debugging aid).
    void printStructure() const {
        std::queue<std::pair<const Node*, std::string>> q;
        q.push({root, "root"});
        while (!q.empty()) {
            auto [node, path] = std::move(q.front());
            q.pop();
            std::cout << "Node: " << path;
            if (node->isEnd) {
                std::cout << " (End of pattern " << node->patternIndex << ")";
            }
            std::cout << '\n';
            for (const auto& child : node->children) {
                q.push({child.second, path + " -> " + child.first});
            }
        }
    }
};
// Usage example: registers four patterns, searches one sentence and prints
// every match with its position.
int main() {
    ProductionACAutomaton ac;

    // Register the patterns, then freeze the automaton.
    for (const char* word : {"he", "she", "his", "hers"}) {
        ac.addPattern(word);
    }
    ac.build();

    const std::string text = "She sells seashells by the seashore.";
    const auto matches = ac.search(text);

    std::cout << "在文本 \"" << text << "\" 中找到 " << matches.size() << " 个匹配:" << std::endl;
    for (const auto& hit : matches) {
        const std::string& pattern = ac.getPattern(hit.patternIndex);
        std::cout << "位置 [" << hit.start << "," << hit.end << "]: \""
                  << hit.matchedText << "\" 匹配模式串 \""
                  << pattern << "\"" << std::endl;
    }
    return 0;
}
6. 复杂度分析与性能测试
6.1 复杂度证明
构建阶段:
- 时间复杂度:O(m),m为所有模式串总长度
- 证明:插入时每个字符只处理一次;构建失败指针时采用均摊分析——沿同一个模式串每向下走一步,失败指针目标的深度至多增加1,而每次回退至少减少1,因此每个模式串上的回退总次数不超过其长度,均摊后每个字符的处理次数为常数(子节点用哈希表存储时为期望时间)
搜索阶段:
- 时间复杂度:O(n + z),n为文本长度,z为匹配结果数量
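- 证明:每个文本字符最多导致一次成功转移和若干次失败转移;每次成功转移使当前状态深度加1,每次失败转移使深度至少减1,因此失败转移的总次数不超过成功转移的总次数n。z这一项要求能够直接枚举每个位置的匹配(例如通过预先合并的输出集合或输出链接);如果在每个位置都沿失败指针链完整回溯,最坏情况下会退化为O(n·L),L为最长模式串的长度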
6.2 性能测试
#include <chrono>
// Rough timing harness: builds an automaton from 1000 generated patterns,
// searches a generated text, and prints the elapsed wall-clock time in
// milliseconds together with the number of matches.
void performanceTest() {
    using Clock = std::chrono::high_resolution_clock;

    ProductionACAutomaton ac;

    // Synthetic inputs: "pattern0" .. "pattern999".
    std::vector<std::string> patterns;
    for (int id = 0; id < 1000; ++id) {
        patterns.push_back("pattern" + std::to_string(id));
    }

    // 100000 space-separated words cycling through "pattern0" .. "pattern99".
    std::string text = "";
    for (int word = 0; word < 100000; ++word) {
        text += "pattern" + std::to_string(word % 100) + " ";
    }

    // The measured interval covers pattern insertion, build and search.
    const auto start = Clock::now();
    for (const auto& pattern : patterns) {
        ac.addPattern(pattern);
    }
    ac.build();
    auto matches = ac.search(text);
    const auto end = Clock::now();

    const auto duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "构建和搜索耗时: " << duration.count() << "ms" << std::endl;
    std::cout << "找到 " << matches.size() << " 个匹配" << std::endl;
}
7. 实际应用案例
7.1 敏感词过滤系统
// Masks sensitive words in a text using a ProductionACAutomaton.
//
// Sensitive words must all be added before the first call to filter(): the
// first search builds the automaton, after which adding a pattern throws.
class SensitiveWordFilter {
private:
    ProductionACAutomaton ac;
    // Exact text slices that must never be masked (compared case-sensitively).
    std::unordered_set<std::string> whitelist;

public:
    // Registers a word to be masked.
    void addSensitiveWord(const std::string& word) {
        ac.addPattern(word);
    }

    // Registers a word whose exact occurrences are exempt from masking.
    void addWhitelistWord(const std::string& word) {
        whitelist.insert(word);
    }

    // Splits `text` into fragments: untouched stretches are copied verbatim
    // and every masked region becomes the fragment "***". Concatenating the
    // fragments gives the filtered text.
    //
    // A match whose exact text is whitelisted is skipped. A different
    // sensitive word overlapping it is still masked.
    std::vector<std::string> filter(const std::string& text) {
        const auto matches = ac.search(text);

        // Collect the spans to mask as (start, end) pairs.
        std::vector<std::pair<int, int>> spans;
        spans.reserve(matches.size());
        for (const auto& match : matches) {
            if (whitelist.find(match.matchedText) != whitelist.end()) {
                continue;
            }
            spans.emplace_back(match.start, match.end);
        }

        // The automaton reports matches ordered by end position, and matches
        // may overlap or nest ("she" contains "he"). Sort by start and merge
        // overlapping spans so each region is masked exactly once.
        std::sort(spans.begin(), spans.end());

        std::vector<std::string> filtered;
        int lastPos = 0;
        std::size_t i = 0;
        while (i < spans.size()) {
            const int start = spans[i].first;
            int end = spans[i].second;
            for (++i; i < spans.size() && spans[i].first <= end; ++i) {
                end = std::max(end, spans[i].second);
            }

            // Copy the clean text between the previous mask and this one.
            if (start > lastPos) {
                filtered.push_back(text.substr(lastPos, start - lastPos));
            }
            filtered.push_back("***");
            lastPos = end + 1;
        }

        // Copy whatever follows the last masked region.
        if (static_cast<std::size_t>(lastPos) < text.length()) {
            filtered.push_back(text.substr(lastPos));
        }
        return filtered;
    }
};
7.2 生物信息学应用
// Locates registered gene sequences inside a genome string.
//
// Matching is delegated to ProductionACAutomaton, which compares
// case-insensitively. All sequences must be added before the first call to
// findGenes(), because the first search builds the automaton.
class DNASequenceMatcher {
private:
    ProductionACAutomaton ac;
    // geneIds[i] is the caller-supplied id of the i-th added sequence, which
    // is also the automaton's pattern index for that sequence.
    std::vector<int> geneIds;

public:
    // The automaton member is default-constructed. It must not be re-assigned
    // from a temporary, because it owns its node tree.
    DNASequenceMatcher() = default;

    // Registers `sequence` under the caller-chosen identifier `geneId`.
    void addGeneSequence(const std::string& sequence, int geneId) {
        ac.addPattern(sequence);
        // Recorded only after addPattern succeeded, so indices stay aligned.
        geneIds.push_back(geneId);
    }

    // Returns one (start position, geneId) pair per occurrence of a
    // registered sequence in `genome`, in the order the automaton reports them.
    std::vector<std::pair<int, int>> findGenes(const std::string& genome) {
        const auto matches = ac.search(genome);
        std::vector<std::pair<int, int>> hits;
        hits.reserve(matches.size());
        for (const auto& match : matches) {
            hits.emplace_back(match.start, geneIds[match.patternIndex]);
        }
        return hits;
    }
};
8. 算法变体与扩展
8.1 带权重的AC自动机
// Trie node for a weighted automaton variant: every reported pattern carries
// a weight.
struct WeightedNode {
    // Outgoing edges, keyed by the next character.
    std::unordered_map<char, WeightedNode*> children;
    // Failure link, null until assigned.
    WeightedNode* fail;
    // (pattern index, weight) pairs associated with this node.
    std::vector<std::pair<int, double>> output;
    // Largest weight among the patterns ending at this node; starts at 0.0.
    double maxWeight;

    // Creates a detached node with no edges, no outputs and zero weight.
    WeightedNode() : children{}, fail{nullptr}, output{}, maxWeight{0.0} {}
};
8.2 模糊匹配AC自动机
支持一定编辑距离的模糊匹配:
// Skeleton for an automaton that tolerates a bounded edit distance.
// The matching algorithm itself is not implemented yet.
class FuzzyACAutomaton {
private:
    ProductionACAutomaton ac;
    // Initialised so the member never holds an indeterminate value.
    // NOTE(review): nothing in the visible code sets or reads it - confirm
    // how it is meant to relate to fuzzySearch's maxDistance argument.
    int maxEditDistance = 0;

public:
    // Placeholder: intended to combine dynamic programming with the automaton
    // to report pattern matches within `maxDistance` edits.
    //
    // Currently always returns an empty result. The explicit return matters:
    // flowing off the end of a non-void function is undefined behaviour.
    std::vector<std::pair<int, int>> fuzzySearch(const std::string& text, int maxDistance) {
        // TODO: implement edit-distance-constrained multi-pattern matching.
        (void)text;
        (void)maxDistance;
        return {};
    }
};
9. 总结
AC自动机作为一种经典的多模式匹配算法,具有以下优势:
- 高效性:线性时间复杂度,适合大规模文本处理
- 灵活性:易于扩展和优化
- 实用性:广泛应用于各种实际场景
3318

被折叠的 条评论
为什么被折叠?



