正则表达式转换成NFA的实现
NFA
正则表达式的匹配一般基于两种有穷自动机来实现:NFA(Nondeterministic Finite Automata,不确定的有穷自动机)和 DFA(Deterministic Finite Automata,确定的有穷自动机)。
一个NFA由以下几部分组成:
- 一个有穷的状态集合S
- 一个输入符号集合E
- 一个转换函数,它为每一个状态和 E∪{ε}(ε 表示空字符串)中的每一个符号都给出了对应的后继状态集合
- S中的一个状态S0(该状态为起始状态)
- S的一个子集F被标记为接受状态(终止状态的集合)
转换图
不管是NFA还是DFA都可以表示为一张转换图。图中的结点是状态,带有标号的边表示转换函数。从状态s到状态t存在一条标号为a的边,当且仅当状态t是状态s在输入a上的后继状态之一。如果是NFA,则一条边的标号不仅可以是输入符号集合中的一个符号,也可以是空字符串ε。
下图给出了一个识别(a|b)*abb的状态转换图,下面的状态3是结束状态,以两个嵌套的圆圈表示

正则表达式转换NFA的算法
这个算法本身比较简单,但我在这里描述得不太清楚,可参考下面的链接和《编译原理》(龙书)第二版 3.7.4 节
参考链接: link.
很感谢这位老哥能把NFA转换算法写得这么清楚,让我受用无穷
正则式转换算法的代码
/*
* NFA implementation in C++.
* The algorithm is explained in detail in this blog post:
* https://deniskyashif.com/2019/02/17/implementing-a-regular-expression-engine/
* The first step is to parse the regular expression.
*/
#include <map>
#include <vector>
#include <stack>
#include <memory>
#include <cctype>
#include <algorithm>
#include <set>
#include <queue>
#include <string>
#include <stdio.h>
using namespace std;
/*
 * One state of the NFA graph.
 *
 * A state has two kinds of outgoing edges: labelled edges stored in
 * `transition` (at most one target per character) and unlabelled epsilon
 * edges stored in `epsilonTransitions`.
 */
struct NfaNode
{
    int stateNum;    // numeric id of the state, used when printing the graph
    bool isEnd;      // true for a final (accepting) state
    bool isEpsilon;  // true when the state has outgoing epsilon edges;
                     // printNfa only follows epsilonTransitions when this is set
    std::vector<NfaNode*> epsilonTransitions; // targets of the epsilon edges
    std::map<char, NfaNode*> transition;      // labelled edges: character -> target state

    // Gives the scalar members defined values even when a node is
    // default-initialized (`NfaNode n;` or `new NfaNode`), not only when it
    // is value-initialized with `new NfaNode()`.
    NfaNode() : stateNum(0), isEnd(false), isEpsilon(false) {}
};
struct Nfa
{
NfaNode* startNode;
NfaNode* endNode;
};
// Marker pushed onto the operator stack once a parenthesised sub-expression
// has been closed by ')'. When the next character is a letter, parseRegex
// treats this marker like a preceding letter and extends the current NFA.
#define SUB_BRACE_REGEX_FLAG '@'
/*
 * Allocates a new NFA state on the heap.
 *
 * isEnd: whether the new state is a final (accepting) state.
 *
 * State ids come from a function-local counter, so every call hands out the
 * next consecutive number, starting at 0. The new state starts without
 * epsilon edges. The caller owns the returned node.
 */
NfaNode* createState(bool isEnd)
{
    static int nextStateId = 0;

    NfaNode* state = new NfaNode();
    state->stateNum = nextStateId;
    ++nextStateId;
    state->isEpsilon = false;
    state->isEnd = isEnd;
    return state;
}
/*
 * Allocates an NFA fragment made of two fresh, unconnected states: a
 * non-accepting start state and an accepting end state.
 *
 * The start state is created first, so it receives the lower state number.
 * The caller owns the returned fragment and both of its states.
 */
Nfa* createNfa()
{
    Nfa* fragment = new Nfa();
    NfaNode* entry = createState(false);
    NfaNode* accept = createState(true);
    fragment->startNode = entry;
    fragment->endNode = accept;
    return fragment;
}
Nfa* parseRegex(string& regex)
{
stack<Nfa*> stackNfa;
stack<char> stackOperator;
if (!regex.empty())
{
string::iterator it = regex.begin();
if (!isalpha(*it) && (*it) != '(')
{
printf("regex:%s is not started with alphabet\n", regex.c_str());
return NULL;
}
if (isalpha(*it))
{
Nfa* nfa = createNfa();
nfa->startNode->transition[*it] = nfa->endNode;
stackNfa.push(nfa);
}
stackOperator.push(*it++);
while ((it != regex.end()))
{
char cur = *it;
char top = stackOperator.top();
if (isalpha(cur))
{
if (isalpha(top) || SUB_BRACE_REGEX_FLAG == top || '*' == top)
{
// connect the previous NFA graph
Nfa* nfa = stackNfa.top();
NfaNode* endNode = createState(true);
nfa->endNode->transition[cur] = endNode;
nfa->endNode->isEnd = false;
nfa->endNode = endNode;
//previous alphabet out the stack , current alphabet push to stack
stackOperator.pop();
stackOperator.push(*it++);
continue;
}
if ('|' == top)
{
// or operator, the previous NFA has been complete, merge this two NFA
Nfa* curnfa = createNfa();
curnfa->startNode->transition[cur] = curnfa->endNode;
Nfa* newNfa = createNfa(); // the NFA that merges the two NFAs
Nfa* topnfa = stackNfa.top();
newNfa->startNode->isEpsilon = true;
newNfa->startNode->epsilonTransitions.push_back(curnfa->startNode);
newNfa->startNode->epsilonTransitions.push_back(topnfa->startNode);
// conenct end node
curnfa->endNode->isEpsilon = true;
topnfa->endNode->isEpsilon = true;
topnfa->endNode->isEnd = false;
curnfa->endNode->isEnd = false;
curnfa->endNode->epsilonTransitions.push_back(newNfa->endNode);
topnfa->endNode->epsilonTransitions.push_back(newNfa->endNode);
stackNfa.pop(); // pop the previous NFA, push the newNFA
delete topnfa;
delete curnfa;
stackNfa.push(newNfa);
stackOperator.pop();
stackOperator.push(*it++);
continue;
}
if ('(' == top)
{
Nfa* curnfa = createNfa();
curnfa->startNode->transition[cur] = curnfa->endNode;
stackNfa.push(curnfa);
stackOperator.push(*it++);;
continue;
}
}
if ('(' == cur)
{
//whatever the stackOperator top element, currentlly means that this will create a new NFA graph for later using.
// so push it into the stackOperator
stackOperator.push(*it++);
continue;
}
if ('|' == cur)
{
if ('(' == top || '|' == top)
{
printf("illegal brace or | operator \n"); // this means this regex like "||" "(|", those regexs are not right
break;
}
// alphabet should pop out the stack
stackOperator.pop();
stackOperator.push(*it++);
continue;
}
if (')' == cur)
{
if ('|' == top || '(' == top)
{
printf("illegal brace or | operator \n"); // this means regex like "(a|)" "()" "(()", those regex are not right
break;
}
stackOperator.pop();
// check if the braces are couple
if (stackOperator.empty())
{
printf("miss match brace\n");
break;
}
top = stackOperator.top();
if (top != '(')
{
printf("miss match brace\n");
break;
}
// pop out the left brace
stackOperator.pop();
if (!stackOperator.empty())
{
top = stackOperator.top();
if (isalpha(top))
{
if (stackNfa.size() < 2)
{
printf("miss match left regex \n");
break;
}
//this means the regex like this "ab(cd)"
// need to merge the previous 2 nfa in stackNfa
Nfa* first = stackNfa.top();
stackNfa.pop();
Nfa* second = stackNfa.top();
stackNfa.pop();
// merge the two NFA, then push it into the stackNfa
second->startNode->transition.clear();
second->startNode->transition[top] = first->startNode;
delete second->endNode;
second->endNode = first->endNode;
delete first;
stackNfa.push(second);
stackOperator.pop();
}
if ('|' == top)
{
if (stackNfa.size() < 2)
{
printf("miss match left regex \n");
break;
}
//this means the regex like this "b|(cd)"
// need to merge the previous 2 nfa in stackNfa
Nfa* first = stackNfa.top();
stackNfa.pop();
Nfa* second = stackNfa.top();
stackNfa.pop();
Nfa* newNfa = createNfa();
newNfa->startNode->isEpsilon = true;
newNfa->startNode->epsilonTransitions.push_back(first->startNode);
newNfa->startNode->epsilonTransitions.push_back(second->startNode);
first->endNode->isEnd = false;
second->endNode->isEnd = false;
first->endNode->isEpsilon = true;
second->endNode->isEpsilon = true;
first->endNode->epsilonTransitions.push_back(newNfa->endNode);
second->endNode->epsilonTransitions.push_back(newNfa->endNode);
delete first;
delete second;
stackNfa.push(newNfa);
stackOperator.pop();
}
if ('*' == top)
{
// merge the previous 2 nfa in stackNfa
// this means regex like this "a*(ab)"
if (stackNfa.size() < 2)
{
printf("miss match regex \n");
break;
}
Nfa* first = stackNfa.top();
stackNfa.pop();
Nfa* second = stackNfa.top();
stackNfa.pop();
// delete the first start node
second->endNode->isEnd = false;
second->endNode->isEpsilon = first->startNode->isEpsilon;
second->endNode->transition = first->startNode->transition;
second->endNode->epsilonTransitions.clear();
second->endNode->epsilonTransitions = first->startNode->epsilonTransitions;
delete first->startNode;
delete first;
stackNfa.push(second);
stackOperator.pop();
}
}
stackOperator.push(SUB_BRACE_REGEX_FLAG);
it++;
continue;
}
if ('*' == cur)
{
if (top == '*')
{
it++;
continue;
}
if ('|' == top || '(' == top)
{
printf("ileagal regex\n");
break;
}
if (stackNfa.empty())
{
// this regex like "*"
printf("empty NFA stack , invalid regex\n");
break;
}
Nfa* nfa = createNfa();
Nfa* top = stackNfa.top();
nfa->startNode->isEpsilon = true;
nfa->startNode->epsilonTransitions.push_back(top->startNode);
nfa->startNode->epsilonTransitions.push_back(nfa->endNode);
top->endNode->isEnd = false;
top->endNode->isEpsilon = true;
top->endNode->epsilonTransitions.push_back(nfa->endNode);
top->endNode->epsilonTransitions.push_back(top->startNode);
stackNfa.pop();
delete top;
stackNfa.push(nfa);
stackOperator.pop();
stackOperator.push(*it++);
continue;
}
}
}
printf("NFA stack size:%d\n", stackNfa.size()); // the stackNfa size must be 1
return stackNfa.top();
}
/*
 * Prints every edge of the NFA to stdout, one per line, walking the graph
 * breadth-first from the start state.
 *
 * Epsilon edges are printed as "<from>---Epsilon---<to>" and labelled edges
 * as "<from>---<char>---<to>". The epsilon edges of a state are only
 * followed when its isEpsilon flag is set. Each state is expanded once.
 * A null `nfa` prints nothing.
 */
void printNfa(Nfa* nfa)
{
    if (nfa == NULL)
    {
        return;
    }

    set<NfaNode*> expanded;
    queue<NfaNode*> pending;
    pending.push(nfa->startNode);

    while (!pending.empty())
    {
        NfaNode* current = pending.front();
        pending.pop();

        // insert() reports whether the state was new; states that were
        // queued more than once are skipped here
        if (!expanded.insert(current).second)
        {
            continue;
        }

        if (current->isEpsilon)
        {
            const vector<NfaNode*>& targets = current->epsilonTransitions;
            for (size_t i = 0; i < targets.size(); ++i)
            {
                printf("%d---Epsilon---%d\n", current->stateNum, targets[i]->stateNum);
                pending.push(targets[i]);
            }
        }

        map<char, NfaNode*>::iterator edge = current->transition.begin();
        while (edge != current->transition.end())
        {
            printf("%d---%c---%d\n", current->stateNum, edge->first, edge->second->stateNum);
            pending.push(edge->second);
            ++edge;
        }
        // Disabled debug aid from the original: for a state flagged isEnd,
        // print whether it is the same node as nfa->endNode.
    }
}
int main()
{
string regex = "(a|b)*cd";
Nfa* nfa = parseRegex(regex);
printNfa(nfa);
return 0;
}
/*
* 该实现版本的代码存在内存泄漏:因为只是个人兴趣爱好,也没有刻意去遵循任何编码规范,所以没有特别注重内存的申请和释放,也存在其他不足,希望不影响大家的阅读。
* 但是正则表达式转换成对应的NFA这一算法思想的实现基本完成了。该实现版本的正则式不支持[]
* 和数字字符这类的用法,只支持括号、*、|和字母这类的字符操作。对于括号的处理,当遇到右括号时,表示当前的子表达式已经处理完毕,
* 需要将NFA栈顶的NFA和前一个NFA图进行合并。
* 该算法的实现利用了两个栈:一个符号栈和一个NFA栈。符号栈保存的是前一个处理的字符,NFA栈
* 保存的是前一个处理的字符所对应的NFA图,该NFA要么被合并成更大的NFA而被替换,要么是连接
* 新的NFA而被替换。该实现参考了Google RE2的源码,但是对于其关于NFA中q0、q1的处理不是很理解。
* 该算法的思想与编译原理中LR文法算法的处理方式类似。
*/
上面的输出结果为
NFA stack size:1
6---Epsilon---4
6---Epsilon---7
4---Epsilon---2
4---Epsilon---0
7---c---8
2---b---3
0---a---1
8---d---9
3---Epsilon---5
1---Epsilon---5
5---Epsilon---7
5---Epsilon---4
上面的数字代表状态,Epsilon代表空字符串ε.上面的输出表示的是
(a|b)*cd 转换后的边。根据这个输出信息构造的NFA转换图为

图形画起来真的是麻烦啊,哈哈,手画了一下,希望不影响大家的理解。好了就到这里吧,第一次写MarkDown的博客,实在是见笑了。
本文详细介绍了如何通过正则表达式创建NFA(非确定有限自动机),包括状态转移规则、转换图构建和C++代码实现。实例解析了'(a|b)*abb'的NFA构造过程,并展示了关键步骤和转换结果。
2986

被折叠的 条评论
为什么被折叠?



