regex -> 逆波兰表达式 -> NFA -> DFA

最新推荐文章于 2023-11-07 16:40:34 发布
原创最新推荐文章于 2023-11-07 16:40:34 发布 · 151 阅读
1 ·
CC 4.0 BY-SA版权
文章标签：
#算法
本文介绍了如何利用DFA解决LeetCode第10题“正则表达式匹配”，步骤包括：将中缀形式的正则表达式转换为后缀（逆波兰）表达式、使用Thompson算法由后缀表达式构建NFA、使用子集构造算法将NFA转换为DFA，最后将待匹配字符串输入DFA完成匹配。关键步骤涉及运算符优先级处理和自动机构造技术。
  1 class Solution {
  2 public:
  3 
  4     /*
  5     * 利用DFA解决leetcode 10.正则表达式 问题
  6     * 
  7     * 步骤：
  8     * 1. 将正则字符串视为后序遍历表达式，利用栈和运算符优先级转换为
  9     *    中序遍历表达式，即逆波兰表达式。
 10     * 2. 使用中序串通过Thompson算法构造NFA。
 11     * 3. 将NFA使用子集构造算法转换为DFA。
 12     * 4. 将给定串s输入DFA模拟所有状态，若出现s读取完毕且处于结束节点则返回true。
 13     * 
 14     * 注意：
 15     * 为了简单起见，所有的new都没有进行delete。
 16     * 
 17     * 参考：
 18     * 第1步可以参考台湾清华大学 韩永楷 《数据结构》课程 第7.2 - 7.3课
 19     * 第2-4步可以参考中科大 华保健 《编译原理》课程 第2.3.1 - 3.2.4课
 20     */
 21 
 22     // NFA使用的节点
 23     struct Node;
 24 
 25     // NFA使用的路径
 26     struct Path
 27     {
 28         char c;
 29         Node * dest;
 30         Path(char in_c, Node* in_dest) :c(in_c), dest(in_dest) {}
 31     };
 32 
 33     struct Node
 34     {
 35         int index;
 36         vector<Path> pathes;
 37         Node()
 38         {
 39             static int i = 0;
 40             index = i++;
 41         }
 42         Node(vector<Path> in_pathes) :pathes(in_pathes) {}
 43         void AddPath(char c, Node* dest)
 44         {
 45             pathes.emplace_back(c, dest);
 46         }
 47     };
 48 
 49     // 将后序遍历正则串转换为中序遍历
 50     // 使用建立ExpressionTree的方法
 51     // 字符连接则添加一个&作为运算符
 52     queue<char> ToInOrder(string s)
 53     {
 54         int len = s.length();
 55         queue<char> ans;
 56         stack<char> symbols; // 符号栈
 57         bool prevIsChar = false;
 58 
 59         // 将低优先级的运算符全部弹出至ans
 60         auto PopLowPriority = [&]()
 61         {
 62             while (!symbols.empty())
 63             {
 64                 ans.push(symbols.top());
 65                 symbols.pop();
 66             }
 67 
 68         };
 69 
 70         for (int i = 0; i < len; ++i)
 71         {
 72             char c = s[i];
 73             if (c == '*')
 74             {
 75                 if (!symbols.empty() && symbols.top() == '&')
 76                 {
 77                     symbols.push(c);
 78                 }
 79                 else
 80                 {
 81                     PopLowPriority();
 82                     symbols.push(c);
 83                 }
 84                 prevIsChar = true;
 85                 continue;
 86             }
 87 
 88             if (prevIsChar)
 89             {
 90                 PopLowPriority();
 91                 symbols.push('&');
 92             }
 93 
 94             ans.push(c);
 95 
 96             //
 97             if (i + 1 < len && s[i + 1] != '*')
 98                 prevIsChar = true;
 99             else
100                 prevIsChar = false;
101         }
102 
103         PopLowPriority();
104 
105         return ans;
106     }
107 
108     struct Tree
109     {
110         Node* start,*end;
111     };
112 
113     // 使用Thompson算法，将中序的串构造成NFA
114     Tree CalcInOrder(queue<char> inorder)
115     {
116         stack<Tree> temp;
117 
118         while (!inorder.empty())
119         {
120             char c = inorder.front();
121             inorder.pop();
122 
123             if (c == '*')
124             {
125                 auto &tree = temp.top();
126 
127                 Node* oldStart = tree.start;
128                 Node* oldEnd = tree.end;
129 
130                 Node* newStart = new Node;
131                 Node* newEnd = new Node;
132                 newStart->AddPath(0, oldStart);
133                 newStart->AddPath(0, newEnd);
134                 oldEnd->AddPath(0, oldStart);
135                 oldEnd->AddPath(0, newEnd);
136 
137                 tree.start = newStart;
138                 tree.end = newEnd;
139                 continue;
140             }
141 
142             if (c == '&')
143             {
144                 auto e2 = temp.top();temp.pop();
145                 auto e1 = temp.top(); temp.pop();
146 
147                 Tree tree;
148                 tree.start = e1.start;
149                 tree.end = e2.end;
150                 e1.end->AddPath(0, e2.start);
151 
152                 temp.push(tree);
153                 continue;
154             }
155 
156             Node* node1 = new Node;
157             Node* node2 = new Node;
158             node1->AddPath(c, node2);
159             temp.push({ node1,node2 });
160         }
161         return temp.top();
162     }
163 
164     // 闭包函数
165     // 返回：一个节点的闭包，即
166     //       一个节点通过ε路径能够达到的所有节点的集合(包括自身)
167     set<Node*> e_closure(Node* start)
168     {
169         set<Node*> ans;
170         queue<Node*> q;
171         q.push(start);
172         while (!q.empty())
173         {
174             auto node = q.front();
175             q.pop();
176 
177             ans.insert(node);
178 
179             for (auto& path : node->pathes)
180                 if (path.c == 0 && ans.find(path.dest) == ans.end())
181                     q.push(path.dest);
182         }
183         return ans;
184     }
185 
186     // DFA使用的节点结构
187     struct FinalNode;
188     struct FinalPath
189     {
190         char c;
191         FinalNode* dest;
192         FinalPath(char c, FinalNode* dest) :c(c), dest(dest) {}
193     };
194 
195     struct FinalNode
196     {
197         int index;
198         bool isEnd;
199         vector<FinalPath> pathes;
200         FinalNode(bool isEnd) :isEnd(isEnd)
201         {
202             static int i = 0;
203             index = i++;
204         }
205         void AddPath(char c, FinalNode *dest)
206         {
207             pathes.emplace_back(c,dest);
208         }
209     };
210 
211     using FinalTree = FinalNode*;
212 
213     // 子集构造算法 将NFA转换为DFA
214     FinalTree BuildSubSet(const Tree tree)
215     {
216         // 返回一个节点是否为end节点
217         auto IsEnd = [&](Node* node)->bool
218         {
219             return node == tree.end;
220         };
221 
222         // 返回一个集合中是否有end节点
223         auto SetIsEnd = [&](const set<Node*>& nodeSet)->bool
224         {
225             return nodeSet.find(tree.end) != nodeSet.end();
226         };
227 
228         set<Node*> q0;
229         q0 = e_closure(tree.start);
230 
231         FinalNode* n0 = new FinalNode(SetIsEnd(q0));
232         map<set<Node*>, FinalNode*> newTreeDict;
233         newTreeDict[q0] = n0;
234 
235         FinalTree newTree;
236         newTree = n0;
237 
238         set<set<Node*>> Q;
239         Q.insert(q0);
240 
241         queue<set<Node*>> worklist;
242         worklist.push(q0);
243         while (!worklist.empty())
244         {
245             auto q = worklist.front();
246             worklist.pop();
247 
248             // 
249             map<char, set<Node*>> dict;
250             for (auto node : q)
251             {
252                 for (auto& path : node->pathes)
253                 {
254                     if (path.c != 0)
255                     {
256                         dict[path.c].insert(path.dest);
257                     }
258                 }
259             }
260 
261             for (auto& pr : dict)
262             {
263                 for (auto node : pr.second)
264                 {
265                     auto t = e_closure(node);
266                     FinalNode* node_t = nullptr;
267                     if (newTreeDict.find(t) != newTreeDict.end())
268                         node_t = newTreeDict[t];
269                     else
270                     {
271                         node_t=new FinalNode(SetIsEnd(t));
272                         newTreeDict[t] = node_t;
273                     }
274 
275                     //
276                     FinalNode* node_q = newTreeDict[q];
277                     node_q->AddPath(pr.first, node_t);
278 
279                     if (Q.find(t) == Q.end())
280                     {
281                         Q.insert(t);
282                         worklist.push(t);
283                     }
284                 }
285             }
286         }
287         return newTree;
288     }
289 
290     // 遍历DFA，确定字符串是否匹配
291     bool IsMatch(string s, FinalTree dfa)
292     {
293         struct State
294         {
295             int spos; // 当前的字符串序号
296             FinalNode* node; // 当前所在的节点
297         };
298 
299         int len = s.length();
300         State state0 = { -1,dfa };
301         queue<State> q;
302         q.push(state0);
303 
304         while (!q.empty())
305         {
306             auto state = q.front();
307             q.pop();
308 
309             // 若字符串已读取到头，且当前节点为末尾，则返回true
310             if (state.spos == len-1)
311             {
312                 if (state.node->isEnd)
313                     return true;
314 
315                 // 只是字符串读到头，但节点不是尾节点
316                 continue;
317             }
318 
319             for (auto& path : state.node->pathes)
320             {
321                 // 若下一个字符 和路径匹配，或者该路径匹配一个'.'
322                 if (path.c == s[state.spos + 1] || path.c=='.')
323                 {
324                     // 加入队列
325                     q.push({ state.spos + 1,path.dest });
326                 }
327             }
328         }
329         return false;
330     }
331 
332     bool isMatch(string s, string p)
333     {
334         if (p.empty()) return s.empty();
335         auto q = ToInOrder(p);
336         auto tree = CalcInOrder(q);
337         auto DFA = BuildSubSet(tree);
338 
339         return IsMatch(s, DFA);
340     }
341 };