编译原理之美 --- 05 | 语法分析（三）：实现一门简单的脚本语言

本文链接：https://blog.youkuaiyun.com/u012319493/article/details/103756023

脚本语言支持变量，通过使用字典作为变量存储区实现。

赋值语句中的等号后面可匹配表达式。

尝试一个规则不成功之后，恢复到原样，再去尝试另外的规则，这个现象就叫做“回溯”。

simple_script.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from play_with_compiler.craft.simple_parser import SimpleParser
from play_with_compiler.craft.base_type import ASTNodeType
import sys

'''
 * 一个简单的脚本解释器。
 * 所支持的语法，请参见simple_parser.py
 *
 * 运行脚本：
 * 在命令行下，键入：python simple_script.py
 * 则进入一个REPL界面。你可以依次敲入命令。比如：
 * > 2+3;
 * > int age = 10;
 * > int b;
 * > b = 10*2;
 * > age = age + b;
 * > exit();  //退出REPL界面。
 *
 * 你还可以使用一个参数 -v，让每次执行脚本的时候，都输出AST和整个计算过程。
 '''
class SimpleScript(object):
    """A small tree-walking interpreter for the simple script language.

    Variables are kept in a dictionary that maps a variable name to its
    integer value; ``None`` marks a variable that has been declared but
    not yet given a value.
    """

    def __init__(self, verbose):
        """Create an interpreter.

        :param verbose: when true, every evaluation step is traced to stdout.
        """
        self._variables = {}
        self._verbose = verbose

    def evaluate(self, node, indent):
        """Walk the AST rooted at ``node`` and compute its value.

        :param node: AST node exposing ``get_type()``, ``get_text()`` and
            ``get_children()``.
        :param indent: prefix used for the verbose trace. An empty string
            marks a top-level statement, whose value is echoed when not in
            verbose mode.
        :return: the integer value of the node, or ``None`` when there is
            nothing to compute (empty program, declaration without an
            initializer).
        :raises Exception: when an identifier is read before it holds a
            value, or when an assignment targets an undeclared variable.
        """
        result = None
        node_type = node.get_type()
        if self._verbose:
            print('%s Calcalationg: %s:' % (indent, node_type))

        if node_type == ASTNodeType.Programm:
            # Statements share the caller's indent so that each one is still
            # treated as "top level" and gets echoed.
            for child in node.get_children():
                result = self.evaluate(child, indent)
        elif node_type == ASTNodeType.Additive:
            left, right = self._operands(node, indent)
            if node.get_text() == '+':
                result = left + right
            else:
                result = left - right
        elif node_type == ASTNodeType.Multiplicative:
            left, right = self._operands(node, indent)
            if node.get_text() == '*':
                result = left * right
            else:
                # Floor division keeps the result an integer on Python 3 as
                # well; on Python 2 it is identical to int / int.
                result = left // right
        elif node_type == ASTNodeType.IntLiteral:
            result = int(node.get_text())
        elif node_type == ASTNodeType.Identifier:
            var_name = node.get_text()
            value = self._variables.get(var_name)
            if value is None:
                raise Exception('variavle ' + var_name + ' has not been set any value')
            result = int(value)
        elif node_type == ASTNodeType.AssignmentStmt:
            var_name = node.get_text()
            if var_name not in self._variables:
                raise Exception('unknown variable: ' + var_name)
            # An elif chain has no fall-through, so the right-hand side must
            # be evaluated and stored explicitly here.
            result = self._store(var_name, node, indent)
        elif node_type == ASTNodeType.IntDeclaration:
            result = self._store(node.get_text(), node, indent)

        if self._verbose:
            print('%sResult: %s' % (indent, result))
        elif indent == '':
            if node_type in (ASTNodeType.IntDeclaration, ASTNodeType.AssignmentStmt):
                print('%s: %s' % (node.get_text(), result))
            elif node_type != ASTNodeType.Programm:
                print(result)
        return result

    def _operands(self, node, indent):
        """Evaluate the two children of a binary node, left one first."""
        children = node.get_children()
        left = self.evaluate(children[0], indent + "\t")
        right = self.evaluate(children[1], indent + "\t")
        return int(left), int(right)

    def _store(self, var_name, node, indent):
        """Evaluate the optional first child of ``node`` and bind it to ``var_name``.

        Without a child the variable is bound to ``None`` (declared, no value).
        """
        value = None
        children = node.get_children()
        if len(children) > 0:
            value = int(self.evaluate(children[0], indent + '\t'))
        self._variables[var_name] = value
        return value

'''
实现一个简单的 REPL
'''
def play(args):
    """Run a simple read-eval-print loop on standard input.

    Input lines are buffered until one ends with ``;``; the buffered text is
    then parsed and evaluated. Typing ``exit();`` or closing the input
    stream leaves the loop. Errors raised while parsing or evaluating are
    reported and the buffered text is discarded.

    :param args: command-line arguments without the program name; a leading
        ``-v`` turns on verbose mode (AST dump plus evaluation trace).
    """
    verbose = len(args) > 0 and args[0] == '-v'
    if verbose:
        print('verbose mode')
    print('Simple script language!')

    # raw_input only exists on Python 2; Python 3 renamed it to input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    parser = SimpleParser()
    script = SimpleScript(verbose)
    script_text = ""

    while True:
        try:
            line = read_line(">")
        except EOFError:
            # End of input (Ctrl-D or exhausted pipe). Letting the generic
            # handler below catch this would make the loop spin forever.
            print("good bye!")
            break

        try:
            if line == 'exit();':
                print("good bye!")
                break
            script_text += line + "\n"
            if line.endswith(";"):
                tree = parser.parse(script_text)
                if verbose:
                    parser.dump_AST(tree, "")
                script.evaluate(tree, "")
                script_text = ""
        except Exception as e:
            # REPL boundary: report the problem and start over with an
            # empty buffer instead of terminating the session.
            print('119: %s' % e)
            script_text = ''

# Script entry point: start the REPL with the command-line arguments
# (the program name in sys.argv[0] is dropped).
play(sys.argv[1:])

simple_parser.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from play_with_compiler.craft.simple_lexer import SimpleLexer
from play_with_compiler.craft.base_type import Token, TokenReader, ASTNodeType, TokenType
from play_with_compiler.craft.simple_calculator import SimpleASTNode

'''
 * 一个简单的语法解析器。
 * 能够解析简单的表达式、变量声明和初始化语句、赋值语句。
 * 它支持的语法规则为：
 *
 * programm -> int_declare | expressionStatement | assignmentStatement
 * int_declare -> 'int' Id ( '=' additive )? ';'
 * expressionStatement -> additive ';'
 * assignmentStatement -> Id '=' additive ';'
 * additive -> multiplicative ( (+ | -) multiplicative)*
 * multiplicative -> primary ( (* | /) primary)*
 * primary -> IntLiteral | Id | (additive)
'''
class SimpleParser(object):
    """Recursive-descent parser for the simple script language.

    It handles expression statements, integer variable declarations (with an
    optional initializer) and assignment statements. Every rule method takes
    a token reader offering ``peek()``, ``read()``, ``unread()``,
    ``get_position()`` and ``set_position()``, and returns either an AST
    node or ``None`` when the rule does not match at the current position.
    """

    def parse(self, script):
        """Tokenize ``script`` and return the root node of its AST."""
        lexer = SimpleLexer()
        tokens = lexer.tokenize(script)
        return self.prog(tokens)

    def prog(self, tokens):
        """Parse a whole program: the AST root and the parser entry rule.

        Each statement is tried as a declaration, then as an expression
        statement, then as an assignment.

        :raises Exception: when none of the statement rules matches.
        """
        node = SimpleASTNode(ASTNodeType.Programm, 'pwc')
        while tokens.peek() is not None:
            child = self.int_declare(tokens)
            if child is None:
                child = self.expression_statement(tokens)
            if child is None:
                child = self.assignment_statement(tokens)
            if child is None:
                raise Exception('unknown statement')
            node.add_child(child)
        return node

    def expression_statement(self, tokens):
        """Parse an expression statement: an expression followed by ``;``.

        When the expression is not followed by a semicolon the reader is
        rewound to where it started, so another rule can be tried.
        """
        pos = tokens.get_position()
        node = self.additive(tokens)
        if node is None:
            return None
        token = tokens.peek()
        if token is not None and token.get_type() == TokenType.SemiColon:
            tokens.read()
            return node
        tokens.set_position(pos)  # backtrack
        return None

    def assignment_statement(self, tokens):
        """Parse an assignment statement such as ``age = 10*2;``.

        :raises Exception: when the right-hand side is not an expression or
            the closing semicolon is missing.
        """
        token = tokens.peek()  # look ahead: is it an identifier?
        if token is None or token.get_type() != TokenType.Identifier:
            return None
        token = tokens.read()  # consume the identifier
        node = SimpleASTNode(ASTNodeType.AssignmentStmt, token.get_text())

        token = tokens.peek()  # look ahead: is it the equals sign?
        if token is None or token.get_type() != TokenType.Assignment:
            tokens.unread()  # backtrack: give the identifier back
            return None
        tokens.read()  # consume the equals sign

        child = self.additive(tokens)
        if child is None:
            # The right-hand side is not a valid expression.
            raise Exception('invalide assignment statement, expecting an expression')
        node.add_child(child)

        token = tokens.peek()  # look ahead: is it the semicolon?
        if token is None or token.get_type() != TokenType.SemiColon:
            raise Exception('invalid statement, expecting semicolon')
        tokens.read()  # consume the semicolon
        return node

    def int_declare(self, tokens):
        """Parse an integer declaration such as ``int a;`` or ``int b = 2*3;``.

        :raises Exception: when the variable name, the initializer
            expression or the closing semicolon is missing.
        """
        token = tokens.peek()
        if token is None or token.get_type() != TokenType.Int:
            return None
        tokens.read()  # consume 'int'

        # peek() yields None at the end of input, so check before
        # dereferencing the token.
        token = tokens.peek()
        if token is None or token.get_type() != TokenType.Identifier:
            raise Exception('variable name expected')
        token = tokens.read()
        node = SimpleASTNode(ASTNodeType.IntDeclaration, token.get_text())

        token = tokens.peek()
        if token is not None and token.get_type() == TokenType.Assignment:
            tokens.read()  # consume the equals sign
            child = self.additive(tokens)
            if child is None:
                raise Exception('invlide variable initialization, expecting an expression')
            node.add_child(child)

        token = tokens.peek()
        if token is None or token.get_type() != TokenType.SemiColon:
            raise Exception('invalid statemennt, expecting semicolon')
        tokens.read()
        return node

    def additive(self, tokens):
        """Parse ``multiplicative ((+ | -) multiplicative)*``, left-associative.

        :raises Exception: when an operator is not followed by an operand.
        """
        node = self.multiplicative(tokens)
        if node is None:
            return None
        while True:
            token = tokens.peek()
            if token is None or token.get_type() not in (TokenType.Plus, TokenType.Minus):
                break
            operator = tokens.read()  # consume the operator
            right = self.multiplicative(tokens)
            if right is None:
                raise Exception('invlide additive expression, expecting the right part.')
            # The tree built so far becomes the left child of the new node.
            parent = SimpleASTNode(ASTNodeType.Additive, operator.get_text())
            parent.add_child(node)
            parent.add_child(right)
            node = parent
        return node

    def multiplicative(self, tokens):
        """Parse ``primary ((* | /) primary)*``, left-associative.

        :raises Exception: when an operator is not followed by an operand.
        """
        node = self.primary(tokens)
        if node is None:
            # Without a left operand the rule does not match; same guard as
            # in additive(), so a leading operator never yields a None child.
            return None
        while True:
            token = tokens.peek()
            if token is None or token.get_type() not in (TokenType.Star, TokenType.Slash):
                break
            operator = tokens.read()  # consume the operator
            right = self.primary(tokens)
            if right is None:
                raise Exception('invalid multiplicative expression, expecting the right part.')
            parent = SimpleASTNode(ASTNodeType.Multiplicative, operator.get_text())
            parent.add_child(node)
            parent.add_child(right)
            node = parent
        return node

    def primary(self, tokens):
        """Parse a primary expression: literal, identifier or ``( additive )``.

        No dedicated "primary" node is created; the single child is returned
        directly, which keeps the AST smaller.

        :raises Exception: when a parenthesis is empty or left unclosed.
        """
        token = tokens.peek()
        if token is None:
            return None

        node = None
        if token.get_type() == TokenType.IntLiteral:
            token = tokens.read()
            node = SimpleASTNode(ASTNodeType.IntLiteral, token.get_text())
        elif token.get_type() == TokenType.Identifier:
            token = tokens.read()
            node = SimpleASTNode(ASTNodeType.Identifier, token.get_text())
        elif token.get_type() == TokenType.LeftParen:
            tokens.read()  # consume '('
            node = self.additive(tokens)
            if node is None:
                raise Exception('expecting an additive expression inside parenthesis')
            token = tokens.peek()
            if token is None or token.get_type() != TokenType.RightParen:
                raise Exception('expecting right parenthesis')
            tokens.read()  # consume ')'
        return node

    def dump_AST(self, node, indent):
        """Print the tree structure of an AST.

        :param node: root of the (sub)tree to print; ``None`` prints nothing.
        :param indent: indentation prefix; one tab is added per tree level.
        """
        if node is None:
            return
        print("%s%s %s" % (indent, node.node_type, node.text))
        for child in node.get_children():
            self.dump_AST(child, indent + "\t")

结果：

# python simple_script.py
Simple script language!
>2;
2
>2+3*5;
17
>age;
119: variavle age has not been set any value
>int a = 5;
a: 5
>int b = a + 2;
b: 7
>b;
7
>exit();
good bye!