要实现一门语言,第一要务就是要能够处理文本文件,搞明白其中究竟写了些什么。传统上,我们会先利用“词法分析器”(也称为“扫描器”)将输入切成“语元(token)”,然后再做处理。词法分析器返回的每个语元都带有一个语元编号,此外可能还会附带一些元数据(比如某个数值)。
看了词法分析器的教程,第一章大概就是讲了如何用语元(token)来表示每个词,使用函数不断读取每个词,并且判断和分类。
词法分析器先对每个单词进行分类使用enum Token定义了5个种类:代码结束的eof;def;extern;
identifier和number。
主要的作用就是循环读取每个词 ,然后进行分类,当然数字部分比较复杂,因为可能是0.1.1这种,需要建立报错机制。但是例子中没有给出。分类是使用c++中isdigit,isalpha之类的函数。对了,特殊符号是直接返回ascii码,比如“+”返回ascii码,定义ID,以后根据ID来确定是什么运算。
在看读取的例程时有一个问题:在读取字母时,为什么要读取数字?
在说语法分析器之前要说一下AST——抽象语法树(abstract syntax code,AST)是源代码的抽象语法结构的树状表示,树上的每个节点都表示源代码中的一种结构,这所以说是抽象的,是因为抽象语法树并不会表示出真实语法出现的每一个细节,比如说,嵌套括号被隐含在树的结构中,并没有以节点的形式呈现。抽象语法树并不依赖于源语言的语法,也就是说语法分析阶段所采用的上下文无文文法,因为在写文法时,经常会对文法进行等价的转换(消除左递归,回溯,二义性等),这样会给文法分析引入一些多余的成分,对后续阶段造成不利影响,甚至会使合个阶段变得混乱。因些,很多编译器经常要独立地构造语法分析树,为前端,后端建立一个清晰的接口。
我的想法是这样的,词法分析器对单词进行分析,然后就要将单词串起来,而这个串就是AST,这个工具就是语法分析器。
举个简单例子:1+3*(4-1)+2
这个比词法分析器难多了,首先最基础的构建这个AST,主要是ExprAST,其下有VariableExprAST (变量)、BinaryExprAST (运算)、CallExprAST(函数),和call辅助类型PrototypeAST,FunctionAST 。不同的功能词组有不同的定义。
AST框架建好了就是充填,首先是读取一个词,就是一个Token,,然后需要读取下一个Token,进行循环。
接下来定义生成规则,比如()啊。
最主要的就是解析二元表达式了,根据算数优先级进行解析。
一个函数使用嵌套进行解析了二元表达式,很强!
static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
// If this is a binop, find its precedence.
while (1) {
//GetTokPrecedence是判断读取的词是不是运算符,返回的是事先设定好的优先级,如果不是运算符,返回-1.
int TokPrec = GetTokPrecedence();
// If this is a binop that binds at least as tightly as the current binop,
// consume it, otherwise we are done.
//若果不是运算符,比如单独一个a,这样就返回本身。
if (TokPrec < ExprPrec)
return LHS;
// Okay, we know this is a binop.
//知道运算种类.
int BinOp = CurTok;
//读取运算符右边的,会进行循环,因为运算符右边可能是一些组合,a+(a+b)
getNextToken(); // eat binop
//如果没有,就返回0,会报错的
// Parse the primary expression after the binary operator.
ExprAST *RHS = ParsePrimary();
if (!RHS) return 0;
//开始进行嵌套的地方
// If BinOp binds less tightly with RHS than the operator after RHS, let
// the pending operator take RHS as its LHS.
int NextPrec = GetTokPrecedence();
if (TokPrec < NextPrec) {
RHS = ParseBinOpRHS(TokPrec+1, RHS);
if (RHS == 0) return 0;
}
// Merge LHS/RHS.
LHS = new BinaryExprAST(BinOp, LHS, RHS);
}
}
之后还有函数解析等。
完整源码如下:
#include <cstdio>
#include <cstdlib>
#include <string>
#include <map>
#include <vector>
//===----------------------------------------------------------------------===//
// Lexer
//===----------------------------------------------------------------------===//
// The lexer returns tokens [0-255] if it is an unknown character, otherwise one
// of these for known things.
enum Token {
tok_eof = -1,
// commands
tok_def = -2, tok_extern = -3,
// primary
tok_identifier = -4, tok_number = -5
};
static std::string IdentifierStr; // Filled in if tok_identifier
static double NumVal; // Filled in if tok_number
/// gettok - Return the next token from standard input.
static int gettok() {
static int LastChar = ' ';
// Skip any whitespace.
while (isspace(LastChar))
LastChar = getchar();
if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]*
IdentifierStr = LastChar;
while (isalnum((LastChar = getchar())))
IdentifierStr += LastChar;
if (IdentifierStr == "def") return tok_def;
if (IdentifierStr == "extern") return tok_extern;
return tok_identifier;
}
if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+
std::string NumStr;
do {
NumStr += LastChar;
LastChar = getchar();
} while (isdigit(LastChar) || LastChar == '.');
NumVal = strtod(NumStr.c_str(), 0);
return tok_number;
}
if (LastChar == '#') {
// Comment until end of line.
do LastChar = getchar();
while (LastChar != EOF && LastChar != '\n' && LastChar != '\r');
if (LastChar != EOF)
return gettok();
}
// Check for end of file. Don't eat the EOF.
if (LastChar == EOF)
return tok_eof;
// Otherwise, just return the character as its ascii value.
int ThisChar = LastChar;
LastChar = getchar();
return ThisChar;
}
//===----------------------------------------------------------------------===//
// Abstract Syntax Tree (aka Parse Tree)
//===----------------------------------------------------------------------===//
/// ExprAST - Base class for all expression nodes.
class ExprAST {
public:
virtual ~ExprAST() {}
};
/// NumberExprAST - Expression class for numeric literals like "1.0".
class NumberExprAST : public ExprAST {
double Val;
public:
NumberExprAST(double val) : Val(val) {}
};
/// VariableExprAST - Expression class for referencing a variable, like "a".
class VariableExprAST : public ExprAST {
std::string Name;
public:
VariableExprAST(const std::string &name) : Name(name) {}
};
/// BinaryExprAST - Expression class for a binary operator.
class BinaryExprAST : public ExprAST {
char Op;
ExprAST *LHS, *RHS;
public:
BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs)
: Op(op), LHS(lhs), RHS(rhs) {}
};
/// CallExprAST - Expression class for function calls.
class CallExprAST : public ExprAST {
std::string Callee;
std::vector<ExprAST*> Args;
public:
CallExprAST(const std::string &callee, std::vector<ExprAST*> &args)
: Callee(callee), Args(args) {}
};
/// PrototypeAST - This class represents the "prototype" for a function,
/// which captures its name, and its argument names (thus implicitly the number
/// of arguments the function takes).
class PrototypeAST {
std::string Name;
std::vector<std::string> Args;
public:
PrototypeAST(const std::string &name, const std::vector<std::string> &args)
: Name(name), Args(args) {}
};
/// FunctionAST - This class represents a function definition itself.
class FunctionAST {
PrototypeAST *Proto;
ExprAST *Body;
public:
FunctionAST(PrototypeAST *proto, ExprAST *body)
: Proto(proto), Body(body) {}
};
//===----------------------------------------------------------------------===//
// Parser
//===----------------------------------------------------------------------===//
/// CurTok/getNextToken - Provide a simple token buffer. CurTok is the current
/// token the parser is looking at. getNextToken reads another token from the
/// lexer and updates CurTok with its results.
static int CurTok;
static int getNextToken() {
return CurTok = gettok();
}
/// BinopPrecedence - This holds the precedence for each binary operator that is
/// defined.
static std::map<char, int> BinopPrecedence;
/// GetTokPrecedence - Get the precedence of the pending binary operator token.
static int GetTokPrecedence() {
if (!isascii(CurTok))
return -1;
// Make sure it's a declared binop.
int TokPrec = BinopPrecedence[CurTok];
if (TokPrec <= 0) return -1;
return TokPrec;
}
/// Error* - These are little helper functions for error handling.
ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;}
PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; }
FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; }
static ExprAST *ParseExpression();
/// identifierexpr
/// ::= identifier
/// ::= identifier '(' expression* ')'
static ExprAST *ParseIdentifierExpr() {
std::string IdName = IdentifierStr;
getNextToken(); // eat identifier.
if (CurTok != '(') // Simple variable ref.
return new VariableExprAST(IdName);
// Call.
getNextToken(); // eat (
std::vector<ExprAST*> Args;
if (CurTok != ')') {
while (1) {
ExprAST *Arg = ParseExpression();
if (!Arg) return 0;
Args.push_back(Arg);
if (CurTok == ')') break;
if (CurTok != ',')
return Error("Expected ')' or ',' in argument list");
getNextToken();
}
}
// Eat the ')'.
getNextToken();
return new CallExprAST(IdName, Args);
}
/// numberexpr ::= number
static ExprAST *ParseNumberExpr() {
ExprAST *Result = new NumberExprAST(NumVal);
getNextToken(); // consume the number
return Result;
}
/// parenexpr ::= '(' expression ')'
static ExprAST *ParseParenExpr() {
getNextToken(); // eat (.
ExprAST *V = ParseExpression();
if (!V) return 0;
if (CurTok != ')')
return Error("expected ')'");
getNextToken(); // eat ).
return V;
}
/// primary
/// ::= identifierexpr
/// ::= numberexpr
/// ::= parenexpr
static ExprAST *ParsePrimary() {
switch (CurTok) {
default: return Error("unknown token when expecting an expression");
case tok_identifier: return ParseIdentifierExpr();
case tok_number: return ParseNumberExpr();
case '(': return ParseParenExpr();
}
}
/// binoprhs
/// ::= ('+' primary)*
static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) {
// If this is a binop, find its precedence.
while (1) {
int TokPrec = GetTokPrecedence();
// If this is a binop that binds at least as tightly as the current binop,
// consume it, otherwise we are done.
if (TokPrec < ExprPrec)
return LHS;
// Okay, we know this is a binop.
int BinOp = CurTok;
getNextToken(); // eat binop
// Parse the primary expression after the binary operator.
ExprAST *RHS = ParsePrimary();
if (!RHS) return 0;
// If BinOp binds less tightly with RHS than the operator after RHS, let
// the pending operator take RHS as its LHS.
int NextPrec = GetTokPrecedence();
if (TokPrec < NextPrec) {
RHS = ParseBinOpRHS(TokPrec+1, RHS);
if (RHS == 0) return 0;
}
// Merge LHS/RHS.
LHS = new BinaryExprAST(BinOp, LHS, RHS);
}
}
/// expression
/// ::= primary binoprhs
///
static ExprAST *ParseExpression() {
ExprAST *LHS = ParsePrimary();
if (!LHS) return 0;
return ParseBinOpRHS(0, LHS);
}
/// prototype
/// ::= id '(' id* ')'
static PrototypeAST *ParsePrototype() {
if (CurTok != tok_identifier)
return ErrorP("Expected function name in prototype");
std::string FnName = IdentifierStr;
getNextToken();
if (CurTok != '(')
return ErrorP("Expected '(' in prototype");
std::vector<std::string> ArgNames;
while (getNextToken() == tok_identifier)
ArgNames.push_back(IdentifierStr);
if (CurTok != ')')
return ErrorP("Expected ')' in prototype");
// success.
getNextToken(); // eat ')'.
return new PrototypeAST(FnName, ArgNames);
}
/// definition ::= 'def' prototype expression
static FunctionAST *ParseDefinition() {
getNextToken(); // eat def.
PrototypeAST *Proto = ParsePrototype();
if (Proto == 0) return 0;
if (ExprAST *E = ParseExpression())
return new FunctionAST(Proto, E);
return 0;
}
/// toplevelexpr ::= expression
static FunctionAST *ParseTopLevelExpr() {
if (ExprAST *E = ParseExpression()) {
// Make an anonymous proto.
PrototypeAST *Proto = new PrototypeAST("", std::vector<std::string>());
return new FunctionAST(Proto, E);
}
return 0;
}
/// external ::= 'extern' prototype
static PrototypeAST *ParseExtern() {
getNextToken(); // eat extern.
return ParsePrototype();
}
//===----------------------------------------------------------------------===//
// Top-Level parsing
//===----------------------------------------------------------------------===//
static void HandleDefinition() {
if (ParseDefinition()) {
fprintf(stderr, "Parsed a function definition.\n");
} else {
// Skip token for error recovery.
getNextToken();
}
}
static void HandleExtern() {
if (ParseExtern()) {
fprintf(stderr, "Parsed an extern\n");
} else {
// Skip token for error recovery.
getNextToken();
}
}
static void HandleTopLevelExpression() {
// Evaluate a top-level expression into an anonymous function.
if (ParseTopLevelExpr()) {
fprintf(stderr, "Parsed a top-level expr\n");
} else {
// Skip token for error recovery.
getNextToken();
}
}
/// top ::= definition | external | expression | ';'
static void MainLoop() {
while (1) {
fprintf(stderr, "ready> ");
switch (CurTok) {
case tok_eof: return;
case ';': getNextToken(); break; // ignore top-level semicolons.
case tok_def: HandleDefinition(); break;
case tok_extern: HandleExtern(); break;
default: HandleTopLevelExpression(); break;
}
}
}
//===----------------------------------------------------------------------===//
// Main driver code.
//===----------------------------------------------------------------------===//
int main() {
// Install standard binary operators.
// 1 is lowest precedence.
BinopPrecedence['<'] = 10;
BinopPrecedence['+'] = 20;
BinopPrecedence['-'] = 20;
BinopPrecedence['*'] = 40; // highest.
// Prime the first token.
fprintf(stderr, "ready> ");
getNextToken();
// Run the main "interpreter loop" now.
MainLoop();
return 0;
}