错误处理的方法之一是认可错误的输入,将它判定为某一种正确输入的变体,然后在报出错误后再纠正它。这个过程很大程度上跟经验相关,对于出错的经验越丰富,就可以矫正越多种类的错误。在词法分析中这一点体现得并不多——词法本身很简单,而敲错键导致的某些错误程序员应该会及时看到(比如在Jerry编程时在非注释的部分输入了一个"@"符号)并改正它。而在语法分析过程中,这种能被及时发现的情况则少得多,比如输入
a = 2 + 3
if (a == 0)
这样一小段,上一句赋值后没有分号,从而导致后面的分支语句这时会被认为仍是上一句的一部分,但是这显然是不合理的。按照刚才说的矫正方法,可以增加一条伪语法产生式:
语句 -> 赋值
与标准产生式
语句 -> 赋值 <分号>
平行使用,两者效果类似,不过在使用前者时会额外报出错误。关于语法的部分以后再慢慢聊,现在先谈谈如何实现在词法分析过程中引入简单的错误处理过程。
首先,要将枚举类型AcceptType中的DENY这个类型干掉。如刚才所说,现在我们要把错误的类型矫正为正确的类型,因此假定词法分析不再会产生拒绝接受的情况,而对于任何输入都会有一个正确的类型与之对应。这么做以后,一个状态或一个符号的类型不再是它是否错误的依据了,因此得引入另一个东西:
/* A lexical error is currently described by a single message string.
 * It gets its own type name so that it can be extended later. */
typedef const char* ErrMsg;

#define MAX_CHARS (1 << 8)

/* One state of the lexer automaton. */
struct State {
    AcceptType type;                    /* type given to a token that ends in this state */
    ErrMsg err;                         /* NULL for a normal state, otherwise the error to report */
    struct State* nextState[MAX_CHARS]; /* transitions indexed by character code; NULL means no transition */
};

/* A token produced by the lexer. */
struct Token {
    int line;        /* line number recorded for the token */
    AcceptType type; /* type copied from the accepting state */
    ErrMsg err;      /* NULL when the token is well-formed, otherwise the error message */
    char* image;     /* text of the token */
};
我猜想就目前的情况而言仅仅用一条信息(const char const*)来指代一个错误还是可行的,不过也许以后会扩充,所以把它专门弄成一个类型。而在状态和符号中都增加了一个记录错误信息的域。如果一个状态没有错误,则它为空,否则指向某个错误信息。作为例子,现在给出这样一些错误状况:
非注释的部分包含特殊符号,如"@^%"这样的东西或者非ASCII字符,这种情况下认为这一行剩余的符号都非法;
实型数有多于一个小数点,或者后面紧跟了字母,这种情况下认为这是不正确的数;
整数后面紧跟字母,这时认为这是不正确的标识符(很怪异?);
不完整的注释,有些程序员会不小心注释掉一段,而忘记在程序的末尾追加注释结束符了;
不完整的符号,如单个"|"或"&"。
实现并不复杂,思路是,向已有的自动机中添加新的“伪”状态。这些状态构造时与普通状态没有差别,不过它们各自的err域不为空。当一个符号被这样的状态接受时,符号的err域也会被相应地设置。比如:
/* Excerpt: the message is a file-scope definition, the statements below
 * belong to the body of the state initialisation routine. */
static ErrMsg invalidChar = "Invalid character.";

/* Pseudo state that accepts an invalid character and the rest of its line. */
fakeChar = jerryStates + (++stateNr);
fakeChar->err = invalidChar;
fakeChar->type = SKIP;
for(s = 0; s < MAX_CHARS; ++s) {
    jerryStates->nextState[s] = fakeChar; /* by default the initial state sends every character here */
    fakeChar->nextState[s] = fakeChar;    /* and this state keeps consuming characters */
}
fakeChar->nextState['\n'] = NULL;         /* no transition on newline, so the token ends there */
注意这一段要放在其他所有状态的初始化之前,这样一来以后对初始状态nextState域的操作才能覆盖掉这一段通吃任何字符的转移方式。下面给出目前版本的词法分析实现:
/* dfa.c */ #include<string.h> #include<stdlib.h> #include<stdio.h> #include"const.h" #include"datastruct.h" #include"dfa.h" // 这里重构了,这样字母枚举循环不必写成两个了 #define LETTER "abcdefghijklmnopqrstuvwxyz"\ "ABCDEFGHIJKLMNOPQRSTUVWXYZ_" #define DIGIT "0123456789" extern int nextChar(void); extern struct Token* firstToken(void); extern struct Token* nextToken(void); extern void eofToken(void); // 各种错误信息 static ErrMsg invalidChar = "Invalid character.", abnormalNum = "Abnormal formatted number.", abnormalIdent = "Abnormal identifier.", incompleteComment = "Incomplete comment.", incompleteSymbol = "Incomplete symbol segment."; static struct State* initStates(void); static int foundAsResvWord(char* image); static void flushToken(struct State*, struct Token*); void tokenize(void) { struct State* state[2]; int sw = 0; int character; char* image; struct State* initial = initStates(); struct Token* token = firstToken(); character = nextChar(); // printf("--CHAR CODE-- %d %c", character, character); if(EOF == character) { exit(0); } while(1) { state[sw] = initial->nextState[character]; state[1 ^ sw] = NULL; image = token->image; *(image++) = character; while(NULL != state[sw]) { character = nextChar(); // printf("--CHAR CODE-- %d %c ", character, character); if(EOF == character) { flushToken(state[sw], token); // 这里重构了 // token->err = state[sw]->err; // token->type = state[sw]->type; // if(IDENT == token->type) { // token->type += foundAsResvWord(token->image); // } *image = 0; free(initial); eofToken(); return; } *(image++) = (char)(character & 0xff); // *image = 0; printf("-- INFO -- %d %c %s\n", sw, character, token->image); state[1 ^ sw] = state[sw]->nextState[character]; sw ^= 1; // equivalent to "sw = 1 - sw;" } sw ^= 1; // printf("--RECONGIZED--\n"); *(image - 1) = 0; flushToken(state[sw], token); // 这里重构了 // token->err = state[sw]->err; // token->type = state[sw]->type; // if(IDENT == token->type) { // token->type += foundAsResvWord(token->image); // } token = nextToken(); } } 
static char* RESV_WORD_LIST[] = { "", "else", "if", "while", "read", "write", "break", "int", "real", NULL }; static int foundAsResvWord(char* image) { int i = 1; for(; NULL != RESV_WORD_LIST[i]; ++i) { if(0 == strcmp(image, RESV_WORD_LIST[i])) { return i; } } return 0; } // 增加了一个函数,来自于抽取的已有代码 static void flushToken(struct State* s, struct Token* t) { t->err = s->err; t->type = s->type; if(IDENT == t->type) { t->type += foundAsResvWord(t->image); } } #define NR_STATES (64) static struct State* initStates(void) { struct State* jerryStates; int stateNr = 0, s; char* character; struct { char* symbol; AcceptType type; ErrMsg err; // 同样多了这个域 } SYMS[] = { {"+", PLUS, NULL}, {"-", MINUS, NULL}, {"*", MULTIPLY, NULL}, {"/", DIVIDE, NULL}, {"=", ASSIGN, NULL}, {"!", NOT, NULL}, {"<", LT, NULL}, {">", GT, NULL}, {";", EOS, NULL}, {",", COMMA, NULL}, {"(", LPARENT, NULL}, {")", RPARENT, NULL}, {"[", LBRACKET, NULL}, {"]", RBRACKET, NULL}, {"{", LBRACE, NULL}, {"}", RBRACE, NULL}, {"&", AND, incompleteSymbol}, {"|", OR, incompleteSymbol}, {"==", EQ, NULL}, {"<=", LE, NULL}, {">=", GE, NULL}, {"!=", NE, NULL}, {"&&", AND, NULL}, {"||", OR, NULL}, {NULL, SKIP} }; struct State* iter; struct State* commentInLineStart,* commentMultiLineStart, * commentMultiLine2,* comment; struct State* space; struct State* integer,* realnum; struct State* identifier; struct State* fakeChar; // invalidChar struct State* fakeNumber; // abnormalNum struct State* fakeIdent; // abnormalIdent jerryStates = (struct State*)malloc(NR_STATES * sizeof(struct State)); memset(jerryStates, 0, NR_STATES * sizeof(struct State)); // 伪状态 fakeChar = jerryStates + (++stateNr); fakeChar->err = invalidChar; fakeNumber = jerryStates + (++stateNr); fakeNumber->err = abnormalNum; fakeIdent = jerryStates + (++stateNr); fakeIdent->err = abnormalIdent; fakeChar->type = SKIP; fakeNumber->type = REAL; fakeIdent->type = IDENT; for(s = 0; s < MAX_CHARS; ++s) { jerryStates->nextState[s] = fakeChar; fakeChar->nextState[s] = 
fakeChar; } fakeChar->nextState['\n'] = NULL; for(s = 0; NULL != SYMS[s].symbol; ++s) { iter = jerryStates; // printf("--INFO-- %d\n", s); for(character = SYMS[s].symbol; *character; ++character) { // printf("---CHAR-- %c %d\n", *character, SYMS[s].type); if(fakeChar == iter->nextState[(int)*character] || 0 == iter->nextState[(int)*character]) { // 注意这里分支条件改变了!! iter->nextState[(int)*character] = jerryStates + (++stateNr); iter->nextState[(int)*character]->type = SYMS[s].type; iter->nextState[(int)*character]->err = SYMS[s].err; // printf("---APPEND-- %c %d %d\n", *character, stateNr, SYMS[s].type); } iter = iter->nextState[(int)*character]; } } commentInLineStart = jerryStates + (++stateNr); commentMultiLineStart = jerryStates + (++stateNr); commentMultiLine2 = jerryStates + (++stateNr); comment = jerryStates + (++stateNr); commentInLineStart->type = commentMultiLineStart->type = commentMultiLine2->type = comment->type = SKIP; // 不再是DENY了 commentInLineStart->err = commentMultiLineStart->err = commentMultiLine2->err = incompleteComment; // 注释不完整 jerryStates->nextState['/']->nextState['/'] = commentInLineStart; jerryStates->nextState['/']->nextState['*'] = commentMultiLineStart; for(s = 0; s < MAX_CHARS; ++s) { commentInLineStart->nextState[s] = commentInLineStart; commentMultiLineStart->nextState[s] = commentMultiLineStart; commentMultiLine2->nextState[s] = commentMultiLineStart; } commentInLineStart->nextState['\n'] = comment; commentMultiLineStart->nextState['*'] = commentMultiLine2; commentMultiLine2->nextState['*'] = commentMultiLine2; commentMultiLine2->nextState['/'] = comment; identifier = jerryStates + (++stateNr); identifier->type = IDENT; for(s = 0; s <= LETTER[s]; ++s) { jerryStates->nextState[(int)LETTER[s]] = identifier; identifier->nextState[(int)LETTER[s]] = identifier; } integer = jerryStates + (++stateNr); realnum = jerryStates + (++stateNr); integer->type = INTEGER; realnum->type = REAL; for(s = 0; DIGIT[s]; ++s) { 
jerryStates->nextState[(int)DIGIT[s]] = integer; integer->nextState[(int)DIGIT[s]] = integer; realnum->nextState[(int)DIGIT[s]] = realnum; identifier->nextState[(int)DIGIT[s]] = identifier; } jerryStates->nextState['.'] = realnum; integer->nextState['.'] = realnum; space = jerryStates + (++stateNr); space->type = SKIP; jerryStates->nextState[' '] = space; jerryStates->nextState['\t'] = space; jerryStates->nextState['\r'] = space; jerryStates->nextState['\n'] = space; space->nextState[' '] = space; space->nextState['\t'] = space; space->nextState['\r'] = space; space->nextState['\n'] = space; // 追加的伪状态转移 for(s = 0; LETTER[s]; ++s) { integer->nextState[(int)LETTER[s]] = fakeIdent; fakeIdent->nextState[(int)LETTER[s]] = fakeIdent; realnum->nextState[(int)LETTER[s]] = fakeNumber; fakeNumber->nextState[(int)LETTER[s]] = fakeNumber; } for(s = 0; DIGIT[s]; ++s) { fakeIdent->nextState[(int)DIGIT[s]] = fakeIdent; fakeNumber->nextState[(int)DIGIT[s]] = fakeNumber; } realnum->nextState['.'] = fakeNumber; fakeNumber->nextState['.'] = fakeNumber; // printf("--INFO-- DFA built. %d\n", stateNr); return jerryStates; } #undef DIGIT #undef LETTER #undef NR_STATES
对应的,要将jerry.c文件做一点修改,增加错误处理模块:
/* jerry.c */ #include<stdio.h> #include<stdlib.h> #include<string.h> #include"const.h" #include"datastruct.h" #include"dfa.h" FILE* in,* out; int currentChar, lineNumber = 1; char buffer[50]; struct Token token = { 0, SKIP, NULL, buffer }; // 输出错误信息 void error(int line, ErrMsg* err) { fprintf(stderr, "Line %d Error: %s\n", line, *err); } int nextChar(void) { currentChar = fgetc(in); lineNumber += ('\n' == currentChar); return currentChar; } struct Token* firstToken(void) { token.line = lineNumber; token.err = NULL; return &token; } struct Token* nextToken(void) { if(NULL != token.err) { error(token.line, token.err); // 这里输出错误 } if(SKIP != token.type) { fprintf(out, " Line: %d Type: %d Image: %s\n", token.line, token.type, token.image); } token.line = lineNumber; token.err = NULL; return &token; } void eofToken(void) { nextToken(); // 这里抽取了原有代码 fprintf(out, " Line: %d Type: %d\n", lineNumber, END); printf(" End...\n"); exit(0); } void handleFile(int argc, char* argv[]) { in = fopen(argv[0], "r"); if(NULL == in) { fprintf(stderr, "Usage: %s access error.\n", argv[0]); exit(0); } if(argc == 3) { if(strcmp("-o", argv[1])) { fprintf(stderr, "Unknown parameter: %s\n", argv[1]); out = stdout; } else { out = fopen(argv[2], "w"); if(NULL == out) { fprintf(stderr, "Usage: %s access error.\n", argv[2]); exit(0); } } } else { out = stdout; } } int main(int argc, char* argv[]) { if(argc < 2) { fprintf(stderr, "Usage: no input file.\n"); exit(0); } handleFile(argc - 1, argv + 1); tokenize(); return 0; }