lexeme与token的对应关系……

本文讨论了ANTLR中lexeme与token的关系,并提出了如何让一个符号(如等号)能够映射到多个不同token(例如“EQ”和“AssignementOperator”)的问题。通过对比其他工具的做法,如lcc lexer中的实现方式,展示了复杂符号映射的可能性。
在ANTLR里一个lexeme真的只能对应一种token?我是没把说明看仔细还是怎样,但是这等号字符(“=”)该如何对应到多个token上呢?我只是想让它既对应“EQ”又对应“AssignementOperator”而已……yacc里可以的嘛

[url=http://www.cs.princeton.edu/software/lcc]lcc[/url]的lexer里,lexer.c的那个static unsigned char map[256]的定义有点意思。看到像是第77行的:
/* 101 A   */    LETTER|HEX,

我便湿了啊……

顺带,lexer.c的开头还有这个定义:
/* lcc's character-class bit flags: octal constants, each a distinct power of
 * two, so a byte in map[256] can carry several classes at once (e.g. LETTER|HEX). */
enum { BLANK=01,  NEWLINE=02, LETTER=04,
DIGIT=010, HEX=020, OTHER=040 };
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>

/* Token categories produced by the lexer. */
typedef enum {
    TOKEN_ID,        /* identifier */
    TOKEN_NUMBER,    /* integer literal */
    TOKEN_KEYWORD,   /* reserved word */
    TOKEN_OPERATOR,  /* operator */
    TOKEN_DELIMITER, /* punctuation */
    TOKEN_EOF,       /* end of input */
    TOKEN_ERROR      /* unrecognized character */
} TokenType;

/* Reserved words recognized by is_keyword(). */
const char *keywords[] = { "if", "else", "while", "return", "int", "float", "char" };

enum { LEXEME_MAX = 32 };  /* capacity of Token.lexeme, including the NUL */

/* One lexical token: its category, spelling, and source line. */
typedef struct {
    TokenType type;
    char lexeme[LEXEME_MAX];
    int line;
} Token;

/* Lexer state: a NUL-terminated source buffer and a cursor into it. */
char *source;       /* source buffer */
int position = 0;   /* current read position */
int line = 1;       /* current line number */

/* Return the character at the cursor and advance past it. */
char next_char() { return source[position++]; }

/* Undo the most recent next_char(). */
void backtrack() { position--; }

/* Return 1 iff str exactly matches one of the reserved words. */
int is_keyword(const char *str) {
    size_t num_keywords = sizeof(keywords) / sizeof(keywords[0]);  /* size_t: avoids signed/unsigned compare */
    for (size_t i = 0; i < num_keywords; i++) {
        if (strcmp(str, keywords[i]) == 0) {
            return 1;
        }
    }
    return 0;
}

/* Scan and return the next token from `source`.
 * Longest-match for identifiers/numbers; single characters otherwise.
 * Lexemes longer than LEXEME_MAX-1 are truncated (excess chars consumed). */
Token get_next_token() {
    Token token;
    char current = next_char();

    /* Skip whitespace, tracking line numbers. */
    while (isspace((unsigned char)current)) {   /* cast: ctype.h requires unsigned char range */
        if (current == '\n') line++;
        current = next_char();
    }
    /* Record the line AFTER skipping whitespace, so a token that follows a
     * newline is reported on its own line (was recorded too early before). */
    token.line = line;

    /* Identifier or keyword: [A-Za-z_][A-Za-z0-9_]* */
    if (isalpha((unsigned char)current) || current == '_') {
        int idx = 0;
        token.lexeme[idx++] = current;
        while (isalnum((unsigned char)(current = next_char())) || current == '_') {
            if (idx < LEXEME_MAX - 1)  /* bound check: was an unchecked overflow */
                token.lexeme[idx++] = current;
        }
        backtrack();                   /* pushed one char too far */
        token.lexeme[idx] = '\0';
        token.type = is_keyword(token.lexeme) ? TOKEN_KEYWORD : TOKEN_ID;
        return token;
    }

    /* Integer constant: [0-9]+ */
    if (isdigit((unsigned char)current)) {
        int idx = 0;
        token.lexeme[idx++] = current;
        while (isdigit((unsigned char)(current = next_char()))) {
            if (idx < LEXEME_MAX - 1)  /* bound check: was an unchecked overflow */
                token.lexeme[idx++] = current;
        }
        backtrack();
        token.lexeme[idx] = '\0';
        token.type = TOKEN_NUMBER;
        return token;
    }

    /* Single-character operators and delimiters. */
    token.lexeme[0] = current;
    token.lexeme[1] = '\0';
    switch (current) {
    case '+': case '-': case '*': case '/':
    case '=': case '<': case '>': case '!':
        token.type = TOKEN_OPERATOR;
        break;
    case ';': case ',': case '(': case ')':
    case '{': case '}': case '[': case ']':
        token.type = TOKEN_DELIMITER;
        break;
    case EOF:   /* NOTE(review): only matches if char is signed and byte is 0xFF; '\0' is the real sentinel */
    case '\0':
        token.type = TOKEN_EOF;
        strcpy(token.lexeme, "EOF");
        break;
    default:
        token.type = TOKEN_ERROR;
    }
    return token;
}

/* Tokenize a sample program and print one line per token. */
int main() {
    /* Sample source code (could be replaced by file input). */
    char code[] = "int main() {\n  int x = 42;\n  if (x > 0) {\n    return x;\n  }\n}";
    source = code;

    Token token;
    do {
        token = get_next_token();
        printf("Line %d: [%s] - Type: %d\n", token.line, token.lexeme, token.type);
    } while (token.type != TOKEN_EOF);

    return 0;
}

/* 分析一下 */
最新发布
10-08
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <locale.h> /* locale support so Chinese messages print correctly */

/* Token kinds for a small Pascal-like language. */
typedef enum {
    TOKEN_EOF, TOKEN_ID, TOKEN_INT, TOKEN_ASSIGN,
    TOKEN_IF, TOKEN_THEN, TOKEN_ELSE, TOKEN_WHILE, TOKEN_DO,
    TOKEN_BEGIN, TOKEN_END, TOKEN_PROGRAM, TOKEN_PERIOD,
    TOKEN_EQ, TOKEN_NE, TOKEN_LT, TOKEN_GT, TOKEN_LE, TOKEN_GE,
    TOKEN_PLUS, TOKEN_MINUS, TOKEN_MUL, TOKEN_DIV,
    TOKEN_LPAREN, TOKEN_RPAREN, TOKEN_SEMICOLON
} TokenType;

/* One token: kind, spelling, and source position. */
typedef struct {
    TokenType type;
    char lexeme[32];
    int line;
    int column;
} Token;

/* Keyword spellings and their token kinds, index-aligned. */
char *keywords[] = {"program", "begin", "end", "if", "then", "else", "while", "do"};
TokenType keyword_types[] = {TOKEN_PROGRAM, TOKEN_BEGIN, TOKEN_END, TOKEN_IF,
                             TOKEN_THEN, TOKEN_ELSE, TOKEN_WHILE, TOKEN_DO};

/* Global lexer/parser state. */
Token current_token;
FILE *input;
int ch;                 /* lookahead character; int so EOF fits */
int current_line = 1;
int current_column = 0;

/* Read the next character, maintaining line/column counters. */
void next_char() {
    ch = fgetc(input);
    current_column++;
    if (ch == '\n') {
        current_line++;
        current_column = 0;
    }
}

/* Advance past whitespace. */
void skip_whitespace() {
    while (isspace(ch)) next_char();
}

/* Scan [A-Za-z0-9_]+ into token; classify as keyword or identifier.
 * Lexemes longer than 31 chars are truncated (excess consumed). */
void read_identifier_or_keyword(Token *token) {
    int i = 0;
    token->line = current_line;
    token->column = current_column;
    while (isalnum(ch) || ch == '_') {
        if (i < (int)sizeof(token->lexeme) - 1)  /* bound check: was an unchecked overflow */
            token->lexeme[i++] = ch;
        next_char();
    }
    token->lexeme[i] = '\0';
    for (size_t j = 0; j < sizeof(keywords) / sizeof(keywords[0]); j++) {
        if (strcmp(token->lexeme, keywords[j]) == 0) {
            token->type = keyword_types[j];
            return;
        }
    }
    token->type = TOKEN_ID;
}

/* Scan [0-9]+ into token as an integer literal (truncated past 31 chars). */
void read_number(Token *token) {
    int i = 0;
    token->line = current_line;
    token->column = current_column;
    while (isdigit(ch)) {
        if (i < (int)sizeof(token->lexeme) - 1)  /* bound check: was an unchecked overflow */
            token->lexeme[i++] = ch;
        next_char();
    }
    token->lexeme[i] = '\0';
    token->type = TOKEN_INT;
}

/* Produce the next token into *token. */
void get_next_token(Token *token) {
    skip_whitespace();
    token->line = current_line;
    token->column = current_column;
    switch (ch) {
    case EOF:
        token->type = TOKEN_EOF;
        strcpy(token->lexeme, "EOF");
        break;
    case ':':
        next_char();
        if (ch == '=') {
            token->type = TOKEN_ASSIGN;
            strcpy(token->lexeme, ":=");
            next_char();
        } else {
            /* NOTE(review): a lone ':' is also classified TOKEN_ASSIGN — looks
             * intentional for error tolerance, but confirm. */
            token->type = TOKEN_ASSIGN;
            strcpy(token->lexeme, ":");
        }
        break;
    case '=':
        next_char();
        if (ch == '=') {
            token->type = TOKEN_EQ;
            strcpy(token->lexeme, "==");
            next_char();
        } else {
            token->type = TOKEN_ASSIGN;
            strcpy(token->lexeme, "=");
        }
        break;
    case '!':
        next_char();
        if (ch == '=') {
            token->type = TOKEN_NE;
            strcpy(token->lexeme, "!=");
            next_char();
        } else {
            token->type = TOKEN_NE;
            strcpy(token->lexeme, "!");
        }
        break;
    case '<':
        next_char();
        if (ch == '=') {
            token->type = TOKEN_LE;
            strcpy(token->lexeme, "<=");
            next_char();
        } else {
            token->type = TOKEN_LT;
            strcpy(token->lexeme, "<");
        }
        break;
    case '>':
        next_char();
        if (ch == '=') {
            token->type = TOKEN_GE;
            strcpy(token->lexeme, ">=");
            next_char();
        } else {
            token->type = TOKEN_GT;
            strcpy(token->lexeme, ">");
        }
        break;
    case '+': token->type = TOKEN_PLUS;      strcpy(token->lexeme, "+"); next_char(); break;
    case '-': token->type = TOKEN_MINUS;     strcpy(token->lexeme, "-"); next_char(); break;
    case '*': token->type = TOKEN_MUL;       strcpy(token->lexeme, "*"); next_char(); break;
    case '/': token->type = TOKEN_DIV;       strcpy(token->lexeme, "/"); next_char(); break;
    case '(': token->type = TOKEN_LPAREN;    strcpy(token->lexeme, "("); next_char(); break;
    case ')': token->type = TOKEN_RPAREN;    strcpy(token->lexeme, ")"); next_char(); break;
    case ';': token->type = TOKEN_SEMICOLON; strcpy(token->lexeme, ";"); next_char(); break;
    case '.': token->type = TOKEN_PERIOD;    strcpy(token->lexeme, "."); next_char(); break;
    default:
        if (isalpha(ch) || ch == '_') {
            read_identifier_or_keyword(token);
        } else if (isdigit(ch)) {
            read_number(token);
        } else {
            token->line = current_line;
            token->column = current_column;
            sprintf(token->lexeme, "非法字符 '%c'", ch);
            /* NOTE(review): an illegal character is reported via TOKEN_EOF
             * because the enum has no error kind; match() then rejects it. */
            token->type = TOKEN_EOF;
            next_char();
        }
    }
}

/* Recursive-descent parser, one function per nonterminal. */
void parse_program();
void parse_statement();
void parse_assignment();
void parse_if_stmt();
void parse_while_stmt();
void parse_compound_stmt();
void parse_expression();
void parse_logical_expr();
void parse_term();
void parse_factor();

/* Consume the current token if it matches; otherwise report (in Chinese)
 * what was expected and exit. */
void match(TokenType expected_type) {
    if (current_token.type == expected_type) {
        get_next_token(&current_token);
    } else {
        /* Error message in Chinese. */
        printf("语法错误 [%d:%d] 预期: ", current_token.line, current_token.column);
        switch (expected_type) {
        case TOKEN_PROGRAM:   printf("'program'"); break;
        case TOKEN_BEGIN:     printf("'begin'");   break;
        case TOKEN_END:       printf("'end'");     break;
        case TOKEN_IF:        printf("'if'");      break;
        case TOKEN_THEN:      printf("'then'");    break;
        case TOKEN_ELSE:      printf("'else'");    break;
        case TOKEN_WHILE:     printf("'while'");   break;
        case TOKEN_DO:        printf("'do'");      break;
        case TOKEN_ID:        printf("标识符");     break;
        case TOKEN_INT:       printf("整数");       break;
        case TOKEN_ASSIGN:    printf("':='");      break;
        case TOKEN_EQ:        printf("'=='");      break;
        case TOKEN_NE:        printf("'!='");      break;
        case TOKEN_LT:        printf("'<'");       break;
        case TOKEN_GT:        printf("'>'");       break;
        case TOKEN_LE:        printf("'<='");      break;
        case TOKEN_GE:        printf("'>='");      break;
        case TOKEN_PLUS:      printf("'+'");       break;
        case TOKEN_MINUS:     printf("'-'");       break;
        case TOKEN_MUL:       printf("'*'");       break;
        case TOKEN_DIV:       printf("'/'");       break;
        case TOKEN_LPAREN:    printf("'('");       break;
        case TOKEN_RPAREN:    printf("')'");       break;
        case TOKEN_SEMICOLON: printf("';'");       break;
        case TOKEN_PERIOD:    printf("'.'");       break;
        default:              printf("token %d", expected_type);
        }
        printf(", 实际: '%s'\n", current_token.lexeme);
        exit(1);
    }
}

/* program -> 'program' ID ';' compound_stmt '.' EOF */
void parse_program() {
    match(TOKEN_PROGRAM);
    match(TOKEN_ID);
    match(TOKEN_SEMICOLON);
    parse_compound_stmt();
    match(TOKEN_PERIOD);
    if (current_token.type != TOKEN_EOF) {
        printf("语法错误 [%d:%d] 预期结束符, 实际: '%s'\n",
               current_token.line, current_token.column, current_token.lexeme);
        exit(1);
    }
    printf("该程序是正确的。\n");
}

/* compound_stmt -> 'begin' statement (';' statement)* 'end' */
void parse_compound_stmt() {
    match(TOKEN_BEGIN);
    parse_statement();
    while (current_token.type == TOKEN_SEMICOLON) {
        match(TOKEN_SEMICOLON);
        parse_statement();
    }
    match(TOKEN_END);
}

/* statement -> assignment | if_stmt | while_stmt | compound_stmt */
void parse_statement() {
    switch (current_token.type) {
    case TOKEN_ID:    parse_assignment();    break;
    case TOKEN_IF:    parse_if_stmt();       break;
    case TOKEN_WHILE: parse_while_stmt();    break;
    case TOKEN_BEGIN: parse_compound_stmt(); break;
    default:
        printf("语法错误 [%d:%d] 非法的语句开始: '%s'\n",
               current_token.line, current_token.column, current_token.lexeme);
        exit(1);
    }
}

/* assignment -> ID ':=' expression */
void parse_assignment() {
    match(TOKEN_ID);
    match(TOKEN_ASSIGN);
    parse_expression();
}

/* if_stmt -> 'if' logical_expr 'then' statement ('else' statement)? */
void parse_if_stmt() {
    match(TOKEN_IF);
    parse_logical_expr();
    match(TOKEN_THEN);
    parse_statement();
    if (current_token.type == TOKEN_ELSE) {
        match(TOKEN_ELSE);
        parse_statement();
    }
}

/* while_stmt -> 'while' logical_expr 'do' statement */
void parse_while_stmt() {
    match(TOKEN_WHILE);
    parse_logical_expr();
    match(TOKEN_DO);
    parse_statement();
}

/* logical_expr -> expression relop expression
 * A relational operator is mandatory in this grammar. */
void parse_logical_expr() {
    parse_expression();
    switch (current_token.type) {
    case TOKEN_EQ: case TOKEN_NE:
    case TOKEN_LT: case TOKEN_GT:
    case TOKEN_LE: case TOKEN_GE:
        match(current_token.type);
        parse_expression();
        break;
    default:
        printf("语法错误 [%d:%d] 预期关系运算符, 实际: '%s'\n",
               current_token.line, current_token.column, current_token.lexeme);
        exit(1);
    }
}

/* expression -> term (('+'|'-') term)* */
void parse_expression() {
    parse_term();
    while (current_token.type == TOKEN_PLUS || current_token.type == TOKEN_MINUS) {
        match(current_token.type);
        parse_term();
    }
}

/* term -> factor (('*'|'/') factor)* */
void parse_term() {
    parse_factor();
    while (current_token.type == TOKEN_MUL || current_token.type == TOKEN_DIV) {
        match(current_token.type);
        parse_factor();
    }
}

/* factor -> INT | ID | '(' expression ')' */
void parse_factor() {
    switch (current_token.type) {
    case TOKEN_INT:
        match(TOKEN_INT);
        break;
    case TOKEN_ID:
        match(TOKEN_ID);
        break;
    case TOKEN_LPAREN:
        match(TOKEN_LPAREN);
        parse_expression();
        match(TOKEN_RPAREN);
        break;
    default:
        printf("语法错误 [%d:%d] 非法的表达式因子: '%s'\n",
               current_token.line, current_token.column, current_token.lexeme);
        exit(1);
    }
}

/* Entry point: parse the file named on the command line; on success prints
 * a confirmation message, otherwise exits with a diagnostic. */
int main(int argc, char *argv[]) {
    /* Set the locale so Chinese output renders correctly. */
    setlocale(LC_ALL, "zh_CN.UTF-8");

    if (argc < 2) {
        printf("用法: %s <文件名>\n", argv[0]);
        return 1;
    }
    input = fopen(argv[1], "r");
    if (!input) {
        perror("打开文件失败");
        return 1;
    }

    /* Prime the lexer: one lookahead char, then one lookahead token. */
    next_char();
    get_next_token(&current_token);

    /* Run the parser. */
    parse_program();

    fclose(input);
    return 0;
}
06-27
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值