flex & bison

最新推荐文章于 2022-12-06 20:57:43 发布

tailuzhecom

最新推荐文章于 2022-12-06 20:57:43 发布

阅读量401

点赞数 1

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/tailuzhecom/article/details/88770507

本文介绍了使用flex和bison构建解析器的过程，包括统计代码行数、字符数和数字字符数的功能，以及如何处理Pascal语言的正则表达式。深入探讨了flex的Start Condition和bison的union类型、运算符优先级以及冲突解决策略。通过Calculator Practice项目展示了实际应用。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

flex

yytext是匹配的字符串，yyleng是该字符串的长度

scan的规则：
If action code returns, scanning resumes on the next call to yylex(); if it doesn’t return, scanning resumes immediately.

统计代码行数，字符数，数字字符数

%{
/* Everything between %{ and %} is copied verbatim into the generated scanner. */
#include <stdio.h>
#include <math.h>

/* Counters updated by the rules below and printed by main().
   A newline only bumps num_lines; a digit bumps both digit_num and num_chars. */
int num_lines = 0, num_chars = 0, digit_num = 0;
%}

%%
\n ++num_lines;
[0-9] ++digit_num; ++num_chars;
.  ++num_chars;
%%

/* Scan the whole input, then report the totals gathered by the rules. */
int main(void) {
    yylex();
    printf("# of lines = %d, # of chars = %d, # of digits = %d\n", num_lines, num_chars, digit_num);
    return 0;
}

pascal语言的flex文件

pascal.lex

%{
#include <math.h>
#include <stdlib.h>	/* atoi, atof */
%}

/* Name definitions. A definition extends to the end of its line, so comments go on their own lines. */
DIGIT [0-9]
ID    [a-z][a-z0-9]*

/* Patterns listed first take priority; the matched text is stored in yytext. */
%%
{DIGIT}+ {
	printf("An integer: %s (%d)\n", yytext, atoi(yytext));
}

{DIGIT}+"."{DIGIT}* {	/* braces around a name expand its definition */
	printf("A float: %s (%g)\n", yytext, atof(yytext));
}

if|then|begin|end|procedure|function {
	printf("A keyword: %s\n", yytext);
}

{ID}	printf("An identifier: %s\n", yytext);
"+"|"-"|"*"|"/"	printf("An operator: %s\n", yytext);
"{"[^}\n]*"}"
[ \t\n]+
.	printf("Unrecognized character: %s\n", yytext);
%%

/* Run the scanner over standard input until end of file. */
int main(void)
{
	yyin = stdin;
	yylex();
	return 0;
}

正则表达式

()表示一个表达式的匹配，如(abc)需要遇到字符串abc时才匹配，遇到a不会匹配
[]表示单个字符的匹配，如[abc]遇到其中任何一个字母就匹配
匹配位于行首的abc：^abc
匹配位于行尾的abc：abc$
注意：在flex中，^只有出现在模式的最开头、$只有出现在模式的最末尾时才表示行首/行尾；写在括号里（如(^abc)、(abc$)）会被当作普通字符处理。

[abc]	A single character of: a, b, or c
[^abc]	Any single character except: a, b, or c
[a-z]	Any single character in the range a-z
[a-zA-Z]	Any single character in the range a-z or A-Z
^	Start of line
$	End of line
\A	Start of string
\z	End of string
.	Any single character
\s	Any whitespace character
\S	Any non-whitespace character
\d	Any digit
\D	Any non-digit
\w	Any word character (letter, number, underscore) 匹配字母、数字和下划线
\W	Any non-word character
\b	Any word boundary
(...)	Capture everything enclosed
(a|b)	a or b
a?	Zero or one of a
a*	Zero or more of a
a+	One or more of a
a{3}	Exactly 3 of a
a{3,}	3 or more of a
a{3,6}	Between 3 and 6 of a

Start Condition

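INITIAL 是自动定义的起始状态。用 %s 声明的是包含型（inclusive）起始状态：处于该状态时，没有 <状态> 前缀的规则仍然有效；用 %x 声明的是独占型（exclusive）起始状态：处于该状态时，只有带对应 <状态> 前缀的规则才会被匹配。例如声明 %x COMMENT 并写有规则 <COMMENT>{str}，则只有在执行过 BEGIN(COMMENT) 之后才会匹配 str。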

%{
#include <unistd.h>
%}

/* COMMENT is an exclusive start condition: while it is active, */
/* only the rules prefixed with <COMMENT> can match. */
%x COMMENT

%%
"/*"	BEGIN(COMMENT);
"username" printf("username\n");
<COMMENT>[^*\n]*	/* comment text up to the next '*' or newline */
<COMMENT>"*"	/* a '*' that is not part of the closing delimiter */
<COMMENT>\n	/* newline inside the comment */
<COMMENT>"*/"	{ printf("INITIAL\n"); BEGIN(INITIAL); }
. printf("strange char\n");
%%

/* Scan the whole input. */
int main(void) {
    yylex();
    return 0;
}

bison

%union 声明语义值可以采用的各种类型。在 %token 或 %type 声明中用 <成员名> 指定该符号使用 union 中的哪个成员；不带 <> 的符号（如下面的 EOL）不携带语义值。如果没有使用 %union，语义值的类型默认为 int。比如下面的例子中 NUMBER 使用成员 d，即 double 类型。

%{
/* Prologue: copied into the generated parser ahead of the grammar code. */
#include <stdio.h>
#include <stdlib.h>
#include "ACalculator.h"
%}

/* A semantic value is either an AST node pointer (a) or a double (d). */
%union {
	struct ast *a;
	double d;
}

/* NUMBER carries its value in member d; EOL has no type tag. */
%token <d> NUMBER
%token EOL

/* These nonterminals carry an AST pointer (member a). */
%type <a> exp factor term

Calculator Practice

ACalculator.h

#ifndef ACALCULATOR_H
#define ACALCULATOR_H

/* Current input line number; yyerror() prints it in front of each message. */
extern int yylineno;

/* Report an error. s is a printf-style format string followed by its arguments. */
void yyerror(char* s, ...);

/* Interior AST node: nodetype tags the operation, l and r are its operands. */
struct ast {
    int nodetype;
    struct ast *l;
    struct ast *r;
};

/*
 * Numeric leaf node. It starts with the same nodetype field as struct ast,
 * and newnum() hands it out as a struct ast pointer.
 */
struct numval
{
    int nodetype;
    double number;
};

/* Node constructors. */
struct ast* newast(int nodetype, struct ast *l, struct ast *r);
struct ast* newnum(double d);

/* Evaluate a tree / release a tree. */
double eval(struct ast*);
void treefree(struct ast*);

#endif /* ACALCULATOR_H */

ACalculator.c

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include "ACalculator.h"

/*
 * Allocate an interior AST node tagged with nodetype whose children are l and r.
 * The caller owns the returned node. If the allocation fails, the error is
 * reported through yyerror() and the process exits with a failure status.
 */
struct ast* newast(int nodetype, struct ast* l, struct ast* r) {
    struct ast* a = malloc(sizeof *a);

    if (!a) {
        yyerror("out of space\n");
        /* Non-zero status: running out of memory must not look like success. */
        exit(EXIT_FAILURE);
    }

    a->nodetype = nodetype;
    a->l = l;
    a->r = r;
    return a;
}


/*
 * Allocate a numeric leaf holding d, tagged with node type 'K'.
 * The caller owns the returned node. If the allocation fails, the error is
 * reported through yyerror() and the process exits with a failure status.
 */
struct ast* newnum(double d) {
    struct numval* a = malloc(sizeof *a);
    if (!a) {
        yyerror("out of space");
        /* Non-zero status: running out of memory must not look like success. */
        exit(EXIT_FAILURE);
    }

    a->nodetype = 'K';
    a->number = d;
    /* Note the return type: the leaf is handed back as a generic struct ast
       pointer; eval() casts it back to struct numval when nodetype is 'K'. */
    return (struct ast*)a;
}

double
eval(struct ast* a) {
    double v;
    switch (a->nodetype)
    {
        case 'K':
            v = ((struct numval*)a)->number;
            break;

        case '+':
            v = eval(a->l) + eval(a->r);
            break;

        case '-':
            v = eval(a->l) - eval(a->r);
            break;
        
        case '*':
            v = eval(a->l) * eval(a->r);
            break;

        case '/':
            v = eval(a->l) / eval(a->r);
            break;

        case '|':     // abs
            v = eval(a->l);
            if (v < 0)
                v = -v;
            break;

        case 'M':
            v = -eval(a->l);
            break;
    
        default:
            printf("internal errror: bad node %c\n", a->nodetype);
            break;
    }
    return v; 
}

/*
 * Release the tree rooted at a.
 *
 * Binary nodes ('+', '-', '*', '/') free their right subtree, then their left
 * subtree, then themselves. Unary nodes ('|', 'M') free their left subtree and
 * themselves. Leaves ('K') free only themselves. Nodes of any other type are
 * left untouched.
 */
void
treefree(struct ast* a) {
    const int kind = a->nodetype;
    const int is_binary = (kind == '+' || kind == '-' || kind == '*' || kind == '/');
    const int is_unary = (kind == '|' || kind == 'M');

    if (!is_binary && !is_unary && kind != 'K') {
        return;
    }

    if (is_binary) {
        treefree(a->r);
    }
    if (is_binary || is_unary) {
        treefree(a->l);
    }
    free(a);
}

/*
 * Print "<line>: error: <message>" followed by a newline on stderr.
 * s is a printf-style format string; the remaining arguments are its values.
 * The line number is taken from yylineno.
 */
void
yyerror(char* s, ...) {
    va_list ap;
    va_start(ap, s);

    fprintf(stderr, "%d: error: ", yylineno);
    vfprintf(stderr, s, ap);
    fprintf(stderr, "\n");

    /* Every va_start must be paired with va_end before the function returns. */
    va_end(ap);
}

/* Parser entry point from the bison-generated Parser.tab.c; declared here
   because no header included by this file declares it. */
int yyparse(void);

/* Print the first prompt, then run the parser; its result is the exit status. */
int main(void) {
    printf("> ");
    return yyparse();
}

ACalculator.flex

/* Scanner for the calculator: turns input characters into tokens for the parser. */
%option noyywrap nodefault yylineno

%{
/* Declares yyerror(), which the last rule calls. */
#include "ACalculator.h"
/* Header written by bison -d (see the Makefile). */
#include "Parser.tab.h"
%}

/* Exponent part of a floating-point literal: e or E, optional sign, digits. */
EXP ([Ee][-+]?[0-9]+)

/* Rules, in order: operator and parenthesis characters; numeric literals, */
/* converted with atof into yylval.d and returned as NUMBER; newline, returned */
/* as EOL; text from a double slash to end of line, and blanks and tabs, are */
/* discarded; any other character is reported through yyerror. */
%%
"+" |
"-" | 
"*" |
"/" |
"|" |
"(" |
")" 	{ return yytext[0]; }     // each of these single characters is returned as its own token code
[0-9]+"."[0-9]*{EXP}? | 
"."?[0-9]+{EXP}? {yylval.d = atof(yytext); return NUMBER; }
\n { return EOL; }
"//".*
[ \t] {}
. { yyerror("Mystery character %c\n", *yytext); }

%%

Parser.y

%{
#include <stdio.h>
#include <stdlib.h>
#include "ACalculator.h"

/* Scanner entry point from the flex-generated lex.yy.c. It is declared here
   so the generated parser does not call an undeclared function. */
int yylex(void);
%}

/* A semantic value is either an AST node pointer (a) or a double (d). */
%union {
	struct ast *a;
	double d;
}

/* NUMBER carries its value in member d; EOL has no type tag. */
%token <d> NUMBER
%token EOL

/* These nonterminals carry an AST pointer (member a). */
%type <a> exp factor term

%%
/* A session is a sequence of lines: each non-empty line is evaluated,
   printed and freed, and a new prompt is shown after every line. */
calclist: /* empty */
	| calclist exp EOL {
		printf("= %4.4g\n", eval($2));
		treefree($2);
		printf("> ");
	}
	| calclist EOL { printf("> "); }
	;

/* Additive level. */
exp: factor
	| exp '+' factor { $$ = newast('+', $1, $3); }
	| exp '-' factor { $$ = newast('-', $1, $3); }
	;

/* Multiplicative level. */
factor: term
	| factor '*' term { $$ = newast('*', $1, $3); }
	| factor '/' term { $$ = newast('/', $1, $3); }
	;

/* Atoms and prefix operators: '|' builds an absolute-value node,
   '-' builds a negation node ('M'). */
term: NUMBER { $$ = newnum($1); }
	| '|' term { $$ = newast('|', $2, NULL); }
	| '(' exp ')' { $$ = $2; }
	| '-' term { $$ = newast('M', $2, NULL); }
	;
%%

Makefile

# Build the calculator. bison -d writes Parser.tab.c and Parser.tab.h,
# flex writes lex.yy.c. libfl is not linked: the scanner sets
# %option noyywrap and ACalculator.c provides main().
.PHONY: all
all:
	bison -d Parser.y
	flex ACalculator.flex
	gcc lex.yy.c ACalculator.c Parser.tab.c -o ACalculator

如何处理冲突：
通过声明运算符的优先级和结合性来解决冲突：声明的位置越靠后，优先级越高。%left 声明左结合的运算符，%right 声明右结合的运算符，%nonassoc 声明不可结合的运算符（例如 a < b < c 会被视为语法错误）；一元运算符不存在结合性问题，通常也用 %nonassoc 声明。

/* Precedence rises from the first declaration to the last. */
/* Left-associative additive operators (lowest precedence). */
%left '+' '-'
/* Left-associative multiplicative operators. */
%left '*' '/'
/* Non-associative, highest precedence. UMINUS is presumably a precedence-only
   name for unary minus; the grammar rule that uses it is not shown here. */
%nonassoc '|' UMINUS

%token和%type的区别
%type用于非终结符和带有属性的终结符，%token用于