这学期的编译课程设计需要做一个类 C 语言的编译器,准确地说,是完善上学期做的大实验。
上学期的实验中,使用antlr完成的编译器能识别的语法很有限,基本上只是个计算器的语法,于是这次决定实现一个更加完整的语法。
语法支持:
声明,赋值,函数,if-else,while,for。
首先是词法分析和语法分析,antlr源文件如下:
// Combined lexer/parser grammar (ANTLR 3 syntax) for a small C-like language.
grammar c;
options{
// The parser builds an AST whose nodes are CommonTree instances.
output=AST;
ASTLabelType=CommonTree;
}
// Imaginary tokens: they match no input text and serve only as node types
// in the rewrite rules of this grammar.
tokens{
PROG;STAT;IFSTAT;IF;ELSE;WHILESTAT;FORSTAT;DECLAREVAR;DECLAREFUNC;CALLFUNC;GIVEVALUE;CALL;FUNC1;FUNC2;FUNC3;FUNC4;
}
// Identifier: a letter or underscore followed by letters, digits or underscores.
ID : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
;
// Unsigned decimal integer literal (one or more digits).
INT : '0'..'9'+
;
// Floating-point literal in one of three forms: digits '.' optional digits,
// '.' digits, or digits followed by an exponent.
FLOAT
: ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
| '.' ('0'..'9')+ EXPONENT?
| ('0'..'9')+ EXPONENT
;
// Parser rule: a numeric literal, either INT or FLOAT.
num : INT
| FLOAT
;
// Parser rule: the type keywords accepted in function parameter lists.
num_type : 'int'
| 'float'
;
// Line comments (up to and including the newline) and block comments are
// sent to the hidden channel, so the parser never sees them.
COMMENT
: '//' ~('\n'|'\r')* '\r'? '\n' {$channel=HIDDEN;}
| '/*' ( options {greedy=false;} : . )* '*/' {$channel=HIDDEN;}
;
// Whitespace is sent to the hidden channel as well.
WS : ('\r'|'\n'|' '|'\t')+{$channel=HIDDEN;};
// Statement terminator.
END : ';'
;
// Comparison: exactly one relational operator between two arithmetic
// expressions. The '^' suffix makes the operator the root of the subtree.
boolexpr : expr ('=='^|'!='^|'>'^|'<'^|'>='^|'<='^) expr
;
// Additive level: one or more multexpr joined by '+' or '-'; each operator
// becomes the root of what was parsed so far.
expr : multexpr (('+'^ | '-'^) multexpr)*
;
// Multiplicative level: one or more atoms joined by '*' or '/'.
multexpr : atom (('*'^|'/'^) atom)*
;
// Operand: a parenthesised expression (the parentheses are dropped by the
// rewrite), a numeric literal, a variable name, or a function call.
atom : '(' expr ')' -> ^(expr)
| num
| ID
| callfunc
;
// Variable declaration without initialiser; the type keyword becomes the
// root node and the variable name its only child.
declarevar : 'int'^ ID
| 'float'^ ID
;
// Assignment: rewritten so that '=' is the root with the target name and
// the value expression as children.
givevalue : ID '=' expr
-> ^('=' ID expr)
;
// if / if-else statement; braces are mandatory around each branch.
// Tree shapes: ^(IF boolexpr stat* ELSE stat*) with an else branch,
//              ^(IF boolexpr stat*) without one.
// The branches of the if-else form use list labels (+=). A plain '=' label on
// a repeated element holds only the last match, so multi-statement branches
// lost all but their last statement and empty branches left the rewrite
// stream empty.
ifstat options{ backtrack=true; }
: 'if' '(' boolexpr ')' '{' s1+=stat* '}' 'else' '{' s2+=stat* '}'
-> ^(IF boolexpr $s1* ELSE $s2*)
| 'if' '(' boolexpr ')' '{' stat* '}'
-> ^(IF boolexpr stat*)
;
// while loop with a mandatory braced body; the 'while' keyword token becomes
// the root, followed by the condition and the body statements.
whilestat options{ backtrack=true; }
: 'while' '(' boolexpr ')' '{' stat* '}'
-> ^('while' boolexpr stat*)
;
// for loop: for (init; condition; step) { body }, where init and step are
// assignments. The tree is ^('for' init condition body* step), i.e. the step
// is placed after the body so a tree walker can emit code in execution order.
// The body uses a list label (+=): a plain '=' label on stat* keeps only the
// last matched statement and breaks on an empty body.
forstat options{ backtrack=true; }
: 'for' '(' s1=givevalue ';' s2=boolexpr ';' s3=givevalue ')' '{' s4+=stat* '}'
-> ^('for' $s1 $s2 $s4* $s3)
;
// Function definition. Four shapes are accepted, each rewritten under its own
// imaginary root:
//   FUNC1: 'void' function without parameters
//   FUNC2: 'int'/'float' function without parameters; the body must end with
//          'return' expr ';'
//   FUNC3: 'void' function with one or more typed parameters
//   FUNC4: 'int'/'float' function with parameters and a trailing return
// The return-type keyword is not kept in the tree; parameters appear as
// (type name) pairs after the function name.
declarefunc
: 'void' ID '(' ')' '{' stat* '}' -> ^(FUNC1 ID stat*)
| ('int'|'float') ID '(' ')' '{' stat* 'return' expr END '}' -> ^(FUNC2 ID stat* expr)
| 'void' ID '(' (num_type ID) (',' (num_type ID))* ')' '{' stat* '}' ->
^(FUNC3 ID (num_type ID)+ stat*)
| ('int'|'float') ID '(' (num_type ID) (',' (num_type ID))* ')' '{' stat* 'return' expr END '}'
-> ^(FUNC4 ID (num_type ID)+ stat* expr)
;
// Function call, with or without arguments. Both forms are rewritten to a
// CALL node whose first child is the function name, followed by the argument
// expressions in source order.
callfunc : ID '(' ')' -> ^(CALL ID)
| ID '(' expr (',' expr)* ')' -> ^(CALL ID expr (expr)*)
;
// Statement: every kind is wrapped in its own imaginary tag node so a tree
// walker can dispatch on the tag. Declarations, assignments and call
// statements are terminated by END; the other forms are not.
stat : declarevar END -> ^(DECLAREVAR declarevar)
| givevalue END-> ^(GIVEVALUE givevalue)
| ifstat -> ^(IFSTAT ifstat)
| forstat -> ^(FORSTAT forstat)
| whilestat -> ^(WHILESTAT whilestat)
| declarefunc -> ^(DECLAREFUNC declarefunc)
| callfunc END -> ^(CALLFUNC callfunc)
;
// Start rule: zero or more statements collected under a single PROG root.
prog : stat* -> ^(PROG stat*)
;
// Exponent part of a FLOAT literal. Declared as a fragment, so it is only
// usable from other lexer rules and never produces a token of its own.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
这个grammar输出一个ast
输入为
// Global variable declaration.
int b;
// Function without parameters containing an if statement and a call statement.
void main(){
if(b==0){
b=34;
}
print(b);
}
// Function with a single int parameter.
void test(int a){
print(a);
}
的时候,生成的抽象语法树如下,
(PROG (DECLAREVAR (int b)) (DECLAREFUNC (FUNC1 main (IFSTAT (IF (== b 0) (GIVEVALUE (= b 34)))) (CALLFUNC (CALL print b)))) (DECLAREFUNC (FUNC3 test int a (CALLFUNC (CALL print a)))))
然后是树解析器,这一步就是把上面的ast转换为汇编。主要需要注意函数调用时要平衡堆栈,其他倒没什么。为了方便调试,生成的.s文件中嵌入了一个print()函数。
// Tree grammar that walks the AST built by grammar c. Its rule actions append
// assembly source lines to text[] and data[]; the prog rule prints them.
tree grammar cTree;
options{
// Reuse the token types of grammar c.
tokenVocab=c;
ASTLabelType=CommonTree;
// NOTE(review): template output is enabled, but the rules shown here build
// their output by string concatenation rather than templates.
output=template;
}
// NOTE(review): the ANTLR 3 grammar-level action is normally spelled @header;
// confirm that this empty @headers block is intentional.
@headers{
}
@members{
// Next free slot in text[].
int index=0;
// Next free slot in data[].
int data_index=0;
// General-purpose label counter.
int labelindex=0;
// Label counter for if statements.
int labelindex_if=0;
// Shared argument/parameter counter used by the call and function rules.
int num=0;
// Label counter for comparisons.
int labelindex_bool=0;
// Label counter for while loops.
int labelindex_while=0;
// Label counter for for loops.
int labelindex_for=0;
// Label counter for function definitions.
int labelindex_func=0;
// Buffered code lines (fixed capacity of 1000 entries).
String []text = new String[1000];
// Buffered data-definition lines (fixed capacity of 1000 entries).
String []data = new String[1000];
}
// Comparison. Both operand subtrees are walked first, and each leaves its
// value on the stack. The first pop (rax) therefore holds the RIGHT operand
// and the second pop (rbx) the LEFT one, so "cmp eax,ebx" compares right
// against left. That is why the jumps below look mirrored: for "left >= right"
// the true branch is taken when right is not greater than left (jng), and so
// on for the other operators.
// Each alternative leaves 1 (true) or 0 (false) on the stack and uses one
// value of labelindex_bool for its pair of labels.
boolexpr : ^('==' s1=expr s2=expr)
{text[index++]="pop rax";text[index++]="pop rbx";text[index++]="cmp eax,ebx";text[index++]="jz label_bool_1"+labelindex_bool+"";text[index++]="push 0";text[index++]="jmp label_bool_2"+labelindex_bool+"";text[index++]="label_bool_1"+labelindex_bool+":push 1";text[index++]="label_bool_2"+labelindex_bool+":";
labelindex_bool++;}
| ^('!=' s1=expr s2=expr)
{text[index++]="pop rax";text[index++]="pop rbx";text[index++]="cmp eax,ebx";text[index++]="jnz label_bool_1"+labelindex_bool+"";text[index++]="push 0";text[index++]="jmp label_bool_2"+labelindex_bool+"";text[index++]="label_bool_1"+labelindex_bool+":push 1";text[index++]="label_bool_2"+labelindex_bool+":";
labelindex_bool++;}
| ^('>=' s1=expr s2=expr)
{text[index++]="pop rax";text[index++]="pop rbx";text[index++]="cmp eax,ebx";text[index++]="jng label_bool_1"+labelindex_bool+"";text[index++]="push 0";text[index++]="jmp label_bool_2"+labelindex_bool+"";text[index++]="label_bool_1"+labelindex_bool+":push 1";text[index++]="label_bool_2"+labelindex_bool+":";
labelindex_bool++;}
| ^('<=' s1=expr s2=expr)
{text[index++]="pop rax";text[index++]="pop rbx";text[index++]="cmp eax,ebx";text[index++]="jnl label_bool_1"+labelindex_bool+"";text[index++]="push 0";text[index++]="jmp label_bool_2"+labelindex_bool+"";text[index++]="label_bool_1"+labelindex_bool+":push 1";text[index++]="label_bool_2"+labelindex_bool+":";
labelindex_bool++;}
| ^('>' s1=expr s2=expr)
{text[index++]="pop rax";text[index++]="pop rbx";text[index++]="cmp eax,ebx";text[index++]="jl label_bool_1"+labelindex_bool+"";text[index++]="push 0";text[index++]="jmp label_bool_2"+labelindex_bool+"";text[index++]="label_bool_1"+labelindex_bool+":push 1";text[index++]="label_bool_2"+labelindex_bool+":";
labelindex_bool++;}
| ^('<' s1=expr s2=expr)
{text[index++]="pop rax";text[index++]="pop rbx";text[index++]="cmp eax,ebx";text[index++]="jg label_bool_1"+labelindex_bool+"";text[index++]="push 0";text[index++]="jmp label_bool_2"+labelindex_bool+"";text[index++]="label_bool_1"+labelindex_bool+":push 1";text[index++]="label_bool_2"+labelindex_bool+":";
labelindex_bool++;}
;
// Arithmetic expression, compiled as a stack machine: every alternative
// leaves exactly one value pushed. Binary operators pop the right operand
// into rbx and the left operand into rax, combine them in eax and push rax.
//
// Multiplication and division previously emitted "mul eax,ebx" and
// "div eax,ebx". MUL and DIV take a single explicit operand, so those lines
// do not assemble. Multiplication now uses the two-operand IMUL form, and
// division sign-extends eax into edx:eax (cdq) before a signed IDIV by ebx,
// which leaves the quotient in eax.
expr : ^('+' s1=expr s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="add eax,ebx";text[index++]="push rax";}
| ^('-' s1=expr s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="sub eax,ebx";text[index++]="push rax";}
| ^('*' s1=expr s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="imul eax,ebx";text[index++]="push rax";}
| ^('/' s1=expr s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="cdq";text[index++]="idiv ebx";text[index++]="push rax";}
// Literals are loaded as immediates, variables from their data-section slot.
| INT{text[index++]="mov eax,"+$INT+"";text[index++]="push rax";}
| FLOAT{text[index++]="mov eax,"+$FLOAT+"";text[index++]=" push rax";}
| ID{text[index++]="mov eax,["+$ID+"]";text[index++]="push rax";}
// A call used as an operand: push whatever the callee left in rax.
| callfunc{text[index++]="push rax";}
;
// Variable declaration: both types reserve one zero-initialised dword in the
// data section under the variable name.
declarevar
    : ^('int' ID)
      {
        String intSlot = $ID + " dd 0";
        data[data_index++] = intSlot;
      }
    | ^('float' ID)
      {
        String floatSlot = $ID + " dd 0";
        data[data_index++] = floatSlot;
      }
    ;
// Assignment: the value expression has already been walked and left its
// result on the stack; pop it and store the low 32 bits into the variable.
givevalue
    : ^('=' ID expr)
      {
        text[index++] = "pop rax";
        String store = "mov [" + $ID + "],eax";
        text[index++] = store;
      }
    ;
// if / if-else. The condition leaves 1 or 0 on the stack; anything other than
// 1 skips the then-branch.
//
// Fixes compared with the previous version:
//  * the end label of the one-armed form was built from labelindex although
//    the jump used labelindex_if, so jump and label could disagree;
//  * the else label was emitted as "label1_if1..." while the jump targeted
//    "label_if1...";
//  * the label id is now reserved in the init action, before the branches are
//    walked, so a nested if cannot reuse or shift the id of its parent.
// The init action also runs while ANTLR is backtracking; that only skips ids
// and never duplicates one.
ifstat options{backtrack=true;}
@init{int ifId=labelindex_if++;}
: ^(IF boolexpr{text[index++]="pop rax";text[index++]="cmp eax,1";text[index++]="jnz label_if0"+ifId+"";}
s=stat*{text[index++]="label_if0"+ifId+":";})
| ^(IF boolexpr{text[index++]="pop rax";text[index++]="cmp eax,1";text[index++]="jnz label_if1"+ifId+"";}
s1=stat*{text[index++]="jmp label_if2"+ifId+"";text[index++]="label_if1"+ifId+":";}
ELSE
s2=stat*{text[index++]="label_if2"+ifId+":";})
;
// while loop: a head label, the condition test, the body, a jump back to the
// head and an exit label.
//
// Fixes compared with the previous version:
//  * the label id is reserved in the init action before the body is walked;
//    previously the counter was bumped only after the body, so a nested loop
//    emitted the same labels as its parent;
//  * the head label is now "label_while1<id>" instead of "label_while<id>",
//    which collided with the exit label "label_while2<id>" of another loop
//    (head of loop 20 equals exit of loop 0).
whilestat
@init{int whileId=labelindex_while++;}
: ^('while'{text[index++]="label_while1"+whileId+":";} boolexpr{text[index++]="pop rax";text[index++]="cmp eax,1";text[index++]="jnz label_while2"+whileId+"";}
stat*{text[index++]="jmp label_while1"+whileId+"\nlabel_while2"+whileId+":";})
;
// for loop. Tree order is init, condition, body, step, which is also the
// emission order: init, head label, condition test, body, step, jump to head,
// exit label.
//
// Fixes compared with the previous version:
//  * the label id is reserved in the init action before the body is walked,
//    so nested for loops no longer share labels with their parent;
//  * the head label is now "label_for1<id>" instead of "label_for<id>", which
//    collided with the exit label "label_for2<id>" of another loop.
forstat
@init{int forId=labelindex_for++;}
: ^('for' s1=givevalue{text[index++]="label_for1"+forId+":";}
s2=boolexpr{text[index++]="pop rax";text[index++]="cmp eax,1";text[index++]="jnz label_for2"+forId+"";}
s4=stat*
s3=givevalue{text[index++]="jmp label_for1"+forId+"";text[index++]="label_for2"+forId+":";} )
;
// Function definition. The body is emitted inline, guarded by a jump over it
// ("jmp label_func<id>" ... "label_func<id>:"), and is entered through
// "label_<name>".
//
// Frame layout after the prologue, given that the call rule pushes the
// arguments left to right, then the argument count, then executes call:
//   [rbp]      saved rbp
//   [rbp+8]    return address
//   [rbp+16]   argument count
//   [rbp+24]   last argument
//   ...
//   [rbp+16+8*(n-i)]  argument i (0-based) of n
// Parameters are copied from the frame into data-section dwords named after
// the parameter (so functions are not re-entrant).
//
// Fixes compared with the previous version:
//  * FUNC4 jumped to "label_func<id>" but defined "label<id>" and bumped the
//    wrong counter; FUNC2 used yet another label family. All four forms now
//    use label_func<id>, reserved once in the init action so nested
//    definitions get their own id.
//  * The prologue (push rbp / mov rbp,rsp) was emitted once per parameter,
//    which unbalanced the stack for functions with two or more parameters.
//  * FUNC3 never reset the shared counter used for parameter offsets and the
//    two parameterised forms used different base offsets. Offsets are now
//    computed from the parameter list itself.
//  * Parameters were stored with a 64-bit move into a 32-bit (dd) slot,
//    overwriting the neighbouring variable; the store is now 32-bit.
declarefunc
@init{
int funcId=labelindex_func++;
java.util.List<String> params=new java.util.ArrayList<String>();
}
: ^(FUNC1
ID {text[index++]="jmp label_func"+funcId+"";text[index++]="label_"+$ID+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
s1=stat* {text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
| ^(FUNC2 ID{text[index++]="jmp label_func"+funcId+"";text[index++]="label_"+$ID+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
stat*
expr{text[index++]="pop rax";text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
| ^(FUNC3 a=ID{text[index++]="jmp label_func"+funcId+"";text[index++]="label_"+$a+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
(('int'|'float') b=ID{params.add($b.text);data[data_index++]=$b+" dd 0";})+
{for(int i=0;i<params.size();i++){text[index++]="mov eax,[rbp+"+(16+8*(params.size()-i))+"]";text[index++]="mov ["+params.get(i)+"],eax";}}
stat*{text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
| ^(FUNC4 a=ID{text[index++]="jmp label_func"+funcId+"";text[index++]="label_"+$a+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
(('int'|'float') b=ID{params.add($b.text);data[data_index++]=$b+" dd 0";})+
{for(int i=0;i<params.size();i++){text[index++]="mov eax,[rbp+"+(16+8*(params.size()-i))+"]";text[index++]="mov ["+params.get(i)+"],eax";}}
stat*
expr{text[index++]="pop rax";text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
;
// Function call. Each argument expression pushes its value while it is
// walked; for calls with arguments the argument count is pushed last, then
// the function is called through its "label_<name>" entry.
//
// Fixes compared with the previous version:
//  * arguments are counted in a rule-local variable. The shared counter was
//    reset and advanced by any call nested inside an argument, so the pushed
//    count was wrong for calls such as f(g(1), 2);
//  * after the call returns, the pushed arguments and the count are removed
//    from the stack (8 bytes each), so calls no longer leak stack space.
//    rax, which holds the result, is not touched by the adjustment.
// The shared counter num is still set to the argument count of the call, as
// it was before for non-nested calls.
callfunc
@init{int argc=0;}
: ^(CALL ID){text[index++]="call label_"+$ID+"";}
| ^(CALL ID (expr{argc++;})+){num=argc;text[index++]="push "+argc;text[index++]="call label_"+$ID+"";text[index++]="add rsp,"+(8*(argc+1));}
;
// Statement dispatcher: matches the tag node that wraps each statement kind
// and delegates to the corresponding rule. It emits no code of its own.
stat : ^(DECLAREVAR declarevar)
| ^(GIVEVALUE givevalue)
| ^(IFSTAT ifstat)
| ^(FORSTAT forstat)
| ^(WHILESTAT whilestat)
| ^(DECLAREFUNC declarefunc)
| ^(CALLFUNC callfunc)
;
// Start rule of the tree walker. After all statements have been walked it
// prints the complete assembly file to standard output: the data section
// (fixed scratch space plus the buffered variable lines), the text section
// with the buffered code lines, and finally a fixed tail containing the call
// to label_main, the label_print routine and the label_a block.
prog
    : ^(PROG stat*)
      {
        final String dataHeader =
            "section .data\n"
            + "t resb 100\n"
            + "buffer db 0 ,0,0";
        System.out.println(dataHeader);
        for (int d = 0; d != data_index; ++d) {
            String dataLine = data[d];
            System.out.println(dataLine);
        }

        final String textHeader =
            "section .text\n"
            + "global _start\n"
            + "_start:\n";
        System.out.println(textHeader);
        for (int c = 0; c != index; ++c) {
            String codeLine = text[c];
            System.out.println(codeLine);
        }

        final String tail =
            "call label_main\n"
            + "jmp label_a\n"
            + "label_print:\n"
            + "xor rcx,rcx\n"
            + "xor rax,rax\n"
            + "push rbp\n"
            + "mov rbp,rsp\n"
            + "mov rax,[rbp+8+8*2]\n"
            + "label_prog:\n"
            + "mov rbx,10\n"
            + "div bl\n"
            + "add ah,30h\n"
            + "mov ebx,buffer\n"
            + "sub ebx,ecx\n"
            + "dec ebx\n"
            + "mov [ebx],ah\n"
            + "mov ah,0\n"
            + "inc rcx\n"
            + "cmp rax,0\n"
            + "jnz label_prog\n"
            + "mov ax,4\n"
            + "mov ebx,1\n"
            + "mov edx,ecx\n"
            + "mov ecx,buffer\n"
            + "sub ecx,edx\n"
            + "int 80h\n"
            + "mov rsp,rbp\n"
            + "pop rbp\n"
            + "ret\n"
            + "label_a:\n"
            + "mov ax,1\n"
            + "mov ebx,0\n"
            + "int 0x80\n";
        System.out.println(tail);
      }
    ;
主程序如下,它调用由.g文件生成的java类:
//cm.java
import java.io.*;
import org.antlr.runtime.ANTLRFileStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.tree.CommonTree;
import org.antlr.runtime.tree.CommonTreeNodeStream;
/**
 * Command-line driver of the compiler.
 *
 * <p>Runs the generated lexer and parser over a source file, then walks the
 * resulting AST with the generated tree walker, which prints the assembly
 * to standard output.
 */
public class cm {
    /** Source file compiled when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "./input.txt";

    /**
     * Compiles one source file.
     *
     * @param args optional; {@code args[0]} is the path of the source file,
     *             defaulting to {@code ./input.txt} when absent
     * @throws Exception if the file cannot be read or recognition fails with
     *                   an exception
     */
    public static void main(String args[]) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;

        cLexer lex = new cLexer(new ANTLRFileStream(inputPath));
        CommonTokenStream tokens = new CommonTokenStream(lex);
        cParser par = new cParser(tokens);
        cParser.prog_return ret = par.prog();

        // Do not generate assembly from a tree produced by error recovery:
        // the output would silently miss or garble statements.
        int errorCount = lex.getNumberOfSyntaxErrors() + par.getNumberOfSyntaxErrors();
        if (errorCount > 0) {
            System.err.println(inputPath + ": " + errorCount + " syntax error(s), no output generated");
            System.exit(1);
        }

        CommonTree t = ret.tree;
        // Debug aid: print the AST in LISP notation.
        // System.out.println(t.toStringTree());
        CommonTreeNodeStream nodes = new CommonTreeNodeStream(t);
        nodes.setTokenStream(tokens);
        cTree walker = new cTree(nodes);
        walker.prog();
    }
}
这样就得到了.s文件;使用nasm汇编生成.o文件,再用ld链接生成可执行文件,即可进行测试:
# Generate the lexer and parser sources from the combined grammar.
# NOTE(review): assumes the ANTLR 3 jar is on the CLASSPATH - confirm.
java org.antlr.Tool c.g
# Generate the tree walker source from the tree grammar.
java org.antlr.Tool cTree.g
# Compile the generated sources together with the driver class.
javac *.java
# Run the compiler; the assembly is written to standard output.
java cm > input.s
# Assemble to a 64-bit ELF object file.
nasm -f elf64 input.s
# Link into a stripped executable named input.
ld -s -o input input.o
github https://github.com/lizhongguo/myc