思路很简单,将源代码作为长字符串进行读入,之后通过switch语句及状态转换图进行词素识别,并对识别出的词素进行整理输出。想法很简单,具体的实现有一点小困难。自我感觉良好的部分是符号表类SymTable的创建,它为词素的判断提供了很大的帮助,并为以后分析器的改进提供了方便,例如将C语言分析器改为Java分析器,只需要改变Java语言中与C语言不同的关键字,运算符,界符等。另外一个亮点就是switch语句的应用,大大简化了程序。本来按照运算符的分类,状态转换图至少有60个状态,还是挺恐怖的。废话不多说了,看代码吧:
package com.ant.model;
/**
 * Hand-written lexical analyzer for C source text, driven by a numbered
 * state-transition diagram.
 *
 * <p>The whole input is held as one string. {@link #analysis()} repeatedly calls
 * {@link #nextToken()} and collects one formatted token per line; the result is
 * available through {@link #getTarget()}.
 *
 * <p>State numbering used by {@link #nextToken()}:
 * <ul>
 *   <li>0: start (skips whitespace, dispatches on the first character)</li>
 *   <li>1: accepting state, the character in {@code c} is part of the lexeme</li>
 *   <li>2: accepting state, the character in {@code c} is look-ahead only</li>
 *   <li>3-16: operators that may be followed by a second/third character</li>
 *   <li>17-19: identifiers and keywords</li>
 *   <li>20-27: numeric literals (integer, fraction, exponent)</li>
 *   <li>30: no diagram matched the current character</li>
 * </ul>
 *
 * <p>Known limitations visible in this class: string and character literals are not
 * recognized as single tokens ({@code "} is reported as a delimiter, {@code '} as an
 * unknown character), and {@code //} line comments are not recognized.
 */
public class MyModel {
    /** Value returned by {@link #nextChar()} once the input is exhausted. */
    private static final char EOF_CHAR = '@';

    private String source = "";
    private String target;
    private final StringBuilder targetBuf = new StringBuilder();
    /** Characters of the lexeme currently being recognized. */
    private final StringBuilder lexBuf = new StringBuilder();
    private String lexStr = "";
    /** Current state of the transition diagram. */
    private int state = 0;
    /** Start state of the diagram currently being tried (0, 17, 20 or 30). */
    private int start = 0;
    /** Most recently read character. */
    private char c;
    /** Index of the first character of the lexeme being recognized. */
    private int begin = 0;
    /** Index of the next character to be read. */
    private int forward = 0;
    /** Predefined keywords/operators/delimiters plus identifiers found so far. */
    private final SymTable symTable = new SymTable();
    /**
     * Tells the accepting code whether {@code c} belongs to the lexeme:
     * 1 = yes, append it and step past it; 0 = no, it is only look-ahead.
     */
    private int flags = 1;
    /** 1 while the text following an opening comment delimiter is still to be read. */
    private int note = 0;

    /** Creates an analyzer with empty input. */
    public MyModel() {
    }

    /**
     * Creates an analyzer for {@code source} and immediately runs {@link #analysis()}.
     *
     * @param source the source text to tokenize; {@code null} is treated as empty input
     */
    public MyModel(String source) {
        this.source = (source == null) ? "" : source;
        analysis();
    }

    /**
     * Tokenizes the remaining input, appending one formatted token per line to the
     * result returned by {@link #getTarget()}.
     */
    public void analysis() {
        while (begin < source.length()) {
            targetBuf.append(nextToken()).append('\n');
        }
    }

    /**
     * Recognizes the next lexeme and returns it formatted together with its token.
     *
     * <p>Every call that starts before the end of the input consumes at least one
     * character, so {@link #analysis()} always terminates.
     *
     * @return the formatted token; a single space when only whitespace (or nothing)
     *         remained before the end of the input
     */
    public String nextToken() {
        lexBuf.setLength(0);
        start = state = 0;
        if (note == 1) {
            return readComment();
        }
        // Length of the longest prefix of lexBuf that is a complete numeric literal.
        int acceptLen = 0;
        while (true) {
            switch (state) {
                case 0:
                    if (forward >= source.length()) {
                        return " ";
                    }
                    c = nextChar();
                    if (Character.isWhitespace(c)) {
                        begin = ++forward;
                    } else {
                        state = startStateFor(c);
                    }
                    break;
                case 1:
                    return final_state();
                case 2:
                    flags = 0;
                    return final_state();
                case 3: // - -> -- -=
                    state = operatorTail(">-=");
                    break;
                case 4: // ! !=
                    state = operatorTail("=");
                    break;
                case 5: // + ++ +=
                    state = operatorTail("+=");
                    break;
                case 6: // * *= and the comment terminator
                    state = operatorTail("=/");
                    break;
                case 7: // / /= and the comment opener
                    state = operatorTail("*=");
                    break;
                case 8: // % %=
                    state = operatorTail("=");
                    break;
                case 9: // = ==
                    state = operatorTail("=");
                    break;
                case 10: // & && &=
                    state = operatorTail("&=");
                    break;
                case 11: // ^ ^=
                    state = operatorTail("=");
                    break;
                case 12: // | || |=
                    state = operatorTail("|=");
                    break;
                case 13: // < << <=
                    consume();
                    state = (c == '<') ? 14 : (c == '=') ? 1 : 2;
                    break;
                case 14: // << <<=
                    state = operatorTail("=");
                    break;
                case 15: // > >> >=
                    consume();
                    state = (c == '>') ? 16 : (c == '=') ? 1 : 2;
                    break;
                case 16: // >> >>=
                    state = operatorTail("=");
                    break;
                case 17: // identifier: first character
                    state = isIdentifierStart(c) ? 18 : fail();
                    break;
                case 18: // identifier: remaining characters
                    consume();
                    state = isIdentifierPart(c) ? 18 : 19;
                    break;
                case 19:
                    flags = 0;
                    return final_state();
                case 20: // number: first digit
                    state = Character.isDigit(c) ? 21 : fail();
                    break;
                case 21: // integer part
                    consume();
                    acceptLen = lexBuf.length();
                    if (Character.isDigit(c)) {
                        state = 21;
                    } else if (c == '.') {
                        state = 22;
                    } else if (c == 'E' || c == 'e') {
                        state = 24;
                    } else {
                        state = 27;
                    }
                    break;
                case 22: // just consumed '.', a digit must follow
                    consume();
                    state = Character.isDigit(c) ? 23 : rollbackNumber(acceptLen);
                    break;
                case 23: // fraction digits
                    consume();
                    acceptLen = lexBuf.length();
                    if (Character.isDigit(c)) {
                        state = 23;
                    } else if (c == 'E' || c == 'e') {
                        state = 24;
                    } else {
                        state = 27;
                    }
                    break;
                case 24: // just consumed 'e'/'E', a sign or digit must follow
                    consume();
                    if (Character.isDigit(c)) {
                        state = 26;
                    } else if (c == '-' || c == '+') {
                        state = 25;
                    } else {
                        state = rollbackNumber(acceptLen);
                    }
                    break;
                case 25: // just consumed the exponent sign, a digit must follow
                    consume();
                    state = Character.isDigit(c) ? 26 : rollbackNumber(acceptLen);
                    break;
                case 26: // exponent digits
                    consume();
                    state = Character.isDigit(c) ? 26 : 27;
                    // This break is required: falling through into state 27 would
                    // accept the number after its first exponent digit.
                    break;
                case 27: // accept a numeric literal; c is look-ahead only
                    lexStr = lexBuf.toString();
                    flags = 0;
                    retract();
                    return "< " + lexStr + " " + "NUM" + " >";
                case 30:
                default:
                    return unknownCharacter();
            }
        }
    }

    /** Maps the first character of a lexeme to the state that continues recognizing it. */
    private int startStateFor(char ch) {
        switch (ch) {
            case '<': return 13;
            case '>': return 15;
            case '-': return 3;
            case '!': return 4;
            case '+': return 5;
            case '*': return 6;
            case '/': return 7;
            case '%': return 8;
            case '=': return 9;
            case '&': return 10;
            case '^': return 11;
            case '|': return 12;
            case '#':
            case '(':
            case ')':
            case '[':
            case ']':
            case '{':
            case '}':
            case '"':
            case '.':
            case '~':
            case '?':
            case ':':
            case ';':
            case ',':
                return 1;
            default:
                return fail();
        }
    }

    /** Appends {@code c} to the lexeme, advances, and loads the next character into {@code c}. */
    private void consume() {
        lexBuf.append(c);
        forward++;
        c = nextChar();
    }

    /**
     * Consumes the current operator character and decides how the operator ends:
     * state 1 if the following character is one of {@code continuations} (it then
     * belongs to the operator), state 2 otherwise.
     */
    private int operatorTail(String continuations) {
        consume();
        return continuations.indexOf(c) >= 0 ? 1 : 2;
    }

    /**
     * Drops an incomplete fraction/exponent suffix so that the longest complete
     * numeric literal is accepted and the dropped characters are scanned again.
     *
     * @return 27, the accepting state for numbers
     */
    private int rollbackNumber(int acceptLen) {
        lexBuf.setLength(acceptLen);
        forward = begin + acceptLen;
        return 27;
    }

    /**
     * Reads comment text up to (not including) the closing delimiter, or up to the
     * end of the input when the comment is never closed. The closing delimiter is
     * left in the input and is reported as its own token by the next call.
     */
    private String readComment() {
        int end = source.indexOf("*/", forward);
        if (end < 0) {
            end = source.length();
        }
        lexBuf.append(source, forward, end);
        begin = forward = end;
        note = 0;
        return "< " + lexBuf.toString() + " " + "注释" + " >";
    }

    /**
     * Handles a character that no diagram accepts: consumes exactly that character
     * and reports it, so scanning continues behind it.
     */
    private String unknownCharacter() {
        if (begin >= source.length()) {
            return " ";
        }
        char bad = source.charAt(begin);
        begin = forward = begin + 1;
        return "< " + '"' + bad + '"' + " " + "未知字符" + " >";
    }

    private static boolean isIdentifierStart(char ch) {
        return Character.isLetter(ch) || ch == '_';
    }

    private static boolean isIdentifierPart(char ch) {
        return Character.isLetter(ch) || Character.isDigit(ch) || ch == '_';
    }

    /**
     * Completes recognition in an accepting state: finishes the lexeme, moves
     * {@code begin} behind it, records it in the symbol table and formats the result.
     * Recognizing the comment opener arms comment reading for the next call.
     *
     * @return the lexeme in double quotes followed by its token
     */
    public String final_state() {
        if (flags == 1) {
            lexBuf.append(c);
        }
        retract();
        install_id();
        if (lexStr.equals("/*")) {
            note = 1;
        }
        return "< " + '"' + lexStr + '"' + " " + gettoken() + " >";
    }

    /**
     * Moves {@code begin} to the first character after the accepted lexeme.
     * With {@code flags == 0} the character in {@code c} was only look-ahead, so the
     * lexeme ends at {@code forward}; otherwise {@code c} belongs to the lexeme and
     * {@code forward} is stepped past it first. {@code flags} is left at 1.
     */
    public void retract() {
        if (flags == 0) {
            begin = forward;
            flags = 1;
        } else {
            begin = ++forward;
        }
    }

    /**
     * Looks the current lexeme up in the symbol table and inserts it with token
     * {@code "id"} when it is not there yet.
     *
     * @return the table index of the identifier entry (existing or newly created),
     *         or 0 when the lexeme is a predefined keyword, operator or delimiter
     */
    public int install_id() {
        lexStr = lexBuf.toString();
        int index = symTable.lookup(lexStr);
        if (index == 0) {
            symTable.insert(lexStr, "id");
            return symTable.getSymtable().size() - 1;
        }
        return "id".equals(symTable.getSymtable().get(index).getToken()) ? index : 0;
    }

    /**
     * Returns the token stored in the symbol table for the current lexeme.
     *
     * @return the stored token, or {@code "id"} when the lexeme is not in the table
     */
    public String gettoken() {
        int index = symTable.lookup(lexStr);
        if (index == 0) {
            return "id";
        }
        return symTable.getSymtable().get(index).getToken();
    }

    /**
     * Returns the character at {@code forward} without advancing.
     *
     * @return the character, or {@code '@'} when {@code forward} is past the end
     */
    public char nextChar() {
        return forward < source.length() ? source.charAt(forward) : EOF_CHAR;
    }

    /**
     * Called when the current diagram cannot accept the input: rewinds
     * {@code forward} to {@code begin} and selects the next diagram to try
     * (operators, then identifiers, then numbers, then the error state 30).
     *
     * @return the start state of the next diagram
     */
    public int fail() {
        forward = begin;
        switch (start) {
            case 0:
                start = 17;
                break;
            case 17:
                start = 20;
                break;
            case 20:
                start = 30;
                break;
            default:
                break;
        }
        return start;
    }

    /**
     * Returns everything produced by {@link #analysis()} so far.
     *
     * @return the formatted tokens, one per line
     */
    public String getTarget() {
        target = targetBuf.toString();
        return target;
    }
}
package com.ant.model;
import java.util.ArrayList;
/**
 * Symbol table holding the predefined C keywords, delimiters and operators plus
 * any lexemes inserted later (for example identifiers found during analysis).
 *
 * <p>All lexemes are stored back to back in one character buffer, each followed by
 * the terminator {@code '@'}; every table entry records where its lexeme starts.
 * Entry 0 is a placeholder so that index 0 can mean "not found". Lexemes that
 * themselves contain {@code '@'} cannot be looked up.
 *
 * @author Administrator
 */
public class SymTable {
    /** Concatenation of all stored lexemes, each terminated by {@link #EOS}. */
    private StringBuffer lexmes = new StringBuffer();
    /** Table entries; index 0 is a placeholder that never matches a lookup. */
    private ArrayList<Symbol> symtable = new ArrayList<Symbol>();
    /** Terminator written after every lexeme in {@link #lexmes}. */
    private static final char EOS = '@';

    /** Creates a table pre-populated by {@link #init()}. */
    public SymTable() {
        symtable.add(new Symbol(-1, ""));
        init();
    }

    /** Inserts the C keywords, the main delimiters and the main operators. */
    public void init() {
        // Keywords: the token is the keyword itself.
        insert("auto", "auto");
        insert("break", "break");
        insert("case", "case");
        insert("char", "char");
        insert("const", "const");
        insert("continue", "continue");
        insert("default", "default");
        insert("do", "do");
        insert("double", "double");
        insert("else", "else");
        insert("enum", "enum");
        insert("extern", "extern");
        insert("float", "float");
        insert("for", "for");
        insert("goto", "goto");
        insert("if", "if");
        insert("int", "int");
        insert("long", "long");
        insert("register", "register");
        insert("return", "return");
        insert("short", "short");
        insert("signed", "signed");
        insert("sizeof", "sizeof");
        insert("static", "static");
        insert("struct", "struct");
        insert("switch", "switch");
        insert("typedef", "typedef");
        insert("union", "union");
        insert("unsigned", "unsigned");
        insert("void", "void");
        insert("volatile", "volatile");
        insert("while", "while");
        // Delimiters.
        insert("(", "界符");
        insert(")", "界符");
        insert("/*", "界符");
        insert("*/", "界符");
        insert("{", "界符");
        insert("}", "界符");
        insert(";", "界符");
        insert("\"", "界符");
        // Operators.
        insert("->", "运算符");
        insert("[", "运算符");
        insert("]", "运算符");
        insert(".", "运算符");
        insert("!", "运算符");
        insert("~", "运算符");
        insert("++", "运算符");
        insert("--", "运算符");
        insert("-", "运算符");
        insert("*", "运算符");
        insert("&", "运算符");
        insert("/", "运算符");
        insert("%", "运算符");
        insert("+", "运算符");
        insert("<<", "运算符");
        insert(">>", "运算符");
        insert("<", "运算符");
        insert("<=", "运算符");
        insert(">", "运算符");
        insert(">=", "运算符");
        insert("==", "运算符");
        insert("!=", "运算符");
        insert("^", "运算符");
        insert("|", "运算符");
        insert("&&", "运算符");
        insert("||", "运算符");
        insert("?", "运算符");
        insert(":", "运算符");
        insert("=", "运算符");
        insert("+=", "运算符");
        insert("-=", "运算符");
        insert("*=", "运算符");
        insert("/=", "运算符");
        insert("%=", "运算符");
        insert(">>=", "运算符");
        insert("<<=", "运算符");
        insert("&=", "运算符");
        insert("^=", "运算符");
        insert("|=", "运算符");
        insert(",", "运算符");
        insert("#", "特殊字符");
    }

    /**
     * Appends a new entry for lexeme {@code s} with the given token. No check for an
     * existing entry is made; {@link #lookup(String)} reports the first match.
     *
     * @param s     the lexeme text
     * @param token the token associated with the lexeme
     * @return the index of the new entry, i.e. the value {@link #lookup(String)}
     *         returns for {@code s} when it was not stored before
     */
    public int insert(String s, String token) {
        int index = lexmes.length();
        symtable.add(new Symbol(index, token));
        lexmes.append(s).append(EOS);
        return symtable.size() - 1;
    }

    /**
     * Searches the table for lexeme {@code s}.
     *
     * @param s the lexeme to find
     * @return the index of the first entry whose lexeme equals {@code s}, or 0 when
     *         there is none ({@code null} and strings containing {@code '@'} are
     *         never found)
     */
    public int lookup(String s) {
        if (s == null || s.indexOf(EOS) >= 0) {
            return 0;
        }
        // Entry 0 is the placeholder, so the search starts at 1.
        for (int i = 1; i < symtable.size(); i++) {
            if (matchesAt(symtable.get(i).getLexptr(), s)) {
                return i;
            }
        }
        return 0;
    }

    /** Tells whether the lexeme stored at {@code pos} is exactly {@code s}. */
    private boolean matchesAt(int pos, String s) {
        int end = pos + s.length();
        if (pos < 0 || end >= lexmes.length() || lexmes.charAt(end) != EOS) {
            return false;
        }
        for (int k = 0; k < s.length(); k++) {
            if (lexmes.charAt(pos + k) != s.charAt(k)) {
                return false;
            }
        }
        return true;
    }

    /** Small manual check: prints the index of the keyword {@code case}. */
    public static void main(String[] args) {
        SymTable sym = new SymTable();
        System.out.println(sym.lookup("case"));
    }

    /** Returns the live list of entries (not a copy). */
    public ArrayList<Symbol> getSymtable() {
        return symtable;
    }

    /** Replaces the list of entries; the lexeme buffer is left untouched. */
    public void setSymtable(ArrayList<Symbol> symtable) {
        this.symtable = symtable;
    }

    /** One symbol-table entry. */
    public class Symbol {
        /** Start position of the lexeme inside the lexeme buffer. */
        private int lexptr;
        /** Token of the lexeme: {@code "id"} for identifiers, the keyword itself for keywords. */
        private String token;
        /** Extra attribute of the lexeme; set to a single space when not supplied. */
        private String attribute;

        public Symbol(int lexptr, String token) {
            this.lexptr = lexptr;
            this.token = token;
            this.attribute = " ";
        }

        public Symbol(int lexptr, String token, String attribute) {
            this.lexptr = lexptr;
            this.token = token;
            this.attribute = attribute;
        }

        public void setLexptr(int lexptr) {
            this.lexptr = lexptr;
        }

        public int getLexptr() {
            return lexptr;
        }

        public void setToken(String token) {
            this.token = token;
        }

        public String getToken() {
            return token;
        }

        public void setAttribute(String attribute) {
            this.attribute = attribute;
        }

        public String getAttribute() {
            return attribute;
        }
    }
}
分析器的界面就不用再说了吧,无非是一个输入框,一个输出框,一个执行按钮。
刚刚做完就忍不住和同志们分享一下,哈哈,当然还有很多不足。例如注释的识别,还有一些尚未发现的bug。不过感觉还是很好的。哈哈