词法分析的主要任务是把源程序切分成单词(token)序列。关键是要看得懂“状态转换图”,即所谓的“自动机”,因为程序是根据状态转换图来实现的:图理解了,程序就非常好写了。一个状态对应一个case,程序在不同的状态之间跳转;如果能够到达终止状态,则说明输入符合某一条词法规则,程序也就正确地识别了这个单词。
这里举的例子是识别C语言源程序的词法分析器,程序本身也是用C语言写的。用C语言来识别C语言,好像有点不太对劲,这有点像先有鸡还是先有蛋的问题,但这里不去深究,关键是看词法分析的方法和原理。
另外在写程序之前还要对状态转换图进行优化之类的工作,这些工作都是很重要的理论知识的体现,这些工作才是最具有挑战性的工作,写程序只是将最终别人思考的结果展示出来而已,所以我们不能只知其一,不知其二。
还有就是我在网上没有找到比较全面的识别c语言的状态转换图,于是就自己画了一个,很凌乱,等加工之后,再贴出来,这里先贴出来源程序:
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
FILE * fp;//input stream being scanned
int ch;//most recently read character; an int (not a char) so that EOF stays distinct from every byte value
//keywords and preprocessor directive names looked up by search()
char * key[39]={"auto","break","case","char","const","continue","default","do","double",
"else","enum","extern","float","for","goto","if","int","long","register",
"return","short","signed","sizeof","static","struct", "switch","typedef",
"union","unsigned","void","volatile","while","define","endif","ifdef",
"ifndef","include","line","undef"};
int state=0;//current state
int start=0;//start state of the transition diagram being tried
long latter;//file position saved before a match attempt; long to match ftell()/fseek()
char buffer[20];//text of the lexeme being collected
int flag=1;//main keeps scanning while this is nonzero; nexttoken clears it at end of input
//Look chs up in the keyword table.
//Returns the 1-based position of the matching entry, or 0 when chs is not a keyword.
int search(char * chs)
{
    int idx;

    for (idx = 0; idx < 39; idx++)
    {
        if (strcmp(key[idx], chs) == 0)
            return idx + 1;
    }
    return 0;
}
//Tell whether ch may appear in an identifier as a letter.
//Returns 1 for 'a'-'z', 'A'-'Z' and '_', otherwise 0.
int isletter(char ch)
{
    if (ch == '_')
        return 1;
    if (ch >= 'a' && ch <= 'z')
        return 1;
    if (ch >= 'A' && ch <= 'Z')
        return 1;
    return 0;
}
//Tell whether ch is a decimal digit.
//Returns 1 for '0'-'9', otherwise 0.
int isdigit(char ch)
{
    return (ch >= '0' && ch <= '9') ? 1 : 0;
}
//Meant for the case where none of the states can match the input.
//The body is empty: it is a placeholder that currently does nothing.
void recover()
{
}
//如果匹配失败
int fail()
{
int i;
for(i=0;i<20;i++)
buffer[i]='\0';
fseek(fp,latter,0);//将指针回退到匹配之前的位置
switch(start)
{
case 0:
start=3;
break;
case 3:
start=11;
break;
case 11:
start=16;
break;
case 16:
start=19;
break;
case 19:
start=23;
break;
case 23:
start=27;
break;
case 27:
start=60;
break;
default:
//编译错误,所有的状态都不能识别该字符序列
break;
}
return start;
}
//每次执行这个方法就会识别一个词素,要么识别成功,要么识别失败
void nexttoken()
{
int i=0;
int j;
while(1)
{
switch(state)
{
//case 0到2是识别标识符和关键字的状态
case 0:
if((ch=fgetc(fp))==EOF)
{
flag=0;
return;
}
if(ch==' ' || ch=='\n' || ch=='\t' || ch=='\r')
{
state=0;
latter=ftell(fp);
}
else if(isletter(ch))
{
state=1;
buffer[i++]=ch;
}
else
state=fail();
break;
case 1:
if((ch=fgetc(fp))==EOF)
return;
if(isletter(ch) || isdigit(ch))
{
state=1;
buffer[i++]=ch;
}
else
state=2;
break;
case 2:
if(search(buffer))//关键字
printf("(3, %s)\n",buffer);
else//标识符
printf("(1, %s)\n",buffer);
fseek(fp,-1,1);//将文件指针从当前位置回退一位
state=0;
start=0;
for(j=0;j<i;j++)
buffer[j]='\0';
return;
//case 3到10是识别无符号复数的
case 3:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=4;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 4:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=4;
buffer[i++]=ch;
}
else if(ch=='.')
{
state=5;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 5:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=6;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 6:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=6;
buffer[i++]=ch;
}
else if(ch=='E')
{
state=7;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 7:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='+' || ch=='-')
{
state=8;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 8:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=9;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 9:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=9;
buffer[i++]=ch;
}
else
state=10;
break;
case 10:
printf("(2, %s)\n",buffer);
fseek(fp,-1,1);//将文件指针从当前位置回退一位
state=0;
start=0;
for(j=0;j<i;j++)
buffer[j]='\0';
return;
//case 11到15是识别无符号实数的
case 11:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=12;
buffer[i++]=ch;
}
else
state=fail();
break;
case 12:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=12;
buffer[i++]=ch;
}
else if(ch=='.')
{
state=13;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 13:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=14;
buffer[i++]=ch;
}
else
state=fail();
break;
case 14:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=14;
buffer[i++]=ch;
}
else
state=15;
break;
case 15:
printf("(2, %s)\n",buffer);
fseek(fp,-1,1);//将文件指针从当前位置回退一位
state=0;
start=0;
for(j=0;j<i;j++)
buffer[j]='\0';
return;
//case 16到18是识别无符号整数的
case 16:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=17;
buffer[i++]=ch;
}
else
state=fail();
break;
case 17:
if((ch=fgetc(fp))==EOF)
return;
if(isdigit(ch))
{
state=17;
buffer[i++]=ch;
}
else
state=18;
break;
case 18:
printf("(2, %s)\n",buffer);
fseek(fp,-1,1);//将文件指针从当前位置回退一位
state=0;
start=0;
for(j=0;j<i;j++)
buffer[j]='\0';
return;
//case 19到22是识别char型字符常量的
case 19:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='\'')
{
state=20;
buffer[i++]=ch;
}
else
state=fail();
break;
case 20:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='\'')
{
state=22;
buffer[i]=ch;
}
else
{
state=21;
buffer[i++]=ch;
}
break;
case 21:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='\'')
{
state=22;
buffer[i]=ch;
}
else if(ch=='0' || ch=='t' || ch=='n' || ch=='r' || ch=='\"')
{
state=21;
buffer[i++]=ch;
}
else
state=fail();
break;
case 22:
printf("(2, %s)\n",buffer);
state=0;
start=0;
for(j=0;j<=i;j++)
buffer[j]='\0';
return;
//case 23到26是识别字符串常量的
case 23:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='\"')
{
state=24;
buffer[i++]=ch;
}
else
{
state=fail();
i=0;
}
break;
case 24:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='\"')
{
state=26;
buffer[i]=ch;
}
else
{
state=25;
buffer[i++]=ch;
}
break;
case 25:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='\"')
{
state=26;
buffer[i]=ch;
}
else
{
state=25;
buffer[i++]=ch;
}
break;
case 26:
printf("(2, %s)\n",buffer);
state=0;
start=0;
for(j=0;j<=i;j++)
buffer[j]='\0';
return;
//case 27到59是识别各种运算符的
case 27:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='=')
{
state=28;
buffer[i++]=ch;
}
else if(ch=='+')
{
state=31;
buffer[i++]=ch;
}
else if(ch=='-')
{
state=34;
buffer[i++]=ch;
}
else if(ch=='*')
{
state=37;
buffer[i++]=ch;
}
else if(ch=='/')
{
state=38;
buffer[i++]=ch;
}
else if(ch=='%')
{
state=42;
buffer[i++]=ch;
}
else if(ch=='<')
{
state=43;
buffer[i++]=ch;
}
else if(ch=='>')
{
state=47;
buffer[i++]=ch;
}
else if(ch=='&')
{
state=50;
buffer[i++]=ch;
}
else if(ch=='|')
{
state=54;
buffer[i++]=ch;
}
else if(ch=='!')
{
state=57;
buffer[i++]=ch;
}
else
state=fail();
break;
case 28:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='=')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
case 29:
printf("(4, %s)\n",buffer);
fseek(fp,-1,1);//将文件指针从当前位置回退一位
state=0;
start=0;
for(j=0;j<20;j++)
buffer[j]='\0';
return;
case 30:
printf("(4, %s)\n",buffer);
state=0;
start=0;
for(j=0;j<20;j++)
buffer[j]='\0';
return;
case 31:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='+')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
case 34:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='-')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
case 37:
printf("(4, %s)\n",buffer);
state=0;
start=0;
for(j=0;j<20;j++)
buffer[j]='\0';
return;
case 38:
if((ch=fgetc(fp))==EOF)
return;
//过滤单行注释
if(ch=='/')
{
while((ch=fgetc(fp))!='\n')
{}
latter=ftell(fp);
state=0;
start=0;
i=0;
}
/*过滤多行注释*/
else if(ch=='*')
{
char temp;
do
{
temp=fgetc(fp);
ch=fgetc(fp);
if(temp==EOF || ch==EOF)
break;
fseek(fp,-1,1);
}while(temp!='*' || ch!='/');
if(temp=='*' && ch=='/')
fseek(fp,1,1);
latter=ftell(fp);
state=0;
start=0;
i=0;
}
else
state=29;
break;
case 42:
printf("(4, %s)\n",buffer);
state=0;
start=0;
for(j=0;j<20;j++)
buffer[j]='\0';
return;
case 43:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='=' || ch=='>')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
case 47:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='=')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
case 50:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='&')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
case 54:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='|')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
case 57:
if((ch=fgetc(fp))==EOF)
return;
if(ch=='=')
{
state=30;
buffer[i++]=ch;
}
else
state=29;
break;
//case 60到68是识别界符的
case 60:
if((ch=fgetc(fp))==EOF)
return;
if(ch==':' || ch==',' || ch==';' || ch=='(' || ch==')' || ch=='{' || ch=='}' || ch=='[' || ch==']' || ch=='#')
{
state=61;
buffer[i++]=ch;
}
else
{
printf("sysnax error: %s\n",buffer);
}
break;
case 61:
printf("(5, %s)\n",buffer);
state=0;
start=0;
for(j=0;j<20;j++)
buffer[j]='\0';
return;
default:
return;
break;
}
}
}
/* Scan the file "test.c" in the current directory and print one line per
 * recognised lexeme. Returns 0 on success and EXIT_FAILURE when the file
 * cannot be opened. */
int main(void)
{
    fp = fopen("test.c", "r");
    if (fp == NULL) {
        perror("test.c");
        return EXIT_FAILURE;
    }

    while (flag) {
        // Remember where this lexeme starts so that a failed match can
        // rewind here and retry with the next transition diagram.
        latter = ftell(fp);
        nexttoken();
    }

    fclose(fp);
    return 0;
}
输出示例如下:
1、2、3、4、5表示的是单词的类别:1为标识符,2为常量(数字、字符和字符串),3为关键字,4为运算符,5为界符。