Lua虚拟机之语法分析(二)

本文是作者在项目发布前的忙碌中抽出时间撰写的Lua虚拟机系列文章的第二篇,主要聚焦于语法分析。通过分析lua虚拟机源代码,特别是luaX_init方法在lstate.c和llex.c中的作用,阐述了如何生成关键字到全局状态的字符串表,为理解Lua虚拟机的工作原理提供了深入见解。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

      最近实在是忙,项目就快要发布了,加班加点在所难免。继上一篇关于lua基本数据类型的简单分析后,我将继续写阅读lua虚拟机的源代码笔记,这是第一次接触虚拟机工作原理,lua虚拟机代码量不大,是个很好的学习的例子,应当坚持下去。关于语法分析,我觉得只需要弄清楚从哪分析,怎么分析以及最终的生成结果就差不多了。

      当通过调用lstate.c:lua_newstate()方法生成一个新的lua_State时,其内部又会调用llex.c:luaX_init方法,该方法的作用是把lua的关键字(保留字)逐个创建成字符串,并存入global_State的字符串表中(stringtable,以哈希表的形式保存)。关于lua的保留字,在llex.c中可看到下面这段:

/*
** Printable spellings of the lexer tokens: the reserved words first,
** then the multi-character operators, then the token-class
** placeholders.
** ORDER RESERVED -- NOTE(review): presumably the entries must stay in
** the same order as the RESERVED token enumeration declared elsewhere;
** confirm before reordering.
*/
static const char *const luaX_tokens [] = {
    /* reserved words */
    "and", "break", "do", "else", "elseif", "end", "false", "for",
    "function", "goto", "if", "in", "local", "nil", "not", "or",
    "repeat", "return", "then", "true", "until", "while",
    /* multi-character operators and punctuation */
    "..", "...", "==", ">=", "<=", "~=", "::",
    /* placeholders for token classes */
    "<eof>", "<number>", "<name>", "<string>"
};
      而真正让虚拟机开始分析源代码,则是通过lapi.c:lua_load函数。lauxlib.c中封装了对lua_load的调用,其中包括从文件加载并分析源码的lauxlib.c:luaL_loadfilex,以及从内存缓冲区加载的lauxlib.c:luaL_loadbufferx。在lua_load中,除了必要的初始化,比如生成一个ZIO对象(流对象)负责处理文件或字符串的输入,还调用了ldo.c:luaD_protectedparser函数,而在这个函数中,又最终调用了lparser.c:luaY_parser函数,好了,虚拟机开始进入源码的分析阶段了,代码如下:

/*
** Parser entry point: compiles the chunk read from 'z' and returns the
** main closure that wraps the resulting function prototype.
**
**   L         - Lua state; the new closure is also pushed on its stack
**   z         - input stream handed to the lexer
**   buff      - scratch buffer stored in the lexer state
**   dyd       - dynamic parser data; its actvar/gt/label counters are
**               reset to 0 before parsing starts
**   name      - chunk name, turned into the prototype's 'source' string
**   firstchar - forwarded to luaX_setinput together with the stream
**
** Returns the new closure. The statement order matters: the closure is
** anchored on the stack before any further object is created.
*/
Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
                      Dyndata *dyd, const char *name, int firstchar) {
  LexState lexstate; // lexer state: token reading, stepping and so on
  FuncState funcstate; // function state; one per source file or source string being compiled
  Closure *cl = luaF_newLclosure(L, 1);  /* create main closure */
  /* anchor closure (to avoid being collected) */
  setclLvalue(L, L->top, cl);
  incr_top(L);
  /* the new prototype is shared by the closure and the function state */
  funcstate.f = cl->l.p = luaF_newproto(L);
  funcstate.f->source = luaS_new(L, name);  /* create and anchor TString */
  lexstate.buff = buff;
  lexstate.dyd = dyd;
  dyd->actvar.n = dyd->gt.n = dyd->label.n = 0;
  luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar);
  mainfunc(&lexstate, &funcstate);  /* parse the whole chunk */
  lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs);
  /* all scopes should be correctly finished */
  lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0);
  return cl;  /* it's on the stack too */ // the returned closure was also pushed on the lua_State stack above
}
      关于LexState对象,其责任就是记录当前的行号、符号、期望的下一个符号、读取字符串或者数字,当然还有很多其他的信息,具体可以参看llex.h中的定义。而FuncState中,则保存了函数的原型(Proto),包含的元素,所在的块,下一个字节码的地址,局部变量个数等,具体定义如下:(lparser.h)

/*
** State needed to generate code for a given function.
** luaY_parser creates one for the main chunk and stores the function's
** prototype in 'f'; 'prev' links to the state of the enclosing function.
*/
typedef struct FuncState {
  Proto *f;  /* current function header */
  Table *h;  /* table to find (and reuse) elements in `k' */
  struct FuncState *prev;  /* enclosing function */
  struct LexState *ls;  /* lexical state */
  struct BlockCnt *bl;  /* chain of current blocks */
  int pc;  /* next position to code (equivalent to `ncode') */
  int lasttarget;   /* 'label' of last 'jump label' */
  int jpc;  /* list of pending jumps to `pc' */
  int nk;  /* number of elements in `k' */
  int np;  /* number of elements in `p' */
  int firstlocal;  /* index of first local var (in Dyndata array) */
  short nlocvars;  /* number of elements in 'f->locvars' */
  lu_byte nactvar;  /* number of active local variables */
  lu_byte nups;  /* number of upvalues */
  lu_byte freereg;  /* first free register */
} FuncState;
      还有几个比较重要的结构,分别是闭包与原型,定义分别如下:

/*
** Function Prototypes
**
** Holds what the parser produces for one function: its constants, its
** bytecode, the prototypes of nested functions, and debug information.
*/
typedef struct Proto {
  CommonHeader;
  TValue *k;  /* constants used by the function */
  Instruction *code; // bytecode array
  struct Proto **p;  /* functions defined inside the function */
  int *lineinfo;  /* map from opcodes to source lines (debug information) */
  LocVar *locvars;  /* information about local variables (debug information) */
  Upvaldesc *upvalues;  /* upvalue information */
  union Closure *cache;  /* last created closure with this prototype */
  TString  *source;  /* used for debug information */
  int sizeupvalues;  /* size of 'upvalues' */
  int sizek;  /* size of `k' */
  int sizecode; // size of the bytecode array
  int sizelineinfo;  /* presumably size of 'lineinfo' -- confirm */
  int sizep;  /* size of `p' */
  int sizelocvars; // size of 'locvars', i.e. number of local-variable entries
  int linedefined;  /* presumably first source line of the definition -- confirm */
  int lastlinedefined;  /* presumably last source line of the definition -- confirm */
  GCObject *gclist;
  lu_byte numparams;  /* number of fixed parameters */
  lu_byte is_vararg;  /* nonzero allows '...' inside the function (checked by simpleexp) */
  lu_byte maxstacksize;  /* maximum stack used by this function */
} Proto;

/*
** Lua Upvalues
**
** 'v' always designates the current value. While the upvalue is open,
** 'v' points into the stack and 'u.l' links it into a doubly linked
** list; once it is closed, 'v' points to the upvalue's own 'u.value'.
*/
typedef struct UpVal {
  CommonHeader;
  TValue *v;  /* points to stack or to its own value */
  union {
    TValue value;  /* the value (when closed) */
    struct {  /* double linked list (when open) */
      struct UpVal *prev;
      struct UpVal *next;
    } l;
  } u;
} UpVal;

/*
** Closures
*/

/*
** Leading fields shared by both closure kinds (CClosure and LClosure):
** the common object header, the upvalue count, and a 'gclist' link.
** Both structs start with this macro, so the fields sit at the same
** place whichever member of the Closure union is used.
*/
#define ClosureHeader \
	CommonHeader; lu_byte nupvalues; GCObject *gclist

/*
** Closure over a C function: the function pointer plus its upvalues,
** which are stored by value inside the closure.
** NOTE(review): 'upvalue' is declared with one element; presumably the
** object is over-allocated to hold 'nupvalues' entries -- confirm
** against the allocation code.
*/
typedef struct CClosure {
  ClosureHeader;
  lua_CFunction f;
  TValue upvalue[1];  /* list of upvalues */
} CClosure;

/*
** Closure over a Lua function: a pointer to the function prototype plus
** pointers to its upvalues (UpVal objects, not inline values).
** NOTE(review): 'upvals' is declared with one element; presumably the
** object is over-allocated to hold 'nupvalues' entries -- confirm
** against the allocation code.
*/
typedef struct LClosure {
  ClosureHeader;
  struct Proto *p;
  UpVal *upvals[1];  /* list of upvalues */
} LClosure;

/*
** A closure is either a C closure ('c') or a Lua closure ('l'); both
** members begin with the same ClosureHeader fields.
*/
typedef union Closure {
  CClosure c;
  LClosure l;
} Closure;
      UpValue可以简单地理解为:在外层函数中定义、却被内层函数引用的局部变量。如下所示,对内层的匿名函数而言,a就是upvalue

-- Returns a closure that captures the local variable 'a' of f.
function f()
    local a = 0;  -- local to f; an upvalue for the inner function
    return function() return a+1 end;  -- the inner function reads 'a' from the enclosing scope
end
      还有个概念需要稍微关注,那就是block,直接翻译成代码块好了,什么是一个代码块以及怎么表示一个代码块,如下:

if   xxxx  then 
   abcd.... // 这部分可以算代码块
end

// Definition of a code block.
// One node per open block; 'previous' chains it to the enclosing block.
typedef struct BlockCnt {
  struct BlockCnt *previous;  /* chain */
  short firstlabel;  /* index of first label in this block */
  short firstgoto;  /* index of first pending goto in this block */
  lu_byte nactvar;  /* # active locals outside the block */
  lu_byte upval;  /* true if some variable in the block is an upvalue */
  lu_byte isloop;  /* true if `block' is a loop */
} BlockCnt;
      阅读lparser.c时,还发现有两种东西需要去理解,一个称为exp即表达式,一个是statement即语句。表达式可能是简单的true或false,也可能是-a、not a这种带一元操作符的,还有可能是a+b这种,也还有形如{a=x,b=x}被称为构造(constructor)的表达式,复杂的表达式,也是由简单的表达式所组成的,以下是lua中简单表达式的定义:

/*
** Parses a simple expression and describes it in 'v'.
**
** Dispatches on the current token. The literal cases (number, string,
** nil, true, false, '...') fill 'v' and 'break', so the luaX_next at
** the bottom skips the token they just handled. The constructor,
** function-body and suffixed-expression cases delegate to another
** parsing routine and 'return' directly, bypassing that luaX_next.
*/
static void simpleexp (LexState *ls, expdesc *v) {
  /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... |
                  constructor | FUNCTION body | suffixedexp */
  switch (ls->t.token) {
    case TK_NUMBER: {
      init_exp(v, VKNUM, 0);
      v->u.nval = ls->t.seminfo.r;  /* numeric value delivered by the lexer */
      break;
    }
    case TK_STRING: {
      codestring(ls, v, ls->t.seminfo.ts);
      break;
    }
    case TK_NIL: {
      init_exp(v, VNIL, 0);
      break;
    }
    case TK_TRUE: {
      init_exp(v, VTRUE, 0);
      break;
    }
    case TK_FALSE: {
      init_exp(v, VFALSE, 0);
      break;
    }
    case TK_DOTS: {  /* vararg */ // the '...' vararg expression, not the field-access dot
      FuncState *fs = ls->fs;
      /* '...' is rejected unless the current function is vararg */
      check_condition(ls, fs->f->is_vararg,
                      "cannot use " LUA_QL("...") " outside a vararg function");
      init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0));
      break;
    }
    case '{': {  /* constructor */
      constructor(ls, v);
      return;
    }
    case TK_FUNCTION: {
      luaX_next(ls);  /* skip the FUNCTION token before parsing the body */
      body(ls, v, 0, ls->linenumber);
      return;
    }
    default: {
      suffixedexp(ls, v);
      return;
    }
  }
  luaX_next(ls);  /* reached only by the literal cases that 'break' */
}
      而statement则可以看lparser.c:statement()函数,通过该函数反过来又可以对lua的语法有个大概的了解与印象加深,如下:

/*
** Parses one statement, dispatching on the current token to the routine
** for that kind of statement; anything that does not start with a
** recognized token is handled as a function call or an assignment.
**
** The line number is captured on entry because it may be needed for
** error messages. The body is bracketed by enterlevel/leavelevel, and
** the free-register mark is reset to the number of active local
** variables once the statement is done.
*/
static void statement (LexState *ls) {
  int line = ls->linenumber;  /* may be needed for error messages */
  enterlevel(ls);
  switch (ls->t.token) {
    case ';': {  /* stat -> ';' (empty statement) */
      luaX_next(ls);  /* skip ';' */
      break;
    }
    case TK_IF: {  /* stat -> ifstat */
      ifstat(ls, line);
      break;
    }
    case TK_WHILE: {  /* stat -> whilestat */
      whilestat(ls, line);
      break;
    }
    case TK_DO: {  /* stat -> DO block END */
      luaX_next(ls);  /* skip DO */
      block(ls);
      check_match(ls, TK_END, TK_DO, line);
      break;
    }
    case TK_FOR: {  /* stat -> forstat */
      forstat(ls, line);
      break;
    }
    case TK_REPEAT: {  /* stat -> repeatstat */
      repeatstat(ls, line);
      break;
    }
    case TK_FUNCTION: {  /* stat -> funcstat */
      funcstat(ls, line);
      break;
    }
    case TK_LOCAL: {  /* stat -> localstat */
      luaX_next(ls);  /* skip LOCAL */
      if (testnext(ls, TK_FUNCTION))  /* local function? */
        localfunc(ls);
      else
        localstat(ls);
      break;
    }
    case TK_DBCOLON: {  /* stat -> label */
      luaX_next(ls);  /* skip double colon */
      labelstat(ls, str_checkname(ls), line);
      break;
    }
    case TK_RETURN: {  /* stat -> retstat */
      luaX_next(ls);  /* skip RETURN */
      retstat(ls);
      break;
    }
    /* intentional fall-through: 'break' and 'goto' share gotostat */
    case TK_BREAK:   /* stat -> breakstat */
    case TK_GOTO: {  /* stat -> 'goto' NAME */
      gotostat(ls, luaK_jump(ls->fs));
      break;
    }
    default: {  /* stat -> func | assignment */
      exprstat(ls);
      break;
    }
  }
  lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg &&
             ls->fs->freereg >= ls->fs->nactvar);
  ls->fs->freereg = ls->fs->nactvar;  /* free registers */
  leavelevel(ls);
}
      exp也好,statement也好,这种代码的组织形式是非常容易掌握的,就不再对case里的各种exp或者stat进一步分析了,可能还忽略了一点,那就是luaX_next是怎么保证工作正确的呢,如下:llex.c

/*
** Advances the lexer to the next token, leaving it in 'ls->t'.
** The current line number is first saved in 'ls->lastline'. A pending
** look-ahead token, if any, becomes the current token and the
** look-ahead slot is reset to TK_EOS (meaning "empty"); otherwise a
** fresh token is scanned with llex.
*/
void luaX_next (LexState *ls) {
  ls->lastline = ls->linenumber;
  if (ls->lookahead.token == TK_EOS) {  /* no look-ahead token pending? */
    ls->t.token = llex(ls, &ls->t.seminfo);  /* scan a new token */
    return;
  }
  ls->t = ls->lookahead;  /* consume the buffered token */
  ls->lookahead.token = TK_EOS;  /* mark the look-ahead slot empty */
}
      llex.c:llex()函数就不再列出了,一个大循环加一个大switch,阅读没什么难度。还有最后一个问题,这样分析的最后保存的结果是什么,其实在lparser.c:luaY_parser函数中就已经知道了,那就是FuncState,表达式被翻译成变量与字节码,存入Proto中,而这个Proto又挂在luaY_parser返回的闭包(Closure)上。








评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值