Lua虛擬機器之語法分析(二)
阿新 • • 發佈:2019-02-07
最近實在是忙,專案就快要釋出了,加班加點在所難免。繼上一篇關於lua基本資料型別的簡單分析後,我將繼續寫閱讀lua虛擬機器的原始碼筆記,這是第一次接觸虛擬機器工作原理,lua虛擬機器程式碼量不大,是個很好的學習的例子,應當堅持下去。關於語法分析,我覺得只需要弄清楚從哪分析,怎麼分析以及最終的生成結果就差不多了。
當通過呼叫lstate.c:lua_newstate()方法生成一個新的lua_State時,其內部又會呼叫llex.c:luaX_init方法,該方法的作用是生成lua的關鍵字(保留字)到global_State的字串表中(stringtable,以雜湊表的形式儲存),關於lua的保留字,在llex.c中可看到下面這段:
而真正讓虛擬機器開始分析原始碼,則是通過lapi.c:lua_load函式,lauxlib.c中則封裝了對lua_load的呼叫,其中包括從檔案載入原始碼的lauxlib.c:luaL_loadfilex,以及從記憶體載入的lauxlib.c:luaL_loadbufferx。在lua_load中,除了必要的初始化,比如生成一個ZIO物件(流物件)負責處理檔案或字串的輸入,還呼叫了ldo.c:luaD_protectedparser函式,而在這個函式中,又最終呼叫了lparser.c:luaY_parser函式,好了,虛擬機器開始進入原始碼的分析階段了,程式碼如下:
(以下這段是上一段所說的llex.c保留字表)
/* ORDER RESERVED */
static const char *const luaX_tokens [] = {
  "and", "break", "do", "else", "elseif",
  "end", "false", "for", "function", "goto", "if",
  "in", "local", "nil", "not", "or", "repeat",
  "return", "then", "true", "until", "while",
  "..", "...", "==", ">=", "<=", "~=", "::", "<eof>",
  "<number>", "<name>", "<string>"
};
關於LexState物件,其責任就是記錄當前的行號、符號、期望的下一個符號、讀取字串或者數字,當然還有很多其他的資訊,具體可以參看llex.h中的定義。而FuncState中,則儲存了函式的原型(Proto),包含的元素,所在的塊,下一個位元組碼的地址,區域性變數個數等,具體定義如下:(lparser.h)
(以下這段是上一段所說的lparser.c:luaY_parser)
Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
                      Dyndata *dyd, const char *name, int firstchar) {
  LexState lexstate;  // 掃描狀態機,單詞讀取、步進等操作
  FuncState funcstate;  // 函式狀態機,每輸入一個原始碼檔案或者是一段原始碼字串,都將有一個狀態機對應
  Closure *cl = luaF_newLclosure(L, 1);  /* create main closure */
  /* anchor closure (to avoid being collected) */
  setclLvalue(L, L->top, cl);
  incr_top(L);
  funcstate.f = cl->l.p = luaF_newproto(L);
  funcstate.f->source = luaS_new(L, name);  /* create and anchor TString */
  lexstate.buff = buff;
  lexstate.dyd = dyd;
  dyd->actvar.n = dyd->gt.n = dyd->label.n = 0;
  luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar);
  mainfunc(&lexstate, &funcstate);
  lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs);
  /* all scopes should be correctly finished */
  lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0);
  return cl;  /* it's on the stack too */  // 返回的閉包,將儲存在lua_State當前的棧頂
}
/*
** State needed to generate code for a given function.
** 'f' is the prototype being filled in (luaY_parser assigns it the result
** of luaF_newproto), and 'prev' chains to the state of the enclosing
** function.
*/
typedef struct FuncState {
  Proto *f;  /* current function header */
  Table *h;  /* table to find (and reuse) elements in 'k' */
  struct FuncState *prev;  /* enclosing function */
  struct LexState *ls;  /* lexical state */
  struct BlockCnt *bl;  /* chain of current blocks */
  int pc;  /* next position to code (equivalent to 'ncode') */
  int lasttarget;  /* 'label' of last 'jump label' */
  int jpc;  /* list of pending jumps to 'pc' */
  int nk;  /* number of elements in 'k' */
  int np;  /* number of elements in 'p' */
  int firstlocal;  /* index of first local var (in Dyndata array) */
  short nlocvars;  /* number of elements in 'f->locvars' */
  lu_byte nactvar;  /* number of active local variables */
  lu_byte nups;  /* number of upvalues */
  lu_byte freereg;  /* first free register; statement() resets it to 'nactvar' after each statement */
} FuncState;
還有幾個比較重要的結構,分別是閉包與原型,定義分別如下:
/*
** Function Prototypes
**
** A Proto holds what the parser produces for one function: its constants,
** bytecode, nested prototypes, and debug information. Each array field has
** a matching 'size...' field.
*/
typedef struct Proto {
  CommonHeader;
  TValue *k;  /* constants used by the function */
  Instruction *code;  /* bytecode array */
  struct Proto **p;  /* functions defined inside the function */
  int *lineinfo;  /* map from opcodes to source lines (debug information) */
  LocVar *locvars;  /* information about local variables (debug information) */
  Upvaldesc *upvalues;  /* upvalue information */
  union Closure *cache;  /* last created closure with this prototype */
  TString *source;  /* used for debug information; luaY_parser sets it from the chunk name */
  int sizeupvalues;  /* size of 'upvalues' */
  int sizek;  /* size of 'k' */
  int sizecode;  /* size of the bytecode array 'code' */
  int sizelineinfo;  /* presumably size of 'lineinfo' -- TODO confirm */
  int sizep;  /* size of 'p' */
  int sizelocvars;  /* size of 'locvars' (the original note reads "number of local variables") */
  int linedefined;  /* presumably first source line of the function definition -- TODO confirm */
  int lastlinedefined;  /* presumably last source line of the function definition -- TODO confirm */
  GCObject *gclist;  /* NOTE(review): looks like a link used by the garbage collector -- confirm */
  lu_byte numparams;  /* number of fixed parameters */
  lu_byte is_vararg;  /* simpleexp() requires this to be nonzero before accepting '...' */
  lu_byte maxstacksize;  /* maximum stack used by this function */
} Proto;
/*
** Lua Upvalues
**
** 'v' always points at the current value. Per the field comments below,
** the union holds the value itself once the upvalue is closed, and the
** links of a doubly linked list while it is open.
*/
typedef struct UpVal {
  CommonHeader;
  TValue *v;  /* points to stack or to its own value */
  union {
    TValue value;  /* the value (when closed) */
    struct {  /* double linked list (when open) */
      struct UpVal *prev;
      struct UpVal *next;
    } l;
  } u;
} UpVal;
/*
** Closures
*/

/*
** Leading fields shared by CClosure and LClosure: the common object
** header, the upvalue count 'nupvalues', and a GCObject pointer 'gclist'.
** The expansion has no trailing semicolon; each use site supplies it.
*/
#define ClosureHeader \
	CommonHeader; lu_byte nupvalues; GCObject *gclist
/*
** Closure over a C function: upvalues are stored by value as TValues.
** The array is declared with one element; presumably the object is
** allocated with room for 'nupvalues' entries -- TODO confirm.
*/
typedef struct CClosure {
  ClosureHeader;
  lua_CFunction f;  /* the C function */
  TValue upvalue[1];  /* list of upvalues */
} CClosure;
/*
** Closure over a Lua function: references its prototype and holds
** pointers to UpVal objects rather than the values themselves.
** The array is declared with one element; presumably the object is
** allocated with room for 'nupvalues' entries -- TODO confirm.
*/
typedef struct LClosure {
  ClosureHeader;
  struct Proto *p;  /* function prototype (luaY_parser stores the new Proto here) */
  UpVal *upvals[1];  /* list of upvalues */
} LClosure;
/*
** A closure is either a C closure or a Lua closure. Both members begin
** with ClosureHeader.
*/
typedef union Closure {
  CClosure c;  /* view as a C closure */
  LClosure l;  /* view as a Lua closure */
} Closure;
UpValue可以簡單地理解為:在外層函式中定義、卻被內層函式引用的區域性變數。如下所示,對f返回的匿名函式而言,a就是upvalue:
-- Returns a closure. The local `a` is declared in f but referenced by the
-- inner function, so `a` is an upvalue of that inner function.
function f()
  local a = 0
  local function inner()
    return a + 1
  end
  return inner
end
還有個概念需要稍微關注,那就是block,直接翻譯成程式碼塊好了,什麼是一個程式碼塊以及怎麼表示一個程式碼塊,如下:
if xxxx then
abcd....  -- 這部分可以算程式碼塊
end
/*
** Definition of a code block.
** Blocks form a chain through 'previous'; FuncState.bl points at the
** current one.
*/
typedef struct BlockCnt {
  struct BlockCnt *previous;  /* chain */
  short firstlabel;  /* index of first label in this block */
  short firstgoto;  /* index of first pending goto in this block */
  lu_byte nactvar;  /* # active locals outside the block */
  lu_byte upval;  /* true if some variable in the block is an upvalue */
  lu_byte isloop;  /* true if 'block' is a loop */
} BlockCnt;
閱讀lparser.c時,還發現有兩種東西需要去理解,一個稱為exp即表示式,一個是statement即語句。表示式可能是簡單的true或false,也可能是-a、not a這種帶一元操作符的,還有可能是a+b這種帶二元操作符的,也還有形如{a=x,b=x}、被稱為構造(constructor)的表示式;複雜的表示式也是由簡單的表示式所組成的。以下是lua中簡單表示式的定義:
/*
** Parse a simple expression starting at the current token and describe
** it in 'v'.
**
** Literal-like cases (number, string, nil, true, false, '...') fill 'v'
** and then fall out of the switch to the shared luaX_next() call, which
** advances past the token. The constructor, function-body and
** suffixed-expression cases return directly and skip that call;
** presumably those helpers consume their own tokens -- TODO confirm.
*/
static void simpleexp (LexState *ls, expdesc *v) {
  /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... |
                  constructor | FUNCTION body | suffixedexp */
  switch (ls->t.token) {
    case TK_NUMBER: {
      /* numeric constant: the value is copied from the token's semantic info */
      init_exp(v, VKNUM, 0);
      v->u.nval = ls->t.seminfo.r;
      break;
    }
    case TK_STRING: {
      /* string constant: codestring() builds the description from the token's string */
      codestring(ls, v, ls->t.seminfo.ts);
      break;
    }
    case TK_NIL: {
      init_exp(v, VNIL, 0);
      break;
    }
    case TK_TRUE: {
      init_exp(v, VTRUE, 0);
      break;
    }
    case TK_FALSE: {
      init_exp(v, VFALSE, 0);
      break;
    }
    case TK_DOTS: {  /* vararg expression '...' (not the '.' field operator) */
      FuncState *fs = ls->fs;
      /* '...' is accepted only when the current function's prototype is flagged vararg */
      check_condition(ls, fs->f->is_vararg,
                      "cannot use " LUA_QL("...") " outside a vararg function");
      /* emit OP_VARARG; the result of luaK_codeABC is stored in the expression description */
      init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0));
      break;
    }
    case '{': {  /* constructor */
      constructor(ls, v);
      return;
    }
    case TK_FUNCTION: {
      luaX_next(ls);  /* advance past the FUNCTION token before parsing the body */
      body(ls, v, 0, ls->linenumber);
      return;
    }
    default: {
      /* anything else is handed to suffixedexp() */
      suffixedexp(ls, v);
      return;
    }
  }
  luaX_next(ls);  /* advance past the token handled by a 'break' case above */
}
而statement則可以看lparser.c:statement()函式,通過該函式反過來又可以對lua的語法有個大概的瞭解與印象加深,如下:
/*
** Parse one statement, dispatching on the current token.
**
** The line number at entry is captured and passed to the helpers that
** take it. After the statement has been handled, the free-register mark
** is reset to the number of active local variables.
** NOTE(review): enterlevel/leavelevel are not visible here; they look
** like a nesting-depth guard -- confirm.
*/
static void statement (LexState *ls) {
  int line = ls->linenumber;  /* may be needed for error messages */
  enterlevel(ls);
  switch (ls->t.token) {
    case ';': {  /* stat -> ';' (empty statement) */
      luaX_next(ls);  /* skip ';' */
      break;
    }
    case TK_IF: {  /* stat -> ifstat */
      ifstat(ls, line);
      break;
    }
    case TK_WHILE: {  /* stat -> whilestat */
      whilestat(ls, line);
      break;
    }
    case TK_DO: {  /* stat -> DO block END */
      luaX_next(ls);  /* skip DO */
      block(ls);
      /* expect the END that closes the DO opened at 'line' */
      check_match(ls, TK_END, TK_DO, line);
      break;
    }
    case TK_FOR: {  /* stat -> forstat */
      forstat(ls, line);
      break;
    }
    case TK_REPEAT: {  /* stat -> repeatstat */
      repeatstat(ls, line);
      break;
    }
    case TK_FUNCTION: {  /* stat -> funcstat */
      funcstat(ls, line);
      break;
    }
    case TK_LOCAL: {  /* stat -> localstat */
      luaX_next(ls);  /* skip LOCAL */
      if (testnext(ls, TK_FUNCTION))  /* local function? */
        localfunc(ls);
      else
        localstat(ls);
      break;
    }
    case TK_DBCOLON: {  /* stat -> label */
      luaX_next(ls);  /* skip double colon */
      labelstat(ls, str_checkname(ls), line);
      break;
    }
    case TK_RETURN: {  /* stat -> retstat */
      luaX_next(ls);  /* skip RETURN */
      retstat(ls);
      break;
    }
    case TK_BREAK:  /* stat -> breakstat */
      /* intentional fallthrough: 'break' shares the gotostat() path below */
    case TK_GOTO: {  /* stat -> 'goto' NAME */
      gotostat(ls, luaK_jump(ls->fs));
      break;
    }
    default: {  /* stat -> func | assignment */
      exprstat(ls);
      break;
    }
  }
  /* invariant: nactvar <= freereg <= maxstacksize */
  lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg &&
             ls->fs->freereg >= ls->fs->nactvar);
  ls->fs->freereg = ls->fs->nactvar;  /* free registers */
  leavelevel(ls);
}
exp也好,statement也好,這種程式碼的組織形式是非常容易掌握的,就不再對case裡的各種exp或者stat進一步分析了,可能還忽略了一點,那就是luaX_next是怎麼保證工作正確的呢,如下:llex.c
/*
** Advance the lexer to the next token.
**
** The current line number is first saved in 'lastline'. A look-ahead
** token, if one is buffered, becomes the current token and the buffer is
** marked empty again; TK_EOS in 'lookahead.token' is the "no look-ahead"
** marker. Otherwise a fresh token is read with llex().
*/
void luaX_next (LexState *ls) {
  ls->lastline = ls->linenumber;
  if (ls->lookahead.token == TK_EOS) {  /* nothing buffered? */
    ls->t.token = llex(ls, &ls->t.seminfo);  /* read next token */
    return;
  }
  ls->t = ls->lookahead;  /* use the buffered token */
  ls->lookahead.token = TK_EOS;  /* and discharge it */
}
llex.c:llex()函式就不再列出了,一個大迴圈加一個大switch,閱讀沒什麼難度。還有最後一個問題,這樣分析的最後儲存的結果是什麼,其實在lparser.c:luaY_parser函式中就已經知道了,那就是FuncState,表示式被翻譯成變數與位元組碼,存入Proto中,而這個Proto由luaY_parser返回的閉包(cl->l.p)所持有。