自己動手實現一個簡單c編譯器
阿新 • • 發佈:2019-01-24
這學期的編譯課程設計需要做一個類c編譯器,準確的說是完善上學期做的大實驗。
上學期的實驗中,使用antlr完成的編譯器能識別的語法很有限,基本上只是個計算器的語法,於是這次決定把語法做得更加完整。
語法支援:
宣告,賦值,函式,if-else,while,for。
首先是詞法分析和語法分析,antlr原始檔如下:
// Combined lexer/parser grammar (ANTLR 3) for a small C-like language.
// The parser builds an AST whose shape is consumed by the tree grammar cTree.
// Supported constructs: variable declarations, assignments, functions,
// if/else, while, for, and function calls.
grammar c;

options {
    output = AST;
    ASTLabelType = CommonTree;
}

// Imaginary tokens used only as AST node labels.
tokens {
    PROG; STAT; IFSTAT; IF; ELSE; WHILESTAT; FORSTAT; DECLAREVAR;
    DECLAREFUNC; CALLFUNC; GIVEVALUE; CALL; FUNC1; FUNC2; FUNC3; FUNC4;
}

ID  : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
    ;

INT : '0'..'9'+
    ;

FLOAT
    : ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    | '.' ('0'..'9')+ EXPONENT?
    | ('0'..'9')+ EXPONENT
    ;

num : INT
    | FLOAT
    ;

num_type
    : 'int'
    | 'float'
    ;

// Line and block comments are routed to the hidden channel.
COMMENT
    : '//' ~('\n'|'\r')* '\r'? '\n' {$channel=HIDDEN;}
    | '/*' ( options {greedy=false;} : . )* '*/' {$channel=HIDDEN;}
    ;

WS  : ('\r'|'\n'|' '|'\t')+ {$channel=HIDDEN;}
    ;

END : ';'
    ;

// A single comparison; the operator becomes the subtree root.
boolexpr
    : expr ('=='^|'!='^|'>'^|'<'^|'>='^|'<='^) expr
    ;

expr
    : multexpr (('+'^ | '-'^) multexpr)*
    ;

multexpr
    : atom (('*'^|'/'^) atom)*
    ;

// A parenthesised expression is replaced by the inner expression's whole
// subtree. (Using a rule reference as a tree root, ^(expr), would copy only
// the root node and drop the operands.)
atom
    : '(' expr ')' -> expr
    | num
    | ID
    | callfunc
    ;

declarevar
    : 'int'^ ID
    | 'float'^ ID
    ;

givevalue
    : ID '=' expr -> ^('=' ID expr)
    ;

// List labels (+=) are required so that every statement of each branch is
// kept in the rewrite; a plain label would retain only the last statement.
ifstat
options { backtrack = true; }
    : 'if' '(' boolexpr ')' '{' s1+=stat* '}' 'else' '{' s2+=stat* '}'
        -> ^(IF boolexpr $s1* ELSE $s2*)
    | 'if' '(' boolexpr ')' '{' stat* '}'
        -> ^(IF boolexpr stat*)
    ;

whilestat
options { backtrack = true; }
    : 'while' '(' boolexpr ')' '{' stat* '}'
        -> ^('while' boolexpr stat*)
    ;

// AST child order: init, condition, body statements, step.
forstat
options { backtrack = true; }
    : 'for' '(' s1=givevalue ';' s2=boolexpr ';' s3=givevalue ')' '{' s4+=stat* '}'
        -> ^('for' $s1 $s2 $s4* $s3)
    ;

// FUNC1: void, no parameters      FUNC2: int/float, no parameters
// FUNC3: void, with parameters    FUNC4: int/float, with parameters
// Value-returning functions must end with a single 'return expr;'.
declarefunc
    : 'void' ID '(' ')' '{' stat* '}'
        -> ^(FUNC1 ID stat*)
    | ('int'|'float') ID '(' ')' '{' stat* 'return' expr END '}'
        -> ^(FUNC2 ID stat* expr)
    | 'void' ID '(' (num_type ID) (',' (num_type ID))* ')' '{' stat* '}'
        -> ^(FUNC3 ID (num_type ID)+ stat*)
    | ('int'|'float') ID '(' (num_type ID) (',' (num_type ID))* ')' '{' stat* 'return' expr END '}'
        -> ^(FUNC4 ID (num_type ID)+ stat* expr)
    ;

callfunc
    : ID '(' ')' -> ^(CALL ID)
    | ID '(' expr (',' expr)* ')' -> ^(CALL ID expr (expr)*)
    ;

// Every statement is wrapped in a node naming its kind.
stat
    : declarevar END -> ^(DECLAREVAR declarevar)
    | givevalue END  -> ^(GIVEVALUE givevalue)
    | ifstat         -> ^(IFSTAT ifstat)
    | forstat        -> ^(FORSTAT forstat)
    | whilestat      -> ^(WHILESTAT whilestat)
    | declarefunc    -> ^(DECLAREFUNC declarefunc)
    | callfunc END   -> ^(CALLFUNC callfunc)
    ;

prog
    : stat* -> ^(PROG stat*)
    ;

fragment
EXPONENT
    : ('e'|'E') ('+'|'-')? ('0'..'9')+
    ;
這個grammar輸出一個ast
輸入為
int b;
void main(){
if(b==0){
b=34;
}
print(b);
}
void test(int a){
print(a);
}
的時候,生成的抽象語法樹如下,
(PROG (DECLAREVAR (int b)) (DECLAREFUNC (FUNC1 main (IFSTAT (IF (== b 0) (GIVEVALUE (= b 34)))) (CALLFUNC (CALL print b)))) (DECLAREFUNC (FUNC3 test int a (CALLFUNC (CALL print a)))))
然後是樹解析器(tree grammar)。這一步把上面的AST轉換為彙編程式碼,主要需要注意函式呼叫前後要平衡堆疊,其他倒沒什麼。為了方便除錯,生成的.s檔案中嵌入了一個print()函式。
主程式,呼叫.g檔案生成的java檔案如下
// Tree grammar (ANTLR 3) that walks the AST produced by grammar c and emits
// NASM assembly on standard output.
//
// Code generation model, as implemented below:
//   * every expression leaves its 32-bit result on the machine stack;
//   * variables and function parameters are global 'dd' slots in .data;
//   * a call with n > 0 arguments pushes the arguments left to right, then
//     the argument count, and removes all of them again after the call;
//   * a function returns its value in eax.
tree grammar cTree;

options {
    tokenVocab = c;
    ASTLabelType = CommonTree;
    output = template;
}

@headers{ }

@members{
    // One counter per construct; each rule takes a fresh id on entry so that
    // nested constructs never share labels.
    int labelindex_if = 0;
    int labelindex_bool = 0;
    int labelindex_while = 0;
    int labelindex_for = 0;
    int labelindex_func = 0;

    // Lines of the .text and .data sections, in emission order.
    java.util.List<String> text = new java.util.ArrayList<String>();
    java.util.List<String> data = new java.util.ArrayList<String>();

    // Fixed tail of the program: calls main, then exits; also contains the
    // built-in print routine, which reads its argument from [rbp+24].
    static final String RUNTIME =
          "call label_main\njmp label_a\n"
        + "label_print:\nxor rcx,rcx\nxor rax,rax\npush rbp\nmov rbp,rsp\nmov rax,[rbp+8+8*2]\n"
        + "label_prog:\nmov rbx,10\ndiv bl\nadd ah,30h\nmov ebx,buffer\nsub ebx,ecx\ndec ebx\nmov [ebx],ah\nmov ah,0\ninc rcx\ncmp rax,0\njnz label_prog\n"
        + "mov ax,4\nmov ebx,1\nmov edx,ecx\nmov ecx,buffer\nsub ecx,edx\nint 80h\nmov rsp,rbp\npop rbp\nret\n"
        + "label_a:\nmov ax,1\nmov ebx,0\nint 0x80\n";

    // Appends one line of assembly to the .text section.
    void emit(String line) {
        text.add(line);
    }

    // Pops the two operands (right operand first), compares them and pushes
    // 1 if 'jump' is taken, otherwise 0. Because the right operand ends up in
    // eax, 'jump' must be the condition expressed as "right ? left".
    void emitCompare(String jump) {
        int id = labelindex_bool++;
        emit("pop rax");
        emit("pop rbx");
        emit("cmp eax,ebx");
        emit(jump + " label_bool_1" + id);
        emit("push 0");
        emit("jmp label_bool_2" + id);
        emit("label_bool_1" + id + ":push 1");
        emit("label_bool_2" + id + ":");
    }

    // Emits a jump over the function body (so straight-line top-level code
    // does not fall into it), the entry label, and the frame prologue.
    void emitFunctionEntry(int id, String name) {
        emit("jmp label_func" + id);
        emit("label_" + name + ":");
        emit("push rbp");
        emit("mov rbp,rsp");
    }

    // Emits the frame epilogue, the return, and the label targeted by the
    // jump emitted in emitFunctionEntry.
    void emitFunctionExit(int id) {
        emit("mov rsp,rbp");
        emit("pop rbp");
        emit("ret");
        emit("label_func" + id + ":");
    }

    // Declares one global slot per parameter and copies the caller's
    // arguments into them. Frame layout after the prologue:
    //   [rbp] saved rbp, [rbp+8] return address, [rbp+16] argument count,
    //   [rbp+24] last argument, ..., [rbp+16+8*n] first argument.
    void emitParameterLoads(java.util.List<String> params) {
        int n = params.size();
        for (int i = 0; i < n; i++) {
            String name = params.get(i);
            data.add(name + " dd 0");
            emit("mov eax,[rbp+" + (16 + 8 * (n - i)) + "]");
            // 32-bit store: the slot is a 4-byte 'dd'.
            emit("mov [" + name + "],eax");
        }
    }
}

boolexpr
    : ^('==' expr expr) { emitCompare("jz"); }
    | ^('!=' expr expr) { emitCompare("jnz"); }
    | ^('>=' expr expr) { emitCompare("jng"); }
    | ^('<=' expr expr) { emitCompare("jnl"); }
    | ^('>'  expr expr) { emitCompare("jl"); }
    | ^('<'  expr expr) { emitCompare("jg"); }
    ;

// Binary operators pop the right operand into ebx and the left into eax.
expr
    : ^('+' expr expr)
      { emit("pop rbx"); emit("pop rax"); emit("add eax,ebx"); emit("push rax"); }
    | ^('-' expr expr)
      { emit("pop rbx"); emit("pop rax"); emit("sub eax,ebx"); emit("push rax"); }
    | ^('*' expr expr)
      { emit("pop rbx"); emit("pop rax"); emit("imul eax,ebx"); emit("push rax"); }
    | ^('/' expr expr)
      // cdq sign-extends eax into edx:eax, the dividend idiv expects.
      { emit("pop rbx"); emit("pop rax"); emit("cdq"); emit("idiv ebx"); emit("push rax"); }
    | INT
      { emit("mov eax," + $INT.text); emit("push rax"); }
    | FLOAT
      // NOTE(review): the literal is emitted verbatim and all arithmetic is
      // integer; confirm whether the assembler accepts it.
      { emit("mov eax," + $FLOAT.text); emit("push rax"); }
    | ID
      { emit("mov eax,[" + $ID.text + "]"); emit("push rax"); }
    | callfunc
      { emit("push rax"); }
    ;

declarevar
    : ^('int' ID)   { data.add($ID.text + " dd 0"); }
    | ^('float' ID) { data.add($ID.text + " dd 0"); }
    ;

givevalue
    : ^('=' ID expr)
      { emit("pop rax"); emit("mov [" + $ID.text + "],eax"); }
    ;

// Handles both AST shapes: ^(IF cond stat*) and ^(IF cond stat* ELSE stat*).
ifstat
@init { int id = labelindex_if++; }
    : ^(IF boolexpr
          { emit("pop rax"); emit("cmp eax,1"); emit("jnz label_if1" + id); }
        stat*
          { emit("jmp label_if2" + id); emit("label_if1" + id + ":"); }
        (ELSE stat*)?
          { emit("label_if2" + id + ":"); }
      )
    ;

whilestat
@init { int id = labelindex_while++; }
    : ^('while'
          { emit("label_while_begin" + id + ":"); }
        boolexpr
          { emit("pop rax"); emit("cmp eax,1"); emit("jnz label_while_end" + id); }
        stat*
          { emit("jmp label_while_begin" + id); emit("label_while_end" + id + ":"); }
      )
    ;

// Children arrive as: init, condition, body statements, step.
forstat
@init { int id = labelindex_for++; }
    : ^('for' givevalue
          { emit("label_for_begin" + id + ":"); }
        boolexpr
          { emit("pop rax"); emit("cmp eax,1"); emit("jnz label_for_end" + id); }
        stat* givevalue
          { emit("jmp label_for_begin" + id); emit("label_for_end" + id + ":"); }
      )
    ;

declarefunc
@init {
    int id = labelindex_func++;
    java.util.List<String> params = new java.util.ArrayList<String>();
}
    : ^(FUNC1 ID
          { emitFunctionEntry(id, $ID.text); }
        stat*
          { emitFunctionExit(id); }
      )
    | ^(FUNC2 ID
          { emitFunctionEntry(id, $ID.text); }
        stat* expr
          { emit("pop rax"); emitFunctionExit(id); }
      )
    | ^(FUNC3 a=ID
          { emitFunctionEntry(id, $a.text); }
        ( ('int'|'float') b=ID { params.add($b.text); } )+
          { emitParameterLoads(params); }
        stat*
          { emitFunctionExit(id); }
      )
    | ^(FUNC4 a=ID
          { emitFunctionEntry(id, $a.text); }
        ( ('int'|'float') b=ID { params.add($b.text); } )+
          { emitParameterLoads(params); }
        stat* expr
          { emit("pop rax"); emitFunctionExit(id); }
      )
    ;

// The argument count is kept in a rule-local variable so that calls nested
// inside argument expressions cannot disturb it.
callfunc
@init { int argc = 0; }
    : ^(CALL ID (expr { argc++; })*)
      {
        if (argc > 0) {
            emit("push " + argc);
        }
        emit("call label_" + $ID.text);
        if (argc > 0) {
            // Drop the arguments and the count pushed above.
            emit("add rsp," + (8 * (argc + 1)));
        }
      }
    ;

stat
    : ^(DECLAREVAR declarevar)
    | ^(GIVEVALUE givevalue)
    | ^(IFSTAT ifstat)
    | ^(FORSTAT forstat)
    | ^(WHILESTAT whilestat)
    | ^(DECLAREFUNC declarefunc)
    | ^(CALLFUNC callfunc)
    ;

// After the whole tree has been walked, print the assembled file.
prog
    : ^(PROG stat*)
      {
        System.out.println("section .data\nt resb 100\nbuffer db 0 ,0,0");
        for (String line : data) {
            System.out.println(line);
        }
        System.out.println("section .text\nglobal _start\n_start:\n");
        for (String line : text) {
            System.out.println(line);
        }
        System.out.println(RUNTIME);
      }
    ;
//cm.java
import java.io.*;
import org.antlr.runtime.ANTLRFileStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.tree.CommonTree;
import org.antlr.runtime.tree.CommonTreeNodeStream;
/**
 * Command-line driver: parses a source file with the generated {@code cLexer}
 * and {@code cParser}, then walks the resulting AST with the generated
 * {@code cTree} walker, which prints assembly to standard output.
 */
public class cm {
    /** Source file used when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "./input.txt";

    /**
     * Compiles one source file.
     *
     * @param args optional; {@code args[0]} is the path of the source file,
     *             defaulting to {@code ./input.txt}
     * @throws Exception if the file cannot be read or recognition fails
     */
    public static void main(String args[]) throws Exception {
        String path = args.length > 0 ? args[0] : DEFAULT_INPUT;

        cLexer lex = new cLexer(new ANTLRFileStream(path));
        CommonTokenStream tokens = new CommonTokenStream(lex);
        cParser par = new cParser(tokens);
        cParser.prog_return ret = par.prog();

        // Do not generate code from a tree built out of error recovery.
        int errors = lex.getNumberOfSyntaxErrors() + par.getNumberOfSyntaxErrors();
        if (errors > 0) {
            System.err.println(path + ": " + errors + " syntax error(s); no code generated");
            System.exit(1);
        }

        CommonTree t = ret.tree;
        // To inspect the AST while debugging, print t.toStringTree() to System.err.

        CommonTreeNodeStream nodes = new CommonTreeNodeStream(t);
        nodes.setTokenStream(tokens);
        cTree walker = new cTree(nodes);
        walker.prog();
    }
}
這樣,就得到了.s檔案。使用nasm生成.o檔案後,再用ld連結生成可執行檔案,即可進行測試。完整的建置步驟如下:
# Generate the lexer and parser sources from the combined grammar.
java org.antlr.Tool c.g
# Generate the tree walker sources from the tree grammar.
java org.antlr.Tool cTree.g
# Compile the generated sources together with the driver cm.java.
javac *.java
# Run the driver; the assembly it prints is redirected into input.s.
java cm > input.s
# Assemble input.s into a 64-bit ELF object file.
nasm -f elf64 input.s
# Link the object file into the executable 'input' (-s strips symbols).
ld -s -o input input.o
github https://github.com/lizhongguo/myc