1. 程式人生 > >自己動手實現一個簡單c編譯器

自己動手實現一個簡單c編譯器

這學期的編譯課程設計需要做一個類c編譯器,準確的說是完善上學期做的大實驗。

上學期的實驗中,使用antlr完成的編譯器能識別的語法很有限,基本上只是個計算器的語法,於是這次決定做一個語法更加完整的編譯器。

語法支援:

宣告,賦值,函式,if-else,while,for。

首先是詞法分析和語法分析,antlr原始檔如下:

// Combined lexer/parser grammar named "c". The parser rules below build an AST
// whose nodes are CommonTree instances.
grammar c;
options{
    output=AST;
    ASTLabelType=CommonTree;
}

// Imaginary token types; the parser rules use several of them as root nodes
// in their tree rewrites (for example PROG, IF, ELSE, CALL, FUNC1..FUNC4).
tokens{
PROG;STAT;IFSTAT;IF;ELSE;WHILESTAT;FORSTAT;DECLAREVAR;DECLAREFUNC;CALLFUNC;GIVEVALUE;CALL;FUNC1;FUNC2;FUNC3;FUNC4;
}

// Identifier: a letter or underscore followed by letters, digits or underscores.
ID  :	('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
    ;

// Integer literal: one or more decimal digits.
INT :	'0'..'9'+
    ;

// Floating-point literal in three shapes: digits '.' optional digits, '.' digits,
// or digits followed by a mandatory exponent.
FLOAT 
    :   ('0'..'9')+ '.' ('0'..'9')* EXPONENT?  
    |   '.' ('0'..'9')+ EXPONENT?
    |   ('0'..'9')+ EXPONENT 
    ;

// A numeric literal, either integer or floating point.
num	:	INT
	|	FLOAT
;

// Type keyword accepted for function parameters.
num_type	:	'int'
	|	'float'
	;

// Line comments and block comments; both are routed to the hidden channel.
// The line-comment form requires a terminating newline.
COMMENT
    :   '//' ~('\n'|'\r')* '\r'? '\n' {$channel=HIDDEN;}
    |   '/*' ( options {greedy=false;} : . )* '*/' {$channel=HIDDEN;}
    ;

// Whitespace is routed to the hidden channel as well.
WS  :   ('\r'|'\n'|' '|'\t')+{$channel=HIDDEN;};
// Statement terminator.
END	:	';'
	;
	
// A single comparison between two arithmetic expressions; the operator becomes the AST root.
boolexpr	:	expr	('=='^|'!='^|'>'^|'<'^|'>='^|'<='^)	expr
	;
// Additive level: '+' and '-' become roots, giving a left-leaning tree.
expr	:	multexpr	(('+'^	|	'-'^)	multexpr)*
;

// Multiplicative level: '*' and '/' bind tighter than the additive operators.
multexpr	:	atom	(('*'^|'/'^)	atom)*
;

// Operand: a parenthesised expression (the parentheses are dropped from the tree),
// a numeric literal, a variable name or a function call.
atom	:	'('	expr	')'     ->      ^(expr)
	|	num
	|	ID
	|	callfunc
	;
	
// Variable declaration; the type keyword becomes the root with the name as its child.
declarevar	:	'int'^	ID
	|	'float'^	ID
	;
	
	
// Assignment; rewritten to a tree rooted at '=' with the target name and the value expression.
givevalue	:	ID 	'='	expr
	->	^('='	ID	expr)
	;
	
// if statement, with or without an else branch.
// The branch bodies are collected with list labels (+=) and emitted with the
// $label* form so that every statement of each branch ends up in the tree and
// an empty branch is accepted. A plain "s1=stat*" label only remembers the
// last statement matched by the loop.
ifstat	options{ backtrack=true;  }
	:	'if'	'('	boolexpr	')'	'{'	s1+=stat*	'}'	'else'	'{'	s2+=stat*	'}'
	->	^(IF	boolexpr	$s1*	ELSE	$s2*)
	|	'if'	'('	boolexpr	')'	'{'	stat*	'}'
	->	^(IF	boolexpr	stat*)
	;
	
// while loop; the tree is rooted at the 'while' token with the condition followed by the body statements.
whilestat	options{ backtrack=true;  }
	:	'while'	'('	boolexpr	')'	'{'	stat*	'}'
	->	^('while'	boolexpr	stat*)	
	;

// for loop: init assignment, condition, step assignment, body.
// The tree places the body before the step assignment: init, condition, body, step.
// The body uses a list label (+=) and the $s4* form so that all body statements
// are kept and an empty body is accepted; a plain "s4=stat*" label only
// remembers the last statement matched by the loop.
forstat	options{ backtrack=true;  }
	:	'for'	'('	s1=givevalue	';'	s2=boolexpr	';'	s3=givevalue	')'	'{'	s4+=stat*	'}'
	->	^('for'	$s1	$s2	$s4*     $s3)
	;

// Function definition, in four shapes that map to four imaginary root tokens:
//   FUNC1: void, no parameters            FUNC2: int/float, no parameters
//   FUNC3: void, one or more parameters   FUNC4: int/float, one or more parameters
// The non-void forms require a single trailing 'return expr ;' as the last item of the body.
declarefunc
	:	'void'	ID	'('	')'	'{'	stat*	'}'	->	^(FUNC1 ID	stat*)
	|	('int'|'float')	ID	'('	')'	'{'	stat*	'return'	expr	END	'}'	->	^(FUNC2	ID	stat*	expr)
	|	'void'	ID	'('	(num_type	ID)	(','	(num_type	ID))*	')'	'{'	stat*	'}'	->
		^(FUNC3	ID	(num_type	ID)+	stat*)
	|	('int'|'float')	ID	'('	(num_type	ID)	(','	(num_type	ID))*	')'	'{'	stat*	'return'	expr	END	'}'	
	->	^(FUNC4	ID	(num_type	ID)+	stat*	expr)
	;	

// Function call with zero or more comma-separated argument expressions;
// rewritten to a CALL node holding the function name followed by the arguments.
callfunc	:	ID	'('	')'	->	^(CALL	ID)
	|	ID	'('	expr	(','	expr)*	')'	->	^(CALL	ID	expr	(expr)*)
	;

// A statement. Each kind is wrapped in its own imaginary root token so the
// tree grammar can dispatch on the node type.
stat	:	declarevar	END	->	^(DECLAREVAR	declarevar)
	|	givevalue	END->	^(GIVEVALUE	givevalue)
	|	ifstat	->	^(IFSTAT	ifstat)
	|	forstat	->	^(FORSTAT	forstat)
	|	whilestat	->	^(WHILESTAT	whilestat)
	|	declarefunc	->	^(DECLAREFUNC	declarefunc)
	|	callfunc	END	->	^(CALLFUNC	callfunc)
	;

// Start rule: a sequence of statements under a single PROG root.
prog	:	stat*	->	^(PROG	stat*)
	;

// Exponent suffix of a floating-point literal; a fragment, so it is only matched as part of FLOAT.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;


這個grammar輸出一個ast

輸入為

int b;
void main(){
if(b==0){
b=34;
}
print(b);
}
void test(int a){
print(a);
}
的時候,
生成的抽象語法樹如下,

(PROG (DECLAREVAR (int b)) (DECLAREFUNC (FUNC1 main (IFSTAT (IF (== b 0) (GIVEVALUE (= b 34)))) (CALLFUNC (CALL print b)))) (DECLAREFUNC (FUNC3 test int a (CALLFUNC (CALL print a)))))

然後是樹解析器,這一步就是把上面的ast轉變為彙編,主要是注意函式呼叫需要平衡堆疊,其他倒沒什麼,為了方便除錯,生成.s檔案中嵌入了一個print()函式

// Tree grammar that walks the AST produced by grammar "c" (token types are
// taken from it via tokenVocab) and collects assembly text in the buffers below.
tree grammar cTree;
options{
tokenVocab=c;
ASTLabelType=CommonTree;
output=template;
}

@headers{

}

// State shared by all rule actions.
//   text / index       : emitted code lines and the next free slot
//   data / data_index  : emitted data-section lines and the next free slot
//   labelindex*        : counters appended to generated label names
//   num                : scratch counter used by the call and function rules
// Both buffers are fixed at 1000 entries; nothing in this grammar checks the bound.
@members{
int	index=0;
int     data_index=0;
int 	labelindex=0;
int 	labelindex_if=0;
int     num=0;
int     labelindex_bool=0;
int     labelindex_while=0;
int     labelindex_for=0;
int     labelindex_func=0;
String  []text  =       new     String[1000];
String  []data  =       new     String[1000];
}

// Comparison. Both operand values are already on the stack when the action runs.
// The emitted code pops the right operand into rax and the left operand into rbx,
// compares them and pushes 1 when the comparison holds, 0 otherwise.
// Because the compare is "right versus left", the conditional jumps are the
// mirrored ones (for example jng is used for '>=').
boolexpr
	:	^('=='	s1=expr	s2=expr)
		{
			text[index++]="pop rax";
			text[index++]="pop rbx";
			text[index++]="cmp eax,ebx";
			text[index++]="jz label_bool_1"+labelindex_bool;
			text[index++]="push 0";
			text[index++]="jmp label_bool_2"+labelindex_bool;
			text[index++]="label_bool_1"+labelindex_bool+":push 1";
			text[index++]="label_bool_2"+labelindex_bool+":";
			labelindex_bool++;
		}
	|	^('!='	s1=expr	s2=expr)
		{
			text[index++]="pop rax";
			text[index++]="pop rbx";
			text[index++]="cmp eax,ebx";
			text[index++]="jnz label_bool_1"+labelindex_bool;
			text[index++]="push 0";
			text[index++]="jmp label_bool_2"+labelindex_bool;
			text[index++]="label_bool_1"+labelindex_bool+":push 1";
			text[index++]="label_bool_2"+labelindex_bool+":";
			labelindex_bool++;
		}
	|	^('>='	s1=expr	s2=expr)
		{
			text[index++]="pop rax";
			text[index++]="pop rbx";
			text[index++]="cmp eax,ebx";
			text[index++]="jng label_bool_1"+labelindex_bool;
			text[index++]="push 0";
			text[index++]="jmp label_bool_2"+labelindex_bool;
			text[index++]="label_bool_1"+labelindex_bool+":push 1";
			text[index++]="label_bool_2"+labelindex_bool+":";
			labelindex_bool++;
		}
	|	^('<='	s1=expr	s2=expr)
		{
			text[index++]="pop rax";
			text[index++]="pop rbx";
			text[index++]="cmp eax,ebx";
			text[index++]="jnl label_bool_1"+labelindex_bool;
			text[index++]="push 0";
			text[index++]="jmp label_bool_2"+labelindex_bool;
			text[index++]="label_bool_1"+labelindex_bool+":push 1";
			text[index++]="label_bool_2"+labelindex_bool+":";
			labelindex_bool++;
		}
	|	^('>'	s1=expr	s2=expr)
		{
			text[index++]="pop rax";
			text[index++]="pop rbx";
			text[index++]="cmp eax,ebx";
			text[index++]="jl label_bool_1"+labelindex_bool;
			text[index++]="push 0";
			text[index++]="jmp label_bool_2"+labelindex_bool;
			text[index++]="label_bool_1"+labelindex_bool+":push 1";
			text[index++]="label_bool_2"+labelindex_bool+":";
			labelindex_bool++;
		}
	|	^('<'	s1=expr	s2=expr)
		{
			text[index++]="pop rax";
			text[index++]="pop rbx";
			text[index++]="cmp eax,ebx";
			text[index++]="jg label_bool_1"+labelindex_bool;
			text[index++]="push 0";
			text[index++]="jmp label_bool_2"+labelindex_bool;
			text[index++]="label_bool_1"+labelindex_bool+":push 1";
			text[index++]="label_bool_2"+labelindex_bool+":";
			labelindex_bool++;
		}
	;
// Arithmetic expression. Every alternative leaves exactly one value on the stack.
// For binary operators the right operand is popped into rbx and the left into rax.
// Multiplication uses the two-operand imul form, and division sign-extends eax
// into edx:eax with cdq before idiv; x86 mul and div accept a single operand only,
// so "mul eax,ebx" and "div eax,ebx" do not assemble.
// NOTE(review): FLOAT literals are loaded with a plain "mov eax,<literal>";
// confirm whether the assembler accepts that form before relying on float support.
expr	:	^('+'	s1=expr	s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="add eax,ebx";text[index++]="push rax";}
	|	^('-'	s1=expr	s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="sub eax,ebx";text[index++]="push rax";}
	|	^('*'	s1=expr	s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="imul eax,ebx";text[index++]="push rax";}
	|	^('/'	s1=expr	s2=expr){text[index++]="pop rbx";text[index++]="pop rax";text[index++]="cdq";text[index++]="idiv ebx";text[index++]="push rax";}
	|	INT{text[index++]="mov eax,"+$INT+"";text[index++]="push rax";}
	|	FLOAT{text[index++]="mov eax,"+$FLOAT+"";text[index++]=" push rax";}
	|	ID{text[index++]="mov eax,["+$ID+"]";text[index++]="push rax";}
	|	callfunc{text[index++]="push rax";}
	;

	
// Variable declaration: adds a zero-initialised "dd" entry named after the
// variable to the data buffer. Both int and float get the same entry.
declarevar	:	^('int'	ID){data[data_index++]=$ID+" dd	 0";}
	|	^('float'	ID){data[data_index++]=$ID+" dd	 0";}
	;
	
// Assignment: the value left on the stack by expr is popped and its low
// 32 bits are stored into the variable.
givevalue	:	^('='	ID	expr){text[index++]="pop rax";text[index++]="mov ["+$ID+"],eax";}
	;
	
// if / if-else. Emitted layout:
//     <condition>  pop rax / cmp eax,1 / jnz label_if1N
//     <then statements>
//   with else:     jmp label_if2N / label_if1N: / <else statements> / label_if2N:
//   without else:  label_if1N:
// The label number N is reserved in the init action, before the body is walked,
// so an if nested inside either branch gets its own number and every jump
// matches the label emitted for it. A single alternative with an optional ELSE
// part replaces the two backtracked alternatives.
ifstat
@init{
	int ifId=labelindex_if++;
	boolean hasElse=false;
}
	:	^(IF	boolexpr{text[index++]="pop rax";text[index++]="cmp eax,1";text[index++]="jnz label_if1"+ifId;}
		stat*
		(ELSE{hasElse=true;text[index++]="jmp label_if2"+ifId;text[index++]="label_if1"+ifId+":";}
		 stat*)?
		{text[index++]=(hasElse?"label_if2":"label_if1")+ifId+":";})
	;
	
	
// while loop. Emitted layout:
//     label_while_beginN:
//     <condition>  pop rax / cmp eax,1 / jnz label_while_endN
//     <body>
//     jmp label_while_beginN
//     label_while_endN:
// The label number N is reserved in the init action, before the body is walked,
// so a loop nested in the body gets its own number. The begin and end prefixes
// cannot produce the same name for different numbers.
whilestat
@init{
	int loopId=labelindex_while++;
}
	:	^('while'{text[index++]="label_while_begin"+loopId+":";}
		boolexpr{text[index++]="pop rax";text[index++]="cmp eax,1";text[index++]="jnz label_while_end"+loopId;}
		stat*{text[index++]="jmp label_while_begin"+loopId;text[index++]="label_while_end"+loopId+":";})
	;

// for loop. The tree order is init, condition, body, step. Emitted layout:
//     <init assignment>
//     label_for_beginN:
//     <condition>  pop rax / cmp eax,1 / jnz label_for_endN
//     <body>
//     <step assignment>
//     jmp label_for_beginN
//     label_for_endN:
// The label number N is reserved in the init action, before the body is walked,
// so a loop nested in the body gets its own number. The begin and end prefixes
// cannot produce the same name for different numbers.
forstat
@init{
	int loopId=labelindex_for++;
}
	:	^('for'	s1=givevalue{text[index++]="label_for_begin"+loopId+":";}
		s2=boolexpr{text[index++]="pop rax";text[index++]="cmp eax,1";text[index++]="jnz label_for_end"+loopId;}
		s4=stat*
		s3=givevalue{text[index++]="jmp label_for_begin"+loopId;text[index++]="label_for_end"+loopId+":";})
	;

// Function definition. The body is emitted inline, so it is bracketed by a
// jump over it:
//     jmp label_funcN
//     label_<name>:  push rbp / mov rbp,rsp
//     <parameter loads>  <body>  [pop rax for the return value]
//     mov rsp,rbp / pop rbp / ret
//     label_funcN:
// All four alternatives use the same label scheme and the same counter. N is
// reserved in the init action so a definition nested in the body cannot reuse it.
//
// Parameters: the call rule pushes the arguments left to right, then the
// argument count, then executes call. After the prologue the count is at
// [rbp+16] and the last argument at [rbp+24], so with n parameters the
// parameter at zero-based position i is at [rbp+16+8*(n-i)]. The loads are
// emitted once all parameter names are known. Each parameter is copied into a
// 4-byte "dd" variable, so only a 32-bit register is stored.
// The prologue is emitted once per function, not once per parameter.
declarefunc
@init{
	int funcId=labelindex_func++;
	java.util.List<String> params=new java.util.ArrayList<String>();
}
	:	^(FUNC1
		ID	{text[index++]="jmp label_func"+funcId;text[index++]="label_"+$ID+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
		stat*	{text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
	|	^(FUNC2
		ID	{text[index++]="jmp label_func"+funcId;text[index++]="label_"+$ID+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
		stat*
		expr	{text[index++]="pop rax";text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
	|	^(FUNC3
		a=ID	{text[index++]="jmp label_func"+funcId;text[index++]="label_"+$a+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
		(('int'|'float')	b=ID	{params.add($b+"");data[data_index++]=$b+" dd 0";})+
		{
			for(int i=0;i<params.size();i++){
				text[index++]="mov eax,[rbp+"+(16+8*(params.size()-i))+"]";
				text[index++]="mov ["+params.get(i)+"],eax";
			}
		}
		stat*	{text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
	|	^(FUNC4
		a=ID	{text[index++]="jmp label_func"+funcId;text[index++]="label_"+$a+":";text[index++]="push rbp";text[index++]="mov rbp,rsp";}
		(('int'|'float')	b=ID	{params.add($b+"");data[data_index++]=$b+" dd 0";})+
		{
			for(int i=0;i<params.size();i++){
				text[index++]="mov eax,[rbp+"+(16+8*(params.size()-i))+"]";
				text[index++]="mov ["+params.get(i)+"],eax";
			}
		}
		stat*
		expr	{text[index++]="pop rax";text[index++]="mov rsp,rbp";text[index++]="pop rbp";text[index++]="ret";text[index++]="label_func"+funcId+":";})
	;

// Function call. Each argument expression has already pushed its value, left
// to right. For a call with arguments the emitted code then pushes the argument
// count, calls the function and finally releases the count and the arguments
// with a single "add rsp", so repeated calls do not grow the stack.
// The arguments are counted in a rule-local variable: an argument that is
// itself a call would otherwise reset the shared counter while the outer call
// is still counting. The shared counter is still updated afterwards, for any
// action that reads it.
callfunc
@init{
	int argc=0;
}
	:	^(CALL	ID){text[index++]="call label_"+$ID+"";}
	|	^(CALL	ID	(expr{argc++;})+)
		{
			num=argc;
			text[index++]="push "+argc;
			text[index++]="call label_"+$ID+"";
			text[index++]="add rsp,"+(8*(argc+1));
		}
	;

// Statement dispatch: each wrapper node produced by the parser is unwrapped
// and handed to the rule for that statement kind.
stat	:	^(DECLAREVAR	declarevar)
	|	^(GIVEVALUE	givevalue)
	|	^(IFSTAT	ifstat)
	|	^(FORSTAT	forstat)
	|	^(WHILESTAT	whilestat)
	|	^(DECLAREFUNC	declarefunc)
	|	^(CALLFUNC	callfunc)
	;

// Start rule. After the whole tree has been walked it prints the assembly file
// to standard output: the data section (fixed scratch entries followed by the
// collected variables), then the code section with the collected code, a call
// to label_main, the built-in print routine and the exit sequence.
// The syscall numbers are loaded with "mov eax,..." so the whole register is
// set; "mov ax,..." would leave the upper bits holding whatever the program
// computed last.
// NOTE(review): the print routine divides with "div bl", an 8-bit divide of AX
// whose quotient must fit in AL; confirm the value range it needs to support.
prog	:	^(PROG	stat*)
        {
                System.out.println("section .data\nt resb 100\nbuffer db 0 ,0,0");
                for(int i=0;i<data_index;i++){
                        System.out.println(data[i]);
                }
                System.out.println("section .text\nglobal _start\n_start:\n");
                for(int i=0;i<index;i++){
                        System.out.println(text[i]);
                }
                System.out.println(
                        "call label_main\n"
                        +"jmp label_a\n"
                        +"label_print:\n"
                        +"xor rcx,rcx\n"
                        +"xor rax,rax\n"
                        +"push rbp\n"
                        +"mov rbp,rsp\n"
                        +"mov rax,[rbp+8+8*2]\n"
                        +"label_prog:\n"
                        +"mov rbx,10\n"
                        +"div bl\n"
                        +"add ah,30h\n"
                        +"mov ebx,buffer\n"
                        +"sub ebx,ecx\n"
                        +"dec ebx\n"
                        +"mov [ebx],ah\n"
                        +"mov ah,0\n"
                        +"inc rcx\n"
                        +"cmp rax,0\n"
                        +"jnz label_prog\n"
                        +"mov eax,4\n"
                        +"mov ebx,1\n"
                        +"mov edx,ecx\n"
                        +"mov ecx,buffer\n"
                        +"sub ecx,edx\n"
                        +"int 80h\n"
                        +"mov rsp,rbp\n"
                        +"pop rbp\n"
                        +"ret\n"
                        +"label_a:\n"
                        +"mov eax,1\n"
                        +"mov ebx,0\n"
                        +"int 0x80\n");
        }
	;
主程式,呼叫.g檔案生成的java檔案如下
//cm.java
import java.io.*;
import org.antlr.runtime.ANTLRFileStream;
import org.antlr.runtime.CommonTokenStream;
import org.antlr.runtime.tree.CommonTree;
import org.antlr.runtime.tree.CommonTreeNodeStream;


/**
 * Command-line driver: lexes and parses a source file with the classes
 * generated from the grammar, then runs the generated tree walker, which
 * prints the resulting assembly to standard output.
 */
public class cm {

    /** Source file compiled when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "./input.txt";

    /**
     * Compiles one source file.
     *
     * @param args optional; {@code args[0]} is the path of the source file,
     *             defaulting to {@code ./input.txt} when absent
     * @throws Exception if the file cannot be read or recognition fails
     */
    public static void main(String args[]) throws Exception {
        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT;

        cLexer lex = new cLexer(new ANTLRFileStream(inputPath));
        CommonTokenStream tokens = new CommonTokenStream(lex);
        cParser par = new cParser(tokens);

        cParser.prog_return ret = par.prog();

        // Stop before code generation when the input did not parse cleanly;
        // otherwise assembly would be produced from an incomplete tree.
        int syntaxErrors = lex.getNumberOfSyntaxErrors() + par.getNumberOfSyntaxErrors();
        if (syntaxErrors > 0) {
            System.err.println(inputPath + ": " + syntaxErrors + " syntax error(s), no output generated");
            System.exit(1);
        }

        CommonTree t = ret.tree;
        CommonTreeNodeStream nodes = new CommonTreeNodeStream(t);
        nodes.setTokenStream(tokens);

        cTree walker = new cTree(nodes);
        walker.prog();
    }
}
這樣,就得到了.s檔案,使用nasm生成.o檔案後,生成可執行檔案,測試
# Generate the recognizers from the two grammar files.
java org.antlr.Tool c.g
java org.antlr.Tool cTree.g
# Compile the generated sources together with the driver.
javac *.java
# Run the driver; the assembly it prints is captured in input.s.
java cm > input.s
# Assemble to a 64-bit ELF object, then link a stripped executable.
nasm -f elf64 input.s
ld -s -o input input.o



github          https://github.com/lizhongguo/myc