1. 程式人生 > >編譯原理之手工構造C語言詞法分析器

編譯原理之手工構造C語言詞法分析器

編寫一個(C語言)詞法分析器:

需求是:1. 對原始資料進行預處理,刪掉註釋(為了展示方便,這裡不刪除換行符和製表符,本來應該一併刪掉);

2. 將詞法正確的 token 分解出來,一共有 5 類:識別符號、關鍵字、常數、界符、運算子;並且在原始碼的 token 之間都加上空格;

3. 對於詞法不正確的 token 進行提示,表示詞法不正確,例如格式不正確的浮點數:.11、0.23.34 等。

程式設計環境是:webstorm和nodejs

關於思路可以參考我的上一篇部落格,在原來的基礎上有了一些改進;

這是預處理部分的DFA:


這是處理部分的DFA:


最後這是關於詞法分析器部分的原始碼:

const fs = require("fs");
// NOTE(review): `redline` is not referenced by any code visible in this file.
const redline = require("readline");
// Read the C source file to analyse (path is relative to the working
// directory) and keep both the raw data and its string form.
var data = fs.readFileSync("test2.txt");
var tem = data.toString();
// Reserved words of C that the lexer reports as keywords rather than
// identifiers ("union" was previously misspelled as "unino").
const keys = [
    "auto", "break", "case", "switch", "char", "const", "continue", "default", "do", "while",
    "double", "else", "if", "enum", "extern", "float", "for", "goto", "int", "long", "register", "return", "short",
    "signed", "sizeof", "static", "struct", "typedef", "union", "unsigned", "void", "volatile",
];
// Letters allowed in identifiers, kept as one plain string (no regular expression).
const charArray = "qwertyuiopasdfghjklzxcvbnm"
    + "QWERTYUIOPASDFGHJKLZXCVBNM";
// Delimiters and operators, including the two-character operators.
const symbols = [
    "+", "-", "*", "/", "<", "<=", ">", ">=", "=", "==",
    "!=", ";", "(", ")", "^", ",", "\"", "'", "#", "&",
    "&&", "|", "||", "%", "~", "<<", ">>", "[", "]", "{",
    "}", "\\", ".", "?", ":", "!",
];
// Decimal digit characters.
const digitArray = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
    // Start the report file, strip the comments from the input, then lex the
    // remaining text one line at a time.
    fs.writeFileSync("test3.txt", '輸出結果如下:\r\n');
    const t2 = preprocess(tem);
    // Space-separated running lists of every token found, one per category.
    let symbolString = "";
    let keysString = "";
    let numberString = "";
    let idString = "";
    const t3 = t2.join("").split("\n");
    for (const line of t3) {
        const [lineTokens, lineSymbols, lineKeys, lineNumbers, lineIds] = judeg(line);
        // Write the source line followed by its space-separated tokens.
        // appendFileSync takes (path, data, options) and throws on failure;
        // it never invokes a callback, so none is passed.
        fs.appendFileSync('test3.txt', line + lineTokens.join(""));
        symbolString = symbolString + " " + lineSymbols.join(" ");
        keysString = keysString + " " + lineKeys.join(" ");
        numberString = numberString + " " + lineNumbers.join(" ");
        idString = idString + " " + lineIds.join(" ");
    }

    //整理token
// Append the per-category token summaries. The synchronous API is used on
// purpose: several callback-based appendFile calls issued back to back have no
// guaranteed ordering, so the sections could land in the file out of order and
// the completion message would be printed before anything was written.
// Failures now surface as exceptions thrown by appendFileSync.
const summarySections = [
    ['symbolArray', symbolString],
    ['keysArray', keysString],
    ['numberArray', numberString],
    ['idArray', idString],
];
for (const [label, text] of summarySections) {
    fs.appendFileSync('test3.txt', '\r\n' + label + ':' + text);
    console.log('The "data to append" was appended to file!');
}
console.log("寫入完成");
/**
 * Report whether x is exactly one letter contained in charArray.
 *
 * @param {string} x - candidate character
 * @returns {boolean} true only for a one-character string found in charArray
 */
function isLetter(x) {
    // charArray is a string, so a bare indexOf would also accept "" (index 0)
    // and any multi-character substring; restrict the lookup to one character.
    return typeof x === "string" && x.length === 1 && charArray.indexOf(x) !== -1;
}
/**
 * Report whether x is one of the entries of digitArray.
 *
 * @param {string} x - candidate character
 * @returns {boolean} true when digitArray contains x
 */
function isDigit(x) {
    const found = digitArray.includes(x);
    return found;
}
/**
 * Report whether x is a delimiter or operator, i.e. an entry of symbols.
 *
 * @param {string} x - candidate one- or two-character token
 * @returns {boolean} true when symbols contains x
 */
function isSymbols(x) {
    const known = symbols.includes(x);
    return known;
}
/**
 * Report whether x is a reserved word, i.e. an entry of keys.
 *
 * @param {string} x - candidate word
 * @returns {boolean} true when keys contains x
 */
function isKeys(x) {
    const reserved = keys.includes(x);
    return reserved;
}
/**
 * Remove C comments (line comments and block comments) from source text.
 *
 * The single line terminator ("\r\n", "\n" or "\r") that directly follows a
 * removed comment is removed with it, so a comment that ends a line joins that
 * line with the next one. Text after a block comment on the same line is kept.
 * An unterminated comment runs to the end of the input. String and character
 * literals are not recognised, so comment markers inside them are treated as
 * comments.
 *
 * @param {string} str - source text
 * @returns {string[]} pieces of the comment-free text in order; join them with
 *     "" to obtain the result (piece boundaries carry no meaning)
 */
function preprocess(str) {
    const prestr = [];
    const len = str.length;

    // Index just past one line terminator starting at pos, or pos itself when
    // the text at pos is not a line terminator.
    const skipLineBreak = (pos) => {
        if (str.charAt(pos) === '\r') {
            return str.charAt(pos + 1) === '\n' ? pos + 2 : pos + 1;
        }
        return str.charAt(pos) === '\n' ? pos + 1 : pos;
    };

    let m = 0;
    while (m < len) {
        const ch = str.charAt(m);
        const next = str.charAt(m + 1);
        if (ch === '/' && next === '/') {
            // Line comment: runs up to, and includes, the end of the line.
            m += 2;
            while (m < len && str.charAt(m) !== '\r' && str.charAt(m) !== '\n') {
                m += 1;
            }
            m = skipLineBreak(m);
        } else if (ch === '/' && next === '*') {
            // Block comment: the search starts after the opener so that "/*/"
            // is not mistaken for a complete comment.
            const close = str.indexOf('*/', m + 2);
            m = close === -1 ? len : skipLineBreak(close + 2);
        } else {
            // Ordinary character, including a '/' that does not open a comment.
            prestr.push(ch);
            m += 1;
        }
    }
    return prestr;
}
/**
 * Split one line of comment-free C source into tokens.
 *
 * Recognised token classes:
 *   - identifiers / keywords: a letter or "_" followed by letters, digits, "_";
 *   - numbers: digits, optionally followed by "." and more digits; a second "."
 *     makes the literal malformed and an error message is emitted instead;
 *   - symbols: one character accepted by isSymbols, or two adjacent symbol
 *     characters whose combination is itself accepted by isSymbols;
 *   - anything else (whitespace, unknown characters) is copied through as is.
 *
 * @param {string} str - one line of source text
 * @returns {Array<string[]>} [tokens, symbolArray, keyArray, numberArray, idArray]
 *     tokens holds the line's pieces in order, with every recognised token
 *     wrapped in single spaces; the other arrays list the tokens per class.
 */
function judeg(str) {
    const tokens = [];
    const keyArray = [];
    const idArray = [];
    const symbolArray = [];
    const numberArray = [];
    const len = str.length;

    // True when a character exists at pos and satisfies test. The bounds check
    // keeps the empty string returned by charAt past the end away from the
    // classifiers, so a token that ends the line is still emitted.
    const at = (pos, test) => pos < len && test(str.charAt(pos));
    const isIdentStart = (c) => c === "_" || isLetter(c);
    const isIdentPart = (c) => isIdentStart(c) || isDigit(c);
    const isDot = (c) => c === ".";

    let m = 0;
    while (m < len) {
        const n = m; // index of the first character of the current token
        const ch = str.charAt(n);
        m = n + 1;

        if (isIdentStart(ch)) {
            while (at(m, isIdentPart)) {
                m += 1;
            }
            const word = str.slice(n, m);
            (isKeys(word) ? keyArray : idArray).push(word);
            tokens.push(' ' + word + ' ');
        } else if (isDigit(ch)) {
            while (at(m, isDigit)) {
                m += 1;
            }
            if (at(m, isDot)) {
                m += 1;
                while (at(m, isDigit)) {
                    m += 1;
                }
                if (at(m, isDot)) {
                    // Second decimal point: report the error in place of the
                    // text consumed so far and resume after that point.
                    m += 1;
                    tokens.push('浮點數出錯請檢查');
                    continue;
                }
            }
            const num = str.slice(n, m);
            numberArray.push(num);
            tokens.push(' ' + num + ' ');
        } else if (isSymbols(ch)) {
            const pair = str.slice(n, n + 2);
            if (at(m, isSymbols) && isSymbols(pair)) {
                // Valid two-character operator such as "<=" or "&&".
                m += 1;
                symbolArray.push(pair);
                tokens.push(' ' + pair + ' ');
            } else {
                // Single-character symbol. An adjacent symbol that does not
                // form a known operator is lexed on the next iteration, and
                // the rejected pair is not recorded anywhere.
                // NOTE(review): single symbols are stored space-padded in
                // symbolArray while pairs are stored bare; kept unchanged so
                // the generated report stays the same — confirm whether the
                // padding is intended.
                const single = ' ' + ch + ' ';
                symbolArray.push(single);
                tokens.push(single);
            }
        } else {
            // Any other character is passed through untouched.
            tokens.push(ch);
        }
    }
    return [tokens, symbolArray, keyArray, numberArray, idArray];
}

這是輸出結果:一行原始碼,一行token,把token加了空格分隔開;