編譯原理之手工構造C語言詞法分析器
阿新 • • 發佈:2019-02-05
編寫一個(C語言)詞法分析器:
需求是:1. 對原始資料進行預處理,刪掉註釋;(為了展示方便,這裡沒有刪掉換行符和製表符,本來這些也應該刪掉)
2. 將詞法正確的token分解出來,一共應該有5類:識別符號,關鍵字,常數,界符,運算子;輸出原始碼時,在token之間都加上空格;
3. 對於詞法不正確的token進行提示,表示詞法不正確,具體有:浮點數格式不正確,例如 .11 和 0.23.34 這樣的;
程式設計環境是:webstorm和nodejs
關於思路可以參考我的上一篇部落格,在原來的基礎上有了一些改進;
這是預處理部分的DFA:
這是處理部分的DFA:
最後這是關於詞法分析器部分的原始碼:
/**
 * Hand-written lexical analyzer for C source text.
 *
 * When run directly with Node.js it reads "test2.txt", strips comments,
 * tokenizes the remaining text line by line and writes the report to
 * "test3.txt": every source line followed by its space-separated tokens,
 * then the collected symbols, keywords, numbers and identifiers.
 *
 * Known limitation: comment markers inside string literals are not
 * recognized as literal text, because string literals are not tokenized.
 */

// C keywords.
const keys = [
  "auto", "break", "case", "switch", "char", "const", "continue", "default",
  "do", "while", "double", "else", "if", "enum", "extern", "float", "for",
  "goto", "int", "long", "register", "return", "short", "signed", "sizeof",
  "static", "struct", "typedef", "union", "unsigned", "void", "volatile",
];

// Letters accepted in identifiers (kept as a plain string; no regex is used).
const charArray = "qwertyuiopasdfghjklzxcvbnm" + "QWERTYUIOPASDFGHJKLZXCVBNM";

// Delimiters and operators, one or two characters long.
const symbols = [
  "+", "-", "*", "/", "<", "<=", ">", ">=", "=", "==",
  "!=", ";", "(", ")", "^", ",", "\"", "'", "#", "&",
  "&&", "|", "||", "%", "~", "<<", ">>", "[", "]", "{",
  "}", "\\", ".", "?", ":", "!",
];

const digitArray = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"];

/**
 * Tells whether x is a single ASCII letter.
 *
 * The length check matters: searching a string for "" always succeeds, so
 * without it the empty string returned by charAt() past the end of a line
 * would count as a letter and the identifier scan would never terminate.
 *
 * @param {string} x - Character to test.
 * @returns {boolean} True when x is exactly one letter.
 */
function isLetter(x) {
  return typeof x === "string" && x.length === 1 && charArray.includes(x);
}

/**
 * Tells whether x is a single decimal digit.
 *
 * @param {string} x - Character to test.
 * @returns {boolean} True when x is one of "0".."9".
 */
function isDigit(x) {
  return digitArray.includes(x);
}

/**
 * Tells whether x is a known delimiter or operator.
 *
 * @param {string} x - One- or two-character candidate.
 * @returns {boolean} True when x is listed in `symbols`.
 */
function isSymbols(x) {
  return symbols.includes(x);
}

/**
 * Tells whether x is a C keyword.
 *
 * @param {string} x - Word to test.
 * @returns {boolean} True when x is listed in `keys`.
 */
function isKeys(x) {
  return keys.includes(x);
}

/**
 * Removes line comments and block comments from source text.
 *
 * A line comment ends at the first "\r" or "\n"; the line terminator itself
 * is kept so line structure survives. A block comment ends right after its
 * closing marker; an unterminated block comment swallows the rest of the
 * input.
 *
 * @param {string} str - Raw source text.
 * @returns {string[]} The surviving characters in order; join("") rebuilds
 *   the comment-free text.
 */
function preprocess(str) {
  const kept = [];
  const len = str.length;
  let pos = 0;

  while (pos < len) {
    const ch = str.charAt(pos);
    const next = str.charAt(pos + 1);

    if (ch === "/" && next === "/") {
      pos += 2;
      while (pos < len && str.charAt(pos) !== "\r" && str.charAt(pos) !== "\n") {
        pos += 1;
      }
    } else if (ch === "/" && next === "*") {
      // Search starts after the opener so that "/*/" is not taken as closed.
      const close = str.indexOf("*/", pos + 2);
      pos = close === -1 ? len : close + 2;
    } else {
      kept.push(ch);
      pos += 1;
    }
  }

  return kept;
}

/**
 * Tells whether ch may continue an identifier (underscore, letter or digit).
 *
 * @param {string} ch - Character to test.
 * @returns {boolean} True when ch can appear after the first identifier char.
 */
function isIdentifierPart(ch) {
  return ch === "_" || isLetter(ch) || isDigit(ch);
}

/**
 * Returns the index of the first non-digit character at or after `from`.
 *
 * @param {string} str - Text being scanned.
 * @param {number} from - Index to start at.
 * @returns {number} Index just past the run of digits (may equal str.length).
 */
function skipDigits(str, from) {
  let pos = from;
  while (isDigit(str.charAt(pos))) {
    pos += 1;
  }
  return pos;
}

/**
 * Tokenizes one line of comment-free source text.
 *
 * Recognized categories: keywords, identifiers, numbers (digits with an
 * optional single "." part) and symbols (the longest match of one or two
 * characters). Any other character, such as whitespace, is passed through
 * to the token list unchanged. A number containing a second "." is replaced
 * by an error message token and scanning resumes after that second ".".
 *
 * @param {string} str - One line of source text.
 * @returns {Array<string[]>} [tokens, symbolArray, keyArray, numberArray,
 *   idArray]. Entries in `tokens` are padded with one space on each side
 *   (except pass-through characters and the error message); entries in the
 *   four category arrays are the bare lexemes.
 */
function judeg(str) {
  const tokens = [];
  const symbolArray = [];
  const keyArray = [];
  const numberArray = [];
  const idArray = [];
  const len = str.length;
  let pos = 0;

  while (pos < len) {
    const start = pos;
    const ch = str.charAt(pos);

    if (ch === "_" || isLetter(ch)) {
      pos += 1;
      while (pos < len && isIdentifierPart(str.charAt(pos))) {
        pos += 1;
      }
      const word = str.slice(start, pos);
      if (isKeys(word)) {
        keyArray.push(word);
      } else {
        idArray.push(word);
      }
      tokens.push(` ${word} `);
    } else if (isDigit(ch)) {
      let malformed = false;
      pos = skipDigits(str, pos + 1);
      if (str.charAt(pos) === ".") {
        pos = skipDigits(str, pos + 1);
        if (str.charAt(pos) === ".") {
          // Second decimal point: consume it and report the lexeme as broken.
          pos += 1;
          malformed = true;
        }
      }
      if (malformed) {
        tokens.push("浮點數出錯請檢查");
      } else {
        const number = str.slice(start, pos);
        numberArray.push(number);
        tokens.push(` ${number} `);
      }
    } else if (isSymbols(ch)) {
      // Prefer a valid two-character operator; otherwise emit the single char
      // and let the next iteration look at the following character afresh.
      const pair = str.slice(start, start + 2);
      const symbol = pair.length === 2 && isSymbols(pair) ? pair : ch;
      pos += symbol.length;
      symbolArray.push(symbol);
      tokens.push(` ${symbol} `);
    } else {
      tokens.push(ch);
      pos += 1;
    }
  }

  return [tokens, symbolArray, keyArray, numberArray, idArray];
}

/**
 * Script entry point: reads "test2.txt", analyzes it and writes "test3.txt".
 *
 * The whole report is assembled in memory and written with one synchronous
 * call, so the sections always appear in the same order and the completion
 * message is printed only after the file has been written.
 */
function main() {
  const fs = require("fs");
  // Not used by the analyzer; retained because the original script loaded it.
  const redline = require("readline");

  const source = fs.readFileSync("test2.txt").toString();
  // Accept both CRLF and LF input; the report always uses CRLF.
  const lines = preprocess(source).join("").split(/\r?\n/);

  const report = ["輸出結果如下:\r\n"];
  let symbolString = "";
  let keysString = "";
  let numberString = "";
  let idString = "";

  for (const line of lines) {
    const [tokens, symbolArray, keyArray, numberArray, idArray] = judeg(line);
    report.push(`${line}\r\n`, `${tokens.join("")}\r\n`);
    symbolString += ` ${symbolArray.join(" ")}`;
    keysString += ` ${keyArray.join(" ")}`;
    numberString += ` ${numberArray.join(" ")}`;
    idString += ` ${idArray.join(" ")}`;
  }

  report.push(
    `\r\nsymbolArray:${symbolString}`,
    `\r\nkeysArray:${keysString}`,
    `\r\nnumberArray:${numberString}`,
    `\r\nidArray:${idString}`,
  );

  fs.writeFileSync("test3.txt", report.join(""));
  console.log("寫入完成");
}

// Run only when executed directly (node <file>), not when loaded as a module.
if (
  typeof require === "function" &&
  typeof module !== "undefined" &&
  require.main === module
) {
  main();
}
這是輸出結果:一行原始碼,一行token,把token加了空格分隔開;