[編譯原理-詞法分析(二)] 使用狀態轉換圖識別詞法單元
阿新 • • 發佈:2019-09-25
前言
一個小 Demo：用於分析的原始檔比較簡單，但主要的部分都有，要擴充也比較容易。
將正則表示式表示的模式構造為狀態轉換圖. 在本文中只列舉狀態轉換圖.
雙緩衝區(程式碼中的Buffer類): https://my.oschina.net/u/3107416/blog/3110834
數字的狀態轉換
保留字和ID的狀態轉換
運算子的狀態轉換
用於分析的原始檔
結果
前情提要
一、詞素模式 二、列印Token 三、StateTransition類 四、StateTransition的構造與解構函式 五、StateTransition普通函式的實現 六、運算子的狀態轉換 七、數字的狀態轉換 八、保留字和ID的狀態轉換 九、空格, 製表符, 換行符設定 十、呼叫
正文
下文以 <~檔名> 標記各段程式碼來自哪個檔案。
一、詞素模式
// <~Token.h>
namespace Lexical_Analysis {

    /// Top-level category of a token. Numbering starts at 256.
    enum Tag {
        RESERVE_WORD = 256,
        ID,
        METHOD,
        RELOP,
        NUM,
    };

    /// Which relational operator a RelopToken stands for.
    enum RelopTag {
        LT, // <
        LE, // <=
        EQ, // =
        NE, // <>
        GT, // >
        GE, // >=
    };

    /// Which reserved word a ReserveWordToken stands for.
    enum ReserveWordTag {
        INT,
        IF,
        ELSE,
        THEN
    };

    /// Polymorphic base of every token; carries only the category tag.
    class Token {
    public:
        Tag tag;

        explicit Token(Tag t): tag(t) { }
        virtual ~Token() = default;
    };

    /// Relational-operator token: stores which operator was recognised.
    class RelopToken : public Token {
    public:
        RelopTag relop;

        explicit RelopToken(RelopTag tag): Token(RELOP), relop(tag) { }
    };

    /// Reserved-word token: stores the reserved-word tag and its lexeme.
    class ReserveWordToken : public Token {
    public:
        ReserveWordTag reserveWordTag;
        std::string lexeme;

        // The lexeme is taken by const reference so it is copied exactly once
        // (into the member) instead of once into the parameter and once more
        // into the member.
        ReserveWordToken(ReserveWordTag t, const std::string& l)
            : Token(RESERVE_WORD), reserveWordTag(t), lexeme(l) { }
    };

    /// Identifier token: stores the identifier text.
    class IdToken : public Token {
    public:
        std::string value;

        // Taken by const reference for the same single-copy reason as above.
        explicit IdToken(const std::string& v): Token(ID), value(v) { }
    };

    /// Integer literal token: stores the int value.
    class IntToken : public Token {
    public:
        int value;

        explicit IntToken(int v): Token(NUM), value(v) { }
    };

    /// Floating-point literal token: stores the double value.
    class DoubleToken : public Token {
    public:
        double value;

        explicit DoubleToken(double v): Token(NUM), value(v) { }
    };

}
二、列印Token
void Lexical_Analysis::StateTransition::print_token(Lexical_Analysis::Token *token) { RelopToken* relopToken; ReserveWordToken* reserveWordToken; IdToken* idToken; IntToken* intToken; DoubleToken* doubleToken; if ((relopToken = dynamic_cast<RelopToken*>(token))) { std::cout << "<" << getRelopTagStr(relopToken->relop) << ">" << std::endl; } else if ((reserveWordToken = dynamic_cast<ReserveWordToken*>(token))) { std::cout << "<" << getTagStr(reserveWordToken->tag) << ", " << getReserveWordTagStr(reserveWordToken->reserveWordTag) << ">" << std::endl; } else if ((idToken = dynamic_cast<IdToken*>(token))) { std::cout << "<id, " << idToken->value << ">" << std::endl; } else if ((intToken = dynamic_cast<IntToken*>(token))) { std::cout << "<int, " << intToken->value << ">" << std::endl; } else if ((doubleToken = dynamic_cast<DoubleToken*>(token))) { std::cout << "<double, " << doubleToken->value << ">" << std::endl; } } std::string Lexical_Analysis::StateTransition::getTagStr(Lexical_Analysis::Tag val) { if (val == Tag::RESERVE_WORD) return "RESERVE_WORD"; else if (val == Tag::ID) return "ID"; else if (val == Tag::RELOP) return "RELOP"; else if (val == Tag::METHOD) return "METHOD"; else if (val == Tag::NUM) return "NUM"; return std::__cxx11::string(); } std::string Lexical_Analysis::StateTransition::getRelopTagStr(Lexical_Analysis::RelopTag val) { if (val == RelopTag::LT) return "<"; else if (val == RelopTag::LE) return "<="; else if (val == RelopTag::EQ) return "="; else if (val == RelopTag::NE) return "<>"; else if (val == RelopTag::GT) return ">"; else if (val == RelopTag::GE) return ">="; return std::__cxx11::string(); } std::string Lexical_Analysis::StateTransition::getReserveWordTagStr(Lexical_Analysis::ReserveWordTag val) { if (val == ReserveWordTag::INT) return "INT"; else if (val == ReserveWordTag::IF) return "IF"; else if (val == ReserveWordTag::ELSE) return "ELSE"; else if (val == ReserveWordTag::THEN) return "THEN"; return std::__cxx11::string(); }
三、StateTransition類
<~StateTransition.h>
namespace Lexical_Analysis {

    /**
     * Recognises tokens from a Buffer using hand-written state-transition
     * diagrams (one member function per diagram).
     */
    class StateTransition {
    public:
        explicit StateTransition(Buffer<>* _buffer);
        ~StateTransition();

        /** State-transition implementation for relational operators. */
        Token* getRelop();

        /** State-transition implementation for numbers. */
        Token* getNumber();

        /** State-transition implementation for character sequences. */
        Token* getCharacterSequence();

        /** Handles blanks, tabs and newlines. */
        void getBlankTabNewline();

        /**
         * Global recovery strategy: resets forward to the value of
         * lexemeBegin so that another diagram can start recognising from the
         * true position of the not-yet-processed input.
         */
        void fail();

        /** Moves back by one character. */
        void retract();

        void print_token(Token* token);
        std::string getTagStr(Tag val);
        std::string getRelopTagStr(RelopTag val);
        std::string getReserveWordTagStr(ReserveWordTag val);

    private:
        /**
         * Stores the token in the reserved-word table.
         * @param token token to register
         */
        void reserve(ReserveWordToken* token);

        Buffer<>* buffer;                                       // input buffer
        std::map<std::string, ReserveWordToken*> reserveWords;  // reserved-word table, filled by the constructor
    };

}
四、StateTransition的構造與解構函式
<~StateTransition.cpp>
/**
 * Stores the buffer pointer and registers the reserved words
 * "int", "if", "else" and "then" in the reserved-word table.
 *
 * @param _buffer buffer to scan; only the pointer is stored
 */
Lexical_Analysis::StateTransition::StateTransition(Lexical_Analysis::Buffer<> *_buffer)
    : buffer(_buffer) {
    struct Keyword {
        ReserveWordTag tag;
        const char* lexeme;
    };
    const Keyword keywords[] = {
        {ReserveWordTag::INT,  "int"},
        {ReserveWordTag::IF,   "if"},
        {ReserveWordTag::ELSE, "else"},
        {ReserveWordTag::THEN, "then"},
    };
    // Registered in the order listed above.
    for (const Keyword& keyword : keywords) {
        reserve(new ReserveWordToken(keyword.tag, keyword.lexeme));
    }
}
/**
 * Releases the reserved-word tokens that the constructor allocated with
 * `new` and stored in reserveWords.
 *
 * getCharacterSequence() hands out these same pointers for reserved words, so
 * such tokens must not be used after this object is destroyed and must not be
 * deleted by the caller.
 *
 * The buffer is not deleted: the constructor only stores the pointer it is
 * given.
 *
 * NOTE(review): the class declares no copy operations, so a copied
 * StateTransition would share these pointers — avoid copying instances.
 */
Lexical_Analysis::StateTransition::~StateTransition() {
    for (auto& entry : reserveWords) {
        delete entry.second;
        entry.second = nullptr;
    }
    reserveWords.clear();
}
五、StateTransition普通函式的實現
<~StateTransition.cpp>
void Lexical_Analysis::StateTransition::fail() {
buffer->forward = buffer->lexemeBegin;
}
void Lexical_Analysis::StateTransition::retract() {
buffer->pre();
}
/**
 * Registers a reserved-word token in reserveWords, keyed by its lexeme.
 * An existing entry with the same lexeme is overwritten.
 *
 * @param token token to register; dereferenced, so it must not be null
 */
void Lexical_Analysis::StateTransition::reserve(Lexical_Analysis::ReserveWordToken *token) {
    const std::string& key = token->lexeme;
    reserveWords[key] = token;
}
六、運算子的狀態轉換, 在下文的三個狀態轉換中, 都首先檢查了當前字元是否滿足當前狀態轉換需要的條件, 如果不滿足則直接返回. 然後則是一個while迴圈與多路分支語句.
<~StateTransition.cpp>
/**
 * State-transition diagram for the relational operators
 * <, <=, <>, =, > and >=.
 *
 * @return a newly allocated RelopToken, or nullptr (without consuming input)
 *         when the current character is not '<', '=' or '>'
 */
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getRelop() {
    const char first = buffer->cur();
    if (first != '<' && first != '=' && first != '>') return nullptr;

    // Shared tail of every accepting state. Per the original author's notes,
    // lexemeBegin has to end up on the first character after the lexeme:
    //  - when the last character read belongs to the lexeme, forward is
    //    advanced once more before lexemeBegin is updated (advanceFirst);
    //  - when the last character read was a look-ahead, forward already sits
    //    there.
    // In both cases forward is then moved back by one.
    auto accept = [this](RelopTag tag, bool advanceFirst) -> Token* {
        if (advanceFirst) buffer->next();
        buffer->lexemeBegin = buffer->forward;
        retract();
        return new RelopToken(tag);
    };

    int state = 0;
    for (;;) {
        switch (state) {
            case 0: {
                const char c = buffer->next();
                if (c == '<') state = 1;
                else if (c == '=') state = 5;
                else if (c == '>') state = 6;
                break;
            }
            case 1: {
                // Seen '<': decide between <=, <> and plain <.
                const char c = buffer->next();
                if (c == '=') state = 2;
                else if (c == '>') state = 3;
                else state = 4;
                break;
            }
            case 2:
                return accept(RelopTag::LE, true);
            case 3:
                return accept(RelopTag::NE, true);
            case 4:
                return accept(RelopTag::LT, false);
            case 5:
                return accept(RelopTag::EQ, true);
            case 6: {
                // Seen '>': decide between >= and plain >.
                const char c = buffer->next();
                state = (c == '=') ? 7 : 8;
                break;
            }
            case 7:
                return accept(RelopTag::GE, true);
            case 8:
                return accept(RelopTag::GT, false);
        }
    }
}
七、數字的狀態轉換, 包括小數位
<~StateTransition.cpp>
/**
 * Appends one decimal digit to an accumulated integer:
 * result becomes result * 10 + val.
 *
 * @param result accumulator, updated in place
 * @param val    value to add after the shift
 */
void calculation(int& result, int val) {
    result *= 10;
    result += val;
}
/**
 * State-transition diagram for numbers: digits, an optional fractional part
 * ('.' followed by digits) and an optional exponent ('E', optional sign,
 * digits).
 *
 * Changes compared with the previous version:
 *  - States 16 and 17 now leave the diagram when the exponent is malformed
 *    (for example "12Ex" or "12E+x"). Previously they had no exit branch and
 *    kept calling next() until a digit or sign turned up. A dangling 'E' (and
 *    sign) is consumed and treated as exponent 0.
 *  - The fractional divisor is a double, so it no longer overflows an int
 *    after nine fractional digits.
 *  - isdigit is called with an unsigned char value, as <cctype> requires.
 *
 * @return nullptr (without consuming input) when the current character is not
 *         a digit; otherwise a newly allocated DoubleToken when a '.' was
 *         seen, or a newly allocated IntToken. For an IntToken a negative
 *         exponent uses integer division, so "12E-3" yields 0.
 */
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getNumber() {
    int state = 12;
    bool is_loop = is_digit(buffer->cur());
    /* Not a number: return immediately. */
    if (!is_loop) return nullptr;

    // isdigit has undefined behaviour for negative char values, so convert.
    const auto isDigitChar = [](char ch) {
        return isdigit(static_cast<unsigned char>(ch)) != 0;
    };

    int val = 0;              // integer part
    double dVal = 0.0;        // fractional part
    double place = 10.0;      // divisor of the next fractional digit: 10, 100, ...
    bool isDecimal = false;   // whether a '.' was seen
    int ePos = 0;             // digits after 'E' (E06)
    bool isNegative = false;  // whether the exponent is negative (E-03)
    char c;

    while (is_loop) {
        switch (state) {
            case 12:
                c = buffer->next();
                if (isDigitChar(c)) {
                    calculation(val, to_digit_10(c));
                    state = 13;
                }
                break;
            case 13:
                c = buffer->next();
                if (isDigitChar(c)) {
                    calculation(val, to_digit_10(c));
                    state = 13;
                }
                else if (c == '.') {
                    isDecimal = true;
                    state = 14;
                }
                else if (c == 'E') state = 16;
                else state = 19;
                break;
            case 14:
                c = buffer->next();
                if (isDigitChar(c)) {
                    dVal += 1.0 * to_digit_10(c) / place;
                    place *= 10;
                    state = 15;
                }
                else state = 19;
                break;
            case 15:
                c = buffer->next();
                if (isDigitChar(c)) {
                    dVal += 1.0 * to_digit_10(c) / place;
                    place *= 10;
                    state = 15;
                }
                else if (c == 'E') state = 16;
                else state = 19;
                break;
            case 16:
                c = buffer->next();
                if (isDigitChar(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                else if (c == '+' || c == '-') {
                    if (c == '-') isNegative = true;
                    state = 17;
                }
                else state = 19;  // 'E' without exponent digits: stop scanning
                break;
            case 17:
                c = buffer->next();
                if (isDigitChar(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                else state = 19;  // sign without exponent digits: stop scanning
                break;
            case 18:
                c = buffer->next();
                if (isDigitChar(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                else state = 19;
                break;
            case 19:
                is_loop = false;
                break;
        }
    }

    // Make lexemeBegin point at the first character after the lexeme that was
    // just found, and move forward back by one position.
    char* tmp = buffer->forward;
    retract();
    buffer->lexemeBegin = tmp;

    if (isDecimal) {
        dVal += val;
        for (int i = 0; i < ePos; i++) {
            if (isNegative) dVal /= 10;
            else dVal *= 10;
        }
        return new DoubleToken(dVal);
    }

    for (int i = 0; i < ePos; i++) {
        if (isNegative) val /= 10;
        else val *= 10;
    }
    return new IntToken(val);
}
八、保留字和ID的狀態轉換
<~StateTransition.cpp>
/**
 * State-transition diagram for reserved words and identifiers.
 *
 * @return nullptr (without consuming input) when the current character is not
 *         a letter. Otherwise, if the scanned lexeme is found in reserveWords,
 *         the token stored in that table (the same object on every call, not
 *         a copy); if not, a newly allocated IdToken holding the lexeme.
 */
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getCharacterSequence() {
    if (!is_letter(buffer->cur())) return nullptr;

    enum { kStart = 9, kInSequence = 10, kDone = 11 };

    int state = kStart;
    while (state != kDone) {
        const char ch = buffer->next();
        if (state == kStart) {
            if (is_underline(ch) || is_letter(ch)) state = kInSequence;
        } else {
            const bool continues = is_letter(ch) || is_digit(ch) || is_underline(ch);
            state = continues ? kInSequence : kDone;
        }
    }

    // Remember where forward stopped: lexemeBegin is set to this position
    // once the lexeme text has been extracted.
    char* const afterLexeme = buffer->forward;
    if (!buffer->is_end) {
        // The end was not reached, so move back by one.
        retract();
    }
    const std::string lexeme = buffer->getString();
    buffer->lexemeBegin = afterLexeme;

    // Reserved words are looked up in the table; anything else is an identifier.
    const auto found = reserveWords.find(lexeme);
    if (found == reserveWords.end()) {
        return new IdToken(lexeme);
    }
    return found->second;
}
九、空格, 製表符, 換行符設定
<~StateTransition.cpp>
/**
 * Advances past consecutive characters for which is_blank_tab_newline()
 * holds, then sets lexemeBegin to forward. Does nothing when the current
 * character is not such a character.
 */
void Lexical_Analysis::StateTransition::getBlankTabNewline() {
    if (!is_blank_tab_newline(buffer->cur())) {
        return;
    }
    for (; is_blank_tab_newline(buffer->cur()); buffer->next()) {
        // Nothing to do: the loop header does the advancing.
    }
    buffer->lexemeBegin = buffer->forward;
}
十、呼叫
<~main.cpp>
int main() {
string fileStr = "/home/yanuas/CompilingPrinciple/LexicalAnalysis/code.src";
Buffer<1024> buffer(fileStr);
StateTransition transition(&buffer);
while (1) {
Token* token = transition.getRelop();
if (token != nullptr) transition.print_token(token);
token = transition.getCharacterSequence();
if (token != nullptr) transition.print_token(token);
token = transition.getNumber();
if (token != nullptr) transition.print_token(token);
transition.getBlankTabNewline();
if (transition.buffer->is_end) break;
}