1. 程式人生 > >[編譯原理-詞法分析(二)] 使用狀態轉換圖識別詞法單元

[編譯原理-詞法分析(二)] 使用狀態轉換圖識別詞法單元

前言

一個小Demo, 用於分析的原始檔比較簡單, 主要的部分都有, 擴充套件比較容易.
將正則表示式表示的模式構造為狀態轉換圖. 在本文中只列舉狀態轉換圖.

雙緩衝區(程式碼中的Buffer類): https://my.oschina.net/u/3107416/blog/3110834

數字的狀態轉換

{P84}

保留字和ID的狀態轉換

{P83}

運算子的狀態轉換

 {P84}

用於分析的原始檔

結果

前情提要

一、詞素模式
二、列印Token
三、StateTransition類
四、StateTransition的構造與解構函式
五、StateTransition普通函式的實現
六、運算子的狀態轉換
七、數字的狀態轉換
八、保留字和ID的狀態轉換
九、空格, 製表符, 換行符設定
十、呼叫

正文

下文以 <~檔名> 標記各段程式碼所屬的檔案, 例如 <~Token.h> 表示該段程式碼位於 Token.h.

一、詞素模式
<~Token.h>

namespace Lexical_Analysis {

    // Token categories. Enumerators are numbered consecutively from 256.
    enum Tag {
        RESERVE_WORD = 256,
        ID           = 257,
        METHOD       = 258,
        RELOP        = 259,
        NUM          = 260,
    };

    // Identifies which relational operator a RelopToken carries.
    enum RelopTag {
        LT = 0, // <
        LE = 1, // <=
        EQ = 2, // =
        NE = 3, // <>
        GT = 4, // >
        GE = 5, // >=
    };

    // Identifies which reserved word a ReserveWordToken carries.
    enum ReserveWordTag {
        INT  = 0,
        IF   = 1,
        ELSE = 2,
        THEN = 3
    };

    // Polymorphic base of every token; holds only the category tag.
    class Token {
    public:
        explicit Token(Tag t)
            : tag(t) { }

        virtual ~Token() = default;

        Tag tag;
    };

    // Relational-operator token (tag RELOP) plus the concrete operator.
    class RelopToken : public Token {
    public:
        explicit RelopToken(RelopTag tag)
            : Token(RELOP),
              relop(tag) { }

        RelopTag relop;
    };

    // Reserved-word token (tag RESERVE_WORD): the word's tag and its lexeme.
    class ReserveWordToken : public Token {
    public:
        ReserveWordToken(ReserveWordTag t, std::string l)
            : Token(RESERVE_WORD),
              reserveWordTag(t),
              lexeme(l) { }

        ReserveWordTag reserveWordTag;
        std::string lexeme;
    };

    // Identifier token (tag ID) holding the identifier text.
    class IdToken : public Token {
    public:
        explicit IdToken(std::string v)
            : Token(ID),
              value(v) { }

        std::string value;
    };

    // Integer literal token; shares the NUM tag with DoubleToken.
    class IntToken : public Token {
    public:
        explicit IntToken(int v)
            : Token(NUM),
              value(v) { }

        int value;
    };

    // Floating-point literal token; shares the NUM tag with IntToken.
    class DoubleToken : public Token {
    public:
        explicit DoubleToken(double v)
            : Token(NUM),
              value(v) { }

        double value;
    };

}
二、列印Token
// Prints one line to std::cout describing `token`, chosen by its dynamic type.
// The types are probed in a fixed order (relop, reserved word, id, int, double)
// and the first match wins. A token of any other type, or a null pointer,
// produces no output.
void Lexical_Analysis::StateTransition::print_token(Lexical_Analysis::Token *token) {
    if (auto* asRelop = dynamic_cast<RelopToken*>(token)) {
        std::cout << "<" << getRelopTagStr(asRelop->relop) << ">" << std::endl;
        return;
    }

    if (auto* asReserved = dynamic_cast<ReserveWordToken*>(token)) {
        std::cout << "<" << getTagStr(asReserved->tag) << ", " << getReserveWordTagStr(asReserved->reserveWordTag) << ">" << std::endl;
        return;
    }

    if (auto* asId = dynamic_cast<IdToken*>(token)) {
        std::cout << "<id, " << asId->value << ">" << std::endl;
        return;
    }

    if (auto* asInt = dynamic_cast<IntToken*>(token)) {
        std::cout << "<int, " << asInt->value << ">" << std::endl;
        return;
    }

    if (auto* asDouble = dynamic_cast<DoubleToken*>(token)) {
        std::cout << "<double, " << asDouble->value << ">" << std::endl;
    }
}

// Returns the enumerator name of `val` as text ("RESERVE_WORD", "ID", "RELOP",
// "METHOD" or "NUM"). Any value that is not one of those enumerators yields an
// empty string.
std::string Lexical_Analysis::StateTransition::getTagStr(Lexical_Analysis::Tag val) {
    switch (val) {
        case Tag::RESERVE_WORD: return "RESERVE_WORD";
        case Tag::ID:           return "ID";
        case Tag::RELOP:        return "RELOP";
        case Tag::METHOD:       return "METHOD";
        case Tag::NUM:          return "NUM";
    }

    // Spell the type as std::string: std::__cxx11 is a libstdc++ ABI detail and
    // does not exist in other standard library implementations.
    return std::string();
}

// Returns the operator spelling for `val` ("<", "<=", "=", "<>", ">" or ">=").
// Any value that is not one of the RelopTag enumerators yields an empty string.
std::string Lexical_Analysis::StateTransition::getRelopTagStr(Lexical_Analysis::RelopTag val) {
    switch (val) {
        case RelopTag::LT: return "<";
        case RelopTag::LE: return "<=";
        case RelopTag::EQ: return "=";
        case RelopTag::NE: return "<>";
        case RelopTag::GT: return ">";
        case RelopTag::GE: return ">=";
    }

    // Spell the type as std::string: std::__cxx11 is a libstdc++ ABI detail and
    // does not exist in other standard library implementations.
    return std::string();
}

// Returns the enumerator name of `val` as text ("INT", "IF", "ELSE" or "THEN").
// Any value that is not one of the ReserveWordTag enumerators yields an empty
// string.
std::string Lexical_Analysis::StateTransition::getReserveWordTagStr(Lexical_Analysis::ReserveWordTag val) {
    switch (val) {
        case ReserveWordTag::INT:  return "INT";
        case ReserveWordTag::IF:   return "IF";
        case ReserveWordTag::ELSE: return "ELSE";
        case ReserveWordTag::THEN: return "THEN";
    }

    // Spell the type as std::string: std::__cxx11 is a libstdc++ ABI detail and
    // does not exist in other standard library implementations.
    return std::string();
}

三、StateTransition類
<~StateTransition.h>

namespace Lexical_Analysis {

    /**
     * Token recognizers written as transition diagrams, all reading from one
     * shared Buffer.
     */
    class StateTransition {
    public:
        explicit StateTransition(Buffer<>* _buffer);
        ~StateTransition();

        // ---- recognizers -------------------------------------------------

        /** Transition diagram for relational operators. */
        Token* getRelop();

        /** Transition diagram for numbers. */
        Token* getNumber();

        /** Transition diagram for character sequences. */
        Token* getCharacterSequence();

        /** Handles blanks, tabs and newlines. */
        void getBlankTabNewline();

        // ---- buffer position helpers ---------------------------------------

        /**
         * Global recovery strategy: resets forward to the value of lexemeBegin
         * so that another transition diagram can start recognizing from the
         * true position of the not-yet-processed input.
         */
        void fail();

        /** Steps back one character. */
        void retract();

        // ---- printing helpers ----------------------------------------------

        void print_token(Token* token);
        std::string getTagStr(Tag val);
        std::string getRelopTagStr(RelopTag val);
        std::string getReserveWordTagStr(ReserveWordTag val);

    private:
        /**
         * Stores a token in the reserved-word table.
         * @param token the token to store
         */
        void reserve(ReserveWordToken* token);

        Buffer<>* buffer; // input buffer
        std::map<std::string, ReserveWordToken*> reserveWords; // reserved-word table, filled by the constructor
    };
}
四、StateTransition的構造與解構函式
<~StateTransition.cpp>

// Binds the recognizer to `_buffer` and registers the reserved words
// "int", "if", "else" and "then" (in that order) in the reserved-word table.
Lexical_Analysis::StateTransition::StateTransition(Lexical_Analysis::Buffer<> *_buffer)
    : buffer(_buffer) {
    struct ReservedEntry {
        ReserveWordTag tag;
        const char* lexeme;
    };

    const ReservedEntry entries[] = {
        { ReserveWordTag::INT,  "int"  },
        { ReserveWordTag::IF,   "if"   },
        { ReserveWordTag::ELSE, "else" },
        { ReserveWordTag::THEN, "then" },
    };

    for (const ReservedEntry& entry : entries) {
        reserve(new ReserveWordToken(entry.tag, entry.lexeme));
    }
}

// Releases the reserved-word tokens that the constructor allocated with `new`
// and stored in reserveWords; previously they were never freed.
// The buffer pointer is supplied by the caller and is not released here.
//
// NOTE(review): getCharacterSequence() hands out these same pointers for
// reserved words, so callers must not delete them and must not use them after
// this object is destroyed. The class should also not be copied, since a copy
// would free the same tokens again — consider deleting the copy operations.
Lexical_Analysis::StateTransition::~StateTransition() {
    for (auto& entry : reserveWords) {
        delete entry.second;
        entry.second = nullptr;
    }
    reserveWords.clear();
}
五、StateTransition普通函式的實現
<~StateTransition.cpp>

// Abandons the current recognition attempt by moving forward back to lexemeBegin.
void Lexical_Analysis::StateTransition::fail() {
    const auto restartPosition = buffer->lexemeBegin;
    buffer->forward = restartPosition;
}

// Steps the buffer back by delegating to Buffer::pre().
void Lexical_Analysis::StateTransition::retract() {
    auto& input = *buffer;
    input.pre();
}

// Registers `token` in the reserved-word table under its lexeme.
// An entry already stored under the same lexeme is replaced.
void Lexical_Analysis::StateTransition::reserve(Lexical_Analysis::ReserveWordToken *token) {
    auto& slot = reserveWords[token->lexeme];
    slot = token;
}
六、運算子的狀態轉換, 下文的三個狀態轉換函式 (getRelop, getNumber, getCharacterSequence) 都先檢查當前字元是否滿足進入該狀態轉換圖的條件, 不滿足則直接返回 nullptr; 滿足時才進入由 while 迴圈與 switch 多路分支語句構成的狀態機.
<~StateTransition.cpp>

// Transition diagram for the relational operators <, <=, =, <>, > and >=.
// Returns nullptr when the current character is not '<', '=' or '>';
// otherwise returns a newly allocated RelopToken for the operator read.
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getRelop() {
    char ch = buffer->cur();
    if (ch != '<' && ch != '=' && ch != '>') return nullptr;

    // Shared tail of every accepting state: record lexemeBegin, step back one
    // character and build the token. States whose last read character belongs
    // to the operator (2, 3, 5, 7) advance once more first, so that
    // lexemeBegin ends up on the first character after the lexeme. States 4
    // and 8 were reached by reading a character that is not part of the
    // operator, so forward is already there.
    auto accept = [this](RelopTag tag, bool advanceFirst) -> Token* {
        if (advanceFirst) buffer->next();
        buffer->lexemeBegin = buffer->forward;
        retract();
        return new RelopToken(tag);
    };

    int state = 0;
    for (;;) {
        switch (state) {
            case 0:
                ch = buffer->next();
                if (ch == '<') state = 1;
                else if (ch == '=') state = 5;
                else if (ch == '>') state = 6;
                break;
            case 1:
                ch = buffer->next();
                state = (ch == '=') ? 2 : (ch == '>') ? 3 : 4;
                break;
            case 2:
                return accept(RelopTag::LE, true);
            case 3:
                return accept(RelopTag::NE, true);
            case 4:
                return accept(RelopTag::LT, false);
            case 5:
                return accept(RelopTag::EQ, true);
            case 6:
                ch = buffer->next();
                state = (ch == '=') ? 7 : 8;
                break;
            case 7:
                return accept(RelopTag::GE, true);
            case 8:
                return accept(RelopTag::GT, false);
        }
    }
}
七、數字的狀態轉換, 包括小數位
<~StateTransition.cpp>

// Appends the decimal digit `val` to `result`: result becomes result * 10 + val.
void calculation(int& result, int val) {
    result *= 10;
    result += val;
}

// Transition diagram for numbers: digits, an optional fractional part and an
// optional exponent introduced by 'E' with an optional sign.
//
// Returns nullptr when the current character is not a digit. Otherwise
// returns a newly allocated DoubleToken if a '.' was read, or an IntToken if
// not. In both cases the value is scaled by ten once per unit of the exponent
// (divided when the exponent sign is '-'); for an IntToken that is integer
// arithmetic.
//
// Changes from the previous version:
//  * `place` is a double, so fractions longer than about nine digits no
//    longer overflow an int.
//  * States 16 and 17 stop scanning when the exponent is malformed instead of
//    reading on until some digit or sign happens to appear.
//  * Digit tests use is_digit throughout, matching the entry guard.
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getNumber() {
    /* Not a number: return without consuming anything. */
    if (!is_digit(buffer->cur())) return nullptr;

    int state = 12;

    int val = 0;              // integer part

    double dVal = 0.0;        // fractional part
    double place = 10.0;      // divisor of the next fractional digit: 10, 100, ...
    bool isDecimal = false;   // whether a '.' was read

    int ePos = 0;             // digits after E (E06)
    bool isNegative = false;  // whether the exponent has a '-' sign (E-03)

    bool is_loop = true;
    char c = '\0';
    while (is_loop) {
        switch (state) {
            case 12:
                c = buffer->next();
                if (is_digit(c)) {
                    calculation(val, to_digit_10(c));
                    state = 13;
                }
                break;
            case 13:
                c = buffer->next();
                if (is_digit(c)) {
                    calculation(val, to_digit_10(c));
                    state = 13;
                }
                else if (c == '.') {
                    isDecimal = true;
                    state = 14;
                }
                else if (c == 'E') state = 16;
                else state = 19;
                break;
            case 14:
                c = buffer->next();
                if (is_digit(c)) {
                    dVal += 1.0 * to_digit_10(c) / place;
                    place *= 10;

                    state = 15;
                }
                else state = 19;
                break;
            case 15:
                c = buffer->next();
                if (is_digit(c)) {
                    dVal += 1.0 * to_digit_10(c) / place;
                    place *= 10;

                    state = 15;
                }
                else if (c == 'E') state = 16;
                else state = 19;
                break;
            case 16:
                c = buffer->next();
                if (is_digit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                else if (c == '+' || c == '-') {
                    if (c == '-') isNegative = true;
                    state = 17;
                }
                // Malformed exponent ("12Ex"): stop here with exponent 0.
                // NOTE(review): the 'E' stays part of the consumed lexeme.
                else state = 19;
                break;
            case 17:
                c = buffer->next();
                if (is_digit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                // Sign without digits ("12E+x"): stop here with exponent 0.
                // NOTE(review): the 'E' and sign stay part of the consumed lexeme.
                else state = 19;
                break;
            case 18:
                c = buffer->next();
                if (is_digit(c)) {
                    calculation(ePos, to_digit_10(c));
                    state = 18;
                }
                else state = 19;
                break;
            case 19:
                is_loop = false;
                break;
        }
    }

    // lexemeBegin must point at the first character after the lexeme just
    // found; forward is stepped back one position once the lexeme is done.
    char* tmp = buffer->forward;
    retract();
    buffer->lexemeBegin = tmp;

    if (isDecimal) {
        dVal += val;
        if (isNegative) for (int i = 0; i < ePos; i++) dVal /= 10;
        else for (int i = 0; i < ePos; i++) dVal *= 10;

        return new DoubleToken(dVal);
    }

    if (isNegative) for (int i = 0; i < ePos; i++) val /= 10;
    else for (int i = 0; i < ePos; i++) val *= 10;

    return new IntToken(val);
}
八、保留字和ID的狀態轉換
<~StateTransition.cpp>

// Transition diagram for reserved words and identifiers.
//
// Returns nullptr when the current character cannot start a character
// sequence. Otherwise reads the sequence and looks it up in the reserved-word
// table. A hit returns the token stored in the table, which is the table's
// own pointer and not a fresh allocation. A miss returns a newly allocated
// IdToken.
//
// Change from the previous version: the entry guard now accepts an underscore
// as well as a letter, matching what state 9 of the diagram already allowed.
// Before, that branch could never be reached and an identifier starting with
// '_' was never recognised.
Lexical_Analysis::Token* Lexical_Analysis::StateTransition::getCharacterSequence() {
    const char first = buffer->cur();

    /* Not a character sequence: return without consuming anything. */
    if (!is_letter(first) && !is_underline(first)) return nullptr;

    // State 9: consume the leading letter or underscore checked above.
    buffer->next();

    // State 10: consume letters, digits and underscores. The first character
    // of any other kind ends the lexeme (state 11).
    for (;;) {
        const char c = buffer->next();
        if (!(is_letter(c) || is_digit(c) || is_underline(c))) break;
    }

    // lexemeBegin must point at the first character after the lexeme just
    // found; forward is stepped back one position once the lexeme is done.
    char* tmp = buffer->forward;
    if (!buffer->is_end) {
        // Step back only when the end of input has not been reached.
        retract();
    }
    std::string result = buffer->getString();

    buffer->lexemeBegin = tmp;

    // Reserved word if it is in the table, identifier otherwise.
    auto it = reserveWords.find(result);
    if (it != reserveWords.end()) {
        return it->second;
    }

    return new IdToken(result);
}
九、空格, 製表符, 換行符設定
<~StateTransition.cpp>

// Skips a run of blank, tab and newline characters and then sets lexemeBegin
// to forward. Does nothing, leaving lexemeBegin untouched, when the current
// character is not such a character.
void Lexical_Analysis::StateTransition::getBlankTabNewline() {
    if (!is_blank_tab_newline(buffer->cur())) {
        return;
    }

    for (; is_blank_tab_newline(buffer->cur()); buffer->next()) {
        // all the work happens in the loop header
    }

    buffer->lexemeBegin = buffer->forward;
}
十、呼叫
<~main.cpp>


int main() {

    string fileStr = "/home/yanuas/CompilingPrinciple/LexicalAnalysis/code.src";

    Buffer<1024> buffer(fileStr);
    StateTransition transition(&buffer);

    while (1) {
        Token* token = transition.getRelop();
        if (token != nullptr) transition.print_token(token);

        token = transition.getCharacterSequence();
        if (token != nullptr) transition.print_token(token);

        token = transition.getNumber();
        if (token != nullptr) transition.print_token(token);

        transition.getBlankTabNewline();

        if (transition.buffer->is_end) break;
    }