[转载水木清华]C++的词法分析器

本文介绍了一个词法分析器的设计与实现过程,详细解释了如何从输入流中识别不同类型的符号,如关键字、标识符、数字等,并通过状态机的方式过滤掉空白字符和注释,最终将识别出的符号作为令牌返回。
部署运行你感兴趣的模型镜像

作者:shellcracker (拆弹专家)

[lexical_analyzer.h]

#ifndef LEXICAL_ANALYZER_H
#define LEXICAL_ANALYZER_H

// Result codes returned by lexical_analyzer::get_token().
enum lexical_status
{
    status_success,         // a token was produced and stored in the output argument
    status_eof,             // the input stream reported end-of-file; no token was produced
    status_invalid_char,    // a character that starts no known token was read
    status_unknown = -1,    // not referenced by the analyzer code in this file
};

// Classification assigned to token::category by lexical_analyzer::get_token().
enum token_category
{
    token_error,        // unrecognized input
    token_keyword,      // identifier found in the keyword table (see is_keyword)
    token_identifier,   // name made of letters, digits, '_' and '$'
    token_number,       // decimal or hexadecimal integer literal
    token_char,         // character literal (text between single quotes)
    token_string,       // string literal (text between double quotes)
    token_operator,     // operator such as + -> <<= ... ( ) [ ]
    token_punctuator,   // one of ; { } , #
};

// One lexical unit produced by lexical_analyzer::get_token().
struct token
{

    token_category category;    // kind of token
    std::string value;          // token text; for string/char literals the decoded contents without the quotes
};

// Splits a character stream into C++-like tokens, one per get_token() call,
// while tracking the current line number.
class lexical_analyzer
{
public:
    // Binds the analyzer to `ifs`. Only a pointer to the stream is stored, so
    // the stream must stay alive for as long as the analyzer is used.
    explicit lexical_analyzer(std::istream& ifs);
    // Reads the next token into `t` and returns a lexical_status code.
    lexical_status get_token(token& t);
    // Returns the line counter (starts at 1).
    int current_line() const;
private:
    // Skips a run of whitespace; returns true if anything was skipped.
    bool filter_space();
    // Skips one // or /* */ comment; returns true if a comment was skipped.
    bool filter_comment();
    // Collects consecutive hexadecimal digits from the stream.
    std::string get_hex_string();
    // Collects consecutive decimal digits from the stream.
    std::string get_digital_string();
    // Reads literal contents up to `delimiter`, decoding escape sequences.
    std::string get_string(char delimiter);
    // Returns true if `str` appears in the built-in keyword table.
    bool is_keyword(const std::string& str);
    // Reads one character and updates the line counter.
    int get_char();
    // Returns the next character without consuming it.
    int peek_char();
    // Pushes `ch` back onto the stream and updates the line counter.
    void putback(char ch);
    // Consumes one character and discards it.
    void skip_char();
private:
    std::istream* m_pstream;    // input source (not owned)
    int m_line;                 // current line number, initialised to 1
};

// Stores the address of `ifs` and starts line counting at 1.
inline lexical_analyzer::lexical_analyzer(std::istream& ifs) 
    :m_pstream(&ifs), m_line(1)
{
}

// Returns the current value of the line counter.
inline int lexical_analyzer::current_line() const
{
    return m_line;
}

// Returns the result of peek() on the underlying stream: the next character
// is reported but left in the stream.
inline int lexical_analyzer::peek_char()
{
    return m_pstream->peek();
}

// Consumes one character via get_char() (so line counting still happens)
// and discards it.
inline void lexical_analyzer::skip_char()
{
    get_char();
}

#endif//LEXICAL_ANALYZER_H




[lexical_analyzer.cpp]


#include <cctype>
#include <fstream>
#include <string>
#include "lexical_analyzer.h"

int lexical_analyzer::get_char()
{
    int ch = m_pstream->get();
    if (ch=='/n')
        ++m_line;
    return ch;
}

void lexical_analyzer::putback(char ch)
{
    if (ch=='/n')
        --m_line;
    m_pstream->putback(ch);
}

bool lexical_analyzer::filter_space()
{
    bool result = false;
    char ch = peek_char();
    if (isspace(ch))
    {
        do
        {
            skip_char();
            ch = peek_char();
        }
        while (isspace(ch));
        return true;
    }
    return false;
}

bool lexical_analyzer::filter_comment()
{
    if(peek_char()=='/')
    {
        skip_char();
        char ch = get_char();
        if(ch=='/')
        {
            while(peek_char()!='/n')
            {
                skip_char();
            }
        }
        else if(ch=='*')
        {
            for(;;)
            {
                if(get_char()=='*' && get_char()=='/')
                    break;
            }
        }
        else
        {
            putback('/');
            return false;
        }
        return true;
    }
    else
    {
        return false;
    }
}

bool lexical_analyzer::is_keyword(const std::string& str)
{
    static const char* const keywords[]=
    {
        "asm",      "auto",         "bad_cast",     "bad_typeid", 
        "bool",     "break",        "case",         "catch", 
        "char",     "class",        "const",        "const_cast", 
        "continue", "default",      "delete",       "do", 
        "double",   "dynamic_cast", "else",         "enum", 
        "except",   "explicit",     "extern",       "false", 
        "finally",  "float",        "for",          "friend", 
        "goto",     "if",           "inline",       "int", 
        "long",     "mutable",      "namespace",    "new", 
        "operator", "private",      "protected",    "public", 
        "register", "reinterpret_cast",     "return",   "short", 
        "signed",   "sizeof",       "static",       "static_cast", 
        "struct",   "switch",       "template",     "this", 
        "throw",    "true",         "try",          "typedef",
        "typeid",   "typename",     "union",        "unsigned",
        "using",    "virtual",      "void",         "volatile", 
        "while",  
    };

    for(int i=0; i<sizeof(keywords)/sizeof(keywords[0]); i++)
    {
        if(str.compare(keywords[i])==0)
            return true;
    }

    return false;
}

std::string lexical_analyzer::get_string(char delimiter)
{
    std::string result;
    for(;;)
    {
        char ch = get_char();
        if(ch==delimiter)
            break;
        else if(ch=='//')
        {
            ch = get_char();
            switch(ch)
            {
            case '/"':
                ch = '/"';
                break;
            case '/'':
                ch = '/'';
                break;
            case 'r':
                ch = '/r';
                break;

            case 'n':
                ch = '/n';
                break;
            case 'v':
                ch = '/v';
                break;
            case 't':
                ch = '/t';
                break;
            case 'a':
                ch = '/a';
                break;
            case 'b':
                ch = '/b';
                break;
            case 'f':
                ch = '/f';
                break;
            case '/r':                          // line splice
            case '/n':
                continue;
                break;
            default:
                break;
            }

            if(ch=='x' || ch=='X')
            {
                std::string s = get_hex_string();
                int x = 0;
                for(int i=0; i<s.length(); i++)
                {
                    x *= 16;
                    if(s[i]>='A' && s[i]<='F')
                        x += s[i]-'A' + 10;
                    else if(s[i]>='a' && s[i]<='f')
                        x += s[i]-'a' + 10;
                    else
                        x += s[i]-'0';
                }
                ch = (char)x;
            }

        }

        result += ch;
    }

    return result;
}

std::string lexical_analyzer::get_digital_string()
{
    std::string result;
    char ch;
    while(isdigit(ch=get_char()))
    {
        result += ch;
    }
    putback(ch);

    return result;
}

std::string lexical_analyzer::get_hex_string()
{
    std::string result;
    char ch;
    while(isxdigit(ch=get_char()))
    {
        result += ch;
    }
    putback(ch);

    return result;
}

lexical_status lexical_analyzer::get_token(token& t)
{
    if(m_pstream->eof())
        return status_eof;


    while(filter_space() || filter_comment())
    {
    }
    while(filter_comment() || filter_space())
    {
    }

    if(m_pstream->eof())
        return status_eof;

    t.value.resize(0);

    char ch = get_char();
    if(ch=='_' || isalpha(ch) || isdigit(ch) || ch=='$')
    {
        t.category = token_identifier;
        do
        {
            t.value += ch;
            ch = get_char();
        }while(ch=='_' || isalpha(ch) || isdigit(ch) || ch=='$');
        putback(ch);
    }
    else if(isdigit(ch))
    {
        t.category = token_number;
        t.value += ch;
        ch = get_char();
        if(ch=='x' || ch=='X')
        {
            t.value += ch;
            t.value += get_hex_string();
        }
        else if(isdigit(ch))
        {
            t.value += ch;
            t.value += get_digital_string();
        }
    }
    else if(ch=='/"')
    {
        t.category = token_string;
        t.value = get_string('/"');
    }
    else if(ch=='/'')
    {
        t.category = token_char;
        t.value = get_string('/'');
    }
    else 
    {
        t.category = token_operator;
        if(ch=='=' || ch=='&' || ch=='|' || ch==':')
        {
            t.value = ch;
            if(peek_char()==ch)
            {
                t.value += ch;
                skip_char();
            }
        }
        else if(ch=='+' || ch=='-')
        {
            t.value = ch;
            char cc = get_char();
            if(cc==ch)
            {
                t.value += ch;
            }
            else if(cc=='=')
            {
                t.value += '=';
            }
            else if(ch=='-' && cc=='>')
            {
                t.value += '>';                         // ->
                cc = peek_char();
                if(cc=='*')
                {
                    skip_char();

                    t.value += '*';                     // ->*
                }
            }
            else
            {
                putback(cc);
            }
        }
        else if(ch=='*' || ch=='/' || ch=='%' || ch=='^' || ch=='!')
        {

            t.value = ch;
            ch = peek_char();
            if(ch=='=')
            {
                t.value+='=';
                skip_char();
            }
        }
        else if(ch=='<' || ch=='>')
        {
            t.value = ch;
            char cc = get_char();
            if(ch==cc)                              // << >>
            {
                t.value += cc;
                cc = peek_char();
                if(cc=='=')                         // <<= >>=
                {
                    skip_char();
                    t.value += '=';
                }
            }
            else if(cc=='=')
            {
                t.value += '=';
            }
            else
            {
                putback(cc);
            }
        }
        else if(ch=='.')
        {
            t.value = '.';                          // .
            ch = get_char();
            if(ch=='*')
            {
                t.value += '*';                     // .*
            }
            else if(ch=='.')
            {
                char cc = get_char();
                if(cc=='.')                         // ...
                {
                    t.value += "..";
                }
                else
                {
                    putback(cc);
                    putback(ch);
                }
            }
            else
            {
                putback(ch);
            }
        }
        else if(ch=='~' || ch =='?' ||
            ch=='[' || ch==']' ||
            ch=='(' || ch==')'
            )
        {
            t.value = ch;
        }
        else if(ch==';' || ch=='{'|| ch=='}'|| ch==','|| ch=='#' )
        {
            t.category = token_punctuator;
            t.value = ch;
        }
        else if(ch=='//')
        {
            ch = peek_char();
            if(ch=='/r' || ch=='/n')
            {
                skip_char();
            }
            else
            {
                t.category = token_error;
                t.value = ch;
            }
        }
        else
        {
            t.category = token_error;
            t.value = ch;
            return status_invalid_char;
        }
    }

    if(t.category == token_identifier && is_keyword(t.value))
    {
        t.category = token_keyword;
    }

    return status_success;
}



[main.cpp], 测试程序
#include <fstream>
#include <string>
#include <iostream>

#include "lexical_analyzer.h"

int main()
{
    std::ifstream ifs("D://ThreadFuncs.cpp", std::ios::in | std::ios::binary);
    lexical_analyzer lex(std::cin);
    //lexical_analyzer lex(ifs);
    std::ofstream ofs("D://out.cpp");
    //std::ostream& os = ofs;
    std::ostream& os = std::cout;
    token t;
    lexical_status status;
    while((status=lex.get_token(t))!=status_eof)
    {
        if(status==status_success)
            os << t.value << '/n';
        else if(status==status_invalid_char)
            std::cerr << "Line:" << lex.current_line() << "invalid_char: " << t.value << '/n';
    }
    return 0;
}

您可能感兴趣的与本文相关的镜像

词法分析// TranslationDlg.cpp : 实现文件 // #include "stdafx.h" #include "Translation.h" #include "TranslationDlg.h" #ifdef _DEBUG #define new DEBUG_NEW #endif // 用于应用程序“关于”菜单项的 CAboutDlg 对话框 class CAboutDlg : public CDialog { public: CAboutDlg(); // 对话框数据 enum { IDD = IDD_ABOUTBOX }; protected: virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV 支持 // 实现 protected: DECLARE_MESSAGE_MAP() }; CAboutDlg::CAboutDlg() : CDialog(CAboutDlg::IDD) { } void CAboutDlg::DoDataExchange(CDataExchange* pDX) { CDialog::DoDataExchange(pDX); } BEGIN_MESSAGE_MAP(CAboutDlg, CDialog) END_MESSAGE_MAP() // CTranslationDlg 对话框 CTranslationDlg::CTranslationDlg(CWnd* pParent /*=NULL*/) : CDialog(CTranslationDlg::IDD, pParent) { m_hIcon = AfxGetApp()->LoadIcon(IDR_MAINFRAME); } void CTranslationDlg::DoDataExchange(CDataExchange* pDX) { CDialog::DoDataExchange(pDX); DDX_Control(pDX, IDC_EDIT2, content); DDX_Control(pDX, IDC_EDIT1, result); } BEGIN_MESSAGE_MAP(CTranslationDlg, CDialog) ON_WM_SYSCOMMAND() ON_WM_PAINT() ON_WM_QUERYDRAGICON() //}}AFX_MSG_MAP ON_BN_CLICKED(IDC_BUTTON1, &CTranslationDlg::OnBnClickedButton1) END_MESSAGE_MAP() // CTranslationDlg 消息处理程序 BOOL CTranslationDlg::OnInitDialog() { CDialog::OnInitDialog(); // 将“关于...”菜单项添加到系统菜单中。 // IDM_ABOUTBOX 必须在系统命令范围内。 ASSERT((IDM_ABOUTBOX & 0xFFF0) == IDM_ABOUTBOX); ASSERT(IDM_ABOUTBOX < 0xF000); CMenu* pSysMenu = GetSystemMenu(FALSE); if (pSysMenu != NULL) { CString strAboutMenu; strAboutMenu.LoadString(IDS_ABOUTBOX); if (!strAboutMenu.IsEmpty()) { pSysMenu->AppendMenu(MF_SEPARATOR); pSysMenu->AppendMenu(MF_STRING, IDM_ABOUTBOX, strAboutMenu); } } // 设置此对话框的图标。当应用程序主窗口不是对话框时,框架将自动 // 执行此操作 SetIcon(m_hIcon, TRUE); // 设置大图标 SetIcon(m_hIcon, FALSE); // 设置小图标 // TODO: 在此添加额外的初始化代码 CAboutDlg dlgAbout; dlgAbout.DoModal(); return TRUE; // 除非将焦点设置到控件,否则返回 TRUE } void CTranslationDlg::OnSysCommand(UINT nID, LPARAM lParam) { if ((nID & 0xFFF0) == IDM_ABOUTBOX) { CAboutDlg dlgAbout; dlgAbout.DoModal(); } else { CDialog::OnSysCommand(nID, 
lParam); } } // 如果向对话框添加最小化按钮,则需要下面的代码 // 来绘制该图标。对于使用文档/视图模型的 MFC 应用程序, // 这将由框架自动完成。 void CTranslationDlg::OnPaint() { if (IsIconic()) { CPaintDC dc(this); // 用于绘制的设备上下文 SendMessage(WM_ICONERASEBKGND, reinterpret_cast<WPARAM>(dc.GetSafeHdc()), 0); // 使图标在工作区矩形中居中 int cxIcon = GetSystemMetrics(SM_CXICON); int cyIcon = GetSystemMetrics(SM_CYICON); CRect rect; GetClientRect(&rect); int x = (rect.Width() - cxIcon + 1) / 2; int y = (rect.Height() - cyIcon + 1) / 2; // 绘制图标 dc.DrawIcon(x, y, m_hIcon); } else { CDialog::OnPaint(); } } //当用户拖动最小化窗口时系统调用此函数取得光标 //显示。 HCURSOR CTranslationDlg::OnQueryDragIcon() { return static_cast<HCURSOR>(m_hIcon); } void CTranslationDlg::SplideFrontSpc (CString &str) { int i = 0; for(i;str[i]==' ';) {str.Delete (0);}//MessageBox (_T("ok")); } void CTranslationDlg::OnBnClickedButton1() { // TODO: 在此添加控件通知处理程序代码 CString code ,temp ,output; //CString word[8] = ; CString flag[3][10] = {{_T("if"),_T("int"),_T("for"),_T("while"),_T("do"),_T("return"),_T("break"),_T("continue")}, {_T("+"),_T("-"),_T("*"),_T("/"),_T("="),_T(">"),_T("<"),_T("<="),_T(">="),_T("!=")}, { _T(","),_T(";"),_T("{"),_T("}"),_T("("),_T(")")}}; content.GetWindowTextW (code); code.Replace (_T("\r\n"),_T("")); code.Append (_T(" ")); while(!code.IsEmpty ()) { temp.Empty (); int i,j; int isfind = 0;//是否找到,找到了代表其类型 SplideFrontSpc(code); //temp = code.Left (code.Find (_T(","))); //code.Delete (0,2);//temp = code[0]; //截取下一个单词 temp = code.Left (code.Find (_T(" "))); code.Delete (0,code.Find (_T(" "))); //对比单词类型 for(i=0;!isfind&&i<3;i++) for(j=0;j<10;j++) if(temp == flag[i][j]) { if(i==0) isfind = 1; else if(i==1) isfind = 4; else isfind = 5; break; } if(isfind==0) { int isnum = temp[0]-'0'; if(isnum>-1||isnum<10) isfind = 3; else isfind = 2; } CString cnum; cnum.Format (_T("%d"),isfind); if(!temp.IsEmpty ()) temp = _T("(") + cnum + _T(", \"") + temp + _T("\")"); else temp = _T("词法分析完毕!\r\n"); output = output + temp + _T("\r\n"); } //output = _T("asdjfk\r\naksdjfl\nasldkfj\n"); 
result.SetWindowTextW (output); } BOOL CTranslationDlg::PreTranslateMessage(MSG* pMsg) { // TODO: 在此添加专用代码和/或调用基类 return CDialog::PreTranslateMessage(pMsg); } void CTranslationDlg::OnOK() { // TODO: 在此添加专用代码和/或调用基类 CDialog::OnOK(); }
评论 2
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值