| 1 | // Copyright (C) 2021 The Qt Company Ltd. |
| 2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 |
| 3 | |
| 4 | #ifndef TOKENIZER_H |
| 5 | #define TOKENIZER_H |
| 6 | |
| 7 | #include "location.h" |
| 8 | |
| 9 | #include <QtCore/qfile.h> |
| 10 | #include <QtCore/qstack.h> |
| 11 | #include <QtCore/qstring.h> |
| 12 | |
| 13 | QT_BEGIN_NAMESPACE |
| 14 | |
/*
  The C++ tokens supported by the tokenizer. The first part of the
  enumeration contains all-purpose tokens (punctuation, literals,
  documentation comments, identifiers); the keywords follow, starting
  at Tok_char.

  Tok_FirstKeyword and Tok_LastKeyword are aliases that delimit the
  contiguous keyword range [Tok_char, Tok_QPrivateSignal].

  If you add a keyword, make sure to modify the keyword array in
  tokenizer.cpp as well, and possibly adjust Tok_FirstKeyword and
  Tok_LastKeyword.
*/
enum {
    // All-purpose tokens.
    Tok_Eoi,
    Tok_Ampersand,
    Tok_Aster,
    Tok_Caret,
    Tok_LeftParen,
    Tok_RightParen,
    Tok_LeftParenAster,
    Tok_Equal,
    Tok_LeftBrace,
    Tok_RightBrace,
    Tok_Semicolon,
    Tok_Colon,
    Tok_LeftAngle,
    Tok_RightAngle,
    Tok_Comma,
    Tok_Ellipsis,
    Tok_Gulbrandsen,
    Tok_LeftBracket,
    Tok_RightBracket,
    Tok_Tilde,
    Tok_SomeOperator,
    Tok_Number,
    Tok_String,
    Tok_Doc,
    Tok_Comment,
    Tok_Ident,
    Tok_At,

    // Keywords; keep this range contiguous.
    Tok_char,
    Tok_class,
    Tok_const,
    Tok_double,
    Tok_int,
    Tok_long,
    Tok_operator,
    Tok_short,
    Tok_signed,
    Tok_typename,
    Tok_unsigned,
    Tok_void,
    Tok_volatile,
    Tok_int64,
    Tok_QPrivateSignal,

    // Bounds of the keyword range above.
    Tok_FirstKeyword = Tok_char,
    Tok_LastKeyword = Tok_QPrivateSignal
};
| 69 | |
| 70 | /* |
| 71 | The Tokenizer class implements lexical analysis of C++ source |
| 72 | files. |
| 73 | |
| 74 | Not every operator or keyword of C++ is recognized; only those |
| 75 | that are interesting to us. Some Qt keywords or macros are also |
| 76 | recognized. |
| 77 | */ |
| 78 | |
| 79 | class Tokenizer |
| 80 | { |
| 81 | public: |
| 82 | Tokenizer(const Location &loc, QByteArray in); |
| 83 | Tokenizer(const Location &loc, QFile &file); |
| 84 | |
| 85 | ~Tokenizer(); |
| 86 | |
| 87 | int getToken(); |
| 88 | void setParsingFnOrMacro(bool macro) { m_parsingMacro = macro; } |
| 89 | |
| 90 | [[nodiscard]] const Location &location() const { return m_tokLoc; } |
| 91 | [[nodiscard]] QString previousLexeme() const; |
| 92 | [[nodiscard]] QString lexeme() const; |
| 93 | [[nodiscard]] QString version() const { return m_version; } |
| 94 | [[nodiscard]] int parenDepth() const { return m_parenDepth; } |
| 95 | [[nodiscard]] int bracketDepth() const { return m_bracketDepth; } |
| 96 | |
| 97 | static void initialize(); |
| 98 | static void terminate(); |
| 99 | static bool isTrue(const QString &condition); |
| 100 | |
| 101 | private: |
| 102 | void init(); |
| 103 | void start(const Location &loc); |
| 104 | /* |
| 105 | Represents the maximum amount of characters that a token can be composed |
| 106 | of. |
| 107 | |
| 108 | When a token with more characters than the maximum amount is encountered, a |
| 109 | warning is issued and parsing continues, discarding all characters from the |
| 110 | currently parsed token that don't fit into the buffer. |
| 111 | */ |
| 112 | enum { yyLexBufSize = 1048576 }; |
| 113 | |
| 114 | int getch() { return m_pos == m_in.size() ? EOF : m_in[m_pos++]; } |
| 115 | |
| 116 | inline int getChar() |
| 117 | { |
| 118 | using namespace Qt::StringLiterals; |
| 119 | |
| 120 | if (m_ch == EOF) |
| 121 | return EOF; |
| 122 | if (m_lexLen < yyLexBufSize - 1) { |
| 123 | m_lex[m_lexLen++] = (char)m_ch; |
| 124 | m_lex[m_lexLen] = '\0'; |
| 125 | } else if (!token_too_long_warning_was_issued) { |
| 126 | location().warning( |
| 127 | message: u"The content is too long.\n"_s , |
| 128 | details: u"The maximum amount of characters for this content is %1.\n"_s .arg(a: yyLexBufSize) + |
| 129 | "Consider splitting it or reducing its size." |
| 130 | ); |
| 131 | |
| 132 | token_too_long_warning_was_issued = true; |
| 133 | } |
| 134 | m_curLoc.advance(ch: QChar(m_ch)); |
| 135 | int ch = getch(); |
| 136 | if (ch == EOF) |
| 137 | return EOF; |
| 138 | // cast explicitly to make sure the value of ch |
| 139 | // is in range [0..255] to avoid assert messages |
| 140 | // when using debug CRT that checks its input. |
| 141 | return int(uint(uchar(ch))); |
| 142 | } |
| 143 | |
| 144 | int getTokenAfterPreprocessor(); |
| 145 | void pushSkipping(bool skip); |
| 146 | bool popSkipping(); |
| 147 | |
| 148 | Location m_tokLoc; |
| 149 | Location m_curLoc; |
| 150 | char *m_lexBuf1 { nullptr }; |
| 151 | char *m_lexBuf2 { nullptr }; |
| 152 | char *m_prevLex { nullptr }; |
| 153 | char *m_lex { nullptr }; |
| 154 | size_t m_lexLen {}; |
| 155 | QStack<bool> m_preprocessorSkipping; |
| 156 | int m_numPreprocessorSkipping {}; |
| 157 | int m_braceDepth {}; |
| 158 | int m_parenDepth {}; |
| 159 | int m_bracketDepth {}; |
| 160 | int m_ch {}; |
| 161 | |
| 162 | QString m_version {}; |
| 163 | bool m_parsingMacro {}; |
| 164 | |
| 165 | // Used to ensure that the warning that is issued when a token is |
| 166 | // too long to fit into our fixed sized buffer is not repeated for each |
| 167 | // character of that token after the last saved one. |
| 168 | // The flag is reset whenever a new token is requested, so as to allow |
| 169 | // reporting all such tokens that are too long during a single execution. |
| 170 | bool token_too_long_warning_was_issued{false}; |
| 171 | |
| 172 | protected: |
| 173 | QByteArray m_in {}; |
| 174 | int m_pos {}; |
| 175 | }; |
| 176 | |
| 177 | QT_END_NAMESPACE |
| 178 | |
| 179 | #endif |
| 180 | |