| 1 | // Copyright (C) 2021 The Qt Company Ltd. |
| 2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 |
| 3 | |
| 4 | #include "tokenizer.h" |
| 5 | |
| 6 | #include "config.h" |
| 7 | #include "generator.h" |
| 8 | |
| 9 | #include <QtCore/qfile.h> |
| 10 | #include <QtCore/qhash.h> |
| 11 | #include <QtCore/qregularexpression.h> |
| 12 | #include <QtCore/qstring.h> |
| 13 | #include <QtCore/qstringconverter.h> |
| 14 | |
| 15 | #include <cctype> |
| 16 | #include <cstring> |
| 17 | #include <utility> |
| 18 | |
| 19 | QT_BEGIN_NAMESPACE |
| 20 | |
| 21 | #define LANGUAGE_CPP "Cpp" |
| 22 | |
| 23 | /* qmake ignore Q_OBJECT */ |
| 24 | |
/*
  Spellings of the keyword tokens. The entry at index i corresponds to
  the token Tok_FirstKeyword + i, so the order here must be kept in
  sync with the token list in tokenizer.h.
*/
static const char *kwords[] = {
    "char", "class", "const", "double", "enum", "explicit", "friend", "inline",
    "int", "long", "namespace", "operator", "private", "protected", "public", "short",
    "signals", "signed", "slots", "static", "struct", "template", "typedef", "typename",
    "union", "unsigned", "using", "virtual", "void", "volatile", "__int64",
    "default", "delete", "final", "override",
    "Q_OBJECT", "Q_OVERRIDE", "Q_PROPERTY", "Q_PRIVATE_PROPERTY",
    "Q_DECLARE_SEQUENTIAL_ITERATOR", "Q_DECLARE_MUTABLE_SEQUENTIAL_ITERATOR",
    "Q_DECLARE_ASSOCIATIVE_ITERATOR", "Q_DECLARE_MUTABLE_ASSOCIATIVE_ITERATOR",
    "Q_DECLARE_FLAGS", "Q_SIGNALS", "Q_SLOTS",
    "QT_COMPAT", "QT_COMPAT_CONSTRUCTOR", "QT_DEPRECATED", "QT_MOC_COMPAT", "QT_MODULE",
    "QT3_SUPPORT", "QT3_SUPPORT_CONSTRUCTOR", "QT3_MOC_SUPPORT",
    "QDOC_PROPERTY", "QPrivateSignal"
};
| 84 | |
| 85 | static const int KwordHashTableSize = 4096; |
| 86 | static int kwordHashTable[KwordHashTableSize]; |
| 87 | |
| 88 | static QHash<QByteArray, bool> *ignoredTokensAndDirectives = nullptr; |
| 89 | |
| 90 | static QRegularExpression * = nullptr; |
| 91 | static QRegularExpression *versionX = nullptr; |
| 92 | static QRegularExpression *definedX = nullptr; |
| 93 | |
| 94 | static QRegularExpression *defines = nullptr; |
| 95 | static QRegularExpression *falsehoods = nullptr; |
| 96 | |
| 97 | static QStringDecoder sourceDecoder; |
| 98 | |
| 99 | /* |
| 100 | This function is a perfect hash function for the 37 keywords of C99 |
| 101 | (with a hash table size of 512). It should perform well on our |
| 102 | Qt-enhanced C++ subset. |
| 103 | */ |
| 104 | static int hashKword(const char *s, int len) |
| 105 | { |
| 106 | return (((uchar)s[0]) + (((uchar)s[2]) << 5) + (((uchar)s[len - 1]) << 3)) % KwordHashTableSize; |
| 107 | } |
| 108 | |
| 109 | static void insertKwordIntoHash(const char *s, int number) |
| 110 | { |
| 111 | int k = hashKword(s, len: int(strlen(s: s))); |
| 112 | while (kwordHashTable[k]) { |
| 113 | if (++k == KwordHashTableSize) |
| 114 | k = 0; |
| 115 | } |
| 116 | kwordHashTable[k] = number; |
| 117 | } |
| 118 | |
| 119 | Tokenizer::Tokenizer(const Location &loc, QFile &in) |
| 120 | { |
| 121 | init(); |
| 122 | m_in = in.readAll(); |
| 123 | m_pos = 0; |
| 124 | start(loc); |
| 125 | } |
| 126 | |
| 127 | Tokenizer::Tokenizer(const Location &loc, QByteArray in) : m_in(std::move(in)) |
| 128 | { |
| 129 | init(); |
| 130 | m_pos = 0; |
| 131 | start(loc); |
| 132 | } |
| 133 | |
| 134 | Tokenizer::~Tokenizer() |
| 135 | { |
| 136 | delete[] m_lexBuf1; |
| 137 | delete[] m_lexBuf2; |
| 138 | } |
| 139 | |
| 140 | int Tokenizer::getToken() |
| 141 | { |
| 142 | token_too_long_warning_was_issued = false; |
| 143 | |
| 144 | char *t = m_prevLex; |
| 145 | m_prevLex = m_lex; |
| 146 | m_lex = t; |
| 147 | |
| 148 | while (m_ch != EOF) { |
| 149 | m_tokLoc = m_curLoc; |
| 150 | m_lexLen = 0; |
| 151 | |
| 152 | if (isspace(m_ch)) { |
| 153 | do { |
| 154 | m_ch = getChar(); |
| 155 | } while (isspace(m_ch)); |
| 156 | } else if (isalpha(m_ch) || m_ch == '_') { |
| 157 | do { |
| 158 | m_ch = getChar(); |
| 159 | } while (isalnum(m_ch) || m_ch == '_'); |
| 160 | |
| 161 | int k = hashKword(s: m_lex, len: int(m_lexLen)); |
| 162 | for (;;) { |
| 163 | int i = kwordHashTable[k]; |
| 164 | if (i == 0) { |
| 165 | return Tok_Ident; |
| 166 | } else if (i == -1) { |
| 167 | if (!m_parsingMacro && ignoredTokensAndDirectives->contains(key: m_lex)) { |
| 168 | if (ignoredTokensAndDirectives->value(key: m_lex)) { // it's a directive |
| 169 | int parenDepth = 0; |
| 170 | while (m_ch != EOF && (m_ch != ')' || parenDepth > 1)) { |
| 171 | if (m_ch == '(') |
| 172 | ++parenDepth; |
| 173 | else if (m_ch == ')') |
| 174 | --parenDepth; |
| 175 | m_ch = getChar(); |
| 176 | } |
| 177 | if (m_ch == ')') |
| 178 | m_ch = getChar(); |
| 179 | } |
| 180 | break; |
| 181 | } |
| 182 | } else if (strcmp(s1: m_lex, s2: kwords[i - 1]) == 0) { |
| 183 | int ret = (int)Tok_FirstKeyword + i - 1; |
| 184 | if (ret != Tok_typename) |
| 185 | return ret; |
| 186 | break; |
| 187 | } |
| 188 | |
| 189 | if (++k == KwordHashTableSize) |
| 190 | k = 0; |
| 191 | } |
| 192 | } else if (isdigit(m_ch)) { |
| 193 | do { |
| 194 | m_ch = getChar(); |
| 195 | } while (isalnum(m_ch) || m_ch == '.' || m_ch == '+' || m_ch == '-'); |
| 196 | return Tok_Number; |
| 197 | } else { |
| 198 | switch (m_ch) { |
| 199 | case '!': |
| 200 | case '%': |
| 201 | m_ch = getChar(); |
| 202 | if (m_ch == '=') |
| 203 | m_ch = getChar(); |
| 204 | return Tok_SomeOperator; |
| 205 | case '"': |
| 206 | m_ch = getChar(); |
| 207 | |
| 208 | while (m_ch != EOF && m_ch != '"') { |
| 209 | if (m_ch == '\\') |
| 210 | m_ch = getChar(); |
| 211 | m_ch = getChar(); |
| 212 | } |
| 213 | m_ch = getChar(); |
| 214 | |
| 215 | if (m_ch == EOF) |
| 216 | m_tokLoc.warning( |
| 217 | QStringLiteral("Unterminated C++ string literal" ), |
| 218 | QStringLiteral("Maybe you forgot '/*!' at the beginning of the file?" )); |
| 219 | else |
| 220 | return Tok_String; |
| 221 | break; |
| 222 | case '#': |
| 223 | return getTokenAfterPreprocessor(); |
| 224 | case '&': |
| 225 | m_ch = getChar(); |
| 226 | /* |
| 227 | Removed check for '&&', only interpret '&=' as an operator. |
| 228 | '&&' is also used for an rvalue reference. QTBUG-32675 |
| 229 | */ |
| 230 | if (m_ch == '=') { |
| 231 | m_ch = getChar(); |
| 232 | return Tok_SomeOperator; |
| 233 | } else { |
| 234 | return Tok_Ampersand; |
| 235 | } |
| 236 | case '\'': |
| 237 | m_ch = getChar(); |
| 238 | /* |
| 239 | Allow empty character literal. QTBUG-25775 |
| 240 | */ |
| 241 | if (m_ch == '\'') { |
| 242 | m_ch = getChar(); |
| 243 | break; |
| 244 | } |
| 245 | if (m_ch == '\\') |
| 246 | m_ch = getChar(); |
| 247 | do { |
| 248 | m_ch = getChar(); |
| 249 | } while (m_ch != EOF && m_ch != '\''); |
| 250 | |
| 251 | if (m_ch == EOF) { |
| 252 | m_tokLoc.warning(QStringLiteral("Unterminated C++ character literal" )); |
| 253 | } else { |
| 254 | m_ch = getChar(); |
| 255 | return Tok_Number; |
| 256 | } |
| 257 | break; |
| 258 | case '(': |
| 259 | m_ch = getChar(); |
| 260 | if (m_numPreprocessorSkipping == 0) |
| 261 | m_parenDepth++; |
| 262 | if (isspace(m_ch)) { |
| 263 | do { |
| 264 | m_ch = getChar(); |
| 265 | } while (isspace(m_ch)); |
| 266 | m_lexLen = 1; |
| 267 | m_lex[1] = '\0'; |
| 268 | } |
| 269 | if (m_ch == '*') { |
| 270 | m_ch = getChar(); |
| 271 | return Tok_LeftParenAster; |
| 272 | } |
| 273 | return Tok_LeftParen; |
| 274 | case ')': |
| 275 | m_ch = getChar(); |
| 276 | if (m_numPreprocessorSkipping == 0) |
| 277 | m_parenDepth--; |
| 278 | return Tok_RightParen; |
| 279 | case '*': |
| 280 | m_ch = getChar(); |
| 281 | if (m_ch == '=') { |
| 282 | m_ch = getChar(); |
| 283 | return Tok_SomeOperator; |
| 284 | } else { |
| 285 | return Tok_Aster; |
| 286 | } |
| 287 | case '^': |
| 288 | m_ch = getChar(); |
| 289 | if (m_ch == '=') { |
| 290 | m_ch = getChar(); |
| 291 | return Tok_SomeOperator; |
| 292 | } else { |
| 293 | return Tok_Caret; |
| 294 | } |
| 295 | case '+': |
| 296 | m_ch = getChar(); |
| 297 | if (m_ch == '+' || m_ch == '=') |
| 298 | m_ch = getChar(); |
| 299 | return Tok_SomeOperator; |
| 300 | case ',': |
| 301 | m_ch = getChar(); |
| 302 | return Tok_Comma; |
| 303 | case '-': |
| 304 | m_ch = getChar(); |
| 305 | if (m_ch == '-' || m_ch == '=') { |
| 306 | m_ch = getChar(); |
| 307 | } else if (m_ch == '>') { |
| 308 | m_ch = getChar(); |
| 309 | if (m_ch == '*') |
| 310 | m_ch = getChar(); |
| 311 | } |
| 312 | return Tok_SomeOperator; |
| 313 | case '.': |
| 314 | m_ch = getChar(); |
| 315 | if (m_ch == '*') { |
| 316 | m_ch = getChar(); |
| 317 | } else if (m_ch == '.') { |
| 318 | do { |
| 319 | m_ch = getChar(); |
| 320 | } while (m_ch == '.'); |
| 321 | return Tok_Ellipsis; |
| 322 | } else if (isdigit(m_ch)) { |
| 323 | do { |
| 324 | m_ch = getChar(); |
| 325 | } while (isalnum(m_ch) || m_ch == '.' || m_ch == '+' || m_ch == '-'); |
| 326 | return Tok_Number; |
| 327 | } |
| 328 | return Tok_SomeOperator; |
| 329 | case '/': |
| 330 | m_ch = getChar(); |
| 331 | if (m_ch == '/') { |
| 332 | do { |
| 333 | m_ch = getChar(); |
| 334 | } while (m_ch != EOF && m_ch != '\n'); |
| 335 | } else if (m_ch == '*') { |
| 336 | bool metDoc = false; // empty doc is no doc |
| 337 | bool metSlashAsterBang = false; |
| 338 | bool metAster = false; |
| 339 | bool metAsterSlash = false; |
| 340 | |
| 341 | m_ch = getChar(); |
| 342 | if (m_ch == '!') |
| 343 | metSlashAsterBang = true; |
| 344 | |
| 345 | while (!metAsterSlash) { |
| 346 | if (m_ch == EOF) { |
| 347 | m_tokLoc.warning(QStringLiteral("Unterminated C++ comment" )); |
| 348 | break; |
| 349 | } else { |
| 350 | if (m_ch == '*') { |
| 351 | metAster = true; |
| 352 | } else if (metAster && m_ch == '/') { |
| 353 | metAsterSlash = true; |
| 354 | } else { |
| 355 | metAster = false; |
| 356 | if (isgraph(m_ch)) |
| 357 | metDoc = true; |
| 358 | } |
| 359 | } |
| 360 | m_ch = getChar(); |
| 361 | } |
| 362 | if (metSlashAsterBang && metDoc) |
| 363 | return Tok_Doc; |
| 364 | else if (m_parenDepth > 0) |
| 365 | return Tok_Comment; |
| 366 | } else { |
| 367 | if (m_ch == '=') |
| 368 | m_ch = getChar(); |
| 369 | return Tok_SomeOperator; |
| 370 | } |
| 371 | break; |
| 372 | case ':': |
| 373 | m_ch = getChar(); |
| 374 | if (m_ch == ':') { |
| 375 | m_ch = getChar(); |
| 376 | return Tok_Gulbrandsen; |
| 377 | } else { |
| 378 | return Tok_Colon; |
| 379 | } |
| 380 | case ';': |
| 381 | m_ch = getChar(); |
| 382 | return Tok_Semicolon; |
| 383 | case '<': |
| 384 | m_ch = getChar(); |
| 385 | if (m_ch == '<') { |
| 386 | m_ch = getChar(); |
| 387 | if (m_ch == '=') |
| 388 | m_ch = getChar(); |
| 389 | return Tok_SomeOperator; |
| 390 | } else if (m_ch == '=') { |
| 391 | m_ch = getChar(); |
| 392 | return Tok_SomeOperator; |
| 393 | } else { |
| 394 | return Tok_LeftAngle; |
| 395 | } |
| 396 | case '=': |
| 397 | m_ch = getChar(); |
| 398 | if (m_ch == '=') { |
| 399 | m_ch = getChar(); |
| 400 | return Tok_SomeOperator; |
| 401 | } else { |
| 402 | return Tok_Equal; |
| 403 | } |
| 404 | case '>': |
| 405 | m_ch = getChar(); |
| 406 | if (m_ch == '>') { |
| 407 | m_ch = getChar(); |
| 408 | if (m_ch == '=') |
| 409 | m_ch = getChar(); |
| 410 | return Tok_SomeOperator; |
| 411 | } else if (m_ch == '=') { |
| 412 | m_ch = getChar(); |
| 413 | return Tok_SomeOperator; |
| 414 | } else { |
| 415 | return Tok_RightAngle; |
| 416 | } |
| 417 | case '?': |
| 418 | m_ch = getChar(); |
| 419 | return Tok_SomeOperator; |
| 420 | case '[': |
| 421 | m_ch = getChar(); |
| 422 | if (m_numPreprocessorSkipping == 0) |
| 423 | m_bracketDepth++; |
| 424 | return Tok_LeftBracket; |
| 425 | case '\\': |
| 426 | m_ch = getChar(); |
| 427 | m_ch = getChar(); // skip one character |
| 428 | break; |
| 429 | case ']': |
| 430 | m_ch = getChar(); |
| 431 | if (m_numPreprocessorSkipping == 0) |
| 432 | m_bracketDepth--; |
| 433 | return Tok_RightBracket; |
| 434 | case '{': |
| 435 | m_ch = getChar(); |
| 436 | if (m_numPreprocessorSkipping == 0) |
| 437 | m_braceDepth++; |
| 438 | return Tok_LeftBrace; |
| 439 | case '}': |
| 440 | m_ch = getChar(); |
| 441 | if (m_numPreprocessorSkipping == 0) |
| 442 | m_braceDepth--; |
| 443 | return Tok_RightBrace; |
| 444 | case '|': |
| 445 | m_ch = getChar(); |
| 446 | if (m_ch == '|' || m_ch == '=') |
| 447 | m_ch = getChar(); |
| 448 | return Tok_SomeOperator; |
| 449 | case '~': |
| 450 | m_ch = getChar(); |
| 451 | return Tok_Tilde; |
| 452 | case '@': |
| 453 | m_ch = getChar(); |
| 454 | return Tok_At; |
| 455 | default: |
| 456 | // ### We should really prevent qdoc from looking at snippet files rather than |
| 457 | // ### suppress warnings when reading them. |
| 458 | if (m_numPreprocessorSkipping == 0 |
| 459 | && !(m_tokLoc.fileName().endsWith(s: ".qdoc" ) |
| 460 | || m_tokLoc.fileName().endsWith(s: ".js" ))) { |
| 461 | m_tokLoc.warning(QStringLiteral("Hostile character 0x%1 in C++ source" ) |
| 462 | .arg(a: (uchar)m_ch, fieldWidth: 1, base: 16)); |
| 463 | } |
| 464 | m_ch = getChar(); |
| 465 | } |
| 466 | } |
| 467 | } |
| 468 | |
| 469 | if (m_preprocessorSkipping.size() > 1) { |
| 470 | m_tokLoc.warning(QStringLiteral("Expected #endif before end of file" )); |
| 471 | // clear it out or we get an infinite loop! |
| 472 | while (!m_preprocessorSkipping.isEmpty()) { |
| 473 | popSkipping(); |
| 474 | } |
| 475 | } |
| 476 | |
| 477 | strcpy(dest: m_lex, src: "end-of-input" ); |
| 478 | m_lexLen = strlen(s: m_lex); |
| 479 | return Tok_Eoi; |
| 480 | } |
| 481 | |
| 482 | void Tokenizer::initialize() |
| 483 | { |
| 484 | Config &config = Config::instance(); |
| 485 | QString versionSym = config.get(CONFIG_VERSIONSYM).asString(); |
| 486 | const QLatin1String defaultEncoding("UTF-8" ); |
| 487 | |
| 488 | QString sourceEncoding = config.get(CONFIG_SOURCEENCODING).asString(defaultString: defaultEncoding); |
| 489 | if (!QStringConverter::encodingForName(name: sourceEncoding.toUtf8().constData())) { |
| 490 | Location().warning(QStringLiteral("Source encoding '%1' not supported, using '%2' as default." ) |
| 491 | .arg(args&: sourceEncoding, args: defaultEncoding)); |
| 492 | sourceEncoding = defaultEncoding; |
| 493 | } |
| 494 | sourceDecoder = QStringDecoder(sourceEncoding.toUtf8().constData()); |
| 495 | Q_ASSERT(sourceDecoder.isValid()); |
| 496 | |
| 497 | comment = new QRegularExpression("/(?:\\*.*\\*/|/.*\n|/[^\n]*$)" , QRegularExpression::InvertedGreedinessOption); |
| 498 | versionX = new QRegularExpression("$cannot possibly match^" ); |
| 499 | if (!versionSym.isEmpty()) |
| 500 | versionX->setPattern("^[ \t]*(?:" + QRegularExpression::escape(str: versionSym) |
| 501 | + ")[ \t]+\"([^\"]*)\"[ \t]*$" ); |
| 502 | definedX = new QRegularExpression("^defined ?\\(?([A-Z_0-9a-z]+) ?\\)?$" ); |
| 503 | |
| 504 | QStringList d{config.get(CONFIG_DEFINES).asStringList()}; |
| 505 | d += "qdoc" ; |
| 506 | defines = new QRegularExpression(QRegularExpression::anchoredPattern(expression: d.join(sep: '|'))); |
| 507 | falsehoods = new QRegularExpression(QRegularExpression::anchoredPattern( |
| 508 | expression: config.get(CONFIG_FALSEHOODS).asStringList().join(sep: '|'))); |
| 509 | |
| 510 | /* |
| 511 | The keyword hash table is always cleared before any words are inserted. |
| 512 | */ |
| 513 | memset(s: kwordHashTable, c: 0, n: sizeof(kwordHashTable)); |
| 514 | for (int i = 0; i < Tok_LastKeyword - Tok_FirstKeyword + 1; i++) |
| 515 | insertKwordIntoHash(s: kwords[i], number: i + 1); |
| 516 | |
| 517 | ignoredTokensAndDirectives = new QHash<QByteArray, bool>; |
| 518 | |
| 519 | const QStringList tokens{config.get(LANGUAGE_CPP |
| 520 | + Config::dot |
| 521 | + CONFIG_IGNORETOKENS).asStringList()}; |
| 522 | for (const auto &token : tokens) { |
| 523 | const QByteArray tb = token.toLatin1(); |
| 524 | ignoredTokensAndDirectives->insert(key: tb, value: false); |
| 525 | insertKwordIntoHash(s: tb.data(), number: -1); |
| 526 | } |
| 527 | |
| 528 | const QStringList directives{config.get(LANGUAGE_CPP |
| 529 | + Config::dot |
| 530 | + CONFIG_IGNOREDIRECTIVES).asStringList()}; |
| 531 | for (const auto &directive : directives) { |
| 532 | const QByteArray db = directive.toLatin1(); |
| 533 | ignoredTokensAndDirectives->insert(key: db, value: true); |
| 534 | insertKwordIntoHash(s: db.data(), number: -1); |
| 535 | } |
| 536 | } |
| 537 | |
| 538 | /*! |
| 539 | The heap allocated variables are freed here. The keyword |
| 540 | hash table is not cleared here, but it is cleared in the |
| 541 | initialize() function, before any keywords are inserted. |
| 542 | */ |
| 543 | void Tokenizer::terminate() |
| 544 | { |
| 545 | delete comment; |
| 546 | comment = nullptr; |
| 547 | delete versionX; |
| 548 | versionX = nullptr; |
| 549 | delete definedX; |
| 550 | definedX = nullptr; |
| 551 | delete defines; |
| 552 | defines = nullptr; |
| 553 | delete falsehoods; |
| 554 | falsehoods = nullptr; |
| 555 | delete ignoredTokensAndDirectives; |
| 556 | ignoredTokensAndDirectives = nullptr; |
| 557 | } |
| 558 | |
| 559 | void Tokenizer::init() |
| 560 | { |
| 561 | m_lexBuf1 = new char[(int)yyLexBufSize]; |
| 562 | m_lexBuf2 = new char[(int)yyLexBufSize]; |
| 563 | m_prevLex = m_lexBuf1; |
| 564 | m_prevLex[0] = '\0'; |
| 565 | m_lex = m_lexBuf2; |
| 566 | m_lex[0] = '\0'; |
| 567 | m_lexLen = 0; |
| 568 | m_preprocessorSkipping.push(t: false); |
| 569 | m_numPreprocessorSkipping = 0; |
| 570 | m_braceDepth = 0; |
| 571 | m_parenDepth = 0; |
| 572 | m_bracketDepth = 0; |
| 573 | m_ch = '\0'; |
| 574 | m_parsingMacro = false; |
| 575 | } |
| 576 | |
| 577 | void Tokenizer::start(const Location &loc) |
| 578 | { |
| 579 | m_tokLoc = loc; |
| 580 | m_curLoc = loc; |
| 581 | m_curLoc.start(); |
| 582 | strcpy(dest: m_prevLex, src: "beginning-of-input" ); |
| 583 | strcpy(dest: m_lex, src: "beginning-of-input" ); |
| 584 | m_lexLen = strlen(s: m_lex); |
| 585 | m_braceDepth = 0; |
| 586 | m_parenDepth = 0; |
| 587 | m_bracketDepth = 0; |
| 588 | m_ch = '\0'; |
| 589 | m_ch = getChar(); |
| 590 | } |
| 591 | |
| 592 | /* |
| 593 | Returns the next token, if # was met. This function interprets the |
| 594 | preprocessor directive, skips over any #ifdef'd out tokens, and returns the |
| 595 | token after all of that. |
| 596 | */ |
| 597 | int Tokenizer::getTokenAfterPreprocessor() |
| 598 | { |
| 599 | m_ch = getChar(); |
| 600 | while (isspace(m_ch) && m_ch != '\n') |
| 601 | m_ch = getChar(); |
| 602 | |
| 603 | /* |
| 604 | #directive condition |
| 605 | */ |
| 606 | QString directive; |
| 607 | QString condition; |
| 608 | |
| 609 | while (isalpha(m_ch)) { |
| 610 | directive += QChar(m_ch); |
| 611 | m_ch = getChar(); |
| 612 | } |
| 613 | if (!directive.isEmpty()) { |
| 614 | while (m_ch != EOF && m_ch != '\n') { |
| 615 | if (m_ch == '\\') { |
| 616 | m_ch = getChar(); |
| 617 | if (m_ch == '\r') |
| 618 | m_ch = getChar(); |
| 619 | } |
| 620 | condition += QChar(m_ch); |
| 621 | m_ch = getChar(); |
| 622 | } |
| 623 | condition.remove(re: *comment); |
| 624 | condition = condition.simplified(); |
| 625 | |
| 626 | /* |
| 627 | The #if, #ifdef, #ifndef, #elif, #else, and #endif |
| 628 | directives have an effect on the skipping stack. For |
| 629 | instance, if the code processed so far is |
| 630 | |
| 631 | #if 1 |
| 632 | #if 0 |
| 633 | #if 1 |
| 634 | // ... |
| 635 | #else |
| 636 | |
| 637 | the skipping stack contains, from bottom to top, false true |
| 638 | true (assuming 0 is false and 1 is true). If at least one |
| 639 | entry of the stack is true, the tokens are skipped. |
| 640 | |
| 641 | This mechanism is simple yet hard to understand. |
| 642 | */ |
| 643 | if (directive[0] == QChar('i')) { |
| 644 | if (directive == QString("if" )) |
| 645 | pushSkipping(skip: !isTrue(condition)); |
| 646 | else if (directive == QString("ifdef" )) |
| 647 | pushSkipping(skip: !defines->match(subject: condition).hasMatch()); |
| 648 | else if (directive == QString("ifndef" )) |
| 649 | pushSkipping(skip: defines->match(subject: condition).hasMatch()); |
| 650 | } else if (directive[0] == QChar('e')) { |
| 651 | if (directive == QString("elif" )) { |
| 652 | bool old = popSkipping(); |
| 653 | if (old) |
| 654 | pushSkipping(skip: !isTrue(condition)); |
| 655 | else |
| 656 | pushSkipping(skip: true); |
| 657 | } else if (directive == QString("else" )) { |
| 658 | pushSkipping(skip: !popSkipping()); |
| 659 | } else if (directive == QString("endif" )) { |
| 660 | popSkipping(); |
| 661 | } |
| 662 | } else if (directive == QString("define" )) { |
| 663 | auto match = versionX->match(subject: condition); |
| 664 | if (match.hasMatch()) |
| 665 | m_version = match.captured(nth: 1); |
| 666 | } |
| 667 | } |
| 668 | |
| 669 | int tok; |
| 670 | do { |
| 671 | /* |
| 672 | We set yyLex now, and after getToken() this will be |
| 673 | yyPrevLex. This way, we skip over the preprocessor |
| 674 | directive. |
| 675 | */ |
| 676 | qstrcpy(dst: m_lex, src: m_prevLex); |
| 677 | |
| 678 | /* |
| 679 | If getToken() meets another #, it will call |
| 680 | getTokenAfterPreprocessor() once again, which could in turn |
| 681 | call getToken() again, etc. Unless there are 10,000 or so |
| 682 | preprocessor directives in a row, this shouldn't overflow |
| 683 | the stack. |
| 684 | */ |
| 685 | tok = getToken(); |
| 686 | } while (m_numPreprocessorSkipping > 0 && tok != Tok_Eoi); |
| 687 | return tok; |
| 688 | } |
| 689 | |
| 690 | /* |
| 691 | Pushes a new skipping value onto the stack. This corresponds to entering a |
| 692 | new #if block. |
| 693 | */ |
| 694 | void Tokenizer::pushSkipping(bool skip) |
| 695 | { |
| 696 | m_preprocessorSkipping.push(t: skip); |
| 697 | if (skip) |
| 698 | m_numPreprocessorSkipping++; |
| 699 | } |
| 700 | |
| 701 | /* |
| 702 | Pops a skipping value from the stack. This corresponds to reaching a #endif. |
| 703 | */ |
| 704 | bool Tokenizer::popSkipping() |
| 705 | { |
| 706 | if (m_preprocessorSkipping.isEmpty()) { |
| 707 | m_tokLoc.warning(QStringLiteral("Unexpected #elif, #else or #endif" )); |
| 708 | return true; |
| 709 | } |
| 710 | |
| 711 | bool skip = m_preprocessorSkipping.pop(); |
| 712 | if (skip) |
| 713 | m_numPreprocessorSkipping--; |
| 714 | return skip; |
| 715 | } |
| 716 | |
| 717 | /* |
| 718 | Returns \c true if the condition evaluates as true, otherwise false. The |
| 719 | condition is represented by a string. Unsophisticated parsing techniques are |
| 720 | used. The preprocessing method could be named StriNg-Oriented PreProcessing, |
| 721 | as SNOBOL stands for StriNg-Oriented symBOlic Language. |
| 722 | */ |
| 723 | bool Tokenizer::isTrue(const QString &condition) |
| 724 | { |
| 725 | int firstOr = -1; |
| 726 | int firstAnd = -1; |
| 727 | int parenDepth = 0; |
| 728 | |
| 729 | /* |
| 730 | Find the first logical operator at top level, but be careful |
| 731 | about precedence. Examples: |
| 732 | |
| 733 | X || Y // the or |
| 734 | X || Y || Z // the leftmost or |
| 735 | X || Y && Z // the or |
| 736 | X && Y || Z // the or |
| 737 | (X || Y) && Z // the and |
| 738 | */ |
| 739 | for (int i = 0; i < condition.size() - 1; i++) { |
| 740 | QChar ch = condition[i]; |
| 741 | if (ch == QChar('(')) { |
| 742 | parenDepth++; |
| 743 | } else if (ch == QChar(')')) { |
| 744 | parenDepth--; |
| 745 | } else if (parenDepth == 0) { |
| 746 | if (condition[i + 1] == ch) { |
| 747 | if (ch == QChar('|')) { |
| 748 | firstOr = i; |
| 749 | break; |
| 750 | } else if (ch == QChar('&')) { |
| 751 | if (firstAnd == -1) |
| 752 | firstAnd = i; |
| 753 | } |
| 754 | } |
| 755 | } |
| 756 | } |
| 757 | if (firstOr != -1) |
| 758 | return isTrue(condition: condition.left(n: firstOr)) || isTrue(condition: condition.mid(position: firstOr + 2)); |
| 759 | if (firstAnd != -1) |
| 760 | return isTrue(condition: condition.left(n: firstAnd)) && isTrue(condition: condition.mid(position: firstAnd + 2)); |
| 761 | |
| 762 | QString t = condition.simplified(); |
| 763 | if (t.isEmpty()) |
| 764 | return true; |
| 765 | |
| 766 | if (t[0] == QChar('!')) |
| 767 | return !isTrue(condition: t.mid(position: 1)); |
| 768 | if (t[0] == QChar('(') && t.endsWith(c: QChar(')'))) |
| 769 | return isTrue(condition: t.mid(position: 1, n: t.size() - 2)); |
| 770 | |
| 771 | auto match = definedX->match(subject: t); |
| 772 | if (match.hasMatch()) |
| 773 | return defines->match(subject: match.captured(nth: 1)).hasMatch(); |
| 774 | else |
| 775 | return !falsehoods->match(subject: t).hasMatch(); |
| 776 | } |
| 777 | |
| 778 | QString Tokenizer::lexeme() const |
| 779 | { |
| 780 | return sourceDecoder(m_lex); |
| 781 | } |
| 782 | |
| 783 | QString Tokenizer::previousLexeme() const |
| 784 | { |
| 785 | return sourceDecoder(m_prevLex); |
| 786 | } |
| 787 | |
| 788 | QT_END_NAMESPACE |
| 789 | |