Lexer.cpp 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491
  1. //===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file was developed by Chris Lattner and is distributed under
  6. // the University of Illinois Open Source License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements the Lexer and LexerToken interfaces.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // TODO: GCC Diagnostics emitted by the lexer:
  15. // PEDWARN: (form feed|vertical tab) in preprocessing directive
  16. //
  17. // Universal characters, unicode, char mapping:
  18. // WARNING: `%.*s' is not in NFKC
  19. // WARNING: `%.*s' is not in NFC
  20. //
  21. // Other:
  22. // TODO: Options to support:
  23. // -fexec-charset,-fwide-exec-charset
  24. //
  25. //===----------------------------------------------------------------------===//
  26. #include "clang/Lex/Lexer.h"
  27. #include "clang/Lex/Preprocessor.h"
  28. #include "clang/Basic/Diagnostic.h"
  29. #include "clang/Basic/SourceLocation.h"
  30. #include "llvm/Support/MemoryBuffer.h"
  31. #include <cctype>
  32. using namespace clang;
  33. static void InitCharacterInfo();
/// Lexer constructor - Create a lexer for the given memory buffer.  If
/// BufStart/BufEnd are null, the whole buffer (as reported by File) is lexed;
/// otherwise lexing is restricted to the [BufStart, BufEnd) sub-range.
Lexer::Lexer(const llvm::MemoryBuffer *File, unsigned fileid, Preprocessor &pp,
             const char *BufStart, const char *BufEnd)
  : BufferEnd(BufEnd ? BufEnd : File->getBufferEnd()),
    InputFile(File), CurFileID(fileid), PP(pp), Features(PP.getLangOptions()) {
  Is_PragmaLexer = false;
  IsMainFile = false;
  InitCharacterInfo();

  // The fast scanning loops rely on a NUL sentinel at BufferEnd so they can
  // detect end-of-buffer with a single character compare.
  assert(BufferEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  BufferPtr = BufStart ? BufStart : File->getBufferStart();

  // Start of the file is a start of line.
  IsAtStartOfLine = true;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to keeping comments if requested.
  KeepCommentMode = PP.getCommentRetentionState();
}
  59. /// Stringify - Convert the specified string into a C string, with surrounding
  60. /// ""'s, and with escaped \ and " characters.
  61. std::string Lexer::Stringify(const std::string &Str, bool Charify) {
  62. std::string Result = Str;
  63. char Quote = Charify ? '\'' : '"';
  64. for (unsigned i = 0, e = Result.size(); i != e; ++i) {
  65. if (Result[i] == '\\' || Result[i] == Quote) {
  66. Result.insert(Result.begin()+i, '\\');
  67. ++i; ++e;
  68. }
  69. }
  70. return Result;
  71. }
  72. //===----------------------------------------------------------------------===//
  73. // Character information.
  74. //===----------------------------------------------------------------------===//
// CharInfo - Per-byte classification table; each entry is a bitmask of the
// CHAR_* flags below.  Filled in lazily by InitCharacterInfo().
static unsigned char CharInfo[256];

// Classification flag bits stored in CharInfo.
enum {
  CHAR_HORZ_WS = 0x01,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS = 0x02,  // '\r', '\n'
  CHAR_LETTER  = 0x04,  // a-z,A-Z
  CHAR_NUMBER  = 0x08,  // 0-9
  CHAR_UNDER   = 0x10,  // _
  CHAR_PERIOD  = 0x20   // .
};
  84. static void InitCharacterInfo() {
  85. static bool isInited = false;
  86. if (isInited) return;
  87. isInited = true;
  88. // Intiialize the CharInfo table.
  89. // TODO: statically initialize this.
  90. CharInfo[(int)' '] = CharInfo[(int)'\t'] =
  91. CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
  92. CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
  93. CharInfo[(int)'_'] = CHAR_UNDER;
  94. CharInfo[(int)'.'] = CHAR_PERIOD;
  95. for (unsigned i = 'a'; i <= 'z'; ++i)
  96. CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
  97. for (unsigned i = '0'; i <= '9'; ++i)
  98. CharInfo[i] = CHAR_NUMBER;
  99. }
  100. /// isIdentifierBody - Return true if this is the body character of an
  101. /// identifier, which is [a-zA-Z0-9_].
  102. static inline bool isIdentifierBody(unsigned char c) {
  103. return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER);
  104. }
  105. /// isHorizontalWhitespace - Return true if this character is horizontal
  106. /// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'.
  107. static inline bool isHorizontalWhitespace(unsigned char c) {
  108. return CharInfo[c] & CHAR_HORZ_WS;
  109. }
  110. /// isWhitespace - Return true if this character is horizontal or vertical
  111. /// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false
  112. /// for '\0'.
  113. static inline bool isWhitespace(unsigned char c) {
  114. return CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS);
  115. }
  116. /// isNumberBody - Return true if this is the body character of an
  117. /// preprocessing number, which is [a-zA-Z0-9_.].
  118. static inline bool isNumberBody(unsigned char c) {
  119. return CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD);
  120. }
  121. //===----------------------------------------------------------------------===//
  122. // Diagnostics forwarding code.
  123. //===----------------------------------------------------------------------===//
/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.  Loc must point into this lexer's buffer
/// (BufferEnd itself, i.e. the NUL sentinel, is allowed).
SourceLocation Lexer::getSourceLocation(const char *Loc) const {
  assert(Loc >= InputFile->getBufferStart() && Loc <= BufferEnd &&
         "Location out of range for this buffer!");
  // Locations are encoded as (file id, byte offset from the buffer start).
  return SourceLocation(CurFileID, Loc-InputFile->getBufferStart());
}
/// Diag - Forwarding function for diagnostics.  This translate a source
/// position in the current buffer into a SourceLocation object for rendering.
void Lexer::Diag(const char *Loc, unsigned DiagID,
                 const std::string &Msg) const {
  // In raw mode, suppress notes/warnings/extensions; hard errors still get
  // reported through the preprocessor.
  if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID))
    return;
  PP.Diag(getSourceLocation(Loc), DiagID, Msg);
}
/// Diag - Overload taking an already-formed SourceLocation.  Applies the same
/// raw-mode filtering as the pointer-based overload above.
void Lexer::Diag(SourceLocation Loc, unsigned DiagID,
                 const std::string &Msg) const {
  // In raw mode, suppress notes/warnings/extensions.
  if (LexingRawMode && Diagnostic::isNoteWarningOrExtension(DiagID))
    return;
  PP.Diag(Loc, DiagID, Msg);
}
  145. //===----------------------------------------------------------------------===//
  146. // Trigraph and Escaped Newline Handling Code.
  147. //===----------------------------------------------------------------------===//
  148. /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
  149. /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
  150. static char GetTrigraphCharForLetter(char Letter) {
  151. switch (Letter) {
  152. default: return 0;
  153. case '=': return '#';
  154. case ')': return ']';
  155. case '(': return '[';
  156. case '!': return '|';
  157. case '\'': return '^';
  158. case '>': return '}';
  159. case '/': return '\\';
  160. case '<': return '{';
  161. case '-': return '~';
  162. }
  163. }
  164. /// DecodeTrigraphChar - If the specified character is a legal trigraph when
  165. /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
  166. /// return the result character. Finally, emit a warning about trigraph use
  167. /// whether trigraphs are enabled or not.
  168. static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  169. char Res = GetTrigraphCharForLetter(*CP);
  170. if (Res && L) {
  171. if (!L->getFeatures().Trigraphs) {
  172. L->Diag(CP-2, diag::trigraph_ignored);
  173. return 0;
  174. } else {
  175. L->Diag(CP-2, diag::trigraph_converted, std::string()+Res);
  176. }
  177. }
  178. return Res;
  179. }
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               LexerToken *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
          // Remember that this token needs to be cleaned.
          if (Tok) Tok->setFlag(LexerToken::NeedsCleaning);

          // Warn if there was whitespace between the backslash and newline.
          // (Only diagnosed when a token is being formed, i.e. Tok != null.)
          if (SizeTmp != 1 && Tok)
            Diag(Ptr, diag::backslash_newline_space);

          // If this is a \r\n or \n\r, skip the newlines.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;
          // Use slow version to accumulate a correct size field, since the
          // character after the splice may itself be a trigraph/escape.
          return getCharAndSizeSlow(Ptr, Size, Tok);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(LexerToken::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\\', which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// This is the diagnostic-free twin of getCharAndSizeSlow: it performs the
/// same escaped-newline and trigraph decoding but emits no warnings and sets
/// no token flags.  It is static, so the trigraph setting comes from the
/// explicitly-passed Features rather than from a Lexer instance.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &Features) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
          // If this is a \r\n or \n\r, skip the newlines.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;
          // Use slow version to accumulate a correct size field.
          return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\\', which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
  300. //===----------------------------------------------------------------------===//
  301. // Helper methods for lexing.
  302. //===----------------------------------------------------------------------===//
/// LexIdentifier - Lex the remainder of an identifier.  On entry, CurPtr
/// points just past the first character, which has already been matched as
/// [_A-Za-z$].  Unless lexing in raw mode, the finished identifier is handed
/// to the preprocessor, which may macro-expand it.
void Lexer::LexIdentifier(LexerToken &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C)) {
    C = *CurPtr++;
  }
  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr);
    Result.setKind(tok::identifier);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode) return;

    // Fill in Result.IdentifierInfo, looking up the identifier in the
    // identifier table.
    PP.LookUpIdentifierInfo(Result, IdStart);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    return PP.HandleIdentifier(Result);
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path that decodes
  // escaped newlines and trigraphs via getCharAndSize.
  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume a maximal run of plain identifier-body characters before
    // re-checking for $ / end-of-identifier.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}
/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.  Produces a tok::numeric_constant preprocessing number; actual
/// numeric validation happens later.
void Lexer::LexNumericConstant(LexerToken &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Consume the maximal run of pp-number body characters [a-zA-Z0-9_.].
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // If we have a hex FP constant (e.g. 0x1p+4), continue.
  if (Features.HexFloats &&
      (C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  Result.setKind(tok::numeric_constant);

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr);
}
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L".  On an unterminated literal (newline or EOF before the
/// closing quote) a tok::unknown token is produced instead.
void Lexer::LexStringLiteral(LexerToken &Result, const char *CurPtr, bool Wide){
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_string);
      Result.setKind(tok::unknown);
      FormTokenWithChars(Result, CurPtr-1);
      return;
    } else if (C == 0) {
      // Embedded NUL (not the end-of-buffer sentinel); remember it so we can
      // warn after the literal is complete.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter) Diag(NulCharacter, diag::null_in_string);

  Result.setKind(Wide ? tok::wide_string_literal : tok::string_literal);

  // Update the location of the token as well as the BufferPtr instance var.
  FormTokenWithChars(Result, CurPtr);
}
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
/// On an unterminated literal a tok::unknown token is produced instead.
void Lexer::LexAngledStringLiteral(LexerToken &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_string);
      Result.setKind(tok::unknown);
      FormTokenWithChars(Result, CurPtr-1);
      return;
    } else if (C == 0) {
      // Embedded NUL; remember it so we can warn after the literal completes.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter) Diag(NulCharacter, diag::null_in_string);

  Result.setKind(tok::angle_string_literal);

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr);
}
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L'.  Empty ('') and unterminated constants produce a
/// tok::unknown token with a diagnostic (unless in raw mode).
void Lexer::LexCharConstant(LexerToken &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this character contain the \0 character?

  // Handle the common case of 'x' and '\y' efficiently.
  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!LexingRawMode) Diag(BufferPtr, diag::err_empty_character);
    Result.setKind(tok::unknown);
    FormTokenWithChars(Result, CurPtr);
    return;
  } else if (C == '\\') {
    // Skip the escaped character.
    // FIXME: UCN's.
    C = getAndAdvanceChar(CurPtr, Result);
  }

  if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
    // Fast path: single (possibly escaped) character followed by the close
    // quote.
    ++CurPtr;
  } else {
    // Fall back on generic code for embedded nulls, newlines, wide chars.
    do {
      // Skip escaped characters.
      if (C == '\\') {
        // Skip the escaped character.
        C = getAndAdvanceChar(CurPtr, Result);
      } else if (C == '\n' || C == '\r' ||             // Newline.
                 (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
        if (!LexingRawMode) Diag(BufferPtr, diag::err_unterminated_char);
        Result.setKind(tok::unknown);
        FormTokenWithChars(Result, CurPtr-1);
        return;
      } else if (C == 0) {
        // Embedded NUL; remember it so we can warn below.
        NulCharacter = CurPtr-1;
      }
      C = getAndAdvanceChar(CurPtr, Result);
    } while (C != '\'');
  }

  if (NulCharacter) Diag(NulCharacter, diag::null_in_char);

  Result.setKind(tok::char_constant);

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr);
}
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
/// Also sets/clears the StartOfLine and LeadingSpace flags on Result as
/// newlines and spaces are crossed.
void Lexer::SkipWhitespace(LexerToken &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we something other than whitespace, we're done.
    if (Char != '\n' && Char != '\r')
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return;
    }

    // ok, but handle newline.
    // The returned token is at the start of the line.
    Result.setFlag(LexerToken::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(LexerToken::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(LexerToken::LeadingSpace);

  // If the next token is obviously a // or /* */ comment, skip it efficiently
  // too (without going through the big switch stmt).
  if (Char == '/' && CurPtr[1] == '/' && !KeepCommentMode) {
    BufferPtr = CurPtr;
    SkipBCPLComment(Result, CurPtr+1);
    return;
  }
  if (Char == '/' && CurPtr[1] == '*' && !KeepCommentMode) {
    BufferPtr = CurPtr;
    SkipBlockComment(Result, CurPtr+2);
    return;
  }
  BufferPtr = CurPtr;
}
/// SkipBCPLComment - We have just read the // characters from input.  Skip
/// until we find the newline character that terminates the comment.  Then
/// update BufferPtr and return.  The newline itself is consumed unless we are
/// inside a preprocessor directive (where it must be seen as an EOM token).
bool Lexer::SkipBCPLComment(LexerToken &Result, const char *CurPtr) {
  // If BCPL comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!Features.BCPLComment) {
    Diag(BufferPtr, diag::ext_bcpl_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.  (Note this mutates the Features member for the rest of the file.)
    Features.BCPLComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  char C;
  do {
    C = *CurPtr;
    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
    // If we find a \n character, scan backwards, checking to see if it's an
    // escaped newline, like we do for block comments.

    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\\' &&             // Potentially escaped newline.
           C != '?' &&              // Potentially trigraph.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    // If this is a newline, we're done.
    if (C == '\n' || C == '\r')
      break;  // Found the newline?  Break out!

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.
    const char *OldPtr = CurPtr;
    C = getAndAdvanceChar(CurPtr, Result);

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isspace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isspace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
          break;
        }
    }

    // If we advanced past the NUL sentinel at BufferEnd, back up and stop.
    if (CurPtr == BufferEnd+1) { --CurPtr; break; }
  } while (C != '\n' && C != '\r');

  // Found but did not consume the newline.

  // If we are returning comments as tokens, return this comment as a token.
  if (KeepCommentMode)
    return SaveBCPLComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOM token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return true;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(LexerToken::StartOfLine);
  // No leading whitespace seen so far.
  Result.clearFlag(LexerToken::LeadingSpace);

  // It is common for the tokens immediately after a // comment to be
  // whitespace (indentation for the next line).  Instead of going through the
  // big switch, handle it efficiently now.
  if (isWhitespace(*CurPtr)) {
    Result.setFlag(LexerToken::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return true;
  }

  BufferPtr = CurPtr;
  return true;
}
/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
/// an appropriate way and return it.
bool Lexer::SaveBCPLComment(LexerToken &Result, const char *CurPtr) {
  Result.setKind(tok::comment);
  FormTokenWithChars(Result, CurPtr);

  // If this BCPL-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  if (ParsingPreprocessorDirective) {
    std::string Spelling = PP.getSpelling(Result);
    assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
    Spelling[1] = '*';   // Change prefix to "/*".
    Spelling += "*/";    // add suffix.

    // Point the token at a scratch-buffer copy of the rewritten spelling so
    // later phases see a well-formed block comment.
    Result.setLocation(PP.CreateString(&Spelling[0], Spelling.size(),
                                       Result.getLocation()));
    Result.setLength(Spelling.size());
  }
  // NOTE(review): the false return appears to tell the caller a comment token
  // was produced (contrast with SkipBCPLComment's true returns) -- confirm
  // against the call sites.
  return false;
}
/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
/// newline character (either \n or \r) is part of an escaped newline
/// sequence that hides the '/' of a "*/" comment terminator, issuing
/// diagnostics as needed.  CurPtr points at the newline; we know that it is
/// inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the backslash and the newline.  Nulls are skipped as well;
  // NOTE(review): presumably these come from the lexer's own processing of
  // the buffer -- confirm where nulls can legitimately appear here.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a backslash, this is an escaped newline; it only ends the
  // comment if the character before the backslash is a '*'.
  if (*CurPtr == '\\') {
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a backslash; is it the ??/ trigraph (which spells '\')?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    // Point CurPtr at the first '?' of the trigraph for the diagnostics.
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace) L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}
  662. #ifdef __SSE2__
  663. #include <emmintrin.h>
  664. #elif __ALTIVEC__
  665. #include <altivec.h>
  666. #undef bool
  667. #endif
/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.  Returns true when the comment was consumed and the caller
/// should keep lexing, false when the comment itself is returned as a token
/// (KeepCommentMode).
bool Lexer::SkipBlockComment(LexerToken &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceeded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // Read the first character after the "/*".  A null here may be the
  // end-of-buffer sentinel, i.e. an unterminated comment.
  unsigned char C = *CurPtr++;
  if (C == 0 && CurPtr == BufferEnd+1) {
    Diag(BufferPtr, diag::err_unterminated_block_comment);
    BufferPtr = CurPtr-1;
    return true;
  }

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary, scan one byte at a time so
      // the vector loop below can use aligned loads.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against a vector full of '/' characters
      // and stop at the first chunk containing one.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly, four bytes per iteration.  Many block comments
      // are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, one byte at a time, until a '/' or the
    // null sentinel at the end of the buffer.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        // The '/' is directly preceded by a newline: the terminator may have
        // an escaped newline between the '*' and '/'.
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        Diag(CurPtr-1, diag::nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      // Hit the end-of-buffer sentinel without finding the terminator.
      Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      BufferPtr = CurPtr-1;
      return true;
    }
    C = *CurPtr++;
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (KeepCommentMode) {
    Result.setKind(tok::comment);
    FormTokenWithChars(Result, CurPtr);
    return false;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(LexerToken::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return true;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(LexerToken::LeadingSpace);
  return true;
}
  769. //===----------------------------------------------------------------------===//
  770. // Primary Lexing Entry Points
  771. //===----------------------------------------------------------------------===//
  772. /// LexIncludeFilename - After the preprocessor has parsed a #include, lex and
  773. /// (potentially) macro expand the filename.
  774. void Lexer::LexIncludeFilename(LexerToken &FilenameTok) {
  775. assert(ParsingPreprocessorDirective &&
  776. ParsingFilename == false &&
  777. "Must be in a preprocessing directive!");
  778. // We are now parsing a filename!
  779. ParsingFilename = true;
  780. // Lex the filename.
  781. Lex(FilenameTok);
  782. // We should have obtained the filename now.
  783. ParsingFilename = false;
  784. // No filename?
  785. if (FilenameTok.getKind() == tok::eom)
  786. Diag(FilenameTok.getLocation(), diag::err_pp_expects_filename);
  787. }
  788. /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
  789. /// uninterpreted string. This switches the lexer out of directive mode.
  790. std::string Lexer::ReadToEndOfLine() {
  791. assert(ParsingPreprocessorDirective && ParsingFilename == false &&
  792. "Must be in a preprocessing directive!");
  793. std::string Result;
  794. LexerToken Tmp;
  795. // CurPtr - Cache BufferPtr in an automatic variable.
  796. const char *CurPtr = BufferPtr;
  797. while (1) {
  798. char Char = getAndAdvanceChar(CurPtr, Tmp);
  799. switch (Char) {
  800. default:
  801. Result += Char;
  802. break;
  803. case 0: // Null.
  804. // Found end of file?
  805. if (CurPtr-1 != BufferEnd) {
  806. // Nope, normal character, continue.
  807. Result += Char;
  808. break;
  809. }
  810. // FALL THROUGH.
  811. case '\r':
  812. case '\n':
  813. // Okay, we found the end of the line. First, back up past the \0, \r, \n.
  814. assert(CurPtr[-1] == Char && "Trigraphs for newline?");
  815. BufferPtr = CurPtr-1;
  816. // Next, lex the character, which should handle the EOM transition.
  817. Lex(Tmp);
  818. assert(Tmp.getKind() == tok::eom && "Unexpected token!");
  819. // Finally, we're done, return the string we found.
  820. return Result;
  821. }
  822. }
  823. }
/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(LexerToken &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line": synthesize the end-of-macro token.
    ParsingPreprocessorDirective = false;
    Result.setKind(tok::eom);
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr);

    // Restore comment saving mode, in case it was disabled for directive.
    KeepCommentMode = PP.getCommentRetentionState();
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (LexingRawMode) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd);
    Result.setKind(tok::eof);
    return true;
  }

  // Otherwise, issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error for each conditional still
  // open on the stack, draining the stack as we go.
  while (!ConditionalStack.empty()) {
    Diag(ConditionalStack.back().IfLoc, diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // If the file didn't end in a newline, issue a pedwarn.
  // NOTE(review): for a completely empty buffer, CurPtr[-1] reads before the
  // start of the buffer -- confirm the buffer is guaranteed non-empty (or has
  // a leading sentinel) before this is reached.
  if (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    Diag(BufferEnd, diag::ext_no_newline_eof);

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.  It may delete this lexer.
  return PP.HandleEndOfFile(Result);
}
  864. /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
  865. /// the specified lexer will return a tok::l_paren token, 0 if it is something
  866. /// else and 2 if there are no more tokens in the buffer controlled by the
  867. /// lexer.
  868. unsigned Lexer::isNextPPTokenLParen() {
  869. assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
  870. // Switch to 'skipping' mode. This will ensure that we can lex a token
  871. // without emitting diagnostics, disables macro expansion, and will cause EOF
  872. // to return an EOF token instead of popping the include stack.
  873. LexingRawMode = true;
  874. // Save state that can be changed while lexing so that we can restore it.
  875. const char *TmpBufferPtr = BufferPtr;
  876. LexerToken Tok;
  877. Tok.startToken();
  878. LexTokenInternal(Tok);
  879. // Restore state that may have changed.
  880. BufferPtr = TmpBufferPtr;
  881. // Restore the lexer back to non-skipping mode.
  882. LexingRawMode = false;
  883. if (Tok.getKind() == tok::eof)
  884. return 2;
  885. return Tok.getKind() == tok::l_paren;
  886. }
  887. /// LexTokenInternal - This implements a simple C family lexer. It is an
  888. /// extremely performance critical piece of code. This assumes that the buffer
  889. /// has a null character at the end of the file. Return true if an error
  890. /// occurred and compilation should terminate, false if normal. This returns a
  891. /// preprocessing token, not a normal token, as such, it is an internal
  892. /// interface. It assumes that the Flags of result have been cleared before
  893. /// calling this.
  894. void Lexer::LexTokenInternal(LexerToken &Result) {
  895. LexNextToken:
  896. // New token, can't need cleaning yet.
  897. Result.clearFlag(LexerToken::NeedsCleaning);
  898. Result.setIdentifierInfo(0);
  899. // CurPtr - Cache BufferPtr in an automatic variable.
  900. const char *CurPtr = BufferPtr;
  901. // Small amounts of horizontal whitespace is very common between tokens.
  902. if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
  903. ++CurPtr;
  904. while ((*CurPtr == ' ') || (*CurPtr == '\t'))
  905. ++CurPtr;
  906. BufferPtr = CurPtr;
  907. Result.setFlag(LexerToken::LeadingSpace);
  908. }
  909. unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
  910. // Read a character, advancing over it.
  911. char Char = getAndAdvanceChar(CurPtr, Result);
  912. switch (Char) {
  913. case 0: // Null.
  914. // Found end of file?
  915. if (CurPtr-1 == BufferEnd) {
  916. // Read the PP instance variable into an automatic variable, because
  917. // LexEndOfFile will often delete 'this'.
  918. Preprocessor &PPCache = PP;
  919. if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file.
  920. return; // Got a token to return.
  921. return PPCache.Lex(Result);
  922. }
  923. Diag(CurPtr-1, diag::null_in_file);
  924. Result.setFlag(LexerToken::LeadingSpace);
  925. SkipWhitespace(Result, CurPtr);
  926. goto LexNextToken; // GCC isn't tail call eliminating.
  927. case '\n':
  928. case '\r':
  929. // If we are inside a preprocessor directive and we see the end of line,
  930. // we know we are done with the directive, so return an EOM token.
  931. if (ParsingPreprocessorDirective) {
  932. // Done parsing the "line".
  933. ParsingPreprocessorDirective = false;
  934. // Restore comment saving mode, in case it was disabled for directive.
  935. KeepCommentMode = PP.getCommentRetentionState();
  936. // Since we consumed a newline, we are back at the start of a line.
  937. IsAtStartOfLine = true;
  938. Result.setKind(tok::eom);
  939. break;
  940. }
  941. // The returned token is at the start of the line.
  942. Result.setFlag(LexerToken::StartOfLine);
  943. // No leading whitespace seen so far.
  944. Result.clearFlag(LexerToken::LeadingSpace);
  945. SkipWhitespace(Result, CurPtr);
  946. goto LexNextToken; // GCC isn't tail call eliminating.
  947. case ' ':
  948. case '\t':
  949. case '\f':
  950. case '\v':
  951. Result.setFlag(LexerToken::LeadingSpace);
  952. SkipWhitespace(Result, CurPtr);
  953. goto LexNextToken; // GCC isn't tail call eliminating.
  954. case 'L':
  955. // Notify MIOpt that we read a non-whitespace/non-comment token.
  956. MIOpt.ReadToken();
  957. Char = getCharAndSize(CurPtr, SizeTmp);
  958. // Wide string literal.
  959. if (Char == '"')
  960. return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  961. true);
  962. // Wide character constant.
  963. if (Char == '\'')
  964. return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
  965. // FALL THROUGH, treating L like the start of an identifier.
  966. // C99 6.4.2: Identifiers.
  967. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  968. case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
  969. case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  970. case 'V': case 'W': case 'X': case 'Y': case 'Z':
  971. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  972. case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  973. case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  974. case 'v': case 'w': case 'x': case 'y': case 'z':
  975. case '_':
  976. // Notify MIOpt that we read a non-whitespace/non-comment token.
  977. MIOpt.ReadToken();
  978. return LexIdentifier(Result, CurPtr);
  979. // C99 6.4.4.1: Integer Constants.
  980. // C99 6.4.4.2: Floating Constants.
  981. case '0': case '1': case '2': case '3': case '4':
  982. case '5': case '6': case '7': case '8': case '9':
  983. // Notify MIOpt that we read a non-whitespace/non-comment token.
  984. MIOpt.ReadToken();
  985. return LexNumericConstant(Result, CurPtr);
  986. // C99 6.4.4: Character Constants.
  987. case '\'':
  988. // Notify MIOpt that we read a non-whitespace/non-comment token.
  989. MIOpt.ReadToken();
  990. return LexCharConstant(Result, CurPtr);
  991. // C99 6.4.5: String Literals.
  992. case '"':
  993. // Notify MIOpt that we read a non-whitespace/non-comment token.
  994. MIOpt.ReadToken();
  995. return LexStringLiteral(Result, CurPtr, false);
  996. // C99 6.4.6: Punctuators.
  997. case '?':
  998. Result.setKind(tok::question);
  999. break;
  1000. case '[':
  1001. Result.setKind(tok::l_square);
  1002. break;
  1003. case ']':
  1004. Result.setKind(tok::r_square);
  1005. break;
  1006. case '(':
  1007. Result.setKind(tok::l_paren);
  1008. break;
  1009. case ')':
  1010. Result.setKind(tok::r_paren);
  1011. break;
  1012. case '{':
  1013. Result.setKind(tok::l_brace);
  1014. break;
  1015. case '}':
  1016. Result.setKind(tok::r_brace);
  1017. break;
  1018. case '.':
  1019. Char = getCharAndSize(CurPtr, SizeTmp);
  1020. if (Char >= '0' && Char <= '9') {
  1021. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1022. MIOpt.ReadToken();
  1023. return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
  1024. } else if (Features.CPlusPlus && Char == '*') {
  1025. Result.setKind(tok::periodstar);
  1026. CurPtr += SizeTmp;
  1027. } else if (Char == '.' &&
  1028. getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
  1029. Result.setKind(tok::ellipsis);
  1030. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  1031. SizeTmp2, Result);
  1032. } else {
  1033. Result.setKind(tok::period);
  1034. }
  1035. break;
  1036. case '&':
  1037. Char = getCharAndSize(CurPtr, SizeTmp);
  1038. if (Char == '&') {
  1039. Result.setKind(tok::ampamp);
  1040. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1041. } else if (Char == '=') {
  1042. Result.setKind(tok::ampequal);
  1043. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1044. } else {
  1045. Result.setKind(tok::amp);
  1046. }
  1047. break;
  1048. case '*':
  1049. if (getCharAndSize(CurPtr, SizeTmp) == '=') {
  1050. Result.setKind(tok::starequal);
  1051. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1052. } else {
  1053. Result.setKind(tok::star);
  1054. }
  1055. break;
  1056. case '+':
  1057. Char = getCharAndSize(CurPtr, SizeTmp);
  1058. if (Char == '+') {
  1059. Result.setKind(tok::plusplus);
  1060. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1061. } else if (Char == '=') {
  1062. Result.setKind(tok::plusequal);
  1063. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1064. } else {
  1065. Result.setKind(tok::plus);
  1066. }
  1067. break;
  1068. case '-':
  1069. Char = getCharAndSize(CurPtr, SizeTmp);
  1070. if (Char == '-') {
  1071. Result.setKind(tok::minusminus);
  1072. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1073. } else if (Char == '>' && Features.CPlusPlus &&
  1074. getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {
  1075. Result.setKind(tok::arrowstar); // C++ ->*
  1076. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  1077. SizeTmp2, Result);
  1078. } else if (Char == '>') {
  1079. Result.setKind(tok::arrow);
  1080. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1081. } else if (Char == '=') {
  1082. Result.setKind(tok::minusequal);
  1083. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1084. } else {
  1085. Result.setKind(tok::minus);
  1086. }
  1087. break;
  1088. case '~':
  1089. Result.setKind(tok::tilde);
  1090. break;
  1091. case '!':
  1092. if (getCharAndSize(CurPtr, SizeTmp) == '=') {
  1093. Result.setKind(tok::exclaimequal);
  1094. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1095. } else {
  1096. Result.setKind(tok::exclaim);
  1097. }
  1098. break;
  1099. case '/':
  1100. // 6.4.9: Comments
  1101. Char = getCharAndSize(CurPtr, SizeTmp);
  1102. if (Char == '/') { // BCPL comment.
  1103. if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
  1104. goto LexNextToken; // GCC isn't tail call eliminating.
  1105. return; // KeepCommentMode
  1106. } else if (Char == '*') { // /**/ comment.
  1107. if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
  1108. goto LexNextToken; // GCC isn't tail call eliminating.
  1109. return; // KeepCommentMode
  1110. } else if (Char == '=') {
  1111. Result.setKind(tok::slashequal);
  1112. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1113. } else {
  1114. Result.setKind(tok::slash);
  1115. }
  1116. break;
  1117. case '%':
  1118. Char = getCharAndSize(CurPtr, SizeTmp);
  1119. if (Char == '=') {
  1120. Result.setKind(tok::percentequal);
  1121. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1122. } else if (Features.Digraphs && Char == '>') {
  1123. Result.setKind(tok::r_brace); // '%>' -> '}'
  1124. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1125. } else if (Features.Digraphs && Char == ':') {
  1126. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1127. Char = getCharAndSize(CurPtr, SizeTmp);
  1128. if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
  1129. Result.setKind(tok::hashhash); // '%:%:' -> '##'
  1130. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  1131. SizeTmp2, Result);
  1132. } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize
  1133. Result.setKind(tok::hashat);
  1134. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1135. Diag(BufferPtr, diag::charize_microsoft_ext);
  1136. } else {
  1137. Result.setKind(tok::hash); // '%:' -> '#'
  1138. // We parsed a # character. If this occurs at the start of the line,
  1139. // it's actually the start of a preprocessing directive. Callback to
  1140. // the preprocessor to handle it.
  1141. // FIXME: -fpreprocessed mode??
  1142. if (Result.isAtStartOfLine() && !LexingRawMode) {
  1143. BufferPtr = CurPtr;
  1144. PP.HandleDirective(Result);
  1145. // As an optimization, if the preprocessor didn't switch lexers, tail
  1146. // recurse.
  1147. if (PP.isCurrentLexer(this)) {
  1148. // Start a new token. If this is a #include or something, the PP may
  1149. // want us starting at the beginning of the line again. If so, set
  1150. // the StartOfLine flag.
  1151. if (IsAtStartOfLine) {
  1152. Result.setFlag(LexerToken::StartOfLine);
  1153. IsAtStartOfLine = false;
  1154. }
  1155. goto LexNextToken; // GCC isn't tail call eliminating.
  1156. }
  1157. return PP.Lex(Result);
  1158. }
  1159. }
  1160. } else {
  1161. Result.setKind(tok::percent);
  1162. }
  1163. break;
  1164. case '<':
  1165. Char = getCharAndSize(CurPtr, SizeTmp);
  1166. if (ParsingFilename) {
  1167. return LexAngledStringLiteral(Result, CurPtr+SizeTmp);
  1168. } else if (Char == '<' &&
  1169. getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
  1170. Result.setKind(tok::lesslessequal);
  1171. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  1172. SizeTmp2, Result);
  1173. } else if (Char == '<') {
  1174. Result.setKind(tok::lessless);
  1175. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1176. } else if (Char == '=') {
  1177. Result.setKind(tok::lessequal);
  1178. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1179. } else if (Features.Digraphs && Char == ':') {
  1180. Result.setKind(tok::l_square); // '<:' -> '['
  1181. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1182. } else if (Features.Digraphs && Char == '>') {
  1183. Result.setKind(tok::l_brace); // '<%' -> '{'
  1184. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1185. } else {
  1186. Result.setKind(tok::less);
  1187. }
  1188. break;
  1189. case '>':
  1190. Char = getCharAndSize(CurPtr, SizeTmp);
  1191. if (Char == '=') {
  1192. Result.setKind(tok::greaterequal);
  1193. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1194. } else if (Char == '>' &&
  1195. getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
  1196. Result.setKind(tok::greatergreaterequal);
  1197. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  1198. SizeTmp2, Result);
  1199. } else if (Char == '>') {
  1200. Result.setKind(tok::greatergreater);
  1201. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1202. } else {
  1203. Result.setKind(tok::greater);
  1204. }
  1205. break;
  1206. case '^':
  1207. Char = getCharAndSize(CurPtr, SizeTmp);
  1208. if (Char == '=') {
  1209. Result.setKind(tok::caretequal);
  1210. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1211. } else {
  1212. Result.setKind(tok::caret);
  1213. }
  1214. break;
  1215. case '|':
  1216. Char = getCharAndSize(CurPtr, SizeTmp);
  1217. if (Char == '=') {
  1218. Result.setKind(tok::pipeequal);
  1219. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1220. } else if (Char == '|') {
  1221. Result.setKind(tok::pipepipe);
  1222. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1223. } else {
  1224. Result.setKind(tok::pipe);
  1225. }
  1226. break;
  1227. case ':':
  1228. Char = getCharAndSize(CurPtr, SizeTmp);
  1229. if (Features.Digraphs && Char == '>') {
  1230. Result.setKind(tok::r_square); // ':>' -> ']'
  1231. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1232. } else if (Features.CPlusPlus && Char == ':') {
  1233. Result.setKind(tok::coloncolon);
  1234. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1235. } else {
  1236. Result.setKind(tok::colon);
  1237. }
  1238. break;
  1239. case ';':
  1240. Result.setKind(tok::semi);
  1241. break;
  1242. case '=':
  1243. Char = getCharAndSize(CurPtr, SizeTmp);
  1244. if (Char == '=') {
  1245. Result.setKind(tok::equalequal);
  1246. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1247. } else {
  1248. Result.setKind(tok::equal);
  1249. }
  1250. break;
  1251. case ',':
  1252. Result.setKind(tok::comma);
  1253. break;
  1254. case '#':
  1255. Char = getCharAndSize(CurPtr, SizeTmp);
  1256. if (Char == '#') {
  1257. Result.setKind(tok::hashhash);
  1258. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1259. } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize
  1260. Result.setKind(tok::hashat);
  1261. Diag(BufferPtr, diag::charize_microsoft_ext);
  1262. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  1263. } else {
  1264. Result.setKind(tok::hash);
  1265. // We parsed a # character. If this occurs at the start of the line,
  1266. // it's actually the start of a preprocessing directive. Callback to
  1267. // the preprocessor to handle it.
  1268. // FIXME: -fpreprocessed mode??
  1269. if (Result.isAtStartOfLine() && !LexingRawMode) {
  1270. BufferPtr = CurPtr;
  1271. PP.HandleDirective(Result);
  1272. // As an optimization, if the preprocessor didn't switch lexers, tail
  1273. // recurse.
  1274. if (PP.isCurrentLexer(this)) {
  1275. // Start a new token. If this is a #include or something, the PP may
  1276. // want us starting at the beginning of the line again. If so, set
  1277. // the StartOfLine flag.
  1278. if (IsAtStartOfLine) {
  1279. Result.setFlag(LexerToken::StartOfLine);
  1280. IsAtStartOfLine = false;
  1281. }
  1282. goto LexNextToken; // GCC isn't tail call eliminating.
  1283. }
  1284. return PP.Lex(Result);
  1285. }
  1286. }
  1287. break;
  1288. case '\\':
  1289. // FIXME: UCN's.
  1290. // FALL THROUGH.
  1291. default:
  1292. // Objective C support.
  1293. if (CurPtr[-1] == '@' && Features.ObjC1) {
  1294. Result.setKind(tok::at);
  1295. break;
  1296. } else if (CurPtr[-1] == '$' && Features.DollarIdents) {// $ in identifiers.
  1297. Diag(CurPtr-1, diag::ext_dollar_in_identifier);
  1298. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1299. MIOpt.ReadToken();
  1300. return LexIdentifier(Result, CurPtr);
  1301. }
  1302. Result.setKind(tok::unknown);
  1303. break;
  1304. }
  1305. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1306. MIOpt.ReadToken();
  1307. // Update the location of token as well as BufferPtr.
  1308. FormTokenWithChars(Result, CurPtr);
  1309. }