Lexer.cpp 62 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752
  1. //===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements the Lexer and Token interfaces.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. //
  14. // TODO: GCC Diagnostics emitted by the lexer:
  15. // PEDWARN: (form feed|vertical tab) in preprocessing directive
  16. //
  17. // Universal characters, unicode, char mapping:
  18. // WARNING: `%.*s' is not in NFKC
  19. // WARNING: `%.*s' is not in NFC
  20. //
  21. // Other:
  22. // TODO: Options to support:
  23. // -fexec-charset,-fwide-exec-charset
  24. //
  25. //===----------------------------------------------------------------------===//
  26. #include "clang/Lex/Lexer.h"
  27. #include "clang/Lex/Preprocessor.h"
  28. #include "clang/Basic/Diagnostic.h"
  29. #include "clang/Basic/SourceManager.h"
  30. #include "llvm/Support/Compiler.h"
  31. #include "llvm/Support/MemoryBuffer.h"
  32. #include <cctype>
  33. using namespace clang;
  34. static void InitCharacterInfo();
  35. //===----------------------------------------------------------------------===//
  36. // Token Class Implementation
  37. //===----------------------------------------------------------------------===//
  38. /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
  39. bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  40. if (IdentifierInfo *II = getIdentifierInfo())
  41. return II->getObjCKeywordID() == objcKey;
  42. return false;
  43. }
  44. /// getObjCKeywordID - Return the ObjC keyword kind.
  45. tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  46. IdentifierInfo *specId = getIdentifierInfo();
  47. return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
  48. }
  49. //===----------------------------------------------------------------------===//
  50. // Lexer Class Implementation
  51. //===----------------------------------------------------------------------===//
  52. void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
  53. const char *BufEnd) {
  54. InitCharacterInfo();
  55. BufferStart = BufStart;
  56. BufferPtr = BufPtr;
  57. BufferEnd = BufEnd;
  58. assert(BufEnd[0] == 0 &&
  59. "We assume that the input buffer has a null character at the end"
  60. " to simplify lexing!");
  61. Is_PragmaLexer = false;
  62. // Start of the file is a start of line.
  63. IsAtStartOfLine = true;
  64. // We are not after parsing a #.
  65. ParsingPreprocessorDirective = false;
  66. // We are not after parsing #include.
  67. ParsingFilename = false;
  68. // We are not in raw mode. Raw mode disables diagnostics and interpretation
  69. // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
  70. // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  71. // or otherwise skipping over tokens.
  72. LexingRawMode = false;
  73. // Default to not keeping comments.
  74. ExtendedTokenMode = 0;
  75. }
  76. /// Lexer constructor - Create a new lexer object for the specified buffer
  77. /// with the specified preprocessor managing the lexing process. This lexer
  78. /// assumes that the associated file buffer and Preprocessor objects will
  79. /// outlive it, so it doesn't take ownership of either of them.
  80. Lexer::Lexer(FileID FID, Preprocessor &PP)
  81. : PreprocessorLexer(&PP, FID),
  82. FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
  83. Features(PP.getLangOptions()) {
  84. const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID);
  85. InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
  86. InputFile->getBufferEnd());
  87. // Default to keeping comments if the preprocessor wants them.
  88. SetCommentRetentionState(PP.getCommentRetentionState());
  89. }
  90. /// Lexer constructor - Create a new raw lexer object. This object is only
  91. /// suitable for calls to 'LexRawToken'. This lexer assumes that the text
  92. /// range will outlive it, so it doesn't take ownership of it.
  93. Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
  94. const char *BufStart, const char *BufPtr, const char *BufEnd)
  95. : FileLoc(fileloc), Features(features) {
  96. InitLexer(BufStart, BufPtr, BufEnd);
  97. // We *are* in raw mode.
  98. LexingRawMode = true;
  99. }
  100. /// Lexer constructor - Create a new raw lexer object. This object is only
  101. /// suitable for calls to 'LexRawToken'. This lexer assumes that the text
  102. /// range will outlive it, so it doesn't take ownership of it.
  103. Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features)
  104. : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
  105. const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID);
  106. InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
  107. FromFile->getBufferEnd());
  108. // We *are* in raw mode.
  109. LexingRawMode = true;
  110. }
  111. /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
  112. /// _Pragma expansion. This has a variety of magic semantics that this method
  113. /// sets up. It returns a new'd Lexer that must be delete'd when done.
  114. ///
  115. /// On entrance to this routine, TokStartLoc is a macro location which has a
  116. /// spelling loc that indicates the bytes to be lexed for the token and an
  117. /// instantiation location that indicates where all lexed tokens should be
  118. /// "expanded from".
  119. ///
  120. /// FIXME: It would really be nice to make _Pragma just be a wrapper around a
  121. /// normal lexer that remaps tokens as they fly by. This would require making
  122. /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
  123. /// interface that could handle this stuff. This would pull GetMappedTokenLoc
  124. /// out of the critical path of the lexer!
  125. ///
  126. Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
  127. SourceLocation InstantiationLoc,
  128. unsigned TokLen, Preprocessor &PP) {
  129. SourceManager &SM = PP.getSourceManager();
  130. // Create the lexer as if we were going to lex the file normally.
  131. FileID SpellingFID = SM.getFileID(SpellingLoc);
  132. Lexer *L = new Lexer(SpellingFID, PP);
  133. // Now that the lexer is created, change the start/end locations so that we
  134. // just lex the subsection of the file that we want. This is lexing from a
  135. // scratch buffer.
  136. const char *StrData = SM.getCharacterData(SpellingLoc);
  137. L->BufferPtr = StrData;
  138. L->BufferEnd = StrData+TokLen;
  139. // Set the SourceLocation with the remapping information. This ensures that
  140. // GetMappedTokenLoc will remap the tokens as they are lexed.
  141. L->FileLoc = SM.getInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
  142. InstantiationLoc);
  143. // Ensure that the lexer thinks it is inside a directive, so that end \n will
  144. // return an EOM token.
  145. L->ParsingPreprocessorDirective = true;
  146. // This lexer really is for _Pragma.
  147. L->Is_PragmaLexer = true;
  148. return L;
  149. }
  150. /// Stringify - Convert the specified string into a C string, with surrounding
  151. /// ""'s, and with escaped \ and " characters.
  152. std::string Lexer::Stringify(const std::string &Str, bool Charify) {
  153. std::string Result = Str;
  154. char Quote = Charify ? '\'' : '"';
  155. for (unsigned i = 0, e = Result.size(); i != e; ++i) {
  156. if (Result[i] == '\\' || Result[i] == Quote) {
  157. Result.insert(Result.begin()+i, '\\');
  158. ++i; ++e;
  159. }
  160. }
  161. return Result;
  162. }
  163. /// Stringify - Convert the specified string into a C string by escaping '\'
  164. /// and " characters. This does not add surrounding ""'s to the string.
  165. void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
  166. for (unsigned i = 0, e = Str.size(); i != e; ++i) {
  167. if (Str[i] == '\\' || Str[i] == '"') {
  168. Str.insert(Str.begin()+i, '\\');
  169. ++i; ++e;
  170. }
  171. }
  172. }
  173. /// MeasureTokenLength - Relex the token at the specified location and return
  174. /// its length in bytes in the input file. If the token needs cleaning (e.g.
  175. /// includes a trigraph or an escaped newline) then this count includes bytes
  176. /// that are part of that.
  177. unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
  178. const SourceManager &SM) {
  179. // If this comes from a macro expansion, we really do want the macro name, not
  180. // the token this macro expanded to.
  181. Loc = SM.getInstantiationLoc(Loc);
  182. // TODO: this could be special cased for common tokens like identifiers, ')',
  183. // etc to make this faster, if it mattered. Just look at StrData[0] to handle
  184. // all obviously single-char tokens. This could use
  185. // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  186. // something.
  187. std::pair<FileID, unsigned> LocInfo = SM.getDecomposedFileLoc(Loc);
  188. std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first);
  189. const char *StrData = Buffer.first+LocInfo.second;
  190. // Create a langops struct and enable trigraphs. This is sufficient for
  191. // measuring tokens.
  192. LangOptions LangOpts;
  193. LangOpts.Trigraphs = true;
  194. // Create a lexer starting at the beginning of this token.
  195. Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second);
  196. Token TheTok;
  197. TheLexer.LexFromRawLexer(TheTok);
  198. return TheTok.getLength();
  199. }
  200. //===----------------------------------------------------------------------===//
  201. // Character information.
  202. //===----------------------------------------------------------------------===//
  203. static unsigned char CharInfo[256];
  204. enum {
  205. CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0'
  206. CHAR_VERT_WS = 0x02, // '\r', '\n'
  207. CHAR_LETTER = 0x04, // a-z,A-Z
  208. CHAR_NUMBER = 0x08, // 0-9
  209. CHAR_UNDER = 0x10, // _
  210. CHAR_PERIOD = 0x20 // .
  211. };
  212. static void InitCharacterInfo() {
  213. static bool isInited = false;
  214. if (isInited) return;
  215. isInited = true;
  216. // Intiialize the CharInfo table.
  217. // TODO: statically initialize this.
  218. CharInfo[(int)' '] = CharInfo[(int)'\t'] =
  219. CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
  220. CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
  221. CharInfo[(int)'_'] = CHAR_UNDER;
  222. CharInfo[(int)'.'] = CHAR_PERIOD;
  223. for (unsigned i = 'a'; i <= 'z'; ++i)
  224. CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
  225. for (unsigned i = '0'; i <= '9'; ++i)
  226. CharInfo[i] = CHAR_NUMBER;
  227. }
  228. /// isIdentifierBody - Return true if this is the body character of an
  229. /// identifier, which is [a-zA-Z0-9_].
  230. static inline bool isIdentifierBody(unsigned char c) {
  231. return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
  232. }
  233. /// isHorizontalWhitespace - Return true if this character is horizontal
  234. /// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'.
  235. static inline bool isHorizontalWhitespace(unsigned char c) {
  236. return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
  237. }
  238. /// isWhitespace - Return true if this character is horizontal or vertical
  239. /// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false
  240. /// for '\0'.
  241. static inline bool isWhitespace(unsigned char c) {
  242. return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
  243. }
  244. /// isNumberBody - Return true if this is the body character of an
  245. /// preprocessing number, which is [a-zA-Z0-9_.].
  246. static inline bool isNumberBody(unsigned char c) {
  247. return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
  248. true : false;
  249. }
  250. //===----------------------------------------------------------------------===//
  251. // Diagnostics forwarding code.
  252. //===----------------------------------------------------------------------===//
/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all instantiated at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
///
/// \param PP      preprocessor whose SourceManager performs the remapping.
/// \param FileLoc the lexer's FileLoc (a non-FileID macro location set up by
///                Create_PragmaLexer).
/// \param CharNo  byte offset of the token within the lexer's buffer.
/// \returns a SourceLocation whose spelling is the token's bytes and whose
///          instantiation point is where the _Pragma appeared.
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo) DISABLE_INLINE;
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo) {
  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the instantiation location of FileLoc with the
  // spelling location.
  SourceManager &SourceMgr = PP.getSourceManager();

  // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation InstLoc = SourceMgr.getInstantiationLoc(FileLoc);
  SourceLocation SpellingLoc = SourceMgr.getSpellingLoc(FileLoc);
  // Advance the spelling location to the byte of the current token.
  SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
  return SourceMgr.getInstantiationLoc(SpellingLoc, InstLoc);
}
  274. /// getSourceLocation - Return a source location identifier for the specified
  275. /// offset in the current file.
  276. SourceLocation Lexer::getSourceLocation(const char *Loc) const {
  277. assert(Loc >= BufferStart && Loc <= BufferEnd &&
  278. "Location out of range for this buffer!");
  279. // In the normal case, we're just lexing from a simple file buffer, return
  280. // the file id from FileLoc with the offset specified.
  281. unsigned CharNo = Loc-BufferStart;
  282. if (FileLoc.isFileID())
  283. return FileLoc.getFileLocWithOffset(CharNo);
  284. // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  285. // tokens are lexed from where the _Pragma was defined.
  286. assert(PP && "This doesn't work on raw lexers");
  287. return GetMappedTokenLoc(*PP, FileLoc, CharNo);
  288. }
  289. /// Diag - Forwarding function for diagnostics. This translate a source
  290. /// position in the current buffer into a SourceLocation object for rendering.
  291. DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  292. return PP->Diag(getSourceLocation(Loc), DiagID);
  293. }
  294. //===----------------------------------------------------------------------===//
  295. // Trigraph and Escaped Newline Handling Code.
  296. //===----------------------------------------------------------------------===//
  297. /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
  298. /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
  299. static char GetTrigraphCharForLetter(char Letter) {
  300. switch (Letter) {
  301. default: return 0;
  302. case '=': return '#';
  303. case ')': return ']';
  304. case '(': return '[';
  305. case '!': return '|';
  306. case '\'': return '^';
  307. case '>': return '}';
  308. case '/': return '\\';
  309. case '<': return '{';
  310. case '-': return '~';
  311. }
  312. }
  313. /// DecodeTrigraphChar - If the specified character is a legal trigraph when
  314. /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
  315. /// return the result character. Finally, emit a warning about trigraph use
  316. /// whether trigraphs are enabled or not.
  317. static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  318. char Res = GetTrigraphCharForLetter(*CP);
  319. if (!Res || !L) return Res;
  320. if (!L->getFeatures().Trigraphs) {
  321. if (!L->isLexingRawMode())
  322. L->Diag(CP-2, diag::trigraph_ignored);
  323. return 0;
  324. }
  325. if (!L->isLexingRawMode())
  326. L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
  327. return Res;
  328. }
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
          // Remember that this token needs to be cleaned.
          if (Tok) Tok->setFlag(Token::NeedsCleaning);

          // Warn if there was whitespace between the backslash and newline.
          if (SizeTmp != 1 && Tok && !isLexingRawMode())
            Diag(Ptr, diag::backslash_newline_space);

          // If this is a \r\n or \n\r, skip the newlines.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;
          // Use slow version to accumulate a correct size field; the next
          // char could itself be a trigraph or another escaped newline.
          return getCharAndSizeSlow(Ptr, Size, Tok);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    // Passing a null lexer when Tok is null suppresses the warning.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A trigraph may decode to '\\', which could begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// Unlike getCharAndSizeSlow, this variant emits no diagnostics and does not
/// mark a token as needing cleaning; it is driven only by the Features
/// argument (used here for the Trigraphs option).
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &Features) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
          // If this is a \r\n or \n\r, skip the newlines.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;
          // Use slow version to accumulate a correct size field; the next
          // char could itself be a trigraph or another escaped newline.
          return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it (only when trigraphs are enabled).
  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // A trigraph may decode to '\\', which could begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
  449. //===----------------------------------------------------------------------===//
  450. // Helper methods for lexing.
  451. //===----------------------------------------------------------------------===//
/// LexIdentifier - Lex the remainder of an identifier; BufferPtr points at the
/// first character (already validated by the caller) and CurPtr at the second.
/// Forms an identifier token and, outside raw mode, resolves its
/// IdentifierInfo and hands it to the preprocessor for possible macro
/// expansion.
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C)) {
    C = *CurPtr++;
  }
  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::identifier);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode) return;

    // Fill in Result.IdentifierInfo, looking up the identifier in the
    // identifier table.
    PP->LookUpIdentifierInfo(Result, IdStart);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    return PP->HandleIdentifier(Result);
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path that uses
  // getCharAndSize to decode trigraphs/escaped newlines as it goes.
  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume a run of ordinary identifier characters before re-checking for
    // the special cases above.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}
/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.  Forms a tok::numeric_constant covering the whole
/// preprocessing-number, including exponent signs (1e+12) and, as an
/// extension, hex float exponents (0x1p-3).
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  // Consume the longest run of preprocessing-number body characters
  // [a-zA-Z0-9_.], tracking the previous char to detect exponents below.
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // If we have a hex FP constant (p exponent with a sign), continue —
  // accepted when hex floats are enabled or extensions are not disabled.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
      (Features.HexFloats || !Features.NoExtensions))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
}
/// LexStringLiteral - Lex the remainder of a string literal, after having
/// lexed either " or L".  Forms a tok::string_literal (or
/// tok::wide_string_literal when Wide is set), or a tok::unknown token if the
/// literal is unterminated by a newline or end of file.
void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character (so an escaped " does not terminate).
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated string: diagnose and form an unknown token covering what
      // was consumed, excluding the terminator character itself.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_string);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      // Embedded NUL inside the buffer (not EOF); remember it to warn below.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of the token as well as the BufferPtr instance var.
  FormTokenWithChars(Result, CurPtr,
                     Wide ? tok::wide_string_literal : tok::string_literal);
}
  552. /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
  553. /// after having lexed the '<' character. This is used for #include filenames.
  554. void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  555. const char *NulCharacter = 0; // Does this string contain the \0 character?
  556. char C = getAndAdvanceChar(CurPtr, Result);
  557. while (C != '>') {
  558. // Skip escaped characters.
  559. if (C == '\\') {
  560. // Skip the escaped character.
  561. C = getAndAdvanceChar(CurPtr, Result);
  562. } else if (C == '\n' || C == '\r' || // Newline.
  563. (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
  564. if (!isLexingRawMode())
  565. Diag(BufferPtr, diag::err_unterminated_string);
  566. FormTokenWithChars(Result, CurPtr-1, tok::unknown);
  567. return;
  568. } else if (C == 0) {
  569. NulCharacter = CurPtr-1;
  570. }
  571. C = getAndAdvanceChar(CurPtr, Result);
  572. }
  573. // If a nul character existed in the string, warn about it.
  574. if (NulCharacter && !isLexingRawMode())
  575. Diag(NulCharacter, diag::null_in_string);
  576. // Update the location of token as well as BufferPtr.
  577. FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  578. }
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L'.
void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this character contain the \0 character?

  // Handle the common case of 'x' and '\y' efficiently.
  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant; return it as 'unknown'.
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  } else if (C == '\\') {
    // Skip the escaped character.
    // FIXME: UCN's.
    C = getAndAdvanceChar(CurPtr, Result);
  }

  if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
    // Fast path: a single (possibly escaped) character immediately followed
    // by the closing quote.
    ++CurPtr;
  } else {
    // Fall back on generic code for embedded nulls, newlines, wide chars.
    do {
      // Skip escaped characters.
      if (C == '\\') {
        // Skip the escaped character (its value is not examined here).
        C = getAndAdvanceChar(CurPtr, Result);
      } else if (C == '\n' || C == '\r' ||             // Newline.
                 (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::err_unterminated_char);
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return;
      } else if (C == 0) {
        // Remember embedded nul bytes so we can warn about them below.
        NulCharacter = CurPtr-1;
      }
      C = getAndAdvanceChar(CurPtr, Result);
    } while (C != '\'');
  }

  // If a nul character existed in the character constant, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char);

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, tok::char_constant);
}
  621. /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
  622. /// Update BufferPtr to point to the next non-whitespace character and return.
  623. ///
  624. /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
  625. ///
  626. bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  627. // Whitespace - Skip it, then return the token after the whitespace.
  628. unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently.
  629. while (1) {
  630. // Skip horizontal whitespace very aggressively.
  631. while (isHorizontalWhitespace(Char))
  632. Char = *++CurPtr;
  633. // Otherwise if we have something other than whitespace, we're done.
  634. if (Char != '\n' && Char != '\r')
  635. break;
  636. if (ParsingPreprocessorDirective) {
  637. // End of preprocessor directive line, let LexTokenInternal handle this.
  638. BufferPtr = CurPtr;
  639. return false;
  640. }
  641. // ok, but handle newline.
  642. // The returned token is at the start of the line.
  643. Result.setFlag(Token::StartOfLine);
  644. // No leading whitespace seen so far.
  645. Result.clearFlag(Token::LeadingSpace);
  646. Char = *++CurPtr;
  647. }
  648. // If this isn't immediately after a newline, there is leading space.
  649. char PrevChar = CurPtr[-1];
  650. if (PrevChar != '\n' && PrevChar != '\r')
  651. Result.setFlag(Token::LeadingSpace);
  652. // If the client wants us to return whitespace, return it now.
  653. if (isKeepWhitespaceMode()) {
  654. FormTokenWithChars(Result, CurPtr, tok::unknown);
  655. return true;
  656. }
  657. BufferPtr = CurPtr;
  658. return false;
  659. }
  660. // SkipBCPLComment - We have just read the // characters from input. Skip until
  661. // we find the newline character thats terminate the comment. Then update
  662. /// BufferPtr and return. If we're in KeepCommentMode, this will form the token
  663. /// and return true.
  664. bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
  665. // If BCPL comments aren't explicitly enabled for this language, emit an
  666. // extension warning.
  667. if (!Features.BCPLComment && !isLexingRawMode()) {
  668. Diag(BufferPtr, diag::ext_bcpl_comment);
  669. // Mark them enabled so we only emit one warning for this translation
  670. // unit.
  671. Features.BCPLComment = true;
  672. }
  673. // Scan over the body of the comment. The common case, when scanning, is that
  674. // the comment contains normal ascii characters with nothing interesting in
  675. // them. As such, optimize for this case with the inner loop.
  676. char C;
  677. do {
  678. C = *CurPtr;
  679. // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character.
  680. // If we find a \n character, scan backwards, checking to see if it's an
  681. // escaped newline, like we do for block comments.
  682. // Skip over characters in the fast loop.
  683. while (C != 0 && // Potentially EOF.
  684. C != '\\' && // Potentially escaped newline.
  685. C != '?' && // Potentially trigraph.
  686. C != '\n' && C != '\r') // Newline or DOS-style newline.
  687. C = *++CurPtr;
  688. // If this is a newline, we're done.
  689. if (C == '\n' || C == '\r')
  690. break; // Found the newline? Break out!
  691. // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
  692. // properly decode the character. Read it in raw mode to avoid emitting
  693. // diagnostics about things like trigraphs. If we see an escaped newline,
  694. // we'll handle it below.
  695. const char *OldPtr = CurPtr;
  696. bool OldRawMode = isLexingRawMode();
  697. LexingRawMode = true;
  698. C = getAndAdvanceChar(CurPtr, Result);
  699. LexingRawMode = OldRawMode;
  700. // If we read multiple characters, and one of those characters was a \r or
  701. // \n, then we had an escaped newline within the comment. Emit diagnostic
  702. // unless the next line is also a // comment.
  703. if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
  704. for (; OldPtr != CurPtr; ++OldPtr)
  705. if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
  706. // Okay, we found a // comment that ends in a newline, if the next
  707. // line is also a // comment, but has spaces, don't emit a diagnostic.
  708. if (isspace(C)) {
  709. const char *ForwardPtr = CurPtr;
  710. while (isspace(*ForwardPtr)) // Skip whitespace.
  711. ++ForwardPtr;
  712. if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
  713. break;
  714. }
  715. if (!isLexingRawMode())
  716. Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
  717. break;
  718. }
  719. }
  720. if (CurPtr == BufferEnd+1) { --CurPtr; break; }
  721. } while (C != '\n' && C != '\r');
  722. // Found but did not consume the newline.
  723. // If we are returning comments as tokens, return this comment as a token.
  724. if (inKeepCommentMode())
  725. return SaveBCPLComment(Result, CurPtr);
  726. // If we are inside a preprocessor directive and we see the end of line,
  727. // return immediately, so that the lexer can return this as an EOM token.
  728. if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
  729. BufferPtr = CurPtr;
  730. return false;
  731. }
  732. // Otherwise, eat the \n character. We don't care if this is a \n\r or
  733. // \r\n sequence. This is an efficiency hack (because we know the \n can't
  734. // contribute to another token), it isn't needed for correctness. Note that
  735. // this is ok even in KeepWhitespaceMode, because we would have returned the
  736. /// comment above in that mode.
  737. ++CurPtr;
  738. // The next returned token is at the start of the line.
  739. Result.setFlag(Token::StartOfLine);
  740. // No leading whitespace seen so far.
  741. Result.clearFlag(Token::LeadingSpace);
  742. BufferPtr = CurPtr;
  743. return false;
  744. }
  745. /// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
  746. /// an appropriate way and return it.
  747. bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
  748. // If we're not in a preprocessor directive, just return the // comment
  749. // directly.
  750. FormTokenWithChars(Result, CurPtr, tok::comment);
  751. if (!ParsingPreprocessorDirective)
  752. return true;
  753. // If this BCPL-style comment is in a macro definition, transmogrify it into
  754. // a C-style block comment.
  755. std::string Spelling = PP->getSpelling(Result);
  756. assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
  757. Spelling[1] = '*'; // Change prefix to "/*".
  758. Spelling += "*/"; // add suffix.
  759. Result.setKind(tok::comment);
  760. Result.setLocation(PP->CreateString(&Spelling[0], Spelling.size(),
  761. Result.getLocation()));
  762. Result.setLength(Spelling.size());
  763. return true;
  764. }
/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
/// diagnostic if so.  We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;

    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.  Nul bytes are skipped like whitespace
  // here as well.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // Only a backslash immediately after a '*' can end the comment.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}
  816. #ifdef __SSE2__
  817. #include <emmintrin.h>
  818. #elif __ALTIVEC__
  819. #include <altivec.h>
  820. #undef bool
  821. #endif
/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If KeepCommentMode is enabled, this forms a token from the comment and
/// returns true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    // Hit end of file right after the "/*".
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/' and advance while no match.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      // AltiVec equivalent of the SSE2 scan above.
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    C = *CurPtr++;
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
  952. //===----------------------------------------------------------------------===//
  953. // Primary Lexing Entry Points
  954. //===----------------------------------------------------------------------===//
  955. /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
  956. /// uninterpreted string. This switches the lexer out of directive mode.
  957. std::string Lexer::ReadToEndOfLine() {
  958. assert(ParsingPreprocessorDirective && ParsingFilename == false &&
  959. "Must be in a preprocessing directive!");
  960. std::string Result;
  961. Token Tmp;
  962. // CurPtr - Cache BufferPtr in an automatic variable.
  963. const char *CurPtr = BufferPtr;
  964. while (1) {
  965. char Char = getAndAdvanceChar(CurPtr, Tmp);
  966. switch (Char) {
  967. default:
  968. Result += Char;
  969. break;
  970. case 0: // Null.
  971. // Found end of file?
  972. if (CurPtr-1 != BufferEnd) {
  973. // Nope, normal character, continue.
  974. Result += Char;
  975. break;
  976. }
  977. // FALL THROUGH.
  978. case '\r':
  979. case '\n':
  980. // Okay, we found the end of the line. First, back up past the \0, \r, \n.
  981. assert(CurPtr[-1] == Char && "Trigraphs for newline?");
  982. BufferPtr = CurPtr-1;
  983. // Next, lex the character, which should handle the EOM transition.
  984. Lex(Tmp);
  985. assert(Tmp.is(tok::eom) && "Unexpected token!");
  986. // Finally, we're done, return the string we found.
  987. return Result;
  988. }
  989. }
  990. }
  991. /// LexEndOfFile - CurPtr points to the end of this file. Handle this
  992. /// condition, reporting diagnostics and handling other edge cases as required.
  993. /// This returns true if Result contains a token, false if PP.Lex should be
  994. /// called again.
  995. bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  996. // If we hit the end of the file while parsing a preprocessor directive,
  997. // end the preprocessor directive first. The next token returned will
  998. // then be the end of file.
  999. if (ParsingPreprocessorDirective) {
  1000. // Done parsing the "line".
  1001. ParsingPreprocessorDirective = false;
  1002. // Update the location of token as well as BufferPtr.
  1003. FormTokenWithChars(Result, CurPtr, tok::eom);
  1004. // Restore comment saving mode, in case it was disabled for directive.
  1005. SetCommentRetentionState(PP->getCommentRetentionState());
  1006. return true; // Have a token.
  1007. }
  1008. // If we are in raw mode, return this event as an EOF token. Let the caller
  1009. // that put us in raw mode handle the event.
  1010. if (isLexingRawMode()) {
  1011. Result.startToken();
  1012. BufferPtr = BufferEnd;
  1013. FormTokenWithChars(Result, BufferEnd, tok::eof);
  1014. return true;
  1015. }
  1016. // Otherwise, issue diagnostics for unterminated #if and missing newline.
  1017. // If we are in a #if directive, emit an error.
  1018. while (!ConditionalStack.empty()) {
  1019. PP->Diag(ConditionalStack.back().IfLoc,
  1020. diag::err_pp_unterminated_conditional);
  1021. ConditionalStack.pop_back();
  1022. }
  1023. // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  1024. // a pedwarn.
  1025. if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
  1026. Diag(BufferEnd, diag::ext_no_newline_eof);
  1027. BufferPtr = CurPtr;
  1028. // Finally, let the preprocessor handle this.
  1029. return PP->HandleEndOfFile(Result);
  1030. }
  1031. /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
  1032. /// the specified lexer will return a tok::l_paren token, 0 if it is something
  1033. /// else and 2 if there are no more tokens in the buffer controlled by the
  1034. /// lexer.
  1035. unsigned Lexer::isNextPPTokenLParen() {
  1036. assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
  1037. // Switch to 'skipping' mode. This will ensure that we can lex a token
  1038. // without emitting diagnostics, disables macro expansion, and will cause EOF
  1039. // to return an EOF token instead of popping the include stack.
  1040. LexingRawMode = true;
  1041. // Save state that can be changed while lexing so that we can restore it.
  1042. const char *TmpBufferPtr = BufferPtr;
  1043. Token Tok;
  1044. Tok.startToken();
  1045. LexTokenInternal(Tok);
  1046. // Restore state that may have changed.
  1047. BufferPtr = TmpBufferPtr;
  1048. // Restore the lexer back to non-skipping mode.
  1049. LexingRawMode = false;
  1050. if (Tok.is(tok::eof))
  1051. return 2;
  1052. return Tok.is(tok::l_paren);
  1053. }
  1054. /// LexTokenInternal - This implements a simple C family lexer. It is an
  1055. /// extremely performance critical piece of code. This assumes that the buffer
  1056. /// has a null character at the end of the file. Return true if an error
  1057. /// occurred and compilation should terminate, false if normal. This returns a
  1058. /// preprocessing token, not a normal token, as such, it is an internal
  1059. /// interface. It assumes that the Flags of result have been cleared before
  1060. /// calling this.
  1061. void Lexer::LexTokenInternal(Token &Result) {
  1062. LexNextToken:
  1063. // New token, can't need cleaning yet.
  1064. Result.clearFlag(Token::NeedsCleaning);
  1065. Result.setIdentifierInfo(0);
  1066. // CurPtr - Cache BufferPtr in an automatic variable.
  1067. const char *CurPtr = BufferPtr;
  1068. // Small amounts of horizontal whitespace is very common between tokens.
  1069. if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
  1070. ++CurPtr;
  1071. while ((*CurPtr == ' ') || (*CurPtr == '\t'))
  1072. ++CurPtr;
  1073. // If we are keeping whitespace and other tokens, just return what we just
  1074. // skipped. The next lexer invocation will return the token after the
  1075. // whitespace.
  1076. if (isKeepWhitespaceMode()) {
  1077. FormTokenWithChars(Result, CurPtr, tok::unknown);
  1078. return;
  1079. }
  1080. BufferPtr = CurPtr;
  1081. Result.setFlag(Token::LeadingSpace);
  1082. }
  1083. unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
  1084. // Read a character, advancing over it.
  1085. char Char = getAndAdvanceChar(CurPtr, Result);
  1086. tok::TokenKind Kind;
  1087. switch (Char) {
  1088. case 0: // Null.
  1089. // Found end of file?
  1090. if (CurPtr-1 == BufferEnd) {
  1091. // Read the PP instance variable into an automatic variable, because
  1092. // LexEndOfFile will often delete 'this'.
  1093. Preprocessor *PPCache = PP;
  1094. if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file.
  1095. return; // Got a token to return.
  1096. assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
  1097. return PPCache->Lex(Result);
  1098. }
  1099. if (!isLexingRawMode())
  1100. Diag(CurPtr-1, diag::null_in_file);
  1101. Result.setFlag(Token::LeadingSpace);
  1102. if (SkipWhitespace(Result, CurPtr))
  1103. return; // KeepWhitespaceMode
  1104. goto LexNextToken; // GCC isn't tail call eliminating.
  1105. case '\n':
  1106. case '\r':
  1107. // If we are inside a preprocessor directive and we see the end of line,
  1108. // we know we are done with the directive, so return an EOM token.
  1109. if (ParsingPreprocessorDirective) {
  1110. // Done parsing the "line".
  1111. ParsingPreprocessorDirective = false;
  1112. // Restore comment saving mode, in case it was disabled for directive.
  1113. SetCommentRetentionState(PP->getCommentRetentionState());
  1114. // Since we consumed a newline, we are back at the start of a line.
  1115. IsAtStartOfLine = true;
  1116. Kind = tok::eom;
  1117. break;
  1118. }
  1119. // The returned token is at the start of the line.
  1120. Result.setFlag(Token::StartOfLine);
  1121. // No leading whitespace seen so far.
  1122. Result.clearFlag(Token::LeadingSpace);
  1123. if (SkipWhitespace(Result, CurPtr))
  1124. return; // KeepWhitespaceMode
  1125. goto LexNextToken; // GCC isn't tail call eliminating.
  1126. case ' ':
  1127. case '\t':
  1128. case '\f':
  1129. case '\v':
  1130. SkipHorizontalWhitespace:
  1131. Result.setFlag(Token::LeadingSpace);
  1132. if (SkipWhitespace(Result, CurPtr))
  1133. return; // KeepWhitespaceMode
  1134. SkipIgnoredUnits:
  1135. CurPtr = BufferPtr;
  1136. // If the next token is obviously a // or /* */ comment, skip it efficiently
  1137. // too (without going through the big switch stmt).
  1138. if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
  1139. Features.BCPLComment) {
  1140. SkipBCPLComment(Result, CurPtr+2);
  1141. goto SkipIgnoredUnits;
  1142. } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
  1143. SkipBlockComment(Result, CurPtr+2);
  1144. goto SkipIgnoredUnits;
  1145. } else if (isHorizontalWhitespace(*CurPtr)) {
  1146. goto SkipHorizontalWhitespace;
  1147. }
  1148. goto LexNextToken; // GCC isn't tail call eliminating.
  1149. // C99 6.4.4.1: Integer Constants.
  1150. // C99 6.4.4.2: Floating Constants.
  1151. case '0': case '1': case '2': case '3': case '4':
  1152. case '5': case '6': case '7': case '8': case '9':
  1153. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1154. MIOpt.ReadToken();
  1155. return LexNumericConstant(Result, CurPtr);
  1156. case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
  1157. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1158. MIOpt.ReadToken();
  1159. Char = getCharAndSize(CurPtr, SizeTmp);
  1160. // Wide string literal.
  1161. if (Char == '"')
  1162. return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  1163. true);
  1164. // Wide character constant.
  1165. if (Char == '\'')
  1166. return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
  1167. // FALL THROUGH, treating L like the start of an identifier.
  1168. // C99 6.4.2: Identifiers.
  1169. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  1170. case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
  1171. case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  1172. case 'V': case 'W': case 'X': case 'Y': case 'Z':
  1173. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  1174. case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  1175. case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  1176. case 'v': case 'w': case 'x': case 'y': case 'z':
  1177. case '_':
  1178. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1179. MIOpt.ReadToken();
  1180. return LexIdentifier(Result, CurPtr);
  1181. case '$': // $ in identifiers.
  1182. if (Features.DollarIdents) {
  1183. if (!isLexingRawMode())
  1184. Diag(CurPtr-1, diag::ext_dollar_in_identifier);
  1185. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1186. MIOpt.ReadToken();
  1187. return LexIdentifier(Result, CurPtr);
  1188. }
  1189. Kind = tok::unknown;
  1190. break;
  1191. // C99 6.4.4: Character Constants.
  1192. case '\'':
  1193. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1194. MIOpt.ReadToken();
  1195. return LexCharConstant(Result, CurPtr);
  1196. // C99 6.4.5: String Literals.
  1197. case '"':
  1198. // Notify MIOpt that we read a non-whitespace/non-comment token.
  1199. MIOpt.ReadToken();
  1200. return LexStringLiteral(Result, CurPtr, false);
  1201. // C99 6.4.6: Punctuators.
  1202. case '?':
  1203. Kind = tok::question;
  1204. break;
  1205. case '[':
  1206. Kind = tok::l_square;
  1207. break;
  1208. case ']':
  1209. Kind = tok::r_square;
  1210. break;
  1211. case '(':
  1212. Kind = tok::l_paren;
  1213. break;
  1214. case ')':
  1215. Kind = tok::r_paren;
  1216. break;
  1217. case '{':
  1218. Kind = tok::l_brace;
  1219. break;
  1220. case '}':
  1221. Kind = tok::r_brace;
  1222. break;
  1223. case '.':
  1224. Char = getCharAndSize(CurPtr, SizeTmp);
    // '.' followed by a digit is actually the start of a numeric constant
    // (e.g. ".5"); hand it off to the numeric-constant lexer.
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (Features.CPlusPlus && Char == '*') {  // C++ '.*'
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {  // '...'
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    // '&', '&&', or '&='.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    // '*' or '*='.
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    // '+', '++', or '+='.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    // '-', '--', '->', C++ '->*', or '-='.  Note that '->*' must be checked
    // before '->' so the longer token wins (maximal munch).
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {                                  // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && Features.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {                           // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {                           // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    // '!' or '!='.
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // BCPL comment.
      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of
      // the resultant program.  For example, in "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with BCPL comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      if (Features.BCPLComment ||
          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
          return; // KeepCommentMode

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going
        // through the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
        return; // KeepCommentMode
      goto LexNextToken;   // GCC isn't tail call eliminating.
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    // '%', '%=', or the digraphs '%>' (-> '}'), '%:' (-> '#'),
    // '%:%:' (-> '##'), plus the Microsoft extension '%:@' (-> '#@').
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::charize_microsoft_ext);
        Kind = tok::hashat;
      } else {
        Kind = tok::hash;                              // '%:' -> '#'

        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // FIXME: -fpreprocessed mode??
        if (Result.isAtStartOfLine() && !LexingRawMode) {
          BufferPtr = CurPtr;
          PP->HandleDirective(Result);

          // As an optimization, if the preprocessor didn't switch lexers,
          // tail recurse.
          if (PP->isCurrentLexer(this)) {
            // Start a new token.  If this is a #include or something, the PP
            // may want us starting at the beginning of the line again.  If
            // so, set the StartOfLine flag.
            if (IsAtStartOfLine) {
              Result.setFlag(Token::StartOfLine);
              IsAtStartOfLine = false;
            }
            goto LexNextToken;   // GCC isn't tail call eliminating.
          }

          return PP->Lex(Result);
        }
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    // '<', '<<', '<<=', '<=', or the digraphs '<:' (-> '[') and '<%' (-> '{').
    // While lexing a #include filename, '<' instead opens an angled string
    // literal (header name).
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr+SizeTmp);
    } else if (Char == '<' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      Kind = tok::lesslessequal;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else if (Char == '<') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessless;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    // '>', '>=', '>>', or '>>='.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::greatergreaterequal;
    } else if (Char == '>') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greatergreater;
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    // '^' or '^='.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    // '|', '|=', or '||'.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    // ':', the digraph ':>' (-> ']'), or C++ '::'.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Features.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.CPlusPlus && Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    // '=' or '=='.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    // '#', '##', or the Microsoft charizing operator '#@'.
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::charize_microsoft_ext);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::hash;
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // FIXME: -fpreprocessed mode??
      if (Result.isAtStartOfLine() && !LexingRawMode) {
        BufferPtr = CurPtr;
        PP->HandleDirective(Result);

        // As an optimization, if the preprocessor didn't switch lexers, tail
        // recurse.
        if (PP->isCurrentLexer(this)) {
          // Start a new token.  If this is a #include or something, the PP
          // may want us starting at the beginning of the line again.  If so,
          // set the StartOfLine flag.
          if (IsAtStartOfLine) {
            Result.setFlag(Token::StartOfLine);
            IsAtStartOfLine = false;
          }
          goto LexNextToken;   // GCC isn't tail call eliminating.
        }
        return PP->Lex(Result);
      }
    }
    break;

  case '@':
    // Objective C support: '@' is a real token only in ObjC mode.
    // CurPtr[-1] is the character that got us into this case label.
    if (CurPtr[-1] == '@' && Features.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  case '\\':
    // FIXME: UCN's.
    // FALL THROUGH.
  default:
    Kind = tok::unknown;
    break;
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
}