//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "UnicodeCharSets.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/MultipleIncludeOpt.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/UnicodeCharRanges.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <tuple>
#include <utility>

using namespace clang;

//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  if (isAnnotation())
    return false;
  if (IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  if (isAnnotation())
    return tok::objc_not_keyword;
  IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}
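
// Editor's illustrative sketch (not part of the upstream file): a typical use
// of the two accessors above when checking for a specific Objective-C
// at-keyword. 'Tok' is assumed to be a Token produced by the preprocessor.
//
//   void handleAtToken(const Token &Tok) {
//     // Fires only for identifiers whose IdentifierInfo maps to @interface.
//     if (Tok.isObjCAtKeyword(tok::objc_interface))
//       llvm::errs() << "saw @interface keyword\n";
//     // getObjCKeywordID() returns tok::objc_not_keyword for everything else.
//   }
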
//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

void Lexer::anchor() {}

void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
                           .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode. Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process. This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()) {
  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
            InputFile->getBufferEnd());
  resetExtendedTokenMode();
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd)
    : FileLoc(fileloc), LangOpts(langOpts) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
             const SourceManager &SM, const LangOptions &langOpts)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}

void Lexer::resetExtendedTokenMode() {
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
    SetKeepWhitespaceMode(true);
  else
    SetCommentRetentionState(PP->getCommentRetentionState());
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion. This has a variety of magic semantics that this method
/// sets up. It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by. This would require making
/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
/// interface that could handle this stuff. This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want. This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information. This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}

bool Lexer::skipOver(unsigned NumBytes) {
  IsAtPhysicalStartOfLine = true;
  IsAtStartOfLine = true;
  if ((BufferPtr + NumBytes) > BufferEnd)
    return true;
  BufferPtr += NumBytes;
  return false;
}

template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}

std::string Lexer::Stringify(StringRef Str, bool Charify) {
  std::string Result = Str;
  char Quote = Charify ? '\'' : '"';
  StringifyImpl(Result, Quote);
  return Result;
}

void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
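
// Editor's illustrative sketch (not part of the upstream file): what
// Lexer::Stringify produces. Backslashes and the chosen quote character are
// escaped, and embedded newlines are rewritten as the two characters '\' 'n',
// so the result can be spliced into a string (or, with Charify, character)
// literal.
//
//   std::string Escaped = Lexer::Stringify("say \"hi\"");
//   // The input text  say "hi"  becomes the text  say \"hi\"  .
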
//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//

/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      unsigned Size;
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
      BufPtr += Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  while (BufPtr < BufEnd) {
    unsigned Size;
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    BufPtr += Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}

/// getSpelling() - Return the 'spelling' of this token. The spelling of a
/// token is the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs,
/// UCNs, etc.
StringRef Lexer::getSpelling(SourceLocation loc,
                             SmallVectorImpl<char> &buffer,
                             const SourceManager &SM,
                             const LangOptions &options,
                             bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return {};
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case: no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}

/// getSpelling() - Return the 'spelling' of this token. The spelling of a
/// token is the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding. In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs,
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  bool CharDataInvalid = false;
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return {};

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart + Tok.getLength());

  std::string Result;
  Result.resize(Tok.getLength());
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
  return Result;
}
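
// Editor's illustrative sketch (not part of the upstream file): fetching a
// token's spelling inside a tool, assuming 'Tok', 'SM' (SourceManager) and
// 'LangOpts' are in scope. For tokens without trigraphs or escaped newlines
// this is a straight copy of the underlying buffer bytes.
//
//   bool Invalid = false;
//   std::string Text = Lexer::getSpelling(Tok, SM, LangOpts, &Invalid);
//   if (!Invalid)
//     llvm::errs() << "spelling: " << Text << "\n";
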
/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string. The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long. The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy). The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}

/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  Token TheTok;
  if (getRawToken(Loc, TheTok, SM, LangOpts))
    return 0;
  return TheTok.getLength();
}
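
// Editor's illustrative sketch (not part of the upstream file): combining
// MeasureTokenLength with a file location to recover the token's text without
// constructing a Token, assuming 'Loc', 'SM' and 'LangOpts' are in scope.
//
//   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
//   StringRef TokText(SM.getCharacterData(Loc), Len);
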
/// Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
                        const SourceManager &SM,
                        const LangOptions &LangOpts,
                        bool IgnoreWhiteSpace) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered. Just look at StrData[0] to handle
  // all obviously single-char tokens. This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return true;

  const char *StrData = Buffer.data()+LocInfo.second;

  if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
    return true;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  TheLexer.LexFromRawLexer(Result);
  return false;
}

/// Returns the pointer that points to the beginning of line that contains
/// the given offset, or null if the offset is invalid.
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
  const char *BufStart = Buffer.data();
  if (Offset >= Buffer.size())
    return nullptr;

  const char *LexStart = BufStart + Offset;
  for (; LexStart != BufStart; --LexStart) {
    if (isVerticalWhitespace(LexStart[0]) &&
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
      // LexStart should point at first character of logical line.
      ++LexStart;
      break;
    }
  }
  return LexStart;
}

static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}
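
// Editor's illustrative sketch (not part of the upstream file): using
// GetBeginningOfToken to normalize a location that may point into the middle
// of a token (e.g. a caret column from a crash report), assuming 'Loc', 'SM'
// and 'LangOpts' are in scope.
//
//   SourceLocation TokBegin = Lexer::GetBeginningOfToken(Loc, SM, LangOpts);
//   // TokBegin now points at the first character of the enclosing token,
//   // or at 'Loc' itself if the location sits in whitespace.
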
namespace {

enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace

PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const unsigned StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}
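
// Editor's illustrative sketch (not part of the upstream file): asking for the
// preamble bounds of a main-file buffer, e.g. when deciding how much of the
// file a precompiled preamble may cover. 'Buffer' and 'LangOpts' are assumed
// to be in scope; PreambleBounds exposes the size and whether the preamble
// ends at the start of a line.
//
//   PreambleBounds Bounds = Lexer::ComputePreamble(Buffer, LangOpts);
//   llvm::errs() << "preamble covers " << Bounds.Size << " bytes\n";
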
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    unsigned Size;
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    TokPtr += Size;
    PhysOffset += Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}

/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different than it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}
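
// Editor's illustrative sketch (not part of the upstream file): the classic
// fix-it pattern built on getLocForEndOfToken, inserting text immediately
// after an existing token. 'Diag', 'Loc', 'SM' and 'LangOpts' are assumed to
// be in scope.
//
//   SourceLocation After = Lexer::getLocForEndOfToken(Loc, /*Offset=*/0, SM,
//                                                     LangOpts);
//   if (After.isValid())
//     Diag << FixItHint::CreateInsertion(After, ";");
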
/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
                                    const SourceManager &SM,
                                    const LangOptions &LangOpts,
                                    SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  if (tokLen == 0)
    return false;

  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  SourceLocation expansionLoc;
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}

static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM, LangOpts);
    if (End.isInvalid())
      return {};
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return {};

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return {};

  return CharSourceRange::getCharRange(Begin, End);
}

CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  if (Begin.isFileID() && End.isMacroID()) {
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                          &End)) ||
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                           &End)))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}
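
// Editor's illustrative sketch (not part of the upstream file): grabbing the
// exact source text behind an AST node's range, assuming 'Range' is a
// SourceRange and 'SM'/'LangOpts' are in scope. getTokenRange/getCharRange
// select whether the end location names the last token or the past-the-end
// character.
//
//   bool Invalid = false;
//   StringRef Snippet = Lexer::getSourceText(
//       CharSourceRange::getTokenRange(Range), SM, LangOpts, &Invalid);
//   if (!Invalid)
//     llvm::errs() << "source: " << Snippet << "\n";
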
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
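
// Editor's illustrative sketch (not part of the upstream file): reporting
// which macro a diagnosed location was expanded from, assuming 'Loc' is a
// macro location and 'SM'/'LangOpts' are in scope.
//
//   if (Loc.isMacroID()) {
//     StringRef Name = Lexer::getImmediateMacroName(Loc, SM, LangOpts);
//     llvm::errs() << "expanded from macro '" << Name << "'\n";
//   }
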
  873. StringRef Lexer::getImmediateMacroNameForDiagnostics(
  874. SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  875. assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  876. // Walk past macro argument expansions.
  877. while (SM.isMacroArgExpansion(Loc))
  878. Loc = SM.getImmediateExpansionRange(Loc).getBegin();
  879. // If the macro's spelling has no FileID, then it's actually a token paste
  880. // or stringization (or similar) and not a macro at all.
  881. if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
  882. return {};
  883. // Find the spelling location of the start of the non-argument expansion
  884. // range. This is where the macro name was spelled in order to begin
  885. // expanding this macro.
  886. Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
  887. // Dig out the buffer where the macro name was spelled and the extents of the
  888. // name so that we can render it into the expansion note.
  889. std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  890. unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  891. StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  892. return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
  893. }
  894. bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
  895. return isIdentifierBody(c, LangOpts.DollarIdents);
  896. }
  897. bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  898. assert(isVerticalWhitespace(Str[0]));
  899. if (Str - 1 < BufferStart)
  900. return false;
  901. if ((Str[0] == '\n' && Str[-1] == '\r') ||
  902. (Str[0] == '\r' && Str[-1] == '\n')) {
  903. if (Str - 2 < BufferStart)
  904. return false;
  905. --Str;
  906. }
  907. --Str;
  908. // Rewind to first non-space character:
  909. while (Str > BufferStart && isHorizontalWhitespace(*Str))
  910. --Str;
  911. return *Str == '\\';
  912. }

StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}

//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method. Do not allow it to be inlined.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);

static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens". This is used for things like
  // _Pragma handling. Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics. This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}
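
// Illustrative examples for GetTrigraphCharForLetter: the source sequences
// "??=", "??(", "??/" and "??-" decode (when trigraphs are enabled) to '#',
// '[', '\\' and '~' respectively; for a letter with no mapping, such as the
// 'x' in "??x", the function returns 0 and the '?' characters are lexed
// unchanged.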

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
/// return the result character. Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res || !L) return Res;

  if (!L->getLangOpts().Trigraphs) {
    if (!L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  if (!L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}
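
// Illustrative examples for getEscapedNewLineSize (Ptr points just past the
// backslash or its trigraph equivalent): for the character sequence
// <space><CR><LF> it returns 3 (the space plus both halves of the
// two-character newline); for a backslash followed by a tab and then more
// text on the same line it returns 0, since no newline follows.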

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

Optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return None;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return None;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}
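
// Usage sketch for findNextToken (illustrative only; EndLoc, SM, and LangOpts
// are assumed to come from the caller):
//
//   if (Optional<Token> Next = Lexer::findNextToken(EndLoc, SM, LangOpts))
//     if (Next->is(tok::semi))
//       ; // the raw token immediately after EndLoc is a ';'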

/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}
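
// Usage sketch for findLocationAfterToken (illustrative only; StmtEndLoc, SM,
// and LangOpts are assumed to come from the caller). A typical use is finding
// an insertion point just past a trailing ';', including any whitespace and
// the newline that follows it:
//
//   SourceLocation AfterSemi = Lexer::findLocationAfterToken(
//       StmtEndLoc, tok::semi, SM, LangOpts,
//       /*SkipTrailingWhitespaceAndNewLine=*/true);
//   if (AfterSemi.isValid())
//     ; // AfterSemi points past the ';' and the skipped whitespace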

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it. This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method. Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning. If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &LangOpts) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
  BufferPtr = BufferStart + Offset;
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  // FIXME: What exactly does the StartOfLine bit mean? There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else if (LangOpts.CPlusPlus) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    return CXX03AllowedIDChars.contains(C);
  } else {
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}

static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
  assert(isAllowedIDChar(C, LangOpts));
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  } else if (LangOpts.CPlusPlus) {
    return true;
  } else {
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    return !C99DisallowedInitialIDChars.contains(C);
  }
}

static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}

static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotStartIdentifier;
    }
  }

  // Check C++98 compatibility.
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    if (!CXX03AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
        << Range;
    }
  }
}

/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},    // SOFT HYPHEN
    {U'\u01c3', '!'},  // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'},  // GREEK QUESTION MARK
    {U'\u200b', 0},    // ZERO WIDTH SPACE
    {U'\u200c', 0},    // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},    // ZERO WIDTH JOINER
    {U'\u2060', 0},    // WORD JOINER
    {U'\u2061', 0},    // FUNCTION APPLICATION
    {U'\u2062', 0},    // INVISIBLE TIMES
    {U'\u2063', 0},    // INVISIBLE SEPARATOR
    {U'\u2064', 0},    // INVISIBLE PLUS
    {U'\u2212', '-'},  // MINUS SIGN
    {U'\u2215', '/'},  // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'},  // ASTERISK OPERATOR
    {U'\u2223', '|'},  // DIVIDES
    {U'\u2227', '^'},  // LOGICAL AND
    {U'\u2236', ':'},  // RATIO
    {U'\u223c', '~'},  // TILDE OPERATOR
    {U'\ua789', ':'},  // MODIFIER LETTER COLON
    {U'\ufeff', 0},    // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'},  // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'},  // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'},  // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'},  // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'},  // FULLWIDTH AMPERSAND
    {U'\uff08', '('},  // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'},  // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'},  // FULLWIDTH ASTERISK
    {U'\uff0b', '+'},  // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','},  // FULLWIDTH COMMA
    {U'\uff0d', '-'},  // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'},  // FULLWIDTH FULL STOP
    {U'\uff0f', '/'},  // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'},  // FULLWIDTH COLON
    {U'\uff1b', ';'},  // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'},  // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='},  // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'},  // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'},  // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'},  // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['},  // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'},  // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'},  // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'},  // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'},  // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'},  // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'},  // FULLWIDTH TILDE
    {0, 0}
  };
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    llvm::SmallString<5> CharBuf;
    {
      llvm::raw_svector_ostream CharOS(CharBuf);
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
    }
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << CharBuf << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << CharBuf;
    }
  }
}
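
// Illustrative examples for maybeDiagnoseUTF8Homoglyph: an identifier
// containing U+FF1B (FULLWIDTH SEMICOLON) triggers warn_utf8_symbol_homoglyph
// noting that it looks like ';', while an invisible character such as U+200B
// (ZERO WIDTH SPACE) triggers warn_utf8_symbol_zero_width instead.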

bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  llvm::UTF32 CodePoint;
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  if (Result != llvm::conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}

bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw. There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on it.

    // If the completion point is at the end of an identifier, we want to treat
    // the identifier as incomplete even if it resolves to a macro or a keyword.
    // This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier characters.
      // This ensures we get consistent behavior when completing at any point in
      // an identifier (i.e. at the start, in the middle, at the end). Note that
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
      // simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that code completion token is not added as a separate character
      // when the completion point is at the end of the buffer. Therefore, we need
      // to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found. Enter slower path.
  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to a hex constant,
/// in Microsoft mode (where this is supposed to be several different tokens).
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
  if (C1 != '0')
    return false;
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
  return (C2 == 'x' || C2 == 'X');
}
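
// Illustrative examples for isHexaLiteral: "0x1234" and "0X1p3" yield true
// (a '0' followed by 'x' or 'X'), while "01234" and "1e+5" yield false.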

/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant. From[-1] is the first character lexed. Return the end of the
/// constant.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!getLangOpts().CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && getLangOpts().CPlusPlus14) {
    unsigned NextSize;
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
    if (isIdentifierBody(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}
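
// Illustrative examples for LexNumericConstant: "1e+12" and the hex float
// "0x1.8p-3" are each consumed as a single numeric_constant token because the
// sign after 'e'/'p' is folded in above; in C++14 mode "1'000'000" is also a
// single token thanks to the digit-separator handling.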

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isIdentifierHead(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
      Consumed = true;
    else
      return CurPtr;
  }

  if (!getLangOpts().CPlusPlus11) {
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        unsigned NextSize;
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
                                         getLangOpts());
        if (!isIdentifierBody(Next)) {
          // End of suffix. Check whether this is on the whitelist.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
                                                            CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
    else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
    else break;
  }

  return CurPtr;
}
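
// Illustrative examples for LexUDSuffix: in "abc"_s the ud-suffix begins with
// '_' and is accepted directly; in C++14, "abc"s is also accepted because the
// short suffix "s" passes StringLiteralParser::isValidUDSuffix, whereas a
// reserved suffix such as "abc"foo is diagnosed with
// ext_reserved_user_defined_literal and treated as if whitespace preceded it.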

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
                        ? diag::warn_cxx98_compat_unicode_literal
                        : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||            // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
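
// Illustrative example for LexRawStringLiteral: in R"foo(printf(")");)foo" the
// delimiter is "foo", so the literal only terminates at the final )foo" and
// the embedded ")" sequences are plain content. A delimiter longer than 16
// characters produces err_raw_delim_too_long, and an invalid delimiter
// character produces err_invalid_char_raw_delim.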

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote, if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||            // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment. Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline. Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character. We don't care if this is a \n\r or
  // \r\n sequence. This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness. Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
/// newline character (either \\n or \\r) is part of an escaped newline
/// sequence. Issue a diagnostic if so. We know that the newline is inside of a
/// block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;

    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it. We allow whitespace
  // between the slash and newline.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment. Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getLangOpts().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}
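
// Illustrative example for isEndOfBlockCommentWithEscapedNewLine: a block
// comment whose closing delimiter is written as '*', a backslash, a newline,
// and then '/' still terminates the comment; the function returns true and
// escaped_newline_block_comment_end (plus backslash_newline_space if there
// was whitespace after the backslash) is emitted.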

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif
  2146. /// We have just read from input the / and * characters that started a comment.
  2147. /// Read until we find the * and / characters that terminate the comment.
  2148. /// Note that we don't bother decoding trigraphs or escaped newlines in block
  2149. /// comments, because they cannot cause the comment to end. The only thing
  2150. /// that can happen is the comment could end with an escaped newline between
  2151. /// the terminating * and /.
  2152. ///
  2153. /// If we're in KeepCommentMode or any CommentHandler has inserted
  2154. /// some tokens, this will store the first token and return true.
  2155. bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
  2156. bool &TokAtPhysicalStartOfLine) {
  2157. // Scan one character past where we should, looking for a '/' character. Once
  2158. // we find it, check to see if it was preceded by a *. This common
  2159. // optimization helps people who like to put a lot of * characters in their
  2160. // comments.
  2161. // The first character we get with newlines and trigraphs skipped to handle
  2162. // the degenerate /*/ case below correctly if the * has an escaped newline
  2163. // after it.
  2164. unsigned CharSize;
  2165. unsigned char C = getCharAndSize(CurPtr, CharSize);
  2166. CurPtr += CharSize;
  2167. if (C == 0 && CurPtr == BufferEnd+1) {
  2168. if (!isLexingRawMode())
  2169. Diag(BufferPtr, diag::err_unterminated_block_comment);
  2170. --CurPtr;
  2171. // KeepWhitespaceMode should return this broken comment as a token. Since
  2172. // it isn't a well formed comment, just return it as an 'unknown' token.
  2173. if (isKeepWhitespaceMode()) {
  2174. FormTokenWithChars(Result, CurPtr, tok::unknown);
  2175. return true;
  2176. }
  2177. BufferPtr = CurPtr;
  2178. return false;
  2179. }

  // Check to see if the first character after the '/*' is another /. If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr+16 <= BufferEnd) {
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly. Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */. We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /. We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning. Don't do this
        // if this is a /*/, which will end the comment. This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */. We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token. Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace. Instead of going through the big switch, handle it
  // efficiently now. This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
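
// A minimal, self-contained sketch (editorial addition, not from the original
// file) of the SSE2 scan used in SkipBlockComment above: compare 16 bytes at a
// time against '/' and use the movemask bitmask to locate the first hit. The
// helper name findSlashSSE2 is hypothetical; it uses an unaligned load instead
// of the pre-alignment loop the real code performs.
//
//   #ifdef __SSE2__
//   static const char *findSlashSSE2(const char *P, const char *End) {
//     const __m128i Slashes = _mm_set1_epi8('/');
//     while (P + 16 <= End) {
//       __m128i Chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(P));
//       int Mask = _mm_movemask_epi8(_mm_cmpeq_epi8(Chunk, Slashes));
//       if (Mask != 0)
//         return P + llvm::countTrailingZeros<unsigned>(Mask); // first '/'
//       P += 16;
//     }
//     return nullptr; // fewer than 16 bytes left; fall back to a byte loop
//   }
//   #endif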

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string. This switches the lexer out of directive mode.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      LLVM_FALLTHROUGH;
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done.
      return;
    }
  }
}
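
// Hypothetical usage sketch (editorial addition, not from the original file):
// a directive handler that wants the raw text of the rest of the line, as for
// an #error-style directive, could collect it like this. The surrounding
// handler and the CurLexer variable are assumptions for illustration only.
//
//   SmallString<128> Message;
//   CurLexer->ReadToEndOfLine(&Message);
//   // Message now holds the uninterpreted text up to the newline, and the
//   // lexer has already consumed through the end-of-directive token.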

/// LexEndOfFile - CurPtr points to the end of this file. Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first. The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      resetExtendedTokenMode();
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token. Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    DiagnosticsEngine &Diags = PP->getDiagnostics();
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    Diag(BufferEnd, DiagID)
      << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result, isPragmaLexer());
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode. This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}
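
// Interpreting the return value (editorial sketch; the caller shown is
// hypothetical, but the 0/1/2 encoding is the one documented above):
//
//   switch (TheLexer.isNextPPTokenLParen()) {
//   case 2: /* no more tokens in this buffer            */ break;
//   case 1: /* next preprocessing token is '('          */ break;
//   case 0: /* next preprocessing token is something else */ break;
//   }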

/// Find the end of a version control conflict marker.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}
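
// For illustration (editorial addition): the two marker styles the
// conflict-marker code below recognizes. A diff3/git-style marker:
//
//   <<<<<<< HEAD
//   int x = 1;
//   =======
//   int x = 2;
//   >>>>>>> other-branch
//
// and a Perforce-style marker, which starts with ">>>> " and is terminated by
// a line beginning with "<<<<" (the text after each marker varies by tool):
//
//   >>>> ORIGINAL
//   int x = 0;
//   ==== THEIRS
//   int x = 1;
//   ==== YOURS
//   int x = 2;
//   <<<<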

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely. This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match. We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;

    // Skip ahead to the end of line. We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker. Handle it by ignoring up until the end of
/// the line. This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker. This could
  // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}
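
// For illustration (editorial addition): an editor placeholder is the
// Xcode-style token an IDE inserts for a not-yet-filled-in argument, e.g.
//
//   dispatch_async(<#dispatch_queue_t queue#>, <#dispatch_block_t block#>);
//
// When placeholders are allowed (clang's -fallow-editor-placeholders), each
// "<#...#>" above is lexed as a single raw_identifier token carrying the
// IsEditorPlaceholder flag; otherwise it is diagnosed as an error but is still
// consumed as one token, as the code above shows.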

bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }
  return false;
}

uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);

  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;
  else
    return 0;

  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Result && !isLexingRawMode())
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return 0;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  for (unsigned i = 0; i < NumHexDigits; ++i) {
    char C = getCharAndSize(CurPtr, CharSize);

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (Result && !isLexingRawMode()) {
        if (i == 0) {
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
            << StringRef(KindLoc, 1);
        } else {
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);

          // If the user wrote \U1234, suggest a fixit to \u.
          if (i == 4 && NumHexDigits == 8) {
            CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
            Diag(KindLoc, diag::note_ucn_four_not_eight)
              << FixItHint::CreateReplacement(URange, "u");
          }
        }
      }

      return 0;
    }

    CodePoint <<= 4;
    CodePoint += Value;

    CurPtr += CharSize;
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }

  // Don't apply C family restrictions to UCNs in assembly mode
  if (LangOpts.AsmPreprocessor)
    return CodePoint;

  // C99 6.4.3p2: A universal character name shall not specify a character whose
  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
  // C++11 [lex.charset]p2: If the hexadecimal value for a
  //   universal-character-name corresponds to a surrogate code point (in the
  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
  //   if the hexadecimal value for a universal-character-name outside the
  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  //   string literal corresponds to a control character (in either of the
  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  //   basic source character set, the program is ill-formed.
  if (CodePoint < 0xA0) {
    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
      return CodePoint;

    // We don't use isLexingRawMode() here because we need to warn about bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (CodePoint < 0x20 || CodePoint >= 0x7F)
        Diag(BufferPtr, diag::err_ucn_control_character);
      else {
        char C = static_cast<char>(CodePoint);
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
      }
    }

    return 0;
  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
    // We don't use isLexingRawMode() here because we need to diagnose bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
      else
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
    }
    return 0;
  }

  return CodePoint;
}
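
// Worked examples (editorial addition) of the UCN rules enforced above:
//
//   \u00E9     -> U+00E9 (LATIN SMALL LETTER E WITH ACUTE), accepted
//   \U0001F600 -> U+1F600, accepted (needs the 8-digit \U form)
//   \u0041     -> U+0041 'A' is in the basic source character set: rejected
//   \u0024     -> U+0024 '$' is one of the three explicit exceptions: accepted
//   \uD800     -> surrogate code point: error (only a warning in C++03)
//   \U1234     -> only four hex digits after \U: incomplete, with a fix-it
//                 suggesting \u1234 instead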

bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
      UnicodeWhitespaceChars.contains(C)) {
    Diag(BufferPtr, diag::ext_unicode_whitespace)
      << makeCharRange(*this, BufferPtr, CurPtr);

    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}

bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
  if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);
  }

  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() &&
      !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    Diag(BufferPtr, diag::err_non_ascii)
      << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));

    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}

void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
  IsAtStartOfLine = Result.isAtStartOfLine();
  HasLeadingSpace = Result.hasLeadingSpace();
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
}

bool Lexer::Lex(Token &Result) {
  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    Result.setFlag(Token::LeadingEmptyMacro);
    HasLeadingEmptyMacro = false;
  }

  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}
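
// A minimal raw-lexing loop (editorial sketch, not from the original file)
// showing how lexing is typically driven until end of file. Construction of
// the Lexer is omitted; RawLex is assumed to be a Lexer set up in raw mode,
// which is why LexFromRawLexer is used instead of calling Lex directly.
//
//   Token Tok;
//   do {
//     RawLex.LexFromRawLexer(Tok);
//     // ... inspect Tok.getKind(), Tok.getLocation(), Tok.getLength() ...
//   } while (Tok.isNot(tok::eof));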

/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
/// has a null character at the end of the file. This returns a preprocessing
/// token, not a normal token; as such, it is an internal interface. It assumes
/// that the Flags of result have been cleared before calling this.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(nullptr);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace are very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped. The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    LLVM_FALLTHROUGH;
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment &&
        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && LangOpts.CPlusPlus17)
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    LLVM_FALLTHROUGH;

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment. There is one problem with this, though:
      // in one particular corner case, this can change the behavior of the
      // resultant program. For example, in "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo". Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment = LangOpts.LineComment &&
                            (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line). Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character. If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive. Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
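  // Digraph summary (editorial addition): with LangOpts.Digraphs enabled, the
  // alternative spellings handled in this switch map to the usual punctuators:
  //
  //   <%  ->  {      %>  ->  }
  //   <:  ->  [      :>  ->  ]
  //   %:  ->  #      %:%:  ->  ##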
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (getLangOpts().CPlusPlus2a) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
            << FixItHint::CreateInsertion(
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
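  // Example (editorial addition) of the C++11 <:: rule handled above: in
  //
  //   std::vector<::std::string> V;
  //
  // the '<' is lexed as tok::less rather than as the '<:' digraph, so the
  // declaration parses as intended. Under C++98 the same text lexes as
  // "std::vector[ :std::string> V;" and fails, which is what the
  // warn_cxx98_compat_less_colon_colon diagnostic emitted above points out.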
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else if (LangOpts.OpenCL && Char == '^') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretcaret;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if ((LangOpts.CPlusPlus ||
                LangOpts.DoubleSquareBracketAttributes) &&
               Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character. If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive. Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicode(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;
  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicode(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure()) {
    // With a fatal failure in the module loader, we abort parsing.
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
    return true;
  }

  // We parsed the directive; lex a token with the new state.
  return false;
}