CommentLexer.cpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867
  1. //===--- CommentLexer.cpp -------------------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "clang/AST/CommentLexer.h"
  9. #include "clang/AST/CommentCommandTraits.h"
  10. #include "clang/AST/CommentDiagnostic.h"
  11. #include "clang/Basic/CharInfo.h"
  12. #include "llvm/ADT/StringExtras.h"
  13. #include "llvm/ADT/StringSwitch.h"
  14. #include "llvm/Support/ConvertUTF.h"
  15. #include "llvm/Support/ErrorHandling.h"
  16. namespace clang {
  17. namespace comments {
  18. void Token::dump(const Lexer &L, const SourceManager &SM) const {
  19. llvm::errs() << "comments::Token Kind=" << Kind << " ";
  20. Loc.print(llvm::errs(), SM);
  21. llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  22. }
  23. static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  24. return isLetter(C);
  25. }
  26. static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  27. return isDigit(C);
  28. }
  29. static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  30. return isHexDigit(C);
  31. }
  32. static inline StringRef convertCodePointToUTF8(
  33. llvm::BumpPtrAllocator &Allocator,
  34. unsigned CodePoint) {
  35. char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  36. char *ResolvedPtr = Resolved;
  37. if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  38. return StringRef(Resolved, ResolvedPtr - Resolved);
  39. else
  40. return StringRef();
  41. }
  42. namespace {
  43. #include "clang/AST/CommentHTMLTags.inc"
  44. #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  45. } // end anonymous namespace
  46. StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  47. // Fast path, first check a few most widely used named character references.
  48. return llvm::StringSwitch<StringRef>(Name)
  49. .Case("amp", "&")
  50. .Case("lt", "<")
  51. .Case("gt", ">")
  52. .Case("quot", "\"")
  53. .Case("apos", "\'")
  54. // Slow path.
  55. .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  56. }
  57. StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  58. unsigned CodePoint = 0;
  59. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  60. assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  61. CodePoint *= 10;
  62. CodePoint += Name[i] - '0';
  63. }
  64. return convertCodePointToUTF8(Allocator, CodePoint);
  65. }
  66. StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  67. unsigned CodePoint = 0;
  68. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  69. CodePoint *= 16;
  70. const char C = Name[i];
  71. assert(isHTMLHexCharacterReferenceCharacter(C));
  72. CodePoint += llvm::hexDigitValue(C);
  73. }
  74. return convertCodePointToUTF8(Allocator, CodePoint);
  75. }
  76. void Lexer::skipLineStartingDecorations() {
  77. // This function should be called only for C comments
  78. assert(CommentState == LCS_InsideCComment);
  79. if (BufferPtr == CommentEnd)
  80. return;
  81. switch (*BufferPtr) {
  82. case ' ':
  83. case '\t':
  84. case '\f':
  85. case '\v': {
  86. const char *NewBufferPtr = BufferPtr;
  87. NewBufferPtr++;
  88. if (NewBufferPtr == CommentEnd)
  89. return;
  90. char C = *NewBufferPtr;
  91. while (isHorizontalWhitespace(C)) {
  92. NewBufferPtr++;
  93. if (NewBufferPtr == CommentEnd)
  94. return;
  95. C = *NewBufferPtr;
  96. }
  97. if (C == '*')
  98. BufferPtr = NewBufferPtr + 1;
  99. break;
  100. }
  101. case '*':
  102. BufferPtr++;
  103. break;
  104. }
  105. }
  106. namespace {
  107. /// Returns pointer to the first newline character in the string.
  108. const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  109. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  110. if (isVerticalWhitespace(*BufferPtr))
  111. return BufferPtr;
  112. }
  113. return BufferEnd;
  114. }
  /// Steps over one line terminator ("\n", "\r" or "\r\n") at \p BufferPtr.
  ///
  /// \pre \p BufferPtr equals \p BufferEnd or points at '\n' or '\r'.
  /// \returns the position just past the terminator, or \p BufferPtr itself
  /// when the range is empty.
  const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
    if (BufferPtr == BufferEnd)
      return BufferPtr;

    const char First = *BufferPtr++;
    if (First == '\n')
      return BufferPtr;

    assert(First == '\r');
    // A line feed directly after the carriage return belongs to the same
    // terminator.
    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
      ++BufferPtr;
    return BufferPtr;
  }
  128. const char *skipNamedCharacterReference(const char *BufferPtr,
  129. const char *BufferEnd) {
  130. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  131. if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
  132. return BufferPtr;
  133. }
  134. return BufferEnd;
  135. }
  136. const char *skipDecimalCharacterReference(const char *BufferPtr,
  137. const char *BufferEnd) {
  138. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  139. if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
  140. return BufferPtr;
  141. }
  142. return BufferEnd;
  143. }
  144. const char *skipHexCharacterReference(const char *BufferPtr,
  145. const char *BufferEnd) {
  146. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  147. if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
  148. return BufferPtr;
  149. }
  150. return BufferEnd;
  151. }
  152. bool isHTMLIdentifierStartingCharacter(char C) {
  153. return isLetter(C);
  154. }
  155. bool isHTMLIdentifierCharacter(char C) {
  156. return isAlphanumeric(C);
  157. }
  158. const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  159. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  160. if (!isHTMLIdentifierCharacter(*BufferPtr))
  161. return BufferPtr;
  162. }
  163. return BufferEnd;
  164. }
  /// Skip an HTML string quoted in single or double quotes.  A quote character
  /// directly preceded by a backslash does not end the string.
  ///
  /// \pre \p BufferPtr points at the opening quote.
  /// \returns pointer to the closing quote, or \p BufferEnd if the string is
  /// not terminated.
  const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
  {
    const char Delimiter = *BufferPtr;
    assert(Delimiter == '\"' || Delimiter == '\'');

    const char *Cur = BufferPtr + 1;
    while (Cur != BufferEnd) {
      if (*Cur == Delimiter && *(Cur - 1) != '\\')
        return Cur;
      ++Cur;
    }
    return BufferEnd;
  }
  181. const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  182. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  183. if (!isWhitespace(*BufferPtr))
  184. return BufferPtr;
  185. }
  186. return BufferEnd;
  187. }
  188. bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  189. return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
  190. }
  191. bool isCommandNameStartCharacter(char C) {
  192. return isLetter(C);
  193. }
  194. bool isCommandNameCharacter(char C) {
  195. return isAlphanumeric(C);
  196. }
  197. const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  198. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  199. if (!isCommandNameCharacter(*BufferPtr))
  200. return BufferPtr;
  201. }
  202. return BufferEnd;
  203. }
  204. /// Return the one past end pointer for BCPL comments.
  205. /// Handles newlines escaped with backslash or trigraph for backslahs.
  206. const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  207. const char *CurPtr = BufferPtr;
  208. while (CurPtr != BufferEnd) {
  209. while (!isVerticalWhitespace(*CurPtr)) {
  210. CurPtr++;
  211. if (CurPtr == BufferEnd)
  212. return BufferEnd;
  213. }
  214. // We found a newline, check if it is escaped.
  215. const char *EscapePtr = CurPtr - 1;
  216. while(isHorizontalWhitespace(*EscapePtr))
  217. EscapePtr--;
  218. if (*EscapePtr == '\\' ||
  219. (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
  220. EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
  221. // We found an escaped newline.
  222. CurPtr = skipNewline(CurPtr, BufferEnd);
  223. } else
  224. return CurPtr; // Not an escaped newline.
  225. }
  226. return BufferEnd;
  227. }
  228. /// Return the one past end pointer for C comments.
  229. /// Very dumb, does not handle escaped newlines or trigraphs.
  230. const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  231. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  232. if (*BufferPtr == '*') {
  233. assert(BufferPtr + 1 != BufferEnd);
  234. if (*(BufferPtr + 1) == '/')
  235. return BufferPtr;
  236. }
  237. }
  238. llvm_unreachable("buffer end hit before '*/' was seen");
  239. }
  240. } // end anonymous namespace
  241. void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
  242. tok::TokenKind Kind) {
  243. const unsigned TokLen = TokEnd - BufferPtr;
  244. Result.setLocation(getSourceLocation(BufferPtr));
  245. Result.setKind(Kind);
  246. Result.setLength(TokLen);
  247. #ifndef NDEBUG
  248. Result.TextPtr = "<UNSET>";
  249. Result.IntVal = 7;
  250. #endif
  251. BufferPtr = TokEnd;
  252. }
  253. void Lexer::lexCommentText(Token &T) {
  254. assert(CommentState == LCS_InsideBCPLComment ||
  255. CommentState == LCS_InsideCComment);
  256. // Handles lexing non-command text, i.e. text and newline.
  257. auto HandleNonCommandToken = [&]() -> void {
  258. assert(State == LS_Normal);
  259. const char *TokenPtr = BufferPtr;
  260. assert(TokenPtr < CommentEnd);
  261. switch (*TokenPtr) {
  262. case '\n':
  263. case '\r':
  264. TokenPtr = skipNewline(TokenPtr, CommentEnd);
  265. formTokenWithChars(T, TokenPtr, tok::newline);
  266. if (CommentState == LCS_InsideCComment)
  267. skipLineStartingDecorations();
  268. return;
  269. default: {
  270. StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
  271. size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
  272. .find_first_of(TokStartSymbols);
  273. if (End != StringRef::npos)
  274. TokenPtr += End;
  275. else
  276. TokenPtr = CommentEnd;
  277. formTextToken(T, TokenPtr);
  278. return;
  279. }
  280. }
  281. };
  282. if (!ParseCommands)
  283. return HandleNonCommandToken();
  284. switch (State) {
  285. case LS_Normal:
  286. break;
  287. case LS_VerbatimBlockFirstLine:
  288. lexVerbatimBlockFirstLine(T);
  289. return;
  290. case LS_VerbatimBlockBody:
  291. lexVerbatimBlockBody(T);
  292. return;
  293. case LS_VerbatimLineText:
  294. lexVerbatimLineText(T);
  295. return;
  296. case LS_HTMLStartTag:
  297. lexHTMLStartTag(T);
  298. return;
  299. case LS_HTMLEndTag:
  300. lexHTMLEndTag(T);
  301. return;
  302. }
  303. assert(State == LS_Normal);
  304. const char *TokenPtr = BufferPtr;
  305. assert(TokenPtr < CommentEnd);
  306. switch(*TokenPtr) {
  307. case '\\':
  308. case '@': {
  309. // Commands that start with a backslash and commands that start with
  310. // 'at' have equivalent semantics. But we keep information about the
  311. // exact syntax in AST for comments.
  312. tok::TokenKind CommandKind =
  313. (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
  314. TokenPtr++;
  315. if (TokenPtr == CommentEnd) {
  316. formTextToken(T, TokenPtr);
  317. return;
  318. }
  319. char C = *TokenPtr;
  320. switch (C) {
  321. default:
  322. break;
  323. case '\\': case '@': case '&': case '$':
  324. case '#': case '<': case '>': case '%':
  325. case '\"': case '.': case ':':
  326. // This is one of \\ \@ \& \$ etc escape sequences.
  327. TokenPtr++;
  328. if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
  329. // This is the \:: escape sequence.
  330. TokenPtr++;
  331. }
  332. StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
  333. formTokenWithChars(T, TokenPtr, tok::text);
  334. T.setText(UnescapedText);
  335. return;
  336. }
  337. // Don't make zero-length commands.
  338. if (!isCommandNameStartCharacter(*TokenPtr)) {
  339. formTextToken(T, TokenPtr);
  340. return;
  341. }
  342. TokenPtr = skipCommandName(TokenPtr, CommentEnd);
  343. unsigned Length = TokenPtr - (BufferPtr + 1);
  344. // Hardcoded support for lexing LaTeX formula commands
  345. // \f$ \f[ \f] \f{ \f} as a single command.
  346. if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
  347. C = *TokenPtr;
  348. if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
  349. TokenPtr++;
  350. Length++;
  351. }
  352. }
  353. StringRef CommandName(BufferPtr + 1, Length);
  354. const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  355. if (!Info) {
  356. if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
  357. StringRef CorrectedName = Info->Name;
  358. SourceLocation Loc = getSourceLocation(BufferPtr);
  359. SourceLocation EndLoc = getSourceLocation(TokenPtr);
  360. SourceRange FullRange = SourceRange(Loc, EndLoc);
  361. SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
  362. Diag(Loc, diag::warn_correct_comment_command_name)
  363. << FullRange << CommandName << CorrectedName
  364. << FixItHint::CreateReplacement(CommandRange, CorrectedName);
  365. } else {
  366. formTokenWithChars(T, TokenPtr, tok::unknown_command);
  367. T.setUnknownCommandName(CommandName);
  368. Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
  369. << SourceRange(T.getLocation(), T.getEndLocation());
  370. return;
  371. }
  372. }
  373. if (Info->IsVerbatimBlockCommand) {
  374. setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
  375. return;
  376. }
  377. if (Info->IsVerbatimLineCommand) {
  378. setupAndLexVerbatimLine(T, TokenPtr, Info);
  379. return;
  380. }
  381. formTokenWithChars(T, TokenPtr, CommandKind);
  382. T.setCommandID(Info->getID());
  383. return;
  384. }
  385. case '&':
  386. lexHTMLCharacterReference(T);
  387. return;
  388. case '<': {
  389. TokenPtr++;
  390. if (TokenPtr == CommentEnd) {
  391. formTextToken(T, TokenPtr);
  392. return;
  393. }
  394. const char C = *TokenPtr;
  395. if (isHTMLIdentifierStartingCharacter(C))
  396. setupAndLexHTMLStartTag(T);
  397. else if (C == '/')
  398. setupAndLexHTMLEndTag(T);
  399. else
  400. formTextToken(T, TokenPtr);
  401. return;
  402. }
  403. default:
  404. return HandleNonCommandToken();
  405. }
  406. }
  407. void Lexer::setupAndLexVerbatimBlock(Token &T,
  408. const char *TextBegin,
  409. char Marker, const CommandInfo *Info) {
  410. assert(Info->IsVerbatimBlockCommand);
  411. VerbatimBlockEndCommandName.clear();
  412. VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  413. VerbatimBlockEndCommandName.append(Info->EndCommandName);
  414. formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  415. T.setVerbatimBlockID(Info->getID());
  416. // If there is a newline following the verbatim opening command, skip the
  417. // newline so that we don't create an tok::verbatim_block_line with empty
  418. // text content.
  419. if (BufferPtr != CommentEnd &&
  420. isVerticalWhitespace(*BufferPtr)) {
  421. BufferPtr = skipNewline(BufferPtr, CommentEnd);
  422. State = LS_VerbatimBlockBody;
  423. return;
  424. }
  425. State = LS_VerbatimBlockFirstLine;
  426. }
  427. void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  428. again:
  429. assert(BufferPtr < CommentEnd);
  430. // FIXME: It would be better to scan the text once, finding either the block
  431. // end command or newline.
  432. //
  433. // Extract current line.
  434. const char *Newline = findNewline(BufferPtr, CommentEnd);
  435. StringRef Line(BufferPtr, Newline - BufferPtr);
  436. // Look for end command in current line.
  437. size_t Pos = Line.find(VerbatimBlockEndCommandName);
  438. const char *TextEnd;
  439. const char *NextLine;
  440. if (Pos == StringRef::npos) {
  441. // Current line is completely verbatim.
  442. TextEnd = Newline;
  443. NextLine = skipNewline(Newline, CommentEnd);
  444. } else if (Pos == 0) {
  445. // Current line contains just an end command.
  446. const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
  447. StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
  448. formTokenWithChars(T, End, tok::verbatim_block_end);
  449. T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
  450. State = LS_Normal;
  451. return;
  452. } else {
  453. // There is some text, followed by end command. Extract text first.
  454. TextEnd = BufferPtr + Pos;
  455. NextLine = TextEnd;
  456. // If there is only whitespace before end command, skip whitespace.
  457. if (isWhitespace(BufferPtr, TextEnd)) {
  458. BufferPtr = TextEnd;
  459. goto again;
  460. }
  461. }
  462. StringRef Text(BufferPtr, TextEnd - BufferPtr);
  463. formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  464. T.setVerbatimBlockText(Text);
  465. State = LS_VerbatimBlockBody;
  466. }
  467. void Lexer::lexVerbatimBlockBody(Token &T) {
  468. assert(State == LS_VerbatimBlockBody);
  469. if (CommentState == LCS_InsideCComment)
  470. skipLineStartingDecorations();
  471. if (BufferPtr == CommentEnd) {
  472. formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  473. T.setVerbatimBlockText("");
  474. return;
  475. }
  476. lexVerbatimBlockFirstLine(T);
  477. }
  478. void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
  479. const CommandInfo *Info) {
  480. assert(Info->IsVerbatimLineCommand);
  481. formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  482. T.setVerbatimLineID(Info->getID());
  483. State = LS_VerbatimLineText;
  484. }
  485. void Lexer::lexVerbatimLineText(Token &T) {
  486. assert(State == LS_VerbatimLineText);
  487. // Extract current line.
  488. const char *Newline = findNewline(BufferPtr, CommentEnd);
  489. StringRef Text(BufferPtr, Newline - BufferPtr);
  490. formTokenWithChars(T, Newline, tok::verbatim_line_text);
  491. T.setVerbatimLineText(Text);
  492. State = LS_Normal;
  493. }
  494. void Lexer::lexHTMLCharacterReference(Token &T) {
  495. const char *TokenPtr = BufferPtr;
  496. assert(*TokenPtr == '&');
  497. TokenPtr++;
  498. if (TokenPtr == CommentEnd) {
  499. formTextToken(T, TokenPtr);
  500. return;
  501. }
  502. const char *NamePtr;
  503. bool isNamed = false;
  504. bool isDecimal = false;
  505. char C = *TokenPtr;
  506. if (isHTMLNamedCharacterReferenceCharacter(C)) {
  507. NamePtr = TokenPtr;
  508. TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
  509. isNamed = true;
  510. } else if (C == '#') {
  511. TokenPtr++;
  512. if (TokenPtr == CommentEnd) {
  513. formTextToken(T, TokenPtr);
  514. return;
  515. }
  516. C = *TokenPtr;
  517. if (isHTMLDecimalCharacterReferenceCharacter(C)) {
  518. NamePtr = TokenPtr;
  519. TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
  520. isDecimal = true;
  521. } else if (C == 'x' || C == 'X') {
  522. TokenPtr++;
  523. NamePtr = TokenPtr;
  524. TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
  525. } else {
  526. formTextToken(T, TokenPtr);
  527. return;
  528. }
  529. } else {
  530. formTextToken(T, TokenPtr);
  531. return;
  532. }
  533. if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
  534. *TokenPtr != ';') {
  535. formTextToken(T, TokenPtr);
  536. return;
  537. }
  538. StringRef Name(NamePtr, TokenPtr - NamePtr);
  539. TokenPtr++; // Skip semicolon.
  540. StringRef Resolved;
  541. if (isNamed)
  542. Resolved = resolveHTMLNamedCharacterReference(Name);
  543. else if (isDecimal)
  544. Resolved = resolveHTMLDecimalCharacterReference(Name);
  545. else
  546. Resolved = resolveHTMLHexCharacterReference(Name);
  547. if (Resolved.empty()) {
  548. formTextToken(T, TokenPtr);
  549. return;
  550. }
  551. formTokenWithChars(T, TokenPtr, tok::text);
  552. T.setText(Resolved);
  553. }
  554. void Lexer::setupAndLexHTMLStartTag(Token &T) {
  555. assert(BufferPtr[0] == '<' &&
  556. isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  557. const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  558. StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  559. if (!isHTMLTagName(Name)) {
  560. formTextToken(T, TagNameEnd);
  561. return;
  562. }
  563. formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  564. T.setHTMLTagStartName(Name);
  565. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  566. const char C = *BufferPtr;
  567. if (BufferPtr != CommentEnd &&
  568. (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
  569. State = LS_HTMLStartTag;
  570. }
  571. void Lexer::lexHTMLStartTag(Token &T) {
  572. assert(State == LS_HTMLStartTag);
  573. const char *TokenPtr = BufferPtr;
  574. char C = *TokenPtr;
  575. if (isHTMLIdentifierCharacter(C)) {
  576. TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
  577. StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
  578. formTokenWithChars(T, TokenPtr, tok::html_ident);
  579. T.setHTMLIdent(Ident);
  580. } else {
  581. switch (C) {
  582. case '=':
  583. TokenPtr++;
  584. formTokenWithChars(T, TokenPtr, tok::html_equals);
  585. break;
  586. case '\"':
  587. case '\'': {
  588. const char *OpenQuote = TokenPtr;
  589. TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
  590. const char *ClosingQuote = TokenPtr;
  591. if (TokenPtr != CommentEnd) // Skip closing quote.
  592. TokenPtr++;
  593. formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
  594. T.setHTMLQuotedString(StringRef(OpenQuote + 1,
  595. ClosingQuote - (OpenQuote + 1)));
  596. break;
  597. }
  598. case '>':
  599. TokenPtr++;
  600. formTokenWithChars(T, TokenPtr, tok::html_greater);
  601. State = LS_Normal;
  602. return;
  603. case '/':
  604. TokenPtr++;
  605. if (TokenPtr != CommentEnd && *TokenPtr == '>') {
  606. TokenPtr++;
  607. formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
  608. } else
  609. formTextToken(T, TokenPtr);
  610. State = LS_Normal;
  611. return;
  612. }
  613. }
  614. // Now look ahead and return to normal state if we don't see any HTML tokens
  615. // ahead.
  616. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  617. if (BufferPtr == CommentEnd) {
  618. State = LS_Normal;
  619. return;
  620. }
  621. C = *BufferPtr;
  622. if (!isHTMLIdentifierStartingCharacter(C) &&
  623. C != '=' && C != '\"' && C != '\'' && C != '>') {
  624. State = LS_Normal;
  625. return;
  626. }
  627. }
  628. void Lexer::setupAndLexHTMLEndTag(Token &T) {
  629. assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
  630. const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  631. const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
  632. StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
  633. if (!isHTMLTagName(Name)) {
  634. formTextToken(T, TagNameEnd);
  635. return;
  636. }
  637. const char *End = skipWhitespace(TagNameEnd, CommentEnd);
  638. formTokenWithChars(T, End, tok::html_end_tag);
  639. T.setHTMLTagEndName(Name);
  640. if (BufferPtr != CommentEnd && *BufferPtr == '>')
  641. State = LS_HTMLEndTag;
  642. }
  643. void Lexer::lexHTMLEndTag(Token &T) {
  644. assert(BufferPtr != CommentEnd && *BufferPtr == '>');
  645. formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  646. State = LS_Normal;
  647. }
  648. Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
  649. const CommandTraits &Traits, SourceLocation FileLoc,
  650. const char *BufferStart, const char *BufferEnd,
  651. bool ParseCommands)
  652. : Allocator(Allocator), Diags(Diags), Traits(Traits),
  653. BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc),
  654. BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal),
  655. ParseCommands(ParseCommands) {}
  656. void Lexer::lex(Token &T) {
  657. again:
  658. switch (CommentState) {
  659. case LCS_BeforeComment:
  660. if (BufferPtr == BufferEnd) {
  661. formTokenWithChars(T, BufferPtr, tok::eof);
  662. return;
  663. }
  664. assert(*BufferPtr == '/');
  665. BufferPtr++; // Skip first slash.
  666. switch(*BufferPtr) {
  667. case '/': { // BCPL comment.
  668. BufferPtr++; // Skip second slash.
  669. if (BufferPtr != BufferEnd) {
  670. // Skip Doxygen magic marker, if it is present.
  671. // It might be missing because of a typo //< or /*<, or because we
  672. // merged this non-Doxygen comment into a bunch of Doxygen comments
  673. // around it: /** ... */ /* ... */ /** ... */
  674. const char C = *BufferPtr;
  675. if (C == '/' || C == '!')
  676. BufferPtr++;
  677. }
  678. // Skip less-than symbol that marks trailing comments.
  679. // Skip it even if the comment is not a Doxygen one, because //< and /*<
  680. // are frequent typos.
  681. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  682. BufferPtr++;
  683. CommentState = LCS_InsideBCPLComment;
  684. if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
  685. State = LS_Normal;
  686. CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
  687. goto again;
  688. }
  689. case '*': { // C comment.
  690. BufferPtr++; // Skip star.
  691. // Skip Doxygen magic marker.
  692. const char C = *BufferPtr;
  693. if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
  694. BufferPtr++;
  695. // Skip less-than symbol that marks trailing comments.
  696. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  697. BufferPtr++;
  698. CommentState = LCS_InsideCComment;
  699. State = LS_Normal;
  700. CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
  701. goto again;
  702. }
  703. default:
  704. llvm_unreachable("second character of comment should be '/' or '*'");
  705. }
  706. case LCS_BetweenComments: {
  707. // Consecutive comments are extracted only if there is only whitespace
  708. // between them. So we can search for the start of the next comment.
  709. const char *EndWhitespace = BufferPtr;
  710. while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
  711. EndWhitespace++;
  712. // Turn any whitespace between comments (and there is only whitespace
  713. // between them -- guaranteed by comment extraction) into a newline. We
  714. // have two newlines between C comments in total (first one was synthesized
  715. // after a comment).
  716. formTokenWithChars(T, EndWhitespace, tok::newline);
  717. CommentState = LCS_BeforeComment;
  718. break;
  719. }
  720. case LCS_InsideBCPLComment:
  721. case LCS_InsideCComment:
  722. if (BufferPtr != CommentEnd) {
  723. lexCommentText(T);
  724. break;
  725. } else {
  726. // Skip C comment closing sequence.
  727. if (CommentState == LCS_InsideCComment) {
  728. assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
  729. BufferPtr += 2;
  730. assert(BufferPtr <= BufferEnd);
  731. // Synthenize newline just after the C comment, regardless if there is
  732. // actually a newline.
  733. formTokenWithChars(T, BufferPtr, tok::newline);
  734. CommentState = LCS_BetweenComments;
  735. break;
  736. } else {
  737. // Don't synthesized a newline after BCPL comment.
  738. CommentState = LCS_BetweenComments;
  739. goto again;
  740. }
  741. }
  742. }
  743. }
  744. StringRef Lexer::getSpelling(const Token &Tok,
  745. const SourceManager &SourceMgr) const {
  746. SourceLocation Loc = Tok.getLocation();
  747. std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
  748. bool InvalidTemp = false;
  749. StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  750. if (InvalidTemp)
  751. return StringRef();
  752. const char *Begin = File.data() + LocInfo.second;
  753. return StringRef(Begin, Tok.getLength());
  754. }
  755. } // end namespace comments
  756. } // end namespace clang