CommentLexer.cpp 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831
  1. #include "clang/AST/CommentLexer.h"
  2. #include "clang/AST/CommentCommandTraits.h"
  3. #include "clang/AST/CommentDiagnostic.h"
  4. #include "clang/Basic/CharInfo.h"
  5. #include "llvm/ADT/StringExtras.h"
  6. #include "llvm/ADT/StringSwitch.h"
  7. #include "llvm/Support/ConvertUTF.h"
  8. #include "llvm/Support/ErrorHandling.h"
  9. namespace clang {
  10. namespace comments {
  11. void Token::dump(const Lexer &L, const SourceManager &SM) const {
  12. llvm::errs() << "comments::Token Kind=" << Kind << " ";
  13. Loc.dump(SM);
  14. llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  15. }
  16. static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  17. return isLetter(C);
  18. }
  19. static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  20. return isDigit(C);
  21. }
  22. static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  23. return isHexDigit(C);
  24. }
  25. static inline StringRef convertCodePointToUTF8(
  26. llvm::BumpPtrAllocator &Allocator,
  27. unsigned CodePoint) {
  28. char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  29. char *ResolvedPtr = Resolved;
  30. if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  31. return StringRef(Resolved, ResolvedPtr - Resolved);
  32. else
  33. return StringRef();
  34. }
  35. namespace {
  36. #include "clang/AST/CommentHTMLTags.inc"
  37. #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  38. } // unnamed namespace
  39. StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  40. // Fast path, first check a few most widely used named character references.
  41. return llvm::StringSwitch<StringRef>(Name)
  42. .Case("amp", "&")
  43. .Case("lt", "<")
  44. .Case("gt", ">")
  45. .Case("quot", "\"")
  46. .Case("apos", "\'")
  47. // Slow path.
  48. .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  49. }
  50. StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  51. unsigned CodePoint = 0;
  52. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  53. assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  54. CodePoint *= 10;
  55. CodePoint += Name[i] - '0';
  56. }
  57. return convertCodePointToUTF8(Allocator, CodePoint);
  58. }
  59. StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  60. unsigned CodePoint = 0;
  61. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  62. CodePoint *= 16;
  63. const char C = Name[i];
  64. assert(isHTMLHexCharacterReferenceCharacter(C));
  65. CodePoint += llvm::hexDigitValue(C);
  66. }
  67. return convertCodePointToUTF8(Allocator, CodePoint);
  68. }
  69. void Lexer::skipLineStartingDecorations() {
  70. // This function should be called only for C comments
  71. assert(CommentState == LCS_InsideCComment);
  72. if (BufferPtr == CommentEnd)
  73. return;
  74. switch (*BufferPtr) {
  75. case ' ':
  76. case '\t':
  77. case '\f':
  78. case '\v': {
  79. const char *NewBufferPtr = BufferPtr;
  80. NewBufferPtr++;
  81. if (NewBufferPtr == CommentEnd)
  82. return;
  83. char C = *NewBufferPtr;
  84. while (isHorizontalWhitespace(C)) {
  85. NewBufferPtr++;
  86. if (NewBufferPtr == CommentEnd)
  87. return;
  88. C = *NewBufferPtr;
  89. }
  90. if (C == '*')
  91. BufferPtr = NewBufferPtr + 1;
  92. break;
  93. }
  94. case '*':
  95. BufferPtr++;
  96. break;
  97. }
  98. }
  99. namespace {
  100. /// Returns pointer to the first newline character in the string.
  101. const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  102. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  103. if (isVerticalWhitespace(*BufferPtr))
  104. return BufferPtr;
  105. }
  106. return BufferEnd;
  107. }
  108. const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  109. if (BufferPtr == BufferEnd)
  110. return BufferPtr;
  111. if (*BufferPtr == '\n')
  112. BufferPtr++;
  113. else {
  114. assert(*BufferPtr == '\r');
  115. BufferPtr++;
  116. if (BufferPtr != BufferEnd && *BufferPtr == '\n')
  117. BufferPtr++;
  118. }
  119. return BufferPtr;
  120. }
  121. const char *skipNamedCharacterReference(const char *BufferPtr,
  122. const char *BufferEnd) {
  123. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  124. if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
  125. return BufferPtr;
  126. }
  127. return BufferEnd;
  128. }
  129. const char *skipDecimalCharacterReference(const char *BufferPtr,
  130. const char *BufferEnd) {
  131. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  132. if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
  133. return BufferPtr;
  134. }
  135. return BufferEnd;
  136. }
  137. const char *skipHexCharacterReference(const char *BufferPtr,
  138. const char *BufferEnd) {
  139. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  140. if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
  141. return BufferPtr;
  142. }
  143. return BufferEnd;
  144. }
  145. bool isHTMLIdentifierStartingCharacter(char C) {
  146. return isLetter(C);
  147. }
  148. bool isHTMLIdentifierCharacter(char C) {
  149. return isAlphanumeric(C);
  150. }
  151. const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  152. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  153. if (!isHTMLIdentifierCharacter(*BufferPtr))
  154. return BufferPtr;
  155. }
  156. return BufferEnd;
  157. }
  158. /// Skip HTML string quoted in single or double quotes. Escaping quotes inside
  159. /// string allowed.
  160. ///
  161. /// Returns pointer to closing quote.
  162. const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
  163. {
  164. const char Quote = *BufferPtr;
  165. assert(Quote == '\"' || Quote == '\'');
  166. BufferPtr++;
  167. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  168. const char C = *BufferPtr;
  169. if (C == Quote && BufferPtr[-1] != '\\')
  170. return BufferPtr;
  171. }
  172. return BufferEnd;
  173. }
  174. const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  175. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  176. if (!isWhitespace(*BufferPtr))
  177. return BufferPtr;
  178. }
  179. return BufferEnd;
  180. }
  181. bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  182. return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
  183. }
  184. bool isCommandNameStartCharacter(char C) {
  185. return isLetter(C);
  186. }
  187. bool isCommandNameCharacter(char C) {
  188. return isAlphanumeric(C);
  189. }
  190. const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  191. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  192. if (!isCommandNameCharacter(*BufferPtr))
  193. return BufferPtr;
  194. }
  195. return BufferEnd;
  196. }
  197. /// Return the one past end pointer for BCPL comments.
  198. /// Handles newlines escaped with backslash or trigraph for backslahs.
  199. const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  200. const char *CurPtr = BufferPtr;
  201. while (CurPtr != BufferEnd) {
  202. while (!isVerticalWhitespace(*CurPtr)) {
  203. CurPtr++;
  204. if (CurPtr == BufferEnd)
  205. return BufferEnd;
  206. }
  207. // We found a newline, check if it is escaped.
  208. const char *EscapePtr = CurPtr - 1;
  209. while(isHorizontalWhitespace(*EscapePtr))
  210. EscapePtr--;
  211. if (*EscapePtr == '\\' ||
  212. (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
  213. EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
  214. // We found an escaped newline.
  215. CurPtr = skipNewline(CurPtr, BufferEnd);
  216. } else
  217. return CurPtr; // Not an escaped newline.
  218. }
  219. return BufferEnd;
  220. }
  221. /// Return the one past end pointer for C comments.
  222. /// Very dumb, does not handle escaped newlines or trigraphs.
  223. const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  224. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  225. if (*BufferPtr == '*') {
  226. assert(BufferPtr + 1 != BufferEnd);
  227. if (*(BufferPtr + 1) == '/')
  228. return BufferPtr;
  229. }
  230. }
  231. llvm_unreachable("buffer end hit before '*/' was seen");
  232. }
  233. } // unnamed namespace
  234. void Lexer::lexCommentText(Token &T) {
  235. assert(CommentState == LCS_InsideBCPLComment ||
  236. CommentState == LCS_InsideCComment);
  237. switch (State) {
  238. case LS_Normal:
  239. break;
  240. case LS_VerbatimBlockFirstLine:
  241. lexVerbatimBlockFirstLine(T);
  242. return;
  243. case LS_VerbatimBlockBody:
  244. lexVerbatimBlockBody(T);
  245. return;
  246. case LS_VerbatimLineText:
  247. lexVerbatimLineText(T);
  248. return;
  249. case LS_HTMLStartTag:
  250. lexHTMLStartTag(T);
  251. return;
  252. case LS_HTMLEndTag:
  253. lexHTMLEndTag(T);
  254. return;
  255. }
  256. assert(State == LS_Normal);
  257. const char *TokenPtr = BufferPtr;
  258. assert(TokenPtr < CommentEnd);
  259. while (TokenPtr != CommentEnd) {
  260. switch(*TokenPtr) {
  261. case '\\':
  262. case '@': {
  263. // Commands that start with a backslash and commands that start with
  264. // 'at' have equivalent semantics. But we keep information about the
  265. // exact syntax in AST for comments.
  266. tok::TokenKind CommandKind =
  267. (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
  268. TokenPtr++;
  269. if (TokenPtr == CommentEnd) {
  270. formTextToken(T, TokenPtr);
  271. return;
  272. }
  273. char C = *TokenPtr;
  274. switch (C) {
  275. default:
  276. break;
  277. case '\\': case '@': case '&': case '$':
  278. case '#': case '<': case '>': case '%':
  279. case '\"': case '.': case ':':
  280. // This is one of \\ \@ \& \$ etc escape sequences.
  281. TokenPtr++;
  282. if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
  283. // This is the \:: escape sequence.
  284. TokenPtr++;
  285. }
  286. StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
  287. formTokenWithChars(T, TokenPtr, tok::text);
  288. T.setText(UnescapedText);
  289. return;
  290. }
  291. // Don't make zero-length commands.
  292. if (!isCommandNameStartCharacter(*TokenPtr)) {
  293. formTextToken(T, TokenPtr);
  294. return;
  295. }
  296. TokenPtr = skipCommandName(TokenPtr, CommentEnd);
  297. unsigned Length = TokenPtr - (BufferPtr + 1);
  298. // Hardcoded support for lexing LaTeX formula commands
  299. // \f$ \f[ \f] \f{ \f} as a single command.
  300. if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
  301. C = *TokenPtr;
  302. if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
  303. TokenPtr++;
  304. Length++;
  305. }
  306. }
  307. const StringRef CommandName(BufferPtr + 1, Length);
  308. const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  309. if (!Info) {
  310. formTokenWithChars(T, TokenPtr, tok::unknown_command);
  311. T.setUnknownCommandName(CommandName);
  312. if (Info = Traits.getTypoCorrectCommandInfo(CommandName)) {
  313. StringRef CorrectedName = Info->Name;
  314. SourceRange CommandRange(T.getLocation().getLocWithOffset(1),
  315. T.getEndLocation());
  316. Diag(T.getLocation(), diag::warn_correct_comment_command_name)
  317. << CommandName << CorrectedName
  318. << FixItHint::CreateReplacement(CommandRange, CorrectedName);
  319. } else {
  320. Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
  321. return;
  322. }
  323. }
  324. if (Info->IsVerbatimBlockCommand) {
  325. setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
  326. return;
  327. }
  328. if (Info->IsVerbatimLineCommand) {
  329. setupAndLexVerbatimLine(T, TokenPtr, Info);
  330. return;
  331. }
  332. formTokenWithChars(T, TokenPtr, CommandKind);
  333. T.setCommandID(Info->getID());
  334. return;
  335. }
  336. case '&':
  337. lexHTMLCharacterReference(T);
  338. return;
  339. case '<': {
  340. TokenPtr++;
  341. if (TokenPtr == CommentEnd) {
  342. formTextToken(T, TokenPtr);
  343. return;
  344. }
  345. const char C = *TokenPtr;
  346. if (isHTMLIdentifierStartingCharacter(C))
  347. setupAndLexHTMLStartTag(T);
  348. else if (C == '/')
  349. setupAndLexHTMLEndTag(T);
  350. else
  351. formTextToken(T, TokenPtr);
  352. return;
  353. }
  354. case '\n':
  355. case '\r':
  356. TokenPtr = skipNewline(TokenPtr, CommentEnd);
  357. formTokenWithChars(T, TokenPtr, tok::newline);
  358. if (CommentState == LCS_InsideCComment)
  359. skipLineStartingDecorations();
  360. return;
  361. default: {
  362. size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
  363. find_first_of("\n\r\\@&<");
  364. if (End != StringRef::npos)
  365. TokenPtr += End;
  366. else
  367. TokenPtr = CommentEnd;
  368. formTextToken(T, TokenPtr);
  369. return;
  370. }
  371. }
  372. }
  373. }
  374. void Lexer::setupAndLexVerbatimBlock(Token &T,
  375. const char *TextBegin,
  376. char Marker, const CommandInfo *Info) {
  377. assert(Info->IsVerbatimBlockCommand);
  378. VerbatimBlockEndCommandName.clear();
  379. VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  380. VerbatimBlockEndCommandName.append(Info->EndCommandName);
  381. formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  382. T.setVerbatimBlockID(Info->getID());
  383. // If there is a newline following the verbatim opening command, skip the
  384. // newline so that we don't create an tok::verbatim_block_line with empty
  385. // text content.
  386. if (BufferPtr != CommentEnd &&
  387. isVerticalWhitespace(*BufferPtr)) {
  388. BufferPtr = skipNewline(BufferPtr, CommentEnd);
  389. State = LS_VerbatimBlockBody;
  390. return;
  391. }
  392. State = LS_VerbatimBlockFirstLine;
  393. }
  394. void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  395. again:
  396. assert(BufferPtr < CommentEnd);
  397. // FIXME: It would be better to scan the text once, finding either the block
  398. // end command or newline.
  399. //
  400. // Extract current line.
  401. const char *Newline = findNewline(BufferPtr, CommentEnd);
  402. StringRef Line(BufferPtr, Newline - BufferPtr);
  403. // Look for end command in current line.
  404. size_t Pos = Line.find(VerbatimBlockEndCommandName);
  405. const char *TextEnd;
  406. const char *NextLine;
  407. if (Pos == StringRef::npos) {
  408. // Current line is completely verbatim.
  409. TextEnd = Newline;
  410. NextLine = skipNewline(Newline, CommentEnd);
  411. } else if (Pos == 0) {
  412. // Current line contains just an end command.
  413. const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
  414. StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
  415. formTokenWithChars(T, End, tok::verbatim_block_end);
  416. T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
  417. State = LS_Normal;
  418. return;
  419. } else {
  420. // There is some text, followed by end command. Extract text first.
  421. TextEnd = BufferPtr + Pos;
  422. NextLine = TextEnd;
  423. // If there is only whitespace before end command, skip whitespace.
  424. if (isWhitespace(BufferPtr, TextEnd)) {
  425. BufferPtr = TextEnd;
  426. goto again;
  427. }
  428. }
  429. StringRef Text(BufferPtr, TextEnd - BufferPtr);
  430. formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  431. T.setVerbatimBlockText(Text);
  432. State = LS_VerbatimBlockBody;
  433. }
  434. void Lexer::lexVerbatimBlockBody(Token &T) {
  435. assert(State == LS_VerbatimBlockBody);
  436. if (CommentState == LCS_InsideCComment)
  437. skipLineStartingDecorations();
  438. lexVerbatimBlockFirstLine(T);
  439. }
  440. void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
  441. const CommandInfo *Info) {
  442. assert(Info->IsVerbatimLineCommand);
  443. formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  444. T.setVerbatimLineID(Info->getID());
  445. State = LS_VerbatimLineText;
  446. }
  447. void Lexer::lexVerbatimLineText(Token &T) {
  448. assert(State == LS_VerbatimLineText);
  449. // Extract current line.
  450. const char *Newline = findNewline(BufferPtr, CommentEnd);
  451. const StringRef Text(BufferPtr, Newline - BufferPtr);
  452. formTokenWithChars(T, Newline, tok::verbatim_line_text);
  453. T.setVerbatimLineText(Text);
  454. State = LS_Normal;
  455. }
  456. void Lexer::lexHTMLCharacterReference(Token &T) {
  457. const char *TokenPtr = BufferPtr;
  458. assert(*TokenPtr == '&');
  459. TokenPtr++;
  460. if (TokenPtr == CommentEnd) {
  461. formTextToken(T, TokenPtr);
  462. return;
  463. }
  464. const char *NamePtr;
  465. bool isNamed = false;
  466. bool isDecimal = false;
  467. char C = *TokenPtr;
  468. if (isHTMLNamedCharacterReferenceCharacter(C)) {
  469. NamePtr = TokenPtr;
  470. TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
  471. isNamed = true;
  472. } else if (C == '#') {
  473. TokenPtr++;
  474. if (TokenPtr == CommentEnd) {
  475. formTextToken(T, TokenPtr);
  476. return;
  477. }
  478. C = *TokenPtr;
  479. if (isHTMLDecimalCharacterReferenceCharacter(C)) {
  480. NamePtr = TokenPtr;
  481. TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
  482. isDecimal = true;
  483. } else if (C == 'x' || C == 'X') {
  484. TokenPtr++;
  485. NamePtr = TokenPtr;
  486. TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
  487. } else {
  488. formTextToken(T, TokenPtr);
  489. return;
  490. }
  491. } else {
  492. formTextToken(T, TokenPtr);
  493. return;
  494. }
  495. if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
  496. *TokenPtr != ';') {
  497. formTextToken(T, TokenPtr);
  498. return;
  499. }
  500. StringRef Name(NamePtr, TokenPtr - NamePtr);
  501. TokenPtr++; // Skip semicolon.
  502. StringRef Resolved;
  503. if (isNamed)
  504. Resolved = resolveHTMLNamedCharacterReference(Name);
  505. else if (isDecimal)
  506. Resolved = resolveHTMLDecimalCharacterReference(Name);
  507. else
  508. Resolved = resolveHTMLHexCharacterReference(Name);
  509. if (Resolved.empty()) {
  510. formTextToken(T, TokenPtr);
  511. return;
  512. }
  513. formTokenWithChars(T, TokenPtr, tok::text);
  514. T.setText(Resolved);
  515. return;
  516. }
  517. void Lexer::setupAndLexHTMLStartTag(Token &T) {
  518. assert(BufferPtr[0] == '<' &&
  519. isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  520. const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  521. StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  522. if (!isHTMLTagName(Name)) {
  523. formTextToken(T, TagNameEnd);
  524. return;
  525. }
  526. formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  527. T.setHTMLTagStartName(Name);
  528. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  529. const char C = *BufferPtr;
  530. if (BufferPtr != CommentEnd &&
  531. (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
  532. State = LS_HTMLStartTag;
  533. }
  534. void Lexer::lexHTMLStartTag(Token &T) {
  535. assert(State == LS_HTMLStartTag);
  536. const char *TokenPtr = BufferPtr;
  537. char C = *TokenPtr;
  538. if (isHTMLIdentifierCharacter(C)) {
  539. TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
  540. StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
  541. formTokenWithChars(T, TokenPtr, tok::html_ident);
  542. T.setHTMLIdent(Ident);
  543. } else {
  544. switch (C) {
  545. case '=':
  546. TokenPtr++;
  547. formTokenWithChars(T, TokenPtr, tok::html_equals);
  548. break;
  549. case '\"':
  550. case '\'': {
  551. const char *OpenQuote = TokenPtr;
  552. TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
  553. const char *ClosingQuote = TokenPtr;
  554. if (TokenPtr != CommentEnd) // Skip closing quote.
  555. TokenPtr++;
  556. formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
  557. T.setHTMLQuotedString(StringRef(OpenQuote + 1,
  558. ClosingQuote - (OpenQuote + 1)));
  559. break;
  560. }
  561. case '>':
  562. TokenPtr++;
  563. formTokenWithChars(T, TokenPtr, tok::html_greater);
  564. State = LS_Normal;
  565. return;
  566. case '/':
  567. TokenPtr++;
  568. if (TokenPtr != CommentEnd && *TokenPtr == '>') {
  569. TokenPtr++;
  570. formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
  571. } else
  572. formTextToken(T, TokenPtr);
  573. State = LS_Normal;
  574. return;
  575. }
  576. }
  577. // Now look ahead and return to normal state if we don't see any HTML tokens
  578. // ahead.
  579. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  580. if (BufferPtr == CommentEnd) {
  581. State = LS_Normal;
  582. return;
  583. }
  584. C = *BufferPtr;
  585. if (!isHTMLIdentifierStartingCharacter(C) &&
  586. C != '=' && C != '\"' && C != '\'' && C != '>') {
  587. State = LS_Normal;
  588. return;
  589. }
  590. }
  591. void Lexer::setupAndLexHTMLEndTag(Token &T) {
  592. assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
  593. const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  594. const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
  595. StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
  596. if (!isHTMLTagName(Name)) {
  597. formTextToken(T, TagNameEnd);
  598. return;
  599. }
  600. const char *End = skipWhitespace(TagNameEnd, CommentEnd);
  601. formTokenWithChars(T, End, tok::html_end_tag);
  602. T.setHTMLTagEndName(Name);
  603. if (BufferPtr != CommentEnd && *BufferPtr == '>')
  604. State = LS_HTMLEndTag;
  605. }
  606. void Lexer::lexHTMLEndTag(Token &T) {
  607. assert(BufferPtr != CommentEnd && *BufferPtr == '>');
  608. formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  609. State = LS_Normal;
  610. }
  611. Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
  612. const CommandTraits &Traits,
  613. SourceLocation FileLoc,
  614. const char *BufferStart, const char *BufferEnd):
  615. Allocator(Allocator), Diags(Diags), Traits(Traits),
  616. BufferStart(BufferStart), BufferEnd(BufferEnd),
  617. FileLoc(FileLoc), BufferPtr(BufferStart),
  618. CommentState(LCS_BeforeComment), State(LS_Normal) {
  619. }
  620. void Lexer::lex(Token &T) {
  621. again:
  622. switch (CommentState) {
  623. case LCS_BeforeComment:
  624. if (BufferPtr == BufferEnd) {
  625. formTokenWithChars(T, BufferPtr, tok::eof);
  626. return;
  627. }
  628. assert(*BufferPtr == '/');
  629. BufferPtr++; // Skip first slash.
  630. switch(*BufferPtr) {
  631. case '/': { // BCPL comment.
  632. BufferPtr++; // Skip second slash.
  633. if (BufferPtr != BufferEnd) {
  634. // Skip Doxygen magic marker, if it is present.
  635. // It might be missing because of a typo //< or /*<, or because we
  636. // merged this non-Doxygen comment into a bunch of Doxygen comments
  637. // around it: /** ... */ /* ... */ /** ... */
  638. const char C = *BufferPtr;
  639. if (C == '/' || C == '!')
  640. BufferPtr++;
  641. }
  642. // Skip less-than symbol that marks trailing comments.
  643. // Skip it even if the comment is not a Doxygen one, because //< and /*<
  644. // are frequent typos.
  645. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  646. BufferPtr++;
  647. CommentState = LCS_InsideBCPLComment;
  648. if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
  649. State = LS_Normal;
  650. CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
  651. goto again;
  652. }
  653. case '*': { // C comment.
  654. BufferPtr++; // Skip star.
  655. // Skip Doxygen magic marker.
  656. const char C = *BufferPtr;
  657. if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
  658. BufferPtr++;
  659. // Skip less-than symbol that marks trailing comments.
  660. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  661. BufferPtr++;
  662. CommentState = LCS_InsideCComment;
  663. State = LS_Normal;
  664. CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
  665. goto again;
  666. }
  667. default:
  668. llvm_unreachable("second character of comment should be '/' or '*'");
  669. }
  670. case LCS_BetweenComments: {
  671. // Consecutive comments are extracted only if there is only whitespace
  672. // between them. So we can search for the start of the next comment.
  673. const char *EndWhitespace = BufferPtr;
  674. while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
  675. EndWhitespace++;
  676. // Turn any whitespace between comments (and there is only whitespace
  677. // between them -- guaranteed by comment extraction) into a newline. We
  678. // have two newlines between C comments in total (first one was synthesized
  679. // after a comment).
  680. formTokenWithChars(T, EndWhitespace, tok::newline);
  681. CommentState = LCS_BeforeComment;
  682. break;
  683. }
  684. case LCS_InsideBCPLComment:
  685. case LCS_InsideCComment:
  686. if (BufferPtr != CommentEnd) {
  687. lexCommentText(T);
  688. break;
  689. } else {
  690. // Skip C comment closing sequence.
  691. if (CommentState == LCS_InsideCComment) {
  692. assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
  693. BufferPtr += 2;
  694. assert(BufferPtr <= BufferEnd);
  695. // Synthenize newline just after the C comment, regardless if there is
  696. // actually a newline.
  697. formTokenWithChars(T, BufferPtr, tok::newline);
  698. CommentState = LCS_BetweenComments;
  699. break;
  700. } else {
  701. // Don't synthesized a newline after BCPL comment.
  702. CommentState = LCS_BetweenComments;
  703. goto again;
  704. }
  705. }
  706. }
  707. }
  708. StringRef Lexer::getSpelling(const Token &Tok,
  709. const SourceManager &SourceMgr,
  710. bool *Invalid) const {
  711. SourceLocation Loc = Tok.getLocation();
  712. std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
  713. bool InvalidTemp = false;
  714. StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  715. if (InvalidTemp) {
  716. *Invalid = true;
  717. return StringRef();
  718. }
  719. const char *Begin = File.data() + LocInfo.second;
  720. return StringRef(Begin, Tok.getLength());
  721. }
  722. } // end namespace comments
  723. } // end namespace clang