CommentLexer.cpp 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814
  1. #include "clang/AST/CommentLexer.h"
  2. #include "clang/AST/CommentCommandTraits.h"
  3. #include "clang/Basic/CharInfo.h"
  4. #include "llvm/ADT/StringExtras.h"
  5. #include "llvm/ADT/StringSwitch.h"
  6. #include "llvm/Support/ConvertUTF.h"
  7. #include "llvm/Support/ErrorHandling.h"
  8. namespace clang {
  9. namespace comments {
  10. void Token::dump(const Lexer &L, const SourceManager &SM) const {
  11. llvm::errs() << "comments::Token Kind=" << Kind << " ";
  12. Loc.dump(SM);
  13. llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  14. }
  15. static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  16. return isLetter(C);
  17. }
  18. static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  19. return isDigit(C);
  20. }
  21. static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  22. return isHexDigit(C);
  23. }
  24. static inline StringRef convertCodePointToUTF8(
  25. llvm::BumpPtrAllocator &Allocator,
  26. unsigned CodePoint) {
  27. char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  28. char *ResolvedPtr = Resolved;
  29. if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  30. return StringRef(Resolved, ResolvedPtr - Resolved);
  31. else
  32. return StringRef();
  33. }
  34. namespace {
  35. #include "clang/AST/CommentHTMLTags.inc"
  36. #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  37. } // unnamed namespace
  38. StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  39. // Fast path, first check a few most widely used named character references.
  40. return llvm::StringSwitch<StringRef>(Name)
  41. .Case("amp", "&")
  42. .Case("lt", "<")
  43. .Case("gt", ">")
  44. .Case("quot", "\"")
  45. .Case("apos", "\'")
  46. // Slow path.
  47. .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  48. }
  49. StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  50. unsigned CodePoint = 0;
  51. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  52. assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  53. CodePoint *= 10;
  54. CodePoint += Name[i] - '0';
  55. }
  56. return convertCodePointToUTF8(Allocator, CodePoint);
  57. }
  58. StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  59. unsigned CodePoint = 0;
  60. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  61. CodePoint *= 16;
  62. const char C = Name[i];
  63. assert(isHTMLHexCharacterReferenceCharacter(C));
  64. CodePoint += llvm::hexDigitValue(C);
  65. }
  66. return convertCodePointToUTF8(Allocator, CodePoint);
  67. }
  68. void Lexer::skipLineStartingDecorations() {
  69. // This function should be called only for C comments
  70. assert(CommentState == LCS_InsideCComment);
  71. if (BufferPtr == CommentEnd)
  72. return;
  73. switch (*BufferPtr) {
  74. case ' ':
  75. case '\t':
  76. case '\f':
  77. case '\v': {
  78. const char *NewBufferPtr = BufferPtr;
  79. NewBufferPtr++;
  80. if (NewBufferPtr == CommentEnd)
  81. return;
  82. char C = *NewBufferPtr;
  83. while (isHorizontalWhitespace(C)) {
  84. NewBufferPtr++;
  85. if (NewBufferPtr == CommentEnd)
  86. return;
  87. C = *NewBufferPtr;
  88. }
  89. if (C == '*')
  90. BufferPtr = NewBufferPtr + 1;
  91. break;
  92. }
  93. case '*':
  94. BufferPtr++;
  95. break;
  96. }
  97. }
  98. namespace {
  99. /// Returns pointer to the first newline character in the string.
  100. const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  101. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  102. if (isVerticalWhitespace(*BufferPtr))
  103. return BufferPtr;
  104. }
  105. return BufferEnd;
  106. }
  107. const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  108. if (BufferPtr == BufferEnd)
  109. return BufferPtr;
  110. if (*BufferPtr == '\n')
  111. BufferPtr++;
  112. else {
  113. assert(*BufferPtr == '\r');
  114. BufferPtr++;
  115. if (BufferPtr != BufferEnd && *BufferPtr == '\n')
  116. BufferPtr++;
  117. }
  118. return BufferPtr;
  119. }
  120. const char *skipNamedCharacterReference(const char *BufferPtr,
  121. const char *BufferEnd) {
  122. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  123. if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
  124. return BufferPtr;
  125. }
  126. return BufferEnd;
  127. }
  128. const char *skipDecimalCharacterReference(const char *BufferPtr,
  129. const char *BufferEnd) {
  130. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  131. if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
  132. return BufferPtr;
  133. }
  134. return BufferEnd;
  135. }
  136. const char *skipHexCharacterReference(const char *BufferPtr,
  137. const char *BufferEnd) {
  138. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  139. if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
  140. return BufferPtr;
  141. }
  142. return BufferEnd;
  143. }
  144. bool isHTMLIdentifierStartingCharacter(char C) {
  145. return isLetter(C);
  146. }
  147. bool isHTMLIdentifierCharacter(char C) {
  148. return isAlphanumeric(C);
  149. }
  150. const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  151. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  152. if (!isHTMLIdentifierCharacter(*BufferPtr))
  153. return BufferPtr;
  154. }
  155. return BufferEnd;
  156. }
  157. /// Skip HTML string quoted in single or double quotes. Escaping quotes inside
  158. /// string allowed.
  159. ///
  160. /// Returns pointer to closing quote.
  161. const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
  162. {
  163. const char Quote = *BufferPtr;
  164. assert(Quote == '\"' || Quote == '\'');
  165. BufferPtr++;
  166. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  167. const char C = *BufferPtr;
  168. if (C == Quote && BufferPtr[-1] != '\\')
  169. return BufferPtr;
  170. }
  171. return BufferEnd;
  172. }
  173. const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  174. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  175. if (!isWhitespace(*BufferPtr))
  176. return BufferPtr;
  177. }
  178. return BufferEnd;
  179. }
  180. bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  181. return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
  182. }
  183. bool isCommandNameStartCharacter(char C) {
  184. return isLetter(C);
  185. }
  186. bool isCommandNameCharacter(char C) {
  187. return isAlphanumeric(C);
  188. }
  189. const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  190. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  191. if (!isCommandNameCharacter(*BufferPtr))
  192. return BufferPtr;
  193. }
  194. return BufferEnd;
  195. }
  196. /// Return the one past end pointer for BCPL comments.
  197. /// Handles newlines escaped with backslash or trigraph for backslahs.
  198. const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  199. const char *CurPtr = BufferPtr;
  200. while (CurPtr != BufferEnd) {
  201. while (!isVerticalWhitespace(*CurPtr)) {
  202. CurPtr++;
  203. if (CurPtr == BufferEnd)
  204. return BufferEnd;
  205. }
  206. // We found a newline, check if it is escaped.
  207. const char *EscapePtr = CurPtr - 1;
  208. while(isHorizontalWhitespace(*EscapePtr))
  209. EscapePtr--;
  210. if (*EscapePtr == '\\' ||
  211. (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
  212. EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
  213. // We found an escaped newline.
  214. CurPtr = skipNewline(CurPtr, BufferEnd);
  215. } else
  216. return CurPtr; // Not an escaped newline.
  217. }
  218. return BufferEnd;
  219. }
  220. /// Return the one past end pointer for C comments.
  221. /// Very dumb, does not handle escaped newlines or trigraphs.
  222. const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  223. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  224. if (*BufferPtr == '*') {
  225. assert(BufferPtr + 1 != BufferEnd);
  226. if (*(BufferPtr + 1) == '/')
  227. return BufferPtr;
  228. }
  229. }
  230. llvm_unreachable("buffer end hit before '*/' was seen");
  231. }
  232. } // unnamed namespace
  233. void Lexer::lexCommentText(Token &T) {
  234. assert(CommentState == LCS_InsideBCPLComment ||
  235. CommentState == LCS_InsideCComment);
  236. switch (State) {
  237. case LS_Normal:
  238. break;
  239. case LS_VerbatimBlockFirstLine:
  240. lexVerbatimBlockFirstLine(T);
  241. return;
  242. case LS_VerbatimBlockBody:
  243. lexVerbatimBlockBody(T);
  244. return;
  245. case LS_VerbatimLineText:
  246. lexVerbatimLineText(T);
  247. return;
  248. case LS_HTMLStartTag:
  249. lexHTMLStartTag(T);
  250. return;
  251. case LS_HTMLEndTag:
  252. lexHTMLEndTag(T);
  253. return;
  254. }
  255. assert(State == LS_Normal);
  256. const char *TokenPtr = BufferPtr;
  257. assert(TokenPtr < CommentEnd);
  258. while (TokenPtr != CommentEnd) {
  259. switch(*TokenPtr) {
  260. case '\\':
  261. case '@': {
  262. T.HDCommand = (*TokenPtr == '@');
  263. TokenPtr++;
  264. if (TokenPtr == CommentEnd) {
  265. formTextToken(T, TokenPtr);
  266. return;
  267. }
  268. char C = *TokenPtr;
  269. switch (C) {
  270. default:
  271. break;
  272. case '\\': case '@': case '&': case '$':
  273. case '#': case '<': case '>': case '%':
  274. case '\"': case '.': case ':':
  275. // This is one of \\ \@ \& \$ etc escape sequences.
  276. TokenPtr++;
  277. if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
  278. // This is the \:: escape sequence.
  279. TokenPtr++;
  280. }
  281. StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
  282. formTokenWithChars(T, TokenPtr, tok::text);
  283. T.setText(UnescapedText);
  284. return;
  285. }
  286. // Don't make zero-length commands.
  287. if (!isCommandNameStartCharacter(*TokenPtr)) {
  288. formTextToken(T, TokenPtr);
  289. return;
  290. }
  291. TokenPtr = skipCommandName(TokenPtr, CommentEnd);
  292. unsigned Length = TokenPtr - (BufferPtr + 1);
  293. // Hardcoded support for lexing LaTeX formula commands
  294. // \f$ \f[ \f] \f{ \f} as a single command.
  295. if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
  296. C = *TokenPtr;
  297. if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
  298. TokenPtr++;
  299. Length++;
  300. }
  301. }
  302. const StringRef CommandName(BufferPtr + 1, Length);
  303. const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  304. if (!Info) {
  305. formTokenWithChars(T, TokenPtr, tok::unknown_command);
  306. T.setUnknownCommandName(CommandName);
  307. return;
  308. }
  309. if (Info->IsVerbatimBlockCommand) {
  310. setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
  311. return;
  312. }
  313. if (Info->IsVerbatimLineCommand) {
  314. setupAndLexVerbatimLine(T, TokenPtr, Info);
  315. return;
  316. }
  317. formTokenWithChars(T, TokenPtr, tok::command);
  318. T.setCommandID(Info->getID());
  319. return;
  320. }
  321. case '&':
  322. lexHTMLCharacterReference(T);
  323. return;
  324. case '<': {
  325. TokenPtr++;
  326. if (TokenPtr == CommentEnd) {
  327. formTextToken(T, TokenPtr);
  328. return;
  329. }
  330. const char C = *TokenPtr;
  331. if (isHTMLIdentifierStartingCharacter(C))
  332. setupAndLexHTMLStartTag(T);
  333. else if (C == '/')
  334. setupAndLexHTMLEndTag(T);
  335. else
  336. formTextToken(T, TokenPtr);
  337. return;
  338. }
  339. case '\n':
  340. case '\r':
  341. TokenPtr = skipNewline(TokenPtr, CommentEnd);
  342. formTokenWithChars(T, TokenPtr, tok::newline);
  343. if (CommentState == LCS_InsideCComment)
  344. skipLineStartingDecorations();
  345. return;
  346. default: {
  347. size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
  348. find_first_of("\n\r\\@&<");
  349. if (End != StringRef::npos)
  350. TokenPtr += End;
  351. else
  352. TokenPtr = CommentEnd;
  353. formTextToken(T, TokenPtr);
  354. return;
  355. }
  356. }
  357. }
  358. }
  359. void Lexer::setupAndLexVerbatimBlock(Token &T,
  360. const char *TextBegin,
  361. char Marker, const CommandInfo *Info) {
  362. assert(Info->IsVerbatimBlockCommand);
  363. VerbatimBlockEndCommandName.clear();
  364. VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  365. VerbatimBlockEndCommandName.append(Info->EndCommandName);
  366. formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  367. T.setVerbatimBlockID(Info->getID());
  368. // If there is a newline following the verbatim opening command, skip the
  369. // newline so that we don't create an tok::verbatim_block_line with empty
  370. // text content.
  371. if (BufferPtr != CommentEnd &&
  372. isVerticalWhitespace(*BufferPtr)) {
  373. BufferPtr = skipNewline(BufferPtr, CommentEnd);
  374. State = LS_VerbatimBlockBody;
  375. return;
  376. }
  377. State = LS_VerbatimBlockFirstLine;
  378. }
  379. void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  380. again:
  381. assert(BufferPtr < CommentEnd);
  382. // FIXME: It would be better to scan the text once, finding either the block
  383. // end command or newline.
  384. //
  385. // Extract current line.
  386. const char *Newline = findNewline(BufferPtr, CommentEnd);
  387. StringRef Line(BufferPtr, Newline - BufferPtr);
  388. // Look for end command in current line.
  389. size_t Pos = Line.find(VerbatimBlockEndCommandName);
  390. const char *TextEnd;
  391. const char *NextLine;
  392. if (Pos == StringRef::npos) {
  393. // Current line is completely verbatim.
  394. TextEnd = Newline;
  395. NextLine = skipNewline(Newline, CommentEnd);
  396. } else if (Pos == 0) {
  397. // Current line contains just an end command.
  398. const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
  399. StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
  400. formTokenWithChars(T, End, tok::verbatim_block_end);
  401. T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
  402. State = LS_Normal;
  403. return;
  404. } else {
  405. // There is some text, followed by end command. Extract text first.
  406. TextEnd = BufferPtr + Pos;
  407. NextLine = TextEnd;
  408. // If there is only whitespace before end command, skip whitespace.
  409. if (isWhitespace(BufferPtr, TextEnd)) {
  410. BufferPtr = TextEnd;
  411. goto again;
  412. }
  413. }
  414. StringRef Text(BufferPtr, TextEnd - BufferPtr);
  415. formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  416. T.setVerbatimBlockText(Text);
  417. State = LS_VerbatimBlockBody;
  418. }
  419. void Lexer::lexVerbatimBlockBody(Token &T) {
  420. assert(State == LS_VerbatimBlockBody);
  421. if (CommentState == LCS_InsideCComment)
  422. skipLineStartingDecorations();
  423. lexVerbatimBlockFirstLine(T);
  424. }
  425. void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
  426. const CommandInfo *Info) {
  427. assert(Info->IsVerbatimLineCommand);
  428. formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  429. T.setVerbatimLineID(Info->getID());
  430. State = LS_VerbatimLineText;
  431. }
  432. void Lexer::lexVerbatimLineText(Token &T) {
  433. assert(State == LS_VerbatimLineText);
  434. // Extract current line.
  435. const char *Newline = findNewline(BufferPtr, CommentEnd);
  436. const StringRef Text(BufferPtr, Newline - BufferPtr);
  437. formTokenWithChars(T, Newline, tok::verbatim_line_text);
  438. T.setVerbatimLineText(Text);
  439. State = LS_Normal;
  440. }
  441. void Lexer::lexHTMLCharacterReference(Token &T) {
  442. const char *TokenPtr = BufferPtr;
  443. assert(*TokenPtr == '&');
  444. TokenPtr++;
  445. if (TokenPtr == CommentEnd) {
  446. formTextToken(T, TokenPtr);
  447. return;
  448. }
  449. const char *NamePtr;
  450. bool isNamed = false;
  451. bool isDecimal = false;
  452. char C = *TokenPtr;
  453. if (isHTMLNamedCharacterReferenceCharacter(C)) {
  454. NamePtr = TokenPtr;
  455. TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
  456. isNamed = true;
  457. } else if (C == '#') {
  458. TokenPtr++;
  459. if (TokenPtr == CommentEnd) {
  460. formTextToken(T, TokenPtr);
  461. return;
  462. }
  463. C = *TokenPtr;
  464. if (isHTMLDecimalCharacterReferenceCharacter(C)) {
  465. NamePtr = TokenPtr;
  466. TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
  467. isDecimal = true;
  468. } else if (C == 'x' || C == 'X') {
  469. TokenPtr++;
  470. NamePtr = TokenPtr;
  471. TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
  472. } else {
  473. formTextToken(T, TokenPtr);
  474. return;
  475. }
  476. } else {
  477. formTextToken(T, TokenPtr);
  478. return;
  479. }
  480. if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
  481. *TokenPtr != ';') {
  482. formTextToken(T, TokenPtr);
  483. return;
  484. }
  485. StringRef Name(NamePtr, TokenPtr - NamePtr);
  486. TokenPtr++; // Skip semicolon.
  487. StringRef Resolved;
  488. if (isNamed)
  489. Resolved = resolveHTMLNamedCharacterReference(Name);
  490. else if (isDecimal)
  491. Resolved = resolveHTMLDecimalCharacterReference(Name);
  492. else
  493. Resolved = resolveHTMLHexCharacterReference(Name);
  494. if (Resolved.empty()) {
  495. formTextToken(T, TokenPtr);
  496. return;
  497. }
  498. formTokenWithChars(T, TokenPtr, tok::text);
  499. T.setText(Resolved);
  500. return;
  501. }
  502. void Lexer::setupAndLexHTMLStartTag(Token &T) {
  503. assert(BufferPtr[0] == '<' &&
  504. isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  505. const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  506. StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  507. if (!isHTMLTagName(Name)) {
  508. formTextToken(T, TagNameEnd);
  509. return;
  510. }
  511. formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  512. T.setHTMLTagStartName(Name);
  513. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  514. const char C = *BufferPtr;
  515. if (BufferPtr != CommentEnd &&
  516. (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
  517. State = LS_HTMLStartTag;
  518. }
  519. void Lexer::lexHTMLStartTag(Token &T) {
  520. assert(State == LS_HTMLStartTag);
  521. const char *TokenPtr = BufferPtr;
  522. char C = *TokenPtr;
  523. if (isHTMLIdentifierCharacter(C)) {
  524. TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
  525. StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
  526. formTokenWithChars(T, TokenPtr, tok::html_ident);
  527. T.setHTMLIdent(Ident);
  528. } else {
  529. switch (C) {
  530. case '=':
  531. TokenPtr++;
  532. formTokenWithChars(T, TokenPtr, tok::html_equals);
  533. break;
  534. case '\"':
  535. case '\'': {
  536. const char *OpenQuote = TokenPtr;
  537. TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
  538. const char *ClosingQuote = TokenPtr;
  539. if (TokenPtr != CommentEnd) // Skip closing quote.
  540. TokenPtr++;
  541. formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
  542. T.setHTMLQuotedString(StringRef(OpenQuote + 1,
  543. ClosingQuote - (OpenQuote + 1)));
  544. break;
  545. }
  546. case '>':
  547. TokenPtr++;
  548. formTokenWithChars(T, TokenPtr, tok::html_greater);
  549. State = LS_Normal;
  550. return;
  551. case '/':
  552. TokenPtr++;
  553. if (TokenPtr != CommentEnd && *TokenPtr == '>') {
  554. TokenPtr++;
  555. formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
  556. } else
  557. formTextToken(T, TokenPtr);
  558. State = LS_Normal;
  559. return;
  560. }
  561. }
  562. // Now look ahead and return to normal state if we don't see any HTML tokens
  563. // ahead.
  564. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  565. if (BufferPtr == CommentEnd) {
  566. State = LS_Normal;
  567. return;
  568. }
  569. C = *BufferPtr;
  570. if (!isHTMLIdentifierStartingCharacter(C) &&
  571. C != '=' && C != '\"' && C != '\'' && C != '>') {
  572. State = LS_Normal;
  573. return;
  574. }
  575. }
  576. void Lexer::setupAndLexHTMLEndTag(Token &T) {
  577. assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
  578. const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  579. const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
  580. StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
  581. if (!isHTMLTagName(Name)) {
  582. formTextToken(T, TagNameEnd);
  583. return;
  584. }
  585. const char *End = skipWhitespace(TagNameEnd, CommentEnd);
  586. formTokenWithChars(T, End, tok::html_end_tag);
  587. T.setHTMLTagEndName(Name);
  588. if (BufferPtr != CommentEnd && *BufferPtr == '>')
  589. State = LS_HTMLEndTag;
  590. }
  591. void Lexer::lexHTMLEndTag(Token &T) {
  592. assert(BufferPtr != CommentEnd && *BufferPtr == '>');
  593. formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  594. State = LS_Normal;
  595. }
  596. Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
  597. SourceLocation FileLoc,
  598. const char *BufferStart, const char *BufferEnd):
  599. Allocator(Allocator), Traits(Traits),
  600. BufferStart(BufferStart), BufferEnd(BufferEnd),
  601. FileLoc(FileLoc), BufferPtr(BufferStart),
  602. CommentState(LCS_BeforeComment), State(LS_Normal) {
  603. }
  604. void Lexer::lex(Token &T) {
  605. again:
  606. switch (CommentState) {
  607. case LCS_BeforeComment:
  608. if (BufferPtr == BufferEnd) {
  609. formTokenWithChars(T, BufferPtr, tok::eof);
  610. return;
  611. }
  612. assert(*BufferPtr == '/');
  613. BufferPtr++; // Skip first slash.
  614. switch(*BufferPtr) {
  615. case '/': { // BCPL comment.
  616. BufferPtr++; // Skip second slash.
  617. if (BufferPtr != BufferEnd) {
  618. // Skip Doxygen magic marker, if it is present.
  619. // It might be missing because of a typo //< or /*<, or because we
  620. // merged this non-Doxygen comment into a bunch of Doxygen comments
  621. // around it: /** ... */ /* ... */ /** ... */
  622. const char C = *BufferPtr;
  623. if (C == '/' || C == '!')
  624. BufferPtr++;
  625. }
  626. // Skip less-than symbol that marks trailing comments.
  627. // Skip it even if the comment is not a Doxygen one, because //< and /*<
  628. // are frequent typos.
  629. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  630. BufferPtr++;
  631. CommentState = LCS_InsideBCPLComment;
  632. if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
  633. State = LS_Normal;
  634. CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
  635. goto again;
  636. }
  637. case '*': { // C comment.
  638. BufferPtr++; // Skip star.
  639. // Skip Doxygen magic marker.
  640. const char C = *BufferPtr;
  641. if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
  642. BufferPtr++;
  643. // Skip less-than symbol that marks trailing comments.
  644. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  645. BufferPtr++;
  646. CommentState = LCS_InsideCComment;
  647. State = LS_Normal;
  648. CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
  649. goto again;
  650. }
  651. default:
  652. llvm_unreachable("second character of comment should be '/' or '*'");
  653. }
  654. case LCS_BetweenComments: {
  655. // Consecutive comments are extracted only if there is only whitespace
  656. // between them. So we can search for the start of the next comment.
  657. const char *EndWhitespace = BufferPtr;
  658. while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
  659. EndWhitespace++;
  660. // Turn any whitespace between comments (and there is only whitespace
  661. // between them -- guaranteed by comment extraction) into a newline. We
  662. // have two newlines between C comments in total (first one was synthesized
  663. // after a comment).
  664. formTokenWithChars(T, EndWhitespace, tok::newline);
  665. CommentState = LCS_BeforeComment;
  666. break;
  667. }
  668. case LCS_InsideBCPLComment:
  669. case LCS_InsideCComment:
  670. if (BufferPtr != CommentEnd) {
  671. lexCommentText(T);
  672. break;
  673. } else {
  674. // Skip C comment closing sequence.
  675. if (CommentState == LCS_InsideCComment) {
  676. assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
  677. BufferPtr += 2;
  678. assert(BufferPtr <= BufferEnd);
  679. // Synthenize newline just after the C comment, regardless if there is
  680. // actually a newline.
  681. formTokenWithChars(T, BufferPtr, tok::newline);
  682. CommentState = LCS_BetweenComments;
  683. break;
  684. } else {
  685. // Don't synthesized a newline after BCPL comment.
  686. CommentState = LCS_BetweenComments;
  687. goto again;
  688. }
  689. }
  690. }
  691. }
  692. StringRef Lexer::getSpelling(const Token &Tok,
  693. const SourceManager &SourceMgr,
  694. bool *Invalid) const {
  695. SourceLocation Loc = Tok.getLocation();
  696. std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
  697. bool InvalidTemp = false;
  698. StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  699. if (InvalidTemp) {
  700. *Invalid = true;
  701. return StringRef();
  702. }
  703. const char *Begin = File.data() + LocInfo.second;
  704. return StringRef(Begin, Tok.getLength());
  705. }
  706. } // end namespace comments
  707. } // end namespace clang