RawCommentList.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. #include "clang/AST/RawCommentList.h"
  9. #include "clang/AST/ASTContext.h"
  10. #include "clang/AST/Comment.h"
  11. #include "clang/AST/CommentBriefParser.h"
  12. #include "clang/AST/CommentCommandTraits.h"
  13. #include "clang/AST/CommentLexer.h"
  14. #include "clang/AST/CommentParser.h"
  15. #include "clang/AST/CommentSema.h"
  16. #include "clang/Basic/CharInfo.h"
  17. #include "llvm/ADT/STLExtras.h"
  18. using namespace clang;
  19. namespace {
  20. /// Get comment kind and bool describing if it is a trailing comment.
  21. std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
  22. bool ParseAllComments) {
  23. const size_t MinCommentLength = ParseAllComments ? 2 : 3;
  24. if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
  25. return std::make_pair(RawComment::RCK_Invalid, false);
  26. RawComment::CommentKind K;
  27. if (Comment[1] == '/') {
  28. if (Comment.size() < 3)
  29. return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  30. if (Comment[2] == '/')
  31. K = RawComment::RCK_BCPLSlash;
  32. else if (Comment[2] == '!')
  33. K = RawComment::RCK_BCPLExcl;
  34. else
  35. return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
  36. } else {
  37. assert(Comment.size() >= 4);
  38. // Comment lexer does not understand escapes in comment markers, so pretend
  39. // that this is not a comment.
  40. if (Comment[1] != '*' ||
  41. Comment[Comment.size() - 2] != '*' ||
  42. Comment[Comment.size() - 1] != '/')
  43. return std::make_pair(RawComment::RCK_Invalid, false);
  44. if (Comment[2] == '*')
  45. K = RawComment::RCK_JavaDoc;
  46. else if (Comment[2] == '!')
  47. K = RawComment::RCK_Qt;
  48. else
  49. return std::make_pair(RawComment::RCK_OrdinaryC, false);
  50. }
  51. const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
  52. return std::make_pair(K, TrailingComment);
  53. }
  54. bool mergedCommentIsTrailingComment(StringRef Comment) {
  55. return (Comment.size() > 3) && (Comment[3] == '<');
  56. }
  57. /// Returns true if R1 and R2 both have valid locations that start on the same
  58. /// column.
  59. bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
  60. const RawComment &R2) {
  61. SourceLocation L1 = R1.getBeginLoc();
  62. SourceLocation L2 = R2.getBeginLoc();
  63. bool Invalid = false;
  64. unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
  65. if (!Invalid) {
  66. unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
  67. return !Invalid && (C1 == C2);
  68. }
  69. return false;
  70. }
  71. } // unnamed namespace
  72. /// Determines whether there is only whitespace in `Buffer` between `P`
  73. /// and the previous line.
  74. /// \param Buffer The buffer to search in.
  75. /// \param P The offset from the beginning of `Buffer` to start from.
  76. /// \return true if all of the characters in `Buffer` ranging from the closest
  77. /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
  78. /// are whitespace.
  79. static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
  80. // Search backwards until we see linefeed or carriage return.
  81. for (unsigned I = P; I != 0; --I) {
  82. char C = Buffer[I - 1];
  83. if (isVerticalWhitespace(C))
  84. return true;
  85. if (!isHorizontalWhitespace(C))
  86. return false;
  87. }
  88. // We hit the beginning of the buffer.
  89. return true;
  90. }
  91. /// Returns whether `K` is an ordinary comment kind.
  92. static bool isOrdinaryKind(RawComment::CommentKind K) {
  93. return (K == RawComment::RCK_OrdinaryBCPL) ||
  94. (K == RawComment::RCK_OrdinaryC);
  95. }
  96. RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
  97. const CommentOptions &CommentOpts, bool Merged) :
  98. Range(SR), RawTextValid(false), BriefTextValid(false),
  99. IsAttached(false), IsTrailingComment(false),
  100. IsAlmostTrailingComment(false) {
  101. // Extract raw comment text, if possible.
  102. if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
  103. Kind = RCK_Invalid;
  104. return;
  105. }
  106. // Guess comment kind.
  107. std::pair<CommentKind, bool> K =
  108. getCommentKind(RawText, CommentOpts.ParseAllComments);
  109. // Guess whether an ordinary comment is trailing.
  110. if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
  111. FileID BeginFileID;
  112. unsigned BeginOffset;
  113. std::tie(BeginFileID, BeginOffset) =
  114. SourceMgr.getDecomposedLoc(Range.getBegin());
  115. if (BeginOffset != 0) {
  116. bool Invalid = false;
  117. const char *Buffer =
  118. SourceMgr.getBufferData(BeginFileID, &Invalid).data();
  119. IsTrailingComment |=
  120. (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
  121. }
  122. }
  123. if (!Merged) {
  124. Kind = K.first;
  125. IsTrailingComment |= K.second;
  126. IsAlmostTrailingComment = RawText.startswith("//<") ||
  127. RawText.startswith("/*<");
  128. } else {
  129. Kind = RCK_Merged;
  130. IsTrailingComment =
  131. IsTrailingComment || mergedCommentIsTrailingComment(RawText);
  132. }
  133. }
  134. StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
  135. FileID BeginFileID;
  136. FileID EndFileID;
  137. unsigned BeginOffset;
  138. unsigned EndOffset;
  139. std::tie(BeginFileID, BeginOffset) =
  140. SourceMgr.getDecomposedLoc(Range.getBegin());
  141. std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
  142. const unsigned Length = EndOffset - BeginOffset;
  143. if (Length < 2)
  144. return StringRef();
  145. // The comment can't begin in one file and end in another.
  146. assert(BeginFileID == EndFileID);
  147. bool Invalid = false;
  148. const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
  149. &Invalid).data();
  150. if (Invalid)
  151. return StringRef();
  152. return StringRef(BufferStart + BeginOffset, Length);
  153. }
  154. const char *RawComment::extractBriefText(const ASTContext &Context) const {
  155. // Lazily initialize RawText using the accessor before using it.
  156. (void)getRawText(Context.getSourceManager());
  157. // Since we will be copying the resulting text, all allocations made during
  158. // parsing are garbage after resulting string is formed. Thus we can use
  159. // a separate allocator for all temporary stuff.
  160. llvm::BumpPtrAllocator Allocator;
  161. comments::Lexer L(Allocator, Context.getDiagnostics(),
  162. Context.getCommentCommandTraits(),
  163. Range.getBegin(),
  164. RawText.begin(), RawText.end());
  165. comments::BriefParser P(L, Context.getCommentCommandTraits());
  166. const std::string Result = P.Parse();
  167. const unsigned BriefTextLength = Result.size();
  168. char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
  169. memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
  170. BriefText = BriefTextPtr;
  171. BriefTextValid = true;
  172. return BriefTextPtr;
  173. }
  174. comments::FullComment *RawComment::parse(const ASTContext &Context,
  175. const Preprocessor *PP,
  176. const Decl *D) const {
  177. // Lazily initialize RawText using the accessor before using it.
  178. (void)getRawText(Context.getSourceManager());
  179. comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
  180. Context.getCommentCommandTraits(),
  181. getSourceRange().getBegin(),
  182. RawText.begin(), RawText.end());
  183. comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
  184. Context.getDiagnostics(),
  185. Context.getCommentCommandTraits(),
  186. PP);
  187. S.setDecl(D);
  188. comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
  189. Context.getDiagnostics(),
  190. Context.getCommentCommandTraits());
  191. return P.parseFullComment();
  192. }
  193. static bool onlyWhitespaceBetween(SourceManager &SM,
  194. SourceLocation Loc1, SourceLocation Loc2,
  195. unsigned MaxNewlinesAllowed) {
  196. std::pair<FileID, unsigned> Loc1Info = SM.getDecomposedLoc(Loc1);
  197. std::pair<FileID, unsigned> Loc2Info = SM.getDecomposedLoc(Loc2);
  198. // Question does not make sense if locations are in different files.
  199. if (Loc1Info.first != Loc2Info.first)
  200. return false;
  201. bool Invalid = false;
  202. const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
  203. if (Invalid)
  204. return false;
  205. unsigned NumNewlines = 0;
  206. assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
  207. // Look for non-whitespace characters and remember any newlines seen.
  208. for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
  209. switch (Buffer[I]) {
  210. default:
  211. return false;
  212. case ' ':
  213. case '\t':
  214. case '\f':
  215. case '\v':
  216. break;
  217. case '\r':
  218. case '\n':
  219. ++NumNewlines;
  220. // Check if we have found more than the maximum allowed number of
  221. // newlines.
  222. if (NumNewlines > MaxNewlinesAllowed)
  223. return false;
  224. // Collapse \r\n and \n\r into a single newline.
  225. if (I + 1 != Loc2Info.second &&
  226. (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
  227. Buffer[I] != Buffer[I + 1])
  228. ++I;
  229. break;
  230. }
  231. }
  232. return true;
  233. }
  234. void RawCommentList::addComment(const RawComment &RC,
  235. const CommentOptions &CommentOpts,
  236. llvm::BumpPtrAllocator &Allocator) {
  237. if (RC.isInvalid())
  238. return;
  239. // Ordinary comments are not interesting for us.
  240. if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
  241. return;
  242. std::pair<FileID, unsigned> Loc =
  243. SourceMgr.getDecomposedLoc(RC.getBeginLoc());
  244. const FileID CommentFile = Loc.first;
  245. const unsigned CommentOffset = Loc.second;
  246. // If this is the first Doxygen comment, save it (because there isn't
  247. // anything to merge it with).
  248. if (OrderedComments[CommentFile].empty()) {
  249. OrderedComments[CommentFile][CommentOffset] =
  250. new (Allocator) RawComment(RC);
  251. return;
  252. }
  253. const RawComment &C1 = *OrderedComments[CommentFile].rbegin()->second;
  254. const RawComment &C2 = RC;
  255. // Merge comments only if there is only whitespace between them.
  256. // Can't merge trailing and non-trailing comments unless the second is
  257. // non-trailing ordinary in the same column, as in the case:
  258. // int x; // documents x
  259. // // more text
  260. // versus:
  261. // int x; // documents x
  262. // int y; // documents y
  263. // or:
  264. // int x; // documents x
  265. // // documents y
  266. // int y;
  267. // Merge comments if they are on same or consecutive lines.
  268. if ((C1.isTrailingComment() == C2.isTrailingComment() ||
  269. (C1.isTrailingComment() && !C2.isTrailingComment() &&
  270. isOrdinaryKind(C2.getKind()) &&
  271. commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
  272. onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
  273. /*MaxNewlinesAllowed=*/1)) {
  274. SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
  275. *OrderedComments[CommentFile].rbegin()->second =
  276. RawComment(SourceMgr, MergedRange, CommentOpts, true);
  277. } else {
  278. OrderedComments[CommentFile][CommentOffset] =
  279. new (Allocator) RawComment(RC);
  280. }
  281. }
  282. const std::map<unsigned, RawComment *> *
  283. RawCommentList::getCommentsInFile(FileID File) const {
  284. auto CommentsInFile = OrderedComments.find(File);
  285. if (CommentsInFile == OrderedComments.end())
  286. return nullptr;
  287. return &CommentsInFile->second;
  288. }
  289. bool RawCommentList::empty() const { return OrderedComments.empty(); }
  290. unsigned RawCommentList::getCommentBeginLine(RawComment *C, FileID File,
  291. unsigned Offset) const {
  292. auto Cached = CommentBeginLine.find(C);
  293. if (Cached != CommentBeginLine.end())
  294. return Cached->second;
  295. const unsigned Line = SourceMgr.getLineNumber(File, Offset);
  296. CommentBeginLine[C] = Line;
  297. return Line;
  298. }
  299. unsigned RawCommentList::getCommentEndOffset(RawComment *C) const {
  300. auto Cached = CommentEndOffset.find(C);
  301. if (Cached != CommentEndOffset.end())
  302. return Cached->second;
  303. const unsigned Offset =
  304. SourceMgr.getDecomposedLoc(C->getSourceRange().getEnd()).second;
  305. CommentEndOffset[C] = Offset;
  306. return Offset;
  307. }
  308. std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
  309. DiagnosticsEngine &Diags) const {
  310. llvm::StringRef CommentText = getRawText(SourceMgr);
  311. if (CommentText.empty())
  312. return "";
  313. llvm::BumpPtrAllocator Allocator;
  314. // We do not parse any commands, so CommentOptions are ignored by
  315. // comments::Lexer. Therefore, we just use default-constructed options.
  316. CommentOptions DefOpts;
  317. comments::CommandTraits EmptyTraits(Allocator, DefOpts);
  318. comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
  319. CommentText.begin(), CommentText.end(),
  320. /*ParseCommands=*/false);
  321. std::string Result;
  322. // A column number of the first non-whitespace token in the comment text.
  323. // We skip whitespace up to this column, but keep the whitespace after this
  324. // column. IndentColumn is calculated when lexing the first line and reused
  325. // for the rest of lines.
  326. unsigned IndentColumn = 0;
  327. // Processes one line of the comment and adds it to the result.
  328. // Handles skipping the indent at the start of the line.
  329. // Returns false when eof is reached and true otherwise.
  330. auto LexLine = [&](bool IsFirstLine) -> bool {
  331. comments::Token Tok;
  332. // Lex the first token on the line. We handle it separately, because we to
  333. // fix up its indentation.
  334. L.lex(Tok);
  335. if (Tok.is(comments::tok::eof))
  336. return false;
  337. if (Tok.is(comments::tok::newline)) {
  338. Result += "\n";
  339. return true;
  340. }
  341. llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
  342. bool LocInvalid = false;
  343. unsigned TokColumn =
  344. SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
  345. assert(!LocInvalid && "getFormattedText for invalid location");
  346. // Amount of leading whitespace in TokText.
  347. size_t WhitespaceLen = TokText.find_first_not_of(" \t");
  348. if (WhitespaceLen == StringRef::npos)
  349. WhitespaceLen = TokText.size();
  350. // Remember the amount of whitespace we skipped in the first line to remove
  351. // indent up to that column in the following lines.
  352. if (IsFirstLine)
  353. IndentColumn = TokColumn + WhitespaceLen;
  354. // Amount of leading whitespace we actually want to skip.
  355. // For the first line we skip all the whitespace.
  356. // For the rest of the lines, we skip whitespace up to IndentColumn.
  357. unsigned SkipLen =
  358. IsFirstLine
  359. ? WhitespaceLen
  360. : std::min<size_t>(
  361. WhitespaceLen,
  362. std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
  363. llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
  364. Result += Trimmed;
  365. // Lex all tokens in the rest of the line.
  366. for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
  367. if (Tok.is(comments::tok::newline)) {
  368. Result += "\n";
  369. return true;
  370. }
  371. Result += L.getSpelling(Tok, SourceMgr);
  372. }
  373. // We've reached the end of file token.
  374. return false;
  375. };
  376. auto DropTrailingNewLines = [](std::string &Str) {
  377. while (Str.back() == '\n')
  378. Str.pop_back();
  379. };
  380. // Process first line separately to remember indent for the following lines.
  381. if (!LexLine(/*IsFirstLine=*/true)) {
  382. DropTrailingNewLines(Result);
  383. return Result;
  384. }
  385. // Process the rest of the lines.
  386. while (LexLine(/*IsFirstLine=*/false))
  387. ;
  388. DropTrailingNewLines(Result);
  389. return Result;
  390. }