YAMLParser.cpp 68 KB


  1. //===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements a YAML parser.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "llvm/Support/YAMLParser.h"
  14. #include "llvm/ADT/AllocatorList.h"
  15. #include "llvm/ADT/ArrayRef.h"
  16. #include "llvm/ADT/None.h"
  17. #include "llvm/ADT/STLExtras.h"
  18. #include "llvm/ADT/SmallString.h"
  19. #include "llvm/ADT/SmallVector.h"
  20. #include "llvm/ADT/StringExtras.h"
  21. #include "llvm/ADT/StringRef.h"
  22. #include "llvm/ADT/Twine.h"
  23. #include "llvm/Support/Compiler.h"
  24. #include "llvm/Support/ErrorHandling.h"
  25. #include "llvm/Support/MemoryBuffer.h"
  26. #include "llvm/Support/SMLoc.h"
  27. #include "llvm/Support/SourceMgr.h"
  28. #include "llvm/Support/Unicode.h"
  29. #include "llvm/Support/raw_ostream.h"
  30. #include <algorithm>
  31. #include <cassert>
  32. #include <cstddef>
  33. #include <cstdint>
  34. #include <map>
  35. #include <memory>
  36. #include <string>
  37. #include <system_error>
  38. #include <utility>
  39. using namespace llvm;
  40. using namespace yaml;
  /// The Unicode encoding forms that getUnicodeEncoding can report.
  enum UnicodeEncodingForm {
    UEF_UTF32_LE, ///< UTF-32, little-endian byte order.
    UEF_UTF32_BE, ///< UTF-32, big-endian byte order.
    UEF_UTF16_LE, ///< UTF-16, little-endian byte order.
    UEF_UTF16_BE, ///< UTF-16, big-endian byte order.
    UEF_UTF8,     ///< UTF-8 (which also covers ASCII).
    UEF_Unknown   ///< Not a valid Unicode encoding.
  };

  /// EncodingInfo - Pairs an encoding form (first) with the length of its byte
  /// order mark (second). The length is one of {0, 2, 3, 4}; it is 0 when the
  /// input carries no byte order mark.
  using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
  52. /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
  53. /// encoding form of \a Input.
  54. ///
  55. /// @param Input A string of length 0 or more.
  56. /// @returns An EncodingInfo indicating the Unicode encoding form of the input
  57. /// and how long the byte order mark is if one exists.
  58. static EncodingInfo getUnicodeEncoding(StringRef Input) {
  59. if (Input.empty())
  60. return std::make_pair(UEF_Unknown, 0);
  61. switch (uint8_t(Input[0])) {
  62. case 0x00:
  63. if (Input.size() >= 4) {
  64. if ( Input[1] == 0
  65. && uint8_t(Input[2]) == 0xFE
  66. && uint8_t(Input[3]) == 0xFF)
  67. return std::make_pair(UEF_UTF32_BE, 4);
  68. if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
  69. return std::make_pair(UEF_UTF32_BE, 0);
  70. }
  71. if (Input.size() >= 2 && Input[1] != 0)
  72. return std::make_pair(UEF_UTF16_BE, 0);
  73. return std::make_pair(UEF_Unknown, 0);
  74. case 0xFF:
  75. if ( Input.size() >= 4
  76. && uint8_t(Input[1]) == 0xFE
  77. && Input[2] == 0
  78. && Input[3] == 0)
  79. return std::make_pair(UEF_UTF32_LE, 4);
  80. if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
  81. return std::make_pair(UEF_UTF16_LE, 2);
  82. return std::make_pair(UEF_Unknown, 0);
  83. case 0xFE:
  84. if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
  85. return std::make_pair(UEF_UTF16_BE, 2);
  86. return std::make_pair(UEF_Unknown, 0);
  87. case 0xEF:
  88. if ( Input.size() >= 3
  89. && uint8_t(Input[1]) == 0xBB
  90. && uint8_t(Input[2]) == 0xBF)
  91. return std::make_pair(UEF_UTF8, 3);
  92. return std::make_pair(UEF_Unknown, 0);
  93. }
  94. // It could still be utf-32 or utf-16.
  95. if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
  96. return std::make_pair(UEF_UTF32_LE, 0);
  97. if (Input.size() >= 2 && Input[1] == 0)
  98. return std::make_pair(UEF_UTF16_LE, 0);
  99. return std::make_pair(UEF_UTF8, 0);
  100. }
  101. /// Pin the vtables to this file.
  102. void Node::anchor() {}
  103. void NullNode::anchor() {}
  104. void ScalarNode::anchor() {}
  105. void BlockScalarNode::anchor() {}
  106. void KeyValueNode::anchor() {}
  107. void MappingNode::anchor() {}
  108. void SequenceNode::anchor() {}
  109. void AliasNode::anchor() {}
  110. namespace llvm {
  111. namespace yaml {
  112. /// Token - A single YAML token.
  113. struct Token {
  114. enum TokenKind {
  115. TK_Error, // Uninitialized token.
  116. TK_StreamStart,
  117. TK_StreamEnd,
  118. TK_VersionDirective,
  119. TK_TagDirective,
  120. TK_DocumentStart,
  121. TK_DocumentEnd,
  122. TK_BlockEntry,
  123. TK_BlockEnd,
  124. TK_BlockSequenceStart,
  125. TK_BlockMappingStart,
  126. TK_FlowEntry,
  127. TK_FlowSequenceStart,
  128. TK_FlowSequenceEnd,
  129. TK_FlowMappingStart,
  130. TK_FlowMappingEnd,
  131. TK_Key,
  132. TK_Value,
  133. TK_Scalar,
  134. TK_BlockScalar,
  135. TK_Alias,
  136. TK_Anchor,
  137. TK_Tag
  138. } Kind = TK_Error;
  139. /// A string of length 0 or more whose begin() points to the logical location
  140. /// of the token in the input.
  141. StringRef Range;
  142. /// The value of a block scalar node.
  143. std::string Value;
  144. Token() = default;
  145. };
  146. } // end namespace yaml
  147. } // end namespace llvm
  148. using TokenQueueT = BumpPtrList<Token>;
  149. namespace {
  150. /// @brief This struct is used to track simple keys.
  151. ///
  152. /// Simple keys are handled by creating an entry in SimpleKeys for each Token
  153. /// which could legally be the start of a simple key. When peekNext is called,
  154. /// if the Token To be returned is referenced by a SimpleKey, we continue
  155. /// tokenizing until that potential simple key has either been found to not be
  156. /// a simple key (we moved on to the next line or went further than 1024 chars).
  157. /// Or when we run into a Value, and then insert a Key token (and possibly
  158. /// others) before the SimpleKey's Tok.
  159. struct SimpleKey {
  160. TokenQueueT::iterator Tok;
  161. unsigned Column;
  162. unsigned Line;
  163. unsigned FlowLevel;
  164. bool IsRequired;
  165. bool operator ==(const SimpleKey &Other) {
  166. return Tok == Other.Tok;
  167. }
  168. };
  169. } // end anonymous namespace
  170. /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit
  171. /// subsequence and the subsequence's length in code units (uint8_t).
  172. /// A length of 0 represents an error.
  173. using UTF8Decoded = std::pair<uint32_t, unsigned>;
  174. static UTF8Decoded decodeUTF8(StringRef Range) {
  175. StringRef::iterator Position= Range.begin();
  176. StringRef::iterator End = Range.end();
  177. // 1 byte: [0x00, 0x7f]
  178. // Bit pattern: 0xxxxxxx
  179. if ((*Position & 0x80) == 0) {
  180. return std::make_pair(*Position, 1);
  181. }
  182. // 2 bytes: [0x80, 0x7ff]
  183. // Bit pattern: 110xxxxx 10xxxxxx
  184. if (Position + 1 != End &&
  185. ((*Position & 0xE0) == 0xC0) &&
  186. ((*(Position + 1) & 0xC0) == 0x80)) {
  187. uint32_t codepoint = ((*Position & 0x1F) << 6) |
  188. (*(Position + 1) & 0x3F);
  189. if (codepoint >= 0x80)
  190. return std::make_pair(codepoint, 2);
  191. }
  192. // 3 bytes: [0x8000, 0xffff]
  193. // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
  194. if (Position + 2 != End &&
  195. ((*Position & 0xF0) == 0xE0) &&
  196. ((*(Position + 1) & 0xC0) == 0x80) &&
  197. ((*(Position + 2) & 0xC0) == 0x80)) {
  198. uint32_t codepoint = ((*Position & 0x0F) << 12) |
  199. ((*(Position + 1) & 0x3F) << 6) |
  200. (*(Position + 2) & 0x3F);
  201. // Codepoints between 0xD800 and 0xDFFF are invalid, as
  202. // they are high / low surrogate halves used by UTF-16.
  203. if (codepoint >= 0x800 &&
  204. (codepoint < 0xD800 || codepoint > 0xDFFF))
  205. return std::make_pair(codepoint, 3);
  206. }
  207. // 4 bytes: [0x10000, 0x10FFFF]
  208. // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  209. if (Position + 3 != End &&
  210. ((*Position & 0xF8) == 0xF0) &&
  211. ((*(Position + 1) & 0xC0) == 0x80) &&
  212. ((*(Position + 2) & 0xC0) == 0x80) &&
  213. ((*(Position + 3) & 0xC0) == 0x80)) {
  214. uint32_t codepoint = ((*Position & 0x07) << 18) |
  215. ((*(Position + 1) & 0x3F) << 12) |
  216. ((*(Position + 2) & 0x3F) << 6) |
  217. (*(Position + 3) & 0x3F);
  218. if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
  219. return std::make_pair(codepoint, 4);
  220. }
  221. return std::make_pair(0, 0);
  222. }
  223. namespace llvm {
  224. namespace yaml {
  225. /// @brief Scans YAML tokens from a MemoryBuffer.
  226. class Scanner {
  227. public:
  228. Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
  229. std::error_code *EC = nullptr);
  230. Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
  231. std::error_code *EC = nullptr);
  232. /// @brief Parse the next token and return it without popping it.
  233. Token &peekNext();
  234. /// @brief Parse the next token and pop it from the queue.
  235. Token getNext();
  236. void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
  237. ArrayRef<SMRange> Ranges = None) {
  238. SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
  239. }
  240. void setError(const Twine &Message, StringRef::iterator Position) {
  241. if (Current >= End)
  242. Current = End - 1;
  243. // propagate the error if possible
  244. if (EC)
  245. *EC = make_error_code(std::errc::invalid_argument);
  246. // Don't print out more errors after the first one we encounter. The rest
  247. // are just the result of the first, and have no meaning.
  248. if (!Failed)
  249. printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
  250. Failed = true;
  251. }
  252. void setError(const Twine &Message) {
  253. setError(Message, Current);
  254. }
  255. /// @brief Returns true if an error occurred while parsing.
  256. bool failed() {
  257. return Failed;
  258. }
  259. private:
  260. void init(MemoryBufferRef Buffer);
  261. StringRef currentInput() {
  262. return StringRef(Current, End - Current);
  263. }
  264. /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting
  265. /// at \a Position.
  266. ///
  267. /// If the UTF-8 code units starting at Position do not form a well-formed
  268. /// code unit subsequence, then the Unicode scalar value is 0, and the length
  269. /// is 0.
  270. UTF8Decoded decodeUTF8(StringRef::iterator Position) {
  271. return ::decodeUTF8(StringRef(Position, End - Position));
  272. }
  273. // The following functions are based on the gramar rules in the YAML spec. The
  274. // style of the function names it meant to closely match how they are written
  275. // in the spec. The number within the [] is the number of the grammar rule in
  276. // the spec.
  277. //
  278. // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
  279. //
  280. // c-
  281. // A production starting and ending with a special character.
  282. // b-
  283. // A production matching a single line break.
  284. // nb-
  285. // A production starting and ending with a non-break character.
  286. // s-
  287. // A production starting and ending with a white space character.
  288. // ns-
  289. // A production starting and ending with a non-space character.
  290. // l-
  291. // A production matching complete line(s).
  292. /// @brief Skip a single nb-char[27] starting at Position.
  293. ///
  294. /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
  295. /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
  296. ///
  297. /// @returns The code unit after the nb-char, or Position if it's not an
  298. /// nb-char.
  299. StringRef::iterator skip_nb_char(StringRef::iterator Position);
  300. /// @brief Skip a single b-break[28] starting at Position.
  301. ///
  302. /// A b-break is 0xD 0xA | 0xD | 0xA
  303. ///
  304. /// @returns The code unit after the b-break, or Position if it's not a
  305. /// b-break.
  306. StringRef::iterator skip_b_break(StringRef::iterator Position);
  307. /// Skip a single s-space[31] starting at Position.
  308. ///
  309. /// An s-space is 0x20
  310. ///
  311. /// @returns The code unit after the s-space, or Position if it's not a
  312. /// s-space.
  313. StringRef::iterator skip_s_space(StringRef::iterator Position);
  314. /// @brief Skip a single s-white[33] starting at Position.
  315. ///
  316. /// A s-white is 0x20 | 0x9
  317. ///
  318. /// @returns The code unit after the s-white, or Position if it's not a
  319. /// s-white.
  320. StringRef::iterator skip_s_white(StringRef::iterator Position);
  321. /// @brief Skip a single ns-char[34] starting at Position.
  322. ///
  323. /// A ns-char is nb-char - s-white
  324. ///
  325. /// @returns The code unit after the ns-char, or Position if it's not a
  326. /// ns-char.
  327. StringRef::iterator skip_ns_char(StringRef::iterator Position);
  328. using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
  329. /// @brief Skip minimal well-formed code unit subsequences until Func
  330. /// returns its input.
  331. ///
  332. /// @returns The code unit after the last minimal well-formed code unit
  333. /// subsequence that Func accepted.
  334. StringRef::iterator skip_while( SkipWhileFunc Func
  335. , StringRef::iterator Position);
  336. /// Skip minimal well-formed code unit subsequences until Func returns its
  337. /// input.
  338. void advanceWhile(SkipWhileFunc Func);
  339. /// @brief Scan ns-uri-char[39]s starting at Cur.
  340. ///
  341. /// This updates Cur and Column while scanning.
  342. void scan_ns_uri_char();
  343. /// @brief Consume a minimal well-formed code unit subsequence starting at
  344. /// \a Cur. Return false if it is not the same Unicode scalar value as
  345. /// \a Expected. This updates \a Column.
  346. bool consume(uint32_t Expected);
  347. /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
  348. void skip(uint32_t Distance);
  349. /// @brief Return true if the minimal well-formed code unit subsequence at
  350. /// Pos is whitespace or a new line
  351. bool isBlankOrBreak(StringRef::iterator Position);
  352. /// Consume a single b-break[28] if it's present at the current position.
  353. ///
  354. /// Return false if the code unit at the current position isn't a line break.
  355. bool consumeLineBreakIfPresent();
  356. /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
  357. void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
  358. , unsigned AtColumn
  359. , bool IsRequired);
  360. /// @brief Remove simple keys that can no longer be valid simple keys.
  361. ///
  362. /// Invalid simple keys are not on the current line or are further than 1024
  363. /// columns back.
  364. void removeStaleSimpleKeyCandidates();
  365. /// @brief Remove all simple keys on FlowLevel \a Level.
  366. void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
  367. /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
  368. /// tokens if needed.
  369. bool unrollIndent(int ToColumn);
  370. /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
  371. /// if needed.
  372. bool rollIndent( int ToColumn
  373. , Token::TokenKind Kind
  374. , TokenQueueT::iterator InsertPoint);
  375. /// @brief Skip a single-line comment when the comment starts at the current
  376. /// position of the scanner.
  377. void skipComment();
  378. /// @brief Skip whitespace and comments until the start of the next token.
  379. void scanToNextToken();
  380. /// @brief Must be the first token generated.
  381. bool scanStreamStart();
  382. /// @brief Generate tokens needed to close out the stream.
  383. bool scanStreamEnd();
  384. /// @brief Scan a %BLAH directive.
  385. bool scanDirective();
  386. /// @brief Scan a ... or ---.
  387. bool scanDocumentIndicator(bool IsStart);
  388. /// @brief Scan a [ or { and generate the proper flow collection start token.
  389. bool scanFlowCollectionStart(bool IsSequence);
  390. /// @brief Scan a ] or } and generate the proper flow collection end token.
  391. bool scanFlowCollectionEnd(bool IsSequence);
  392. /// @brief Scan the , that separates entries in a flow collection.
  393. bool scanFlowEntry();
  394. /// @brief Scan the - that starts block sequence entries.
  395. bool scanBlockEntry();
  396. /// @brief Scan an explicit ? indicating a key.
  397. bool scanKey();
  398. /// @brief Scan an explicit : indicating a value.
  399. bool scanValue();
  400. /// @brief Scan a quoted scalar.
  401. bool scanFlowScalar(bool IsDoubleQuoted);
  402. /// @brief Scan an unquoted scalar.
  403. bool scanPlainScalar();
  404. /// @brief Scan an Alias or Anchor starting with * or &.
  405. bool scanAliasOrAnchor(bool IsAlias);
  406. /// @brief Scan a block scalar starting with | or >.
  407. bool scanBlockScalar(bool IsLiteral);
  408. /// Scan a chomping indicator in a block scalar header.
  409. char scanBlockChompingIndicator();
  410. /// Scan an indentation indicator in a block scalar header.
  411. unsigned scanBlockIndentationIndicator();
  412. /// Scan a block scalar header.
  413. ///
  414. /// Return false if an error occurred.
  415. bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
  416. bool &IsDone);
  417. /// Look for the indentation level of a block scalar.
  418. ///
  419. /// Return false if an error occurred.
  420. bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
  421. unsigned &LineBreaks, bool &IsDone);
  422. /// Scan the indentation of a text line in a block scalar.
  423. ///
  424. /// Return false if an error occurred.
  425. bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
  426. bool &IsDone);
  427. /// @brief Scan a tag of the form !stuff.
  428. bool scanTag();
  429. /// @brief Dispatch to the next scanning function based on \a *Cur.
  430. bool fetchMoreTokens();
  431. /// @brief The SourceMgr used for diagnostics and buffer management.
  432. SourceMgr &SM;
  433. /// @brief The original input.
  434. MemoryBufferRef InputBuffer;
  435. /// @brief The current position of the scanner.
  436. StringRef::iterator Current;
  437. /// @brief The end of the input (one past the last character).
  438. StringRef::iterator End;
  439. /// @brief Current YAML indentation level in spaces.
  440. int Indent;
  441. /// @brief Current column number in Unicode code points.
  442. unsigned Column;
  443. /// @brief Current line number.
  444. unsigned Line;
  445. /// @brief How deep we are in flow style containers. 0 Means at block level.
  446. unsigned FlowLevel;
  447. /// @brief Are we at the start of the stream?
  448. bool IsStartOfStream;
  449. /// @brief Can the next token be the start of a simple key?
  450. bool IsSimpleKeyAllowed;
  451. /// @brief True if an error has occurred.
  452. bool Failed;
  453. /// @brief Should colors be used when printing out the diagnostic messages?
  454. bool ShowColors;
  455. /// @brief Queue of tokens. This is required to queue up tokens while looking
  456. /// for the end of a simple key. And for cases where a single character
  457. /// can produce multiple tokens (e.g. BlockEnd).
  458. TokenQueueT TokenQueue;
  459. /// @brief Indentation levels.
  460. SmallVector<int, 4> Indents;
  461. /// @brief Potential simple keys.
  462. SmallVector<SimpleKey, 4> SimpleKeys;
  463. std::error_code *EC;
  464. };
  465. } // end namespace yaml
  466. } // end namespace llvm
  467. /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
  468. static void encodeUTF8( uint32_t UnicodeScalarValue
  469. , SmallVectorImpl<char> &Result) {
  470. if (UnicodeScalarValue <= 0x7F) {
  471. Result.push_back(UnicodeScalarValue & 0x7F);
  472. } else if (UnicodeScalarValue <= 0x7FF) {
  473. uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
  474. uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
  475. Result.push_back(FirstByte);
  476. Result.push_back(SecondByte);
  477. } else if (UnicodeScalarValue <= 0xFFFF) {
  478. uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
  479. uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
  480. uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
  481. Result.push_back(FirstByte);
  482. Result.push_back(SecondByte);
  483. Result.push_back(ThirdByte);
  484. } else if (UnicodeScalarValue <= 0x10FFFF) {
  485. uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
  486. uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
  487. uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
  488. uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
  489. Result.push_back(FirstByte);
  490. Result.push_back(SecondByte);
  491. Result.push_back(ThirdByte);
  492. Result.push_back(FourthByte);
  493. }
  494. }
  495. bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
  496. SourceMgr SM;
  497. Scanner scanner(Input, SM);
  498. while (true) {
  499. Token T = scanner.getNext();
  500. switch (T.Kind) {
  501. case Token::TK_StreamStart:
  502. OS << "Stream-Start: ";
  503. break;
  504. case Token::TK_StreamEnd:
  505. OS << "Stream-End: ";
  506. break;
  507. case Token::TK_VersionDirective:
  508. OS << "Version-Directive: ";
  509. break;
  510. case Token::TK_TagDirective:
  511. OS << "Tag-Directive: ";
  512. break;
  513. case Token::TK_DocumentStart:
  514. OS << "Document-Start: ";
  515. break;
  516. case Token::TK_DocumentEnd:
  517. OS << "Document-End: ";
  518. break;
  519. case Token::TK_BlockEntry:
  520. OS << "Block-Entry: ";
  521. break;
  522. case Token::TK_BlockEnd:
  523. OS << "Block-End: ";
  524. break;
  525. case Token::TK_BlockSequenceStart:
  526. OS << "Block-Sequence-Start: ";
  527. break;
  528. case Token::TK_BlockMappingStart:
  529. OS << "Block-Mapping-Start: ";
  530. break;
  531. case Token::TK_FlowEntry:
  532. OS << "Flow-Entry: ";
  533. break;
  534. case Token::TK_FlowSequenceStart:
  535. OS << "Flow-Sequence-Start: ";
  536. break;
  537. case Token::TK_FlowSequenceEnd:
  538. OS << "Flow-Sequence-End: ";
  539. break;
  540. case Token::TK_FlowMappingStart:
  541. OS << "Flow-Mapping-Start: ";
  542. break;
  543. case Token::TK_FlowMappingEnd:
  544. OS << "Flow-Mapping-End: ";
  545. break;
  546. case Token::TK_Key:
  547. OS << "Key: ";
  548. break;
  549. case Token::TK_Value:
  550. OS << "Value: ";
  551. break;
  552. case Token::TK_Scalar:
  553. OS << "Scalar: ";
  554. break;
  555. case Token::TK_BlockScalar:
  556. OS << "Block Scalar: ";
  557. break;
  558. case Token::TK_Alias:
  559. OS << "Alias: ";
  560. break;
  561. case Token::TK_Anchor:
  562. OS << "Anchor: ";
  563. break;
  564. case Token::TK_Tag:
  565. OS << "Tag: ";
  566. break;
  567. case Token::TK_Error:
  568. break;
  569. }
  570. OS << T.Range << "\n";
  571. if (T.Kind == Token::TK_StreamEnd)
  572. break;
  573. else if (T.Kind == Token::TK_Error)
  574. return false;
  575. }
  576. return true;
  577. }
  578. bool yaml::scanTokens(StringRef Input) {
  579. SourceMgr SM;
  580. Scanner scanner(Input, SM);
  581. while (true) {
  582. Token T = scanner.getNext();
  583. if (T.Kind == Token::TK_StreamEnd)
  584. break;
  585. else if (T.Kind == Token::TK_Error)
  586. return false;
  587. }
  588. return true;
  589. }
  590. std::string yaml::escape(StringRef Input, bool EscapePrintable) {
  591. std::string EscapedInput;
  592. for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
  593. if (*i == '\\')
  594. EscapedInput += "\\\\";
  595. else if (*i == '"')
  596. EscapedInput += "\\\"";
  597. else if (*i == 0)
  598. EscapedInput += "\\0";
  599. else if (*i == 0x07)
  600. EscapedInput += "\\a";
  601. else if (*i == 0x08)
  602. EscapedInput += "\\b";
  603. else if (*i == 0x09)
  604. EscapedInput += "\\t";
  605. else if (*i == 0x0A)
  606. EscapedInput += "\\n";
  607. else if (*i == 0x0B)
  608. EscapedInput += "\\v";
  609. else if (*i == 0x0C)
  610. EscapedInput += "\\f";
  611. else if (*i == 0x0D)
  612. EscapedInput += "\\r";
  613. else if (*i == 0x1B)
  614. EscapedInput += "\\e";
  615. else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
  616. std::string HexStr = utohexstr(*i);
  617. EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
  618. } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
  619. UTF8Decoded UnicodeScalarValue
  620. = decodeUTF8(StringRef(i, Input.end() - i));
  621. if (UnicodeScalarValue.second == 0) {
  622. // Found invalid char.
  623. SmallString<4> Val;
  624. encodeUTF8(0xFFFD, Val);
  625. EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end());
  626. // FIXME: Error reporting.
  627. return EscapedInput;
  628. }
  629. if (UnicodeScalarValue.first == 0x85)
  630. EscapedInput += "\\N";
  631. else if (UnicodeScalarValue.first == 0xA0)
  632. EscapedInput += "\\_";
  633. else if (UnicodeScalarValue.first == 0x2028)
  634. EscapedInput += "\\L";
  635. else if (UnicodeScalarValue.first == 0x2029)
  636. EscapedInput += "\\P";
  637. else if (!EscapePrintable &&
  638. sys::unicode::isPrintable(UnicodeScalarValue.first))
  639. EscapedInput += StringRef(i, UnicodeScalarValue.second);
  640. else {
  641. std::string HexStr = utohexstr(UnicodeScalarValue.first);
  642. if (HexStr.size() <= 2)
  643. EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
  644. else if (HexStr.size() <= 4)
  645. EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
  646. else if (HexStr.size() <= 8)
  647. EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
  648. }
  649. i += UnicodeScalarValue.second - 1;
  650. } else
  651. EscapedInput.push_back(*i);
  652. }
  653. return EscapedInput;
  654. }
  655. Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors,
  656. std::error_code *EC)
  657. : SM(sm), ShowColors(ShowColors), EC(EC) {
  658. init(MemoryBufferRef(Input, "YAML"));
  659. }
  660. Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors,
  661. std::error_code *EC)
  662. : SM(SM_), ShowColors(ShowColors), EC(EC) {
  663. init(Buffer);
  664. }
  665. void Scanner::init(MemoryBufferRef Buffer) {
  666. InputBuffer = Buffer;
  667. Current = InputBuffer.getBufferStart();
  668. End = InputBuffer.getBufferEnd();
  669. Indent = -1;
  670. Column = 0;
  671. Line = 0;
  672. FlowLevel = 0;
  673. IsStartOfStream = true;
  674. IsSimpleKeyAllowed = true;
  675. Failed = false;
  676. std::unique_ptr<MemoryBuffer> InputBufferOwner =
  677. MemoryBuffer::getMemBuffer(Buffer);
  678. SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
  679. }
  680. Token &Scanner::peekNext() {
  681. // If the current token is a possible simple key, keep parsing until we
  682. // can confirm.
  683. bool NeedMore = false;
  684. while (true) {
  685. if (TokenQueue.empty() || NeedMore) {
  686. if (!fetchMoreTokens()) {
  687. TokenQueue.clear();
  688. TokenQueue.push_back(Token());
  689. return TokenQueue.front();
  690. }
  691. }
  692. assert(!TokenQueue.empty() &&
  693. "fetchMoreTokens lied about getting tokens!");
  694. removeStaleSimpleKeyCandidates();
  695. SimpleKey SK;
  696. SK.Tok = TokenQueue.begin();
  697. if (!is_contained(SimpleKeys, SK))
  698. break;
  699. else
  700. NeedMore = true;
  701. }
  702. return TokenQueue.front();
  703. }
  704. Token Scanner::getNext() {
  705. Token Ret = peekNext();
  706. // TokenQueue can be empty if there was an error getting the next token.
  707. if (!TokenQueue.empty())
  708. TokenQueue.pop_front();
  709. // There cannot be any referenced Token's if the TokenQueue is empty. So do a
  710. // quick deallocation of them all.
  711. if (TokenQueue.empty())
  712. TokenQueue.resetAlloc();
  713. return Ret;
  714. }
  715. StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
  716. if (Position == End)
  717. return Position;
  718. // Check 7 bit c-printable - b-char.
  719. if ( *Position == 0x09
  720. || (*Position >= 0x20 && *Position <= 0x7E))
  721. return Position + 1;
  722. // Check for valid UTF-8.
  723. if (uint8_t(*Position) & 0x80) {
  724. UTF8Decoded u8d = decodeUTF8(Position);
  725. if ( u8d.second != 0
  726. && u8d.first != 0xFEFF
  727. && ( u8d.first == 0x85
  728. || ( u8d.first >= 0xA0
  729. && u8d.first <= 0xD7FF)
  730. || ( u8d.first >= 0xE000
  731. && u8d.first <= 0xFFFD)
  732. || ( u8d.first >= 0x10000
  733. && u8d.first <= 0x10FFFF)))
  734. return Position + u8d.second;
  735. }
  736. return Position;
  737. }
  738. StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
  739. if (Position == End)
  740. return Position;
  741. if (*Position == 0x0D) {
  742. if (Position + 1 != End && *(Position + 1) == 0x0A)
  743. return Position + 2;
  744. return Position + 1;
  745. }
  746. if (*Position == 0x0A)
  747. return Position + 1;
  748. return Position;
  749. }
  750. StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
  751. if (Position == End)
  752. return Position;
  753. if (*Position == ' ')
  754. return Position + 1;
  755. return Position;
  756. }
  757. StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
  758. if (Position == End)
  759. return Position;
  760. if (*Position == ' ' || *Position == '\t')
  761. return Position + 1;
  762. return Position;
  763. }
  764. StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
  765. if (Position == End)
  766. return Position;
  767. if (*Position == ' ' || *Position == '\t')
  768. return Position;
  769. return skip_nb_char(Position);
  770. }
  771. StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
  772. , StringRef::iterator Position) {
  773. while (true) {
  774. StringRef::iterator i = (this->*Func)(Position);
  775. if (i == Position)
  776. break;
  777. Position = i;
  778. }
  779. return Position;
  780. }
  781. void Scanner::advanceWhile(SkipWhileFunc Func) {
  782. auto Final = skip_while(Func, Current);
  783. Column += Final - Current;
  784. Current = Final;
  785. }
  /// Return true if \a C is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
  ///
  /// Letters past 'f'/'F' are rejected; they are not hexadecimal digits and so
  /// cannot form part of a %XX escape.
  static bool is_ns_hex_digit(const char C) {
    return (C >= '0' && C <= '9')
        || (C >= 'a' && C <= 'f')
        || (C >= 'A' && C <= 'F');
  }
  /// Return true if \a C is a word character: a decimal digit, an ASCII letter,
  /// or '-'.
  ///
  /// Decimal digits are accepted so that URIs containing them (for example
  /// "tag:yaml.org,2002:str") are scanned in full.
  static bool is_ns_word_char(const char C) {
    return C == '-'
        || (C >= '0' && C <= '9')
        || (C >= 'a' && C <= 'z')
        || (C >= 'A' && C <= 'Z');
  }
  796. void Scanner::scan_ns_uri_char() {
  797. while (true) {
  798. if (Current == End)
  799. break;
  800. if (( *Current == '%'
  801. && Current + 2 < End
  802. && is_ns_hex_digit(*(Current + 1))
  803. && is_ns_hex_digit(*(Current + 2)))
  804. || is_ns_word_char(*Current)
  805. || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
  806. != StringRef::npos) {
  807. ++Current;
  808. ++Column;
  809. } else
  810. break;
  811. }
  812. }
  813. bool Scanner::consume(uint32_t Expected) {
  814. if (Expected >= 0x80)
  815. report_fatal_error("Not dealing with this yet");
  816. if (Current == End)
  817. return false;
  818. if (uint8_t(*Current) >= 0x80)
  819. report_fatal_error("Not dealing with this yet");
  820. if (uint8_t(*Current) == Expected) {
  821. ++Current;
  822. ++Column;
  823. return true;
  824. }
  825. return false;
  826. }
  827. void Scanner::skip(uint32_t Distance) {
  828. Current += Distance;
  829. Column += Distance;
  830. assert(Current <= End && "Skipped past the end");
  831. }
  832. bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
  833. if (Position == End)
  834. return false;
  835. return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
  836. *Position == '\n';
  837. }
  838. bool Scanner::consumeLineBreakIfPresent() {
  839. auto Next = skip_b_break(Current);
  840. if (Next == Current)
  841. return false;
  842. Column = 0;
  843. ++Line;
  844. Current = Next;
  845. return true;
  846. }
  847. void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
  848. , unsigned AtColumn
  849. , bool IsRequired) {
  850. if (IsSimpleKeyAllowed) {
  851. SimpleKey SK;
  852. SK.Tok = Tok;
  853. SK.Line = Line;
  854. SK.Column = AtColumn;
  855. SK.IsRequired = IsRequired;
  856. SK.FlowLevel = FlowLevel;
  857. SimpleKeys.push_back(SK);
  858. }
  859. }
  860. void Scanner::removeStaleSimpleKeyCandidates() {
  861. for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
  862. i != SimpleKeys.end();) {
  863. if (i->Line != Line || i->Column + 1024 < Column) {
  864. if (i->IsRequired)
  865. setError( "Could not find expected : for simple key"
  866. , i->Tok->Range.begin());
  867. i = SimpleKeys.erase(i);
  868. } else
  869. ++i;
  870. }
  871. }
  872. void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
  873. if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
  874. SimpleKeys.pop_back();
  875. }
  876. bool Scanner::unrollIndent(int ToColumn) {
  877. Token T;
  878. // Indentation is ignored in flow.
  879. if (FlowLevel != 0)
  880. return true;
  881. while (Indent > ToColumn) {
  882. T.Kind = Token::TK_BlockEnd;
  883. T.Range = StringRef(Current, 1);
  884. TokenQueue.push_back(T);
  885. Indent = Indents.pop_back_val();
  886. }
  887. return true;
  888. }
  889. bool Scanner::rollIndent( int ToColumn
  890. , Token::TokenKind Kind
  891. , TokenQueueT::iterator InsertPoint) {
  892. if (FlowLevel)
  893. return true;
  894. if (Indent < ToColumn) {
  895. Indents.push_back(Indent);
  896. Indent = ToColumn;
  897. Token T;
  898. T.Kind = Kind;
  899. T.Range = StringRef(Current, 0);
  900. TokenQueue.insert(InsertPoint, T);
  901. }
  902. return true;
  903. }
  904. void Scanner::skipComment() {
  905. if (*Current != '#')
  906. return;
  907. while (true) {
  908. // This may skip more than one byte, thus Column is only incremented
  909. // for code points.
  910. StringRef::iterator I = skip_nb_char(Current);
  911. if (I == Current)
  912. break;
  913. Current = I;
  914. ++Column;
  915. }
  916. }
  917. void Scanner::scanToNextToken() {
  918. while (true) {
  919. while (*Current == ' ' || *Current == '\t') {
  920. skip(1);
  921. }
  922. skipComment();
  923. // Skip EOL.
  924. StringRef::iterator i = skip_b_break(Current);
  925. if (i == Current)
  926. break;
  927. Current = i;
  928. ++Line;
  929. Column = 0;
  930. // New lines may start a simple key.
  931. if (!FlowLevel)
  932. IsSimpleKeyAllowed = true;
  933. }
  934. }
  935. bool Scanner::scanStreamStart() {
  936. IsStartOfStream = false;
  937. EncodingInfo EI = getUnicodeEncoding(currentInput());
  938. Token T;
  939. T.Kind = Token::TK_StreamStart;
  940. T.Range = StringRef(Current, EI.second);
  941. TokenQueue.push_back(T);
  942. Current += EI.second;
  943. return true;
  944. }
  945. bool Scanner::scanStreamEnd() {
  946. // Force an ending new line if one isn't present.
  947. if (Column != 0) {
  948. Column = 0;
  949. ++Line;
  950. }
  951. unrollIndent(-1);
  952. SimpleKeys.clear();
  953. IsSimpleKeyAllowed = false;
  954. Token T;
  955. T.Kind = Token::TK_StreamEnd;
  956. T.Range = StringRef(Current, 0);
  957. TokenQueue.push_back(T);
  958. return true;
  959. }
  960. bool Scanner::scanDirective() {
  961. // Reset the indentation level.
  962. unrollIndent(-1);
  963. SimpleKeys.clear();
  964. IsSimpleKeyAllowed = false;
  965. StringRef::iterator Start = Current;
  966. consume('%');
  967. StringRef::iterator NameStart = Current;
  968. Current = skip_while(&Scanner::skip_ns_char, Current);
  969. StringRef Name(NameStart, Current - NameStart);
  970. Current = skip_while(&Scanner::skip_s_white, Current);
  971. Token T;
  972. if (Name == "YAML") {
  973. Current = skip_while(&Scanner::skip_ns_char, Current);
  974. T.Kind = Token::TK_VersionDirective;
  975. T.Range = StringRef(Start, Current - Start);
  976. TokenQueue.push_back(T);
  977. return true;
  978. } else if(Name == "TAG") {
  979. Current = skip_while(&Scanner::skip_ns_char, Current);
  980. Current = skip_while(&Scanner::skip_s_white, Current);
  981. Current = skip_while(&Scanner::skip_ns_char, Current);
  982. T.Kind = Token::TK_TagDirective;
  983. T.Range = StringRef(Start, Current - Start);
  984. TokenQueue.push_back(T);
  985. return true;
  986. }
  987. return false;
  988. }
  989. bool Scanner::scanDocumentIndicator(bool IsStart) {
  990. unrollIndent(-1);
  991. SimpleKeys.clear();
  992. IsSimpleKeyAllowed = false;
  993. Token T;
  994. T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
  995. T.Range = StringRef(Current, 3);
  996. skip(3);
  997. TokenQueue.push_back(T);
  998. return true;
  999. }
  1000. bool Scanner::scanFlowCollectionStart(bool IsSequence) {
  1001. Token T;
  1002. T.Kind = IsSequence ? Token::TK_FlowSequenceStart
  1003. : Token::TK_FlowMappingStart;
  1004. T.Range = StringRef(Current, 1);
  1005. skip(1);
  1006. TokenQueue.push_back(T);
  1007. // [ and { may begin a simple key.
  1008. saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
  1009. // And may also be followed by a simple key.
  1010. IsSimpleKeyAllowed = true;
  1011. ++FlowLevel;
  1012. return true;
  1013. }
  1014. bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
  1015. removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  1016. IsSimpleKeyAllowed = false;
  1017. Token T;
  1018. T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
  1019. : Token::TK_FlowMappingEnd;
  1020. T.Range = StringRef(Current, 1);
  1021. skip(1);
  1022. TokenQueue.push_back(T);
  1023. if (FlowLevel)
  1024. --FlowLevel;
  1025. return true;
  1026. }
  1027. bool Scanner::scanFlowEntry() {
  1028. removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  1029. IsSimpleKeyAllowed = true;
  1030. Token T;
  1031. T.Kind = Token::TK_FlowEntry;
  1032. T.Range = StringRef(Current, 1);
  1033. skip(1);
  1034. TokenQueue.push_back(T);
  1035. return true;
  1036. }
  1037. bool Scanner::scanBlockEntry() {
  1038. rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
  1039. removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  1040. IsSimpleKeyAllowed = true;
  1041. Token T;
  1042. T.Kind = Token::TK_BlockEntry;
  1043. T.Range = StringRef(Current, 1);
  1044. skip(1);
  1045. TokenQueue.push_back(T);
  1046. return true;
  1047. }
  1048. bool Scanner::scanKey() {
  1049. if (!FlowLevel)
  1050. rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
  1051. removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
  1052. IsSimpleKeyAllowed = !FlowLevel;
  1053. Token T;
  1054. T.Kind = Token::TK_Key;
  1055. T.Range = StringRef(Current, 1);
  1056. skip(1);
  1057. TokenQueue.push_back(T);
  1058. return true;
  1059. }
  1060. bool Scanner::scanValue() {
  1061. // If the previous token could have been a simple key, insert the key token
  1062. // into the token queue.
  1063. if (!SimpleKeys.empty()) {
  1064. SimpleKey SK = SimpleKeys.pop_back_val();
  1065. Token T;
  1066. T.Kind = Token::TK_Key;
  1067. T.Range = SK.Tok->Range;
  1068. TokenQueueT::iterator i, e;
  1069. for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
  1070. if (i == SK.Tok)
  1071. break;
  1072. }
  1073. assert(i != e && "SimpleKey not in token queue!");
  1074. i = TokenQueue.insert(i, T);
  1075. // We may also need to add a Block-Mapping-Start token.
  1076. rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
  1077. IsSimpleKeyAllowed = false;
  1078. } else {
  1079. if (!FlowLevel)
  1080. rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
  1081. IsSimpleKeyAllowed = !FlowLevel;
  1082. }
  1083. Token T;
  1084. T.Kind = Token::TK_Value;
  1085. T.Range = StringRef(Current, 1);
  1086. skip(1);
  1087. TokenQueue.push_back(T);
  1088. return true;
  1089. }
  1090. // Forbidding inlining improves performance by roughly 20%.
  1091. // FIXME: Remove once llvm optimizes this to the faster version without hints.
  1092. LLVM_ATTRIBUTE_NOINLINE static bool
  1093. wasEscaped(StringRef::iterator First, StringRef::iterator Position);
  1094. // Returns whether a character at 'Position' was escaped with a leading '\'.
  1095. // 'First' specifies the position of the first character in the string.
  1096. static bool wasEscaped(StringRef::iterator First,
  1097. StringRef::iterator Position) {
  1098. assert(Position - 1 >= First);
  1099. StringRef::iterator I = Position - 1;
  1100. // We calculate the number of consecutive '\'s before the current position
  1101. // by iterating backwards through our string.
  1102. while (I >= First && *I == '\\') --I;
  1103. // (Position - 1 - I) now contains the number of '\'s before the current
  1104. // position. If it is odd, the character at 'Position' was escaped.
  1105. return (Position - 1 - I) % 2 == 1;
  1106. }
  1107. bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
  1108. StringRef::iterator Start = Current;
  1109. unsigned ColStart = Column;
  1110. if (IsDoubleQuoted) {
  1111. do {
  1112. ++Current;
  1113. while (Current != End && *Current != '"')
  1114. ++Current;
  1115. // Repeat until the previous character was not a '\' or was an escaped
  1116. // backslash.
  1117. } while ( Current != End
  1118. && *(Current - 1) == '\\'
  1119. && wasEscaped(Start + 1, Current));
  1120. } else {
  1121. skip(1);
  1122. while (true) {
  1123. // Skip a ' followed by another '.
  1124. if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
  1125. skip(2);
  1126. continue;
  1127. } else if (*Current == '\'')
  1128. break;
  1129. StringRef::iterator i = skip_nb_char(Current);
  1130. if (i == Current) {
  1131. i = skip_b_break(Current);
  1132. if (i == Current)
  1133. break;
  1134. Current = i;
  1135. Column = 0;
  1136. ++Line;
  1137. } else {
  1138. if (i == End)
  1139. break;
  1140. Current = i;
  1141. ++Column;
  1142. }
  1143. }
  1144. }
  1145. if (Current == End) {
  1146. setError("Expected quote at end of scalar", Current);
  1147. return false;
  1148. }
  1149. skip(1); // Skip ending quote.
  1150. Token T;
  1151. T.Kind = Token::TK_Scalar;
  1152. T.Range = StringRef(Start, Current - Start);
  1153. TokenQueue.push_back(T);
  1154. saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
  1155. IsSimpleKeyAllowed = false;
  1156. return true;
  1157. }
  1158. bool Scanner::scanPlainScalar() {
  1159. StringRef::iterator Start = Current;
  1160. unsigned ColStart = Column;
  1161. unsigned LeadingBlanks = 0;
  1162. assert(Indent >= -1 && "Indent must be >= -1 !");
  1163. unsigned indent = static_cast<unsigned>(Indent + 1);
  1164. while (true) {
  1165. if (*Current == '#')
  1166. break;
  1167. while (!isBlankOrBreak(Current)) {
  1168. if ( FlowLevel && *Current == ':'
  1169. && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
  1170. setError("Found unexpected ':' while scanning a plain scalar", Current);
  1171. return false;
  1172. }
  1173. // Check for the end of the plain scalar.
  1174. if ( (*Current == ':' && isBlankOrBreak(Current + 1))
  1175. || ( FlowLevel
  1176. && (StringRef(Current, 1).find_first_of(",:?[]{}")
  1177. != StringRef::npos)))
  1178. break;
  1179. StringRef::iterator i = skip_nb_char(Current);
  1180. if (i == Current)
  1181. break;
  1182. Current = i;
  1183. ++Column;
  1184. }
  1185. // Are we at the end?
  1186. if (!isBlankOrBreak(Current))
  1187. break;
  1188. // Eat blanks.
  1189. StringRef::iterator Tmp = Current;
  1190. while (isBlankOrBreak(Tmp)) {
  1191. StringRef::iterator i = skip_s_white(Tmp);
  1192. if (i != Tmp) {
  1193. if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
  1194. setError("Found invalid tab character in indentation", Tmp);
  1195. return false;
  1196. }
  1197. Tmp = i;
  1198. ++Column;
  1199. } else {
  1200. i = skip_b_break(Tmp);
  1201. if (!LeadingBlanks)
  1202. LeadingBlanks = 1;
  1203. Tmp = i;
  1204. Column = 0;
  1205. ++Line;
  1206. }
  1207. }
  1208. if (!FlowLevel && Column < indent)
  1209. break;
  1210. Current = Tmp;
  1211. }
  1212. if (Start == Current) {
  1213. setError("Got empty plain scalar", Start);
  1214. return false;
  1215. }
  1216. Token T;
  1217. T.Kind = Token::TK_Scalar;
  1218. T.Range = StringRef(Start, Current - Start);
  1219. TokenQueue.push_back(T);
  1220. // Plain scalars can be simple keys.
  1221. saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
  1222. IsSimpleKeyAllowed = false;
  1223. return true;
  1224. }
  1225. bool Scanner::scanAliasOrAnchor(bool IsAlias) {
  1226. StringRef::iterator Start = Current;
  1227. unsigned ColStart = Column;
  1228. skip(1);
  1229. while(true) {
  1230. if ( *Current == '[' || *Current == ']'
  1231. || *Current == '{' || *Current == '}'
  1232. || *Current == ','
  1233. || *Current == ':')
  1234. break;
  1235. StringRef::iterator i = skip_ns_char(Current);
  1236. if (i == Current)
  1237. break;
  1238. Current = i;
  1239. ++Column;
  1240. }
  1241. if (Start == Current) {
  1242. setError("Got empty alias or anchor", Start);
  1243. return false;
  1244. }
  1245. Token T;
  1246. T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
  1247. T.Range = StringRef(Start, Current - Start);
  1248. TokenQueue.push_back(T);
  1249. // Alias and anchors can be simple keys.
  1250. saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
  1251. IsSimpleKeyAllowed = false;
  1252. return true;
  1253. }
  1254. char Scanner::scanBlockChompingIndicator() {
  1255. char Indicator = ' ';
  1256. if (Current != End && (*Current == '+' || *Current == '-')) {
  1257. Indicator = *Current;
  1258. skip(1);
  1259. }
  1260. return Indicator;
  1261. }
  1262. /// Get the number of line breaks after chomping.
  1263. ///
  1264. /// Return the number of trailing line breaks to emit, depending on
  1265. /// \p ChompingIndicator.
  1266. static unsigned getChompedLineBreaks(char ChompingIndicator,
  1267. unsigned LineBreaks, StringRef Str) {
  1268. if (ChompingIndicator == '-') // Strip all line breaks.
  1269. return 0;
  1270. if (ChompingIndicator == '+') // Keep all line breaks.
  1271. return LineBreaks;
  1272. // Clip trailing lines.
  1273. return Str.empty() ? 0 : 1;
  1274. }
  1275. unsigned Scanner::scanBlockIndentationIndicator() {
  1276. unsigned Indent = 0;
  1277. if (Current != End && (*Current >= '1' && *Current <= '9')) {
  1278. Indent = unsigned(*Current - '0');
  1279. skip(1);
  1280. }
  1281. return Indent;
  1282. }
  1283. bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
  1284. unsigned &IndentIndicator, bool &IsDone) {
  1285. auto Start = Current;
  1286. ChompingIndicator = scanBlockChompingIndicator();
  1287. IndentIndicator = scanBlockIndentationIndicator();
  1288. // Check for the chomping indicator once again.
  1289. if (ChompingIndicator == ' ')
  1290. ChompingIndicator = scanBlockChompingIndicator();
  1291. Current = skip_while(&Scanner::skip_s_white, Current);
  1292. skipComment();
  1293. if (Current == End) { // EOF, we have an empty scalar.
  1294. Token T;
  1295. T.Kind = Token::TK_BlockScalar;
  1296. T.Range = StringRef(Start, Current - Start);
  1297. TokenQueue.push_back(T);
  1298. IsDone = true;
  1299. return true;
  1300. }
  1301. if (!consumeLineBreakIfPresent()) {
  1302. setError("Expected a line break after block scalar header", Current);
  1303. return false;
  1304. }
  1305. return true;
  1306. }
  1307. bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
  1308. unsigned BlockExitIndent,
  1309. unsigned &LineBreaks, bool &IsDone) {
  1310. unsigned MaxAllSpaceLineCharacters = 0;
  1311. StringRef::iterator LongestAllSpaceLine;
  1312. while (true) {
  1313. advanceWhile(&Scanner::skip_s_space);
  1314. if (skip_nb_char(Current) != Current) {
  1315. // This line isn't empty, so try and find the indentation.
  1316. if (Column <= BlockExitIndent) { // End of the block literal.
  1317. IsDone = true;
  1318. return true;
  1319. }
  1320. // We found the block's indentation.
  1321. BlockIndent = Column;
  1322. if (MaxAllSpaceLineCharacters > BlockIndent) {
  1323. setError(
  1324. "Leading all-spaces line must be smaller than the block indent",
  1325. LongestAllSpaceLine);
  1326. return false;
  1327. }
  1328. return true;
  1329. }
  1330. if (skip_b_break(Current) != Current &&
  1331. Column > MaxAllSpaceLineCharacters) {
  1332. // Record the longest all-space line in case it's longer than the
  1333. // discovered block indent.
  1334. MaxAllSpaceLineCharacters = Column;
  1335. LongestAllSpaceLine = Current;
  1336. }
  1337. // Check for EOF.
  1338. if (Current == End) {
  1339. IsDone = true;
  1340. return true;
  1341. }
  1342. if (!consumeLineBreakIfPresent()) {
  1343. IsDone = true;
  1344. return true;
  1345. }
  1346. ++LineBreaks;
  1347. }
  1348. return true;
  1349. }
  1350. bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
  1351. unsigned BlockExitIndent, bool &IsDone) {
  1352. // Skip the indentation.
  1353. while (Column < BlockIndent) {
  1354. auto I = skip_s_space(Current);
  1355. if (I == Current)
  1356. break;
  1357. Current = I;
  1358. ++Column;
  1359. }
  1360. if (skip_nb_char(Current) == Current)
  1361. return true;
  1362. if (Column <= BlockExitIndent) { // End of the block literal.
  1363. IsDone = true;
  1364. return true;
  1365. }
  1366. if (Column < BlockIndent) {
  1367. if (Current != End && *Current == '#') { // Trailing comment.
  1368. IsDone = true;
  1369. return true;
  1370. }
  1371. setError("A text line is less indented than the block scalar", Current);
  1372. return false;
  1373. }
  1374. return true; // A normal text line.
  1375. }
  1376. bool Scanner::scanBlockScalar(bool IsLiteral) {
  1377. // Eat '|' or '>'
  1378. assert(*Current == '|' || *Current == '>');
  1379. skip(1);
  1380. char ChompingIndicator;
  1381. unsigned BlockIndent;
  1382. bool IsDone = false;
  1383. if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
  1384. return false;
  1385. if (IsDone)
  1386. return true;
  1387. auto Start = Current;
  1388. unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
  1389. unsigned LineBreaks = 0;
  1390. if (BlockIndent == 0) {
  1391. if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
  1392. IsDone))
  1393. return false;
  1394. }
  1395. // Scan the block's scalars body.
  1396. SmallString<256> Str;
  1397. while (!IsDone) {
  1398. if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
  1399. return false;
  1400. if (IsDone)
  1401. break;
  1402. // Parse the current line.
  1403. auto LineStart = Current;
  1404. advanceWhile(&Scanner::skip_nb_char);
  1405. if (LineStart != Current) {
  1406. Str.append(LineBreaks, '\n');
  1407. Str.append(StringRef(LineStart, Current - LineStart));
  1408. LineBreaks = 0;
  1409. }
  1410. // Check for EOF.
  1411. if (Current == End)
  1412. break;
  1413. if (!consumeLineBreakIfPresent())
  1414. break;
  1415. ++LineBreaks;
  1416. }
  1417. if (Current == End && !LineBreaks)
  1418. // Ensure that there is at least one line break before the end of file.
  1419. LineBreaks = 1;
  1420. Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
  1421. // New lines may start a simple key.
  1422. if (!FlowLevel)
  1423. IsSimpleKeyAllowed = true;
  1424. Token T;
  1425. T.Kind = Token::TK_BlockScalar;
  1426. T.Range = StringRef(Start, Current - Start);
  1427. T.Value = Str.str().str();
  1428. TokenQueue.push_back(T);
  1429. return true;
  1430. }
  1431. bool Scanner::scanTag() {
  1432. StringRef::iterator Start = Current;
  1433. unsigned ColStart = Column;
  1434. skip(1); // Eat !.
  1435. if (Current == End || isBlankOrBreak(Current)); // An empty tag.
  1436. else if (*Current == '<') {
  1437. skip(1);
  1438. scan_ns_uri_char();
  1439. if (!consume('>'))
  1440. return false;
  1441. } else {
  1442. // FIXME: Actually parse the c-ns-shorthand-tag rule.
  1443. Current = skip_while(&Scanner::skip_ns_char, Current);
  1444. }
  1445. Token T;
  1446. T.Kind = Token::TK_Tag;
  1447. T.Range = StringRef(Start, Current - Start);
  1448. TokenQueue.push_back(T);
  1449. // Tags can be simple keys.
  1450. saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
  1451. IsSimpleKeyAllowed = false;
  1452. return true;
  1453. }
  1454. bool Scanner::fetchMoreTokens() {
  1455. if (IsStartOfStream)
  1456. return scanStreamStart();
  1457. scanToNextToken();
  1458. if (Current == End)
  1459. return scanStreamEnd();
  1460. removeStaleSimpleKeyCandidates();
  1461. unrollIndent(Column);
  1462. if (Column == 0 && *Current == '%')
  1463. return scanDirective();
  1464. if (Column == 0 && Current + 4 <= End
  1465. && *Current == '-'
  1466. && *(Current + 1) == '-'
  1467. && *(Current + 2) == '-'
  1468. && (Current + 3 == End || isBlankOrBreak(Current + 3)))
  1469. return scanDocumentIndicator(true);
  1470. if (Column == 0 && Current + 4 <= End
  1471. && *Current == '.'
  1472. && *(Current + 1) == '.'
  1473. && *(Current + 2) == '.'
  1474. && (Current + 3 == End || isBlankOrBreak(Current + 3)))
  1475. return scanDocumentIndicator(false);
  1476. if (*Current == '[')
  1477. return scanFlowCollectionStart(true);
  1478. if (*Current == '{')
  1479. return scanFlowCollectionStart(false);
  1480. if (*Current == ']')
  1481. return scanFlowCollectionEnd(true);
  1482. if (*Current == '}')
  1483. return scanFlowCollectionEnd(false);
  1484. if (*Current == ',')
  1485. return scanFlowEntry();
  1486. if (*Current == '-' && isBlankOrBreak(Current + 1))
  1487. return scanBlockEntry();
  1488. if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
  1489. return scanKey();
  1490. if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
  1491. return scanValue();
  1492. if (*Current == '*')
  1493. return scanAliasOrAnchor(true);
  1494. if (*Current == '&')
  1495. return scanAliasOrAnchor(false);
  1496. if (*Current == '!')
  1497. return scanTag();
  1498. if (*Current == '|' && !FlowLevel)
  1499. return scanBlockScalar(true);
  1500. if (*Current == '>' && !FlowLevel)
  1501. return scanBlockScalar(false);
  1502. if (*Current == '\'')
  1503. return scanFlowScalar(false);
  1504. if (*Current == '"')
  1505. return scanFlowScalar(true);
  1506. // Get a plain scalar.
  1507. StringRef FirstChar(Current, 1);
  1508. if (!(isBlankOrBreak(Current)
  1509. || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
  1510. || (*Current == '-' && !isBlankOrBreak(Current + 1))
  1511. || (!FlowLevel && (*Current == '?' || *Current == ':')
  1512. && isBlankOrBreak(Current + 1))
  1513. || (!FlowLevel && *Current == ':'
  1514. && Current + 2 < End
  1515. && *(Current + 1) == ':'
  1516. && !isBlankOrBreak(Current + 2)))
  1517. return scanPlainScalar();
  1518. setError("Unrecognized character while tokenizing.");
  1519. return false;
  1520. }
  1521. Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors,
  1522. std::error_code *EC)
  1523. : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {}
  1524. Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
  1525. std::error_code *EC)
  1526. : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {}
  1527. Stream::~Stream() = default;
  1528. bool Stream::failed() { return scanner->failed(); }
  1529. void Stream::printError(Node *N, const Twine &Msg) {
  1530. scanner->printError( N->getSourceRange().Start
  1531. , SourceMgr::DK_Error
  1532. , Msg
  1533. , N->getSourceRange());
  1534. }
  1535. document_iterator Stream::begin() {
  1536. if (CurrentDoc)
  1537. report_fatal_error("Can only iterate over the stream once");
  1538. // Skip Stream-Start.
  1539. scanner->getNext();
  1540. CurrentDoc.reset(new Document(*this));
  1541. return document_iterator(CurrentDoc);
  1542. }
  1543. document_iterator Stream::end() {
  1544. return document_iterator();
  1545. }
  1546. void Stream::skip() {
  1547. for (document_iterator i = begin(), e = end(); i != e; ++i)
  1548. i->skip();
  1549. }
  1550. Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
  1551. StringRef T)
  1552. : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
  1553. SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
  1554. SourceRange = SMRange(Start, Start);
  1555. }
  1556. std::string Node::getVerbatimTag() const {
  1557. StringRef Raw = getRawTag();
  1558. if (!Raw.empty() && Raw != "!") {
  1559. std::string Ret;
  1560. if (Raw.find_last_of('!') == 0) {
  1561. Ret = Doc->getTagMap().find("!")->second;
  1562. Ret += Raw.substr(1);
  1563. return Ret;
  1564. } else if (Raw.startswith("!!")) {
  1565. Ret = Doc->getTagMap().find("!!")->second;
  1566. Ret += Raw.substr(2);
  1567. return Ret;
  1568. } else {
  1569. StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
  1570. std::map<StringRef, StringRef>::const_iterator It =
  1571. Doc->getTagMap().find(TagHandle);
  1572. if (It != Doc->getTagMap().end())
  1573. Ret = It->second;
  1574. else {
  1575. Token T;
  1576. T.Kind = Token::TK_Tag;
  1577. T.Range = TagHandle;
  1578. setError(Twine("Unknown tag handle ") + TagHandle, T);
  1579. }
  1580. Ret += Raw.substr(Raw.find_last_of('!') + 1);
  1581. return Ret;
  1582. }
  1583. }
  1584. switch (getType()) {
  1585. case NK_Null:
  1586. return "tag:yaml.org,2002:null";
  1587. case NK_Scalar:
  1588. case NK_BlockScalar:
  1589. // TODO: Tag resolution.
  1590. return "tag:yaml.org,2002:str";
  1591. case NK_Mapping:
  1592. return "tag:yaml.org,2002:map";
  1593. case NK_Sequence:
  1594. return "tag:yaml.org,2002:seq";
  1595. }
  1596. return "";
  1597. }
  1598. Token &Node::peekNext() {
  1599. return Doc->peekNext();
  1600. }
  1601. Token Node::getNext() {
  1602. return Doc->getNext();
  1603. }
  1604. Node *Node::parseBlockNode() {
  1605. return Doc->parseBlockNode();
  1606. }
  1607. BumpPtrAllocator &Node::getAllocator() {
  1608. return Doc->NodeAllocator;
  1609. }
  1610. void Node::setError(const Twine &Msg, Token &Tok) const {
  1611. Doc->setError(Msg, Tok);
  1612. }
  1613. bool Node::failed() const {
  1614. return Doc->failed();
  1615. }
  1616. StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
  1617. // TODO: Handle newlines properly. We need to remove leading whitespace.
  1618. if (Value[0] == '"') { // Double quoted.
  1619. // Pull off the leading and trailing "s.
  1620. StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
  1621. // Search for characters that would require unescaping the value.
  1622. StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
  1623. if (i != StringRef::npos)
  1624. return unescapeDoubleQuoted(UnquotedValue, i, Storage);
  1625. return UnquotedValue;
  1626. } else if (Value[0] == '\'') { // Single quoted.
  1627. // Pull off the leading and trailing 's.
  1628. StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
  1629. StringRef::size_type i = UnquotedValue.find('\'');
  1630. if (i != StringRef::npos) {
  1631. // We're going to need Storage.
  1632. Storage.clear();
  1633. Storage.reserve(UnquotedValue.size());
  1634. for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
  1635. StringRef Valid(UnquotedValue.begin(), i);
  1636. Storage.insert(Storage.end(), Valid.begin(), Valid.end());
  1637. Storage.push_back('\'');
  1638. UnquotedValue = UnquotedValue.substr(i + 2);
  1639. }
  1640. Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
  1641. return StringRef(Storage.begin(), Storage.size());
  1642. }
  1643. return UnquotedValue;
  1644. }
  1645. // Plain or block.
  1646. return Value.rtrim(' ');
  1647. }
  1648. StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
  1649. , StringRef::size_type i
  1650. , SmallVectorImpl<char> &Storage)
  1651. const {
  1652. // Use Storage to build proper value.
  1653. Storage.clear();
  1654. Storage.reserve(UnquotedValue.size());
  1655. for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
  1656. // Insert all previous chars into Storage.
  1657. StringRef Valid(UnquotedValue.begin(), i);
  1658. Storage.insert(Storage.end(), Valid.begin(), Valid.end());
  1659. // Chop off inserted chars.
  1660. UnquotedValue = UnquotedValue.substr(i);
  1661. assert(!UnquotedValue.empty() && "Can't be empty!");
  1662. // Parse escape or line break.
  1663. switch (UnquotedValue[0]) {
  1664. case '\r':
  1665. case '\n':
  1666. Storage.push_back('\n');
  1667. if ( UnquotedValue.size() > 1
  1668. && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
  1669. UnquotedValue = UnquotedValue.substr(1);
  1670. UnquotedValue = UnquotedValue.substr(1);
  1671. break;
  1672. default:
  1673. if (UnquotedValue.size() == 1)
  1674. // TODO: Report error.
  1675. break;
  1676. UnquotedValue = UnquotedValue.substr(1);
  1677. switch (UnquotedValue[0]) {
  1678. default: {
  1679. Token T;
  1680. T.Range = StringRef(UnquotedValue.begin(), 1);
  1681. setError("Unrecognized escape code!", T);
  1682. return "";
  1683. }
  1684. case '\r':
  1685. case '\n':
  1686. // Remove the new line.
  1687. if ( UnquotedValue.size() > 1
  1688. && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
  1689. UnquotedValue = UnquotedValue.substr(1);
  1690. // If this was just a single byte newline, it will get skipped
  1691. // below.
  1692. break;
  1693. case '0':
  1694. Storage.push_back(0x00);
  1695. break;
  1696. case 'a':
  1697. Storage.push_back(0x07);
  1698. break;
  1699. case 'b':
  1700. Storage.push_back(0x08);
  1701. break;
  1702. case 't':
  1703. case 0x09:
  1704. Storage.push_back(0x09);
  1705. break;
  1706. case 'n':
  1707. Storage.push_back(0x0A);
  1708. break;
  1709. case 'v':
  1710. Storage.push_back(0x0B);
  1711. break;
  1712. case 'f':
  1713. Storage.push_back(0x0C);
  1714. break;
  1715. case 'r':
  1716. Storage.push_back(0x0D);
  1717. break;
  1718. case 'e':
  1719. Storage.push_back(0x1B);
  1720. break;
  1721. case ' ':
  1722. Storage.push_back(0x20);
  1723. break;
  1724. case '"':
  1725. Storage.push_back(0x22);
  1726. break;
  1727. case '/':
  1728. Storage.push_back(0x2F);
  1729. break;
  1730. case '\\':
  1731. Storage.push_back(0x5C);
  1732. break;
  1733. case 'N':
  1734. encodeUTF8(0x85, Storage);
  1735. break;
  1736. case '_':
  1737. encodeUTF8(0xA0, Storage);
  1738. break;
  1739. case 'L':
  1740. encodeUTF8(0x2028, Storage);
  1741. break;
  1742. case 'P':
  1743. encodeUTF8(0x2029, Storage);
  1744. break;
  1745. case 'x': {
  1746. if (UnquotedValue.size() < 3)
  1747. // TODO: Report error.
  1748. break;
  1749. unsigned int UnicodeScalarValue;
  1750. if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
  1751. // TODO: Report error.
  1752. UnicodeScalarValue = 0xFFFD;
  1753. encodeUTF8(UnicodeScalarValue, Storage);
  1754. UnquotedValue = UnquotedValue.substr(2);
  1755. break;
  1756. }
  1757. case 'u': {
  1758. if (UnquotedValue.size() < 5)
  1759. // TODO: Report error.
  1760. break;
  1761. unsigned int UnicodeScalarValue;
  1762. if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
  1763. // TODO: Report error.
  1764. UnicodeScalarValue = 0xFFFD;
  1765. encodeUTF8(UnicodeScalarValue, Storage);
  1766. UnquotedValue = UnquotedValue.substr(4);
  1767. break;
  1768. }
  1769. case 'U': {
  1770. if (UnquotedValue.size() < 9)
  1771. // TODO: Report error.
  1772. break;
  1773. unsigned int UnicodeScalarValue;
  1774. if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
  1775. // TODO: Report error.
  1776. UnicodeScalarValue = 0xFFFD;
  1777. encodeUTF8(UnicodeScalarValue, Storage);
  1778. UnquotedValue = UnquotedValue.substr(8);
  1779. break;
  1780. }
  1781. }
  1782. UnquotedValue = UnquotedValue.substr(1);
  1783. }
  1784. }
  1785. Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end());
  1786. return StringRef(Storage.begin(), Storage.size());
  1787. }
  1788. Node *KeyValueNode::getKey() {
  1789. if (Key)
  1790. return Key;
  1791. // Handle implicit null keys.
  1792. {
  1793. Token &t = peekNext();
  1794. if ( t.Kind == Token::TK_BlockEnd
  1795. || t.Kind == Token::TK_Value
  1796. || t.Kind == Token::TK_Error) {
  1797. return Key = new (getAllocator()) NullNode(Doc);
  1798. }
  1799. if (t.Kind == Token::TK_Key)
  1800. getNext(); // skip TK_Key.
  1801. }
  1802. // Handle explicit null keys.
  1803. Token &t = peekNext();
  1804. if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
  1805. return Key = new (getAllocator()) NullNode(Doc);
  1806. }
  1807. // We've got a normal key.
  1808. return Key = parseBlockNode();
  1809. }
  1810. Node *KeyValueNode::getValue() {
  1811. if (Value)
  1812. return Value;
  1813. getKey()->skip();
  1814. if (failed())
  1815. return Value = new (getAllocator()) NullNode(Doc);
  1816. // Handle implicit null values.
  1817. {
  1818. Token &t = peekNext();
  1819. if ( t.Kind == Token::TK_BlockEnd
  1820. || t.Kind == Token::TK_FlowMappingEnd
  1821. || t.Kind == Token::TK_Key
  1822. || t.Kind == Token::TK_FlowEntry
  1823. || t.Kind == Token::TK_Error) {
  1824. return Value = new (getAllocator()) NullNode(Doc);
  1825. }
  1826. if (t.Kind != Token::TK_Value) {
  1827. setError("Unexpected token in Key Value.", t);
  1828. return Value = new (getAllocator()) NullNode(Doc);
  1829. }
  1830. getNext(); // skip TK_Value.
  1831. }
  1832. // Handle explicit null values.
  1833. Token &t = peekNext();
  1834. if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
  1835. return Value = new (getAllocator()) NullNode(Doc);
  1836. }
  1837. // We got a normal value.
  1838. return Value = parseBlockNode();
  1839. }
  1840. void MappingNode::increment() {
  1841. if (failed()) {
  1842. IsAtEnd = true;
  1843. CurrentEntry = nullptr;
  1844. return;
  1845. }
  1846. if (CurrentEntry) {
  1847. CurrentEntry->skip();
  1848. if (Type == MT_Inline) {
  1849. IsAtEnd = true;
  1850. CurrentEntry = nullptr;
  1851. return;
  1852. }
  1853. }
  1854. Token T = peekNext();
  1855. if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
  1856. // KeyValueNode eats the TK_Key. That way it can detect null keys.
  1857. CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
  1858. } else if (Type == MT_Block) {
  1859. switch (T.Kind) {
  1860. case Token::TK_BlockEnd:
  1861. getNext();
  1862. IsAtEnd = true;
  1863. CurrentEntry = nullptr;
  1864. break;
  1865. default:
  1866. setError("Unexpected token. Expected Key or Block End", T);
  1867. LLVM_FALLTHROUGH;
  1868. case Token::TK_Error:
  1869. IsAtEnd = true;
  1870. CurrentEntry = nullptr;
  1871. }
  1872. } else {
  1873. switch (T.Kind) {
  1874. case Token::TK_FlowEntry:
  1875. // Eat the flow entry and recurse.
  1876. getNext();
  1877. return increment();
  1878. case Token::TK_FlowMappingEnd:
  1879. getNext();
  1880. LLVM_FALLTHROUGH;
  1881. case Token::TK_Error:
  1882. // Set this to end iterator.
  1883. IsAtEnd = true;
  1884. CurrentEntry = nullptr;
  1885. break;
  1886. default:
  1887. setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
  1888. "Mapping End."
  1889. , T);
  1890. IsAtEnd = true;
  1891. CurrentEntry = nullptr;
  1892. }
  1893. }
  1894. }
  1895. void SequenceNode::increment() {
  1896. if (failed()) {
  1897. IsAtEnd = true;
  1898. CurrentEntry = nullptr;
  1899. return;
  1900. }
  1901. if (CurrentEntry)
  1902. CurrentEntry->skip();
  1903. Token T = peekNext();
  1904. if (SeqType == ST_Block) {
  1905. switch (T.Kind) {
  1906. case Token::TK_BlockEntry:
  1907. getNext();
  1908. CurrentEntry = parseBlockNode();
  1909. if (!CurrentEntry) { // An error occurred.
  1910. IsAtEnd = true;
  1911. CurrentEntry = nullptr;
  1912. }
  1913. break;
  1914. case Token::TK_BlockEnd:
  1915. getNext();
  1916. IsAtEnd = true;
  1917. CurrentEntry = nullptr;
  1918. break;
  1919. default:
  1920. setError( "Unexpected token. Expected Block Entry or Block End."
  1921. , T);
  1922. LLVM_FALLTHROUGH;
  1923. case Token::TK_Error:
  1924. IsAtEnd = true;
  1925. CurrentEntry = nullptr;
  1926. }
  1927. } else if (SeqType == ST_Indentless) {
  1928. switch (T.Kind) {
  1929. case Token::TK_BlockEntry:
  1930. getNext();
  1931. CurrentEntry = parseBlockNode();
  1932. if (!CurrentEntry) { // An error occurred.
  1933. IsAtEnd = true;
  1934. CurrentEntry = nullptr;
  1935. }
  1936. break;
  1937. default:
  1938. case Token::TK_Error:
  1939. IsAtEnd = true;
  1940. CurrentEntry = nullptr;
  1941. }
  1942. } else if (SeqType == ST_Flow) {
  1943. switch (T.Kind) {
  1944. case Token::TK_FlowEntry:
  1945. // Eat the flow entry and recurse.
  1946. getNext();
  1947. WasPreviousTokenFlowEntry = true;
  1948. return increment();
  1949. case Token::TK_FlowSequenceEnd:
  1950. getNext();
  1951. LLVM_FALLTHROUGH;
  1952. case Token::TK_Error:
  1953. // Set this to end iterator.
  1954. IsAtEnd = true;
  1955. CurrentEntry = nullptr;
  1956. break;
  1957. case Token::TK_StreamEnd:
  1958. case Token::TK_DocumentEnd:
  1959. case Token::TK_DocumentStart:
  1960. setError("Could not find closing ]!", T);
  1961. // Set this to end iterator.
  1962. IsAtEnd = true;
  1963. CurrentEntry = nullptr;
  1964. break;
  1965. default:
  1966. if (!WasPreviousTokenFlowEntry) {
  1967. setError("Expected , between entries!", T);
  1968. IsAtEnd = true;
  1969. CurrentEntry = nullptr;
  1970. break;
  1971. }
  1972. // Otherwise it must be a flow entry.
  1973. CurrentEntry = parseBlockNode();
  1974. if (!CurrentEntry) {
  1975. IsAtEnd = true;
  1976. }
  1977. WasPreviousTokenFlowEntry = false;
  1978. break;
  1979. }
  1980. }
  1981. }
  1982. Document::Document(Stream &S) : stream(S), Root(nullptr) {
  1983. // Tag maps starts with two default mappings.
  1984. TagMap["!"] = "!";
  1985. TagMap["!!"] = "tag:yaml.org,2002:";
  1986. if (parseDirectives())
  1987. expectToken(Token::TK_DocumentStart);
  1988. Token &T = peekNext();
  1989. if (T.Kind == Token::TK_DocumentStart)
  1990. getNext();
  1991. }
  1992. bool Document::skip() {
  1993. if (stream.scanner->failed())
  1994. return false;
  1995. if (!Root)
  1996. getRoot();
  1997. Root->skip();
  1998. Token &T = peekNext();
  1999. if (T.Kind == Token::TK_StreamEnd)
  2000. return false;
  2001. if (T.Kind == Token::TK_DocumentEnd) {
  2002. getNext();
  2003. return skip();
  2004. }
  2005. return true;
  2006. }
  2007. Token &Document::peekNext() {
  2008. return stream.scanner->peekNext();
  2009. }
  2010. Token Document::getNext() {
  2011. return stream.scanner->getNext();
  2012. }
  2013. void Document::setError(const Twine &Message, Token &Location) const {
  2014. stream.scanner->setError(Message, Location.Range.begin());
  2015. }
  2016. bool Document::failed() const {
  2017. return stream.scanner->failed();
  2018. }
  2019. Node *Document::parseBlockNode() {
  2020. Token T = peekNext();
  2021. // Handle properties.
  2022. Token AnchorInfo;
  2023. Token TagInfo;
  2024. parse_property:
  2025. switch (T.Kind) {
  2026. case Token::TK_Alias:
  2027. getNext();
  2028. return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
  2029. case Token::TK_Anchor:
  2030. if (AnchorInfo.Kind == Token::TK_Anchor) {
  2031. setError("Already encountered an anchor for this node!", T);
  2032. return nullptr;
  2033. }
  2034. AnchorInfo = getNext(); // Consume TK_Anchor.
  2035. T = peekNext();
  2036. goto parse_property;
  2037. case Token::TK_Tag:
  2038. if (TagInfo.Kind == Token::TK_Tag) {
  2039. setError("Already encountered a tag for this node!", T);
  2040. return nullptr;
  2041. }
  2042. TagInfo = getNext(); // Consume TK_Tag.
  2043. T = peekNext();
  2044. goto parse_property;
  2045. default:
  2046. break;
  2047. }
  2048. switch (T.Kind) {
  2049. case Token::TK_BlockEntry:
  2050. // We got an unindented BlockEntry sequence. This is not terminated with
  2051. // a BlockEnd.
  2052. // Don't eat the TK_BlockEntry, SequenceNode needs it.
  2053. return new (NodeAllocator) SequenceNode( stream.CurrentDoc
  2054. , AnchorInfo.Range.substr(1)
  2055. , TagInfo.Range
  2056. , SequenceNode::ST_Indentless);
  2057. case Token::TK_BlockSequenceStart:
  2058. getNext();
  2059. return new (NodeAllocator)
  2060. SequenceNode( stream.CurrentDoc
  2061. , AnchorInfo.Range.substr(1)
  2062. , TagInfo.Range
  2063. , SequenceNode::ST_Block);
  2064. case Token::TK_BlockMappingStart:
  2065. getNext();
  2066. return new (NodeAllocator)
  2067. MappingNode( stream.CurrentDoc
  2068. , AnchorInfo.Range.substr(1)
  2069. , TagInfo.Range
  2070. , MappingNode::MT_Block);
  2071. case Token::TK_FlowSequenceStart:
  2072. getNext();
  2073. return new (NodeAllocator)
  2074. SequenceNode( stream.CurrentDoc
  2075. , AnchorInfo.Range.substr(1)
  2076. , TagInfo.Range
  2077. , SequenceNode::ST_Flow);
  2078. case Token::TK_FlowMappingStart:
  2079. getNext();
  2080. return new (NodeAllocator)
  2081. MappingNode( stream.CurrentDoc
  2082. , AnchorInfo.Range.substr(1)
  2083. , TagInfo.Range
  2084. , MappingNode::MT_Flow);
  2085. case Token::TK_Scalar:
  2086. getNext();
  2087. return new (NodeAllocator)
  2088. ScalarNode( stream.CurrentDoc
  2089. , AnchorInfo.Range.substr(1)
  2090. , TagInfo.Range
  2091. , T.Range);
  2092. case Token::TK_BlockScalar: {
  2093. getNext();
  2094. StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
  2095. StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
  2096. return new (NodeAllocator)
  2097. BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
  2098. TagInfo.Range, StrCopy, T.Range);
  2099. }
  2100. case Token::TK_Key:
  2101. // Don't eat the TK_Key, KeyValueNode expects it.
  2102. return new (NodeAllocator)
  2103. MappingNode( stream.CurrentDoc
  2104. , AnchorInfo.Range.substr(1)
  2105. , TagInfo.Range
  2106. , MappingNode::MT_Inline);
  2107. case Token::TK_DocumentStart:
  2108. case Token::TK_DocumentEnd:
  2109. case Token::TK_StreamEnd:
  2110. default:
  2111. // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
  2112. // !!null null.
  2113. return new (NodeAllocator) NullNode(stream.CurrentDoc);
  2114. case Token::TK_Error:
  2115. return nullptr;
  2116. }
  2117. llvm_unreachable("Control flow shouldn't reach here.");
  2118. return nullptr;
  2119. }
  2120. bool Document::parseDirectives() {
  2121. bool isDirective = false;
  2122. while (true) {
  2123. Token T = peekNext();
  2124. if (T.Kind == Token::TK_TagDirective) {
  2125. parseTAGDirective();
  2126. isDirective = true;
  2127. } else if (T.Kind == Token::TK_VersionDirective) {
  2128. parseYAMLDirective();
  2129. isDirective = true;
  2130. } else
  2131. break;
  2132. }
  2133. return isDirective;
  2134. }
  2135. void Document::parseYAMLDirective() {
  2136. getNext(); // Eat %YAML <version>
  2137. }
  2138. void Document::parseTAGDirective() {
  2139. Token Tag = getNext(); // %TAG <handle> <prefix>
  2140. StringRef T = Tag.Range;
  2141. // Strip %TAG
  2142. T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
  2143. std::size_t HandleEnd = T.find_first_of(" \t");
  2144. StringRef TagHandle = T.substr(0, HandleEnd);
  2145. StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
  2146. TagMap[TagHandle] = TagPrefix;
  2147. }
  2148. bool Document::expectToken(int TK) {
  2149. Token T = getNext();
  2150. if (T.Kind != TK) {
  2151. setError("Unexpected token", T);
  2152. return false;
  2153. }
  2154. return true;
  2155. }