LiteralSupport.cpp 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400
  1. //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements the NumericLiteralParser, CharLiteralParser, and
  11. // StringLiteralParser interfaces.
  12. //
  13. //===----------------------------------------------------------------------===//
  14. #include "clang/Lex/LiteralSupport.h"
  15. #include "clang/Lex/Preprocessor.h"
  16. #include "clang/Lex/LexDiagnostic.h"
  17. #include "clang/Basic/TargetInfo.h"
  18. #include "clang/Basic/ConvertUTF.h"
  19. #include "llvm/ADT/StringExtras.h"
  20. #include "llvm/Support/ErrorHandling.h"
  21. using namespace clang;
  /// HexDigitValue - Return the numeric value (0-15) of the hexadecimal digit
  /// C, accepting both lower-case and upper-case letters, or -1 if C is not a
  /// hexadecimal digit.
  static int HexDigitValue(char C) {
    if (C >= '0' && C <= '9') return C - '0';
    if (C >= 'a' && C <= 'f') return C - 'a' + 10;
    if (C >= 'A' && C <= 'F') return C - 'A' + 10;
    return -1;
  }
  30. static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  31. switch (kind) {
  32. default: llvm_unreachable("Unknown token type!");
  33. case tok::char_constant:
  34. case tok::string_literal:
  35. case tok::utf8_string_literal:
  36. return Target.getCharWidth();
  37. case tok::wide_char_constant:
  38. case tok::wide_string_literal:
  39. return Target.getWCharWidth();
  40. case tok::utf16_char_constant:
  41. case tok::utf16_string_literal:
  42. return Target.getChar16Width();
  43. case tok::utf32_char_constant:
  44. case tok::utf32_string_literal:
  45. return Target.getChar32Width();
  46. }
  47. }
  48. /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  49. /// either a character or a string literal.
  50. static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  51. const char *ThisTokEnd, bool &HadError,
  52. FullSourceLoc Loc, unsigned CharWidth,
  53. DiagnosticsEngine *Diags) {
  54. // Skip the '\' char.
  55. ++ThisTokBuf;
  56. // We know that this character can't be off the end of the buffer, because
  57. // that would have been \", which would not have been the end of string.
  58. unsigned ResultChar = *ThisTokBuf++;
  59. switch (ResultChar) {
  60. // These map to themselves.
  61. case '\\': case '\'': case '"': case '?': break;
  62. // These have fixed mappings.
  63. case 'a':
  64. // TODO: K&R: the meaning of '\\a' is different in traditional C
  65. ResultChar = 7;
  66. break;
  67. case 'b':
  68. ResultChar = 8;
  69. break;
  70. case 'e':
  71. if (Diags)
  72. Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
  73. ResultChar = 27;
  74. break;
  75. case 'E':
  76. if (Diags)
  77. Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
  78. ResultChar = 27;
  79. break;
  80. case 'f':
  81. ResultChar = 12;
  82. break;
  83. case 'n':
  84. ResultChar = 10;
  85. break;
  86. case 'r':
  87. ResultChar = 13;
  88. break;
  89. case 't':
  90. ResultChar = 9;
  91. break;
  92. case 'v':
  93. ResultChar = 11;
  94. break;
  95. case 'x': { // Hex escape.
  96. ResultChar = 0;
  97. if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
  98. if (Diags)
  99. Diags->Report(Loc, diag::err_hex_escape_no_digits);
  100. HadError = 1;
  101. break;
  102. }
  103. // Hex escapes are a maximal series of hex digits.
  104. bool Overflow = false;
  105. for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
  106. int CharVal = HexDigitValue(ThisTokBuf[0]);
  107. if (CharVal == -1) break;
  108. // About to shift out a digit?
  109. Overflow |= (ResultChar & 0xF0000000) ? true : false;
  110. ResultChar <<= 4;
  111. ResultChar |= CharVal;
  112. }
  113. // See if any bits will be truncated when evaluated as a character.
  114. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
  115. Overflow = true;
  116. ResultChar &= ~0U >> (32-CharWidth);
  117. }
  118. // Check for overflow.
  119. if (Overflow && Diags) // Too many digits to fit in
  120. Diags->Report(Loc, diag::warn_hex_escape_too_large);
  121. break;
  122. }
  123. case '0': case '1': case '2': case '3':
  124. case '4': case '5': case '6': case '7': {
  125. // Octal escapes.
  126. --ThisTokBuf;
  127. ResultChar = 0;
  128. // Octal escapes are a series of octal digits with maximum length 3.
  129. // "\0123" is a two digit sequence equal to "\012" "3".
  130. unsigned NumDigits = 0;
  131. do {
  132. ResultChar <<= 3;
  133. ResultChar |= *ThisTokBuf++ - '0';
  134. ++NumDigits;
  135. } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
  136. ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
  137. // Check for overflow. Reject '\777', but not L'\777'.
  138. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
  139. if (Diags)
  140. Diags->Report(Loc, diag::warn_octal_escape_too_large);
  141. ResultChar &= ~0U >> (32-CharWidth);
  142. }
  143. break;
  144. }
  145. // Otherwise, these are not valid escapes.
  146. case '(': case '{': case '[': case '%':
  147. // GCC accepts these as extensions. We warn about them as such though.
  148. if (Diags)
  149. Diags->Report(Loc, diag::ext_nonstandard_escape)
  150. << std::string()+(char)ResultChar;
  151. break;
  152. default:
  153. if (Diags == 0)
  154. break;
  155. if (isgraph(ResultChar))
  156. Diags->Report(Loc, diag::ext_unknown_escape)
  157. << std::string()+(char)ResultChar;
  158. else
  159. Diags->Report(Loc, diag::ext_unknown_escape)
  160. << "x"+llvm::utohexstr(ResultChar);
  161. break;
  162. }
  163. return ResultChar;
  164. }
  165. /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
  166. /// return the UTF32.
  167. static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  168. const char *ThisTokEnd,
  169. uint32_t &UcnVal, unsigned short &UcnLen,
  170. FullSourceLoc Loc, DiagnosticsEngine *Diags,
  171. const LangOptions &Features,
  172. bool in_char_string_literal = false) {
  173. if (!Features.CPlusPlus && !Features.C99 && Diags)
  174. Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
  175. const char *UcnBegin = ThisTokBuf;
  176. // Skip the '\u' char's.
  177. ThisTokBuf += 2;
  178. if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
  179. if (Diags)
  180. Diags->Report(Loc, diag::err_ucn_escape_no_digits);
  181. return false;
  182. }
  183. UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
  184. unsigned short UcnLenSave = UcnLen;
  185. for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
  186. int CharVal = HexDigitValue(ThisTokBuf[0]);
  187. if (CharVal == -1) break;
  188. UcnVal <<= 4;
  189. UcnVal |= CharVal;
  190. }
  191. // If we didn't consume the proper number of digits, there is a problem.
  192. if (UcnLenSave) {
  193. if (Diags) {
  194. SourceLocation L =
  195. Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
  196. Loc.getManager(), Features);
  197. Diags->Report(L, diag::err_ucn_escape_incomplete);
  198. }
  199. return false;
  200. }
  201. // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
  202. if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
  203. UcnVal > 0x10FFFF) { // maximum legal UTF32 value
  204. if (Diags)
  205. Diags->Report(Loc, diag::err_ucn_escape_invalid);
  206. return false;
  207. }
  208. // C++11 allows UCNs that refer to control characters and basic source
  209. // characters inside character and string literals
  210. if (UcnVal < 0xa0 &&
  211. (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
  212. bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
  213. if (Diags) {
  214. SourceLocation UcnBeginLoc =
  215. Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
  216. Loc.getManager(), Features);
  217. char BasicSCSChar = UcnVal;
  218. if (UcnVal >= 0x20 && UcnVal < 0x7f)
  219. Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
  220. diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
  221. << StringRef(&BasicSCSChar, 1);
  222. else
  223. Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
  224. diag::warn_cxx98_compat_literal_ucn_control_character);
  225. }
  226. if (IsError)
  227. return false;
  228. }
  229. return true;
  230. }
  231. /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
  232. /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
  233. /// StringLiteralParser. When we decide to implement UCN's for identifiers,
  234. /// we will likely rework our support for UCN's.
  235. static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  236. const char *ThisTokEnd,
  237. char *&ResultBuf, bool &HadError,
  238. FullSourceLoc Loc, unsigned CharByteWidth,
  239. DiagnosticsEngine *Diags,
  240. const LangOptions &Features) {
  241. typedef uint32_t UTF32;
  242. UTF32 UcnVal = 0;
  243. unsigned short UcnLen = 0;
  244. if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
  245. Loc, Diags, Features, true)) {
  246. HadError = 1;
  247. return;
  248. }
  249. assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
  250. "only character widths of 1, 2, or 4 bytes supported");
  251. (void)UcnLen;
  252. assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
  253. if (CharByteWidth == 4) {
  254. // FIXME: Make the type of the result buffer correct instead of
  255. // using reinterpret_cast.
  256. UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
  257. *ResultPtr = UcnVal;
  258. ResultBuf += 4;
  259. return;
  260. }
  261. if (CharByteWidth == 2) {
  262. // FIXME: Make the type of the result buffer correct instead of
  263. // using reinterpret_cast.
  264. UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
  265. if (UcnVal < (UTF32)0xFFFF) {
  266. *ResultPtr = UcnVal;
  267. ResultBuf += 2;
  268. return;
  269. }
  270. // Convert to UTF16.
  271. UcnVal -= 0x10000;
  272. *ResultPtr = 0xD800 + (UcnVal >> 10);
  273. *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
  274. ResultBuf += 4;
  275. return;
  276. }
  277. assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
  278. // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
  279. // The conversion below was inspired by:
  280. // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
  281. // First, we determine how many bytes the result will require.
  282. typedef uint8_t UTF8;
  283. unsigned short bytesToWrite = 0;
  284. if (UcnVal < (UTF32)0x80)
  285. bytesToWrite = 1;
  286. else if (UcnVal < (UTF32)0x800)
  287. bytesToWrite = 2;
  288. else if (UcnVal < (UTF32)0x10000)
  289. bytesToWrite = 3;
  290. else
  291. bytesToWrite = 4;
  292. const unsigned byteMask = 0xBF;
  293. const unsigned byteMark = 0x80;
  294. // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
  295. // into the first byte, depending on how many bytes follow.
  296. static const UTF8 firstByteMark[5] = {
  297. 0x00, 0x00, 0xC0, 0xE0, 0xF0
  298. };
  299. // Finally, we write the bytes into ResultBuf.
  300. ResultBuf += bytesToWrite;
  301. switch (bytesToWrite) { // note: everything falls through.
  302. case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  303. case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  304. case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  305. case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
  306. }
  307. // Update the buffer.
  308. ResultBuf += bytesToWrite;
  309. }
  310. /// integer-constant: [C99 6.4.4.1]
  311. /// decimal-constant integer-suffix
  312. /// octal-constant integer-suffix
  313. /// hexadecimal-constant integer-suffix
  314. /// user-defined-integer-literal: [C++11 lex.ext]
  315. /// decimal-literal ud-suffix
  316. /// octal-literal ud-suffix
  317. /// hexadecimal-literal ud-suffix
  318. /// decimal-constant:
  319. /// nonzero-digit
  320. /// decimal-constant digit
  321. /// octal-constant:
  322. /// 0
  323. /// octal-constant octal-digit
  324. /// hexadecimal-constant:
  325. /// hexadecimal-prefix hexadecimal-digit
  326. /// hexadecimal-constant hexadecimal-digit
  327. /// hexadecimal-prefix: one of
  328. /// 0x 0X
  329. /// integer-suffix:
  330. /// unsigned-suffix [long-suffix]
  331. /// unsigned-suffix [long-long-suffix]
  332. /// long-suffix [unsigned-suffix]
  333. /// long-long-suffix [unsigned-suffix]
  334. /// nonzero-digit:
  335. /// 1 2 3 4 5 6 7 8 9
  336. /// octal-digit:
  337. /// 0 1 2 3 4 5 6 7
  338. /// hexadecimal-digit:
  339. /// 0 1 2 3 4 5 6 7 8 9
  340. /// a b c d e f
  341. /// A B C D E F
  342. /// unsigned-suffix: one of
  343. /// u U
  344. /// long-suffix: one of
  345. /// l L
  346. /// long-long-suffix: one of
  347. /// ll LL
  348. ///
  349. /// floating-constant: [C99 6.4.4.2]
  350. /// TODO: add rules...
  351. ///
  352. NumericLiteralParser::
  353. NumericLiteralParser(const char *begin, const char *end,
  354. SourceLocation TokLoc, Preprocessor &pp)
  355. : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
  356. // This routine assumes that the range begin/end matches the regex for integer
  357. // and FP constants (specifically, the 'pp-number' regex), and assumes that
  358. // the byte at "*end" is both valid and not part of the regex. Because of
  359. // this, it doesn't have to check for 'overscan' in various places.
  360. assert(!isalnum(*end) && *end != '.' && *end != '_' &&
  361. "Lexer didn't maximally munch?");
  362. s = DigitsBegin = begin;
  363. saw_exponent = false;
  364. saw_period = false;
  365. saw_ud_suffix = false;
  366. isLong = false;
  367. isUnsigned = false;
  368. isLongLong = false;
  369. isFloat = false;
  370. isImaginary = false;
  371. isMicrosoftInteger = false;
  372. hadError = false;
  373. if (*s == '0') { // parse radix
  374. ParseNumberStartingWithZero(TokLoc);
  375. if (hadError)
  376. return;
  377. } else { // the first digit is non-zero
  378. radix = 10;
  379. s = SkipDigits(s);
  380. if (s == ThisTokEnd) {
  381. // Done.
  382. } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
  383. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
  384. diag::err_invalid_decimal_digit) << StringRef(s, 1);
  385. hadError = true;
  386. return;
  387. } else if (*s == '.') {
  388. s++;
  389. saw_period = true;
  390. s = SkipDigits(s);
  391. }
  392. if ((*s == 'e' || *s == 'E')) { // exponent
  393. const char *Exponent = s;
  394. s++;
  395. saw_exponent = true;
  396. if (*s == '+' || *s == '-') s++; // sign
  397. const char *first_non_digit = SkipDigits(s);
  398. if (first_non_digit != s) {
  399. s = first_non_digit;
  400. } else {
  401. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
  402. diag::err_exponent_has_no_digits);
  403. hadError = true;
  404. return;
  405. }
  406. }
  407. }
  408. SuffixBegin = s;
  409. // Parse the suffix. At this point we can classify whether we have an FP or
  410. // integer constant.
  411. bool isFPConstant = isFloatingLiteral();
  412. // Loop over all of the characters of the suffix. If we see something bad,
  413. // we break out of the loop.
  414. for (; s != ThisTokEnd; ++s) {
  415. switch (*s) {
  416. case 'f': // FP Suffix for "float"
  417. case 'F':
  418. if (!isFPConstant) break; // Error for integer constant.
  419. if (isFloat || isLong) break; // FF, LF invalid.
  420. isFloat = true;
  421. continue; // Success.
  422. case 'u':
  423. case 'U':
  424. if (isFPConstant) break; // Error for floating constant.
  425. if (isUnsigned) break; // Cannot be repeated.
  426. isUnsigned = true;
  427. continue; // Success.
  428. case 'l':
  429. case 'L':
  430. if (isLong || isLongLong) break; // Cannot be repeated.
  431. if (isFloat) break; // LF invalid.
  432. // Check for long long. The L's need to be adjacent and the same case.
  433. if (s+1 != ThisTokEnd && s[1] == s[0]) {
  434. if (isFPConstant) break; // long long invalid for floats.
  435. isLongLong = true;
  436. ++s; // Eat both of them.
  437. } else {
  438. isLong = true;
  439. }
  440. continue; // Success.
  441. case 'i':
  442. case 'I':
  443. if (PP.getLangOpts().MicrosoftExt) {
  444. if (isFPConstant || isLong || isLongLong) break;
  445. // Allow i8, i16, i32, i64, and i128.
  446. if (s + 1 != ThisTokEnd) {
  447. switch (s[1]) {
  448. case '8':
  449. s += 2; // i8 suffix
  450. isMicrosoftInteger = true;
  451. break;
  452. case '1':
  453. if (s + 2 == ThisTokEnd) break;
  454. if (s[2] == '6') {
  455. s += 3; // i16 suffix
  456. isMicrosoftInteger = true;
  457. }
  458. else if (s[2] == '2') {
  459. if (s + 3 == ThisTokEnd) break;
  460. if (s[3] == '8') {
  461. s += 4; // i128 suffix
  462. isMicrosoftInteger = true;
  463. }
  464. }
  465. break;
  466. case '3':
  467. if (s + 2 == ThisTokEnd) break;
  468. if (s[2] == '2') {
  469. s += 3; // i32 suffix
  470. isLong = true;
  471. isMicrosoftInteger = true;
  472. }
  473. break;
  474. case '6':
  475. if (s + 2 == ThisTokEnd) break;
  476. if (s[2] == '4') {
  477. s += 3; // i64 suffix
  478. isLongLong = true;
  479. isMicrosoftInteger = true;
  480. }
  481. break;
  482. default:
  483. break;
  484. }
  485. break;
  486. }
  487. }
  488. // fall through.
  489. case 'j':
  490. case 'J':
  491. if (isImaginary) break; // Cannot be repeated.
  492. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
  493. diag::ext_imaginary_constant);
  494. isImaginary = true;
  495. continue; // Success.
  496. }
  497. // If we reached here, there was an error or a ud-suffix.
  498. break;
  499. }
  500. if (s != ThisTokEnd) {
  501. if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
  502. // We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
  503. // with an '_' are ill-formed.
  504. saw_ud_suffix = true;
  505. return;
  506. }
  507. // Report an error if there are any.
  508. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
  509. isFPConstant ? diag::err_invalid_suffix_float_constant :
  510. diag::err_invalid_suffix_integer_constant)
  511. << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
  512. hadError = true;
  513. return;
  514. }
  515. }
  516. /// ParseNumberStartingWithZero - This method is called when the first character
  517. /// of the number is found to be a zero. This means it is either an octal
  518. /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
  519. /// a floating point number (01239.123e4). Eat the prefix, determining the
  520. /// radix etc.
  521. void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  522. assert(s[0] == '0' && "Invalid method call");
  523. s++;
  524. // Handle a hex number like 0x1234.
  525. if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
  526. s++;
  527. radix = 16;
  528. DigitsBegin = s;
  529. s = SkipHexDigits(s);
  530. bool noSignificand = (s == DigitsBegin);
  531. if (s == ThisTokEnd) {
  532. // Done.
  533. } else if (*s == '.') {
  534. s++;
  535. saw_period = true;
  536. const char *floatDigitsBegin = s;
  537. s = SkipHexDigits(s);
  538. noSignificand &= (floatDigitsBegin == s);
  539. }
  540. if (noSignificand) {
  541. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
  542. diag::err_hexconstant_requires_digits);
  543. hadError = true;
  544. return;
  545. }
  546. // A binary exponent can appear with or with a '.'. If dotted, the
  547. // binary exponent is required.
  548. if (*s == 'p' || *s == 'P') {
  549. const char *Exponent = s;
  550. s++;
  551. saw_exponent = true;
  552. if (*s == '+' || *s == '-') s++; // sign
  553. const char *first_non_digit = SkipDigits(s);
  554. if (first_non_digit == s) {
  555. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
  556. diag::err_exponent_has_no_digits);
  557. hadError = true;
  558. return;
  559. }
  560. s = first_non_digit;
  561. if (!PP.getLangOpts().HexFloats)
  562. PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
  563. } else if (saw_period) {
  564. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
  565. diag::err_hexconstant_requires_exponent);
  566. hadError = true;
  567. }
  568. return;
  569. }
  570. // Handle simple binary numbers 0b01010
  571. if (*s == 'b' || *s == 'B') {
  572. // 0b101010 is a GCC extension.
  573. PP.Diag(TokLoc, diag::ext_binary_literal);
  574. ++s;
  575. radix = 2;
  576. DigitsBegin = s;
  577. s = SkipBinaryDigits(s);
  578. if (s == ThisTokEnd) {
  579. // Done.
  580. } else if (isxdigit(*s)) {
  581. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
  582. diag::err_invalid_binary_digit) << StringRef(s, 1);
  583. hadError = true;
  584. }
  585. // Other suffixes will be diagnosed by the caller.
  586. return;
  587. }
  588. // For now, the radix is set to 8. If we discover that we have a
  589. // floating point constant, the radix will change to 10. Octal floating
  590. // point constants are not permitted (only decimal and hexadecimal).
  591. radix = 8;
  592. DigitsBegin = s;
  593. s = SkipOctalDigits(s);
  594. if (s == ThisTokEnd)
  595. return; // Done, simple octal number like 01234
  596. // If we have some other non-octal digit that *is* a decimal digit, see if
  597. // this is part of a floating point number like 094.123 or 09e1.
  598. if (isdigit(*s)) {
  599. const char *EndDecimal = SkipDigits(s);
  600. if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
  601. s = EndDecimal;
  602. radix = 10;
  603. }
  604. }
  605. // If we have a hex digit other than 'e' (which denotes a FP exponent) then
  606. // the code is using an incorrect base.
  607. if (isxdigit(*s) && *s != 'e' && *s != 'E') {
  608. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
  609. diag::err_invalid_octal_digit) << StringRef(s, 1);
  610. hadError = true;
  611. return;
  612. }
  613. if (*s == '.') {
  614. s++;
  615. radix = 10;
  616. saw_period = true;
  617. s = SkipDigits(s); // Skip suffix.
  618. }
  619. if (*s == 'e' || *s == 'E') { // exponent
  620. const char *Exponent = s;
  621. s++;
  622. radix = 10;
  623. saw_exponent = true;
  624. if (*s == '+' || *s == '-') s++; // sign
  625. const char *first_non_digit = SkipDigits(s);
  626. if (first_non_digit != s) {
  627. s = first_non_digit;
  628. } else {
  629. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
  630. diag::err_exponent_has_no_digits);
  631. hadError = true;
  632. return;
  633. }
  634. }
  635. }
  636. /// GetIntegerValue - Convert this numeric literal value to an APInt that
  637. /// matches Val's input width. If there is an overflow, set Val to the low bits
  638. /// of the result and return true. Otherwise, return false.
  639. bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
  640. // Fast path: Compute a conservative bound on the maximum number of
  641. // bits per digit in this radix. If we can't possibly overflow a
  642. // uint64 based on that bound then do the simple conversion to
  643. // integer. This avoids the expensive overflow checking below, and
  644. // handles the common cases that matter (small decimal integers and
  645. // hex/octal values which don't overflow).
  646. unsigned MaxBitsPerDigit = 1;
  647. while ((1U << MaxBitsPerDigit) < radix)
  648. MaxBitsPerDigit += 1;
  649. if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
  650. uint64_t N = 0;
  651. for (s = DigitsBegin; s != SuffixBegin; ++s)
  652. N = N*radix + HexDigitValue(*s);
  653. // This will truncate the value to Val's input width. Simply check
  654. // for overflow by comparing.
  655. Val = N;
  656. return Val.getZExtValue() != N;
  657. }
  658. Val = 0;
  659. s = DigitsBegin;
  660. llvm::APInt RadixVal(Val.getBitWidth(), radix);
  661. llvm::APInt CharVal(Val.getBitWidth(), 0);
  662. llvm::APInt OldVal = Val;
  663. bool OverflowOccurred = false;
  664. while (s < SuffixBegin) {
  665. unsigned C = HexDigitValue(*s++);
  666. // If this letter is out of bound for this radix, reject it.
  667. assert(C < radix && "NumericLiteralParser ctor should have rejected this");
  668. CharVal = C;
  669. // Add the digit to the value in the appropriate radix. If adding in digits
  670. // made the value smaller, then this overflowed.
  671. OldVal = Val;
  672. // Multiply by radix, did overflow occur on the multiply?
  673. Val *= RadixVal;
  674. OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
  675. // Add value, did overflow occur on the value?
  676. // (a + b) ult b <=> overflow
  677. Val += CharVal;
  678. OverflowOccurred |= Val.ult(CharVal);
  679. }
  680. return OverflowOccurred;
  681. }
  682. llvm::APFloat::opStatus
  683. NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  684. using llvm::APFloat;
  685. unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
  686. return Result.convertFromString(StringRef(ThisTokBegin, n),
  687. APFloat::rmNearestTiesToEven);
  688. }
  689. /// user-defined-character-literal: [C++11 lex.ext]
  690. /// character-literal ud-suffix
  691. /// ud-suffix:
  692. /// identifier
  693. /// character-literal: [C++11 lex.ccon]
  694. /// ' c-char-sequence '
  695. /// u' c-char-sequence '
  696. /// U' c-char-sequence '
  697. /// L' c-char-sequence '
  698. /// c-char-sequence:
  699. /// c-char
  700. /// c-char-sequence c-char
  701. /// c-char:
  702. /// any member of the source character set except the single-quote ',
  703. /// backslash \, or new-line character
  704. /// escape-sequence
  705. /// universal-character-name
  706. /// escape-sequence:
  707. /// simple-escape-sequence
  708. /// octal-escape-sequence
  709. /// hexadecimal-escape-sequence
  710. /// simple-escape-sequence:
  711. /// one of \' \" \? \\ \a \b \f \n \r \t \v
  712. /// octal-escape-sequence:
  713. /// \ octal-digit
  714. /// \ octal-digit octal-digit
  715. /// \ octal-digit octal-digit octal-digit
  716. /// hexadecimal-escape-sequence:
  717. /// \x hexadecimal-digit
  718. /// hexadecimal-escape-sequence hexadecimal-digit
  719. /// universal-character-name: [C++11 lex.charset]
  720. /// \u hex-quad
  721. /// \U hex-quad hex-quad
  722. /// hex-quad:
  723. /// hex-digit hex-digit hex-digit hex-digit
  724. ///
  725. CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
  726. SourceLocation Loc, Preprocessor &PP,
  727. tok::TokenKind kind) {
  728. // At this point we know that the character matches the regex "(L|u|U)?'.*'".
  729. HadError = false;
  730. Kind = kind;
  731. const char *TokBegin = begin;
  732. // Skip over wide character determinant.
  733. if (Kind != tok::char_constant) {
  734. ++begin;
  735. }
  736. // Skip over the entry quote.
  737. assert(begin[0] == '\'' && "Invalid token lexed");
  738. ++begin;
  739. // Remove an optional ud-suffix.
  740. if (end[-1] != '\'') {
  741. const char *UDSuffixEnd = end;
  742. do {
  743. --end;
  744. } while (end[-1] != '\'');
  745. UDSuffixBuf.assign(end, UDSuffixEnd);
  746. UDSuffixOffset = end - TokBegin;
  747. }
  748. // Trim the ending quote.
  749. assert(end != begin && "Invalid token lexed");
  750. --end;
  751. // FIXME: The "Value" is an uint64_t so we can handle char literals of
  752. // up to 64-bits.
  753. // FIXME: This extensively assumes that 'char' is 8-bits.
  754. assert(PP.getTargetInfo().getCharWidth() == 8 &&
  755. "Assumes char is 8 bits");
  756. assert(PP.getTargetInfo().getIntWidth() <= 64 &&
  757. (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
  758. "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  759. assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
  760. "Assumes sizeof(wchar) on target is <= 64");
  761. SmallVector<uint32_t,4> codepoint_buffer;
  762. codepoint_buffer.resize(end-begin);
  763. uint32_t *buffer_begin = &codepoint_buffer.front();
  764. uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
  765. // Unicode escapes representing characters that cannot be correctly
  766. // represented in a single code unit are disallowed in character literals
  767. // by this implementation.
  768. uint32_t largest_character_for_kind;
  769. if (tok::wide_char_constant == Kind) {
  770. largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
  771. } else if (tok::utf16_char_constant == Kind) {
  772. largest_character_for_kind = 0xFFFF;
  773. } else if (tok::utf32_char_constant == Kind) {
  774. largest_character_for_kind = 0x10FFFF;
  775. } else {
  776. largest_character_for_kind = 0x7Fu;
  777. }
  778. while (begin!=end) {
  779. // Is this a span of non-escape characters?
  780. if (begin[0] != '\\') {
  781. char const *start = begin;
  782. do {
  783. ++begin;
  784. } while (begin != end && *begin != '\\');
  785. char const *tmp_in_start = start;
  786. uint32_t *tmp_out_start = buffer_begin;
  787. ConversionResult res =
  788. ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
  789. reinterpret_cast<UTF8 const *>(begin),
  790. &buffer_begin,buffer_end,strictConversion);
  791. if (res!=conversionOK) {
  792. // If we see bad encoding for unprefixed character literals, warn and
  793. // simply copy the byte values, for compatibility with gcc and
  794. // older versions of clang.
  795. bool NoErrorOnBadEncoding = isAscii();
  796. unsigned Msg = diag::err_bad_character_encoding;
  797. if (NoErrorOnBadEncoding)
  798. Msg = diag::warn_bad_character_encoding;
  799. PP.Diag(Loc, Msg);
  800. if (NoErrorOnBadEncoding) {
  801. start = tmp_in_start;
  802. buffer_begin = tmp_out_start;
  803. for ( ; start != begin; ++start, ++buffer_begin)
  804. *buffer_begin = static_cast<uint8_t>(*start);
  805. } else {
  806. HadError = true;
  807. }
  808. } else {
  809. for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
  810. if (*tmp_out_start > largest_character_for_kind) {
  811. HadError = true;
  812. PP.Diag(Loc, diag::err_character_too_large);
  813. }
  814. }
  815. }
  816. continue;
  817. }
  818. // Is this a Universal Character Name excape?
  819. if (begin[1] == 'u' || begin[1] == 'U') {
  820. unsigned short UcnLen = 0;
  821. if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
  822. FullSourceLoc(Loc, PP.getSourceManager()),
  823. &PP.getDiagnostics(), PP.getLangOpts(),
  824. true))
  825. {
  826. HadError = true;
  827. } else if (*buffer_begin > largest_character_for_kind) {
  828. HadError = true;
  829. PP.Diag(Loc,diag::err_character_too_large);
  830. }
  831. ++buffer_begin;
  832. continue;
  833. }
  834. unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
  835. uint64_t result =
  836. ProcessCharEscape(begin, end, HadError,
  837. FullSourceLoc(Loc,PP.getSourceManager()),
  838. CharWidth, &PP.getDiagnostics());
  839. *buffer_begin++ = result;
  840. }
  841. unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
  842. if (NumCharsSoFar > 1) {
  843. if (isWide())
  844. PP.Diag(Loc, diag::warn_extraneous_char_constant);
  845. else if (isAscii() && NumCharsSoFar == 4)
  846. PP.Diag(Loc, diag::ext_four_char_character_literal);
  847. else if (isAscii())
  848. PP.Diag(Loc, diag::ext_multichar_character_literal);
  849. else
  850. PP.Diag(Loc, diag::err_multichar_utf_character_literal);
  851. IsMultiChar = true;
  852. } else
  853. IsMultiChar = false;
  854. llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
  855. // Narrow character literals act as though their value is concatenated
  856. // in this implementation, but warn on overflow.
  857. bool multi_char_too_long = false;
  858. if (isAscii() && isMultiChar()) {
  859. LitVal = 0;
  860. for (size_t i=0;i<NumCharsSoFar;++i) {
  861. // check for enough leading zeros to shift into
  862. multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
  863. LitVal <<= 8;
  864. LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
  865. }
  866. } else if (NumCharsSoFar > 0) {
  867. // otherwise just take the last character
  868. LitVal = buffer_begin[-1];
  869. }
  870. if (!HadError && multi_char_too_long) {
  871. PP.Diag(Loc,diag::warn_char_constant_too_large);
  872. }
  873. // Transfer the value from APInt to uint64_t
  874. Value = LitVal.getZExtValue();
  875. // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  876. // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
  877. // character constants are not sign extended in the this implementation:
  878. // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
  879. if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
  880. PP.getLangOpts().CharIsSigned)
  881. Value = (signed char)Value;
  882. }
  883. /// string-literal: [C++0x lex.string]
  884. /// encoding-prefix " [s-char-sequence] "
  885. /// encoding-prefix R raw-string
  886. /// encoding-prefix:
  887. /// u8
  888. /// u
  889. /// U
  890. /// L
  891. /// s-char-sequence:
  892. /// s-char
  893. /// s-char-sequence s-char
  894. /// s-char:
  895. /// any member of the source character set except the double-quote ",
  896. /// backslash \, or new-line character
  897. /// escape-sequence
  898. /// universal-character-name
  899. /// raw-string:
  900. /// " d-char-sequence ( r-char-sequence ) d-char-sequence "
  901. /// r-char-sequence:
  902. /// r-char
  903. /// r-char-sequence r-char
  904. /// r-char:
  905. /// any member of the source character set, except a right parenthesis )
  906. /// followed by the initial d-char-sequence (which may be empty)
  907. /// followed by a double quote ".
  908. /// d-char-sequence:
  909. /// d-char
  910. /// d-char-sequence d-char
  911. /// d-char:
  912. /// any member of the basic source character set except:
  913. /// space, the left parenthesis (, the right parenthesis ),
  914. /// the backslash \, and the control characters representing horizontal
  915. /// tab, vertical tab, form feed, and newline.
  916. /// escape-sequence: [C++0x lex.ccon]
  917. /// simple-escape-sequence
  918. /// octal-escape-sequence
  919. /// hexadecimal-escape-sequence
  920. /// simple-escape-sequence:
  921. /// one of \' \" \? \\ \a \b \f \n \r \t \v
  922. /// octal-escape-sequence:
  923. /// \ octal-digit
  924. /// \ octal-digit octal-digit
  925. /// \ octal-digit octal-digit octal-digit
  926. /// hexadecimal-escape-sequence:
  927. /// \x hexadecimal-digit
  928. /// hexadecimal-escape-sequence hexadecimal-digit
  929. /// universal-character-name:
  930. /// \u hex-quad
  931. /// \U hex-quad hex-quad
  932. /// hex-quad:
  933. /// hex-digit hex-digit hex-digit hex-digit
  934. ///
  935. StringLiteralParser::
  936. StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
  937. Preprocessor &PP, bool Complain)
  938. : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
  939. Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
  940. MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
  941. ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
  942. init(StringToks, NumStringToks);
  943. }
  944. void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
  945. // The literal token may have come from an invalid source location (e.g. due
  946. // to a PCH error), in which case the token length will be 0.
  947. if (NumStringToks == 0 || StringToks[0].getLength() < 2) {
  948. hadError = true;
  949. return;
  950. }
  951. // Scan all of the string portions, remember the max individual token length,
  952. // computing a bound on the concatenated string length, and see whether any
  953. // piece is a wide-string. If any of the string portions is a wide-string
  954. // literal, the result is a wide-string literal [C99 6.4.5p4].
  955. assert(NumStringToks && "expected at least one token");
  956. MaxTokenLength = StringToks[0].getLength();
  957. assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  958. SizeBound = StringToks[0].getLength()-2; // -2 for "".
  959. Kind = StringToks[0].getKind();
  960. hadError = false;
  961. // Implement Translation Phase #6: concatenation of string literals
  962. /// (C99 5.1.1.2p1). The common case is only one string fragment.
  963. for (unsigned i = 1; i != NumStringToks; ++i) {
  964. if (StringToks[i].getLength() < 2) {
  965. hadError = true;
  966. return;
  967. }
  968. // The string could be shorter than this if it needs cleaning, but this is a
  969. // reasonable bound, which is all we need.
  970. assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
  971. SizeBound += StringToks[i].getLength()-2; // -2 for "".
  972. // Remember maximum string piece length.
  973. if (StringToks[i].getLength() > MaxTokenLength)
  974. MaxTokenLength = StringToks[i].getLength();
  975. // Remember if we see any wide or utf-8/16/32 strings.
  976. // Also check for illegal concatenations.
  977. if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
  978. if (isAscii()) {
  979. Kind = StringToks[i].getKind();
  980. } else {
  981. if (Diags)
  982. Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
  983. diag::err_unsupported_string_concat);
  984. hadError = true;
  985. }
  986. }
  987. }
  988. // Include space for the null terminator.
  989. ++SizeBound;
  990. // TODO: K&R warning: "traditional C rejects string constant concatenation"
  991. // Get the width in bytes of char/wchar_t/char16_t/char32_t
  992. CharByteWidth = getCharWidth(Kind, Target);
  993. assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  994. CharByteWidth /= 8;
  995. // The output buffer size needs to be large enough to hold wide characters.
  996. // This is a worst-case assumption which basically corresponds to L"" "long".
  997. SizeBound *= CharByteWidth;
  998. // Size the temporary buffer to hold the result string data.
  999. ResultBuf.resize(SizeBound);
  1000. // Likewise, but for each string piece.
  1001. SmallString<512> TokenBuf;
  1002. TokenBuf.resize(MaxTokenLength);
  1003. // Loop over all the strings, getting their spelling, and expanding them to
  1004. // wide strings as appropriate.
  1005. ResultPtr = &ResultBuf[0]; // Next byte to fill in.
  1006. Pascal = false;
  1007. SourceLocation UDSuffixTokLoc;
  1008. for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
  1009. const char *ThisTokBuf = &TokenBuf[0];
  1010. // Get the spelling of the token, which eliminates trigraphs, etc. We know
  1011. // that ThisTokBuf points to a buffer that is big enough for the whole token
  1012. // and 'spelled' tokens can only shrink.
  1013. bool StringInvalid = false;
  1014. unsigned ThisTokLen =
  1015. Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
  1016. &StringInvalid);
  1017. if (StringInvalid) {
  1018. hadError = true;
  1019. continue;
  1020. }
  1021. const char *ThisTokBegin = ThisTokBuf;
  1022. const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
  1023. // Remove an optional ud-suffix.
  1024. if (ThisTokEnd[-1] != '"') {
  1025. const char *UDSuffixEnd = ThisTokEnd;
  1026. do {
  1027. --ThisTokEnd;
  1028. } while (ThisTokEnd[-1] != '"');
  1029. StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
  1030. if (UDSuffixBuf.empty()) {
  1031. UDSuffixBuf.assign(UDSuffix);
  1032. UDSuffixToken = i;
  1033. UDSuffixOffset = ThisTokEnd - ThisTokBuf;
  1034. UDSuffixTokLoc = StringToks[i].getLocation();
  1035. } else if (!UDSuffixBuf.equals(UDSuffix)) {
  1036. // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
  1037. // result of a concatenation involving at least one user-defined-string-
  1038. // literal, all the participating user-defined-string-literals shall
  1039. // have the same ud-suffix.
  1040. if (Diags) {
  1041. SourceLocation TokLoc = StringToks[i].getLocation();
  1042. Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
  1043. << UDSuffixBuf << UDSuffix
  1044. << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
  1045. << SourceRange(TokLoc, TokLoc);
  1046. }
  1047. hadError = true;
  1048. }
  1049. }
  1050. // Strip the end quote.
  1051. --ThisTokEnd;
  1052. // TODO: Input character set mapping support.
  1053. // Skip marker for wide or unicode strings.
  1054. if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
  1055. ++ThisTokBuf;
  1056. // Skip 8 of u8 marker for utf8 strings.
  1057. if (ThisTokBuf[0] == '8')
  1058. ++ThisTokBuf;
  1059. }
  1060. // Check for raw string
  1061. if (ThisTokBuf[0] == 'R') {
  1062. ThisTokBuf += 2; // skip R"
  1063. const char *Prefix = ThisTokBuf;
  1064. while (ThisTokBuf[0] != '(')
  1065. ++ThisTokBuf;
  1066. ++ThisTokBuf; // skip '('
  1067. // Remove same number of characters from the end
  1068. ThisTokEnd -= ThisTokBuf - Prefix;
  1069. assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
  1070. // Copy the string over
  1071. if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
  1072. if (DiagnoseBadString(StringToks[i]))
  1073. hadError = true;
  1074. } else {
  1075. assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
  1076. ++ThisTokBuf; // skip "
  1077. // Check if this is a pascal string
  1078. if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
  1079. ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
  1080. // If the \p sequence is found in the first token, we have a pascal string
  1081. // Otherwise, if we already have a pascal string, ignore the first \p
  1082. if (i == 0) {
  1083. ++ThisTokBuf;
  1084. Pascal = true;
  1085. } else if (Pascal)
  1086. ThisTokBuf += 2;
  1087. }
  1088. while (ThisTokBuf != ThisTokEnd) {
  1089. // Is this a span of non-escape characters?
  1090. if (ThisTokBuf[0] != '\\') {
  1091. const char *InStart = ThisTokBuf;
  1092. do {
  1093. ++ThisTokBuf;
  1094. } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
  1095. // Copy the character span over.
  1096. if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
  1097. if (DiagnoseBadString(StringToks[i]))
  1098. hadError = true;
  1099. continue;
  1100. }
  1101. // Is this a Universal Character Name escape?
  1102. if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
  1103. EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
  1104. ResultPtr, hadError,
  1105. FullSourceLoc(StringToks[i].getLocation(), SM),
  1106. CharByteWidth, Diags, Features);
  1107. continue;
  1108. }
  1109. // Otherwise, this is a non-UCN escape character. Process it.
  1110. unsigned ResultChar =
  1111. ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
  1112. FullSourceLoc(StringToks[i].getLocation(), SM),
  1113. CharByteWidth*8, Diags);
  1114. if (CharByteWidth == 4) {
  1115. // FIXME: Make the type of the result buffer correct instead of
  1116. // using reinterpret_cast.
  1117. UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
  1118. *ResultWidePtr = ResultChar;
  1119. ResultPtr += 4;
  1120. } else if (CharByteWidth == 2) {
  1121. // FIXME: Make the type of the result buffer correct instead of
  1122. // using reinterpret_cast.
  1123. UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
  1124. *ResultWidePtr = ResultChar & 0xFFFF;
  1125. ResultPtr += 2;
  1126. } else {
  1127. assert(CharByteWidth == 1 && "Unexpected char width");
  1128. *ResultPtr++ = ResultChar & 0xFF;
  1129. }
  1130. }
  1131. }
  1132. }
  1133. if (Pascal) {
  1134. if (CharByteWidth == 4) {
  1135. // FIXME: Make the type of the result buffer correct instead of
  1136. // using reinterpret_cast.
  1137. UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
  1138. ResultWidePtr[0] = GetNumStringChars() - 1;
  1139. } else if (CharByteWidth == 2) {
  1140. // FIXME: Make the type of the result buffer correct instead of
  1141. // using reinterpret_cast.
  1142. UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
  1143. ResultWidePtr[0] = GetNumStringChars() - 1;
  1144. } else {
  1145. assert(CharByteWidth == 1 && "Unexpected char width");
  1146. ResultBuf[0] = GetNumStringChars() - 1;
  1147. }
  1148. // Verify that pascal strings aren't too large.
  1149. if (GetStringLength() > 256) {
  1150. if (Diags)
  1151. Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
  1152. diag::err_pascal_string_too_long)
  1153. << SourceRange(StringToks[0].getLocation(),
  1154. StringToks[NumStringToks-1].getLocation());
  1155. hadError = true;
  1156. return;
  1157. }
  1158. } else if (Diags) {
  1159. // Complain if this string literal has too many characters.
  1160. unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
  1161. if (GetNumStringChars() > MaxChars)
  1162. Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
  1163. diag::ext_string_too_long)
  1164. << GetNumStringChars() << MaxChars
  1165. << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
  1166. << SourceRange(StringToks[0].getLocation(),
  1167. StringToks[NumStringToks-1].getLocation());
  1168. }
  1169. }
  1170. /// copyStringFragment - This function copies from Start to End into ResultPtr.
  1171. /// Performs widening for multi-byte characters.
  1172. bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
  1173. assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4);
  1174. ConversionResult result = conversionOK;
  1175. // Copy the character span over.
  1176. if (CharByteWidth == 1) {
  1177. if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()),
  1178. reinterpret_cast<const UTF8*>(Fragment.end())))
  1179. result = sourceIllegal;
  1180. memcpy(ResultPtr, Fragment.data(), Fragment.size());
  1181. ResultPtr += Fragment.size();
  1182. } else if (CharByteWidth == 2) {
  1183. UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
  1184. // FIXME: Make the type of the result buffer correct instead of
  1185. // using reinterpret_cast.
  1186. UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
  1187. ConversionFlags flags = strictConversion;
  1188. result = ConvertUTF8toUTF16(
  1189. &sourceStart,sourceStart + Fragment.size(),
  1190. &targetStart,targetStart + 2*Fragment.size(),flags);
  1191. if (result==conversionOK)
  1192. ResultPtr = reinterpret_cast<char*>(targetStart);
  1193. } else if (CharByteWidth == 4) {
  1194. UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
  1195. // FIXME: Make the type of the result buffer correct instead of
  1196. // using reinterpret_cast.
  1197. UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
  1198. ConversionFlags flags = strictConversion;
  1199. result = ConvertUTF8toUTF32(
  1200. &sourceStart,sourceStart + Fragment.size(),
  1201. &targetStart,targetStart + 4*Fragment.size(),flags);
  1202. if (result==conversionOK)
  1203. ResultPtr = reinterpret_cast<char*>(targetStart);
  1204. }
  1205. assert((result != targetExhausted)
  1206. && "ConvertUTF8toUTFXX exhausted target buffer");
  1207. return result != conversionOK;
  1208. }
  1209. bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
  1210. // If we see bad encoding for unprefixed string literals, warn and
  1211. // simply copy the byte values, for compatibility with gcc and older
  1212. // versions of clang.
  1213. bool NoErrorOnBadEncoding = isAscii();
  1214. unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
  1215. diag::err_bad_string_encoding;
  1216. if (Diags)
  1217. Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
  1218. return !NoErrorOnBadEncoding;
  1219. }
  1220. /// getOffsetOfStringByte - This function returns the offset of the
  1221. /// specified byte of the string data represented by Token. This handles
  1222. /// advancing over escape sequences in the string.
  1223. unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
  1224. unsigned ByteNo) const {
  1225. // Get the spelling of the token.
  1226. SmallString<32> SpellingBuffer;
  1227. SpellingBuffer.resize(Tok.getLength());
  1228. bool StringInvalid = false;
  1229. const char *SpellingPtr = &SpellingBuffer[0];
  1230. unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
  1231. &StringInvalid);
  1232. if (StringInvalid)
  1233. return 0;
  1234. assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
  1235. SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
  1236. const char *SpellingStart = SpellingPtr;
  1237. const char *SpellingEnd = SpellingPtr+TokLen;
  1238. // Skip over the leading quote.
  1239. assert(SpellingPtr[0] == '"' && "Should be a string literal!");
  1240. ++SpellingPtr;
  1241. // Skip over bytes until we find the offset we're looking for.
  1242. while (ByteNo) {
  1243. assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
  1244. // Step over non-escapes simply.
  1245. if (*SpellingPtr != '\\') {
  1246. ++SpellingPtr;
  1247. --ByteNo;
  1248. continue;
  1249. }
  1250. // Otherwise, this is an escape character. Advance over it.
  1251. bool HadError = false;
  1252. ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
  1253. FullSourceLoc(Tok.getLocation(), SM),
  1254. CharByteWidth*8, Diags);
  1255. assert(!HadError && "This method isn't valid on erroneous strings");
  1256. --ByteNo;
  1257. }
  1258. return SpellingPtr-SpellingStart;
  1259. }