12 years ago · c7629d9415
--- a/include/clang/Basic/ConvertUTF.h
+++ b/include/clang/Basic/ConvertUTF.h
@@ -161,6 +161,16 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
 
				 
			
 
				 unsigned getNumBytesForUTF8(UTF8 firstByte);
			
 
				 
			
 
				+static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
			
 
				+                                                   const UTF8 *sourceEnd,
			
 
				+                                                   UTF32 *target,
			
 
				+                                                   ConversionFlags flags) {
			
 
				+  unsigned size = getNumBytesForUTF8(**source);
			
 
				+  if (size > sourceEnd - *source)
			
 
				+    return sourceExhausted;
			
 
				+  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
			
 
				+}
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 
			
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -93,15 +93,29 @@ def ext_multichar_character_literal : ExtWarn<
 
				   "multi-character character constant">, InGroup<MultiChar>;
			
 
				 def ext_four_char_character_literal : Extension<
			
 
				   "multi-character character constant">, InGroup<FourByteMultiChar>;
			
 
				-  
			
 
				 
			
 
				-// Literal
			
 
				-def ext_nonstandard_escape : Extension<
			
 
				-  "use of non-standard escape character '\\%0'">;
			
 
				-def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
			
 
				-def err_hex_escape_no_digits : Error<"\\%0 used with no following hex digits">;
			
 
				+
			
 
				+// Unicode and UCNs
			
 
				+def err_invalid_utf8 : Error<
			
 
				+  "source file is not valid UTF-8">;
			
 
				+def err_non_ascii : Error<
			
 
				+  "non-ASCII characters are not allowed outside of literals and identifiers">;
			
 
				+def ext_unicode_whitespace : ExtWarn<
			
 
				+  "treating Unicode character as whitespace">,
			
 
				+  InGroup<DiagGroup<"unicode-whitespace">>;
			
 
				+
			
 
				+def err_hex_escape_no_digits : Error<
			
 
				+  "\\%0 used with no following hex digits">;
			
 
				+def warn_ucn_escape_no_digits : Warning<
			
 
				+  "\\%0 used with no following hex digits; "
			
 
				+  "treating as '\\' followed by identifier">, InGroup<Unicode>;
			
 
				+def err_ucn_escape_incomplete : Error<
			
 
				+  "incomplete universal character name">;
			
 
				+def warn_ucn_escape_incomplete : Warning<
			
 
				+  "incomplete universal character name; "
			
 
				+  "treating as '\\' followed by identifier">, InGroup<Unicode>;
			
 
				 def err_ucn_escape_invalid : Error<"invalid universal character">;
			
 
				-def err_ucn_escape_incomplete : Error<"incomplete universal character name">;
			
 
				+
			
 
				 def err_ucn_escape_basic_scs : Error<
			
 
				   "character '%0' cannot be specified by a universal character name">;
			
 
				 def err_ucn_control_character : Error<
			
@@ -112,6 +126,12 @@ def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning<
 
				 def warn_cxx98_compat_literal_ucn_control_character : Warning<
			
 
				   "universal character name referring to a control character "
			
 
				   "is incompatible with C++98">, InGroup<CXX98Compat>, DefaultIgnore;
			
 
				+
			
 
				+
			
 
				+// Literal
			
 
				+def ext_nonstandard_escape : Extension<
			
 
				+  "use of non-standard escape character '\\%0'">;
			
 
				+def ext_unknown_escape : ExtWarn<"unknown escape sequence '\\%0'">;
			
 
				 def err_invalid_decimal_digit : Error<"invalid digit '%0' in decimal constant">;
			
 
				 def err_invalid_binary_digit : Error<"invalid digit '%0' in binary constant">;
			
 
				 def err_invalid_octal_digit : Error<"invalid digit '%0' in octal constant">;
			
--- a/include/clang/Lex/Lexer.h
+++ b/include/clang/Lex/Lexer.h
@@ -437,6 +437,11 @@ private:
 
				   ///
			
 
				   void LexTokenInternal(Token &Result);
			
 
				 
			
 
				+  /// Given that a token begins with the Unicode character \p C, figure out
			
 
				+  /// what kind of token it is and dispatch to the appropriate lexing helper
			
 
				+  /// function.
			
 
				+  void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
			
 
				+
			
 
				   /// FormTokenWithChars - When we lex a token, we have identified a span
			
 
				   /// starting at BufferPtr, going to TokEnd that forms the token.  This method
			
 
				   /// takes that range and assigns it to the token as its location and size.  In
			
@@ -579,6 +584,21 @@ private:
 
				   void cutOffLexing() { BufferPtr = BufferEnd; }
			
 
				 
			
 
				   bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
			
 
				+
			
 
				+
			
 
				+  /// Read a universal character name.
			
 
				+  ///
			
 
				+  /// \param CurPtr The position in the source buffer after the initial '\'.
			
 
				+  ///               If the UCN is syntactically well-formed (but not necessarily
			
 
				+  ///               valid), this parameter will be updated to point to the
			
 
				+  ///               character after the UCN.
			
 
				+  /// \param SlashLoc The position in the source buffer of the '\'.
			
 
				+  /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
			
 
				+  ///            and handle token formation in the caller.
			
 
				+  ///
			
 
				+  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
			
 
				+  ///         invalid.
			
 
				+  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
			
 
				 };
			
 
				 
			
 
				 
			
--- a/include/clang/Lex/Token.h
+++ b/include/clang/Lex/Token.h
@@ -74,9 +74,10 @@ public:
 
				     StartOfLine   = 0x01,  // At start of line or only after whitespace.
			
 
				     LeadingSpace  = 0x02,  // Whitespace exists before this token.
			
 
				     DisableExpand = 0x04,  // This identifier may never be macro expanded.
			
 
				-    NeedsCleaning = 0x08,   // Contained an escaped newline or trigraph.
			
 
				+    NeedsCleaning = 0x08,  // Contained an escaped newline or trigraph.
			
 
				     LeadingEmptyMacro = 0x10, // Empty macro exists before this token.
			
 
				-    HasUDSuffix = 0x20     // This string or character literal has a ud-suffix.
			
 
				+    HasUDSuffix = 0x20,    // This string or character literal has a ud-suffix.
			
 
				+    HasUCN = 0x40          // This identifier contains a UCN.
			
 
				   };
			
 
				 
			
 
				   tok::TokenKind getKind() const { return (tok::TokenKind)Kind; }
			
@@ -257,6 +258,9 @@ public:
 
				   /// \brief Return true if this token is a string or character literal which
			
 
				   /// has a ud-suffix.
			
 
				   bool hasUDSuffix() const { return (Flags & HasUDSuffix) ? true : false; }
			
 
				+
			
 
				+  /// Returns true if this token contains a universal character name.
			
 
				+  bool hasUCN() const { return (Flags & HasUCN) ? true : false; }
			
 
				 };
			
 
				 
			
 
				 /// \brief Information about the conditional stack (\#if directives)
			
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -25,11 +25,13 @@
 
				 //===----------------------------------------------------------------------===//
			
 
				 
			
 
				 #include "clang/Lex/Lexer.h"
			
 
				+#include "clang/Basic/ConvertUTF.h"
			
 
				 #include "clang/Basic/SourceManager.h"
			
 
				 #include "clang/Lex/CodeCompletionHandler.h"
			
 
				 #include "clang/Lex/LexDiagnostic.h"
			
 
				 #include "clang/Lex/Preprocessor.h"
			
 
				 #include "llvm/ADT/STLExtras.h"
			
 
				+#include "llvm/ADT/StringExtras.h"
			
 
				 #include "llvm/ADT/StringSwitch.h"
			
 
				 #include "llvm/Support/Compiler.h"
			
 
				 #include "llvm/Support/MemoryBuffer.h"
			
@@ -371,10 +373,12 @@ unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
 
				   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
			
 
				   if (Tok.is(tok::raw_identifier))
			
 
				     TokStart = Tok.getRawIdentifierData();
			
 
				-  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
			
 
				-    // Just return the string from the identifier table, which is very quick.
			
 
				-    Buffer = II->getNameStart();
			
 
				-    return II->getLength();
			
 
				+  else if (!Tok.hasUCN()) {
			
 
				+    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
			
 
				+      // Just return the string from the identifier table, which is very quick.
			
 
				+      Buffer = II->getNameStart();
			
 
				+      return II->getLength();
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   // NOTE: this can be checked even after testing for an IdentifierInfo.
			
@@ -1376,7 +1380,6 @@ SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
 
				 ///   2. If this is an escaped newline (potentially with whitespace between
			
 
				 ///      the backslash and newline), implicitly skip the newline and return
			
 
				 ///      the char after it.
			
 
				-///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
			
 
				 ///
			
 
				 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
			
 
				 /// know that we can accumulate into Size, and that we have already incremented
			
@@ -1509,6 +1512,77 @@ void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
 
				   IsAtStartOfLine = StartOfLine;
			
 
				 }
			
 
				 
			
 
				namespace {
				  /// An inclusive range of Unicode code points, [Lower, Upper].
				  struct UCNCharRange {
				    uint32_t Lower;
				    uint32_t Upper;
				  };

				  // Code points accepted inside identifiers, per C11 D.1 and
				  // C++11 [charname.allowed]. The entries are kept in ascending,
				  // non-overlapping order because isAllowedIDChar binary-searches them.
				  // The numbered comments presumably mirror the numbered groups of C11 D.1.
				  // FIXME: C99 and C++03 each have a different set of allowed UCNs.
				  const UCNCharRange UCNAllowedCharRanges[] = {
				    // 1
				    { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA },
				    { 0x00AD, 0x00AD }, { 0x00AF, 0x00AF },
				    { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
				    { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 },
				    { 0x00D8, 0x00F6 }, { 0x00F8, 0x00FF },
				    // 2
				    { 0x0100, 0x167F }, { 0x1681, 0x180D },
				    { 0x180F, 0x1FFF },
				    // 3
				    { 0x200B, 0x200D }, { 0x202A, 0x202E },
				    { 0x203F, 0x2040 }, { 0x2054, 0x2054 },
				    { 0x2060, 0x206F },
				    // 4
				    { 0x2070, 0x218F }, { 0x2460, 0x24FF },
				    { 0x2776, 0x2793 }, { 0x2C00, 0x2DFF },
				    { 0x2E80, 0x2FFF },
				    // 5
				    { 0x3004, 0x3007 }, { 0x3021, 0x302F },
				    { 0x3031, 0x303F },
				    // 6
				    { 0x3040, 0xD7FF },
				    // 7
				    { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF },
				    { 0xFDF0, 0xFE44 }, { 0xFE47, 0xFFFD },
				    // 8
				    { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD },
				    { 0x30000, 0x3FFFD }, { 0x40000, 0x4FFFD },
				    { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
				    { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD },
				    { 0x90000, 0x9FFFD }, { 0xA0000, 0xAFFFD },
				    { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
				    { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD }
				  };
				}

				/// Return true if code point \p c falls inside one of the ranges listed in
				/// UCNAllowedCharRanges.
				static bool isAllowedIDChar(uint32_t c) {
				  // Half-open window [First, Last) of table entries still under
				  // consideration.
				  const UCNCharRange *First = UCNAllowedCharRanges;
				  const UCNCharRange *Last =
				    First + sizeof(UCNAllowedCharRanges) / sizeof(UCNAllowedCharRanges[0]);

				  while (First != Last) {
				    const UCNCharRange *Probe = First + (Last - First) / 2;

				    if (c > Probe->Upper) {
				      // Everything up to and including Probe lies below c.
				      First = Probe + 1;
				    } else if (c < Probe->Lower) {
				      // Probe and everything after it lies above c.
				      Last = Probe;
				    } else {
				      return true;
				    }
				  }

				  return false;
				}
			
 
				+
			
 
				/// Return false if code point \p c lies in one of the four ranges that may
				/// not begin an identifier, true otherwise.
				static bool isAllowedInitiallyIDChar(uint32_t c) {
				  // C11 D.2, C++11 [charname.disallowed]
				  // FIXME: C99 only forbids "digits", presumably as described in C99 Annex D.
				  // FIXME: C++03 does not forbid any initial characters.
				  if (c >= 0x0300 && c <= 0x036F)
				    return false;
				  if (c >= 0x1DC0 && c <= 0x1DFF)
				    return false;
				  if (c >= 0x20D0 && c <= 0x20FF)
				    return false;
				  if (c >= 0xFE20 && c <= 0xFE2F)
				    return false;

				  return true;
				}
			
 
				+
			
 
				/// Return true if byte \p C has its high bit clear, i.e. it is in the 7-bit
				/// ASCII range.
				static inline bool isASCII(char C) {
				  return (static_cast<unsigned char>(C) & 0x80u) == 0;
				}
			
 
				+
			
 
				+
			
 
				 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
			
 
				   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
			
 
				   unsigned Size;
			
@@ -1520,11 +1594,11 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
 
				 
			
 
				   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
			
 
				   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
			
 
				-  // FIXME: UCNs.
			
 
				   //
			
 
				   // TODO: Could merge these checks into a CharInfo flag to make the comparison
			
 
				   // cheaper
			
 
				-  if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) {
			
 
				+  if (isASCII(C) && C != '\\' && C != '?' &&
			
 
				+      (C != '$' || !LangOpts.DollarIdents)) {
			
 
				 FinishIdentifier:
			
 
				     const char *IdStart = BufferPtr;
			
 
				     FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
			
@@ -1561,8 +1635,38 @@ FinishIdentifier:
 
				       CurPtr = ConsumeChar(CurPtr, Size, Result);
			
 
				       C = getCharAndSize(CurPtr, Size);
			
 
				       continue;
			
 
				-    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
			
 
				-      // Found end of identifier.
			
 
				+
			
 
				+    } else if (C == '\\') {
			
 
				+      const char *UCNPtr = CurPtr + Size;
			
 
				+      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
			
 
				+      if (CodePoint == 0 || !isAllowedIDChar(CodePoint))
			
 
				+        goto FinishIdentifier;
			
 
				+
			
 
				+      Result.setFlag(Token::HasUCN);
			
 
				+      if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
			
 
				+          (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
			
 
				+        CurPtr = UCNPtr;
			
 
				+      else
			
 
				+        while (CurPtr != UCNPtr)
			
 
				+          (void)getAndAdvanceChar(CurPtr, Result);
			
 
				+
			
 
				+      C = getCharAndSize(CurPtr, Size);
			
 
				+      continue;
			
 
				+    } else if (!isASCII(C)) {
			
 
				+      const char *UnicodePtr = CurPtr;
			
 
				+      UTF32 CodePoint;
			
 
				+      ConversionResult Result = convertUTF8Sequence((const UTF8 **)&UnicodePtr,
			
 
				+                                                    (const UTF8 *)BufferEnd,
			
 
				+                                                    &CodePoint,
			
 
				+                                                    strictConversion);
			
 
				+      if (Result != conversionOK ||
			
 
				+          !isAllowedIDChar(static_cast<uint32_t>(CodePoint)))
			
 
				+        goto FinishIdentifier;
			
 
				+
			
 
				+      CurPtr = UnicodePtr;
			
 
				+      C = getCharAndSize(CurPtr, Size);
			
 
				+      continue;
			
 
				+    } else if (!isIdentifierBody(C)) {
			
 
				       goto FinishIdentifier;
			
 
				     }
			
 
				 
			
@@ -1570,7 +1674,7 @@ FinishIdentifier:
 
				     CurPtr = ConsumeChar(CurPtr, Size, Result);
			
 
				 
			
 
				     C = getCharAndSize(CurPtr, Size);
			
 
				-    while (isIdentifierBody(C)) { // FIXME: UCNs.
			
 
				+    while (isIdentifierBody(C)) {
			
 
				       CurPtr = ConsumeChar(CurPtr, Size, Result);
			
 
				       C = getCharAndSize(CurPtr, Size);
			
 
				     }
			
@@ -2592,6 +2696,135 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
 
				   return false;
			
 
				 }
			
 
				 
			
 
				+uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
			
 
				+                           Token *Result) {
			
 
				+  assert(LangOpts.CPlusPlus || LangOpts.C99);
			
 
				+
			
 
				+  unsigned CharSize;
			
 
				+  char Kind = getCharAndSize(StartPtr, CharSize);
			
 
				+
			
 
				+  unsigned NumHexDigits;
			
 
				+  if (Kind == 'u')
			
 
				+    NumHexDigits = 4;
			
 
				+  else if (Kind == 'U')
			
 
				+    NumHexDigits = 8;
			
 
				+  else
			
 
				+    return 0;
			
 
				+
			
 
				+  const char *CurPtr = StartPtr + CharSize;
			
 
				+  const char *KindLoc = &CurPtr[-1];
			
 
				+
			
 
				+  uint32_t CodePoint = 0;
			
 
				+  for (unsigned i = 0; i < NumHexDigits; ++i) {
			
 
				+    char C = getCharAndSize(CurPtr, CharSize);
			
 
				+
			
 
				+    unsigned Value = llvm::hexDigitValue(C);
			
 
				+    if (Value == -1U) {
			
 
				+      if (Result && !isLexingRawMode()) {
			
 
				+        if (i == 0) {
			
 
				+          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
			
 
				+            << StringRef(KindLoc, 1);
			
 
				+        } else {
			
 
				+          // FIXME: if i == 4 and NumHexDigits == 8, suggest a fixit to \u.
			
 
				+          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
			
 
				+        }
			
 
				+      }
			
 
				+      
			
 
				+      return 0;
			
 
				+    }
			
 
				+
			
 
				+    CodePoint <<= 4;
			
 
				+    CodePoint += Value;
			
 
				+
			
 
				+    CurPtr += CharSize;
			
 
				+  }
			
 
				+
			
 
				+  if (Result) {
			
 
				+    Result->setFlag(Token::HasUCN);
			
 
				+    if (CurPtr - StartPtr == NumHexDigits + 2)
			
 
				+      StartPtr = CurPtr;
			
 
				+    else
			
 
				+      while (StartPtr != CurPtr)
			
 
				+        (void)getAndAdvanceChar(StartPtr, *Result);
			
 
				+  } else {
			
 
				+    StartPtr = CurPtr;
			
 
				+  }
			
 
				+
			
 
				+  // C99 6.4.3p2: A universal character name shall not specify a character whose
			
 
				+  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
			
 
				+  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
			
 
				+  // C++11 [lex.charset]p2: If the hexadecimal value for a
			
 
				+  //   universal-character-name corresponds to a surrogate code point (in the
			
 
				+  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
			
 
				+  //   if the hexadecimal value for a universal-character-name outside the
			
 
				+  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
			
 
				+  //   string literal corresponds to a control character (in either of the
			
 
				+  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
			
 
				+  //   basic source character set, the program is ill-formed.
			
 
				+  if (CodePoint < 0xA0) {
			
 
				+    if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
			
 
				+      return CodePoint;
			
 
				+
			
 
				+    // We don't use isLexingRawMode() here because we need to warn about bad
			
 
				+    // UCNs even when skipping preprocessing tokens in a #if block.
			
 
				+    if (Result && PP) {
			
 
				+      if (CodePoint < 0x20 || CodePoint >= 0x7F)
			
 
				+        Diag(BufferPtr, diag::err_ucn_control_character);
			
 
				+      else {
			
 
				+        char C = static_cast<char>(CodePoint);
			
 
				+        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    return 0;
			
 
				+    
			
 
				+  } else if ((!LangOpts.CPlusPlus || LangOpts.CPlusPlus11) &&
			
 
				+             (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)) {
			
 
				+    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
			
 
				+    // We don't use isLexingRawMode() here because we need to warn about bad
			
 
				+    // UCNs even when skipping preprocessing tokens in a #if block.
			
 
				+    if (Result && PP)
			
 
				+      Diag(BufferPtr, diag::err_ucn_escape_invalid);
			
 
				+    return 0;
			
 
				+  }
			
 
				+
			
 
				+  return CodePoint;
			
 
				+}
			
 
				+
			
 
				+void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
			
 
				+  if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) {
			
 
				+    MIOpt.ReadToken();
			
 
				+    return LexIdentifier(Result, CurPtr);
			
 
				+  }
			
 
				+
			
 
				+  if (!isASCII(*BufferPtr) && !isAllowedIDChar(C)) {
			
 
				+    // Non-ASCII characters tend to creep into source code unintentionally.
			
 
				+    // Instead of letting the parser complain about the unknown token,
			
 
				+    // just drop the character.
			
 
				+    // Note that we can /only/ do this when the non-ASCII character is actually
			
 
				+    // spelled as Unicode, not written as a UCN. The standard requires that
			
 
				+    // we not throw away any possible preprocessor tokens, but there's a
			
 
				+    // loophole in the mapping of Unicode characters to basic character set
			
 
				+    // characters that allows us to map these particular characters to, say,
			
 
				+    // whitespace.
			
 
				+    if (!isLexingRawMode()) {
			
 
				+      CharSourceRange CharRange =
			
 
				+        CharSourceRange::getCharRange(getSourceLocation(),
			
 
				+                                      getSourceLocation(CurPtr));
			
 
				+      Diag(BufferPtr, diag::err_non_ascii)
			
 
				+        << FixItHint::CreateRemoval(CharRange);
			
 
				+    }
			
 
				+
			
 
				+    BufferPtr = CurPtr;
			
 
				+    return LexTokenInternal(Result);
			
 
				+  }
			
 
				+
			
 
				+  // Otherwise, we have an explicit UCN or a character that's unlikely to show
			
 
				+  // up by accident.
			
 
				+  MIOpt.ReadToken();
			
 
				+  FormTokenWithChars(Result, CurPtr, tok::unknown);
			
 
				+}
			
 
				+
			
 
				 
			
 
				 /// LexTokenInternal - This implements a simple C family lexer.  It is an
			
 
				 /// extremely performance critical piece of code.  This assumes that the buffer
			
@@ -3243,12 +3476,41 @@ LexNextToken:
 
				       Kind = tok::unknown;
			
 
				     break;
			
 
				 
			
 
				+  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
			
 
				   case '\\':
			
 
				-    // FIXME: UCN's.
			
 
				-    // FALL THROUGH.
			
 
				-  default:
			
 
				+    if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
			
 
				+      return LexUnicode(Result, CodePoint, CurPtr);
			
 
				+
			
 
				     Kind = tok::unknown;
			
 
				     break;
			
 
				+
			
 
				+  default: {
			
 
				+    if (isASCII(Char)) {
			
 
				+      Kind = tok::unknown;
			
 
				+      break;
			
 
				+    }
			
 
				+
			
 
				+    UTF32 CodePoint;
			
 
				+
			
 
				+    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
			
 
				+    // an escaped newline.
			
 
				+    --CurPtr;
			
 
				+    ConversionResult Status = convertUTF8Sequence((const UTF8 **)&CurPtr,
			
 
				+                                                  (const UTF8 *)BufferEnd,
			
 
				+                                                  &CodePoint,
			
 
				+                                                  strictConversion);
			
 
				+    if (Status == conversionOK)
			
 
				+      return LexUnicode(Result, CodePoint, CurPtr);
			
 
				+    
			
 
				+    // Non-ASCII characters tend to creep into source code unintentionally.
			
 
				+    // Instead of letting the parser complain about the unknown token,
			
 
				+    // just warn that we don't have valid UTF-8, then drop the character.
			
 
				+    if (!isLexingRawMode())
			
 
				+      Diag(CurPtr, diag::err_invalid_utf8);
			
 
				+
			
 
				+    BufferPtr = CurPtr+1;
			
 
				+    goto LexNextToken;
			
 
				+  }
			
 
				   }
			
 
				 
			
 
				   // Notify MIOpt that we read a non-whitespace/non-comment token.
			
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -27,6 +27,7 @@
 
				 
			
 
				 #include "clang/Lex/Preprocessor.h"
			
 
				 #include "MacroArgs.h"
			
 
				+#include "clang/Basic/ConvertUTF.h"
			
 
				 #include "clang/Basic/FileManager.h"
			
 
				 #include "clang/Basic/SourceManager.h"
			
 
				 #include "clang/Basic/TargetInfo.h"
			
@@ -43,6 +44,8 @@
 
				 #include "clang/Lex/ScratchBuffer.h"
			
 
				 #include "llvm/ADT/APFloat.h"
			
 
				 #include "llvm/ADT/SmallString.h"
			
 
				+#include "llvm/ADT/STLExtras.h"
			
 
				+#include "llvm/ADT/StringExtras.h"
			
 
				 #include "llvm/Support/Capacity.h"
			
 
				 #include "llvm/Support/MemoryBuffer.h"
			
 
				 #include "llvm/Support/raw_ostream.h"
			
@@ -396,7 +399,7 @@ StringRef Preprocessor::getSpelling(const Token &Tok,
 
				                                           SmallVectorImpl<char> &Buffer,
			
 
				                                           bool *Invalid) const {
			
 
				   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
			
 
				-  if (Tok.isNot(tok::raw_identifier)) {
			
 
				+  if (Tok.isNot(tok::raw_identifier) && !Tok.hasUCN()) {
			
 
				     // Try the fast path.
			
 
				     if (const IdentifierInfo *II = Tok.getIdentifierInfo())
			
 
				       return II->getName();
			
@@ -494,6 +497,48 @@ void Preprocessor::EndSourceFile() {
 
				 // Lexer Event Handling.
			
 
				 //===----------------------------------------------------------------------===//
			
 
				 
			
 
				+static void appendCodePoint(unsigned Codepoint,
			
 
				+                            llvm::SmallVectorImpl<char> &Str) {
			
 
				+  char ResultBuf[4];
			
 
				+  char *ResultPtr = ResultBuf;
			
 
				+  bool Res = ConvertCodePointToUTF8(Codepoint, ResultPtr);
			
 
				+  (void)Res;
			
 
				+  assert(Res && "Unexpected conversion failure");
			
 
				+  Str.append(ResultBuf, ResultPtr);
			
 
				+}
			
 
				+
			
 
				+static void expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
			
 
				+  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
			
 
				+    if (*I != '\\') {
			
 
				+      Buf.push_back(*I);
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    ++I;
			
 
				+    assert(*I == 'u' || *I == 'U');
			
 
				+
			
 
				+    unsigned NumHexDigits;
			
 
				+    if (*I == 'u')
			
 
				+      NumHexDigits = 4;
			
 
				+    else
			
 
				+      NumHexDigits = 8;
			
 
				+
			
 
				+    assert(I + NumHexDigits <= E);
			
 
				+
			
 
				+    uint32_t CodePoint = 0;
			
 
				+    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
			
 
				+      unsigned Value = llvm::hexDigitValue(*I);
			
 
				+      assert(Value != -1U);
			
 
				+
			
 
				+      CodePoint <<= 4;
			
 
				+      CodePoint += Value;
			
 
				+    }
			
 
				+
			
 
				+    appendCodePoint(CodePoint, Buf);
			
 
				+    --I;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				 /// LookUpIdentifierInfo - Given a tok::raw_identifier token, look up the
			
 
				 /// identifier information for the token and install it into the token,
			
 
				 /// updating the token kind accordingly.
			
@@ -502,15 +547,22 @@ IdentifierInfo *Preprocessor::LookUpIdentifierInfo(Token &Identifier) const {
 
				 
			
 
				   // Look up this token, see if it is a macro, or if it is a language keyword.
			
 
				   IdentifierInfo *II;
			
 
				-  if (!Identifier.needsCleaning()) {
			
 
				+  if (!Identifier.needsCleaning() && !Identifier.hasUCN()) {
			
 
				     // No cleaning needed, just use the characters from the lexed buffer.
			
 
				     II = getIdentifierInfo(StringRef(Identifier.getRawIdentifierData(),
			
 
				-                                           Identifier.getLength()));
			
 
				+                                     Identifier.getLength()));
			
 
				   } else {
			
 
				     // Cleaning needed, alloca a buffer, clean into it, then use the buffer.
			
 
				     SmallString<64> IdentifierBuffer;
			
 
				     StringRef CleanedStr = getSpelling(Identifier, IdentifierBuffer);
			
 
				-    II = getIdentifierInfo(CleanedStr);
			
 
				+
			
 
				+    if (Identifier.hasUCN()) {
			
 
				+      SmallString<64> UCNIdentifierBuffer;
			
 
				+      expandUCNs(UCNIdentifierBuffer, CleanedStr);
			
 
				+      II = getIdentifierInfo(UCNIdentifierBuffer);
			
 
				+    } else {
			
 
				+      II = getIdentifierInfo(CleanedStr);
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   // Update the token info (identifier info and appropriate token kind).
			
--- a/test/CXX/over/over.oper/over.literal/p8.cpp
+++ b/test/CXX/over/over.oper/over.literal/p8.cpp
@@ -7,8 +7,7 @@ namespace std {
 
				 
			
 
				 void operator "" _km(long double); // ok
			
 
				 string operator "" _i18n(const char*, std::size_t); // ok
			
 
				-// FIXME: This should be accepted once we support UCNs
			
 
				-template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-error {{expected identifier}}
			
 
				+template<char...> int operator "" \u03C0(); // ok, UCN for lowercase pi // expected-warning {{reserved}}
			
 
				 float operator ""E(const char *); // expected-error {{invalid suffix on literal}} expected-warning {{reserved}}
			
 
				 float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}}
			
 
				 string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}}
			
--- a/test/CodeGen/ucn-identifiers.c
+++ b/test/CodeGen/ucn-identifiers.c
@@ -0,0 +1,14 @@
 
				+// RUN: %clang_cc1 %s -emit-llvm -o /dev/null
			
 
				+// RUN: %clang_cc1 %s -emit-llvm -o /dev/null -x c++
			
 
				+// This file contains UTF-8; please do not fix!
			
 
				+
			
 
				+
			
 
				+extern void \u00FCber(int);
			
 
				+extern void \U000000FCber(int); // redeclaration, no warning
			
 
				+
			
 
				+void goodCalls() {
			
 
				+  \u00FCber(0);
			
 
				+  \u00fcber(1);
			
 
				+  über(2);
			
 
				+  \U000000FCber(3);
			
 
				+}
			
--- a/test/FixIt/fixit-unicode.c
+++ b/test/FixIt/fixit-unicode.c
@@ -8,13 +8,15 @@ struct Foo {
 
				 // PR13312
			
 
				 void test1() {
			
 
				   struct Foo foo;
			
 
				-  (&foo)☃>bar = 42;
			
 
				+  foo.bar = 42☃
			
 
				+// CHECK: error: non-ASCII characters are not allowed outside of literals and identifiers
			
 
				+// CHECK: {{^              \^}}
			
 
				 // CHECK: error: expected ';' after expression
			
 
				 // Make sure we emit the fixit right in front of the snowman.
			
 
				-// CHECK: {{^        \^}}
			
 
				-// CHECK: {{^        ;}}
			
 
				+// CHECK: {{^              \^}}
			
 
				+// CHECK: {{^              ;}}
			
 
				 
			
 
				-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{11:9-11:9}:";"
			
 
				+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-8]]:15-[[@LINE-8]]:15}:";"
			
 
				 }
			
 
				 
			
 
				 
			
@@ -29,5 +31,5 @@ void test2() {
 
				 // because different systems will render the delta differently (either as a
			
 
				 // character, or as <U+2206>.) The fixit should line up with the %d regardless.
			
 
				 
			
 
				-// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{23:16-23:18}:"%ld"
			
 
				+// CHECK-MACHINE: fix-it:"{{.*}}fixit-unicode.c":{[[@LINE-9]]:16-[[@LINE-9]]:18}:"%ld"
			
 
				 }
			
--- a/test/Lexer/utf8-invalid.c
+++ b/test/Lexer/utf8-invalid.c
@@ -0,0 +1,6 @@
 
				+// RUN: %clang_cc1 -fsyntax-only -verify %s
			
 
				+
			
 
				+// Note: this file contains invalid UTF-8 before the variable name in the
			
 
				+// next line. Please do not fix!
			
 
				+
			
 
				+extern int ‚x; // expected-error{{source file is not valid UTF-8}}
			
--- a/test/Preprocessor/ucn-pp-identifier.c
+++ b/test/Preprocessor/ucn-pp-identifier.c
@@ -0,0 +1,97 @@
 
				+// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify -Wundef
			
 
				+// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify -Wundef
			
 
				+
			
 
				+#define \u00FC
			
 
				+#define a\u00FD() 0
			
 
				+#ifndef \u00FC
			
 
				+#error "This should never happen"
			
 
				+#endif
			
 
				+
			
 
				+#if a\u00FD()
			
 
				+#error "This should never happen"
			
 
				+#endif
			
 
				+
			
 
				+#if a\U000000FD()
			
 
				+#error "This should never happen"
			
 
				+#endif
			
 
				+
			
 
				+#if \uarecool // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
			
 
				+#endif
			
 
				+#if \uwerecool // expected-warning{{\u used with no following hex digits; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
			
 
				+#endif
			
 
				+#if \U0001000  // expected-warning{{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{invalid token at start of a preprocessor expression}}
			
 
				+#endif
			
 
				+
			
 
				+// Make sure we reject disallowed UCNs
			
 
				+#define \ufffe // expected-error {{macro names must be identifiers}}
			
 
				+#define \U10000000  // expected-error {{macro names must be identifiers}}
			
 
				+#define \u0061  // expected-error {{character 'a' cannot be specified by a universal character name}} expected-error {{macro names must be identifiers}}
			
 
				+
			
 
				+// FIXME: Not clear what our behavior should be here; \u0024 is "$".
			
 
				+#define a\u0024  // expected-warning {{whitespace}}
			
 
				+
			
 
				+#if \u0110 // expected-warning {{is not defined, evaluates to 0}}
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+#define \u0110 1 / 0
			
 
				+#if \u0110 // expected-error {{division by zero in preprocessor expression}}
			
 
				+#endif
			
 
				+
			
 
				+#define STRINGIZE(X) # X
			
 
				+
			
 
				+extern int check_size[sizeof(STRINGIZE(\u0112)) == 3 ? 1 : -1];
			
 
				+
			
 
				+// Check that we still diagnose disallowed UCNs in #if 0 blocks.
			
 
				+// C99 5.1.1.2p1 and C++11 [lex.phases]p1 dictate that preprocessor tokens are
			
 
				+// formed before directives are parsed.
			
 
				+// expected-error@+4 {{character 'a' cannot be specified by a universal character name}}
			
 
				+#if 0
			
 
				+#define \ufffe // okay
			
 
				+#define \U10000000 // okay
			
 
				+#define \u0061 // error, but -verify only looks at comments outside #if 0
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+// A UCN formed by token pasting is undefined in both C99 and C++.
			
 
				+// Right now we don't do anything special, which causes us to coincidentally
			
 
				+// accept the first case below but reject the second two.
			
 
				+#define PASTE(A, B) A ## B
			
 
				+extern int PASTE(\, u00FD);
			
 
				+extern int PASTE(\u, 00FD); // expected-warning{{\u used with no following hex digits}}
			
 
				+extern int PASTE(\u0, 0FD); // expected-warning{{incomplete universal character name}}
			
 
				+#ifdef __cplusplus
			
 
				+// expected-error@-3 {{expected unqualified-id}}
			
 
				+// expected-error@-3 {{expected unqualified-id}}
			
 
				+#else
			
 
				+// expected-error@-6 {{expected identifier}}
			
 
				+// expected-error@-6 {{expected identifier}}
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+// A UCN produced by line splicing is valid in C99 but undefined in C++.
			
 
				+// Since undefined behavior can do anything including working as intended,
			
 
				+// we just accept it in C++ as well.
			
 
				+#define newline_1_\u00F\
			
 
				+C 1
			
 
				+#define newline_2_\u00\
			
 
				+F\
			
 
				+C 1
			
 
				+#define newline_3_\u\
			
 
				+00\
			
 
				+FC 1
			
 
				+#define newline_4_\\
			
 
				+u00FC 1
			
 
				+#define newline_5_\\
			
 
				+u\
			
 
				+\
			
 
				+0\
			
 
				+0\
			
 
				+F\
			
 
				+C 1
			
 
				+
			
 
				+#if (newline_1_\u00FC && newline_2_\u00FC && newline_3_\u00FC && \
			
 
				+     newline_4_\u00FC && newline_5_\u00FC)
			
 
				+#else
			
 
				+#error "Line splicing failed to produce UCNs"
			
 
				+#endif
			
--- a/test/Sema/ucn-identifiers.c
+++ b/test/Sema/ucn-identifiers.c
@@ -0,0 +1,35 @@
 
				+// RUN: %clang_cc1 %s -verify -fsyntax-only -pedantic
			
 
				+// RUN: %clang_cc1 %s -verify -fsyntax-only -x c++ -pedantic
			
 
				+
			
 
				+// This file contains UTF-8; please do not fix!
			
 
				+
			
 
				+
			
 
				+extern void \u00FCber(int);
			
 
				+extern void \U000000FCber(int); // redeclaration, no warning
			
 
				+#ifdef __cplusplus
			
 
				+// expected-note@-2 + {{candidate function not viable}}
			
 
				+#else
			
 
				+// expected-note@-4 + {{declared here}}
			
 
				+#endif
			
 
				+
			
 
				+void goodCalls() {
			
 
				+  \u00FCber(0);
			
 
				+  \u00fcber(1);
			
 
				+  über(2);
			
 
				+  \U000000FCber(3);
			
 
				+}
			
 
				+
			
 
				+void badCalls() {
			
 
				+  \u00FCber(0.5); // expected-warning{{implicit conversion from 'double' to 'int'}}
			
 
				+  \u00fcber = 0; // expected-error{{non-object type 'void (int)' is not assignable}}
			
 
				+
			
 
				+  über(1, 2);
			
 
				+  \U000000FCber(); 
			
 
				+#ifdef __cplusplus
			
 
				+  // expected-error@-3 {{no matching function}}
			
 
				+  // expected-error@-3 {{no matching function}}
			
 
				+#else
			
 
				+  // expected-error@-6 {{too many arguments to function call, expected 1, have 2}}
			
 
				+  // expected-error@-6 {{too few arguments to function call, expected 1, have 0}}
			
 
				+#endif
			
 
				+}