StringRef.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
  1. //===-- StringRef.cpp - Lightweight String References ---------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. #include "llvm/ADT/StringRef.h"
  10. #include "llvm/ADT/APInt.h"
  11. #include <bitset>
  12. using namespace llvm;
  13. // MSVC emits references to this into the translation units which reference it.
  14. #ifndef _MSC_VER
  15. const size_t StringRef::npos;
  16. #endif
  17. static char ascii_tolower(char x) {
  18. if (x >= 'A' && x <= 'Z')
  19. return x - 'A' + 'a';
  20. return x;
  21. }
  22. static bool ascii_isdigit(char x) {
  23. return x >= '0' && x <= '9';
  24. }
  25. /// compare_lower - Compare strings, ignoring case.
  26. int StringRef::compare_lower(StringRef RHS) const {
  27. for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
  28. unsigned char LHC = ascii_tolower(Data[I]);
  29. unsigned char RHC = ascii_tolower(RHS.Data[I]);
  30. if (LHC != RHC)
  31. return LHC < RHC ? -1 : 1;
  32. }
  33. if (Length == RHS.Length)
  34. return 0;
  35. return Length < RHS.Length ? -1 : 1;
  36. }
  37. /// compare_numeric - Compare strings, handle embedded numbers.
  38. int StringRef::compare_numeric(StringRef RHS) const {
  39. for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
  40. if (Data[I] == RHS.Data[I])
  41. continue;
  42. if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
  43. // The longer sequence of numbers is larger. This doesn't really handle
  44. // prefixed zeros well.
  45. for (size_t J = I+1; J != E+1; ++J) {
  46. bool ld = J < Length && ascii_isdigit(Data[J]);
  47. bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
  48. if (ld != rd)
  49. return rd ? -1 : 1;
  50. if (!rd)
  51. break;
  52. }
  53. }
  54. return Data[I] < RHS.Data[I] ? -1 : 1;
  55. }
  56. if (Length == RHS.Length)
  57. return 0;
  58. return Length < RHS.Length ? -1 : 1;
  59. }
  60. // Compute the edit distance between the two given strings.
  61. unsigned StringRef::edit_distance(llvm::StringRef Other,
  62. bool AllowReplacements) {
  63. // The algorithm implemented below is the "classic"
  64. // dynamic-programming algorithm for computing the Levenshtein
  65. // distance, which is described here:
  66. //
  67. // http://en.wikipedia.org/wiki/Levenshtein_distance
  68. //
  69. // Although the algorithm is typically described using an m x n
  70. // array, only two rows are used at a time, so this implemenation
  71. // just keeps two separate vectors for those two rows.
  72. size_type m = size();
  73. size_type n = Other.size();
  74. const unsigned SmallBufferSize = 64;
  75. unsigned SmallBuffer[SmallBufferSize];
  76. unsigned *Allocated = 0;
  77. unsigned *previous = SmallBuffer;
  78. if (2*(n + 1) > SmallBufferSize)
  79. Allocated = previous = new unsigned [2*(n+1)];
  80. unsigned *current = previous + (n + 1);
  81. for (unsigned i = 0; i <= n; ++i)
  82. previous[i] = i;
  83. for (size_type y = 1; y <= m; ++y) {
  84. current[0] = y;
  85. for (size_type x = 1; x <= n; ++x) {
  86. if (AllowReplacements) {
  87. current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u),
  88. min(current[x-1], previous[x])+1);
  89. }
  90. else {
  91. if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1];
  92. else current[x] = min(current[x-1], previous[x]) + 1;
  93. }
  94. }
  95. unsigned *tmp = current;
  96. current = previous;
  97. previous = tmp;
  98. }
  99. unsigned Result = previous[n];
  100. delete [] Allocated;
  101. return Result;
  102. }
  103. //===----------------------------------------------------------------------===//
  104. // String Searching
  105. //===----------------------------------------------------------------------===//
  106. /// find - Search for the first string \arg Str in the string.
  107. ///
  108. /// \return - The index of the first occurence of \arg Str, or npos if not
  109. /// found.
  110. size_t StringRef::find(StringRef Str, size_t From) const {
  111. size_t N = Str.size();
  112. if (N > Length)
  113. return npos;
  114. for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
  115. if (substr(i, N).equals(Str))
  116. return i;
  117. return npos;
  118. }
  119. /// rfind - Search for the last string \arg Str in the string.
  120. ///
  121. /// \return - The index of the last occurence of \arg Str, or npos if not
  122. /// found.
  123. size_t StringRef::rfind(StringRef Str) const {
  124. size_t N = Str.size();
  125. if (N > Length)
  126. return npos;
  127. for (size_t i = Length - N + 1, e = 0; i != e;) {
  128. --i;
  129. if (substr(i, N).equals(Str))
  130. return i;
  131. }
  132. return npos;
  133. }
  134. /// find_first_of - Find the first character in the string that is in \arg
  135. /// Chars, or npos if not found.
  136. ///
  137. /// Note: O(size() + Chars.size())
  138. StringRef::size_type StringRef::find_first_of(StringRef Chars,
  139. size_t From) const {
  140. std::bitset<1 << CHAR_BIT> CharBits;
  141. for (size_type i = 0; i != Chars.size(); ++i)
  142. CharBits.set((unsigned char)Chars[i]);
  143. for (size_type i = min(From, Length), e = Length; i != e; ++i)
  144. if (CharBits.test((unsigned char)Data[i]))
  145. return i;
  146. return npos;
  147. }
  148. /// find_first_not_of - Find the first character in the string that is not
  149. /// \arg C or npos if not found.
  150. StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
  151. for (size_type i = min(From, Length), e = Length; i != e; ++i)
  152. if (Data[i] != C)
  153. return i;
  154. return npos;
  155. }
  156. /// find_first_not_of - Find the first character in the string that is not
  157. /// in the string \arg Chars, or npos if not found.
  158. ///
  159. /// Note: O(size() + Chars.size())
  160. StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
  161. size_t From) const {
  162. std::bitset<1 << CHAR_BIT> CharBits;
  163. for (size_type i = 0; i != Chars.size(); ++i)
  164. CharBits.set((unsigned char)Chars[i]);
  165. for (size_type i = min(From, Length), e = Length; i != e; ++i)
  166. if (!CharBits.test((unsigned char)Data[i]))
  167. return i;
  168. return npos;
  169. }
  170. //===----------------------------------------------------------------------===//
  171. // Helpful Algorithms
  172. //===----------------------------------------------------------------------===//
  173. /// count - Return the number of non-overlapped occurrences of \arg Str in
  174. /// the string.
  175. size_t StringRef::count(StringRef Str) const {
  176. size_t Count = 0;
  177. size_t N = Str.size();
  178. if (N > Length)
  179. return 0;
  180. for (size_t i = 0, e = Length - N + 1; i != e; ++i)
  181. if (substr(i, N).equals(Str))
  182. ++Count;
  183. return Count;
  184. }
  185. static unsigned GetAutoSenseRadix(StringRef &Str) {
  186. if (Str.startswith("0x")) {
  187. Str = Str.substr(2);
  188. return 16;
  189. } else if (Str.startswith("0b")) {
  190. Str = Str.substr(2);
  191. return 2;
  192. } else if (Str.startswith("0")) {
  193. return 8;
  194. } else {
  195. return 10;
  196. }
  197. }
  198. /// GetAsUnsignedInteger - Workhorse method that converts a integer character
  199. /// sequence of radix up to 36 to an unsigned long long value.
  200. static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix,
  201. unsigned long long &Result) {
  202. // Autosense radix if not specified.
  203. if (Radix == 0)
  204. Radix = GetAutoSenseRadix(Str);
  205. // Empty strings (after the radix autosense) are invalid.
  206. if (Str.empty()) return true;
  207. // Parse all the bytes of the string given this radix. Watch for overflow.
  208. Result = 0;
  209. while (!Str.empty()) {
  210. unsigned CharVal;
  211. if (Str[0] >= '0' && Str[0] <= '9')
  212. CharVal = Str[0]-'0';
  213. else if (Str[0] >= 'a' && Str[0] <= 'z')
  214. CharVal = Str[0]-'a'+10;
  215. else if (Str[0] >= 'A' && Str[0] <= 'Z')
  216. CharVal = Str[0]-'A'+10;
  217. else
  218. return true;
  219. // If the parsed value is larger than the integer radix, the string is
  220. // invalid.
  221. if (CharVal >= Radix)
  222. return true;
  223. // Add in this character.
  224. unsigned long long PrevResult = Result;
  225. Result = Result*Radix+CharVal;
  226. // Check for overflow.
  227. if (Result < PrevResult)
  228. return true;
  229. Str = Str.substr(1);
  230. }
  231. return false;
  232. }
  233. bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const {
  234. return GetAsUnsignedInteger(*this, Radix, Result);
  235. }
  236. bool StringRef::getAsInteger(unsigned Radix, long long &Result) const {
  237. unsigned long long ULLVal;
  238. // Handle positive strings first.
  239. if (empty() || front() != '-') {
  240. if (GetAsUnsignedInteger(*this, Radix, ULLVal) ||
  241. // Check for value so large it overflows a signed value.
  242. (long long)ULLVal < 0)
  243. return true;
  244. Result = ULLVal;
  245. return false;
  246. }
  247. // Get the positive part of the value.
  248. if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) ||
  249. // Reject values so large they'd overflow as negative signed, but allow
  250. // "-0". This negates the unsigned so that the negative isn't undefined
  251. // on signed overflow.
  252. (long long)-ULLVal > 0)
  253. return true;
  254. Result = -ULLVal;
  255. return false;
  256. }
  257. bool StringRef::getAsInteger(unsigned Radix, int &Result) const {
  258. long long Val;
  259. if (getAsInteger(Radix, Val) ||
  260. (int)Val != Val)
  261. return true;
  262. Result = Val;
  263. return false;
  264. }
  265. bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const {
  266. unsigned long long Val;
  267. if (getAsInteger(Radix, Val) ||
  268. (unsigned)Val != Val)
  269. return true;
  270. Result = Val;
  271. return false;
  272. }
  273. bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
  274. StringRef Str = *this;
  275. // Autosense radix if not specified.
  276. if (Radix == 0)
  277. Radix = GetAutoSenseRadix(Str);
  278. assert(Radix > 1 && Radix <= 36);
  279. // Empty strings (after the radix autosense) are invalid.
  280. if (Str.empty()) return true;
  281. // Skip leading zeroes. This can be a significant improvement if
  282. // it means we don't need > 64 bits.
  283. while (!Str.empty() && Str.front() == '0')
  284. Str = Str.substr(1);
  285. // If it was nothing but zeroes....
  286. if (Str.empty()) {
  287. Result = APInt(64, 0);
  288. return false;
  289. }
  290. // (Over-)estimate the required number of bits.
  291. unsigned Log2Radix = 0;
  292. while ((1U << Log2Radix) < Radix) Log2Radix++;
  293. bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
  294. unsigned BitWidth = Log2Radix * Str.size();
  295. if (BitWidth < Result.getBitWidth())
  296. BitWidth = Result.getBitWidth(); // don't shrink the result
  297. else
  298. Result.zext(BitWidth);
  299. APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
  300. if (!IsPowerOf2Radix) {
  301. // These must have the same bit-width as Result.
  302. RadixAP = APInt(BitWidth, Radix);
  303. CharAP = APInt(BitWidth, 0);
  304. }
  305. // Parse all the bytes of the string given this radix.
  306. Result = 0;
  307. while (!Str.empty()) {
  308. unsigned CharVal;
  309. if (Str[0] >= '0' && Str[0] <= '9')
  310. CharVal = Str[0]-'0';
  311. else if (Str[0] >= 'a' && Str[0] <= 'z')
  312. CharVal = Str[0]-'a'+10;
  313. else if (Str[0] >= 'A' && Str[0] <= 'Z')
  314. CharVal = Str[0]-'A'+10;
  315. else
  316. return true;
  317. // If the parsed value is larger than the integer radix, the string is
  318. // invalid.
  319. if (CharVal >= Radix)
  320. return true;
  321. // Add in this character.
  322. if (IsPowerOf2Radix) {
  323. Result <<= Log2Radix;
  324. Result |= CharVal;
  325. } else {
  326. Result *= RadixAP;
  327. CharAP = CharVal;
  328. Result += CharAP;
  329. }
  330. Str = Str.substr(1);
  331. }
  332. return false;
  333. }