/* softfloat-macros.h */
  1. /*
  2. * QEMU float support macros
  3. *
  4. * The code in this source file is derived from release 2a of the SoftFloat
  5. * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
  6. * some later contributions) are provided under that license, as detailed below.
  7. * It has subsequently been modified by contributors to the QEMU Project,
  8. * so some portions are provided under:
  9. * the SoftFloat-2a license
  10. * the BSD license
  11. *
  12. * Any future contributions to this file after December 1st 2014 will be
  13. * taken to be licensed under the Softfloat-2a license unless specifically
  14. * indicated otherwise.
  15. */
  16. /*
  17. ===============================================================================
  18. This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
  19. Arithmetic Package, Release 2a.
  20. Written by John R. Hauser. This work was made possible in part by the
  21. International Computer Science Institute, located at Suite 600, 1947 Center
  22. Street, Berkeley, California 94704. Funding was partially provided by the
  23. National Science Foundation under grant MIP-9311980. The original version
  24. of this code was written as part of a project to build a fixed-point vector
  25. processor in collaboration with the University of California at Berkeley,
  26. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  27. is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  28. arithmetic/SoftFloat.html'.
  29. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  30. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  31. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  32. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  33. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  34. Derivative works are acceptable, even for commercial purposes, so long as
  35. (1) they include prominent notice that the work is derivative, and (2) they
  36. include prominent notice akin to these four paragraphs for those parts of
  37. this code that are retained.
  38. ===============================================================================
  39. */
  40. /* BSD licensing:
  41. * Copyright (c) 2006, Fabrice Bellard
  42. * All rights reserved.
  43. *
  44. * Redistribution and use in source and binary forms, with or without
  45. * modification, are permitted provided that the following conditions are met:
  46. *
  47. * 1. Redistributions of source code must retain the above copyright notice,
  48. * this list of conditions and the following disclaimer.
  49. *
  50. * 2. Redistributions in binary form must reproduce the above copyright notice,
  51. * this list of conditions and the following disclaimer in the documentation
  52. * and/or other materials provided with the distribution.
  53. *
  54. * 3. Neither the name of the copyright holder nor the names of its contributors
  55. * may be used to endorse or promote products derived from this software without
  56. * specific prior written permission.
  57. *
  58. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  59. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  60. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  61. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  62. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  63. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  64. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  65. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  66. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  67. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  68. * THE POSSIBILITY OF SUCH DAMAGE.
  69. */
  70. #ifndef FPU_SOFTFLOAT_MACROS_H
  71. #define FPU_SOFTFLOAT_MACROS_H
  72. #include "fpu/softfloat-types.h"
  73. #include "qemu/host-utils.h"
  74. /**
  75. * shl_double: double-word merging left shift
  76. * @l: left or most-significant word
  77. * @r: right or least-significant word
  78. * @c: shift count
  79. *
  80. * Shift @l left by @c bits, shifting in bits from @r.
  81. */
static inline uint64_t shl_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* Single SHLD instruction; constraint "ci" allows the count either in
       %cl ("c") or as an immediate ("i"), as SHLD requires. */
    asm("shld %b2, %1, %0" : "+r"(l) : "r"(r), "ci"(c));
    return l;
#else
    /* Portable path: the c == 0 case is special-cased because shifting a
       64-bit value by 64 (the "r >> (64 - c)" term) is undefined in C. */
    return c ? (l << c) | (r >> (64 - c)) : l;
#endif
}
  91. /**
  92. * shr_double: double-word merging right shift
  93. * @l: left or most-significant word
  94. * @r: right or least-significant word
  95. * @c: shift count
  96. *
  97. * Shift @r right by @c bits, shifting in bits from @l.
  98. */
static inline uint64_t shr_double(uint64_t l, uint64_t r, int c)
{
#if defined(__x86_64__)
    /* Single SHRD instruction; constraint "ci" allows the count either in
       %cl ("c") or as an immediate ("i"), as SHRD requires. */
    asm("shrd %b2, %1, %0" : "+r"(r) : "r"(l), "ci"(c));
    return r;
#else
    /* Portable path: the c == 0 case is special-cased because shifting a
       64-bit value by 64 (the "l << (64 - c)" term) is undefined in C. */
    return c ? (r >> c) | (l << (64 - c)) : r;
#endif
}
  108. /*----------------------------------------------------------------------------
  109. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  110. | bits are shifted off, they are ``jammed'' into the least significant bit of
  111. | the result by setting the least significant bit to 1. The value of `count'
  112. | can be arbitrarily large; in particular, if `count' is greater than 32, the
  113. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  114. | The result is stored in the location pointed to by `zPtr'.
  115. *----------------------------------------------------------------------------*/
  116. static inline void shift32RightJamming(uint32_t a, int count, uint32_t *zPtr)
  117. {
  118. uint32_t z;
  119. if ( count == 0 ) {
  120. z = a;
  121. }
  122. else if ( count < 32 ) {
  123. z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 );
  124. }
  125. else {
  126. z = ( a != 0 );
  127. }
  128. *zPtr = z;
  129. }
  130. /*----------------------------------------------------------------------------
  131. | Shifts `a' right by the number of bits given in `count'. If any nonzero
  132. | bits are shifted off, they are ``jammed'' into the least significant bit of
  133. | the result by setting the least significant bit to 1. The value of `count'
  134. | can be arbitrarily large; in particular, if `count' is greater than 64, the
  135. | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
  136. | The result is stored in the location pointed to by `zPtr'.
  137. *----------------------------------------------------------------------------*/
  138. static inline void shift64RightJamming(uint64_t a, int count, uint64_t *zPtr)
  139. {
  140. uint64_t z;
  141. if ( count == 0 ) {
  142. z = a;
  143. }
  144. else if ( count < 64 ) {
  145. z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 );
  146. }
  147. else {
  148. z = ( a != 0 );
  149. }
  150. *zPtr = z;
  151. }
  152. /*----------------------------------------------------------------------------
  153. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64
  154. | _plus_ the number of bits given in `count'. The shifted result is at most
  155. | 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The
  156. | bits shifted off form a second 64-bit result as follows: The _last_ bit
  157. | shifted off is the most-significant bit of the extra result, and the other
  158. | 63 bits of the extra result are all zero if and only if _all_but_the_last_
  159. | bits shifted off were all zero. This extra result is stored in the location
  160. | pointed to by `z1Ptr'. The value of `count' can be arbitrarily large.
  161. | (This routine makes more sense if `a0' and `a1' are considered to form a
  162. | fixed-point value with binary point between `a0' and `a1'. This fixed-point
  163. | value is shifted right by the number of bits given in `count', and the
  164. | integer part of the result is returned at the location pointed to by
  165. | `z0Ptr'. The fractional part of the result may be slightly corrupted as
  166. | described above, and is returned at the location pointed to by `z1Ptr'.)
  167. *----------------------------------------------------------------------------*/
  168. static inline void
  169. shift64ExtraRightJamming(
  170. uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
  171. {
  172. uint64_t z0, z1;
  173. int8_t negCount = ( - count ) & 63;
  174. if ( count == 0 ) {
  175. z1 = a1;
  176. z0 = a0;
  177. }
  178. else if ( count < 64 ) {
  179. z1 = ( a0<<negCount ) | ( a1 != 0 );
  180. z0 = a0>>count;
  181. }
  182. else {
  183. if ( count == 64 ) {
  184. z1 = a0 | ( a1 != 0 );
  185. }
  186. else {
  187. z1 = ( ( a0 | a1 ) != 0 );
  188. }
  189. z0 = 0;
  190. }
  191. *z1Ptr = z1;
  192. *z0Ptr = z0;
  193. }
  194. /*----------------------------------------------------------------------------
  195. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  196. | number of bits given in `count'. Any bits shifted off are lost. The value
  197. | of `count' can be arbitrarily large; in particular, if `count' is greater
  198. | than 128, the result will be 0. The result is broken into two 64-bit pieces
  199. | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  200. *----------------------------------------------------------------------------*/
  201. static inline void
  202. shift128Right(
  203. uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
  204. {
  205. uint64_t z0, z1;
  206. int8_t negCount = ( - count ) & 63;
  207. if ( count == 0 ) {
  208. z1 = a1;
  209. z0 = a0;
  210. }
  211. else if ( count < 64 ) {
  212. z1 = ( a0<<negCount ) | ( a1>>count );
  213. z0 = a0>>count;
  214. }
  215. else {
  216. z1 = (count < 128) ? (a0 >> (count & 63)) : 0;
  217. z0 = 0;
  218. }
  219. *z1Ptr = z1;
  220. *z0Ptr = z0;
  221. }
  222. /*----------------------------------------------------------------------------
  223. | Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the
  224. | number of bits given in `count'. If any nonzero bits are shifted off, they
  225. | are ``jammed'' into the least significant bit of the result by setting the
  226. | least significant bit to 1. The value of `count' can be arbitrarily large;
  227. | in particular, if `count' is greater than 128, the result will be either
  228. | 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or
  229. | nonzero. The result is broken into two 64-bit pieces which are stored at
  230. | the locations pointed to by `z0Ptr' and `z1Ptr'.
  231. *----------------------------------------------------------------------------*/
  232. static inline void
  233. shift128RightJamming(
  234. uint64_t a0, uint64_t a1, int count, uint64_t *z0Ptr, uint64_t *z1Ptr)
  235. {
  236. uint64_t z0, z1;
  237. int8_t negCount = ( - count ) & 63;
  238. if ( count == 0 ) {
  239. z1 = a1;
  240. z0 = a0;
  241. }
  242. else if ( count < 64 ) {
  243. z1 = ( a0<<negCount ) | ( a1>>count ) | ( ( a1<<negCount ) != 0 );
  244. z0 = a0>>count;
  245. }
  246. else {
  247. if ( count == 64 ) {
  248. z1 = a0 | ( a1 != 0 );
  249. }
  250. else if ( count < 128 ) {
  251. z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<<negCount ) | a1 ) != 0 );
  252. }
  253. else {
  254. z1 = ( ( a0 | a1 ) != 0 );
  255. }
  256. z0 = 0;
  257. }
  258. *z1Ptr = z1;
  259. *z0Ptr = z0;
  260. }
  261. /*----------------------------------------------------------------------------
  262. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' right
  263. | by 64 _plus_ the number of bits given in `count'. The shifted result is
  264. | at most 128 nonzero bits; these are broken into two 64-bit pieces which are
  265. | stored at the locations pointed to by `z0Ptr' and `z1Ptr'. The bits shifted
  266. | off form a third 64-bit result as follows: The _last_ bit shifted off is
  267. | the most-significant bit of the extra result, and the other 63 bits of the
  268. | extra result are all zero if and only if _all_but_the_last_ bits shifted off
  269. | were all zero. This extra result is stored in the location pointed to by
  270. | `z2Ptr'. The value of `count' can be arbitrarily large.
  271. | (This routine makes more sense if `a0', `a1', and `a2' are considered
  272. | to form a fixed-point value with binary point between `a1' and `a2'. This
  273. | fixed-point value is shifted right by the number of bits given in `count',
  274. | and the integer part of the result is returned at the locations pointed to
  275. | by `z0Ptr' and `z1Ptr'. The fractional part of the result may be slightly
  276. | corrupted as described above, and is returned at the location pointed to by
  277. | `z2Ptr'.)
  278. *----------------------------------------------------------------------------*/
static inline void
shift128ExtraRightJamming(
    uint64_t a0,
    uint64_t a1,
    uint64_t a2,
    int count,
    uint64_t *z0Ptr,
    uint64_t *z1Ptr,
    uint64_t *z2Ptr
)
{
    uint64_t z0, z1, z2;
    int8_t negCount = ( - count ) & 63;

    if ( count == 0 ) {
        /* No shift: pass all three words through unchanged. */
        z2 = a2;
        z1 = a1;
        z0 = a0;
    }
    else {
        if ( count < 64 ) {
            /* Bits of a1 that cross the binary point seed the extra word. */
            z2 = a1<<negCount;
            z1 = ( a0<<negCount ) | ( a1>>count );
            z0 = a0>>count;
        }
        else {
            if ( count == 64 ) {
                /* Exact whole-word shift: a0 -> z1, a1 -> z2. */
                z2 = a1;
                z1 = a0;
            }
            else {
                /* count > 64: all of a1 is shifted out; fold it into the
                   sticky accumulator a2 before computing the result. */
                a2 |= a1;
                if ( count < 128 ) {
                    z2 = a0<<negCount;
                    z1 = a0>>( count & 63 );
                }
                else {
                    /* count >= 128: only a0 (or its sticky bit) remains. */
                    z2 = ( count == 128 ) ? a0 : ( a0 != 0 );
                    z1 = 0;
                }
            }
            z0 = 0;
        }
        /* Jam: any nonzero bits shifted past the extra word set its least
           significant bit so later rounding still sees them. */
        z2 |= ( a2 != 0 );
    }
    *z2Ptr = z2;
    *z1Ptr = z1;
    *z0Ptr = z0;
}
  327. /*----------------------------------------------------------------------------
  328. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  329. | number of bits given in `count'. Any bits shifted off are lost. The value
  330. | of `count' must be less than 64. The result is broken into two 64-bit
  331. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  332. *----------------------------------------------------------------------------*/
  333. static inline void shortShift128Left(uint64_t a0, uint64_t a1, int count,
  334. uint64_t *z0Ptr, uint64_t *z1Ptr)
  335. {
  336. *z1Ptr = a1 << count;
  337. *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
  338. }
  339. /*----------------------------------------------------------------------------
  340. | Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the
  341. | number of bits given in `count'. Any bits shifted off are lost. The value
  342. | of `count' may be greater than 64. The result is broken into two 64-bit
  343. | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  344. *----------------------------------------------------------------------------*/
  345. static inline void shift128Left(uint64_t a0, uint64_t a1, int count,
  346. uint64_t *z0Ptr, uint64_t *z1Ptr)
  347. {
  348. if (count < 64) {
  349. *z1Ptr = a1 << count;
  350. *z0Ptr = count == 0 ? a0 : (a0 << count) | (a1 >> (-count & 63));
  351. } else {
  352. *z1Ptr = 0;
  353. *z0Ptr = a1 << (count - 64);
  354. }
  355. }
  356. /*----------------------------------------------------------------------------
  357. | Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left
  358. | by the number of bits given in `count'. Any bits shifted off are lost.
  359. | The value of `count' must be less than 64. The result is broken into three
  360. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  361. | `z1Ptr', and `z2Ptr'.
  362. *----------------------------------------------------------------------------*/
  363. static inline void
  364. shortShift192Left(
  365. uint64_t a0,
  366. uint64_t a1,
  367. uint64_t a2,
  368. int count,
  369. uint64_t *z0Ptr,
  370. uint64_t *z1Ptr,
  371. uint64_t *z2Ptr
  372. )
  373. {
  374. uint64_t z0, z1, z2;
  375. int8_t negCount;
  376. z2 = a2<<count;
  377. z1 = a1<<count;
  378. z0 = a0<<count;
  379. if ( 0 < count ) {
  380. negCount = ( ( - count ) & 63 );
  381. z1 |= a2>>negCount;
  382. z0 |= a1>>negCount;
  383. }
  384. *z2Ptr = z2;
  385. *z1Ptr = z1;
  386. *z0Ptr = z0;
  387. }
  388. /*----------------------------------------------------------------------------
  389. | Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit
  390. | value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so
  391. | any carry out is lost. The result is broken into two 64-bit pieces which
  392. | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
  393. *----------------------------------------------------------------------------*/
static inline void add128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool c = 0;
    /* Add the low words first, then propagate the carry flag 'c' into the
       high-word addition (uadd64_carry is from qemu/host-utils.h). */
    *z1Ptr = uadd64_carry(a1, b1, &c);
    *z0Ptr = uadd64_carry(a0, b0, &c);
}
  401. /*----------------------------------------------------------------------------
  402. | Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the
  403. | 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is
  404. | modulo 2^192, so any carry out is lost. The result is broken into three
  405. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr',
  406. | `z1Ptr', and `z2Ptr'.
  407. *----------------------------------------------------------------------------*/
static inline void add192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool c = 0;
    /* Add from least to most significant word, threading the carry flag
       'c' through each step (uadd64_carry is from qemu/host-utils.h). */
    *z2Ptr = uadd64_carry(a2, b2, &c);
    *z1Ptr = uadd64_carry(a1, b1, &c);
    *z0Ptr = uadd64_carry(a0, b0, &c);
}
  417. /*----------------------------------------------------------------------------
  418. | Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the
  419. | 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo
  420. | 2^128, so any borrow out (carry out) is lost. The result is broken into two
  421. | 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and
  422. | `z1Ptr'.
  423. *----------------------------------------------------------------------------*/
static inline void sub128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                          uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    bool c = 0;
    /* Subtract the low words first, then propagate the borrow flag 'c'
       into the high words (usub64_borrow is from qemu/host-utils.h). */
    *z1Ptr = usub64_borrow(a1, b1, &c);
    *z0Ptr = usub64_borrow(a0, b0, &c);
}
  431. /*----------------------------------------------------------------------------
  432. | Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2'
  433. | from the 192-bit value formed by concatenating `a0', `a1', and `a2'.
  434. | Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The
  435. | result is broken into three 64-bit pieces which are stored at the locations
  436. | pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'.
  437. *----------------------------------------------------------------------------*/
static inline void sub192(uint64_t a0, uint64_t a1, uint64_t a2,
                          uint64_t b0, uint64_t b1, uint64_t b2,
                          uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
{
    bool c = 0;
    /* Subtract from least to most significant word, threading the borrow
       flag 'c' through each step (usub64_borrow is from host-utils.h). */
    *z2Ptr = usub64_borrow(a2, b2, &c);
    *z1Ptr = usub64_borrow(a1, b1, &c);
    *z0Ptr = usub64_borrow(a0, b0, &c);
}
  447. /*----------------------------------------------------------------------------
  448. | Multiplies `a' by `b' to obtain a 128-bit product. The product is broken
  449. | into two 64-bit pieces which are stored at the locations pointed to by
  450. | `z0Ptr' and `z1Ptr'.
  451. *----------------------------------------------------------------------------*/
static inline void
mul64To128(uint64_t a, uint64_t b, uint64_t *z0Ptr, uint64_t *z1Ptr)
{
    /* Pointer order is swapped relative to SoftFloat convention: z0Ptr
       receives the high half and z1Ptr the low half — this relies on
       mulu64's (low, high) pointer order; see qemu/host-utils.h. */
    mulu64(z1Ptr, z0Ptr, a, b);
}
  457. /*----------------------------------------------------------------------------
  458. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' by
  459. | `b' to obtain a 192-bit product. The product is broken into three 64-bit
  460. | pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and
  461. | `z2Ptr'.
  462. *----------------------------------------------------------------------------*/
  463. static inline void
  464. mul128By64To192(uint64_t a0, uint64_t a1, uint64_t b,
  465. uint64_t *z0Ptr, uint64_t *z1Ptr, uint64_t *z2Ptr)
  466. {
  467. uint64_t z0, z1, m1;
  468. mul64To128(a1, b, &m1, z2Ptr);
  469. mul64To128(a0, b, &z0, &z1);
  470. add128(z0, z1, 0, m1, z0Ptr, z1Ptr);
  471. }
  472. /*----------------------------------------------------------------------------
  473. | Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the
  474. | 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit
  475. | product. The product is broken into four 64-bit pieces which are stored at
  476. | the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'.
  477. *----------------------------------------------------------------------------*/
  478. static inline void mul128To256(uint64_t a0, uint64_t a1,
  479. uint64_t b0, uint64_t b1,
  480. uint64_t *z0Ptr, uint64_t *z1Ptr,
  481. uint64_t *z2Ptr, uint64_t *z3Ptr)
  482. {
  483. uint64_t z0, z1, z2;
  484. uint64_t m0, m1, m2, n1, n2;
  485. mul64To128(a1, b0, &m1, &m2);
  486. mul64To128(a0, b1, &n1, &n2);
  487. mul64To128(a1, b1, &z2, z3Ptr);
  488. mul64To128(a0, b0, &z0, &z1);
  489. add192( 0, m1, m2, 0, n1, n2, &m0, &m1, &m2);
  490. add192(m0, m1, m2, z0, z1, z2, z0Ptr, z1Ptr, z2Ptr);
  491. }
  492. /*----------------------------------------------------------------------------
  493. | Returns an approximation to the 64-bit integer quotient obtained by dividing
  494. | `b' into the 128-bit value formed by concatenating `a0' and `a1'. The
  495. | divisor `b' must be at least 2^63. If q is the exact quotient truncated
  496. | toward zero, the approximation returned lies between q and q + 2 inclusive.
  497. | If the exact quotient q is larger than 64 bits, the maximum positive 64-bit
  498. | unsigned integer is returned.
  499. *----------------------------------------------------------------------------*/
static inline uint64_t estimateDiv128To64(uint64_t a0, uint64_t a1, uint64_t b)
{
    uint64_t b0, b1;
    uint64_t rem0, rem1, term0, term1;
    uint64_t z;

    /* Exact quotient would not fit in 64 bits: saturate to all-ones. */
    if ( b <= a0 ) return UINT64_C(0xFFFFFFFFFFFFFFFF);
    b0 = b>>32;
    /* Estimate the upper 32 quotient bits from the top words, clamped so
       the estimate never exceeds 0xFFFFFFFF << 32. */
    z = ( b0<<32 <= a0 ) ? UINT64_C(0xFFFFFFFF00000000) : ( a0 / b0 )<<32;
    mul64To128( b, z, &term0, &term1 );
    sub128( a0, a1, term0, term1, &rem0, &rem1 );
    /* Correct any overestimate: step the quotient down by 2^32 and add
       b * 2^32 back until the partial remainder is non-negative. */
    while ( ( (int64_t) rem0 ) < 0 ) {
        z -= UINT64_C(0x100000000);
        b1 = b<<32;
        add128( rem0, rem1, b0, b1, &rem0, &rem1 );
    }
    /* Estimate the lower 32 quotient bits the same way, clamped likewise. */
    rem0 = ( rem0<<32 ) | ( rem1>>32 );
    z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0;
    return z;
}
  519. /*----------------------------------------------------------------------------
  520. | Returns an approximation to the square root of the 32-bit significand given
  521. | by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of
  522. | `aExp' (the least significant bit) is 1, the integer returned approximates
  523. | 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp'
  524. | is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either
  525. | case, the approximation returned lies strictly within +/-2 of the exact
  526. | value.
  527. *----------------------------------------------------------------------------*/
  528. static inline uint32_t estimateSqrt32(int aExp, uint32_t a)
  529. {
  530. static const uint16_t sqrtOddAdjustments[] = {
  531. 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0,
  532. 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67
  533. };
  534. static const uint16_t sqrtEvenAdjustments[] = {
  535. 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E,
  536. 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002
  537. };
  538. int8_t index;
  539. uint32_t z;
  540. index = ( a>>27 ) & 15;
  541. if ( aExp & 1 ) {
  542. z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ (int)index ];
  543. z = ( ( a / z )<<14 ) + ( z<<15 );
  544. a >>= 1;
  545. }
  546. else {
  547. z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ (int)index ];
  548. z = a / z + z;
  549. z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 );
  550. if ( z <= a ) return (uint32_t) ( ( (int32_t) a )>>1 );
  551. }
  552. return ( (uint32_t) ( ( ( (uint64_t) a )<<31 ) / z ) ) + ( z>>1 );
  553. }
  554. /*----------------------------------------------------------------------------
  555. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1'
  556. | is equal to the 128-bit value formed by concatenating `b0' and `b1'.
  557. | Otherwise, returns 0.
  558. *----------------------------------------------------------------------------*/
  559. static inline bool eq128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
  560. {
  561. return a0 == b0 && a1 == b1;
  562. }
  563. /*----------------------------------------------------------------------------
  564. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  565. | than or equal to the 128-bit value formed by concatenating `b0' and `b1'.
  566. | Otherwise, returns 0.
  567. *----------------------------------------------------------------------------*/
  568. static inline bool le128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
  569. {
  570. return a0 < b0 || (a0 == b0 && a1 <= b1);
  571. }
  572. /*----------------------------------------------------------------------------
  573. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less
  574. | than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise,
  575. | returns 0.
  576. *----------------------------------------------------------------------------*/
  577. static inline bool lt128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
  578. {
  579. return a0 < b0 || (a0 == b0 && a1 < b1);
  580. }
  581. /*----------------------------------------------------------------------------
  582. | Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is
  583. | not equal to the 128-bit value formed by concatenating `b0' and `b1'.
  584. | Otherwise, returns 0.
  585. *----------------------------------------------------------------------------*/
  586. static inline bool ne128(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1)
  587. {
  588. return a0 != b0 || a1 != b1;
  589. }
  590. /*
  591. * Similarly, comparisons of 192-bit values.
  592. */
  593. static inline bool eq192(uint64_t a0, uint64_t a1, uint64_t a2,
  594. uint64_t b0, uint64_t b1, uint64_t b2)
  595. {
  596. return ((a0 ^ b0) | (a1 ^ b1) | (a2 ^ b2)) == 0;
  597. }
  598. static inline bool le192(uint64_t a0, uint64_t a1, uint64_t a2,
  599. uint64_t b0, uint64_t b1, uint64_t b2)
  600. {
  601. if (a0 != b0) {
  602. return a0 < b0;
  603. }
  604. if (a1 != b1) {
  605. return a1 < b1;
  606. }
  607. return a2 <= b2;
  608. }
  609. static inline bool lt192(uint64_t a0, uint64_t a1, uint64_t a2,
  610. uint64_t b0, uint64_t b1, uint64_t b2)
  611. {
  612. if (a0 != b0) {
  613. return a0 < b0;
  614. }
  615. if (a1 != b1) {
  616. return a1 < b1;
  617. }
  618. return a2 < b2;
  619. }
  620. #endif