softfloat.c 277 KB


  1. /*
  2. * QEMU float support
  3. *
  4. * The code in this source file is derived from release 2a of the SoftFloat
  5. * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
  6. * some later contributions) are provided under that license, as detailed below.
  7. * It has subsequently been modified by contributors to the QEMU Project,
  8. * so some portions are provided under:
  9. * the SoftFloat-2a license
  10. * the BSD license
  11. * GPL-v2-or-later
  12. *
  13. * Any future contributions to this file after December 1st 2014 will be
  14. * taken to be licensed under the Softfloat-2a license unless specifically
  15. * indicated otherwise.
  16. */
  17. /*
  18. ===============================================================================
  19. This C source file is part of the SoftFloat IEC/IEEE Floating-point
  20. Arithmetic Package, Release 2a.
  21. Written by John R. Hauser. This work was made possible in part by the
  22. International Computer Science Institute, located at Suite 600, 1947 Center
  23. Street, Berkeley, California 94704. Funding was partially provided by the
  24. National Science Foundation under grant MIP-9311980. The original version
  25. of this code was written as part of a project to build a fixed-point vector
  26. processor in collaboration with the University of California at Berkeley,
  27. overseen by Profs. Nelson Morgan and John Wawrzynek. More information
  28. is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
  29. arithmetic/SoftFloat.html'.
  30. THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
  31. has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
  32. TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
  33. PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
  34. AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
  35. Derivative works are acceptable, even for commercial purposes, so long as
  36. (1) they include prominent notice that the work is derivative, and (2) they
  37. include prominent notice akin to these four paragraphs for those parts of
  38. this code that are retained.
  39. ===============================================================================
  40. */
  41. /* BSD licensing:
  42. * Copyright (c) 2006, Fabrice Bellard
  43. * All rights reserved.
  44. *
  45. * Redistribution and use in source and binary forms, with or without
  46. * modification, are permitted provided that the following conditions are met:
  47. *
  48. * 1. Redistributions of source code must retain the above copyright notice,
  49. * this list of conditions and the following disclaimer.
  50. *
  51. * 2. Redistributions in binary form must reproduce the above copyright notice,
  52. * this list of conditions and the following disclaimer in the documentation
  53. * and/or other materials provided with the distribution.
  54. *
  55. * 3. Neither the name of the copyright holder nor the names of its contributors
  56. * may be used to endorse or promote products derived from this software without
  57. * specific prior written permission.
  58. *
  59. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  60. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  61. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  62. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  63. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  64. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  65. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  66. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  67. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  68. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  69. * THE POSSIBILITY OF SUCH DAMAGE.
  70. */
  71. /* Portions of this work are licensed under the terms of the GNU GPL,
  72. * version 2 or later. See the COPYING file in the top-level directory.
  73. */
  74. /* softfloat (and in particular the code in softfloat-specialize.h) is
  75. * target-dependent and needs the TARGET_* macros.
  76. */
  77. #include "qemu/osdep.h"
  78. #include "fpu/softfloat.h"
  79. /* We only need stdlib for abort() */
  80. /*----------------------------------------------------------------------------
  81. | Primitive arithmetic functions, including multi-word arithmetic, and
  82. | division and square root approximations. (Can be specialized to target if
  83. | desired.)
  84. *----------------------------------------------------------------------------*/
  85. #include "softfloat-macros.h"
  86. /*----------------------------------------------------------------------------
  87. | Functions and definitions to determine: (1) whether tininess for underflow
  88. | is detected before or after rounding by default, (2) what (if anything)
  89. | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  90. | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  91. | are propagated from function inputs to output. These details are target-
  92. | specific.
  93. *----------------------------------------------------------------------------*/
  94. #include "softfloat-specialize.h"
  95. /*----------------------------------------------------------------------------
  96. | Returns the fraction bits of the half-precision floating-point value `a'.
  97. *----------------------------------------------------------------------------*/
  98. static inline uint32_t extractFloat16Frac(float16 a)
  99. {
  100. return float16_val(a) & 0x3ff;
  101. }
  102. /*----------------------------------------------------------------------------
  103. | Returns the exponent bits of the half-precision floating-point value `a'.
  104. *----------------------------------------------------------------------------*/
  105. static inline int extractFloat16Exp(float16 a)
  106. {
  107. return (float16_val(a) >> 10) & 0x1f;
  108. }
  109. /*----------------------------------------------------------------------------
  110. | Returns the sign bit of the half-precision floating-point value `a'.
  111. *----------------------------------------------------------------------------*/
  112. static inline flag extractFloat16Sign(float16 a)
  113. {
  114. return float16_val(a)>>15;
  115. }
  116. /*----------------------------------------------------------------------------
  117. | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  118. | and 7, and returns the properly rounded 32-bit integer corresponding to the
  119. | input. If `zSign' is 1, the input is negated before being converted to an
  120. | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
  121. | is simply rounded to an integer, with the inexact exception raised if the
  122. | input cannot be represented exactly as an integer. However, if the fixed-
  123. | point input is too large, the invalid exception is raised and the largest
  124. | positive or negative integer is returned.
  125. *----------------------------------------------------------------------------*/
  126. static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
  127. {
  128. int8_t roundingMode;
  129. flag roundNearestEven;
  130. int8_t roundIncrement, roundBits;
  131. int32_t z;
  132. roundingMode = status->float_rounding_mode;
  133. roundNearestEven = ( roundingMode == float_round_nearest_even );
  134. switch (roundingMode) {
  135. case float_round_nearest_even:
  136. case float_round_ties_away:
  137. roundIncrement = 0x40;
  138. break;
  139. case float_round_to_zero:
  140. roundIncrement = 0;
  141. break;
  142. case float_round_up:
  143. roundIncrement = zSign ? 0 : 0x7f;
  144. break;
  145. case float_round_down:
  146. roundIncrement = zSign ? 0x7f : 0;
  147. break;
  148. default:
  149. abort();
  150. }
  151. roundBits = absZ & 0x7F;
  152. absZ = ( absZ + roundIncrement )>>7;
  153. absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  154. z = absZ;
  155. if ( zSign ) z = - z;
  156. if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
  157. float_raise(float_flag_invalid, status);
  158. return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
  159. }
  160. if (roundBits) {
  161. status->float_exception_flags |= float_flag_inexact;
  162. }
  163. return z;
  164. }
  165. /*----------------------------------------------------------------------------
  166. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  167. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  168. | and returns the properly rounded 64-bit integer corresponding to the input.
  169. | If `zSign' is 1, the input is negated before being converted to an integer.
  170. | Ordinarily, the fixed-point input is simply rounded to an integer, with
  171. | the inexact exception raised if the input cannot be represented exactly as
  172. | an integer. However, if the fixed-point input is too large, the invalid
  173. | exception is raised and the largest positive or negative integer is
  174. | returned.
  175. *----------------------------------------------------------------------------*/
  176. static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
  177. float_status *status)
  178. {
  179. int8_t roundingMode;
  180. flag roundNearestEven, increment;
  181. int64_t z;
  182. roundingMode = status->float_rounding_mode;
  183. roundNearestEven = ( roundingMode == float_round_nearest_even );
  184. switch (roundingMode) {
  185. case float_round_nearest_even:
  186. case float_round_ties_away:
  187. increment = ((int64_t) absZ1 < 0);
  188. break;
  189. case float_round_to_zero:
  190. increment = 0;
  191. break;
  192. case float_round_up:
  193. increment = !zSign && absZ1;
  194. break;
  195. case float_round_down:
  196. increment = zSign && absZ1;
  197. break;
  198. default:
  199. abort();
  200. }
  201. if ( increment ) {
  202. ++absZ0;
  203. if ( absZ0 == 0 ) goto overflow;
  204. absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
  205. }
  206. z = absZ0;
  207. if ( zSign ) z = - z;
  208. if ( z && ( ( z < 0 ) ^ zSign ) ) {
  209. overflow:
  210. float_raise(float_flag_invalid, status);
  211. return
  212. zSign ? (int64_t) LIT64( 0x8000000000000000 )
  213. : LIT64( 0x7FFFFFFFFFFFFFFF );
  214. }
  215. if (absZ1) {
  216. status->float_exception_flags |= float_flag_inexact;
  217. }
  218. return z;
  219. }
  220. /*----------------------------------------------------------------------------
  221. | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
  222. | `absZ1', with binary point between bits 63 and 64 (between the input words),
  223. | and returns the properly rounded 64-bit unsigned integer corresponding to the
  224. | input. Ordinarily, the fixed-point input is simply rounded to an integer,
  225. | with the inexact exception raised if the input cannot be represented exactly
  226. | as an integer. However, if the fixed-point input is too large, the invalid
  227. | exception is raised and the largest unsigned integer is returned.
  228. *----------------------------------------------------------------------------*/
  229. static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
  230. uint64_t absZ1, float_status *status)
  231. {
  232. int8_t roundingMode;
  233. flag roundNearestEven, increment;
  234. roundingMode = status->float_rounding_mode;
  235. roundNearestEven = (roundingMode == float_round_nearest_even);
  236. switch (roundingMode) {
  237. case float_round_nearest_even:
  238. case float_round_ties_away:
  239. increment = ((int64_t)absZ1 < 0);
  240. break;
  241. case float_round_to_zero:
  242. increment = 0;
  243. break;
  244. case float_round_up:
  245. increment = !zSign && absZ1;
  246. break;
  247. case float_round_down:
  248. increment = zSign && absZ1;
  249. break;
  250. default:
  251. abort();
  252. }
  253. if (increment) {
  254. ++absZ0;
  255. if (absZ0 == 0) {
  256. float_raise(float_flag_invalid, status);
  257. return LIT64(0xFFFFFFFFFFFFFFFF);
  258. }
  259. absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
  260. }
  261. if (zSign && absZ0) {
  262. float_raise(float_flag_invalid, status);
  263. return 0;
  264. }
  265. if (absZ1) {
  266. status->float_exception_flags |= float_flag_inexact;
  267. }
  268. return absZ0;
  269. }
  270. /*----------------------------------------------------------------------------
  271. | Returns the fraction bits of the single-precision floating-point value `a'.
  272. *----------------------------------------------------------------------------*/
  273. static inline uint32_t extractFloat32Frac( float32 a )
  274. {
  275. return float32_val(a) & 0x007FFFFF;
  276. }
  277. /*----------------------------------------------------------------------------
  278. | Returns the exponent bits of the single-precision floating-point value `a'.
  279. *----------------------------------------------------------------------------*/
  280. static inline int extractFloat32Exp(float32 a)
  281. {
  282. return ( float32_val(a)>>23 ) & 0xFF;
  283. }
  284. /*----------------------------------------------------------------------------
  285. | Returns the sign bit of the single-precision floating-point value `a'.
  286. *----------------------------------------------------------------------------*/
  287. static inline flag extractFloat32Sign( float32 a )
  288. {
  289. return float32_val(a)>>31;
  290. }
  291. /*----------------------------------------------------------------------------
  292. | If `a' is denormal and we are in flush-to-zero mode then set the
  293. | input-denormal exception and return zero. Otherwise just return the value.
  294. *----------------------------------------------------------------------------*/
  295. float32 float32_squash_input_denormal(float32 a, float_status *status)
  296. {
  297. if (status->flush_inputs_to_zero) {
  298. if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
  299. float_raise(float_flag_input_denormal, status);
  300. return make_float32(float32_val(a) & 0x80000000);
  301. }
  302. }
  303. return a;
  304. }
  305. /*----------------------------------------------------------------------------
  306. | Normalizes the subnormal single-precision floating-point value represented
  307. | by the denormalized significand `aSig'. The normalized exponent and
  308. | significand are stored at the locations pointed to by `zExpPtr' and
  309. | `zSigPtr', respectively.
  310. *----------------------------------------------------------------------------*/
  static void
  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
  {
      /* Left shift that moves the leading 1 of `aSig' up to bit 23. */
      int8_t lshift = countLeadingZeros32(aSig) - 8;

      *zSigPtr = aSig << lshift;
      /* Each position shifted lowers the exponent by one, starting from 1. */
      *zExpPtr = 1 - lshift;
  }
  319. /*----------------------------------------------------------------------------
  320. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  321. | single-precision floating-point value, returning the result. After being
  322. | shifted into the proper positions, the three fields are simply added
  323. | together to form the result. This means that any integer portion of `zSig'
  324. | will be added into the exponent. Since a properly normalized significand
  325. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  326. | than the desired result exponent whenever `zSig' is a complete, normalized
  327. | significand.
  328. *----------------------------------------------------------------------------*/
  329. static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
  330. {
  331. return make_float32(
  332. ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
  333. }
  334. /*----------------------------------------------------------------------------
  335. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  336. | and significand `zSig', and returns the proper single-precision floating-
  337. | point value corresponding to the abstract input. Ordinarily, the abstract
  338. | value is simply rounded and packed into the single-precision format, with
  339. | the inexact exception raised if the abstract input cannot be represented
  340. | exactly. However, if the abstract value is too large, the overflow and
  341. | inexact exceptions are raised and an infinity or maximal finite value is
  342. | returned. If the abstract value is too small, the input value is rounded to
  343. | a subnormal number, and the underflow and inexact exceptions are raised if
  344. | the abstract input cannot be represented exactly as a subnormal single-
  345. | precision floating-point number.
  346. | The input significand `zSig' has its binary point between bits 30
  347. | and 29, which is 7 bits to the left of the usual location. This shifted
  348. | significand must be normalized or smaller. If `zSig' is not normalized,
  349. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  350. | and it must not require rounding. In the usual case that `zSig' is
  351. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  352. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  353. | Binary Floating-Point Arithmetic.
  354. *----------------------------------------------------------------------------*/
  355. static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
  356. float_status *status)
  357. {
  358. int8_t roundingMode;
  359. flag roundNearestEven;
  360. int8_t roundIncrement, roundBits;
  361. flag isTiny;
  362. roundingMode = status->float_rounding_mode;
  363. roundNearestEven = ( roundingMode == float_round_nearest_even );
  364. switch (roundingMode) {
  365. case float_round_nearest_even:
  366. case float_round_ties_away:
  367. roundIncrement = 0x40;
  368. break;
  369. case float_round_to_zero:
  370. roundIncrement = 0;
  371. break;
  372. case float_round_up:
  373. roundIncrement = zSign ? 0 : 0x7f;
  374. break;
  375. case float_round_down:
  376. roundIncrement = zSign ? 0x7f : 0;
  377. break;
  378. default:
  379. abort();
  380. break;
  381. }
  382. roundBits = zSig & 0x7F;
  383. if ( 0xFD <= (uint16_t) zExp ) {
  384. if ( ( 0xFD < zExp )
  385. || ( ( zExp == 0xFD )
  386. && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
  387. ) {
  388. float_raise(float_flag_overflow | float_flag_inexact, status);
  389. return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
  390. }
  391. if ( zExp < 0 ) {
  392. if (status->flush_to_zero) {
  393. float_raise(float_flag_output_denormal, status);
  394. return packFloat32(zSign, 0, 0);
  395. }
  396. isTiny =
  397. (status->float_detect_tininess
  398. == float_tininess_before_rounding)
  399. || ( zExp < -1 )
  400. || ( zSig + roundIncrement < 0x80000000 );
  401. shift32RightJamming( zSig, - zExp, &zSig );
  402. zExp = 0;
  403. roundBits = zSig & 0x7F;
  404. if (isTiny && roundBits) {
  405. float_raise(float_flag_underflow, status);
  406. }
  407. }
  408. }
  409. if (roundBits) {
  410. status->float_exception_flags |= float_flag_inexact;
  411. }
  412. zSig = ( zSig + roundIncrement )>>7;
  413. zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
  414. if ( zSig == 0 ) zExp = 0;
  415. return packFloat32( zSign, zExp, zSig );
  416. }
  417. /*----------------------------------------------------------------------------
  418. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  419. | and significand `zSig', and returns the proper single-precision floating-
  420. | point value corresponding to the abstract input. This routine is just like
  421. | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
  422. | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  423. | floating-point exponent.
  424. *----------------------------------------------------------------------------*/
  425. static float32
  426. normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
  427. float_status *status)
  428. {
  429. int8_t shiftCount;
  430. shiftCount = countLeadingZeros32( zSig ) - 1;
  431. return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
  432. status);
  433. }
  434. /*----------------------------------------------------------------------------
  435. | Returns the fraction bits of the double-precision floating-point value `a'.
  436. *----------------------------------------------------------------------------*/
  437. static inline uint64_t extractFloat64Frac( float64 a )
  438. {
  439. return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
  440. }
  441. /*----------------------------------------------------------------------------
  442. | Returns the exponent bits of the double-precision floating-point value `a'.
  443. *----------------------------------------------------------------------------*/
  444. static inline int extractFloat64Exp(float64 a)
  445. {
  446. return ( float64_val(a)>>52 ) & 0x7FF;
  447. }
  448. /*----------------------------------------------------------------------------
  449. | Returns the sign bit of the double-precision floating-point value `a'.
  450. *----------------------------------------------------------------------------*/
  451. static inline flag extractFloat64Sign( float64 a )
  452. {
  453. return float64_val(a)>>63;
  454. }
  455. /*----------------------------------------------------------------------------
  456. | If `a' is denormal and we are in flush-to-zero mode then set the
  457. | input-denormal exception and return zero. Otherwise just return the value.
  458. *----------------------------------------------------------------------------*/
  459. float64 float64_squash_input_denormal(float64 a, float_status *status)
  460. {
  461. if (status->flush_inputs_to_zero) {
  462. if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
  463. float_raise(float_flag_input_denormal, status);
  464. return make_float64(float64_val(a) & (1ULL << 63));
  465. }
  466. }
  467. return a;
  468. }
  469. /*----------------------------------------------------------------------------
  470. | Normalizes the subnormal double-precision floating-point value represented
  471. | by the denormalized significand `aSig'. The normalized exponent and
  472. | significand are stored at the locations pointed to by `zExpPtr' and
  473. | `zSigPtr', respectively.
  474. *----------------------------------------------------------------------------*/
  static void
  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
  {
      /* Left shift that moves the leading 1 of `aSig' up to bit 52. */
      int8_t lshift = countLeadingZeros64(aSig) - 11;

      *zSigPtr = aSig << lshift;
      /* Each position shifted lowers the exponent by one, starting from 1. */
      *zExpPtr = 1 - lshift;
  }
  483. /*----------------------------------------------------------------------------
  484. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  485. | double-precision floating-point value, returning the result. After being
  486. | shifted into the proper positions, the three fields are simply added
  487. | together to form the result. This means that any integer portion of `zSig'
  488. | will be added into the exponent. Since a properly normalized significand
  489. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  490. | than the desired result exponent whenever `zSig' is a complete, normalized
  491. | significand.
  492. *----------------------------------------------------------------------------*/
  493. static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
  494. {
  495. return make_float64(
  496. ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
  497. }
  498. /*----------------------------------------------------------------------------
  499. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  500. | and significand `zSig', and returns the proper double-precision floating-
  501. | point value corresponding to the abstract input. Ordinarily, the abstract
  502. | value is simply rounded and packed into the double-precision format, with
  503. | the inexact exception raised if the abstract input cannot be represented
  504. | exactly. However, if the abstract value is too large, the overflow and
  505. | inexact exceptions are raised and an infinity or maximal finite value is
  506. | returned. If the abstract value is too small, the input value is rounded to
  507. | a subnormal number, and the underflow and inexact exceptions are raised if
  508. | the abstract input cannot be represented exactly as a subnormal double-
  509. | precision floating-point number.
  510. | The input significand `zSig' has its binary point between bits 62
  511. | and 61, which is 10 bits to the left of the usual location. This shifted
  512. | significand must be normalized or smaller. If `zSig' is not normalized,
  513. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  514. | and it must not require rounding. In the usual case that `zSig' is
  515. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  516. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  517. | Binary Floating-Point Arithmetic.
  518. *----------------------------------------------------------------------------*/
  519. static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
  520. float_status *status)
  521. {
  522. int8_t roundingMode;
  523. flag roundNearestEven;
  524. int roundIncrement, roundBits;
  525. flag isTiny;
  526. roundingMode = status->float_rounding_mode;
  527. roundNearestEven = ( roundingMode == float_round_nearest_even );
  528. switch (roundingMode) {
  529. case float_round_nearest_even:
  530. case float_round_ties_away:
  531. roundIncrement = 0x200;
  532. break;
  533. case float_round_to_zero:
  534. roundIncrement = 0;
  535. break;
  536. case float_round_up:
  537. roundIncrement = zSign ? 0 : 0x3ff;
  538. break;
  539. case float_round_down:
  540. roundIncrement = zSign ? 0x3ff : 0;
  541. break;
  542. case float_round_to_odd:
  543. roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
  544. break;
  545. default:
  546. abort();
  547. }
  548. roundBits = zSig & 0x3FF;
  549. if ( 0x7FD <= (uint16_t) zExp ) {
  550. if ( ( 0x7FD < zExp )
  551. || ( ( zExp == 0x7FD )
  552. && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
  553. ) {
  554. bool overflow_to_inf = roundingMode != float_round_to_odd &&
  555. roundIncrement != 0;
  556. float_raise(float_flag_overflow | float_flag_inexact, status);
  557. return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
  558. }
  559. if ( zExp < 0 ) {
  560. if (status->flush_to_zero) {
  561. float_raise(float_flag_output_denormal, status);
  562. return packFloat64(zSign, 0, 0);
  563. }
  564. isTiny =
  565. (status->float_detect_tininess
  566. == float_tininess_before_rounding)
  567. || ( zExp < -1 )
  568. || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
  569. shift64RightJamming( zSig, - zExp, &zSig );
  570. zExp = 0;
  571. roundBits = zSig & 0x3FF;
  572. if (isTiny && roundBits) {
  573. float_raise(float_flag_underflow, status);
  574. }
  575. if (roundingMode == float_round_to_odd) {
  576. /*
  577. * For round-to-odd case, the roundIncrement depends on
  578. * zSig which just changed.
  579. */
  580. roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
  581. }
  582. }
  583. }
  584. if (roundBits) {
  585. status->float_exception_flags |= float_flag_inexact;
  586. }
  587. zSig = ( zSig + roundIncrement )>>10;
  588. zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
  589. if ( zSig == 0 ) zExp = 0;
  590. return packFloat64( zSign, zExp, zSig );
  591. }
  592. /*----------------------------------------------------------------------------
  593. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  594. | and significand `zSig', and returns the proper double-precision floating-
  595. | point value corresponding to the abstract input. This routine is just like
  596. | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
  597. | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
  598. | floating-point exponent.
  599. *----------------------------------------------------------------------------*/
  600. static float64
  601. normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
  602. float_status *status)
  603. {
  604. int8_t shiftCount;
  605. shiftCount = countLeadingZeros64( zSig ) - 1;
  606. return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
  607. status);
  608. }
  609. /*----------------------------------------------------------------------------
  610. | Returns the fraction bits of the extended double-precision floating-point
  611. | value `a'.
  612. *----------------------------------------------------------------------------*/
  613. static inline uint64_t extractFloatx80Frac( floatx80 a )
  614. {
  615. return a.low;
  616. }
  617. /*----------------------------------------------------------------------------
  618. | Returns the exponent bits of the extended double-precision floating-point
  619. | value `a'.
  620. *----------------------------------------------------------------------------*/
  621. static inline int32_t extractFloatx80Exp( floatx80 a )
  622. {
  623. return a.high & 0x7FFF;
  624. }
  625. /*----------------------------------------------------------------------------
  626. | Returns the sign bit of the extended double-precision floating-point value
  627. | `a'.
  628. *----------------------------------------------------------------------------*/
  629. static inline flag extractFloatx80Sign( floatx80 a )
  630. {
  631. return a.high>>15;
  632. }
  633. /*----------------------------------------------------------------------------
  634. | Normalizes the subnormal extended double-precision floating-point value
  635. | represented by the denormalized significand `aSig'. The normalized exponent
  636. | and significand are stored at the locations pointed to by `zExpPtr' and
  637. | `zSigPtr', respectively.
  638. *----------------------------------------------------------------------------*/
  static void
  normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
  {
      /* Left shift that moves the leading 1 of `aSig' up to bit 63. */
      int8_t lshift = countLeadingZeros64(aSig);

      *zSigPtr = aSig << lshift;
      /* Each position shifted lowers the exponent by one, starting from 1. */
      *zExpPtr = 1 - lshift;
  }
  647. /*----------------------------------------------------------------------------
  648. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
  649. | extended double-precision floating-point value, returning the result.
  650. *----------------------------------------------------------------------------*/
  651. static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
  652. {
  653. floatx80 z;
  654. z.low = zSig;
  655. z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
  656. return z;
  657. }
  658. /*----------------------------------------------------------------------------
  659. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  660. | and extended significand formed by the concatenation of `zSig0' and `zSig1',
  661. | and returns the proper extended double-precision floating-point value
  662. | corresponding to the abstract input. Ordinarily, the abstract value is
  663. | rounded and packed into the extended double-precision format, with the
  664. | inexact exception raised if the abstract input cannot be represented
  665. | exactly. However, if the abstract value is too large, the overflow and
  666. | inexact exceptions are raised and an infinity or maximal finite value is
  667. | returned. If the abstract value is too small, the input value is rounded to
  668. | a subnormal number, and the underflow and inexact exceptions are raised if
  669. | the abstract input cannot be represented exactly as a subnormal extended
  670. | double-precision floating-point number.
  671. | If `roundingPrecision' is 32 or 64, the result is rounded to the same
  672. | number of bits as single or double precision, respectively. Otherwise, the
  673. | result is rounded to the full precision of the extended double-precision
  674. | format.
  675. | The input significand must be normalized or smaller. If the input
  676. | significand is not normalized, `zExp' must be 0; in that case, the result
  677. | returned is a subnormal number, and it must not require rounding. The
  678. | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
  679. | Floating-Point Arithmetic.
  680. *----------------------------------------------------------------------------*/
  681. static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
  682. int32_t zExp, uint64_t zSig0, uint64_t zSig1,
  683. float_status *status)
  684. {
  685. int8_t roundingMode;
  686. flag roundNearestEven, increment, isTiny;
  687. int64_t roundIncrement, roundMask, roundBits;
  688. roundingMode = status->float_rounding_mode;
  689. roundNearestEven = ( roundingMode == float_round_nearest_even );
  690. if ( roundingPrecision == 80 ) goto precision80;
  691. if ( roundingPrecision == 64 ) {
  692. roundIncrement = LIT64( 0x0000000000000400 );
  693. roundMask = LIT64( 0x00000000000007FF );
  694. }
  695. else if ( roundingPrecision == 32 ) {
  696. roundIncrement = LIT64( 0x0000008000000000 );
  697. roundMask = LIT64( 0x000000FFFFFFFFFF );
  698. }
  699. else {
  700. goto precision80;
  701. }
  702. zSig0 |= ( zSig1 != 0 );
  703. switch (roundingMode) {
  704. case float_round_nearest_even:
  705. case float_round_ties_away:
  706. break;
  707. case float_round_to_zero:
  708. roundIncrement = 0;
  709. break;
  710. case float_round_up:
  711. roundIncrement = zSign ? 0 : roundMask;
  712. break;
  713. case float_round_down:
  714. roundIncrement = zSign ? roundMask : 0;
  715. break;
  716. default:
  717. abort();
  718. }
  719. roundBits = zSig0 & roundMask;
  720. if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
  721. if ( ( 0x7FFE < zExp )
  722. || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
  723. ) {
  724. goto overflow;
  725. }
  726. if ( zExp <= 0 ) {
  727. if (status->flush_to_zero) {
  728. float_raise(float_flag_output_denormal, status);
  729. return packFloatx80(zSign, 0, 0);
  730. }
  731. isTiny =
  732. (status->float_detect_tininess
  733. == float_tininess_before_rounding)
  734. || ( zExp < 0 )
  735. || ( zSig0 <= zSig0 + roundIncrement );
  736. shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
  737. zExp = 0;
  738. roundBits = zSig0 & roundMask;
  739. if (isTiny && roundBits) {
  740. float_raise(float_flag_underflow, status);
  741. }
  742. if (roundBits) {
  743. status->float_exception_flags |= float_flag_inexact;
  744. }
  745. zSig0 += roundIncrement;
  746. if ( (int64_t) zSig0 < 0 ) zExp = 1;
  747. roundIncrement = roundMask + 1;
  748. if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  749. roundMask |= roundIncrement;
  750. }
  751. zSig0 &= ~ roundMask;
  752. return packFloatx80( zSign, zExp, zSig0 );
  753. }
  754. }
  755. if (roundBits) {
  756. status->float_exception_flags |= float_flag_inexact;
  757. }
  758. zSig0 += roundIncrement;
  759. if ( zSig0 < roundIncrement ) {
  760. ++zExp;
  761. zSig0 = LIT64( 0x8000000000000000 );
  762. }
  763. roundIncrement = roundMask + 1;
  764. if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
  765. roundMask |= roundIncrement;
  766. }
  767. zSig0 &= ~ roundMask;
  768. if ( zSig0 == 0 ) zExp = 0;
  769. return packFloatx80( zSign, zExp, zSig0 );
  770. precision80:
  771. switch (roundingMode) {
  772. case float_round_nearest_even:
  773. case float_round_ties_away:
  774. increment = ((int64_t)zSig1 < 0);
  775. break;
  776. case float_round_to_zero:
  777. increment = 0;
  778. break;
  779. case float_round_up:
  780. increment = !zSign && zSig1;
  781. break;
  782. case float_round_down:
  783. increment = zSign && zSig1;
  784. break;
  785. default:
  786. abort();
  787. }
  788. if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
  789. if ( ( 0x7FFE < zExp )
  790. || ( ( zExp == 0x7FFE )
  791. && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
  792. && increment
  793. )
  794. ) {
  795. roundMask = 0;
  796. overflow:
  797. float_raise(float_flag_overflow | float_flag_inexact, status);
  798. if ( ( roundingMode == float_round_to_zero )
  799. || ( zSign && ( roundingMode == float_round_up ) )
  800. || ( ! zSign && ( roundingMode == float_round_down ) )
  801. ) {
  802. return packFloatx80( zSign, 0x7FFE, ~ roundMask );
  803. }
  804. return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  805. }
  806. if ( zExp <= 0 ) {
  807. isTiny =
  808. (status->float_detect_tininess
  809. == float_tininess_before_rounding)
  810. || ( zExp < 0 )
  811. || ! increment
  812. || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
  813. shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
  814. zExp = 0;
  815. if (isTiny && zSig1) {
  816. float_raise(float_flag_underflow, status);
  817. }
  818. if (zSig1) {
  819. status->float_exception_flags |= float_flag_inexact;
  820. }
  821. switch (roundingMode) {
  822. case float_round_nearest_even:
  823. case float_round_ties_away:
  824. increment = ((int64_t)zSig1 < 0);
  825. break;
  826. case float_round_to_zero:
  827. increment = 0;
  828. break;
  829. case float_round_up:
  830. increment = !zSign && zSig1;
  831. break;
  832. case float_round_down:
  833. increment = zSign && zSig1;
  834. break;
  835. default:
  836. abort();
  837. }
  838. if ( increment ) {
  839. ++zSig0;
  840. zSig0 &=
  841. ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  842. if ( (int64_t) zSig0 < 0 ) zExp = 1;
  843. }
  844. return packFloatx80( zSign, zExp, zSig0 );
  845. }
  846. }
  847. if (zSig1) {
  848. status->float_exception_flags |= float_flag_inexact;
  849. }
  850. if ( increment ) {
  851. ++zSig0;
  852. if ( zSig0 == 0 ) {
  853. ++zExp;
  854. zSig0 = LIT64( 0x8000000000000000 );
  855. }
  856. else {
  857. zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
  858. }
  859. }
  860. else {
  861. if ( zSig0 == 0 ) zExp = 0;
  862. }
  863. return packFloatx80( zSign, zExp, zSig0 );
  864. }
  865. /*----------------------------------------------------------------------------
  866. | Takes an abstract floating-point value having sign `zSign', exponent
  867. | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
  868. | and returns the proper extended double-precision floating-point value
  869. | corresponding to the abstract input. This routine is just like
  870. | `roundAndPackFloatx80' except that the input significand does not have to be
  871. | normalized.
  872. *----------------------------------------------------------------------------*/
  873. static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
  874. flag zSign, int32_t zExp,
  875. uint64_t zSig0, uint64_t zSig1,
  876. float_status *status)
  877. {
  878. int8_t shiftCount;
  879. if ( zSig0 == 0 ) {
  880. zSig0 = zSig1;
  881. zSig1 = 0;
  882. zExp -= 64;
  883. }
  884. shiftCount = countLeadingZeros64( zSig0 );
  885. shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  886. zExp -= shiftCount;
  887. return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
  888. zSig0, zSig1, status);
  889. }
  890. /*----------------------------------------------------------------------------
  891. | Returns the least-significant 64 fraction bits of the quadruple-precision
  892. | floating-point value `a'.
  893. *----------------------------------------------------------------------------*/
  894. static inline uint64_t extractFloat128Frac1( float128 a )
  895. {
  896. return a.low;
  897. }
  898. /*----------------------------------------------------------------------------
  899. | Returns the most-significant 48 fraction bits of the quadruple-precision
  900. | floating-point value `a'.
  901. *----------------------------------------------------------------------------*/
  902. static inline uint64_t extractFloat128Frac0( float128 a )
  903. {
  904. return a.high & LIT64( 0x0000FFFFFFFFFFFF );
  905. }
  906. /*----------------------------------------------------------------------------
  907. | Returns the exponent bits of the quadruple-precision floating-point value
  908. | `a'.
  909. *----------------------------------------------------------------------------*/
  910. static inline int32_t extractFloat128Exp( float128 a )
  911. {
  912. return ( a.high>>48 ) & 0x7FFF;
  913. }
  914. /*----------------------------------------------------------------------------
  915. | Returns the sign bit of the quadruple-precision floating-point value `a'.
  916. *----------------------------------------------------------------------------*/
  917. static inline flag extractFloat128Sign( float128 a )
  918. {
  919. return a.high>>63;
  920. }
  /*----------------------------------------------------------------------------
  | Normalizes the subnormal quadruple-precision floating-point value whose
  | denormalized significand is the concatenation of `aSig0' and `aSig1'.
  | The normalized exponent is stored through `zExpPtr'.  The most significant
  | 49 bits of the normalized significand are stored through `zSig0Ptr' and the
  | least significant 64 bits through `zSig1Ptr'.
  *----------------------------------------------------------------------------*/
  static void
   normalizeFloat128Subnormal(
       uint64_t aSig0,
       uint64_t aSig1,
       int32_t *zExpPtr,
       uint64_t *zSig0Ptr,
       uint64_t *zSig1Ptr
   )
  {
      int8_t shift;

      if ( aSig0 != 0 ) {
          /* Leading one is in the high word: an ordinary 128-bit left shift
             brings it to bit 48. */
          shift = countLeadingZeros64( aSig0 ) - 15;
          shortShift128Left( aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr );
          *zExpPtr = 1 - shift;
          return;
      }

      /* Leading one is in the low word; `shift' is measured relative to
         bit 48 of the high word and may be negative. */
      shift = countLeadingZeros64( aSig1 ) - 15;
      if ( shift < 0 ) {
          /* The leading one sits above bit 48: split aSig1 over both words. */
          *zSig0Ptr = aSig1 >> ( - shift );
          *zSig1Ptr = aSig1 << ( shift & 63 );
      } else {
          *zSig0Ptr = aSig1 << shift;
          *zSig1Ptr = 0;
      }
      *zExpPtr = - shift - 63;
  }
  958. /*----------------------------------------------------------------------------
  959. | Packs the sign `zSign', the exponent `zExp', and the significand formed
  960. | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
  961. | floating-point value, returning the result. After being shifted into the
  962. | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
  963. | added together to form the most significant 32 bits of the result. This
  964. | means that any integer portion of `zSig0' will be added into the exponent.
  965. | Since a properly normalized significand will have an integer portion equal
  966. | to 1, the `zExp' input should be 1 less than the desired result exponent
  967. | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
  968. | significand.
  969. *----------------------------------------------------------------------------*/
  970. static inline float128
  971. packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
  972. {
  973. float128 z;
  974. z.low = zSig1;
  975. z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
  976. return z;
  977. }
  978. /*----------------------------------------------------------------------------
  979. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  980. | and extended significand formed by the concatenation of `zSig0', `zSig1',
  981. | and `zSig2', and returns the proper quadruple-precision floating-point value
  982. | corresponding to the abstract input. Ordinarily, the abstract value is
  983. | simply rounded and packed into the quadruple-precision format, with the
  984. | inexact exception raised if the abstract input cannot be represented
  985. | exactly. However, if the abstract value is too large, the overflow and
  986. | inexact exceptions are raised and an infinity or maximal finite value is
  987. | returned. If the abstract value is too small, the input value is rounded to
  988. | a subnormal number, and the underflow and inexact exceptions are raised if
  989. | the abstract input cannot be represented exactly as a subnormal quadruple-
  990. | precision floating-point number.
  991. | The input significand must be normalized or smaller. If the input
  992. | significand is not normalized, `zExp' must be 0; in that case, the result
  993. | returned is a subnormal number, and it must not require rounding. In the
  994. | usual case that the input significand is normalized, `zExp' must be 1 less
  995. | than the ``true'' floating-point exponent. The handling of underflow and
  996. | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  997. *----------------------------------------------------------------------------*/
  998. static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
  999. uint64_t zSig0, uint64_t zSig1,
  1000. uint64_t zSig2, float_status *status)
  1001. {
  1002. int8_t roundingMode;
  1003. flag roundNearestEven, increment, isTiny;
  1004. roundingMode = status->float_rounding_mode;
  1005. roundNearestEven = ( roundingMode == float_round_nearest_even );
  1006. switch (roundingMode) {
  1007. case float_round_nearest_even:
  1008. case float_round_ties_away:
  1009. increment = ((int64_t)zSig2 < 0);
  1010. break;
  1011. case float_round_to_zero:
  1012. increment = 0;
  1013. break;
  1014. case float_round_up:
  1015. increment = !zSign && zSig2;
  1016. break;
  1017. case float_round_down:
  1018. increment = zSign && zSig2;
  1019. break;
  1020. case float_round_to_odd:
  1021. increment = !(zSig1 & 0x1) && zSig2;
  1022. break;
  1023. default:
  1024. abort();
  1025. }
  1026. if ( 0x7FFD <= (uint32_t) zExp ) {
  1027. if ( ( 0x7FFD < zExp )
  1028. || ( ( zExp == 0x7FFD )
  1029. && eq128(
  1030. LIT64( 0x0001FFFFFFFFFFFF ),
  1031. LIT64( 0xFFFFFFFFFFFFFFFF ),
  1032. zSig0,
  1033. zSig1
  1034. )
  1035. && increment
  1036. )
  1037. ) {
  1038. float_raise(float_flag_overflow | float_flag_inexact, status);
  1039. if ( ( roundingMode == float_round_to_zero )
  1040. || ( zSign && ( roundingMode == float_round_up ) )
  1041. || ( ! zSign && ( roundingMode == float_round_down ) )
  1042. || (roundingMode == float_round_to_odd)
  1043. ) {
  1044. return
  1045. packFloat128(
  1046. zSign,
  1047. 0x7FFE,
  1048. LIT64( 0x0000FFFFFFFFFFFF ),
  1049. LIT64( 0xFFFFFFFFFFFFFFFF )
  1050. );
  1051. }
  1052. return packFloat128( zSign, 0x7FFF, 0, 0 );
  1053. }
  1054. if ( zExp < 0 ) {
  1055. if (status->flush_to_zero) {
  1056. float_raise(float_flag_output_denormal, status);
  1057. return packFloat128(zSign, 0, 0, 0);
  1058. }
  1059. isTiny =
  1060. (status->float_detect_tininess
  1061. == float_tininess_before_rounding)
  1062. || ( zExp < -1 )
  1063. || ! increment
  1064. || lt128(
  1065. zSig0,
  1066. zSig1,
  1067. LIT64( 0x0001FFFFFFFFFFFF ),
  1068. LIT64( 0xFFFFFFFFFFFFFFFF )
  1069. );
  1070. shift128ExtraRightJamming(
  1071. zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
  1072. zExp = 0;
  1073. if (isTiny && zSig2) {
  1074. float_raise(float_flag_underflow, status);
  1075. }
  1076. switch (roundingMode) {
  1077. case float_round_nearest_even:
  1078. case float_round_ties_away:
  1079. increment = ((int64_t)zSig2 < 0);
  1080. break;
  1081. case float_round_to_zero:
  1082. increment = 0;
  1083. break;
  1084. case float_round_up:
  1085. increment = !zSign && zSig2;
  1086. break;
  1087. case float_round_down:
  1088. increment = zSign && zSig2;
  1089. break;
  1090. case float_round_to_odd:
  1091. increment = !(zSig1 & 0x1) && zSig2;
  1092. break;
  1093. default:
  1094. abort();
  1095. }
  1096. }
  1097. }
  1098. if (zSig2) {
  1099. status->float_exception_flags |= float_flag_inexact;
  1100. }
  1101. if ( increment ) {
  1102. add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
  1103. zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
  1104. }
  1105. else {
  1106. if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
  1107. }
  1108. return packFloat128( zSign, zExp, zSig0, zSig1 );
  1109. }
  1110. /*----------------------------------------------------------------------------
  1111. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  1112. | and significand formed by the concatenation of `zSig0' and `zSig1', and
  1113. | returns the proper quadruple-precision floating-point value corresponding
  1114. | to the abstract input. This routine is just like `roundAndPackFloat128'
  1115. | except that the input significand has fewer bits and does not have to be
  1116. | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
  1117. | point exponent.
  1118. *----------------------------------------------------------------------------*/
  1119. static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
  1120. uint64_t zSig0, uint64_t zSig1,
  1121. float_status *status)
  1122. {
  1123. int8_t shiftCount;
  1124. uint64_t zSig2;
  1125. if ( zSig0 == 0 ) {
  1126. zSig0 = zSig1;
  1127. zSig1 = 0;
  1128. zExp -= 64;
  1129. }
  1130. shiftCount = countLeadingZeros64( zSig0 ) - 15;
  1131. if ( 0 <= shiftCount ) {
  1132. zSig2 = 0;
  1133. shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  1134. }
  1135. else {
  1136. shift128ExtraRightJamming(
  1137. zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
  1138. }
  1139. zExp -= shiftCount;
  1140. return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
  1141. }
  1142. /*----------------------------------------------------------------------------
  1143. | Returns the result of converting the 32-bit two's complement integer `a'
  1144. | to the single-precision floating-point format. The conversion is performed
  1145. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1146. *----------------------------------------------------------------------------*/
  1147. float32 int32_to_float32(int32_t a, float_status *status)
  1148. {
  1149. flag zSign;
  1150. if ( a == 0 ) return float32_zero;
  1151. if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
  1152. zSign = ( a < 0 );
  1153. return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
  1154. }
  1155. /*----------------------------------------------------------------------------
  1156. | Returns the result of converting the 32-bit two's complement integer `a'
  1157. | to the double-precision floating-point format. The conversion is performed
  1158. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1159. *----------------------------------------------------------------------------*/
  1160. float64 int32_to_float64(int32_t a, float_status *status)
  1161. {
  1162. flag zSign;
  1163. uint32_t absA;
  1164. int8_t shiftCount;
  1165. uint64_t zSig;
  1166. if ( a == 0 ) return float64_zero;
  1167. zSign = ( a < 0 );
  1168. absA = zSign ? - a : a;
  1169. shiftCount = countLeadingZeros32( absA ) + 21;
  1170. zSig = absA;
  1171. return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
  1172. }
  1173. /*----------------------------------------------------------------------------
  1174. | Returns the result of converting the 32-bit two's complement integer `a'
  1175. | to the extended double-precision floating-point format. The conversion
  1176. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  1177. | Arithmetic.
  1178. *----------------------------------------------------------------------------*/
  1179. floatx80 int32_to_floatx80(int32_t a, float_status *status)
  1180. {
  1181. flag zSign;
  1182. uint32_t absA;
  1183. int8_t shiftCount;
  1184. uint64_t zSig;
  1185. if ( a == 0 ) return packFloatx80( 0, 0, 0 );
  1186. zSign = ( a < 0 );
  1187. absA = zSign ? - a : a;
  1188. shiftCount = countLeadingZeros32( absA ) + 32;
  1189. zSig = absA;
  1190. return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
  1191. }
  1192. /*----------------------------------------------------------------------------
  1193. | Returns the result of converting the 32-bit two's complement integer `a' to
  1194. | the quadruple-precision floating-point format. The conversion is performed
  1195. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1196. *----------------------------------------------------------------------------*/
  1197. float128 int32_to_float128(int32_t a, float_status *status)
  1198. {
  1199. flag zSign;
  1200. uint32_t absA;
  1201. int8_t shiftCount;
  1202. uint64_t zSig0;
  1203. if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
  1204. zSign = ( a < 0 );
  1205. absA = zSign ? - a : a;
  1206. shiftCount = countLeadingZeros32( absA ) + 17;
  1207. zSig0 = absA;
  1208. return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
  1209. }
  1210. /*----------------------------------------------------------------------------
  1211. | Returns the result of converting the 64-bit two's complement integer `a'
  1212. | to the single-precision floating-point format. The conversion is performed
  1213. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1214. *----------------------------------------------------------------------------*/
  1215. float32 int64_to_float32(int64_t a, float_status *status)
  1216. {
  1217. flag zSign;
  1218. uint64_t absA;
  1219. int8_t shiftCount;
  1220. if ( a == 0 ) return float32_zero;
  1221. zSign = ( a < 0 );
  1222. absA = zSign ? - a : a;
  1223. shiftCount = countLeadingZeros64( absA ) - 40;
  1224. if ( 0 <= shiftCount ) {
  1225. return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
  1226. }
  1227. else {
  1228. shiftCount += 7;
  1229. if ( shiftCount < 0 ) {
  1230. shift64RightJamming( absA, - shiftCount, &absA );
  1231. }
  1232. else {
  1233. absA <<= shiftCount;
  1234. }
  1235. return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
  1236. }
  1237. }
  1238. /*----------------------------------------------------------------------------
  1239. | Returns the result of converting the 64-bit two's complement integer `a'
  1240. | to the double-precision floating-point format. The conversion is performed
  1241. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1242. *----------------------------------------------------------------------------*/
  1243. float64 int64_to_float64(int64_t a, float_status *status)
  1244. {
  1245. flag zSign;
  1246. if ( a == 0 ) return float64_zero;
  1247. if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
  1248. return packFloat64( 1, 0x43E, 0 );
  1249. }
  1250. zSign = ( a < 0 );
  1251. return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
  1252. }
  1253. /*----------------------------------------------------------------------------
  1254. | Returns the result of converting the 64-bit two's complement integer `a'
  1255. | to the extended double-precision floating-point format. The conversion
  1256. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  1257. | Arithmetic.
  1258. *----------------------------------------------------------------------------*/
  1259. floatx80 int64_to_floatx80(int64_t a, float_status *status)
  1260. {
  1261. flag zSign;
  1262. uint64_t absA;
  1263. int8_t shiftCount;
  1264. if ( a == 0 ) return packFloatx80( 0, 0, 0 );
  1265. zSign = ( a < 0 );
  1266. absA = zSign ? - a : a;
  1267. shiftCount = countLeadingZeros64( absA );
  1268. return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
  1269. }
  1270. /*----------------------------------------------------------------------------
  1271. | Returns the result of converting the 64-bit two's complement integer `a' to
  1272. | the quadruple-precision floating-point format. The conversion is performed
  1273. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1274. *----------------------------------------------------------------------------*/
  1275. float128 int64_to_float128(int64_t a, float_status *status)
  1276. {
  1277. flag zSign;
  1278. uint64_t absA;
  1279. int8_t shiftCount;
  1280. int32_t zExp;
  1281. uint64_t zSig0, zSig1;
  1282. if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
  1283. zSign = ( a < 0 );
  1284. absA = zSign ? - a : a;
  1285. shiftCount = countLeadingZeros64( absA ) + 49;
  1286. zExp = 0x406E - shiftCount;
  1287. if ( 64 <= shiftCount ) {
  1288. zSig1 = 0;
  1289. zSig0 = absA;
  1290. shiftCount -= 64;
  1291. }
  1292. else {
  1293. zSig1 = absA;
  1294. zSig0 = 0;
  1295. }
  1296. shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
  1297. return packFloat128( zSign, zExp, zSig0, zSig1 );
  1298. }
  1299. /*----------------------------------------------------------------------------
  1300. | Returns the result of converting the 64-bit unsigned integer `a'
  1301. | to the single-precision floating-point format. The conversion is performed
  1302. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1303. *----------------------------------------------------------------------------*/
  1304. float32 uint64_to_float32(uint64_t a, float_status *status)
  1305. {
  1306. int shiftcount;
  1307. if (a == 0) {
  1308. return float32_zero;
  1309. }
  1310. /* Determine (left) shift needed to put first set bit into bit posn 23
  1311. * (since packFloat32() expects the binary point between bits 23 and 22);
  1312. * this is the fast case for smallish numbers.
  1313. */
  1314. shiftcount = countLeadingZeros64(a) - 40;
  1315. if (shiftcount >= 0) {
  1316. return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
  1317. }
  1318. /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
  1319. * expects the binary point between bits 30 and 29, hence the + 7.
  1320. */
  1321. shiftcount += 7;
  1322. if (shiftcount < 0) {
  1323. shift64RightJamming(a, -shiftcount, &a);
  1324. } else {
  1325. a <<= shiftcount;
  1326. }
  1327. return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
  1328. }
  1329. /*----------------------------------------------------------------------------
  1330. | Returns the result of converting the 64-bit unsigned integer `a'
  1331. | to the double-precision floating-point format. The conversion is performed
  1332. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1333. *----------------------------------------------------------------------------*/
  1334. float64 uint64_to_float64(uint64_t a, float_status *status)
  1335. {
  1336. int exp = 0x43C;
  1337. int shiftcount;
  1338. if (a == 0) {
  1339. return float64_zero;
  1340. }
  1341. shiftcount = countLeadingZeros64(a) - 1;
  1342. if (shiftcount < 0) {
  1343. shift64RightJamming(a, -shiftcount, &a);
  1344. } else {
  1345. a <<= shiftcount;
  1346. }
  1347. return roundAndPackFloat64(0, exp - shiftcount, a, status);
  1348. }
  1349. /*----------------------------------------------------------------------------
  1350. | Returns the result of converting the 64-bit unsigned integer `a'
  1351. | to the quadruple-precision floating-point format. The conversion is performed
  1352. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  1353. *----------------------------------------------------------------------------*/
  1354. float128 uint64_to_float128(uint64_t a, float_status *status)
  1355. {
  1356. if (a == 0) {
  1357. return float128_zero;
  1358. }
  1359. return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
  1360. }
  1361. /*----------------------------------------------------------------------------
  1362. | Returns the result of converting the single-precision floating-point value
  1363. | `a' to the 32-bit two's complement integer format. The conversion is
  1364. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1365. | Arithmetic---which means in particular that the conversion is rounded
  1366. | according to the current rounding mode. If `a' is a NaN, the largest
  1367. | positive integer is returned. Otherwise, if the conversion overflows, the
  1368. | largest integer with the same sign as `a' is returned.
  1369. *----------------------------------------------------------------------------*/
  1370. int32_t float32_to_int32(float32 a, float_status *status)
  1371. {
  1372. flag aSign;
  1373. int aExp;
  1374. int shiftCount;
  1375. uint32_t aSig;
  1376. uint64_t aSig64;
  1377. a = float32_squash_input_denormal(a, status);
  1378. aSig = extractFloat32Frac( a );
  1379. aExp = extractFloat32Exp( a );
  1380. aSign = extractFloat32Sign( a );
  1381. if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
  1382. if ( aExp ) aSig |= 0x00800000;
  1383. shiftCount = 0xAF - aExp;
  1384. aSig64 = aSig;
  1385. aSig64 <<= 32;
  1386. if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
  1387. return roundAndPackInt32(aSign, aSig64, status);
  1388. }
  1389. /*----------------------------------------------------------------------------
  1390. | Returns the result of converting the single-precision floating-point value
  1391. | `a' to the 32-bit two's complement integer format. The conversion is
  1392. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1393. | Arithmetic, except that the conversion is always rounded toward zero.
  1394. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  1395. | the conversion overflows, the largest integer with the same sign as `a' is
  1396. | returned.
  1397. *----------------------------------------------------------------------------*/
  1398. int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
  1399. {
  1400. flag aSign;
  1401. int aExp;
  1402. int shiftCount;
  1403. uint32_t aSig;
  1404. int32_t z;
  1405. a = float32_squash_input_denormal(a, status);
  1406. aSig = extractFloat32Frac( a );
  1407. aExp = extractFloat32Exp( a );
  1408. aSign = extractFloat32Sign( a );
  1409. shiftCount = aExp - 0x9E;
  1410. if ( 0 <= shiftCount ) {
  1411. if ( float32_val(a) != 0xCF000000 ) {
  1412. float_raise(float_flag_invalid, status);
  1413. if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
  1414. }
  1415. return (int32_t) 0x80000000;
  1416. }
  1417. else if ( aExp <= 0x7E ) {
  1418. if (aExp | aSig) {
  1419. status->float_exception_flags |= float_flag_inexact;
  1420. }
  1421. return 0;
  1422. }
  1423. aSig = ( aSig | 0x00800000 )<<8;
  1424. z = aSig>>( - shiftCount );
  1425. if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
  1426. status->float_exception_flags |= float_flag_inexact;
  1427. }
  1428. if ( aSign ) z = - z;
  1429. return z;
  1430. }
  1431. /*----------------------------------------------------------------------------
  1432. | Returns the result of converting the single-precision floating-point value
  1433. | `a' to the 16-bit two's complement integer format. The conversion is
  1434. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1435. | Arithmetic, except that the conversion is always rounded toward zero.
  1436. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  1437. | the conversion overflows, the largest integer with the same sign as `a' is
  1438. | returned.
  1439. *----------------------------------------------------------------------------*/
  1440. int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
  1441. {
  1442. flag aSign;
  1443. int aExp;
  1444. int shiftCount;
  1445. uint32_t aSig;
  1446. int32_t z;
  1447. aSig = extractFloat32Frac( a );
  1448. aExp = extractFloat32Exp( a );
  1449. aSign = extractFloat32Sign( a );
  1450. shiftCount = aExp - 0x8E;
  1451. if ( 0 <= shiftCount ) {
  1452. if ( float32_val(a) != 0xC7000000 ) {
  1453. float_raise(float_flag_invalid, status);
  1454. if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
  1455. return 0x7FFF;
  1456. }
  1457. }
  1458. return (int32_t) 0xffff8000;
  1459. }
  1460. else if ( aExp <= 0x7E ) {
  1461. if ( aExp | aSig ) {
  1462. status->float_exception_flags |= float_flag_inexact;
  1463. }
  1464. return 0;
  1465. }
  1466. shiftCount -= 0x10;
  1467. aSig = ( aSig | 0x00800000 )<<8;
  1468. z = aSig>>( - shiftCount );
  1469. if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
  1470. status->float_exception_flags |= float_flag_inexact;
  1471. }
  1472. if ( aSign ) {
  1473. z = - z;
  1474. }
  1475. return z;
  1476. }
  1477. /*----------------------------------------------------------------------------
  1478. | Returns the result of converting the single-precision floating-point value
  1479. | `a' to the 64-bit two's complement integer format. The conversion is
  1480. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1481. | Arithmetic---which means in particular that the conversion is rounded
  1482. | according to the current rounding mode. If `a' is a NaN, the largest
  1483. | positive integer is returned. Otherwise, if the conversion overflows, the
  1484. | largest integer with the same sign as `a' is returned.
  1485. *----------------------------------------------------------------------------*/
  1486. int64_t float32_to_int64(float32 a, float_status *status)
  1487. {
  1488. flag aSign;
  1489. int aExp;
  1490. int shiftCount;
  1491. uint32_t aSig;
  1492. uint64_t aSig64, aSigExtra;
  1493. a = float32_squash_input_denormal(a, status);
  1494. aSig = extractFloat32Frac( a );
  1495. aExp = extractFloat32Exp( a );
  1496. aSign = extractFloat32Sign( a );
  1497. shiftCount = 0xBE - aExp;
  1498. if ( shiftCount < 0 ) {
  1499. float_raise(float_flag_invalid, status);
  1500. if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
  1501. return LIT64( 0x7FFFFFFFFFFFFFFF );
  1502. }
  1503. return (int64_t) LIT64( 0x8000000000000000 );
  1504. }
  1505. if ( aExp ) aSig |= 0x00800000;
  1506. aSig64 = aSig;
  1507. aSig64 <<= 40;
  1508. shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
  1509. return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
  1510. }
  1511. /*----------------------------------------------------------------------------
  1512. | Returns the result of converting the single-precision floating-point value
  1513. | `a' to the 64-bit unsigned integer format. The conversion is
  1514. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1515. | Arithmetic---which means in particular that the conversion is rounded
  1516. | according to the current rounding mode. If `a' is a NaN, the largest
  1517. | unsigned integer is returned. Otherwise, if the conversion overflows, the
  1518. | largest unsigned integer is returned. If the 'a' is negative, the result
  1519. | is rounded and zero is returned; values that do not round to zero will
  1520. | raise the inexact exception flag.
  1521. *----------------------------------------------------------------------------*/
  1522. uint64_t float32_to_uint64(float32 a, float_status *status)
  1523. {
  1524. flag aSign;
  1525. int aExp;
  1526. int shiftCount;
  1527. uint32_t aSig;
  1528. uint64_t aSig64, aSigExtra;
  1529. a = float32_squash_input_denormal(a, status);
  1530. aSig = extractFloat32Frac(a);
  1531. aExp = extractFloat32Exp(a);
  1532. aSign = extractFloat32Sign(a);
  1533. if ((aSign) && (aExp > 126)) {
  1534. float_raise(float_flag_invalid, status);
  1535. if (float32_is_any_nan(a)) {
  1536. return LIT64(0xFFFFFFFFFFFFFFFF);
  1537. } else {
  1538. return 0;
  1539. }
  1540. }
  1541. shiftCount = 0xBE - aExp;
  1542. if (aExp) {
  1543. aSig |= 0x00800000;
  1544. }
  1545. if (shiftCount < 0) {
  1546. float_raise(float_flag_invalid, status);
  1547. return LIT64(0xFFFFFFFFFFFFFFFF);
  1548. }
  1549. aSig64 = aSig;
  1550. aSig64 <<= 40;
  1551. shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
  1552. return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
  1553. }
  1554. /*----------------------------------------------------------------------------
  1555. | Returns the result of converting the single-precision floating-point value
  1556. | `a' to the 64-bit unsigned integer format. The conversion is
  1557. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1558. | Arithmetic, except that the conversion is always rounded toward zero. If
  1559. | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
  1560. | conversion overflows, the largest unsigned integer is returned. If the
  1561. | 'a' is negative, the result is rounded and zero is returned; values that do
  1562. | not round to zero will raise the inexact flag.
  1563. *----------------------------------------------------------------------------*/
  1564. uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
  1565. {
  1566. signed char current_rounding_mode = status->float_rounding_mode;
  1567. set_float_rounding_mode(float_round_to_zero, status);
  1568. int64_t v = float32_to_uint64(a, status);
  1569. set_float_rounding_mode(current_rounding_mode, status);
  1570. return v;
  1571. }
  1572. /*----------------------------------------------------------------------------
  1573. | Returns the result of converting the single-precision floating-point value
  1574. | `a' to the 64-bit two's complement integer format. The conversion is
  1575. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1576. | Arithmetic, except that the conversion is always rounded toward zero. If
  1577. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  1578. | conversion overflows, the largest integer with the same sign as `a' is
  1579. | returned.
  1580. *----------------------------------------------------------------------------*/
  1581. int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
  1582. {
  1583. flag aSign;
  1584. int aExp;
  1585. int shiftCount;
  1586. uint32_t aSig;
  1587. uint64_t aSig64;
  1588. int64_t z;
  1589. a = float32_squash_input_denormal(a, status);
  1590. aSig = extractFloat32Frac( a );
  1591. aExp = extractFloat32Exp( a );
  1592. aSign = extractFloat32Sign( a );
  1593. shiftCount = aExp - 0xBE;
  1594. if ( 0 <= shiftCount ) {
  1595. if ( float32_val(a) != 0xDF000000 ) {
  1596. float_raise(float_flag_invalid, status);
  1597. if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
  1598. return LIT64( 0x7FFFFFFFFFFFFFFF );
  1599. }
  1600. }
  1601. return (int64_t) LIT64( 0x8000000000000000 );
  1602. }
  1603. else if ( aExp <= 0x7E ) {
  1604. if (aExp | aSig) {
  1605. status->float_exception_flags |= float_flag_inexact;
  1606. }
  1607. return 0;
  1608. }
  1609. aSig64 = aSig | 0x00800000;
  1610. aSig64 <<= 40;
  1611. z = aSig64>>( - shiftCount );
  1612. if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
  1613. status->float_exception_flags |= float_flag_inexact;
  1614. }
  1615. if ( aSign ) z = - z;
  1616. return z;
  1617. }
  1618. /*----------------------------------------------------------------------------
  1619. | Returns the result of converting the single-precision floating-point value
  1620. | `a' to the double-precision floating-point format. The conversion is
  1621. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1622. | Arithmetic.
  1623. *----------------------------------------------------------------------------*/
  1624. float64 float32_to_float64(float32 a, float_status *status)
  1625. {
  1626. flag aSign;
  1627. int aExp;
  1628. uint32_t aSig;
  1629. a = float32_squash_input_denormal(a, status);
  1630. aSig = extractFloat32Frac( a );
  1631. aExp = extractFloat32Exp( a );
  1632. aSign = extractFloat32Sign( a );
  1633. if ( aExp == 0xFF ) {
  1634. if (aSig) {
  1635. return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
  1636. }
  1637. return packFloat64( aSign, 0x7FF, 0 );
  1638. }
  1639. if ( aExp == 0 ) {
  1640. if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
  1641. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1642. --aExp;
  1643. }
  1644. return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
  1645. }
  1646. /*----------------------------------------------------------------------------
  1647. | Returns the result of converting the single-precision floating-point value
  1648. | `a' to the extended double-precision floating-point format. The conversion
  1649. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  1650. | Arithmetic.
  1651. *----------------------------------------------------------------------------*/
  1652. floatx80 float32_to_floatx80(float32 a, float_status *status)
  1653. {
  1654. flag aSign;
  1655. int aExp;
  1656. uint32_t aSig;
  1657. a = float32_squash_input_denormal(a, status);
  1658. aSig = extractFloat32Frac( a );
  1659. aExp = extractFloat32Exp( a );
  1660. aSign = extractFloat32Sign( a );
  1661. if ( aExp == 0xFF ) {
  1662. if (aSig) {
  1663. return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
  1664. }
  1665. return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  1666. }
  1667. if ( aExp == 0 ) {
  1668. if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
  1669. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1670. }
  1671. aSig |= 0x00800000;
  1672. return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
  1673. }
  1674. /*----------------------------------------------------------------------------
  1675. | Returns the result of converting the single-precision floating-point value
  1676. | `a' to the quadruple-precision floating-point format. The conversion is
  1677. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  1678. | Arithmetic.
  1679. *----------------------------------------------------------------------------*/
  1680. float128 float32_to_float128(float32 a, float_status *status)
  1681. {
  1682. flag aSign;
  1683. int aExp;
  1684. uint32_t aSig;
  1685. a = float32_squash_input_denormal(a, status);
  1686. aSig = extractFloat32Frac( a );
  1687. aExp = extractFloat32Exp( a );
  1688. aSign = extractFloat32Sign( a );
  1689. if ( aExp == 0xFF ) {
  1690. if (aSig) {
  1691. return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
  1692. }
  1693. return packFloat128( aSign, 0x7FFF, 0, 0 );
  1694. }
  1695. if ( aExp == 0 ) {
  1696. if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
  1697. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  1698. --aExp;
  1699. }
  1700. return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
  1701. }
  1702. /*----------------------------------------------------------------------------
  1703. | Rounds the single-precision floating-point value `a' to an integer, and
  1704. | returns the result as a single-precision floating-point value. The
  1705. | operation is performed according to the IEC/IEEE Standard for Binary
  1706. | Floating-Point Arithmetic.
  1707. *----------------------------------------------------------------------------*/
  1708. float32 float32_round_to_int(float32 a, float_status *status)
  1709. {
  1710. flag aSign;
  1711. int aExp;
  1712. uint32_t lastBitMask, roundBitsMask;
  1713. uint32_t z;
  1714. a = float32_squash_input_denormal(a, status);
  1715. aExp = extractFloat32Exp( a );
  1716. if ( 0x96 <= aExp ) {
  1717. if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
  1718. return propagateFloat32NaN(a, a, status);
  1719. }
  1720. return a;
  1721. }
  1722. if ( aExp <= 0x7E ) {
  1723. if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
  1724. status->float_exception_flags |= float_flag_inexact;
  1725. aSign = extractFloat32Sign( a );
  1726. switch (status->float_rounding_mode) {
  1727. case float_round_nearest_even:
  1728. if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
  1729. return packFloat32( aSign, 0x7F, 0 );
  1730. }
  1731. break;
  1732. case float_round_ties_away:
  1733. if (aExp == 0x7E) {
  1734. return packFloat32(aSign, 0x7F, 0);
  1735. }
  1736. break;
  1737. case float_round_down:
  1738. return make_float32(aSign ? 0xBF800000 : 0);
  1739. case float_round_up:
  1740. return make_float32(aSign ? 0x80000000 : 0x3F800000);
  1741. }
  1742. return packFloat32( aSign, 0, 0 );
  1743. }
  1744. lastBitMask = 1;
  1745. lastBitMask <<= 0x96 - aExp;
  1746. roundBitsMask = lastBitMask - 1;
  1747. z = float32_val(a);
  1748. switch (status->float_rounding_mode) {
  1749. case float_round_nearest_even:
  1750. z += lastBitMask>>1;
  1751. if ((z & roundBitsMask) == 0) {
  1752. z &= ~lastBitMask;
  1753. }
  1754. break;
  1755. case float_round_ties_away:
  1756. z += lastBitMask >> 1;
  1757. break;
  1758. case float_round_to_zero:
  1759. break;
  1760. case float_round_up:
  1761. if (!extractFloat32Sign(make_float32(z))) {
  1762. z += roundBitsMask;
  1763. }
  1764. break;
  1765. case float_round_down:
  1766. if (extractFloat32Sign(make_float32(z))) {
  1767. z += roundBitsMask;
  1768. }
  1769. break;
  1770. default:
  1771. abort();
  1772. }
  1773. z &= ~ roundBitsMask;
  1774. if (z != float32_val(a)) {
  1775. status->float_exception_flags |= float_flag_inexact;
  1776. }
  1777. return make_float32(z);
  1778. }
  1779. /*----------------------------------------------------------------------------
  1780. | Returns the result of adding the absolute values of the single-precision
  1781. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  1782. | before being returned. `zSign' is ignored if the result is a NaN.
  1783. | The addition is performed according to the IEC/IEEE Standard for Binary
  1784. | Floating-Point Arithmetic.
  1785. *----------------------------------------------------------------------------*/
  1786. static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
  1787. float_status *status)
  1788. {
  1789. int aExp, bExp, zExp;
  1790. uint32_t aSig, bSig, zSig;
  1791. int expDiff;
  1792. aSig = extractFloat32Frac( a );
  1793. aExp = extractFloat32Exp( a );
  1794. bSig = extractFloat32Frac( b );
  1795. bExp = extractFloat32Exp( b );
  1796. expDiff = aExp - bExp;
  1797. aSig <<= 6;
  1798. bSig <<= 6;
  1799. if ( 0 < expDiff ) {
  1800. if ( aExp == 0xFF ) {
  1801. if (aSig) {
  1802. return propagateFloat32NaN(a, b, status);
  1803. }
  1804. return a;
  1805. }
  1806. if ( bExp == 0 ) {
  1807. --expDiff;
  1808. }
  1809. else {
  1810. bSig |= 0x20000000;
  1811. }
  1812. shift32RightJamming( bSig, expDiff, &bSig );
  1813. zExp = aExp;
  1814. }
  1815. else if ( expDiff < 0 ) {
  1816. if ( bExp == 0xFF ) {
  1817. if (bSig) {
  1818. return propagateFloat32NaN(a, b, status);
  1819. }
  1820. return packFloat32( zSign, 0xFF, 0 );
  1821. }
  1822. if ( aExp == 0 ) {
  1823. ++expDiff;
  1824. }
  1825. else {
  1826. aSig |= 0x20000000;
  1827. }
  1828. shift32RightJamming( aSig, - expDiff, &aSig );
  1829. zExp = bExp;
  1830. }
  1831. else {
  1832. if ( aExp == 0xFF ) {
  1833. if (aSig | bSig) {
  1834. return propagateFloat32NaN(a, b, status);
  1835. }
  1836. return a;
  1837. }
  1838. if ( aExp == 0 ) {
  1839. if (status->flush_to_zero) {
  1840. if (aSig | bSig) {
  1841. float_raise(float_flag_output_denormal, status);
  1842. }
  1843. return packFloat32(zSign, 0, 0);
  1844. }
  1845. return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
  1846. }
  1847. zSig = 0x40000000 + aSig + bSig;
  1848. zExp = aExp;
  1849. goto roundAndPack;
  1850. }
  1851. aSig |= 0x20000000;
  1852. zSig = ( aSig + bSig )<<1;
  1853. --zExp;
  1854. if ( (int32_t) zSig < 0 ) {
  1855. zSig = aSig + bSig;
  1856. ++zExp;
  1857. }
  1858. roundAndPack:
  1859. return roundAndPackFloat32(zSign, zExp, zSig, status);
  1860. }
  1861. /*----------------------------------------------------------------------------
  1862. | Returns the result of subtracting the absolute values of the single-
  1863. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  1864. | difference is negated before being returned. `zSign' is ignored if the
  1865. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  1866. | Standard for Binary Floating-Point Arithmetic.
  1867. *----------------------------------------------------------------------------*/
  1868. static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
  1869. float_status *status)
  1870. {
  1871. int aExp, bExp, zExp;
  1872. uint32_t aSig, bSig, zSig;
  1873. int expDiff;
  1874. aSig = extractFloat32Frac( a );
  1875. aExp = extractFloat32Exp( a );
  1876. bSig = extractFloat32Frac( b );
  1877. bExp = extractFloat32Exp( b );
  1878. expDiff = aExp - bExp;
  1879. aSig <<= 7;
  1880. bSig <<= 7;
  1881. if ( 0 < expDiff ) goto aExpBigger;
  1882. if ( expDiff < 0 ) goto bExpBigger;
  1883. if ( aExp == 0xFF ) {
  1884. if (aSig | bSig) {
  1885. return propagateFloat32NaN(a, b, status);
  1886. }
  1887. float_raise(float_flag_invalid, status);
  1888. return float32_default_nan(status);
  1889. }
  1890. if ( aExp == 0 ) {
  1891. aExp = 1;
  1892. bExp = 1;
  1893. }
  1894. if ( bSig < aSig ) goto aBigger;
  1895. if ( aSig < bSig ) goto bBigger;
  1896. return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
  1897. bExpBigger:
  1898. if ( bExp == 0xFF ) {
  1899. if (bSig) {
  1900. return propagateFloat32NaN(a, b, status);
  1901. }
  1902. return packFloat32( zSign ^ 1, 0xFF, 0 );
  1903. }
  1904. if ( aExp == 0 ) {
  1905. ++expDiff;
  1906. }
  1907. else {
  1908. aSig |= 0x40000000;
  1909. }
  1910. shift32RightJamming( aSig, - expDiff, &aSig );
  1911. bSig |= 0x40000000;
  1912. bBigger:
  1913. zSig = bSig - aSig;
  1914. zExp = bExp;
  1915. zSign ^= 1;
  1916. goto normalizeRoundAndPack;
  1917. aExpBigger:
  1918. if ( aExp == 0xFF ) {
  1919. if (aSig) {
  1920. return propagateFloat32NaN(a, b, status);
  1921. }
  1922. return a;
  1923. }
  1924. if ( bExp == 0 ) {
  1925. --expDiff;
  1926. }
  1927. else {
  1928. bSig |= 0x40000000;
  1929. }
  1930. shift32RightJamming( bSig, expDiff, &bSig );
  1931. aSig |= 0x40000000;
  1932. aBigger:
  1933. zSig = aSig - bSig;
  1934. zExp = aExp;
  1935. normalizeRoundAndPack:
  1936. --zExp;
  1937. return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
  1938. }
  1939. /*----------------------------------------------------------------------------
  1940. | Returns the result of adding the single-precision floating-point values `a'
  1941. | and `b'. The operation is performed according to the IEC/IEEE Standard for
  1942. | Binary Floating-Point Arithmetic.
  1943. *----------------------------------------------------------------------------*/
  1944. float32 float32_add(float32 a, float32 b, float_status *status)
  1945. {
  1946. flag aSign, bSign;
  1947. a = float32_squash_input_denormal(a, status);
  1948. b = float32_squash_input_denormal(b, status);
  1949. aSign = extractFloat32Sign( a );
  1950. bSign = extractFloat32Sign( b );
  1951. if ( aSign == bSign ) {
  1952. return addFloat32Sigs(a, b, aSign, status);
  1953. }
  1954. else {
  1955. return subFloat32Sigs(a, b, aSign, status);
  1956. }
  1957. }
  1958. /*----------------------------------------------------------------------------
  1959. | Returns the result of subtracting the single-precision floating-point values
  1960. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  1961. | for Binary Floating-Point Arithmetic.
  1962. *----------------------------------------------------------------------------*/
  1963. float32 float32_sub(float32 a, float32 b, float_status *status)
  1964. {
  1965. flag aSign, bSign;
  1966. a = float32_squash_input_denormal(a, status);
  1967. b = float32_squash_input_denormal(b, status);
  1968. aSign = extractFloat32Sign( a );
  1969. bSign = extractFloat32Sign( b );
  1970. if ( aSign == bSign ) {
  1971. return subFloat32Sigs(a, b, aSign, status);
  1972. }
  1973. else {
  1974. return addFloat32Sigs(a, b, aSign, status);
  1975. }
  1976. }
  1977. /*----------------------------------------------------------------------------
  1978. | Returns the result of multiplying the single-precision floating-point values
  1979. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  1980. | for Binary Floating-Point Arithmetic.
  1981. *----------------------------------------------------------------------------*/
  1982. float32 float32_mul(float32 a, float32 b, float_status *status)
  1983. {
  1984. flag aSign, bSign, zSign;
  1985. int aExp, bExp, zExp;
  1986. uint32_t aSig, bSig;
  1987. uint64_t zSig64;
  1988. uint32_t zSig;
  1989. a = float32_squash_input_denormal(a, status);
  1990. b = float32_squash_input_denormal(b, status);
  1991. aSig = extractFloat32Frac( a );
  1992. aExp = extractFloat32Exp( a );
  1993. aSign = extractFloat32Sign( a );
  1994. bSig = extractFloat32Frac( b );
  1995. bExp = extractFloat32Exp( b );
  1996. bSign = extractFloat32Sign( b );
  1997. zSign = aSign ^ bSign;
  1998. if ( aExp == 0xFF ) {
  1999. if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
  2000. return propagateFloat32NaN(a, b, status);
  2001. }
  2002. if ( ( bExp | bSig ) == 0 ) {
  2003. float_raise(float_flag_invalid, status);
  2004. return float32_default_nan(status);
  2005. }
  2006. return packFloat32( zSign, 0xFF, 0 );
  2007. }
  2008. if ( bExp == 0xFF ) {
  2009. if (bSig) {
  2010. return propagateFloat32NaN(a, b, status);
  2011. }
  2012. if ( ( aExp | aSig ) == 0 ) {
  2013. float_raise(float_flag_invalid, status);
  2014. return float32_default_nan(status);
  2015. }
  2016. return packFloat32( zSign, 0xFF, 0 );
  2017. }
  2018. if ( aExp == 0 ) {
  2019. if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
  2020. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  2021. }
  2022. if ( bExp == 0 ) {
  2023. if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
  2024. normalizeFloat32Subnormal( bSig, &bExp, &bSig );
  2025. }
  2026. zExp = aExp + bExp - 0x7F;
  2027. aSig = ( aSig | 0x00800000 )<<7;
  2028. bSig = ( bSig | 0x00800000 )<<8;
  2029. shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
  2030. zSig = zSig64;
  2031. if ( 0 <= (int32_t) ( zSig<<1 ) ) {
  2032. zSig <<= 1;
  2033. --zExp;
  2034. }
  2035. return roundAndPackFloat32(zSign, zExp, zSig, status);
  2036. }
  2037. /*----------------------------------------------------------------------------
  2038. | Returns the result of dividing the single-precision floating-point value `a'
  2039. | by the corresponding value `b'. The operation is performed according to the
  2040. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2041. *----------------------------------------------------------------------------*/
  2042. float32 float32_div(float32 a, float32 b, float_status *status)
  2043. {
  2044. flag aSign, bSign, zSign;
  2045. int aExp, bExp, zExp;
  2046. uint32_t aSig, bSig, zSig;
  2047. a = float32_squash_input_denormal(a, status);
  2048. b = float32_squash_input_denormal(b, status);
  2049. aSig = extractFloat32Frac( a );
  2050. aExp = extractFloat32Exp( a );
  2051. aSign = extractFloat32Sign( a );
  2052. bSig = extractFloat32Frac( b );
  2053. bExp = extractFloat32Exp( b );
  2054. bSign = extractFloat32Sign( b );
  2055. zSign = aSign ^ bSign;
  2056. if ( aExp == 0xFF ) {
  2057. if (aSig) {
  2058. return propagateFloat32NaN(a, b, status);
  2059. }
  2060. if ( bExp == 0xFF ) {
  2061. if (bSig) {
  2062. return propagateFloat32NaN(a, b, status);
  2063. }
  2064. float_raise(float_flag_invalid, status);
  2065. return float32_default_nan(status);
  2066. }
  2067. return packFloat32( zSign, 0xFF, 0 );
  2068. }
  2069. if ( bExp == 0xFF ) {
  2070. if (bSig) {
  2071. return propagateFloat32NaN(a, b, status);
  2072. }
  2073. return packFloat32( zSign, 0, 0 );
  2074. }
  2075. if ( bExp == 0 ) {
  2076. if ( bSig == 0 ) {
  2077. if ( ( aExp | aSig ) == 0 ) {
  2078. float_raise(float_flag_invalid, status);
  2079. return float32_default_nan(status);
  2080. }
  2081. float_raise(float_flag_divbyzero, status);
  2082. return packFloat32( zSign, 0xFF, 0 );
  2083. }
  2084. normalizeFloat32Subnormal( bSig, &bExp, &bSig );
  2085. }
  2086. if ( aExp == 0 ) {
  2087. if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
  2088. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  2089. }
  2090. zExp = aExp - bExp + 0x7D;
  2091. aSig = ( aSig | 0x00800000 )<<7;
  2092. bSig = ( bSig | 0x00800000 )<<8;
  2093. if ( bSig <= ( aSig + aSig ) ) {
  2094. aSig >>= 1;
  2095. ++zExp;
  2096. }
  2097. zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
  2098. if ( ( zSig & 0x3F ) == 0 ) {
  2099. zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
  2100. }
  2101. return roundAndPackFloat32(zSign, zExp, zSig, status);
  2102. }
  2103. /*----------------------------------------------------------------------------
  2104. | Returns the remainder of the single-precision floating-point value `a'
  2105. | with respect to the corresponding value `b'. The operation is performed
  2106. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2107. *----------------------------------------------------------------------------*/
  2108. float32 float32_rem(float32 a, float32 b, float_status *status)
  2109. {
  2110. flag aSign, zSign;
  2111. int aExp, bExp, expDiff;
  2112. uint32_t aSig, bSig;
  2113. uint32_t q;
  2114. uint64_t aSig64, bSig64, q64;
  2115. uint32_t alternateASig;
  2116. int32_t sigMean;
  2117. a = float32_squash_input_denormal(a, status);
  2118. b = float32_squash_input_denormal(b, status);
  2119. aSig = extractFloat32Frac( a );
  2120. aExp = extractFloat32Exp( a );
  2121. aSign = extractFloat32Sign( a );
  2122. bSig = extractFloat32Frac( b );
  2123. bExp = extractFloat32Exp( b );
  2124. if ( aExp == 0xFF ) {
  2125. if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
  2126. return propagateFloat32NaN(a, b, status);
  2127. }
  2128. float_raise(float_flag_invalid, status);
  2129. return float32_default_nan(status);
  2130. }
  2131. if ( bExp == 0xFF ) {
  2132. if (bSig) {
  2133. return propagateFloat32NaN(a, b, status);
  2134. }
  2135. return a;
  2136. }
  2137. if ( bExp == 0 ) {
  2138. if ( bSig == 0 ) {
  2139. float_raise(float_flag_invalid, status);
  2140. return float32_default_nan(status);
  2141. }
  2142. normalizeFloat32Subnormal( bSig, &bExp, &bSig );
  2143. }
  2144. if ( aExp == 0 ) {
  2145. if ( aSig == 0 ) return a;
  2146. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  2147. }
  2148. expDiff = aExp - bExp;
  2149. aSig |= 0x00800000;
  2150. bSig |= 0x00800000;
  2151. if ( expDiff < 32 ) {
  2152. aSig <<= 8;
  2153. bSig <<= 8;
  2154. if ( expDiff < 0 ) {
  2155. if ( expDiff < -1 ) return a;
  2156. aSig >>= 1;
  2157. }
  2158. q = ( bSig <= aSig );
  2159. if ( q ) aSig -= bSig;
  2160. if ( 0 < expDiff ) {
  2161. q = ( ( (uint64_t) aSig )<<32 ) / bSig;
  2162. q >>= 32 - expDiff;
  2163. bSig >>= 2;
  2164. aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
  2165. }
  2166. else {
  2167. aSig >>= 2;
  2168. bSig >>= 2;
  2169. }
  2170. }
  2171. else {
  2172. if ( bSig <= aSig ) aSig -= bSig;
  2173. aSig64 = ( (uint64_t) aSig )<<40;
  2174. bSig64 = ( (uint64_t) bSig )<<40;
  2175. expDiff -= 64;
  2176. while ( 0 < expDiff ) {
  2177. q64 = estimateDiv128To64( aSig64, 0, bSig64 );
  2178. q64 = ( 2 < q64 ) ? q64 - 2 : 0;
  2179. aSig64 = - ( ( bSig * q64 )<<38 );
  2180. expDiff -= 62;
  2181. }
  2182. expDiff += 64;
  2183. q64 = estimateDiv128To64( aSig64, 0, bSig64 );
  2184. q64 = ( 2 < q64 ) ? q64 - 2 : 0;
  2185. q = q64>>( 64 - expDiff );
  2186. bSig <<= 6;
  2187. aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
  2188. }
  2189. do {
  2190. alternateASig = aSig;
  2191. ++q;
  2192. aSig -= bSig;
  2193. } while ( 0 <= (int32_t) aSig );
  2194. sigMean = aSig + alternateASig;
  2195. if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
  2196. aSig = alternateASig;
  2197. }
  2198. zSign = ( (int32_t) aSig < 0 );
  2199. if ( zSign ) aSig = - aSig;
  2200. return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
  2201. }
  2202. /*----------------------------------------------------------------------------
  2203. | Returns the result of multiplying the single-precision floating-point values
  2204. | `a' and `b' then adding 'c', with no intermediate rounding step after the
  2205. | multiplication. The operation is performed according to the IEC/IEEE
  2206. | Standard for Binary Floating-Point Arithmetic 754-2008.
  2207. | The flags argument allows the caller to select negation of the
  2208. | addend, the intermediate product, or the final result. (The difference
  2209. | between this and having the caller do a separate negation is that negating
  2210. | externally will flip the sign bit on NaNs.)
  2211. *----------------------------------------------------------------------------*/
  2212. float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
  2213. float_status *status)
  2214. {
  2215. flag aSign, bSign, cSign, zSign;
  2216. int aExp, bExp, cExp, pExp, zExp, expDiff;
  2217. uint32_t aSig, bSig, cSig;
  2218. flag pInf, pZero, pSign;
  2219. uint64_t pSig64, cSig64, zSig64;
  2220. uint32_t pSig;
  2221. int shiftcount;
  2222. flag signflip, infzero;
  2223. a = float32_squash_input_denormal(a, status);
  2224. b = float32_squash_input_denormal(b, status);
  2225. c = float32_squash_input_denormal(c, status);
  2226. aSig = extractFloat32Frac(a);
  2227. aExp = extractFloat32Exp(a);
  2228. aSign = extractFloat32Sign(a);
  2229. bSig = extractFloat32Frac(b);
  2230. bExp = extractFloat32Exp(b);
  2231. bSign = extractFloat32Sign(b);
  2232. cSig = extractFloat32Frac(c);
  2233. cExp = extractFloat32Exp(c);
  2234. cSign = extractFloat32Sign(c);
  2235. infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
  2236. (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
  2237. /* It is implementation-defined whether the cases of (0,inf,qnan)
  2238. * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
  2239. * they return if they do), so we have to hand this information
  2240. * off to the target-specific pick-a-NaN routine.
  2241. */
  2242. if (((aExp == 0xff) && aSig) ||
  2243. ((bExp == 0xff) && bSig) ||
  2244. ((cExp == 0xff) && cSig)) {
  2245. return propagateFloat32MulAddNaN(a, b, c, infzero, status);
  2246. }
  2247. if (infzero) {
  2248. float_raise(float_flag_invalid, status);
  2249. return float32_default_nan(status);
  2250. }
  2251. if (flags & float_muladd_negate_c) {
  2252. cSign ^= 1;
  2253. }
  2254. signflip = (flags & float_muladd_negate_result) ? 1 : 0;
  2255. /* Work out the sign and type of the product */
  2256. pSign = aSign ^ bSign;
  2257. if (flags & float_muladd_negate_product) {
  2258. pSign ^= 1;
  2259. }
  2260. pInf = (aExp == 0xff) || (bExp == 0xff);
  2261. pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
  2262. if (cExp == 0xff) {
  2263. if (pInf && (pSign ^ cSign)) {
  2264. /* addition of opposite-signed infinities => InvalidOperation */
  2265. float_raise(float_flag_invalid, status);
  2266. return float32_default_nan(status);
  2267. }
  2268. /* Otherwise generate an infinity of the same sign */
  2269. return packFloat32(cSign ^ signflip, 0xff, 0);
  2270. }
  2271. if (pInf) {
  2272. return packFloat32(pSign ^ signflip, 0xff, 0);
  2273. }
  2274. if (pZero) {
  2275. if (cExp == 0) {
  2276. if (cSig == 0) {
  2277. /* Adding two exact zeroes */
  2278. if (pSign == cSign) {
  2279. zSign = pSign;
  2280. } else if (status->float_rounding_mode == float_round_down) {
  2281. zSign = 1;
  2282. } else {
  2283. zSign = 0;
  2284. }
  2285. return packFloat32(zSign ^ signflip, 0, 0);
  2286. }
  2287. /* Exact zero plus a denorm */
  2288. if (status->flush_to_zero) {
  2289. float_raise(float_flag_output_denormal, status);
  2290. return packFloat32(cSign ^ signflip, 0, 0);
  2291. }
  2292. }
  2293. /* Zero plus something non-zero : just return the something */
  2294. if (flags & float_muladd_halve_result) {
  2295. if (cExp == 0) {
  2296. normalizeFloat32Subnormal(cSig, &cExp, &cSig);
  2297. }
  2298. /* Subtract one to halve, and one again because roundAndPackFloat32
  2299. * wants one less than the true exponent.
  2300. */
  2301. cExp -= 2;
  2302. cSig = (cSig | 0x00800000) << 7;
  2303. return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
  2304. }
  2305. return packFloat32(cSign ^ signflip, cExp, cSig);
  2306. }
  2307. if (aExp == 0) {
  2308. normalizeFloat32Subnormal(aSig, &aExp, &aSig);
  2309. }
  2310. if (bExp == 0) {
  2311. normalizeFloat32Subnormal(bSig, &bExp, &bSig);
  2312. }
  2313. /* Calculate the actual result a * b + c */
  2314. /* Multiply first; this is easy. */
  2315. /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
  2316. * because we want the true exponent, not the "one-less-than"
  2317. * flavour that roundAndPackFloat32() takes.
  2318. */
  2319. pExp = aExp + bExp - 0x7e;
  2320. aSig = (aSig | 0x00800000) << 7;
  2321. bSig = (bSig | 0x00800000) << 8;
  2322. pSig64 = (uint64_t)aSig * bSig;
  2323. if ((int64_t)(pSig64 << 1) >= 0) {
  2324. pSig64 <<= 1;
  2325. pExp--;
  2326. }
  2327. zSign = pSign ^ signflip;
  2328. /* Now pSig64 is the significand of the multiply, with the explicit bit in
  2329. * position 62.
  2330. */
  2331. if (cExp == 0) {
  2332. if (!cSig) {
  2333. /* Throw out the special case of c being an exact zero now */
  2334. shift64RightJamming(pSig64, 32, &pSig64);
  2335. pSig = pSig64;
  2336. if (flags & float_muladd_halve_result) {
  2337. pExp--;
  2338. }
  2339. return roundAndPackFloat32(zSign, pExp - 1,
  2340. pSig, status);
  2341. }
  2342. normalizeFloat32Subnormal(cSig, &cExp, &cSig);
  2343. }
  2344. cSig64 = (uint64_t)cSig << (62 - 23);
  2345. cSig64 |= LIT64(0x4000000000000000);
  2346. expDiff = pExp - cExp;
  2347. if (pSign == cSign) {
  2348. /* Addition */
  2349. if (expDiff > 0) {
  2350. /* scale c to match p */
  2351. shift64RightJamming(cSig64, expDiff, &cSig64);
  2352. zExp = pExp;
  2353. } else if (expDiff < 0) {
  2354. /* scale p to match c */
  2355. shift64RightJamming(pSig64, -expDiff, &pSig64);
  2356. zExp = cExp;
  2357. } else {
  2358. /* no scaling needed */
  2359. zExp = cExp;
  2360. }
  2361. /* Add significands and make sure explicit bit ends up in posn 62 */
  2362. zSig64 = pSig64 + cSig64;
  2363. if ((int64_t)zSig64 < 0) {
  2364. shift64RightJamming(zSig64, 1, &zSig64);
  2365. } else {
  2366. zExp--;
  2367. }
  2368. } else {
  2369. /* Subtraction */
  2370. if (expDiff > 0) {
  2371. shift64RightJamming(cSig64, expDiff, &cSig64);
  2372. zSig64 = pSig64 - cSig64;
  2373. zExp = pExp;
  2374. } else if (expDiff < 0) {
  2375. shift64RightJamming(pSig64, -expDiff, &pSig64);
  2376. zSig64 = cSig64 - pSig64;
  2377. zExp = cExp;
  2378. zSign ^= 1;
  2379. } else {
  2380. zExp = pExp;
  2381. if (cSig64 < pSig64) {
  2382. zSig64 = pSig64 - cSig64;
  2383. } else if (pSig64 < cSig64) {
  2384. zSig64 = cSig64 - pSig64;
  2385. zSign ^= 1;
  2386. } else {
  2387. /* Exact zero */
  2388. zSign = signflip;
  2389. if (status->float_rounding_mode == float_round_down) {
  2390. zSign ^= 1;
  2391. }
  2392. return packFloat32(zSign, 0, 0);
  2393. }
  2394. }
  2395. --zExp;
  2396. /* Normalize to put the explicit bit back into bit 62. */
  2397. shiftcount = countLeadingZeros64(zSig64) - 1;
  2398. zSig64 <<= shiftcount;
  2399. zExp -= shiftcount;
  2400. }
  2401. if (flags & float_muladd_halve_result) {
  2402. zExp--;
  2403. }
  2404. shift64RightJamming(zSig64, 32, &zSig64);
  2405. return roundAndPackFloat32(zSign, zExp, zSig64, status);
  2406. }
  2407. /*----------------------------------------------------------------------------
  2408. | Returns the square root of the single-precision floating-point value `a'.
  2409. | The operation is performed according to the IEC/IEEE Standard for Binary
  2410. | Floating-Point Arithmetic.
  2411. *----------------------------------------------------------------------------*/
  2412. float32 float32_sqrt(float32 a, float_status *status)
  2413. {
  2414. flag aSign;
  2415. int aExp, zExp;
  2416. uint32_t aSig, zSig;
  2417. uint64_t rem, term;
  2418. a = float32_squash_input_denormal(a, status);
  2419. aSig = extractFloat32Frac( a );
  2420. aExp = extractFloat32Exp( a );
  2421. aSign = extractFloat32Sign( a );
  2422. if ( aExp == 0xFF ) {
  2423. if (aSig) {
  2424. return propagateFloat32NaN(a, float32_zero, status);
  2425. }
  2426. if ( ! aSign ) return a;
  2427. float_raise(float_flag_invalid, status);
  2428. return float32_default_nan(status);
  2429. }
  2430. if ( aSign ) {
  2431. if ( ( aExp | aSig ) == 0 ) return a;
  2432. float_raise(float_flag_invalid, status);
  2433. return float32_default_nan(status);
  2434. }
  2435. if ( aExp == 0 ) {
  2436. if ( aSig == 0 ) return float32_zero;
  2437. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  2438. }
  2439. zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
  2440. aSig = ( aSig | 0x00800000 )<<8;
  2441. zSig = estimateSqrt32( aExp, aSig ) + 2;
  2442. if ( ( zSig & 0x7F ) <= 5 ) {
  2443. if ( zSig < 2 ) {
  2444. zSig = 0x7FFFFFFF;
  2445. goto roundAndPack;
  2446. }
  2447. aSig >>= aExp & 1;
  2448. term = ( (uint64_t) zSig ) * zSig;
  2449. rem = ( ( (uint64_t) aSig )<<32 ) - term;
  2450. while ( (int64_t) rem < 0 ) {
  2451. --zSig;
  2452. rem += ( ( (uint64_t) zSig )<<1 ) | 1;
  2453. }
  2454. zSig |= ( rem != 0 );
  2455. }
  2456. shift32RightJamming( zSig, 1, &zSig );
  2457. roundAndPack:
  2458. return roundAndPackFloat32(0, zExp, zSig, status);
  2459. }
  2460. /*----------------------------------------------------------------------------
  2461. | Returns the binary exponential of the single-precision floating-point value
  2462. | `a'. The operation is performed according to the IEC/IEEE Standard for
  2463. | Binary Floating-Point Arithmetic.
  2464. |
  2465. | Uses the following identities:
  2466. |
  2467. | 1. -------------------------------------------------------------------------
  2468. | x x*ln(2)
  2469. | 2 = e
  2470. |
  2471. | 2. -------------------------------------------------------------------------
  2472. | 2 3 4 5 n
  2473. | x x x x x x x
  2474. | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
  2475. | 1! 2! 3! 4! 5! n!
  2476. *----------------------------------------------------------------------------*/
  2477. static const float64 float32_exp2_coefficients[15] =
  2478. {
  2479. const_float64( 0x3ff0000000000000ll ), /* 1 */
  2480. const_float64( 0x3fe0000000000000ll ), /* 2 */
  2481. const_float64( 0x3fc5555555555555ll ), /* 3 */
  2482. const_float64( 0x3fa5555555555555ll ), /* 4 */
  2483. const_float64( 0x3f81111111111111ll ), /* 5 */
  2484. const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
  2485. const_float64( 0x3f2a01a01a01a01all ), /* 7 */
  2486. const_float64( 0x3efa01a01a01a01all ), /* 8 */
  2487. const_float64( 0x3ec71de3a556c734ll ), /* 9 */
  2488. const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
  2489. const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
  2490. const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
  2491. const_float64( 0x3de6124613a86d09ll ), /* 13 */
  2492. const_float64( 0x3da93974a8c07c9dll ), /* 14 */
  2493. const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
  2494. };
  2495. float32 float32_exp2(float32 a, float_status *status)
  2496. {
  2497. flag aSign;
  2498. int aExp;
  2499. uint32_t aSig;
  2500. float64 r, x, xn;
  2501. int i;
  2502. a = float32_squash_input_denormal(a, status);
  2503. aSig = extractFloat32Frac( a );
  2504. aExp = extractFloat32Exp( a );
  2505. aSign = extractFloat32Sign( a );
  2506. if ( aExp == 0xFF) {
  2507. if (aSig) {
  2508. return propagateFloat32NaN(a, float32_zero, status);
  2509. }
  2510. return (aSign) ? float32_zero : a;
  2511. }
  2512. if (aExp == 0) {
  2513. if (aSig == 0) return float32_one;
  2514. }
  2515. float_raise(float_flag_inexact, status);
  2516. /* ******************************* */
  2517. /* using float64 for approximation */
  2518. /* ******************************* */
  2519. x = float32_to_float64(a, status);
  2520. x = float64_mul(x, float64_ln2, status);
  2521. xn = x;
  2522. r = float64_one;
  2523. for (i = 0 ; i < 15 ; i++) {
  2524. float64 f;
  2525. f = float64_mul(xn, float32_exp2_coefficients[i], status);
  2526. r = float64_add(r, f, status);
  2527. xn = float64_mul(xn, x, status);
  2528. }
  2529. return float64_to_float32(r, status);
  2530. }
  2531. /*----------------------------------------------------------------------------
  2532. | Returns the binary log of the single-precision floating-point value `a'.
  2533. | The operation is performed according to the IEC/IEEE Standard for Binary
  2534. | Floating-Point Arithmetic.
  2535. *----------------------------------------------------------------------------*/
  2536. float32 float32_log2(float32 a, float_status *status)
  2537. {
  2538. flag aSign, zSign;
  2539. int aExp;
  2540. uint32_t aSig, zSig, i;
  2541. a = float32_squash_input_denormal(a, status);
  2542. aSig = extractFloat32Frac( a );
  2543. aExp = extractFloat32Exp( a );
  2544. aSign = extractFloat32Sign( a );
  2545. if ( aExp == 0 ) {
  2546. if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
  2547. normalizeFloat32Subnormal( aSig, &aExp, &aSig );
  2548. }
  2549. if ( aSign ) {
  2550. float_raise(float_flag_invalid, status);
  2551. return float32_default_nan(status);
  2552. }
  2553. if ( aExp == 0xFF ) {
  2554. if (aSig) {
  2555. return propagateFloat32NaN(a, float32_zero, status);
  2556. }
  2557. return a;
  2558. }
  2559. aExp -= 0x7F;
  2560. aSig |= 0x00800000;
  2561. zSign = aExp < 0;
  2562. zSig = aExp << 23;
  2563. for (i = 1 << 22; i > 0; i >>= 1) {
  2564. aSig = ( (uint64_t)aSig * aSig ) >> 23;
  2565. if ( aSig & 0x01000000 ) {
  2566. aSig >>= 1;
  2567. zSig |= i;
  2568. }
  2569. }
  2570. if ( zSign )
  2571. zSig = -zSig;
  2572. return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
  2573. }
  2574. /*----------------------------------------------------------------------------
  2575. | Returns 1 if the single-precision floating-point value `a' is equal to
  2576. | the corresponding value `b', and 0 otherwise. The invalid exception is
  2577. | raised if either operand is a NaN. Otherwise, the comparison is performed
  2578. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2579. *----------------------------------------------------------------------------*/
  2580. int float32_eq(float32 a, float32 b, float_status *status)
  2581. {
  2582. uint32_t av, bv;
  2583. a = float32_squash_input_denormal(a, status);
  2584. b = float32_squash_input_denormal(b, status);
  2585. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2586. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2587. ) {
  2588. float_raise(float_flag_invalid, status);
  2589. return 0;
  2590. }
  2591. av = float32_val(a);
  2592. bv = float32_val(b);
  2593. return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
  2594. }
  2595. /*----------------------------------------------------------------------------
  2596. | Returns 1 if the single-precision floating-point value `a' is less than
  2597. | or equal to the corresponding value `b', and 0 otherwise. The invalid
  2598. | exception is raised if either operand is a NaN. The comparison is performed
  2599. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2600. *----------------------------------------------------------------------------*/
  2601. int float32_le(float32 a, float32 b, float_status *status)
  2602. {
  2603. flag aSign, bSign;
  2604. uint32_t av, bv;
  2605. a = float32_squash_input_denormal(a, status);
  2606. b = float32_squash_input_denormal(b, status);
  2607. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2608. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2609. ) {
  2610. float_raise(float_flag_invalid, status);
  2611. return 0;
  2612. }
  2613. aSign = extractFloat32Sign( a );
  2614. bSign = extractFloat32Sign( b );
  2615. av = float32_val(a);
  2616. bv = float32_val(b);
  2617. if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
  2618. return ( av == bv ) || ( aSign ^ ( av < bv ) );
  2619. }
  2620. /*----------------------------------------------------------------------------
  2621. | Returns 1 if the single-precision floating-point value `a' is less than
  2622. | the corresponding value `b', and 0 otherwise. The invalid exception is
  2623. | raised if either operand is a NaN. The comparison is performed according
  2624. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2625. *----------------------------------------------------------------------------*/
  2626. int float32_lt(float32 a, float32 b, float_status *status)
  2627. {
  2628. flag aSign, bSign;
  2629. uint32_t av, bv;
  2630. a = float32_squash_input_denormal(a, status);
  2631. b = float32_squash_input_denormal(b, status);
  2632. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2633. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2634. ) {
  2635. float_raise(float_flag_invalid, status);
  2636. return 0;
  2637. }
  2638. aSign = extractFloat32Sign( a );
  2639. bSign = extractFloat32Sign( b );
  2640. av = float32_val(a);
  2641. bv = float32_val(b);
  2642. if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
  2643. return ( av != bv ) && ( aSign ^ ( av < bv ) );
  2644. }
  2645. /*----------------------------------------------------------------------------
  2646. | Returns 1 if the single-precision floating-point values `a' and `b' cannot
  2647. | be compared, and 0 otherwise. The invalid exception is raised if either
  2648. | operand is a NaN. The comparison is performed according to the IEC/IEEE
  2649. | Standard for Binary Floating-Point Arithmetic.
  2650. *----------------------------------------------------------------------------*/
  2651. int float32_unordered(float32 a, float32 b, float_status *status)
  2652. {
  2653. a = float32_squash_input_denormal(a, status);
  2654. b = float32_squash_input_denormal(b, status);
  2655. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2656. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2657. ) {
  2658. float_raise(float_flag_invalid, status);
  2659. return 1;
  2660. }
  2661. return 0;
  2662. }
  2663. /*----------------------------------------------------------------------------
  2664. | Returns 1 if the single-precision floating-point value `a' is equal to
  2665. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  2666. | exception. The comparison is performed according to the IEC/IEEE Standard
  2667. | for Binary Floating-Point Arithmetic.
  2668. *----------------------------------------------------------------------------*/
  2669. int float32_eq_quiet(float32 a, float32 b, float_status *status)
  2670. {
  2671. a = float32_squash_input_denormal(a, status);
  2672. b = float32_squash_input_denormal(b, status);
  2673. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2674. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2675. ) {
  2676. if (float32_is_signaling_nan(a, status)
  2677. || float32_is_signaling_nan(b, status)) {
  2678. float_raise(float_flag_invalid, status);
  2679. }
  2680. return 0;
  2681. }
  2682. return ( float32_val(a) == float32_val(b) ) ||
  2683. ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
  2684. }
  2685. /*----------------------------------------------------------------------------
  2686. | Returns 1 if the single-precision floating-point value `a' is less than or
  2687. | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  2688. | cause an exception. Otherwise, the comparison is performed according to the
  2689. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  2690. *----------------------------------------------------------------------------*/
  2691. int float32_le_quiet(float32 a, float32 b, float_status *status)
  2692. {
  2693. flag aSign, bSign;
  2694. uint32_t av, bv;
  2695. a = float32_squash_input_denormal(a, status);
  2696. b = float32_squash_input_denormal(b, status);
  2697. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2698. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2699. ) {
  2700. if (float32_is_signaling_nan(a, status)
  2701. || float32_is_signaling_nan(b, status)) {
  2702. float_raise(float_flag_invalid, status);
  2703. }
  2704. return 0;
  2705. }
  2706. aSign = extractFloat32Sign( a );
  2707. bSign = extractFloat32Sign( b );
  2708. av = float32_val(a);
  2709. bv = float32_val(b);
  2710. if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
  2711. return ( av == bv ) || ( aSign ^ ( av < bv ) );
  2712. }
  2713. /*----------------------------------------------------------------------------
  2714. | Returns 1 if the single-precision floating-point value `a' is less than
  2715. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  2716. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  2717. | Standard for Binary Floating-Point Arithmetic.
  2718. *----------------------------------------------------------------------------*/
  2719. int float32_lt_quiet(float32 a, float32 b, float_status *status)
  2720. {
  2721. flag aSign, bSign;
  2722. uint32_t av, bv;
  2723. a = float32_squash_input_denormal(a, status);
  2724. b = float32_squash_input_denormal(b, status);
  2725. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2726. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2727. ) {
  2728. if (float32_is_signaling_nan(a, status)
  2729. || float32_is_signaling_nan(b, status)) {
  2730. float_raise(float_flag_invalid, status);
  2731. }
  2732. return 0;
  2733. }
  2734. aSign = extractFloat32Sign( a );
  2735. bSign = extractFloat32Sign( b );
  2736. av = float32_val(a);
  2737. bv = float32_val(b);
  2738. if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
  2739. return ( av != bv ) && ( aSign ^ ( av < bv ) );
  2740. }
  2741. /*----------------------------------------------------------------------------
  2742. | Returns 1 if the single-precision floating-point values `a' and `b' cannot
  2743. | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
  2744. | comparison is performed according to the IEC/IEEE Standard for Binary
  2745. | Floating-Point Arithmetic.
  2746. *----------------------------------------------------------------------------*/
  2747. int float32_unordered_quiet(float32 a, float32 b, float_status *status)
  2748. {
  2749. a = float32_squash_input_denormal(a, status);
  2750. b = float32_squash_input_denormal(b, status);
  2751. if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
  2752. || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
  2753. ) {
  2754. if (float32_is_signaling_nan(a, status)
  2755. || float32_is_signaling_nan(b, status)) {
  2756. float_raise(float_flag_invalid, status);
  2757. }
  2758. return 1;
  2759. }
  2760. return 0;
  2761. }
  2762. /*----------------------------------------------------------------------------
  2763. | Returns the result of converting the double-precision floating-point value
  2764. | `a' to the 32-bit two's complement integer format. The conversion is
  2765. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2766. | Arithmetic---which means in particular that the conversion is rounded
  2767. | according to the current rounding mode. If `a' is a NaN, the largest
  2768. | positive integer is returned. Otherwise, if the conversion overflows, the
  2769. | largest integer with the same sign as `a' is returned.
  2770. *----------------------------------------------------------------------------*/
  2771. int32_t float64_to_int32(float64 a, float_status *status)
  2772. {
  2773. flag aSign;
  2774. int aExp;
  2775. int shiftCount;
  2776. uint64_t aSig;
  2777. a = float64_squash_input_denormal(a, status);
  2778. aSig = extractFloat64Frac( a );
  2779. aExp = extractFloat64Exp( a );
  2780. aSign = extractFloat64Sign( a );
  2781. if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
  2782. if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
  2783. shiftCount = 0x42C - aExp;
  2784. if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
  2785. return roundAndPackInt32(aSign, aSig, status);
  2786. }
  2787. /*----------------------------------------------------------------------------
  2788. | Returns the result of converting the double-precision floating-point value
  2789. | `a' to the 32-bit two's complement integer format. The conversion is
  2790. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2791. | Arithmetic, except that the conversion is always rounded toward zero.
  2792. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2793. | the conversion overflows, the largest integer with the same sign as `a' is
  2794. | returned.
  2795. *----------------------------------------------------------------------------*/
  2796. int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
  2797. {
  2798. flag aSign;
  2799. int aExp;
  2800. int shiftCount;
  2801. uint64_t aSig, savedASig;
  2802. int32_t z;
  2803. a = float64_squash_input_denormal(a, status);
  2804. aSig = extractFloat64Frac( a );
  2805. aExp = extractFloat64Exp( a );
  2806. aSign = extractFloat64Sign( a );
  2807. if ( 0x41E < aExp ) {
  2808. if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
  2809. goto invalid;
  2810. }
  2811. else if ( aExp < 0x3FF ) {
  2812. if (aExp || aSig) {
  2813. status->float_exception_flags |= float_flag_inexact;
  2814. }
  2815. return 0;
  2816. }
  2817. aSig |= LIT64( 0x0010000000000000 );
  2818. shiftCount = 0x433 - aExp;
  2819. savedASig = aSig;
  2820. aSig >>= shiftCount;
  2821. z = aSig;
  2822. if ( aSign ) z = - z;
  2823. if ( ( z < 0 ) ^ aSign ) {
  2824. invalid:
  2825. float_raise(float_flag_invalid, status);
  2826. return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
  2827. }
  2828. if ( ( aSig<<shiftCount ) != savedASig ) {
  2829. status->float_exception_flags |= float_flag_inexact;
  2830. }
  2831. return z;
  2832. }
  2833. /*----------------------------------------------------------------------------
  2834. | Returns the result of converting the double-precision floating-point value
  2835. | `a' to the 16-bit two's complement integer format. The conversion is
  2836. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2837. | Arithmetic, except that the conversion is always rounded toward zero.
  2838. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2839. | the conversion overflows, the largest integer with the same sign as `a' is
  2840. | returned.
  2841. *----------------------------------------------------------------------------*/
  2842. int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
  2843. {
  2844. flag aSign;
  2845. int aExp;
  2846. int shiftCount;
  2847. uint64_t aSig, savedASig;
  2848. int32_t z;
  2849. aSig = extractFloat64Frac( a );
  2850. aExp = extractFloat64Exp( a );
  2851. aSign = extractFloat64Sign( a );
  2852. if ( 0x40E < aExp ) {
  2853. if ( ( aExp == 0x7FF ) && aSig ) {
  2854. aSign = 0;
  2855. }
  2856. goto invalid;
  2857. }
  2858. else if ( aExp < 0x3FF ) {
  2859. if ( aExp || aSig ) {
  2860. status->float_exception_flags |= float_flag_inexact;
  2861. }
  2862. return 0;
  2863. }
  2864. aSig |= LIT64( 0x0010000000000000 );
  2865. shiftCount = 0x433 - aExp;
  2866. savedASig = aSig;
  2867. aSig >>= shiftCount;
  2868. z = aSig;
  2869. if ( aSign ) {
  2870. z = - z;
  2871. }
  2872. if ( ( (int16_t)z < 0 ) ^ aSign ) {
  2873. invalid:
  2874. float_raise(float_flag_invalid, status);
  2875. return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
  2876. }
  2877. if ( ( aSig<<shiftCount ) != savedASig ) {
  2878. status->float_exception_flags |= float_flag_inexact;
  2879. }
  2880. return z;
  2881. }
  2882. /*----------------------------------------------------------------------------
  2883. | Returns the result of converting the double-precision floating-point value
  2884. | `a' to the 64-bit two's complement integer format. The conversion is
  2885. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2886. | Arithmetic---which means in particular that the conversion is rounded
  2887. | according to the current rounding mode. If `a' is a NaN, the largest
  2888. | positive integer is returned. Otherwise, if the conversion overflows, the
  2889. | largest integer with the same sign as `a' is returned.
  2890. *----------------------------------------------------------------------------*/
  2891. int64_t float64_to_int64(float64 a, float_status *status)
  2892. {
  2893. flag aSign;
  2894. int aExp;
  2895. int shiftCount;
  2896. uint64_t aSig, aSigExtra;
  2897. a = float64_squash_input_denormal(a, status);
  2898. aSig = extractFloat64Frac( a );
  2899. aExp = extractFloat64Exp( a );
  2900. aSign = extractFloat64Sign( a );
  2901. if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
  2902. shiftCount = 0x433 - aExp;
  2903. if ( shiftCount <= 0 ) {
  2904. if ( 0x43E < aExp ) {
  2905. float_raise(float_flag_invalid, status);
  2906. if ( ! aSign
  2907. || ( ( aExp == 0x7FF )
  2908. && ( aSig != LIT64( 0x0010000000000000 ) ) )
  2909. ) {
  2910. return LIT64( 0x7FFFFFFFFFFFFFFF );
  2911. }
  2912. return (int64_t) LIT64( 0x8000000000000000 );
  2913. }
  2914. aSigExtra = 0;
  2915. aSig <<= - shiftCount;
  2916. }
  2917. else {
  2918. shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
  2919. }
  2920. return roundAndPackInt64(aSign, aSig, aSigExtra, status);
  2921. }
  2922. /*----------------------------------------------------------------------------
  2923. | Returns the result of converting the double-precision floating-point value
  2924. | `a' to the 64-bit two's complement integer format. The conversion is
  2925. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2926. | Arithmetic, except that the conversion is always rounded toward zero.
  2927. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  2928. | the conversion overflows, the largest integer with the same sign as `a' is
  2929. | returned.
  2930. *----------------------------------------------------------------------------*/
  2931. int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
  2932. {
  2933. flag aSign;
  2934. int aExp;
  2935. int shiftCount;
  2936. uint64_t aSig;
  2937. int64_t z;
  2938. a = float64_squash_input_denormal(a, status);
  2939. aSig = extractFloat64Frac( a );
  2940. aExp = extractFloat64Exp( a );
  2941. aSign = extractFloat64Sign( a );
  2942. if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
  2943. shiftCount = aExp - 0x433;
  2944. if ( 0 <= shiftCount ) {
  2945. if ( 0x43E <= aExp ) {
  2946. if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
  2947. float_raise(float_flag_invalid, status);
  2948. if ( ! aSign
  2949. || ( ( aExp == 0x7FF )
  2950. && ( aSig != LIT64( 0x0010000000000000 ) ) )
  2951. ) {
  2952. return LIT64( 0x7FFFFFFFFFFFFFFF );
  2953. }
  2954. }
  2955. return (int64_t) LIT64( 0x8000000000000000 );
  2956. }
  2957. z = aSig<<shiftCount;
  2958. }
  2959. else {
  2960. if ( aExp < 0x3FE ) {
  2961. if (aExp | aSig) {
  2962. status->float_exception_flags |= float_flag_inexact;
  2963. }
  2964. return 0;
  2965. }
  2966. z = aSig>>( - shiftCount );
  2967. if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
  2968. status->float_exception_flags |= float_flag_inexact;
  2969. }
  2970. }
  2971. if ( aSign ) z = - z;
  2972. return z;
  2973. }
  2974. /*----------------------------------------------------------------------------
  2975. | Returns the result of converting the double-precision floating-point value
  2976. | `a' to the single-precision floating-point format. The conversion is
  2977. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  2978. | Arithmetic.
  2979. *----------------------------------------------------------------------------*/
  2980. float32 float64_to_float32(float64 a, float_status *status)
  2981. {
  2982. flag aSign;
  2983. int aExp;
  2984. uint64_t aSig;
  2985. uint32_t zSig;
  2986. a = float64_squash_input_denormal(a, status);
  2987. aSig = extractFloat64Frac( a );
  2988. aExp = extractFloat64Exp( a );
  2989. aSign = extractFloat64Sign( a );
  2990. if ( aExp == 0x7FF ) {
  2991. if (aSig) {
  2992. return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
  2993. }
  2994. return packFloat32( aSign, 0xFF, 0 );
  2995. }
  2996. shift64RightJamming( aSig, 22, &aSig );
  2997. zSig = aSig;
  2998. if ( aExp || zSig ) {
  2999. zSig |= 0x40000000;
  3000. aExp -= 0x381;
  3001. }
  3002. return roundAndPackFloat32(aSign, aExp, zSig, status);
  3003. }
  3004. /*----------------------------------------------------------------------------
  3005. | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
  3006. | half-precision floating-point value, returning the result. After being
  3007. | shifted into the proper positions, the three fields are simply added
  3008. | together to form the result. This means that any integer portion of `zSig'
  3009. | will be added into the exponent. Since a properly normalized significand
  3010. | will have an integer portion equal to 1, the `zExp' input should be 1 less
  3011. | than the desired result exponent whenever `zSig' is a complete, normalized
  3012. | significand.
  3013. *----------------------------------------------------------------------------*/
  3014. static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
  3015. {
  3016. return make_float16(
  3017. (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
  3018. }
  3019. /*----------------------------------------------------------------------------
  3020. | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
  3021. | and significand `zSig', and returns the proper half-precision floating-
  3022. | point value corresponding to the abstract input. Ordinarily, the abstract
  3023. | value is simply rounded and packed into the half-precision format, with
  3024. | the inexact exception raised if the abstract input cannot be represented
  3025. | exactly. However, if the abstract value is too large, the overflow and
  3026. | inexact exceptions are raised and an infinity or maximal finite value is
  3027. | returned. If the abstract value is too small, the input value is rounded to
  3028. | a subnormal number, and the underflow and inexact exceptions are raised if
  3029. | the abstract input cannot be represented exactly as a subnormal half-
  3030. | precision floating-point number.
  3031. | The `ieee' flag indicates whether to use IEEE standard half precision, or
  3032. | ARM-style "alternative representation", which omits the NaN and Inf
  3033. | encodings in order to raise the maximum representable exponent by one.
  3034. | The input significand `zSig' has its binary point between bits 22
  3035. | and 23, which is 13 bits to the left of the usual location. This shifted
  3036. | significand must be normalized or smaller. If `zSig' is not normalized,
  3037. | `zExp' must be 0; in that case, the result returned is a subnormal number,
  3038. | and it must not require rounding. In the usual case that `zSig' is
  3039. | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
  3040. | Note the slightly odd position of the binary point in zSig compared with the
  3041. | other roundAndPackFloat functions. This should probably be fixed if we
  3042. | need to implement more float16 routines than just conversion.
  3043. | The handling of underflow and overflow follows the IEC/IEEE Standard for
  3044. | Binary Floating-Point Arithmetic.
  3045. *----------------------------------------------------------------------------*/
  3046. static float16 roundAndPackFloat16(flag zSign, int zExp,
  3047. uint32_t zSig, flag ieee,
  3048. float_status *status)
  3049. {
  3050. int maxexp = ieee ? 29 : 30;
  3051. uint32_t mask;
  3052. uint32_t increment;
  3053. bool rounding_bumps_exp;
  3054. bool is_tiny = false;
  3055. /* Calculate the mask of bits of the mantissa which are not
  3056. * representable in half-precision and will be lost.
  3057. */
  3058. if (zExp < 1) {
  3059. /* Will be denormal in halfprec */
  3060. mask = 0x00ffffff;
  3061. if (zExp >= -11) {
  3062. mask >>= 11 + zExp;
  3063. }
  3064. } else {
  3065. /* Normal number in halfprec */
  3066. mask = 0x00001fff;
  3067. }
  3068. switch (status->float_rounding_mode) {
  3069. case float_round_nearest_even:
  3070. increment = (mask + 1) >> 1;
  3071. if ((zSig & mask) == increment) {
  3072. increment = zSig & (increment << 1);
  3073. }
  3074. break;
  3075. case float_round_ties_away:
  3076. increment = (mask + 1) >> 1;
  3077. break;
  3078. case float_round_up:
  3079. increment = zSign ? 0 : mask;
  3080. break;
  3081. case float_round_down:
  3082. increment = zSign ? mask : 0;
  3083. break;
  3084. default: /* round_to_zero */
  3085. increment = 0;
  3086. break;
  3087. }
  3088. rounding_bumps_exp = (zSig + increment >= 0x01000000);
  3089. if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
  3090. if (ieee) {
  3091. float_raise(float_flag_overflow | float_flag_inexact, status);
  3092. return packFloat16(zSign, 0x1f, 0);
  3093. } else {
  3094. float_raise(float_flag_invalid, status);
  3095. return packFloat16(zSign, 0x1f, 0x3ff);
  3096. }
  3097. }
  3098. if (zExp < 0) {
  3099. /* Note that flush-to-zero does not affect half-precision results */
  3100. is_tiny =
  3101. (status->float_detect_tininess == float_tininess_before_rounding)
  3102. || (zExp < -1)
  3103. || (!rounding_bumps_exp);
  3104. }
  3105. if (zSig & mask) {
  3106. float_raise(float_flag_inexact, status);
  3107. if (is_tiny) {
  3108. float_raise(float_flag_underflow, status);
  3109. }
  3110. }
  3111. zSig += increment;
  3112. if (rounding_bumps_exp) {
  3113. zSig >>= 1;
  3114. zExp++;
  3115. }
  3116. if (zExp < -10) {
  3117. return packFloat16(zSign, 0, 0);
  3118. }
  3119. if (zExp < 0) {
  3120. zSig >>= -zExp;
  3121. zExp = 0;
  3122. }
  3123. return packFloat16(zSign, zExp, zSig >> 13);
  3124. }
  /*----------------------------------------------------------------------------
  | Normalizes the subnormal half-precision significand `aSig'.  The
  | significand is shifted left until its most significant set bit reaches
  | bit 10; the shifted value is stored through `zSigPtr' and the matching
  | exponent (1 minus the shift distance) through `zExpPtr'.
  *----------------------------------------------------------------------------*/
  static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                        uint32_t *zSigPtr)
  {
      int8_t shift;

      /* 21 leading zeros correspond to a significand whose top bit is bit 10 */
      shift = countLeadingZeros32(aSig) - 21;
      *zSigPtr = aSig << shift;
      *zExpPtr = 1 - shift;
  }
  3132. /* Half precision floats come in two formats: standard IEEE and "ARM" format.
  3133. The latter gains extra exponent range by omitting the NaN/Inf encodings. */
  3134. float32 float16_to_float32(float16 a, flag ieee, float_status *status)
  3135. {
  3136. flag aSign;
  3137. int aExp;
  3138. uint32_t aSig;
  3139. aSign = extractFloat16Sign(a);
  3140. aExp = extractFloat16Exp(a);
  3141. aSig = extractFloat16Frac(a);
  3142. if (aExp == 0x1f && ieee) {
  3143. if (aSig) {
  3144. return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
  3145. }
  3146. return packFloat32(aSign, 0xff, 0);
  3147. }
  3148. if (aExp == 0) {
  3149. if (aSig == 0) {
  3150. return packFloat32(aSign, 0, 0);
  3151. }
  3152. normalizeFloat16Subnormal(aSig, &aExp, &aSig);
  3153. aExp--;
  3154. }
  3155. return packFloat32( aSign, aExp + 0x70, aSig << 13);
  3156. }
  3157. float16 float32_to_float16(float32 a, flag ieee, float_status *status)
  3158. {
  3159. flag aSign;
  3160. int aExp;
  3161. uint32_t aSig;
  3162. a = float32_squash_input_denormal(a, status);
  3163. aSig = extractFloat32Frac( a );
  3164. aExp = extractFloat32Exp( a );
  3165. aSign = extractFloat32Sign( a );
  3166. if ( aExp == 0xFF ) {
  3167. if (aSig) {
  3168. /* Input is a NaN */
  3169. if (!ieee) {
  3170. float_raise(float_flag_invalid, status);
  3171. return packFloat16(aSign, 0, 0);
  3172. }
  3173. return commonNaNToFloat16(
  3174. float32ToCommonNaN(a, status), status);
  3175. }
  3176. /* Infinity */
  3177. if (!ieee) {
  3178. float_raise(float_flag_invalid, status);
  3179. return packFloat16(aSign, 0x1f, 0x3ff);
  3180. }
  3181. return packFloat16(aSign, 0x1f, 0);
  3182. }
  3183. if (aExp == 0 && aSig == 0) {
  3184. return packFloat16(aSign, 0, 0);
  3185. }
  3186. /* Decimal point between bits 22 and 23. Note that we add the 1 bit
  3187. * even if the input is denormal; however this is harmless because
  3188. * the largest possible single-precision denormal is still smaller
  3189. * than the smallest representable half-precision denormal, and so we
  3190. * will end up ignoring aSig and returning via the "always return zero"
  3191. * codepath.
  3192. */
  3193. aSig |= 0x00800000;
  3194. aExp -= 0x71;
  3195. return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
  3196. }
  3197. float64 float16_to_float64(float16 a, flag ieee, float_status *status)
  3198. {
  3199. flag aSign;
  3200. int aExp;
  3201. uint32_t aSig;
  3202. aSign = extractFloat16Sign(a);
  3203. aExp = extractFloat16Exp(a);
  3204. aSig = extractFloat16Frac(a);
  3205. if (aExp == 0x1f && ieee) {
  3206. if (aSig) {
  3207. return commonNaNToFloat64(
  3208. float16ToCommonNaN(a, status), status);
  3209. }
  3210. return packFloat64(aSign, 0x7ff, 0);
  3211. }
  3212. if (aExp == 0) {
  3213. if (aSig == 0) {
  3214. return packFloat64(aSign, 0, 0);
  3215. }
  3216. normalizeFloat16Subnormal(aSig, &aExp, &aSig);
  3217. aExp--;
  3218. }
  3219. return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
  3220. }
  3221. float16 float64_to_float16(float64 a, flag ieee, float_status *status)
  3222. {
  3223. flag aSign;
  3224. int aExp;
  3225. uint64_t aSig;
  3226. uint32_t zSig;
  3227. a = float64_squash_input_denormal(a, status);
  3228. aSig = extractFloat64Frac(a);
  3229. aExp = extractFloat64Exp(a);
  3230. aSign = extractFloat64Sign(a);
  3231. if (aExp == 0x7FF) {
  3232. if (aSig) {
  3233. /* Input is a NaN */
  3234. if (!ieee) {
  3235. float_raise(float_flag_invalid, status);
  3236. return packFloat16(aSign, 0, 0);
  3237. }
  3238. return commonNaNToFloat16(
  3239. float64ToCommonNaN(a, status), status);
  3240. }
  3241. /* Infinity */
  3242. if (!ieee) {
  3243. float_raise(float_flag_invalid, status);
  3244. return packFloat16(aSign, 0x1f, 0x3ff);
  3245. }
  3246. return packFloat16(aSign, 0x1f, 0);
  3247. }
  3248. shift64RightJamming(aSig, 29, &aSig);
  3249. zSig = aSig;
  3250. if (aExp == 0 && zSig == 0) {
  3251. return packFloat16(aSign, 0, 0);
  3252. }
  3253. /* Decimal point between bits 22 and 23. Note that we add the 1 bit
  3254. * even if the input is denormal; however this is harmless because
  3255. * the largest possible single-precision denormal is still smaller
  3256. * than the smallest representable half-precision denormal, and so we
  3257. * will end up ignoring aSig and returning via the "always return zero"
  3258. * codepath.
  3259. */
  3260. zSig |= 0x00800000;
  3261. aExp -= 0x3F1;
  3262. return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
  3263. }
  3264. /*----------------------------------------------------------------------------
  3265. | Returns the result of converting the double-precision floating-point value
  3266. | `a' to the extended double-precision floating-point format. The conversion
  3267. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  3268. | Arithmetic.
  3269. *----------------------------------------------------------------------------*/
  3270. floatx80 float64_to_floatx80(float64 a, float_status *status)
  3271. {
  3272. flag aSign;
  3273. int aExp;
  3274. uint64_t aSig;
  3275. a = float64_squash_input_denormal(a, status);
  3276. aSig = extractFloat64Frac( a );
  3277. aExp = extractFloat64Exp( a );
  3278. aSign = extractFloat64Sign( a );
  3279. if ( aExp == 0x7FF ) {
  3280. if (aSig) {
  3281. return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
  3282. }
  3283. return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  3284. }
  3285. if ( aExp == 0 ) {
  3286. if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
  3287. normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  3288. }
  3289. return
  3290. packFloatx80(
  3291. aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
  3292. }
  3293. /*----------------------------------------------------------------------------
  3294. | Returns the result of converting the double-precision floating-point value
  3295. | `a' to the quadruple-precision floating-point format. The conversion is
  3296. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  3297. | Arithmetic.
  3298. *----------------------------------------------------------------------------*/
  3299. float128 float64_to_float128(float64 a, float_status *status)
  3300. {
  3301. flag aSign;
  3302. int aExp;
  3303. uint64_t aSig, zSig0, zSig1;
  3304. a = float64_squash_input_denormal(a, status);
  3305. aSig = extractFloat64Frac( a );
  3306. aExp = extractFloat64Exp( a );
  3307. aSign = extractFloat64Sign( a );
  3308. if ( aExp == 0x7FF ) {
  3309. if (aSig) {
  3310. return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
  3311. }
  3312. return packFloat128( aSign, 0x7FFF, 0, 0 );
  3313. }
  3314. if ( aExp == 0 ) {
  3315. if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
  3316. normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  3317. --aExp;
  3318. }
  3319. shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
  3320. return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
  3321. }
  3322. /*----------------------------------------------------------------------------
  3323. | Rounds the double-precision floating-point value `a' to an integer, and
  3324. | returns the result as a double-precision floating-point value. The
  3325. | operation is performed according to the IEC/IEEE Standard for Binary
  3326. | Floating-Point Arithmetic.
  3327. *----------------------------------------------------------------------------*/
  3328. float64 float64_round_to_int(float64 a, float_status *status)
  3329. {
  3330. flag aSign;
  3331. int aExp;
  3332. uint64_t lastBitMask, roundBitsMask;
  3333. uint64_t z;
  3334. a = float64_squash_input_denormal(a, status);
  3335. aExp = extractFloat64Exp( a );
  3336. if ( 0x433 <= aExp ) {
  3337. if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
  3338. return propagateFloat64NaN(a, a, status);
  3339. }
  3340. return a;
  3341. }
  3342. if ( aExp < 0x3FF ) {
  3343. if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
  3344. status->float_exception_flags |= float_flag_inexact;
  3345. aSign = extractFloat64Sign( a );
  3346. switch (status->float_rounding_mode) {
  3347. case float_round_nearest_even:
  3348. if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
  3349. return packFloat64( aSign, 0x3FF, 0 );
  3350. }
  3351. break;
  3352. case float_round_ties_away:
  3353. if (aExp == 0x3FE) {
  3354. return packFloat64(aSign, 0x3ff, 0);
  3355. }
  3356. break;
  3357. case float_round_down:
  3358. return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
  3359. case float_round_up:
  3360. return make_float64(
  3361. aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
  3362. }
  3363. return packFloat64( aSign, 0, 0 );
  3364. }
  3365. lastBitMask = 1;
  3366. lastBitMask <<= 0x433 - aExp;
  3367. roundBitsMask = lastBitMask - 1;
  3368. z = float64_val(a);
  3369. switch (status->float_rounding_mode) {
  3370. case float_round_nearest_even:
  3371. z += lastBitMask >> 1;
  3372. if ((z & roundBitsMask) == 0) {
  3373. z &= ~lastBitMask;
  3374. }
  3375. break;
  3376. case float_round_ties_away:
  3377. z += lastBitMask >> 1;
  3378. break;
  3379. case float_round_to_zero:
  3380. break;
  3381. case float_round_up:
  3382. if (!extractFloat64Sign(make_float64(z))) {
  3383. z += roundBitsMask;
  3384. }
  3385. break;
  3386. case float_round_down:
  3387. if (extractFloat64Sign(make_float64(z))) {
  3388. z += roundBitsMask;
  3389. }
  3390. break;
  3391. default:
  3392. abort();
  3393. }
  3394. z &= ~ roundBitsMask;
  3395. if (z != float64_val(a)) {
  3396. status->float_exception_flags |= float_flag_inexact;
  3397. }
  3398. return make_float64(z);
  3399. }
  3400. float64 float64_trunc_to_int(float64 a, float_status *status)
  3401. {
  3402. int oldmode;
  3403. float64 res;
  3404. oldmode = status->float_rounding_mode;
  3405. status->float_rounding_mode = float_round_to_zero;
  3406. res = float64_round_to_int(a, status);
  3407. status->float_rounding_mode = oldmode;
  3408. return res;
  3409. }
  3410. /*----------------------------------------------------------------------------
  3411. | Returns the result of adding the absolute values of the double-precision
  3412. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  3413. | before being returned. `zSign' is ignored if the result is a NaN.
  3414. | The addition is performed according to the IEC/IEEE Standard for Binary
  3415. | Floating-Point Arithmetic.
  3416. *----------------------------------------------------------------------------*/
  3417. static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
  3418. float_status *status)
  3419. {
  3420. int aExp, bExp, zExp;
  3421. uint64_t aSig, bSig, zSig;
  3422. int expDiff;
  3423. aSig = extractFloat64Frac( a );
  3424. aExp = extractFloat64Exp( a );
  3425. bSig = extractFloat64Frac( b );
  3426. bExp = extractFloat64Exp( b );
  3427. expDiff = aExp - bExp;
  3428. aSig <<= 9;
  3429. bSig <<= 9;
  3430. if ( 0 < expDiff ) {
  3431. if ( aExp == 0x7FF ) {
  3432. if (aSig) {
  3433. return propagateFloat64NaN(a, b, status);
  3434. }
  3435. return a;
  3436. }
  3437. if ( bExp == 0 ) {
  3438. --expDiff;
  3439. }
  3440. else {
  3441. bSig |= LIT64( 0x2000000000000000 );
  3442. }
  3443. shift64RightJamming( bSig, expDiff, &bSig );
  3444. zExp = aExp;
  3445. }
  3446. else if ( expDiff < 0 ) {
  3447. if ( bExp == 0x7FF ) {
  3448. if (bSig) {
  3449. return propagateFloat64NaN(a, b, status);
  3450. }
  3451. return packFloat64( zSign, 0x7FF, 0 );
  3452. }
  3453. if ( aExp == 0 ) {
  3454. ++expDiff;
  3455. }
  3456. else {
  3457. aSig |= LIT64( 0x2000000000000000 );
  3458. }
  3459. shift64RightJamming( aSig, - expDiff, &aSig );
  3460. zExp = bExp;
  3461. }
  3462. else {
  3463. if ( aExp == 0x7FF ) {
  3464. if (aSig | bSig) {
  3465. return propagateFloat64NaN(a, b, status);
  3466. }
  3467. return a;
  3468. }
  3469. if ( aExp == 0 ) {
  3470. if (status->flush_to_zero) {
  3471. if (aSig | bSig) {
  3472. float_raise(float_flag_output_denormal, status);
  3473. }
  3474. return packFloat64(zSign, 0, 0);
  3475. }
  3476. return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
  3477. }
  3478. zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
  3479. zExp = aExp;
  3480. goto roundAndPack;
  3481. }
  3482. aSig |= LIT64( 0x2000000000000000 );
  3483. zSig = ( aSig + bSig )<<1;
  3484. --zExp;
  3485. if ( (int64_t) zSig < 0 ) {
  3486. zSig = aSig + bSig;
  3487. ++zExp;
  3488. }
  3489. roundAndPack:
  3490. return roundAndPackFloat64(zSign, zExp, zSig, status);
  3491. }
  3492. /*----------------------------------------------------------------------------
  3493. | Returns the result of subtracting the absolute values of the double-
  3494. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  3495. | difference is negated before being returned. `zSign' is ignored if the
  3496. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  3497. | Standard for Binary Floating-Point Arithmetic.
  3498. *----------------------------------------------------------------------------*/
  3499. static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
  3500. float_status *status)
  3501. {
  3502. int aExp, bExp, zExp;
  3503. uint64_t aSig, bSig, zSig;
  3504. int expDiff;
  3505. aSig = extractFloat64Frac( a );
  3506. aExp = extractFloat64Exp( a );
  3507. bSig = extractFloat64Frac( b );
  3508. bExp = extractFloat64Exp( b );
  3509. expDiff = aExp - bExp;
  3510. aSig <<= 10;
  3511. bSig <<= 10;
  3512. if ( 0 < expDiff ) goto aExpBigger;
  3513. if ( expDiff < 0 ) goto bExpBigger;
  3514. if ( aExp == 0x7FF ) {
  3515. if (aSig | bSig) {
  3516. return propagateFloat64NaN(a, b, status);
  3517. }
  3518. float_raise(float_flag_invalid, status);
  3519. return float64_default_nan(status);
  3520. }
  3521. if ( aExp == 0 ) {
  3522. aExp = 1;
  3523. bExp = 1;
  3524. }
  3525. if ( bSig < aSig ) goto aBigger;
  3526. if ( aSig < bSig ) goto bBigger;
  3527. return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
  3528. bExpBigger:
  3529. if ( bExp == 0x7FF ) {
  3530. if (bSig) {
  3531. return propagateFloat64NaN(a, b, status);
  3532. }
  3533. return packFloat64( zSign ^ 1, 0x7FF, 0 );
  3534. }
  3535. if ( aExp == 0 ) {
  3536. ++expDiff;
  3537. }
  3538. else {
  3539. aSig |= LIT64( 0x4000000000000000 );
  3540. }
  3541. shift64RightJamming( aSig, - expDiff, &aSig );
  3542. bSig |= LIT64( 0x4000000000000000 );
  3543. bBigger:
  3544. zSig = bSig - aSig;
  3545. zExp = bExp;
  3546. zSign ^= 1;
  3547. goto normalizeRoundAndPack;
  3548. aExpBigger:
  3549. if ( aExp == 0x7FF ) {
  3550. if (aSig) {
  3551. return propagateFloat64NaN(a, b, status);
  3552. }
  3553. return a;
  3554. }
  3555. if ( bExp == 0 ) {
  3556. --expDiff;
  3557. }
  3558. else {
  3559. bSig |= LIT64( 0x4000000000000000 );
  3560. }
  3561. shift64RightJamming( bSig, expDiff, &bSig );
  3562. aSig |= LIT64( 0x4000000000000000 );
  3563. aBigger:
  3564. zSig = aSig - bSig;
  3565. zExp = aExp;
  3566. normalizeRoundAndPack:
  3567. --zExp;
  3568. return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
  3569. }
  3570. /*----------------------------------------------------------------------------
  3571. | Returns the result of adding the double-precision floating-point values `a'
  3572. | and `b'. The operation is performed according to the IEC/IEEE Standard for
  3573. | Binary Floating-Point Arithmetic.
  3574. *----------------------------------------------------------------------------*/
  3575. float64 float64_add(float64 a, float64 b, float_status *status)
  3576. {
  3577. flag aSign, bSign;
  3578. a = float64_squash_input_denormal(a, status);
  3579. b = float64_squash_input_denormal(b, status);
  3580. aSign = extractFloat64Sign( a );
  3581. bSign = extractFloat64Sign( b );
  3582. if ( aSign == bSign ) {
  3583. return addFloat64Sigs(a, b, aSign, status);
  3584. }
  3585. else {
  3586. return subFloat64Sigs(a, b, aSign, status);
  3587. }
  3588. }
  3589. /*----------------------------------------------------------------------------
  3590. | Returns the result of subtracting the double-precision floating-point values
  3591. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3592. | for Binary Floating-Point Arithmetic.
  3593. *----------------------------------------------------------------------------*/
  3594. float64 float64_sub(float64 a, float64 b, float_status *status)
  3595. {
  3596. flag aSign, bSign;
  3597. a = float64_squash_input_denormal(a, status);
  3598. b = float64_squash_input_denormal(b, status);
  3599. aSign = extractFloat64Sign( a );
  3600. bSign = extractFloat64Sign( b );
  3601. if ( aSign == bSign ) {
  3602. return subFloat64Sigs(a, b, aSign, status);
  3603. }
  3604. else {
  3605. return addFloat64Sigs(a, b, aSign, status);
  3606. }
  3607. }
  3608. /*----------------------------------------------------------------------------
  3609. | Returns the result of multiplying the double-precision floating-point values
  3610. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  3611. | for Binary Floating-Point Arithmetic.
  3612. *----------------------------------------------------------------------------*/
  3613. float64 float64_mul(float64 a, float64 b, float_status *status)
  3614. {
  3615. flag aSign, bSign, zSign;
  3616. int aExp, bExp, zExp;
  3617. uint64_t aSig, bSig, zSig0, zSig1;
  3618. a = float64_squash_input_denormal(a, status);
  3619. b = float64_squash_input_denormal(b, status);
  3620. aSig = extractFloat64Frac( a );
  3621. aExp = extractFloat64Exp( a );
  3622. aSign = extractFloat64Sign( a );
  3623. bSig = extractFloat64Frac( b );
  3624. bExp = extractFloat64Exp( b );
  3625. bSign = extractFloat64Sign( b );
  3626. zSign = aSign ^ bSign;
  3627. if ( aExp == 0x7FF ) {
  3628. if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
  3629. return propagateFloat64NaN(a, b, status);
  3630. }
  3631. if ( ( bExp | bSig ) == 0 ) {
  3632. float_raise(float_flag_invalid, status);
  3633. return float64_default_nan(status);
  3634. }
  3635. return packFloat64( zSign, 0x7FF, 0 );
  3636. }
  3637. if ( bExp == 0x7FF ) {
  3638. if (bSig) {
  3639. return propagateFloat64NaN(a, b, status);
  3640. }
  3641. if ( ( aExp | aSig ) == 0 ) {
  3642. float_raise(float_flag_invalid, status);
  3643. return float64_default_nan(status);
  3644. }
  3645. return packFloat64( zSign, 0x7FF, 0 );
  3646. }
  3647. if ( aExp == 0 ) {
  3648. if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
  3649. normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  3650. }
  3651. if ( bExp == 0 ) {
  3652. if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
  3653. normalizeFloat64Subnormal( bSig, &bExp, &bSig );
  3654. }
  3655. zExp = aExp + bExp - 0x3FF;
  3656. aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
  3657. bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
  3658. mul64To128( aSig, bSig, &zSig0, &zSig1 );
  3659. zSig0 |= ( zSig1 != 0 );
  3660. if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
  3661. zSig0 <<= 1;
  3662. --zExp;
  3663. }
  3664. return roundAndPackFloat64(zSign, zExp, zSig0, status);
  3665. }
  3666. /*----------------------------------------------------------------------------
  3667. | Returns the result of dividing the double-precision floating-point value `a'
  3668. | by the corresponding value `b'. The operation is performed according to
  3669. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3670. *----------------------------------------------------------------------------*/
  3671. float64 float64_div(float64 a, float64 b, float_status *status)
  3672. {
  3673. flag aSign, bSign, zSign;
  3674. int aExp, bExp, zExp;
  3675. uint64_t aSig, bSig, zSig;
  3676. uint64_t rem0, rem1;
  3677. uint64_t term0, term1;
  3678. a = float64_squash_input_denormal(a, status);
  3679. b = float64_squash_input_denormal(b, status);
  3680. aSig = extractFloat64Frac( a );
  3681. aExp = extractFloat64Exp( a );
  3682. aSign = extractFloat64Sign( a );
  3683. bSig = extractFloat64Frac( b );
  3684. bExp = extractFloat64Exp( b );
  3685. bSign = extractFloat64Sign( b );
  3686. zSign = aSign ^ bSign;
  3687. if ( aExp == 0x7FF ) {
  3688. if (aSig) {
  3689. return propagateFloat64NaN(a, b, status);
  3690. }
  3691. if ( bExp == 0x7FF ) {
  3692. if (bSig) {
  3693. return propagateFloat64NaN(a, b, status);
  3694. }
  3695. float_raise(float_flag_invalid, status);
  3696. return float64_default_nan(status);
  3697. }
  3698. return packFloat64( zSign, 0x7FF, 0 );
  3699. }
  3700. if ( bExp == 0x7FF ) {
  3701. if (bSig) {
  3702. return propagateFloat64NaN(a, b, status);
  3703. }
  3704. return packFloat64( zSign, 0, 0 );
  3705. }
  3706. if ( bExp == 0 ) {
  3707. if ( bSig == 0 ) {
  3708. if ( ( aExp | aSig ) == 0 ) {
  3709. float_raise(float_flag_invalid, status);
  3710. return float64_default_nan(status);
  3711. }
  3712. float_raise(float_flag_divbyzero, status);
  3713. return packFloat64( zSign, 0x7FF, 0 );
  3714. }
  3715. normalizeFloat64Subnormal( bSig, &bExp, &bSig );
  3716. }
  3717. if ( aExp == 0 ) {
  3718. if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
  3719. normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  3720. }
  3721. zExp = aExp - bExp + 0x3FD;
  3722. aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
  3723. bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
  3724. if ( bSig <= ( aSig + aSig ) ) {
  3725. aSig >>= 1;
  3726. ++zExp;
  3727. }
  3728. zSig = estimateDiv128To64( aSig, 0, bSig );
  3729. if ( ( zSig & 0x1FF ) <= 2 ) {
  3730. mul64To128( bSig, zSig, &term0, &term1 );
  3731. sub128( aSig, 0, term0, term1, &rem0, &rem1 );
  3732. while ( (int64_t) rem0 < 0 ) {
  3733. --zSig;
  3734. add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
  3735. }
  3736. zSig |= ( rem1 != 0 );
  3737. }
  3738. return roundAndPackFloat64(zSign, zExp, zSig, status);
  3739. }
  3740. /*----------------------------------------------------------------------------
  3741. | Returns the remainder of the double-precision floating-point value `a'
  3742. | with respect to the corresponding value `b'. The operation is performed
  3743. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  3744. *----------------------------------------------------------------------------*/
  3745. float64 float64_rem(float64 a, float64 b, float_status *status)
  3746. {
  3747. flag aSign, zSign;
  3748. int aExp, bExp, expDiff;
  3749. uint64_t aSig, bSig;
  3750. uint64_t q, alternateASig;
  3751. int64_t sigMean;
  3752. a = float64_squash_input_denormal(a, status);
  3753. b = float64_squash_input_denormal(b, status);
  3754. aSig = extractFloat64Frac( a );
  3755. aExp = extractFloat64Exp( a );
  3756. aSign = extractFloat64Sign( a );
  3757. bSig = extractFloat64Frac( b );
  3758. bExp = extractFloat64Exp( b );
  3759. if ( aExp == 0x7FF ) {
  3760. if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
  3761. return propagateFloat64NaN(a, b, status);
  3762. }
  3763. float_raise(float_flag_invalid, status);
  3764. return float64_default_nan(status);
  3765. }
  3766. if ( bExp == 0x7FF ) {
  3767. if (bSig) {
  3768. return propagateFloat64NaN(a, b, status);
  3769. }
  3770. return a;
  3771. }
  3772. if ( bExp == 0 ) {
  3773. if ( bSig == 0 ) {
  3774. float_raise(float_flag_invalid, status);
  3775. return float64_default_nan(status);
  3776. }
  3777. normalizeFloat64Subnormal( bSig, &bExp, &bSig );
  3778. }
  3779. if ( aExp == 0 ) {
  3780. if ( aSig == 0 ) return a;
  3781. normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  3782. }
  3783. expDiff = aExp - bExp;
  3784. aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
  3785. bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
  3786. if ( expDiff < 0 ) {
  3787. if ( expDiff < -1 ) return a;
  3788. aSig >>= 1;
  3789. }
  3790. q = ( bSig <= aSig );
  3791. if ( q ) aSig -= bSig;
  3792. expDiff -= 64;
  3793. while ( 0 < expDiff ) {
  3794. q = estimateDiv128To64( aSig, 0, bSig );
  3795. q = ( 2 < q ) ? q - 2 : 0;
  3796. aSig = - ( ( bSig>>2 ) * q );
  3797. expDiff -= 62;
  3798. }
  3799. expDiff += 64;
  3800. if ( 0 < expDiff ) {
  3801. q = estimateDiv128To64( aSig, 0, bSig );
  3802. q = ( 2 < q ) ? q - 2 : 0;
  3803. q >>= 64 - expDiff;
  3804. bSig >>= 2;
  3805. aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
  3806. }
  3807. else {
  3808. aSig >>= 2;
  3809. bSig >>= 2;
  3810. }
  3811. do {
  3812. alternateASig = aSig;
  3813. ++q;
  3814. aSig -= bSig;
  3815. } while ( 0 <= (int64_t) aSig );
  3816. sigMean = aSig + alternateASig;
  3817. if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
  3818. aSig = alternateASig;
  3819. }
  3820. zSign = ( (int64_t) aSig < 0 );
  3821. if ( zSign ) aSig = - aSig;
  3822. return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
  3823. }
  3824. /*----------------------------------------------------------------------------
  3825. | Returns the result of multiplying the double-precision floating-point values
  3826. | `a' and `b' then adding 'c', with no intermediate rounding step after the
  3827. | multiplication. The operation is performed according to the IEC/IEEE
  3828. | Standard for Binary Floating-Point Arithmetic 754-2008.
  3829. | The flags argument allows the caller to select negation of the
  3830. | addend, the intermediate product, or the final result. (The difference
  3831. | between this and having the caller do a separate negation is that negating
  3832. | externally will flip the sign bit on NaNs.)
  3833. *----------------------------------------------------------------------------*/
  3834. float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
  3835. float_status *status)
  3836. {
  3837. flag aSign, bSign, cSign, zSign;
  3838. int aExp, bExp, cExp, pExp, zExp, expDiff;
  3839. uint64_t aSig, bSig, cSig;
  3840. flag pInf, pZero, pSign;
  3841. uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
  3842. int shiftcount;
  3843. flag signflip, infzero;
  3844. a = float64_squash_input_denormal(a, status);
  3845. b = float64_squash_input_denormal(b, status);
  3846. c = float64_squash_input_denormal(c, status);
  3847. aSig = extractFloat64Frac(a);
  3848. aExp = extractFloat64Exp(a);
  3849. aSign = extractFloat64Sign(a);
  3850. bSig = extractFloat64Frac(b);
  3851. bExp = extractFloat64Exp(b);
  3852. bSign = extractFloat64Sign(b);
  3853. cSig = extractFloat64Frac(c);
  3854. cExp = extractFloat64Exp(c);
  3855. cSign = extractFloat64Sign(c);
  3856. infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
  3857. (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
  3858. /* It is implementation-defined whether the cases of (0,inf,qnan)
  3859. * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
  3860. * they return if they do), so we have to hand this information
  3861. * off to the target-specific pick-a-NaN routine.
  3862. */
  3863. if (((aExp == 0x7ff) && aSig) ||
  3864. ((bExp == 0x7ff) && bSig) ||
  3865. ((cExp == 0x7ff) && cSig)) {
  3866. return propagateFloat64MulAddNaN(a, b, c, infzero, status);
  3867. }
  3868. if (infzero) {
  3869. float_raise(float_flag_invalid, status);
  3870. return float64_default_nan(status);
  3871. }
  3872. if (flags & float_muladd_negate_c) {
  3873. cSign ^= 1;
  3874. }
  3875. signflip = (flags & float_muladd_negate_result) ? 1 : 0;
  3876. /* Work out the sign and type of the product */
  3877. pSign = aSign ^ bSign;
  3878. if (flags & float_muladd_negate_product) {
  3879. pSign ^= 1;
  3880. }
  3881. pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
  3882. pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
  3883. if (cExp == 0x7ff) {
  3884. if (pInf && (pSign ^ cSign)) {
  3885. /* addition of opposite-signed infinities => InvalidOperation */
  3886. float_raise(float_flag_invalid, status);
  3887. return float64_default_nan(status);
  3888. }
  3889. /* Otherwise generate an infinity of the same sign */
  3890. return packFloat64(cSign ^ signflip, 0x7ff, 0);
  3891. }
  3892. if (pInf) {
  3893. return packFloat64(pSign ^ signflip, 0x7ff, 0);
  3894. }
  3895. if (pZero) {
  3896. if (cExp == 0) {
  3897. if (cSig == 0) {
  3898. /* Adding two exact zeroes */
  3899. if (pSign == cSign) {
  3900. zSign = pSign;
  3901. } else if (status->float_rounding_mode == float_round_down) {
  3902. zSign = 1;
  3903. } else {
  3904. zSign = 0;
  3905. }
  3906. return packFloat64(zSign ^ signflip, 0, 0);
  3907. }
  3908. /* Exact zero plus a denorm */
  3909. if (status->flush_to_zero) {
  3910. float_raise(float_flag_output_denormal, status);
  3911. return packFloat64(cSign ^ signflip, 0, 0);
  3912. }
  3913. }
  3914. /* Zero plus something non-zero : just return the something */
  3915. if (flags & float_muladd_halve_result) {
  3916. if (cExp == 0) {
  3917. normalizeFloat64Subnormal(cSig, &cExp, &cSig);
  3918. }
  3919. /* Subtract one to halve, and one again because roundAndPackFloat64
  3920. * wants one less than the true exponent.
  3921. */
  3922. cExp -= 2;
  3923. cSig = (cSig | 0x0010000000000000ULL) << 10;
  3924. return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
  3925. }
  3926. return packFloat64(cSign ^ signflip, cExp, cSig);
  3927. }
  3928. if (aExp == 0) {
  3929. normalizeFloat64Subnormal(aSig, &aExp, &aSig);
  3930. }
  3931. if (bExp == 0) {
  3932. normalizeFloat64Subnormal(bSig, &bExp, &bSig);
  3933. }
  3934. /* Calculate the actual result a * b + c */
  3935. /* Multiply first; this is easy. */
  3936. /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
  3937. * because we want the true exponent, not the "one-less-than"
  3938. * flavour that roundAndPackFloat64() takes.
  3939. */
  3940. pExp = aExp + bExp - 0x3fe;
  3941. aSig = (aSig | LIT64(0x0010000000000000))<<10;
  3942. bSig = (bSig | LIT64(0x0010000000000000))<<11;
  3943. mul64To128(aSig, bSig, &pSig0, &pSig1);
  3944. if ((int64_t)(pSig0 << 1) >= 0) {
  3945. shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
  3946. pExp--;
  3947. }
  3948. zSign = pSign ^ signflip;
  3949. /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
  3950. * bit in position 126.
  3951. */
  3952. if (cExp == 0) {
  3953. if (!cSig) {
  3954. /* Throw out the special case of c being an exact zero now */
  3955. shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
  3956. if (flags & float_muladd_halve_result) {
  3957. pExp--;
  3958. }
  3959. return roundAndPackFloat64(zSign, pExp - 1,
  3960. pSig1, status);
  3961. }
  3962. normalizeFloat64Subnormal(cSig, &cExp, &cSig);
  3963. }
  3964. /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
  3965. * significand of the addend, with the explicit bit in position 126.
  3966. */
  3967. cSig0 = cSig << (126 - 64 - 52);
  3968. cSig1 = 0;
  3969. cSig0 |= LIT64(0x4000000000000000);
  3970. expDiff = pExp - cExp;
  3971. if (pSign == cSign) {
  3972. /* Addition */
  3973. if (expDiff > 0) {
  3974. /* scale c to match p */
  3975. shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
  3976. zExp = pExp;
  3977. } else if (expDiff < 0) {
  3978. /* scale p to match c */
  3979. shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
  3980. zExp = cExp;
  3981. } else {
  3982. /* no scaling needed */
  3983. zExp = cExp;
  3984. }
  3985. /* Add significands and make sure explicit bit ends up in posn 126 */
  3986. add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
  3987. if ((int64_t)zSig0 < 0) {
  3988. shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
  3989. } else {
  3990. zExp--;
  3991. }
  3992. shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
  3993. if (flags & float_muladd_halve_result) {
  3994. zExp--;
  3995. }
  3996. return roundAndPackFloat64(zSign, zExp, zSig1, status);
  3997. } else {
  3998. /* Subtraction */
  3999. if (expDiff > 0) {
  4000. shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
  4001. sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
  4002. zExp = pExp;
  4003. } else if (expDiff < 0) {
  4004. shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
  4005. sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
  4006. zExp = cExp;
  4007. zSign ^= 1;
  4008. } else {
  4009. zExp = pExp;
  4010. if (lt128(cSig0, cSig1, pSig0, pSig1)) {
  4011. sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
  4012. } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
  4013. sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
  4014. zSign ^= 1;
  4015. } else {
  4016. /* Exact zero */
  4017. zSign = signflip;
  4018. if (status->float_rounding_mode == float_round_down) {
  4019. zSign ^= 1;
  4020. }
  4021. return packFloat64(zSign, 0, 0);
  4022. }
  4023. }
  4024. --zExp;
  4025. /* Do the equivalent of normalizeRoundAndPackFloat64() but
  4026. * starting with the significand in a pair of uint64_t.
  4027. */
  4028. if (zSig0) {
  4029. shiftcount = countLeadingZeros64(zSig0) - 1;
  4030. shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
  4031. if (zSig1) {
  4032. zSig0 |= 1;
  4033. }
  4034. zExp -= shiftcount;
  4035. } else {
  4036. shiftcount = countLeadingZeros64(zSig1);
  4037. if (shiftcount == 0) {
  4038. zSig0 = (zSig1 >> 1) | (zSig1 & 1);
  4039. zExp -= 63;
  4040. } else {
  4041. shiftcount--;
  4042. zSig0 = zSig1 << shiftcount;
  4043. zExp -= (shiftcount + 64);
  4044. }
  4045. }
  4046. if (flags & float_muladd_halve_result) {
  4047. zExp--;
  4048. }
  4049. return roundAndPackFloat64(zSign, zExp, zSig0, status);
  4050. }
  4051. }
  4052. /*----------------------------------------------------------------------------
  4053. | Returns the square root of the double-precision floating-point value `a'.
  4054. | The operation is performed according to the IEC/IEEE Standard for Binary
  4055. | Floating-Point Arithmetic.
  4056. *----------------------------------------------------------------------------*/
  4057. float64 float64_sqrt(float64 a, float_status *status)
  4058. {
  4059. flag aSign;
  4060. int aExp, zExp;
  4061. uint64_t aSig, zSig, doubleZSig;
  4062. uint64_t rem0, rem1, term0, term1;
  4063. a = float64_squash_input_denormal(a, status);
  4064. aSig = extractFloat64Frac( a );
  4065. aExp = extractFloat64Exp( a );
  4066. aSign = extractFloat64Sign( a );
  4067. if ( aExp == 0x7FF ) {
  4068. if (aSig) {
  4069. return propagateFloat64NaN(a, a, status);
  4070. }
  4071. if ( ! aSign ) return a;
  4072. float_raise(float_flag_invalid, status);
  4073. return float64_default_nan(status);
  4074. }
  4075. if ( aSign ) {
  4076. if ( ( aExp | aSig ) == 0 ) return a;
  4077. float_raise(float_flag_invalid, status);
  4078. return float64_default_nan(status);
  4079. }
  4080. if ( aExp == 0 ) {
  4081. if ( aSig == 0 ) return float64_zero;
  4082. normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  4083. }
  4084. zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
  4085. aSig |= LIT64( 0x0010000000000000 );
  4086. zSig = estimateSqrt32( aExp, aSig>>21 );
  4087. aSig <<= 9 - ( aExp & 1 );
  4088. zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
  4089. if ( ( zSig & 0x1FF ) <= 5 ) {
  4090. doubleZSig = zSig<<1;
  4091. mul64To128( zSig, zSig, &term0, &term1 );
  4092. sub128( aSig, 0, term0, term1, &rem0, &rem1 );
  4093. while ( (int64_t) rem0 < 0 ) {
  4094. --zSig;
  4095. doubleZSig -= 2;
  4096. add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
  4097. }
  4098. zSig |= ( ( rem0 | rem1 ) != 0 );
  4099. }
  4100. return roundAndPackFloat64(0, zExp, zSig, status);
  4101. }
  4102. /*----------------------------------------------------------------------------
  4103. | Returns the binary log of the double-precision floating-point value `a'.
  4104. | The operation is performed according to the IEC/IEEE Standard for Binary
  4105. | Floating-Point Arithmetic.
  4106. *----------------------------------------------------------------------------*/
  4107. float64 float64_log2(float64 a, float_status *status)
  4108. {
  4109. flag aSign, zSign;
  4110. int aExp;
  4111. uint64_t aSig, aSig0, aSig1, zSig, i;
  4112. a = float64_squash_input_denormal(a, status);
  4113. aSig = extractFloat64Frac( a );
  4114. aExp = extractFloat64Exp( a );
  4115. aSign = extractFloat64Sign( a );
  4116. if ( aExp == 0 ) {
  4117. if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
  4118. normalizeFloat64Subnormal( aSig, &aExp, &aSig );
  4119. }
  4120. if ( aSign ) {
  4121. float_raise(float_flag_invalid, status);
  4122. return float64_default_nan(status);
  4123. }
  4124. if ( aExp == 0x7FF ) {
  4125. if (aSig) {
  4126. return propagateFloat64NaN(a, float64_zero, status);
  4127. }
  4128. return a;
  4129. }
  4130. aExp -= 0x3FF;
  4131. aSig |= LIT64( 0x0010000000000000 );
  4132. zSign = aExp < 0;
  4133. zSig = (uint64_t)aExp << 52;
  4134. for (i = 1LL << 51; i > 0; i >>= 1) {
  4135. mul64To128( aSig, aSig, &aSig0, &aSig1 );
  4136. aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
  4137. if ( aSig & LIT64( 0x0020000000000000 ) ) {
  4138. aSig >>= 1;
  4139. zSig |= i;
  4140. }
  4141. }
  4142. if ( zSign )
  4143. zSig = -zSig;
  4144. return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
  4145. }
  4146. /*----------------------------------------------------------------------------
  4147. | Returns 1 if the double-precision floating-point value `a' is equal to the
  4148. | corresponding value `b', and 0 otherwise. The invalid exception is raised
  4149. | if either operand is a NaN. Otherwise, the comparison is performed
  4150. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4151. *----------------------------------------------------------------------------*/
  4152. int float64_eq(float64 a, float64 b, float_status *status)
  4153. {
  4154. uint64_t av, bv;
  4155. a = float64_squash_input_denormal(a, status);
  4156. b = float64_squash_input_denormal(b, status);
  4157. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4158. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4159. ) {
  4160. float_raise(float_flag_invalid, status);
  4161. return 0;
  4162. }
  4163. av = float64_val(a);
  4164. bv = float64_val(b);
  4165. return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
  4166. }
  4167. /*----------------------------------------------------------------------------
  4168. | Returns 1 if the double-precision floating-point value `a' is less than or
  4169. | equal to the corresponding value `b', and 0 otherwise. The invalid
  4170. | exception is raised if either operand is a NaN. The comparison is performed
  4171. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4172. *----------------------------------------------------------------------------*/
  4173. int float64_le(float64 a, float64 b, float_status *status)
  4174. {
  4175. flag aSign, bSign;
  4176. uint64_t av, bv;
  4177. a = float64_squash_input_denormal(a, status);
  4178. b = float64_squash_input_denormal(b, status);
  4179. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4180. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4181. ) {
  4182. float_raise(float_flag_invalid, status);
  4183. return 0;
  4184. }
  4185. aSign = extractFloat64Sign( a );
  4186. bSign = extractFloat64Sign( b );
  4187. av = float64_val(a);
  4188. bv = float64_val(b);
  4189. if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
  4190. return ( av == bv ) || ( aSign ^ ( av < bv ) );
  4191. }
  4192. /*----------------------------------------------------------------------------
  4193. | Returns 1 if the double-precision floating-point value `a' is less than
  4194. | the corresponding value `b', and 0 otherwise. The invalid exception is
  4195. | raised if either operand is a NaN. The comparison is performed according
  4196. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4197. *----------------------------------------------------------------------------*/
  4198. int float64_lt(float64 a, float64 b, float_status *status)
  4199. {
  4200. flag aSign, bSign;
  4201. uint64_t av, bv;
  4202. a = float64_squash_input_denormal(a, status);
  4203. b = float64_squash_input_denormal(b, status);
  4204. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4205. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4206. ) {
  4207. float_raise(float_flag_invalid, status);
  4208. return 0;
  4209. }
  4210. aSign = extractFloat64Sign( a );
  4211. bSign = extractFloat64Sign( b );
  4212. av = float64_val(a);
  4213. bv = float64_val(b);
  4214. if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
  4215. return ( av != bv ) && ( aSign ^ ( av < bv ) );
  4216. }
  4217. /*----------------------------------------------------------------------------
  4218. | Returns 1 if the double-precision floating-point values `a' and `b' cannot
  4219. | be compared, and 0 otherwise. The invalid exception is raised if either
  4220. | operand is a NaN. The comparison is performed according to the IEC/IEEE
  4221. | Standard for Binary Floating-Point Arithmetic.
  4222. *----------------------------------------------------------------------------*/
  4223. int float64_unordered(float64 a, float64 b, float_status *status)
  4224. {
  4225. a = float64_squash_input_denormal(a, status);
  4226. b = float64_squash_input_denormal(b, status);
  4227. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4228. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4229. ) {
  4230. float_raise(float_flag_invalid, status);
  4231. return 1;
  4232. }
  4233. return 0;
  4234. }
  4235. /*----------------------------------------------------------------------------
  4236. | Returns 1 if the double-precision floating-point value `a' is equal to the
  4237. | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4238. | exception.The comparison is performed according to the IEC/IEEE Standard
  4239. | for Binary Floating-Point Arithmetic.
  4240. *----------------------------------------------------------------------------*/
  4241. int float64_eq_quiet(float64 a, float64 b, float_status *status)
  4242. {
  4243. uint64_t av, bv;
  4244. a = float64_squash_input_denormal(a, status);
  4245. b = float64_squash_input_denormal(b, status);
  4246. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4247. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4248. ) {
  4249. if (float64_is_signaling_nan(a, status)
  4250. || float64_is_signaling_nan(b, status)) {
  4251. float_raise(float_flag_invalid, status);
  4252. }
  4253. return 0;
  4254. }
  4255. av = float64_val(a);
  4256. bv = float64_val(b);
  4257. return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
  4258. }
  4259. /*----------------------------------------------------------------------------
  4260. | Returns 1 if the double-precision floating-point value `a' is less than or
  4261. | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  4262. | cause an exception. Otherwise, the comparison is performed according to the
  4263. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4264. *----------------------------------------------------------------------------*/
  4265. int float64_le_quiet(float64 a, float64 b, float_status *status)
  4266. {
  4267. flag aSign, bSign;
  4268. uint64_t av, bv;
  4269. a = float64_squash_input_denormal(a, status);
  4270. b = float64_squash_input_denormal(b, status);
  4271. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4272. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4273. ) {
  4274. if (float64_is_signaling_nan(a, status)
  4275. || float64_is_signaling_nan(b, status)) {
  4276. float_raise(float_flag_invalid, status);
  4277. }
  4278. return 0;
  4279. }
  4280. aSign = extractFloat64Sign( a );
  4281. bSign = extractFloat64Sign( b );
  4282. av = float64_val(a);
  4283. bv = float64_val(b);
  4284. if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
  4285. return ( av == bv ) || ( aSign ^ ( av < bv ) );
  4286. }
  4287. /*----------------------------------------------------------------------------
  4288. | Returns 1 if the double-precision floating-point value `a' is less than
  4289. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  4290. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  4291. | Standard for Binary Floating-Point Arithmetic.
  4292. *----------------------------------------------------------------------------*/
  4293. int float64_lt_quiet(float64 a, float64 b, float_status *status)
  4294. {
  4295. flag aSign, bSign;
  4296. uint64_t av, bv;
  4297. a = float64_squash_input_denormal(a, status);
  4298. b = float64_squash_input_denormal(b, status);
  4299. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4300. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4301. ) {
  4302. if (float64_is_signaling_nan(a, status)
  4303. || float64_is_signaling_nan(b, status)) {
  4304. float_raise(float_flag_invalid, status);
  4305. }
  4306. return 0;
  4307. }
  4308. aSign = extractFloat64Sign( a );
  4309. bSign = extractFloat64Sign( b );
  4310. av = float64_val(a);
  4311. bv = float64_val(b);
  4312. if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
  4313. return ( av != bv ) && ( aSign ^ ( av < bv ) );
  4314. }
  4315. /*----------------------------------------------------------------------------
  4316. | Returns 1 if the double-precision floating-point values `a' and `b' cannot
  4317. | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
  4318. | comparison is performed according to the IEC/IEEE Standard for Binary
  4319. | Floating-Point Arithmetic.
  4320. *----------------------------------------------------------------------------*/
  4321. int float64_unordered_quiet(float64 a, float64 b, float_status *status)
  4322. {
  4323. a = float64_squash_input_denormal(a, status);
  4324. b = float64_squash_input_denormal(b, status);
  4325. if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
  4326. || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
  4327. ) {
  4328. if (float64_is_signaling_nan(a, status)
  4329. || float64_is_signaling_nan(b, status)) {
  4330. float_raise(float_flag_invalid, status);
  4331. }
  4332. return 1;
  4333. }
  4334. return 0;
  4335. }
  4336. /*----------------------------------------------------------------------------
  4337. | Returns the result of converting the extended double-precision floating-
  4338. | point value `a' to the 32-bit two's complement integer format. The
  4339. | conversion is performed according to the IEC/IEEE Standard for Binary
  4340. | Floating-Point Arithmetic---which means in particular that the conversion
  4341. | is rounded according to the current rounding mode. If `a' is a NaN, the
  4342. | largest positive integer is returned. Otherwise, if the conversion
  4343. | overflows, the largest integer with the same sign as `a' is returned.
  4344. *----------------------------------------------------------------------------*/
  4345. int32_t floatx80_to_int32(floatx80 a, float_status *status)
  4346. {
  4347. flag aSign;
  4348. int32_t aExp, shiftCount;
  4349. uint64_t aSig;
  4350. if (floatx80_invalid_encoding(a)) {
  4351. float_raise(float_flag_invalid, status);
  4352. return 1 << 31;
  4353. }
  4354. aSig = extractFloatx80Frac( a );
  4355. aExp = extractFloatx80Exp( a );
  4356. aSign = extractFloatx80Sign( a );
  4357. if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
  4358. shiftCount = 0x4037 - aExp;
  4359. if ( shiftCount <= 0 ) shiftCount = 1;
  4360. shift64RightJamming( aSig, shiftCount, &aSig );
  4361. return roundAndPackInt32(aSign, aSig, status);
  4362. }
  4363. /*----------------------------------------------------------------------------
  4364. | Returns the result of converting the extended double-precision floating-
  4365. | point value `a' to the 32-bit two's complement integer format. The
  4366. | conversion is performed according to the IEC/IEEE Standard for Binary
  4367. | Floating-Point Arithmetic, except that the conversion is always rounded
  4368. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  4369. | Otherwise, if the conversion overflows, the largest integer with the same
  4370. | sign as `a' is returned.
  4371. *----------------------------------------------------------------------------*/
  4372. int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
  4373. {
  4374. flag aSign;
  4375. int32_t aExp, shiftCount;
  4376. uint64_t aSig, savedASig;
  4377. int32_t z;
  4378. if (floatx80_invalid_encoding(a)) {
  4379. float_raise(float_flag_invalid, status);
  4380. return 1 << 31;
  4381. }
  4382. aSig = extractFloatx80Frac( a );
  4383. aExp = extractFloatx80Exp( a );
  4384. aSign = extractFloatx80Sign( a );
  4385. if ( 0x401E < aExp ) {
  4386. if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
  4387. goto invalid;
  4388. }
  4389. else if ( aExp < 0x3FFF ) {
  4390. if (aExp || aSig) {
  4391. status->float_exception_flags |= float_flag_inexact;
  4392. }
  4393. return 0;
  4394. }
  4395. shiftCount = 0x403E - aExp;
  4396. savedASig = aSig;
  4397. aSig >>= shiftCount;
  4398. z = aSig;
  4399. if ( aSign ) z = - z;
  4400. if ( ( z < 0 ) ^ aSign ) {
  4401. invalid:
  4402. float_raise(float_flag_invalid, status);
  4403. return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
  4404. }
  4405. if ( ( aSig<<shiftCount ) != savedASig ) {
  4406. status->float_exception_flags |= float_flag_inexact;
  4407. }
  4408. return z;
  4409. }
  4410. /*----------------------------------------------------------------------------
  4411. | Returns the result of converting the extended double-precision floating-
  4412. | point value `a' to the 64-bit two's complement integer format. The
  4413. | conversion is performed according to the IEC/IEEE Standard for Binary
  4414. | Floating-Point Arithmetic---which means in particular that the conversion
  4415. | is rounded according to the current rounding mode. If `a' is a NaN,
  4416. | the largest positive integer is returned. Otherwise, if the conversion
  4417. | overflows, the largest integer with the same sign as `a' is returned.
  4418. *----------------------------------------------------------------------------*/
  4419. int64_t floatx80_to_int64(floatx80 a, float_status *status)
  4420. {
  4421. flag aSign;
  4422. int32_t aExp, shiftCount;
  4423. uint64_t aSig, aSigExtra;
  4424. if (floatx80_invalid_encoding(a)) {
  4425. float_raise(float_flag_invalid, status);
  4426. return 1ULL << 63;
  4427. }
  4428. aSig = extractFloatx80Frac( a );
  4429. aExp = extractFloatx80Exp( a );
  4430. aSign = extractFloatx80Sign( a );
  4431. shiftCount = 0x403E - aExp;
  4432. if ( shiftCount <= 0 ) {
  4433. if ( shiftCount ) {
  4434. float_raise(float_flag_invalid, status);
  4435. if ( ! aSign
  4436. || ( ( aExp == 0x7FFF )
  4437. && ( aSig != LIT64( 0x8000000000000000 ) ) )
  4438. ) {
  4439. return LIT64( 0x7FFFFFFFFFFFFFFF );
  4440. }
  4441. return (int64_t) LIT64( 0x8000000000000000 );
  4442. }
  4443. aSigExtra = 0;
  4444. }
  4445. else {
  4446. shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
  4447. }
  4448. return roundAndPackInt64(aSign, aSig, aSigExtra, status);
  4449. }
  4450. /*----------------------------------------------------------------------------
  4451. | Returns the result of converting the extended double-precision floating-
  4452. | point value `a' to the 64-bit two's complement integer format. The
  4453. | conversion is performed according to the IEC/IEEE Standard for Binary
  4454. | Floating-Point Arithmetic, except that the conversion is always rounded
  4455. | toward zero. If `a' is a NaN, the largest positive integer is returned.
  4456. | Otherwise, if the conversion overflows, the largest integer with the same
  4457. | sign as `a' is returned.
  4458. *----------------------------------------------------------------------------*/
  4459. int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
  4460. {
  4461. flag aSign;
  4462. int32_t aExp, shiftCount;
  4463. uint64_t aSig;
  4464. int64_t z;
  4465. if (floatx80_invalid_encoding(a)) {
  4466. float_raise(float_flag_invalid, status);
  4467. return 1ULL << 63;
  4468. }
  4469. aSig = extractFloatx80Frac( a );
  4470. aExp = extractFloatx80Exp( a );
  4471. aSign = extractFloatx80Sign( a );
  4472. shiftCount = aExp - 0x403E;
  4473. if ( 0 <= shiftCount ) {
  4474. aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
  4475. if ( ( a.high != 0xC03E ) || aSig ) {
  4476. float_raise(float_flag_invalid, status);
  4477. if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
  4478. return LIT64( 0x7FFFFFFFFFFFFFFF );
  4479. }
  4480. }
  4481. return (int64_t) LIT64( 0x8000000000000000 );
  4482. }
  4483. else if ( aExp < 0x3FFF ) {
  4484. if (aExp | aSig) {
  4485. status->float_exception_flags |= float_flag_inexact;
  4486. }
  4487. return 0;
  4488. }
  4489. z = aSig>>( - shiftCount );
  4490. if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
  4491. status->float_exception_flags |= float_flag_inexact;
  4492. }
  4493. if ( aSign ) z = - z;
  4494. return z;
  4495. }
  4496. /*----------------------------------------------------------------------------
  4497. | Returns the result of converting the extended double-precision floating-
  4498. | point value `a' to the single-precision floating-point format. The
  4499. | conversion is performed according to the IEC/IEEE Standard for Binary
  4500. | Floating-Point Arithmetic.
  4501. *----------------------------------------------------------------------------*/
  4502. float32 floatx80_to_float32(floatx80 a, float_status *status)
  4503. {
  4504. flag aSign;
  4505. int32_t aExp;
  4506. uint64_t aSig;
  4507. if (floatx80_invalid_encoding(a)) {
  4508. float_raise(float_flag_invalid, status);
  4509. return float32_default_nan(status);
  4510. }
  4511. aSig = extractFloatx80Frac( a );
  4512. aExp = extractFloatx80Exp( a );
  4513. aSign = extractFloatx80Sign( a );
  4514. if ( aExp == 0x7FFF ) {
  4515. if ( (uint64_t) ( aSig<<1 ) ) {
  4516. return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
  4517. }
  4518. return packFloat32( aSign, 0xFF, 0 );
  4519. }
  4520. shift64RightJamming( aSig, 33, &aSig );
  4521. if ( aExp || aSig ) aExp -= 0x3F81;
  4522. return roundAndPackFloat32(aSign, aExp, aSig, status);
  4523. }
  4524. /*----------------------------------------------------------------------------
  4525. | Returns the result of converting the extended double-precision floating-
  4526. | point value `a' to the double-precision floating-point format. The
  4527. | conversion is performed according to the IEC/IEEE Standard for Binary
  4528. | Floating-Point Arithmetic.
  4529. *----------------------------------------------------------------------------*/
  4530. float64 floatx80_to_float64(floatx80 a, float_status *status)
  4531. {
  4532. flag aSign;
  4533. int32_t aExp;
  4534. uint64_t aSig, zSig;
  4535. if (floatx80_invalid_encoding(a)) {
  4536. float_raise(float_flag_invalid, status);
  4537. return float64_default_nan(status);
  4538. }
  4539. aSig = extractFloatx80Frac( a );
  4540. aExp = extractFloatx80Exp( a );
  4541. aSign = extractFloatx80Sign( a );
  4542. if ( aExp == 0x7FFF ) {
  4543. if ( (uint64_t) ( aSig<<1 ) ) {
  4544. return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
  4545. }
  4546. return packFloat64( aSign, 0x7FF, 0 );
  4547. }
  4548. shift64RightJamming( aSig, 1, &zSig );
  4549. if ( aExp || aSig ) aExp -= 0x3C01;
  4550. return roundAndPackFloat64(aSign, aExp, zSig, status);
  4551. }
  4552. /*----------------------------------------------------------------------------
  4553. | Returns the result of converting the extended double-precision floating-
  4554. | point value `a' to the quadruple-precision floating-point format. The
  4555. | conversion is performed according to the IEC/IEEE Standard for Binary
  4556. | Floating-Point Arithmetic.
  4557. *----------------------------------------------------------------------------*/
  4558. float128 floatx80_to_float128(floatx80 a, float_status *status)
  4559. {
  4560. flag aSign;
  4561. int aExp;
  4562. uint64_t aSig, zSig0, zSig1;
  4563. if (floatx80_invalid_encoding(a)) {
  4564. float_raise(float_flag_invalid, status);
  4565. return float128_default_nan(status);
  4566. }
  4567. aSig = extractFloatx80Frac( a );
  4568. aExp = extractFloatx80Exp( a );
  4569. aSign = extractFloatx80Sign( a );
  4570. if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
  4571. return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
  4572. }
  4573. shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
  4574. return packFloat128( aSign, aExp, zSig0, zSig1 );
  4575. }
  4576. /*----------------------------------------------------------------------------
  4577. | Rounds the extended double-precision floating-point value `a' to an integer,
  4578. | and returns the result as an extended double-precision floating-point
  4579. | value. The operation is performed according to the IEC/IEEE Standard for
  4580. | Binary Floating-Point Arithmetic.
  4581. *----------------------------------------------------------------------------*/
  4582. floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
  4583. {
  4584. flag aSign;
  4585. int32_t aExp;
  4586. uint64_t lastBitMask, roundBitsMask;
  4587. floatx80 z;
  4588. if (floatx80_invalid_encoding(a)) {
  4589. float_raise(float_flag_invalid, status);
  4590. return floatx80_default_nan(status);
  4591. }
  4592. aExp = extractFloatx80Exp( a );
  4593. if ( 0x403E <= aExp ) {
  4594. if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
  4595. return propagateFloatx80NaN(a, a, status);
  4596. }
  4597. return a;
  4598. }
  4599. if ( aExp < 0x3FFF ) {
  4600. if ( ( aExp == 0 )
  4601. && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
  4602. return a;
  4603. }
  4604. status->float_exception_flags |= float_flag_inexact;
  4605. aSign = extractFloatx80Sign( a );
  4606. switch (status->float_rounding_mode) {
  4607. case float_round_nearest_even:
  4608. if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
  4609. ) {
  4610. return
  4611. packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
  4612. }
  4613. break;
  4614. case float_round_ties_away:
  4615. if (aExp == 0x3FFE) {
  4616. return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
  4617. }
  4618. break;
  4619. case float_round_down:
  4620. return
  4621. aSign ?
  4622. packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
  4623. : packFloatx80( 0, 0, 0 );
  4624. case float_round_up:
  4625. return
  4626. aSign ? packFloatx80( 1, 0, 0 )
  4627. : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
  4628. }
  4629. return packFloatx80( aSign, 0, 0 );
  4630. }
  4631. lastBitMask = 1;
  4632. lastBitMask <<= 0x403E - aExp;
  4633. roundBitsMask = lastBitMask - 1;
  4634. z = a;
  4635. switch (status->float_rounding_mode) {
  4636. case float_round_nearest_even:
  4637. z.low += lastBitMask>>1;
  4638. if ((z.low & roundBitsMask) == 0) {
  4639. z.low &= ~lastBitMask;
  4640. }
  4641. break;
  4642. case float_round_ties_away:
  4643. z.low += lastBitMask >> 1;
  4644. break;
  4645. case float_round_to_zero:
  4646. break;
  4647. case float_round_up:
  4648. if (!extractFloatx80Sign(z)) {
  4649. z.low += roundBitsMask;
  4650. }
  4651. break;
  4652. case float_round_down:
  4653. if (extractFloatx80Sign(z)) {
  4654. z.low += roundBitsMask;
  4655. }
  4656. break;
  4657. default:
  4658. abort();
  4659. }
  4660. z.low &= ~ roundBitsMask;
  4661. if ( z.low == 0 ) {
  4662. ++z.high;
  4663. z.low = LIT64( 0x8000000000000000 );
  4664. }
  4665. if (z.low != a.low) {
  4666. status->float_exception_flags |= float_flag_inexact;
  4667. }
  4668. return z;
  4669. }
  4670. /*----------------------------------------------------------------------------
  4671. | Returns the result of adding the absolute values of the extended double-
  4672. | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
  4673. | negated before being returned. `zSign' is ignored if the result is a NaN.
  4674. | The addition is performed according to the IEC/IEEE Standard for Binary
  4675. | Floating-Point Arithmetic.
  4676. *----------------------------------------------------------------------------*/
  4677. static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
  4678. float_status *status)
  4679. {
  4680. int32_t aExp, bExp, zExp;
  4681. uint64_t aSig, bSig, zSig0, zSig1;
  4682. int32_t expDiff;
  4683. aSig = extractFloatx80Frac( a );
  4684. aExp = extractFloatx80Exp( a );
  4685. bSig = extractFloatx80Frac( b );
  4686. bExp = extractFloatx80Exp( b );
  4687. expDiff = aExp - bExp;
  4688. if ( 0 < expDiff ) {
  4689. if ( aExp == 0x7FFF ) {
  4690. if ((uint64_t)(aSig << 1)) {
  4691. return propagateFloatx80NaN(a, b, status);
  4692. }
  4693. return a;
  4694. }
  4695. if ( bExp == 0 ) --expDiff;
  4696. shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  4697. zExp = aExp;
  4698. }
  4699. else if ( expDiff < 0 ) {
  4700. if ( bExp == 0x7FFF ) {
  4701. if ((uint64_t)(bSig << 1)) {
  4702. return propagateFloatx80NaN(a, b, status);
  4703. }
  4704. return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  4705. }
  4706. if ( aExp == 0 ) ++expDiff;
  4707. shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  4708. zExp = bExp;
  4709. }
  4710. else {
  4711. if ( aExp == 0x7FFF ) {
  4712. if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
  4713. return propagateFloatx80NaN(a, b, status);
  4714. }
  4715. return a;
  4716. }
  4717. zSig1 = 0;
  4718. zSig0 = aSig + bSig;
  4719. if ( aExp == 0 ) {
  4720. normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
  4721. goto roundAndPack;
  4722. }
  4723. zExp = aExp;
  4724. goto shiftRight1;
  4725. }
  4726. zSig0 = aSig + bSig;
  4727. if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
  4728. shiftRight1:
  4729. shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
  4730. zSig0 |= LIT64( 0x8000000000000000 );
  4731. ++zExp;
  4732. roundAndPack:
  4733. return roundAndPackFloatx80(status->floatx80_rounding_precision,
  4734. zSign, zExp, zSig0, zSig1, status);
  4735. }
  4736. /*----------------------------------------------------------------------------
  4737. | Returns the result of subtracting the absolute values of the extended
  4738. | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
  4739. | difference is negated before being returned. `zSign' is ignored if the
  4740. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  4741. | Standard for Binary Floating-Point Arithmetic.
  4742. *----------------------------------------------------------------------------*/
  4743. static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
  4744. float_status *status)
  4745. {
  4746. int32_t aExp, bExp, zExp;
  4747. uint64_t aSig, bSig, zSig0, zSig1;
  4748. int32_t expDiff;
  4749. aSig = extractFloatx80Frac( a );
  4750. aExp = extractFloatx80Exp( a );
  4751. bSig = extractFloatx80Frac( b );
  4752. bExp = extractFloatx80Exp( b );
  4753. expDiff = aExp - bExp;
  4754. if ( 0 < expDiff ) goto aExpBigger;
  4755. if ( expDiff < 0 ) goto bExpBigger;
  4756. if ( aExp == 0x7FFF ) {
  4757. if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
  4758. return propagateFloatx80NaN(a, b, status);
  4759. }
  4760. float_raise(float_flag_invalid, status);
  4761. return floatx80_default_nan(status);
  4762. }
  4763. if ( aExp == 0 ) {
  4764. aExp = 1;
  4765. bExp = 1;
  4766. }
  4767. zSig1 = 0;
  4768. if ( bSig < aSig ) goto aBigger;
  4769. if ( aSig < bSig ) goto bBigger;
  4770. return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
  4771. bExpBigger:
  4772. if ( bExp == 0x7FFF ) {
  4773. if ((uint64_t)(bSig << 1)) {
  4774. return propagateFloatx80NaN(a, b, status);
  4775. }
  4776. return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
  4777. }
  4778. if ( aExp == 0 ) ++expDiff;
  4779. shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
  4780. bBigger:
  4781. sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
  4782. zExp = bExp;
  4783. zSign ^= 1;
  4784. goto normalizeRoundAndPack;
  4785. aExpBigger:
  4786. if ( aExp == 0x7FFF ) {
  4787. if ((uint64_t)(aSig << 1)) {
  4788. return propagateFloatx80NaN(a, b, status);
  4789. }
  4790. return a;
  4791. }
  4792. if ( bExp == 0 ) --expDiff;
  4793. shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
  4794. aBigger:
  4795. sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
  4796. zExp = aExp;
  4797. normalizeRoundAndPack:
  4798. return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
  4799. zSign, zExp, zSig0, zSig1, status);
  4800. }
  4801. /*----------------------------------------------------------------------------
  4802. | Returns the result of adding the extended double-precision floating-point
  4803. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  4804. | Standard for Binary Floating-Point Arithmetic.
  4805. *----------------------------------------------------------------------------*/
  4806. floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
  4807. {
  4808. flag aSign, bSign;
  4809. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  4810. float_raise(float_flag_invalid, status);
  4811. return floatx80_default_nan(status);
  4812. }
  4813. aSign = extractFloatx80Sign( a );
  4814. bSign = extractFloatx80Sign( b );
  4815. if ( aSign == bSign ) {
  4816. return addFloatx80Sigs(a, b, aSign, status);
  4817. }
  4818. else {
  4819. return subFloatx80Sigs(a, b, aSign, status);
  4820. }
  4821. }
  4822. /*----------------------------------------------------------------------------
  4823. | Returns the result of subtracting the extended double-precision floating-
  4824. | point values `a' and `b'. The operation is performed according to the
  4825. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4826. *----------------------------------------------------------------------------*/
  4827. floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
  4828. {
  4829. flag aSign, bSign;
  4830. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  4831. float_raise(float_flag_invalid, status);
  4832. return floatx80_default_nan(status);
  4833. }
  4834. aSign = extractFloatx80Sign( a );
  4835. bSign = extractFloatx80Sign( b );
  4836. if ( aSign == bSign ) {
  4837. return subFloatx80Sigs(a, b, aSign, status);
  4838. }
  4839. else {
  4840. return addFloatx80Sigs(a, b, aSign, status);
  4841. }
  4842. }
  4843. /*----------------------------------------------------------------------------
  4844. | Returns the result of multiplying the extended double-precision floating-
  4845. | point values `a' and `b'. The operation is performed according to the
  4846. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4847. *----------------------------------------------------------------------------*/
  4848. floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
  4849. {
  4850. flag aSign, bSign, zSign;
  4851. int32_t aExp, bExp, zExp;
  4852. uint64_t aSig, bSig, zSig0, zSig1;
  4853. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  4854. float_raise(float_flag_invalid, status);
  4855. return floatx80_default_nan(status);
  4856. }
  4857. aSig = extractFloatx80Frac( a );
  4858. aExp = extractFloatx80Exp( a );
  4859. aSign = extractFloatx80Sign( a );
  4860. bSig = extractFloatx80Frac( b );
  4861. bExp = extractFloatx80Exp( b );
  4862. bSign = extractFloatx80Sign( b );
  4863. zSign = aSign ^ bSign;
  4864. if ( aExp == 0x7FFF ) {
  4865. if ( (uint64_t) ( aSig<<1 )
  4866. || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
  4867. return propagateFloatx80NaN(a, b, status);
  4868. }
  4869. if ( ( bExp | bSig ) == 0 ) goto invalid;
  4870. return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  4871. }
  4872. if ( bExp == 0x7FFF ) {
  4873. if ((uint64_t)(bSig << 1)) {
  4874. return propagateFloatx80NaN(a, b, status);
  4875. }
  4876. if ( ( aExp | aSig ) == 0 ) {
  4877. invalid:
  4878. float_raise(float_flag_invalid, status);
  4879. return floatx80_default_nan(status);
  4880. }
  4881. return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  4882. }
  4883. if ( aExp == 0 ) {
  4884. if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
  4885. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  4886. }
  4887. if ( bExp == 0 ) {
  4888. if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
  4889. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  4890. }
  4891. zExp = aExp + bExp - 0x3FFE;
  4892. mul64To128( aSig, bSig, &zSig0, &zSig1 );
  4893. if ( 0 < (int64_t) zSig0 ) {
  4894. shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
  4895. --zExp;
  4896. }
  4897. return roundAndPackFloatx80(status->floatx80_rounding_precision,
  4898. zSign, zExp, zSig0, zSig1, status);
  4899. }
  4900. /*----------------------------------------------------------------------------
  4901. | Returns the result of dividing the extended double-precision floating-point
  4902. | value `a' by the corresponding value `b'. The operation is performed
  4903. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4904. *----------------------------------------------------------------------------*/
  4905. floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
  4906. {
  4907. flag aSign, bSign, zSign;
  4908. int32_t aExp, bExp, zExp;
  4909. uint64_t aSig, bSig, zSig0, zSig1;
  4910. uint64_t rem0, rem1, rem2, term0, term1, term2;
  4911. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  4912. float_raise(float_flag_invalid, status);
  4913. return floatx80_default_nan(status);
  4914. }
  4915. aSig = extractFloatx80Frac( a );
  4916. aExp = extractFloatx80Exp( a );
  4917. aSign = extractFloatx80Sign( a );
  4918. bSig = extractFloatx80Frac( b );
  4919. bExp = extractFloatx80Exp( b );
  4920. bSign = extractFloatx80Sign( b );
  4921. zSign = aSign ^ bSign;
  4922. if ( aExp == 0x7FFF ) {
  4923. if ((uint64_t)(aSig << 1)) {
  4924. return propagateFloatx80NaN(a, b, status);
  4925. }
  4926. if ( bExp == 0x7FFF ) {
  4927. if ((uint64_t)(bSig << 1)) {
  4928. return propagateFloatx80NaN(a, b, status);
  4929. }
  4930. goto invalid;
  4931. }
  4932. return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  4933. }
  4934. if ( bExp == 0x7FFF ) {
  4935. if ((uint64_t)(bSig << 1)) {
  4936. return propagateFloatx80NaN(a, b, status);
  4937. }
  4938. return packFloatx80( zSign, 0, 0 );
  4939. }
  4940. if ( bExp == 0 ) {
  4941. if ( bSig == 0 ) {
  4942. if ( ( aExp | aSig ) == 0 ) {
  4943. invalid:
  4944. float_raise(float_flag_invalid, status);
  4945. return floatx80_default_nan(status);
  4946. }
  4947. float_raise(float_flag_divbyzero, status);
  4948. return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  4949. }
  4950. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  4951. }
  4952. if ( aExp == 0 ) {
  4953. if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
  4954. normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
  4955. }
  4956. zExp = aExp - bExp + 0x3FFE;
  4957. rem1 = 0;
  4958. if ( bSig <= aSig ) {
  4959. shift128Right( aSig, 0, 1, &aSig, &rem1 );
  4960. ++zExp;
  4961. }
  4962. zSig0 = estimateDiv128To64( aSig, rem1, bSig );
  4963. mul64To128( bSig, zSig0, &term0, &term1 );
  4964. sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
  4965. while ( (int64_t) rem0 < 0 ) {
  4966. --zSig0;
  4967. add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
  4968. }
  4969. zSig1 = estimateDiv128To64( rem1, 0, bSig );
  4970. if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
  4971. mul64To128( bSig, zSig1, &term1, &term2 );
  4972. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  4973. while ( (int64_t) rem1 < 0 ) {
  4974. --zSig1;
  4975. add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
  4976. }
  4977. zSig1 |= ( ( rem1 | rem2 ) != 0 );
  4978. }
  4979. return roundAndPackFloatx80(status->floatx80_rounding_precision,
  4980. zSign, zExp, zSig0, zSig1, status);
  4981. }
  4982. /*----------------------------------------------------------------------------
  4983. | Returns the remainder of the extended double-precision floating-point value
  4984. | `a' with respect to the corresponding value `b'. The operation is performed
  4985. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  4986. *----------------------------------------------------------------------------*/
  4987. floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
  4988. {
  4989. flag aSign, zSign;
  4990. int32_t aExp, bExp, expDiff;
  4991. uint64_t aSig0, aSig1, bSig;
  4992. uint64_t q, term0, term1, alternateASig0, alternateASig1;
  4993. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  4994. float_raise(float_flag_invalid, status);
  4995. return floatx80_default_nan(status);
  4996. }
  4997. aSig0 = extractFloatx80Frac( a );
  4998. aExp = extractFloatx80Exp( a );
  4999. aSign = extractFloatx80Sign( a );
  5000. bSig = extractFloatx80Frac( b );
  5001. bExp = extractFloatx80Exp( b );
  5002. if ( aExp == 0x7FFF ) {
  5003. if ( (uint64_t) ( aSig0<<1 )
  5004. || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
  5005. return propagateFloatx80NaN(a, b, status);
  5006. }
  5007. goto invalid;
  5008. }
  5009. if ( bExp == 0x7FFF ) {
  5010. if ((uint64_t)(bSig << 1)) {
  5011. return propagateFloatx80NaN(a, b, status);
  5012. }
  5013. return a;
  5014. }
  5015. if ( bExp == 0 ) {
  5016. if ( bSig == 0 ) {
  5017. invalid:
  5018. float_raise(float_flag_invalid, status);
  5019. return floatx80_default_nan(status);
  5020. }
  5021. normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
  5022. }
  5023. if ( aExp == 0 ) {
  5024. if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
  5025. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5026. }
  5027. bSig |= LIT64( 0x8000000000000000 );
  5028. zSign = aSign;
  5029. expDiff = aExp - bExp;
  5030. aSig1 = 0;
  5031. if ( expDiff < 0 ) {
  5032. if ( expDiff < -1 ) return a;
  5033. shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
  5034. expDiff = 0;
  5035. }
  5036. q = ( bSig <= aSig0 );
  5037. if ( q ) aSig0 -= bSig;
  5038. expDiff -= 64;
  5039. while ( 0 < expDiff ) {
  5040. q = estimateDiv128To64( aSig0, aSig1, bSig );
  5041. q = ( 2 < q ) ? q - 2 : 0;
  5042. mul64To128( bSig, q, &term0, &term1 );
  5043. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5044. shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
  5045. expDiff -= 62;
  5046. }
  5047. expDiff += 64;
  5048. if ( 0 < expDiff ) {
  5049. q = estimateDiv128To64( aSig0, aSig1, bSig );
  5050. q = ( 2 < q ) ? q - 2 : 0;
  5051. q >>= 64 - expDiff;
  5052. mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
  5053. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5054. shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
  5055. while ( le128( term0, term1, aSig0, aSig1 ) ) {
  5056. ++q;
  5057. sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
  5058. }
  5059. }
  5060. else {
  5061. term1 = 0;
  5062. term0 = bSig;
  5063. }
  5064. sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
  5065. if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5066. || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
  5067. && ( q & 1 ) )
  5068. ) {
  5069. aSig0 = alternateASig0;
  5070. aSig1 = alternateASig1;
  5071. zSign = ! zSign;
  5072. }
  5073. return
  5074. normalizeRoundAndPackFloatx80(
  5075. 80, zSign, bExp + expDiff, aSig0, aSig1, status);
  5076. }
  5077. /*----------------------------------------------------------------------------
  5078. | Returns the square root of the extended double-precision floating-point
  5079. | value `a'. The operation is performed according to the IEC/IEEE Standard
  5080. | for Binary Floating-Point Arithmetic.
  5081. *----------------------------------------------------------------------------*/
  5082. floatx80 floatx80_sqrt(floatx80 a, float_status *status)
  5083. {
  5084. flag aSign;
  5085. int32_t aExp, zExp;
  5086. uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
  5087. uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
  5088. if (floatx80_invalid_encoding(a)) {
  5089. float_raise(float_flag_invalid, status);
  5090. return floatx80_default_nan(status);
  5091. }
  5092. aSig0 = extractFloatx80Frac( a );
  5093. aExp = extractFloatx80Exp( a );
  5094. aSign = extractFloatx80Sign( a );
  5095. if ( aExp == 0x7FFF ) {
  5096. if ((uint64_t)(aSig0 << 1)) {
  5097. return propagateFloatx80NaN(a, a, status);
  5098. }
  5099. if ( ! aSign ) return a;
  5100. goto invalid;
  5101. }
  5102. if ( aSign ) {
  5103. if ( ( aExp | aSig0 ) == 0 ) return a;
  5104. invalid:
  5105. float_raise(float_flag_invalid, status);
  5106. return floatx80_default_nan(status);
  5107. }
  5108. if ( aExp == 0 ) {
  5109. if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
  5110. normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
  5111. }
  5112. zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
  5113. zSig0 = estimateSqrt32( aExp, aSig0>>32 );
  5114. shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
  5115. zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
  5116. doubleZSig0 = zSig0<<1;
  5117. mul64To128( zSig0, zSig0, &term0, &term1 );
  5118. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  5119. while ( (int64_t) rem0 < 0 ) {
  5120. --zSig0;
  5121. doubleZSig0 -= 2;
  5122. add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
  5123. }
  5124. zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
  5125. if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
  5126. if ( zSig1 == 0 ) zSig1 = 1;
  5127. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  5128. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  5129. mul64To128( zSig1, zSig1, &term2, &term3 );
  5130. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  5131. while ( (int64_t) rem1 < 0 ) {
  5132. --zSig1;
  5133. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  5134. term3 |= 1;
  5135. term2 |= doubleZSig0;
  5136. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  5137. }
  5138. zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
  5139. }
  5140. shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
  5141. zSig0 |= doubleZSig0;
  5142. return roundAndPackFloatx80(status->floatx80_rounding_precision,
  5143. 0, zExp, zSig0, zSig1, status);
  5144. }
  5145. /*----------------------------------------------------------------------------
  5146. | Returns 1 if the extended double-precision floating-point value `a' is equal
  5147. | to the corresponding value `b', and 0 otherwise. The invalid exception is
  5148. | raised if either operand is a NaN. Otherwise, the comparison is performed
  5149. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5150. *----------------------------------------------------------------------------*/
  5151. int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
  5152. {
  5153. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
  5154. || (extractFloatx80Exp(a) == 0x7FFF
  5155. && (uint64_t) (extractFloatx80Frac(a) << 1))
  5156. || (extractFloatx80Exp(b) == 0x7FFF
  5157. && (uint64_t) (extractFloatx80Frac(b) << 1))
  5158. ) {
  5159. float_raise(float_flag_invalid, status);
  5160. return 0;
  5161. }
  5162. return
  5163. ( a.low == b.low )
  5164. && ( ( a.high == b.high )
  5165. || ( ( a.low == 0 )
  5166. && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
  5167. );
  5168. }
  5169. /*----------------------------------------------------------------------------
  5170. | Returns 1 if the extended double-precision floating-point value `a' is
  5171. | less than or equal to the corresponding value `b', and 0 otherwise. The
  5172. | invalid exception is raised if either operand is a NaN. The comparison is
  5173. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  5174. | Arithmetic.
  5175. *----------------------------------------------------------------------------*/
  5176. int floatx80_le(floatx80 a, floatx80 b, float_status *status)
  5177. {
  5178. flag aSign, bSign;
  5179. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
  5180. || (extractFloatx80Exp(a) == 0x7FFF
  5181. && (uint64_t) (extractFloatx80Frac(a) << 1))
  5182. || (extractFloatx80Exp(b) == 0x7FFF
  5183. && (uint64_t) (extractFloatx80Frac(b) << 1))
  5184. ) {
  5185. float_raise(float_flag_invalid, status);
  5186. return 0;
  5187. }
  5188. aSign = extractFloatx80Sign( a );
  5189. bSign = extractFloatx80Sign( b );
  5190. if ( aSign != bSign ) {
  5191. return
  5192. aSign
  5193. || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5194. == 0 );
  5195. }
  5196. return
  5197. aSign ? le128( b.high, b.low, a.high, a.low )
  5198. : le128( a.high, a.low, b.high, b.low );
  5199. }
  5200. /*----------------------------------------------------------------------------
  5201. | Returns 1 if the extended double-precision floating-point value `a' is
  5202. | less than the corresponding value `b', and 0 otherwise. The invalid
  5203. | exception is raised if either operand is a NaN. The comparison is performed
  5204. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5205. *----------------------------------------------------------------------------*/
  5206. int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
  5207. {
  5208. flag aSign, bSign;
  5209. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
  5210. || (extractFloatx80Exp(a) == 0x7FFF
  5211. && (uint64_t) (extractFloatx80Frac(a) << 1))
  5212. || (extractFloatx80Exp(b) == 0x7FFF
  5213. && (uint64_t) (extractFloatx80Frac(b) << 1))
  5214. ) {
  5215. float_raise(float_flag_invalid, status);
  5216. return 0;
  5217. }
  5218. aSign = extractFloatx80Sign( a );
  5219. bSign = extractFloatx80Sign( b );
  5220. if ( aSign != bSign ) {
  5221. return
  5222. aSign
  5223. && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5224. != 0 );
  5225. }
  5226. return
  5227. aSign ? lt128( b.high, b.low, a.high, a.low )
  5228. : lt128( a.high, a.low, b.high, b.low );
  5229. }
  5230. /*----------------------------------------------------------------------------
  5231. | Returns 1 if the extended double-precision floating-point values `a' and `b'
  5232. | cannot be compared, and 0 otherwise. The invalid exception is raised if
  5233. | either operand is a NaN. The comparison is performed according to the
  5234. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5235. *----------------------------------------------------------------------------*/
  5236. int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
  5237. {
  5238. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
  5239. || (extractFloatx80Exp(a) == 0x7FFF
  5240. && (uint64_t) (extractFloatx80Frac(a) << 1))
  5241. || (extractFloatx80Exp(b) == 0x7FFF
  5242. && (uint64_t) (extractFloatx80Frac(b) << 1))
  5243. ) {
  5244. float_raise(float_flag_invalid, status);
  5245. return 1;
  5246. }
  5247. return 0;
  5248. }
  5249. /*----------------------------------------------------------------------------
  5250. | Returns 1 if the extended double-precision floating-point value `a' is
  5251. | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  5252. | cause an exception. The comparison is performed according to the IEC/IEEE
  5253. | Standard for Binary Floating-Point Arithmetic.
  5254. *----------------------------------------------------------------------------*/
  5255. int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
  5256. {
  5257. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  5258. float_raise(float_flag_invalid, status);
  5259. return 0;
  5260. }
  5261. if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  5262. && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
  5263. || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  5264. && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
  5265. ) {
  5266. if (floatx80_is_signaling_nan(a, status)
  5267. || floatx80_is_signaling_nan(b, status)) {
  5268. float_raise(float_flag_invalid, status);
  5269. }
  5270. return 0;
  5271. }
  5272. return
  5273. ( a.low == b.low )
  5274. && ( ( a.high == b.high )
  5275. || ( ( a.low == 0 )
  5276. && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
  5277. );
  5278. }
  5279. /*----------------------------------------------------------------------------
  5280. | Returns 1 if the extended double-precision floating-point value `a' is less
  5281. | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
  5282. | do not cause an exception. Otherwise, the comparison is performed according
  5283. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5284. *----------------------------------------------------------------------------*/
  5285. int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
  5286. {
  5287. flag aSign, bSign;
  5288. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  5289. float_raise(float_flag_invalid, status);
  5290. return 0;
  5291. }
  5292. if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  5293. && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
  5294. || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  5295. && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
  5296. ) {
  5297. if (floatx80_is_signaling_nan(a, status)
  5298. || floatx80_is_signaling_nan(b, status)) {
  5299. float_raise(float_flag_invalid, status);
  5300. }
  5301. return 0;
  5302. }
  5303. aSign = extractFloatx80Sign( a );
  5304. bSign = extractFloatx80Sign( b );
  5305. if ( aSign != bSign ) {
  5306. return
  5307. aSign
  5308. || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5309. == 0 );
  5310. }
  5311. return
  5312. aSign ? le128( b.high, b.low, a.high, a.low )
  5313. : le128( a.high, a.low, b.high, b.low );
  5314. }
  5315. /*----------------------------------------------------------------------------
  5316. | Returns 1 if the extended double-precision floating-point value `a' is less
  5317. | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
  5318. | an exception. Otherwise, the comparison is performed according to the
  5319. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  5320. *----------------------------------------------------------------------------*/
  5321. int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
  5322. {
  5323. flag aSign, bSign;
  5324. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  5325. float_raise(float_flag_invalid, status);
  5326. return 0;
  5327. }
  5328. if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  5329. && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
  5330. || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  5331. && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
  5332. ) {
  5333. if (floatx80_is_signaling_nan(a, status)
  5334. || floatx80_is_signaling_nan(b, status)) {
  5335. float_raise(float_flag_invalid, status);
  5336. }
  5337. return 0;
  5338. }
  5339. aSign = extractFloatx80Sign( a );
  5340. bSign = extractFloatx80Sign( b );
  5341. if ( aSign != bSign ) {
  5342. return
  5343. aSign
  5344. && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  5345. != 0 );
  5346. }
  5347. return
  5348. aSign ? lt128( b.high, b.low, a.high, a.low )
  5349. : lt128( a.high, a.low, b.high, b.low );
  5350. }
  5351. /*----------------------------------------------------------------------------
  5352. | Returns 1 if the extended double-precision floating-point values `a' and `b'
  5353. | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
  5354. | The comparison is performed according to the IEC/IEEE Standard for Binary
  5355. | Floating-Point Arithmetic.
  5356. *----------------------------------------------------------------------------*/
  5357. int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
  5358. {
  5359. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  5360. float_raise(float_flag_invalid, status);
  5361. return 1;
  5362. }
  5363. if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
  5364. && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
  5365. || ( ( extractFloatx80Exp( b ) == 0x7FFF )
  5366. && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
  5367. ) {
  5368. if (floatx80_is_signaling_nan(a, status)
  5369. || floatx80_is_signaling_nan(b, status)) {
  5370. float_raise(float_flag_invalid, status);
  5371. }
  5372. return 1;
  5373. }
  5374. return 0;
  5375. }
  5376. /*----------------------------------------------------------------------------
  5377. | Returns the result of converting the quadruple-precision floating-point
  5378. | value `a' to the 32-bit two's complement integer format. The conversion
  5379. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5380. | Arithmetic---which means in particular that the conversion is rounded
  5381. | according to the current rounding mode. If `a' is a NaN, the largest
  5382. | positive integer is returned. Otherwise, if the conversion overflows, the
  5383. | largest integer with the same sign as `a' is returned.
  5384. *----------------------------------------------------------------------------*/
  5385. int32_t float128_to_int32(float128 a, float_status *status)
  5386. {
  5387. flag aSign;
  5388. int32_t aExp, shiftCount;
  5389. uint64_t aSig0, aSig1;
  5390. aSig1 = extractFloat128Frac1( a );
  5391. aSig0 = extractFloat128Frac0( a );
  5392. aExp = extractFloat128Exp( a );
  5393. aSign = extractFloat128Sign( a );
  5394. if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
  5395. if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
  5396. aSig0 |= ( aSig1 != 0 );
  5397. shiftCount = 0x4028 - aExp;
  5398. if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
  5399. return roundAndPackInt32(aSign, aSig0, status);
  5400. }
  5401. /*----------------------------------------------------------------------------
  5402. | Returns the result of converting the quadruple-precision floating-point
  5403. | value `a' to the 32-bit two's complement integer format. The conversion
  5404. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5405. | Arithmetic, except that the conversion is always rounded toward zero. If
  5406. | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
  5407. | conversion overflows, the largest integer with the same sign as `a' is
  5408. | returned.
  5409. *----------------------------------------------------------------------------*/
  5410. int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
  5411. {
  5412. flag aSign;
  5413. int32_t aExp, shiftCount;
  5414. uint64_t aSig0, aSig1, savedASig;
  5415. int32_t z;
  5416. aSig1 = extractFloat128Frac1( a );
  5417. aSig0 = extractFloat128Frac0( a );
  5418. aExp = extractFloat128Exp( a );
  5419. aSign = extractFloat128Sign( a );
  5420. aSig0 |= ( aSig1 != 0 );
  5421. if ( 0x401E < aExp ) {
  5422. if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
  5423. goto invalid;
  5424. }
  5425. else if ( aExp < 0x3FFF ) {
  5426. if (aExp || aSig0) {
  5427. status->float_exception_flags |= float_flag_inexact;
  5428. }
  5429. return 0;
  5430. }
  5431. aSig0 |= LIT64( 0x0001000000000000 );
  5432. shiftCount = 0x402F - aExp;
  5433. savedASig = aSig0;
  5434. aSig0 >>= shiftCount;
  5435. z = aSig0;
  5436. if ( aSign ) z = - z;
  5437. if ( ( z < 0 ) ^ aSign ) {
  5438. invalid:
  5439. float_raise(float_flag_invalid, status);
  5440. return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
  5441. }
  5442. if ( ( aSig0<<shiftCount ) != savedASig ) {
  5443. status->float_exception_flags |= float_flag_inexact;
  5444. }
  5445. return z;
  5446. }
  5447. /*----------------------------------------------------------------------------
  5448. | Returns the result of converting the quadruple-precision floating-point
  5449. | value `a' to the 64-bit two's complement integer format. The conversion
  5450. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5451. | Arithmetic---which means in particular that the conversion is rounded
  5452. | according to the current rounding mode. If `a' is a NaN, the largest
  5453. | positive integer is returned. Otherwise, if the conversion overflows, the
  5454. | largest integer with the same sign as `a' is returned.
  5455. *----------------------------------------------------------------------------*/
  5456. int64_t float128_to_int64(float128 a, float_status *status)
  5457. {
  5458. flag aSign;
  5459. int32_t aExp, shiftCount;
  5460. uint64_t aSig0, aSig1;
  5461. aSig1 = extractFloat128Frac1( a );
  5462. aSig0 = extractFloat128Frac0( a );
  5463. aExp = extractFloat128Exp( a );
  5464. aSign = extractFloat128Sign( a );
  5465. if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
  5466. shiftCount = 0x402F - aExp;
  5467. if ( shiftCount <= 0 ) {
  5468. if ( 0x403E < aExp ) {
  5469. float_raise(float_flag_invalid, status);
  5470. if ( ! aSign
  5471. || ( ( aExp == 0x7FFF )
  5472. && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
  5473. )
  5474. ) {
  5475. return LIT64( 0x7FFFFFFFFFFFFFFF );
  5476. }
  5477. return (int64_t) LIT64( 0x8000000000000000 );
  5478. }
  5479. shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
  5480. }
  5481. else {
  5482. shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
  5483. }
  5484. return roundAndPackInt64(aSign, aSig0, aSig1, status);
  5485. }
  5486. /*----------------------------------------------------------------------------
  5487. | Returns the result of converting the quadruple-precision floating-point
  5488. | value `a' to the 64-bit two's complement integer format. The conversion
  5489. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5490. | Arithmetic, except that the conversion is always rounded toward zero.
  5491. | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
  5492. | the conversion overflows, the largest integer with the same sign as `a' is
  5493. | returned.
  5494. *----------------------------------------------------------------------------*/
  5495. int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
  5496. {
  5497. flag aSign;
  5498. int32_t aExp, shiftCount;
  5499. uint64_t aSig0, aSig1;
  5500. int64_t z;
  5501. aSig1 = extractFloat128Frac1( a );
  5502. aSig0 = extractFloat128Frac0( a );
  5503. aExp = extractFloat128Exp( a );
  5504. aSign = extractFloat128Sign( a );
  5505. if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
  5506. shiftCount = aExp - 0x402F;
  5507. if ( 0 < shiftCount ) {
  5508. if ( 0x403E <= aExp ) {
  5509. aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
  5510. if ( ( a.high == LIT64( 0xC03E000000000000 ) )
  5511. && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
  5512. if (aSig1) {
  5513. status->float_exception_flags |= float_flag_inexact;
  5514. }
  5515. }
  5516. else {
  5517. float_raise(float_flag_invalid, status);
  5518. if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
  5519. return LIT64( 0x7FFFFFFFFFFFFFFF );
  5520. }
  5521. }
  5522. return (int64_t) LIT64( 0x8000000000000000 );
  5523. }
  5524. z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
  5525. if ( (uint64_t) ( aSig1<<shiftCount ) ) {
  5526. status->float_exception_flags |= float_flag_inexact;
  5527. }
  5528. }
  5529. else {
  5530. if ( aExp < 0x3FFF ) {
  5531. if ( aExp | aSig0 | aSig1 ) {
  5532. status->float_exception_flags |= float_flag_inexact;
  5533. }
  5534. return 0;
  5535. }
  5536. z = aSig0>>( - shiftCount );
  5537. if ( aSig1
  5538. || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
  5539. status->float_exception_flags |= float_flag_inexact;
  5540. }
  5541. }
  5542. if ( aSign ) z = - z;
  5543. return z;
  5544. }
  5545. /*----------------------------------------------------------------------------
  5546. | Returns the result of converting the quadruple-precision floating-point value
  5547. | `a' to the 64-bit unsigned integer format. The conversion is
  5548. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  5549. | Arithmetic---which means in particular that the conversion is rounded
  5550. | according to the current rounding mode. If `a' is a NaN, the largest
  5551. | positive integer is returned. If the conversion overflows, the
  5552. | largest unsigned integer is returned. If 'a' is negative, the value is
  5553. | rounded and zero is returned; negative values that do not round to zero
  5554. | will raise the inexact exception.
  5555. *----------------------------------------------------------------------------*/
  5556. uint64_t float128_to_uint64(float128 a, float_status *status)
  5557. {
  5558. flag aSign;
  5559. int aExp;
  5560. int shiftCount;
  5561. uint64_t aSig0, aSig1;
  5562. aSig0 = extractFloat128Frac0(a);
  5563. aSig1 = extractFloat128Frac1(a);
  5564. aExp = extractFloat128Exp(a);
  5565. aSign = extractFloat128Sign(a);
  5566. if (aSign && (aExp > 0x3FFE)) {
  5567. float_raise(float_flag_invalid, status);
  5568. if (float128_is_any_nan(a)) {
  5569. return LIT64(0xFFFFFFFFFFFFFFFF);
  5570. } else {
  5571. return 0;
  5572. }
  5573. }
  5574. if (aExp) {
  5575. aSig0 |= LIT64(0x0001000000000000);
  5576. }
  5577. shiftCount = 0x402F - aExp;
  5578. if (shiftCount <= 0) {
  5579. if (0x403E < aExp) {
  5580. float_raise(float_flag_invalid, status);
  5581. return LIT64(0xFFFFFFFFFFFFFFFF);
  5582. }
  5583. shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
  5584. } else {
  5585. shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
  5586. }
  5587. return roundAndPackUint64(aSign, aSig0, aSig1, status);
  5588. }
  5589. uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
  5590. {
  5591. uint64_t v;
  5592. signed char current_rounding_mode = status->float_rounding_mode;
  5593. set_float_rounding_mode(float_round_to_zero, status);
  5594. v = float128_to_uint64(a, status);
  5595. set_float_rounding_mode(current_rounding_mode, status);
  5596. return v;
  5597. }
  5598. /*----------------------------------------------------------------------------
  5599. | Returns the result of converting the quadruple-precision floating-point
  5600. | value `a' to the 32-bit unsigned integer format. The conversion
  5601. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5602. | Arithmetic except that the conversion is always rounded toward zero.
  5603. | If `a' is a NaN, the largest positive integer is returned. Otherwise,
  5604. | if the conversion overflows, the largest unsigned integer is returned.
  5605. | If 'a' is negative, the value is rounded and zero is returned; negative
  5606. | values that do not round to zero will raise the inexact exception.
  5607. *----------------------------------------------------------------------------*/
  5608. uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
  5609. {
  5610. uint64_t v;
  5611. uint32_t res;
  5612. int old_exc_flags = get_float_exception_flags(status);
  5613. v = float128_to_uint64_round_to_zero(a, status);
  5614. if (v > 0xffffffff) {
  5615. res = 0xffffffff;
  5616. } else {
  5617. return v;
  5618. }
  5619. set_float_exception_flags(old_exc_flags, status);
  5620. float_raise(float_flag_invalid, status);
  5621. return res;
  5622. }
  5623. /*----------------------------------------------------------------------------
  5624. | Returns the result of converting the quadruple-precision floating-point
  5625. | value `a' to the single-precision floating-point format. The conversion
  5626. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5627. | Arithmetic.
  5628. *----------------------------------------------------------------------------*/
  5629. float32 float128_to_float32(float128 a, float_status *status)
  5630. {
  5631. flag aSign;
  5632. int32_t aExp;
  5633. uint64_t aSig0, aSig1;
  5634. uint32_t zSig;
  5635. aSig1 = extractFloat128Frac1( a );
  5636. aSig0 = extractFloat128Frac0( a );
  5637. aExp = extractFloat128Exp( a );
  5638. aSign = extractFloat128Sign( a );
  5639. if ( aExp == 0x7FFF ) {
  5640. if ( aSig0 | aSig1 ) {
  5641. return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
  5642. }
  5643. return packFloat32( aSign, 0xFF, 0 );
  5644. }
  5645. aSig0 |= ( aSig1 != 0 );
  5646. shift64RightJamming( aSig0, 18, &aSig0 );
  5647. zSig = aSig0;
  5648. if ( aExp || zSig ) {
  5649. zSig |= 0x40000000;
  5650. aExp -= 0x3F81;
  5651. }
  5652. return roundAndPackFloat32(aSign, aExp, zSig, status);
  5653. }
  5654. /*----------------------------------------------------------------------------
  5655. | Returns the result of converting the quadruple-precision floating-point
  5656. | value `a' to the double-precision floating-point format. The conversion
  5657. | is performed according to the IEC/IEEE Standard for Binary Floating-Point
  5658. | Arithmetic.
  5659. *----------------------------------------------------------------------------*/
  5660. float64 float128_to_float64(float128 a, float_status *status)
  5661. {
  5662. flag aSign;
  5663. int32_t aExp;
  5664. uint64_t aSig0, aSig1;
  5665. aSig1 = extractFloat128Frac1( a );
  5666. aSig0 = extractFloat128Frac0( a );
  5667. aExp = extractFloat128Exp( a );
  5668. aSign = extractFloat128Sign( a );
  5669. if ( aExp == 0x7FFF ) {
  5670. if ( aSig0 | aSig1 ) {
  5671. return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
  5672. }
  5673. return packFloat64( aSign, 0x7FF, 0 );
  5674. }
  5675. shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
  5676. aSig0 |= ( aSig1 != 0 );
  5677. if ( aExp || aSig0 ) {
  5678. aSig0 |= LIT64( 0x4000000000000000 );
  5679. aExp -= 0x3C01;
  5680. }
  5681. return roundAndPackFloat64(aSign, aExp, aSig0, status);
  5682. }
  5683. /*----------------------------------------------------------------------------
  5684. | Returns the result of converting the quadruple-precision floating-point
  5685. | value `a' to the extended double-precision floating-point format. The
  5686. | conversion is performed according to the IEC/IEEE Standard for Binary
  5687. | Floating-Point Arithmetic.
  5688. *----------------------------------------------------------------------------*/
  5689. floatx80 float128_to_floatx80(float128 a, float_status *status)
  5690. {
  5691. flag aSign;
  5692. int32_t aExp;
  5693. uint64_t aSig0, aSig1;
  5694. aSig1 = extractFloat128Frac1( a );
  5695. aSig0 = extractFloat128Frac0( a );
  5696. aExp = extractFloat128Exp( a );
  5697. aSign = extractFloat128Sign( a );
  5698. if ( aExp == 0x7FFF ) {
  5699. if ( aSig0 | aSig1 ) {
  5700. return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
  5701. }
  5702. return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
  5703. }
  5704. if ( aExp == 0 ) {
  5705. if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
  5706. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  5707. }
  5708. else {
  5709. aSig0 |= LIT64( 0x0001000000000000 );
  5710. }
  5711. shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
  5712. return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
  5713. }
  5714. /*----------------------------------------------------------------------------
  5715. | Rounds the quadruple-precision floating-point value `a' to an integer, and
  5716. | returns the result as a quadruple-precision floating-point value. The
  5717. | operation is performed according to the IEC/IEEE Standard for Binary
  5718. | Floating-Point Arithmetic.
  5719. *----------------------------------------------------------------------------*/
  5720. float128 float128_round_to_int(float128 a, float_status *status)
  5721. {
  5722. flag aSign;
  5723. int32_t aExp;
  5724. uint64_t lastBitMask, roundBitsMask;
  5725. float128 z;
  5726. aExp = extractFloat128Exp( a );
  5727. if ( 0x402F <= aExp ) {
  5728. if ( 0x406F <= aExp ) {
  5729. if ( ( aExp == 0x7FFF )
  5730. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
  5731. ) {
  5732. return propagateFloat128NaN(a, a, status);
  5733. }
  5734. return a;
  5735. }
  5736. lastBitMask = 1;
  5737. lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
  5738. roundBitsMask = lastBitMask - 1;
  5739. z = a;
  5740. switch (status->float_rounding_mode) {
  5741. case float_round_nearest_even:
  5742. if ( lastBitMask ) {
  5743. add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
  5744. if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
  5745. }
  5746. else {
  5747. if ( (int64_t) z.low < 0 ) {
  5748. ++z.high;
  5749. if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
  5750. }
  5751. }
  5752. break;
  5753. case float_round_ties_away:
  5754. if (lastBitMask) {
  5755. add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
  5756. } else {
  5757. if ((int64_t) z.low < 0) {
  5758. ++z.high;
  5759. }
  5760. }
  5761. break;
  5762. case float_round_to_zero:
  5763. break;
  5764. case float_round_up:
  5765. if (!extractFloat128Sign(z)) {
  5766. add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
  5767. }
  5768. break;
  5769. case float_round_down:
  5770. if (extractFloat128Sign(z)) {
  5771. add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
  5772. }
  5773. break;
  5774. default:
  5775. abort();
  5776. }
  5777. z.low &= ~ roundBitsMask;
  5778. }
  5779. else {
  5780. if ( aExp < 0x3FFF ) {
  5781. if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
  5782. status->float_exception_flags |= float_flag_inexact;
  5783. aSign = extractFloat128Sign( a );
  5784. switch (status->float_rounding_mode) {
  5785. case float_round_nearest_even:
  5786. if ( ( aExp == 0x3FFE )
  5787. && ( extractFloat128Frac0( a )
  5788. | extractFloat128Frac1( a ) )
  5789. ) {
  5790. return packFloat128( aSign, 0x3FFF, 0, 0 );
  5791. }
  5792. break;
  5793. case float_round_ties_away:
  5794. if (aExp == 0x3FFE) {
  5795. return packFloat128(aSign, 0x3FFF, 0, 0);
  5796. }
  5797. break;
  5798. case float_round_down:
  5799. return
  5800. aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
  5801. : packFloat128( 0, 0, 0, 0 );
  5802. case float_round_up:
  5803. return
  5804. aSign ? packFloat128( 1, 0, 0, 0 )
  5805. : packFloat128( 0, 0x3FFF, 0, 0 );
  5806. }
  5807. return packFloat128( aSign, 0, 0, 0 );
  5808. }
  5809. lastBitMask = 1;
  5810. lastBitMask <<= 0x402F - aExp;
  5811. roundBitsMask = lastBitMask - 1;
  5812. z.low = 0;
  5813. z.high = a.high;
  5814. switch (status->float_rounding_mode) {
  5815. case float_round_nearest_even:
  5816. z.high += lastBitMask>>1;
  5817. if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
  5818. z.high &= ~ lastBitMask;
  5819. }
  5820. break;
  5821. case float_round_ties_away:
  5822. z.high += lastBitMask>>1;
  5823. break;
  5824. case float_round_to_zero:
  5825. break;
  5826. case float_round_up:
  5827. if (!extractFloat128Sign(z)) {
  5828. z.high |= ( a.low != 0 );
  5829. z.high += roundBitsMask;
  5830. }
  5831. break;
  5832. case float_round_down:
  5833. if (extractFloat128Sign(z)) {
  5834. z.high |= (a.low != 0);
  5835. z.high += roundBitsMask;
  5836. }
  5837. break;
  5838. default:
  5839. abort();
  5840. }
  5841. z.high &= ~ roundBitsMask;
  5842. }
  5843. if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
  5844. status->float_exception_flags |= float_flag_inexact;
  5845. }
  5846. return z;
  5847. }
  5848. /*----------------------------------------------------------------------------
  5849. | Returns the result of adding the absolute values of the quadruple-precision
  5850. | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
  5851. | before being returned. `zSign' is ignored if the result is a NaN.
  5852. | The addition is performed according to the IEC/IEEE Standard for Binary
  5853. | Floating-Point Arithmetic.
  5854. *----------------------------------------------------------------------------*/
  5855. static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
  5856. float_status *status)
  5857. {
  5858. int32_t aExp, bExp, zExp;
  5859. uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
  5860. int32_t expDiff;
  5861. aSig1 = extractFloat128Frac1( a );
  5862. aSig0 = extractFloat128Frac0( a );
  5863. aExp = extractFloat128Exp( a );
  5864. bSig1 = extractFloat128Frac1( b );
  5865. bSig0 = extractFloat128Frac0( b );
  5866. bExp = extractFloat128Exp( b );
  5867. expDiff = aExp - bExp;
  5868. if ( 0 < expDiff ) {
  5869. if ( aExp == 0x7FFF ) {
  5870. if (aSig0 | aSig1) {
  5871. return propagateFloat128NaN(a, b, status);
  5872. }
  5873. return a;
  5874. }
  5875. if ( bExp == 0 ) {
  5876. --expDiff;
  5877. }
  5878. else {
  5879. bSig0 |= LIT64( 0x0001000000000000 );
  5880. }
  5881. shift128ExtraRightJamming(
  5882. bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
  5883. zExp = aExp;
  5884. }
  5885. else if ( expDiff < 0 ) {
  5886. if ( bExp == 0x7FFF ) {
  5887. if (bSig0 | bSig1) {
  5888. return propagateFloat128NaN(a, b, status);
  5889. }
  5890. return packFloat128( zSign, 0x7FFF, 0, 0 );
  5891. }
  5892. if ( aExp == 0 ) {
  5893. ++expDiff;
  5894. }
  5895. else {
  5896. aSig0 |= LIT64( 0x0001000000000000 );
  5897. }
  5898. shift128ExtraRightJamming(
  5899. aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
  5900. zExp = bExp;
  5901. }
  5902. else {
  5903. if ( aExp == 0x7FFF ) {
  5904. if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
  5905. return propagateFloat128NaN(a, b, status);
  5906. }
  5907. return a;
  5908. }
  5909. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  5910. if ( aExp == 0 ) {
  5911. if (status->flush_to_zero) {
  5912. if (zSig0 | zSig1) {
  5913. float_raise(float_flag_output_denormal, status);
  5914. }
  5915. return packFloat128(zSign, 0, 0, 0);
  5916. }
  5917. return packFloat128( zSign, 0, zSig0, zSig1 );
  5918. }
  5919. zSig2 = 0;
  5920. zSig0 |= LIT64( 0x0002000000000000 );
  5921. zExp = aExp;
  5922. goto shiftRight1;
  5923. }
  5924. aSig0 |= LIT64( 0x0001000000000000 );
  5925. add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  5926. --zExp;
  5927. if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
  5928. ++zExp;
  5929. shiftRight1:
  5930. shift128ExtraRightJamming(
  5931. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  5932. roundAndPack:
  5933. return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
  5934. }
  5935. /*----------------------------------------------------------------------------
  5936. | Returns the result of subtracting the absolute values of the quadruple-
  5937. | precision floating-point values `a' and `b'. If `zSign' is 1, the
  5938. | difference is negated before being returned. `zSign' is ignored if the
  5939. | result is a NaN. The subtraction is performed according to the IEC/IEEE
  5940. | Standard for Binary Floating-Point Arithmetic.
  5941. *----------------------------------------------------------------------------*/
  5942. static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
  5943. float_status *status)
  5944. {
  5945. int32_t aExp, bExp, zExp;
  5946. uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
  5947. int32_t expDiff;
  5948. aSig1 = extractFloat128Frac1( a );
  5949. aSig0 = extractFloat128Frac0( a );
  5950. aExp = extractFloat128Exp( a );
  5951. bSig1 = extractFloat128Frac1( b );
  5952. bSig0 = extractFloat128Frac0( b );
  5953. bExp = extractFloat128Exp( b );
  5954. expDiff = aExp - bExp;
  5955. shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
  5956. shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
  5957. if ( 0 < expDiff ) goto aExpBigger;
  5958. if ( expDiff < 0 ) goto bExpBigger;
  5959. if ( aExp == 0x7FFF ) {
  5960. if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
  5961. return propagateFloat128NaN(a, b, status);
  5962. }
  5963. float_raise(float_flag_invalid, status);
  5964. return float128_default_nan(status);
  5965. }
  5966. if ( aExp == 0 ) {
  5967. aExp = 1;
  5968. bExp = 1;
  5969. }
  5970. if ( bSig0 < aSig0 ) goto aBigger;
  5971. if ( aSig0 < bSig0 ) goto bBigger;
  5972. if ( bSig1 < aSig1 ) goto aBigger;
  5973. if ( aSig1 < bSig1 ) goto bBigger;
  5974. return packFloat128(status->float_rounding_mode == float_round_down,
  5975. 0, 0, 0);
  5976. bExpBigger:
  5977. if ( bExp == 0x7FFF ) {
  5978. if (bSig0 | bSig1) {
  5979. return propagateFloat128NaN(a, b, status);
  5980. }
  5981. return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
  5982. }
  5983. if ( aExp == 0 ) {
  5984. ++expDiff;
  5985. }
  5986. else {
  5987. aSig0 |= LIT64( 0x4000000000000000 );
  5988. }
  5989. shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  5990. bSig0 |= LIT64( 0x4000000000000000 );
  5991. bBigger:
  5992. sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
  5993. zExp = bExp;
  5994. zSign ^= 1;
  5995. goto normalizeRoundAndPack;
  5996. aExpBigger:
  5997. if ( aExp == 0x7FFF ) {
  5998. if (aSig0 | aSig1) {
  5999. return propagateFloat128NaN(a, b, status);
  6000. }
  6001. return a;
  6002. }
  6003. if ( bExp == 0 ) {
  6004. --expDiff;
  6005. }
  6006. else {
  6007. bSig0 |= LIT64( 0x4000000000000000 );
  6008. }
  6009. shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
  6010. aSig0 |= LIT64( 0x4000000000000000 );
  6011. aBigger:
  6012. sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
  6013. zExp = aExp;
  6014. normalizeRoundAndPack:
  6015. --zExp;
  6016. return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
  6017. status);
  6018. }
  6019. /*----------------------------------------------------------------------------
  6020. | Returns the result of adding the quadruple-precision floating-point values
  6021. | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
  6022. | for Binary Floating-Point Arithmetic.
  6023. *----------------------------------------------------------------------------*/
  6024. float128 float128_add(float128 a, float128 b, float_status *status)
  6025. {
  6026. flag aSign, bSign;
  6027. aSign = extractFloat128Sign( a );
  6028. bSign = extractFloat128Sign( b );
  6029. if ( aSign == bSign ) {
  6030. return addFloat128Sigs(a, b, aSign, status);
  6031. }
  6032. else {
  6033. return subFloat128Sigs(a, b, aSign, status);
  6034. }
  6035. }
  6036. /*----------------------------------------------------------------------------
  6037. | Returns the result of subtracting the quadruple-precision floating-point
  6038. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6039. | Standard for Binary Floating-Point Arithmetic.
  6040. *----------------------------------------------------------------------------*/
  6041. float128 float128_sub(float128 a, float128 b, float_status *status)
  6042. {
  6043. flag aSign, bSign;
  6044. aSign = extractFloat128Sign( a );
  6045. bSign = extractFloat128Sign( b );
  6046. if ( aSign == bSign ) {
  6047. return subFloat128Sigs(a, b, aSign, status);
  6048. }
  6049. else {
  6050. return addFloat128Sigs(a, b, aSign, status);
  6051. }
  6052. }
  6053. /*----------------------------------------------------------------------------
  6054. | Returns the result of multiplying the quadruple-precision floating-point
  6055. | values `a' and `b'. The operation is performed according to the IEC/IEEE
  6056. | Standard for Binary Floating-Point Arithmetic.
  6057. *----------------------------------------------------------------------------*/
  6058. float128 float128_mul(float128 a, float128 b, float_status *status)
  6059. {
  6060. flag aSign, bSign, zSign;
  6061. int32_t aExp, bExp, zExp;
  6062. uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
  6063. aSig1 = extractFloat128Frac1( a );
  6064. aSig0 = extractFloat128Frac0( a );
  6065. aExp = extractFloat128Exp( a );
  6066. aSign = extractFloat128Sign( a );
  6067. bSig1 = extractFloat128Frac1( b );
  6068. bSig0 = extractFloat128Frac0( b );
  6069. bExp = extractFloat128Exp( b );
  6070. bSign = extractFloat128Sign( b );
  6071. zSign = aSign ^ bSign;
  6072. if ( aExp == 0x7FFF ) {
  6073. if ( ( aSig0 | aSig1 )
  6074. || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
  6075. return propagateFloat128NaN(a, b, status);
  6076. }
  6077. if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
  6078. return packFloat128( zSign, 0x7FFF, 0, 0 );
  6079. }
  6080. if ( bExp == 0x7FFF ) {
  6081. if (bSig0 | bSig1) {
  6082. return propagateFloat128NaN(a, b, status);
  6083. }
  6084. if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
  6085. invalid:
  6086. float_raise(float_flag_invalid, status);
  6087. return float128_default_nan(status);
  6088. }
  6089. return packFloat128( zSign, 0x7FFF, 0, 0 );
  6090. }
  6091. if ( aExp == 0 ) {
  6092. if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
  6093. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  6094. }
  6095. if ( bExp == 0 ) {
  6096. if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
  6097. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  6098. }
  6099. zExp = aExp + bExp - 0x4000;
  6100. aSig0 |= LIT64( 0x0001000000000000 );
  6101. shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
  6102. mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
  6103. add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
  6104. zSig2 |= ( zSig3 != 0 );
  6105. if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
  6106. shift128ExtraRightJamming(
  6107. zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
  6108. ++zExp;
  6109. }
  6110. return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
  6111. }
  6112. /*----------------------------------------------------------------------------
  6113. | Returns the result of dividing the quadruple-precision floating-point value
  6114. | `a' by the corresponding value `b'. The operation is performed according to
  6115. | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6116. *----------------------------------------------------------------------------*/
  6117. float128 float128_div(float128 a, float128 b, float_status *status)
  6118. {
  6119. flag aSign, bSign, zSign;
  6120. int32_t aExp, bExp, zExp;
  6121. uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
  6122. uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
  6123. aSig1 = extractFloat128Frac1( a );
  6124. aSig0 = extractFloat128Frac0( a );
  6125. aExp = extractFloat128Exp( a );
  6126. aSign = extractFloat128Sign( a );
  6127. bSig1 = extractFloat128Frac1( b );
  6128. bSig0 = extractFloat128Frac0( b );
  6129. bExp = extractFloat128Exp( b );
  6130. bSign = extractFloat128Sign( b );
  6131. zSign = aSign ^ bSign;
  6132. if ( aExp == 0x7FFF ) {
  6133. if (aSig0 | aSig1) {
  6134. return propagateFloat128NaN(a, b, status);
  6135. }
  6136. if ( bExp == 0x7FFF ) {
  6137. if (bSig0 | bSig1) {
  6138. return propagateFloat128NaN(a, b, status);
  6139. }
  6140. goto invalid;
  6141. }
  6142. return packFloat128( zSign, 0x7FFF, 0, 0 );
  6143. }
  6144. if ( bExp == 0x7FFF ) {
  6145. if (bSig0 | bSig1) {
  6146. return propagateFloat128NaN(a, b, status);
  6147. }
  6148. return packFloat128( zSign, 0, 0, 0 );
  6149. }
  6150. if ( bExp == 0 ) {
  6151. if ( ( bSig0 | bSig1 ) == 0 ) {
  6152. if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
  6153. invalid:
  6154. float_raise(float_flag_invalid, status);
  6155. return float128_default_nan(status);
  6156. }
  6157. float_raise(float_flag_divbyzero, status);
  6158. return packFloat128( zSign, 0x7FFF, 0, 0 );
  6159. }
  6160. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  6161. }
  6162. if ( aExp == 0 ) {
  6163. if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
  6164. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  6165. }
  6166. zExp = aExp - bExp + 0x3FFD;
  6167. shortShift128Left(
  6168. aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
  6169. shortShift128Left(
  6170. bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  6171. if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
  6172. shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
  6173. ++zExp;
  6174. }
  6175. zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
  6176. mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
  6177. sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
  6178. while ( (int64_t) rem0 < 0 ) {
  6179. --zSig0;
  6180. add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
  6181. }
  6182. zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
  6183. if ( ( zSig1 & 0x3FFF ) <= 4 ) {
  6184. mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
  6185. sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
  6186. while ( (int64_t) rem1 < 0 ) {
  6187. --zSig1;
  6188. add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
  6189. }
  6190. zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
  6191. }
  6192. shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
  6193. return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
  6194. }
  6195. /*----------------------------------------------------------------------------
  6196. | Returns the remainder of the quadruple-precision floating-point value `a'
  6197. | with respect to the corresponding value `b'. The operation is performed
  6198. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6199. *----------------------------------------------------------------------------*/
  6200. float128 float128_rem(float128 a, float128 b, float_status *status)
  6201. {
  6202. flag aSign, zSign;
  6203. int32_t aExp, bExp, expDiff;
  6204. uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
  6205. uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
  6206. int64_t sigMean0;
  6207. aSig1 = extractFloat128Frac1( a );
  6208. aSig0 = extractFloat128Frac0( a );
  6209. aExp = extractFloat128Exp( a );
  6210. aSign = extractFloat128Sign( a );
  6211. bSig1 = extractFloat128Frac1( b );
  6212. bSig0 = extractFloat128Frac0( b );
  6213. bExp = extractFloat128Exp( b );
  6214. if ( aExp == 0x7FFF ) {
  6215. if ( ( aSig0 | aSig1 )
  6216. || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
  6217. return propagateFloat128NaN(a, b, status);
  6218. }
  6219. goto invalid;
  6220. }
  6221. if ( bExp == 0x7FFF ) {
  6222. if (bSig0 | bSig1) {
  6223. return propagateFloat128NaN(a, b, status);
  6224. }
  6225. return a;
  6226. }
  6227. if ( bExp == 0 ) {
  6228. if ( ( bSig0 | bSig1 ) == 0 ) {
  6229. invalid:
  6230. float_raise(float_flag_invalid, status);
  6231. return float128_default_nan(status);
  6232. }
  6233. normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
  6234. }
  6235. if ( aExp == 0 ) {
  6236. if ( ( aSig0 | aSig1 ) == 0 ) return a;
  6237. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  6238. }
  6239. expDiff = aExp - bExp;
  6240. if ( expDiff < -1 ) return a;
  6241. shortShift128Left(
  6242. aSig0 | LIT64( 0x0001000000000000 ),
  6243. aSig1,
  6244. 15 - ( expDiff < 0 ),
  6245. &aSig0,
  6246. &aSig1
  6247. );
  6248. shortShift128Left(
  6249. bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
  6250. q = le128( bSig0, bSig1, aSig0, aSig1 );
  6251. if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  6252. expDiff -= 64;
  6253. while ( 0 < expDiff ) {
  6254. q = estimateDiv128To64( aSig0, aSig1, bSig0 );
  6255. q = ( 4 < q ) ? q - 4 : 0;
  6256. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  6257. shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
  6258. shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
  6259. sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
  6260. expDiff -= 61;
  6261. }
  6262. if ( -64 < expDiff ) {
  6263. q = estimateDiv128To64( aSig0, aSig1, bSig0 );
  6264. q = ( 4 < q ) ? q - 4 : 0;
  6265. q >>= - expDiff;
  6266. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  6267. expDiff += 52;
  6268. if ( expDiff < 0 ) {
  6269. shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
  6270. }
  6271. else {
  6272. shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
  6273. }
  6274. mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
  6275. sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
  6276. }
  6277. else {
  6278. shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
  6279. shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
  6280. }
  6281. do {
  6282. alternateASig0 = aSig0;
  6283. alternateASig1 = aSig1;
  6284. ++q;
  6285. sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
  6286. } while ( 0 <= (int64_t) aSig0 );
  6287. add128(
  6288. aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
  6289. if ( ( sigMean0 < 0 )
  6290. || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
  6291. aSig0 = alternateASig0;
  6292. aSig1 = alternateASig1;
  6293. }
  6294. zSign = ( (int64_t) aSig0 < 0 );
  6295. if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
  6296. return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
  6297. status);
  6298. }
  6299. /*----------------------------------------------------------------------------
  6300. | Returns the square root of the quadruple-precision floating-point value `a'.
  6301. | The operation is performed according to the IEC/IEEE Standard for Binary
  6302. | Floating-Point Arithmetic.
  6303. *----------------------------------------------------------------------------*/
  6304. float128 float128_sqrt(float128 a, float_status *status)
  6305. {
  6306. flag aSign;
  6307. int32_t aExp, zExp;
  6308. uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
  6309. uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
  6310. aSig1 = extractFloat128Frac1( a );
  6311. aSig0 = extractFloat128Frac0( a );
  6312. aExp = extractFloat128Exp( a );
  6313. aSign = extractFloat128Sign( a );
  6314. if ( aExp == 0x7FFF ) {
  6315. if (aSig0 | aSig1) {
  6316. return propagateFloat128NaN(a, a, status);
  6317. }
  6318. if ( ! aSign ) return a;
  6319. goto invalid;
  6320. }
  6321. if ( aSign ) {
  6322. if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
  6323. invalid:
  6324. float_raise(float_flag_invalid, status);
  6325. return float128_default_nan(status);
  6326. }
  6327. if ( aExp == 0 ) {
  6328. if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
  6329. normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
  6330. }
  6331. zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
  6332. aSig0 |= LIT64( 0x0001000000000000 );
  6333. zSig0 = estimateSqrt32( aExp, aSig0>>17 );
  6334. shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
  6335. zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
  6336. doubleZSig0 = zSig0<<1;
  6337. mul64To128( zSig0, zSig0, &term0, &term1 );
  6338. sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
  6339. while ( (int64_t) rem0 < 0 ) {
  6340. --zSig0;
  6341. doubleZSig0 -= 2;
  6342. add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
  6343. }
  6344. zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
  6345. if ( ( zSig1 & 0x1FFF ) <= 5 ) {
  6346. if ( zSig1 == 0 ) zSig1 = 1;
  6347. mul64To128( doubleZSig0, zSig1, &term1, &term2 );
  6348. sub128( rem1, 0, term1, term2, &rem1, &rem2 );
  6349. mul64To128( zSig1, zSig1, &term2, &term3 );
  6350. sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
  6351. while ( (int64_t) rem1 < 0 ) {
  6352. --zSig1;
  6353. shortShift128Left( 0, zSig1, 1, &term2, &term3 );
  6354. term3 |= 1;
  6355. term2 |= doubleZSig0;
  6356. add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
  6357. }
  6358. zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
  6359. }
  6360. shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
  6361. return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
  6362. }
  6363. /*----------------------------------------------------------------------------
  6364. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  6365. | the corresponding value `b', and 0 otherwise. The invalid exception is
  6366. | raised if either operand is a NaN. Otherwise, the comparison is performed
  6367. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6368. *----------------------------------------------------------------------------*/
  6369. int float128_eq(float128 a, float128 b, float_status *status)
  6370. {
  6371. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6372. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6373. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6374. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6375. ) {
  6376. float_raise(float_flag_invalid, status);
  6377. return 0;
  6378. }
  6379. return
  6380. ( a.low == b.low )
  6381. && ( ( a.high == b.high )
  6382. || ( ( a.low == 0 )
  6383. && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
  6384. );
  6385. }
  6386. /*----------------------------------------------------------------------------
  6387. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  6388. | or equal to the corresponding value `b', and 0 otherwise. The invalid
  6389. | exception is raised if either operand is a NaN. The comparison is performed
  6390. | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6391. *----------------------------------------------------------------------------*/
  6392. int float128_le(float128 a, float128 b, float_status *status)
  6393. {
  6394. flag aSign, bSign;
  6395. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6396. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6397. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6398. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6399. ) {
  6400. float_raise(float_flag_invalid, status);
  6401. return 0;
  6402. }
  6403. aSign = extractFloat128Sign( a );
  6404. bSign = extractFloat128Sign( b );
  6405. if ( aSign != bSign ) {
  6406. return
  6407. aSign
  6408. || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  6409. == 0 );
  6410. }
  6411. return
  6412. aSign ? le128( b.high, b.low, a.high, a.low )
  6413. : le128( a.high, a.low, b.high, b.low );
  6414. }
  6415. /*----------------------------------------------------------------------------
  6416. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  6417. | the corresponding value `b', and 0 otherwise. The invalid exception is
  6418. | raised if either operand is a NaN. The comparison is performed according
  6419. | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6420. *----------------------------------------------------------------------------*/
  6421. int float128_lt(float128 a, float128 b, float_status *status)
  6422. {
  6423. flag aSign, bSign;
  6424. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6425. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6426. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6427. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6428. ) {
  6429. float_raise(float_flag_invalid, status);
  6430. return 0;
  6431. }
  6432. aSign = extractFloat128Sign( a );
  6433. bSign = extractFloat128Sign( b );
  6434. if ( aSign != bSign ) {
  6435. return
  6436. aSign
  6437. && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  6438. != 0 );
  6439. }
  6440. return
  6441. aSign ? lt128( b.high, b.low, a.high, a.low )
  6442. : lt128( a.high, a.low, b.high, b.low );
  6443. }
  6444. /*----------------------------------------------------------------------------
  6445. | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
  6446. | be compared, and 0 otherwise. The invalid exception is raised if either
  6447. | operand is a NaN. The comparison is performed according to the IEC/IEEE
  6448. | Standard for Binary Floating-Point Arithmetic.
  6449. *----------------------------------------------------------------------------*/
  6450. int float128_unordered(float128 a, float128 b, float_status *status)
  6451. {
  6452. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6453. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6454. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6455. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6456. ) {
  6457. float_raise(float_flag_invalid, status);
  6458. return 1;
  6459. }
  6460. return 0;
  6461. }
  6462. /*----------------------------------------------------------------------------
  6463. | Returns 1 if the quadruple-precision floating-point value `a' is equal to
  6464. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  6465. | exception. The comparison is performed according to the IEC/IEEE Standard
  6466. | for Binary Floating-Point Arithmetic.
  6467. *----------------------------------------------------------------------------*/
  6468. int float128_eq_quiet(float128 a, float128 b, float_status *status)
  6469. {
  6470. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6471. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6472. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6473. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6474. ) {
  6475. if (float128_is_signaling_nan(a, status)
  6476. || float128_is_signaling_nan(b, status)) {
  6477. float_raise(float_flag_invalid, status);
  6478. }
  6479. return 0;
  6480. }
  6481. return
  6482. ( a.low == b.low )
  6483. && ( ( a.high == b.high )
  6484. || ( ( a.low == 0 )
  6485. && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
  6486. );
  6487. }
  6488. /*----------------------------------------------------------------------------
  6489. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  6490. | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
  6491. | cause an exception. Otherwise, the comparison is performed according to the
  6492. | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
  6493. *----------------------------------------------------------------------------*/
  6494. int float128_le_quiet(float128 a, float128 b, float_status *status)
  6495. {
  6496. flag aSign, bSign;
  6497. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6498. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6499. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6500. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6501. ) {
  6502. if (float128_is_signaling_nan(a, status)
  6503. || float128_is_signaling_nan(b, status)) {
  6504. float_raise(float_flag_invalid, status);
  6505. }
  6506. return 0;
  6507. }
  6508. aSign = extractFloat128Sign( a );
  6509. bSign = extractFloat128Sign( b );
  6510. if ( aSign != bSign ) {
  6511. return
  6512. aSign
  6513. || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  6514. == 0 );
  6515. }
  6516. return
  6517. aSign ? le128( b.high, b.low, a.high, a.low )
  6518. : le128( a.high, a.low, b.high, b.low );
  6519. }
  6520. /*----------------------------------------------------------------------------
  6521. | Returns 1 if the quadruple-precision floating-point value `a' is less than
  6522. | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
  6523. | exception. Otherwise, the comparison is performed according to the IEC/IEEE
  6524. | Standard for Binary Floating-Point Arithmetic.
  6525. *----------------------------------------------------------------------------*/
  6526. int float128_lt_quiet(float128 a, float128 b, float_status *status)
  6527. {
  6528. flag aSign, bSign;
  6529. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6530. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6531. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6532. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6533. ) {
  6534. if (float128_is_signaling_nan(a, status)
  6535. || float128_is_signaling_nan(b, status)) {
  6536. float_raise(float_flag_invalid, status);
  6537. }
  6538. return 0;
  6539. }
  6540. aSign = extractFloat128Sign( a );
  6541. bSign = extractFloat128Sign( b );
  6542. if ( aSign != bSign ) {
  6543. return
  6544. aSign
  6545. && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
  6546. != 0 );
  6547. }
  6548. return
  6549. aSign ? lt128( b.high, b.low, a.high, a.low )
  6550. : lt128( a.high, a.low, b.high, b.low );
  6551. }
  6552. /*----------------------------------------------------------------------------
  6553. | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
  6554. | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
  6555. | comparison is performed according to the IEC/IEEE Standard for Binary
  6556. | Floating-Point Arithmetic.
  6557. *----------------------------------------------------------------------------*/
  6558. int float128_unordered_quiet(float128 a, float128 b, float_status *status)
  6559. {
  6560. if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
  6561. && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
  6562. || ( ( extractFloat128Exp( b ) == 0x7FFF )
  6563. && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
  6564. ) {
  6565. if (float128_is_signaling_nan(a, status)
  6566. || float128_is_signaling_nan(b, status)) {
  6567. float_raise(float_flag_invalid, status);
  6568. }
  6569. return 1;
  6570. }
  6571. return 0;
  6572. }
  6573. /* misc functions */
  6574. float32 uint32_to_float32(uint32_t a, float_status *status)
  6575. {
  6576. return int64_to_float32(a, status);
  6577. }
  6578. float64 uint32_to_float64(uint32_t a, float_status *status)
  6579. {
  6580. return int64_to_float64(a, status);
  6581. }
  6582. uint32_t float32_to_uint32(float32 a, float_status *status)
  6583. {
  6584. int64_t v;
  6585. uint32_t res;
  6586. int old_exc_flags = get_float_exception_flags(status);
  6587. v = float32_to_int64(a, status);
  6588. if (v < 0) {
  6589. res = 0;
  6590. } else if (v > 0xffffffff) {
  6591. res = 0xffffffff;
  6592. } else {
  6593. return v;
  6594. }
  6595. set_float_exception_flags(old_exc_flags, status);
  6596. float_raise(float_flag_invalid, status);
  6597. return res;
  6598. }
  6599. uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
  6600. {
  6601. int64_t v;
  6602. uint32_t res;
  6603. int old_exc_flags = get_float_exception_flags(status);
  6604. v = float32_to_int64_round_to_zero(a, status);
  6605. if (v < 0) {
  6606. res = 0;
  6607. } else if (v > 0xffffffff) {
  6608. res = 0xffffffff;
  6609. } else {
  6610. return v;
  6611. }
  6612. set_float_exception_flags(old_exc_flags, status);
  6613. float_raise(float_flag_invalid, status);
  6614. return res;
  6615. }
  6616. int16_t float32_to_int16(float32 a, float_status *status)
  6617. {
  6618. int32_t v;
  6619. int16_t res;
  6620. int old_exc_flags = get_float_exception_flags(status);
  6621. v = float32_to_int32(a, status);
  6622. if (v < -0x8000) {
  6623. res = -0x8000;
  6624. } else if (v > 0x7fff) {
  6625. res = 0x7fff;
  6626. } else {
  6627. return v;
  6628. }
  6629. set_float_exception_flags(old_exc_flags, status);
  6630. float_raise(float_flag_invalid, status);
  6631. return res;
  6632. }
  6633. uint16_t float32_to_uint16(float32 a, float_status *status)
  6634. {
  6635. int32_t v;
  6636. uint16_t res;
  6637. int old_exc_flags = get_float_exception_flags(status);
  6638. v = float32_to_int32(a, status);
  6639. if (v < 0) {
  6640. res = 0;
  6641. } else if (v > 0xffff) {
  6642. res = 0xffff;
  6643. } else {
  6644. return v;
  6645. }
  6646. set_float_exception_flags(old_exc_flags, status);
  6647. float_raise(float_flag_invalid, status);
  6648. return res;
  6649. }
  6650. uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
  6651. {
  6652. int64_t v;
  6653. uint16_t res;
  6654. int old_exc_flags = get_float_exception_flags(status);
  6655. v = float32_to_int64_round_to_zero(a, status);
  6656. if (v < 0) {
  6657. res = 0;
  6658. } else if (v > 0xffff) {
  6659. res = 0xffff;
  6660. } else {
  6661. return v;
  6662. }
  6663. set_float_exception_flags(old_exc_flags, status);
  6664. float_raise(float_flag_invalid, status);
  6665. return res;
  6666. }
  6667. uint32_t float64_to_uint32(float64 a, float_status *status)
  6668. {
  6669. uint64_t v;
  6670. uint32_t res;
  6671. int old_exc_flags = get_float_exception_flags(status);
  6672. v = float64_to_uint64(a, status);
  6673. if (v > 0xffffffff) {
  6674. res = 0xffffffff;
  6675. } else {
  6676. return v;
  6677. }
  6678. set_float_exception_flags(old_exc_flags, status);
  6679. float_raise(float_flag_invalid, status);
  6680. return res;
  6681. }
  6682. uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
  6683. {
  6684. uint64_t v;
  6685. uint32_t res;
  6686. int old_exc_flags = get_float_exception_flags(status);
  6687. v = float64_to_uint64_round_to_zero(a, status);
  6688. if (v > 0xffffffff) {
  6689. res = 0xffffffff;
  6690. } else {
  6691. return v;
  6692. }
  6693. set_float_exception_flags(old_exc_flags, status);
  6694. float_raise(float_flag_invalid, status);
  6695. return res;
  6696. }
  6697. int16_t float64_to_int16(float64 a, float_status *status)
  6698. {
  6699. int64_t v;
  6700. int16_t res;
  6701. int old_exc_flags = get_float_exception_flags(status);
  6702. v = float64_to_int32(a, status);
  6703. if (v < -0x8000) {
  6704. res = -0x8000;
  6705. } else if (v > 0x7fff) {
  6706. res = 0x7fff;
  6707. } else {
  6708. return v;
  6709. }
  6710. set_float_exception_flags(old_exc_flags, status);
  6711. float_raise(float_flag_invalid, status);
  6712. return res;
  6713. }
  6714. uint16_t float64_to_uint16(float64 a, float_status *status)
  6715. {
  6716. int64_t v;
  6717. uint16_t res;
  6718. int old_exc_flags = get_float_exception_flags(status);
  6719. v = float64_to_int32(a, status);
  6720. if (v < 0) {
  6721. res = 0;
  6722. } else if (v > 0xffff) {
  6723. res = 0xffff;
  6724. } else {
  6725. return v;
  6726. }
  6727. set_float_exception_flags(old_exc_flags, status);
  6728. float_raise(float_flag_invalid, status);
  6729. return res;
  6730. }
  6731. uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
  6732. {
  6733. int64_t v;
  6734. uint16_t res;
  6735. int old_exc_flags = get_float_exception_flags(status);
  6736. v = float64_to_int64_round_to_zero(a, status);
  6737. if (v < 0) {
  6738. res = 0;
  6739. } else if (v > 0xffff) {
  6740. res = 0xffff;
  6741. } else {
  6742. return v;
  6743. }
  6744. set_float_exception_flags(old_exc_flags, status);
  6745. float_raise(float_flag_invalid, status);
  6746. return res;
  6747. }
  6748. /*----------------------------------------------------------------------------
  6749. | Returns the result of converting the double-precision floating-point value
  6750. | `a' to the 64-bit unsigned integer format. The conversion is
  6751. | performed according to the IEC/IEEE Standard for Binary Floating-Point
  6752. | Arithmetic---which means in particular that the conversion is rounded
  6753. | according to the current rounding mode. If `a' is a NaN, the largest
  6754. | positive integer is returned. If the conversion overflows, the
  6755. | largest unsigned integer is returned. If 'a' is negative, the value is
  6756. | rounded and zero is returned; negative values that do not round to zero
  6757. | will raise the inexact exception.
  6758. *----------------------------------------------------------------------------*/
  6759. uint64_t float64_to_uint64(float64 a, float_status *status)
  6760. {
  6761. flag aSign;
  6762. int aExp;
  6763. int shiftCount;
  6764. uint64_t aSig, aSigExtra;
  6765. a = float64_squash_input_denormal(a, status);
  6766. aSig = extractFloat64Frac(a);
  6767. aExp = extractFloat64Exp(a);
  6768. aSign = extractFloat64Sign(a);
  6769. if (aSign && (aExp > 1022)) {
  6770. float_raise(float_flag_invalid, status);
  6771. if (float64_is_any_nan(a)) {
  6772. return LIT64(0xFFFFFFFFFFFFFFFF);
  6773. } else {
  6774. return 0;
  6775. }
  6776. }
  6777. if (aExp) {
  6778. aSig |= LIT64(0x0010000000000000);
  6779. }
  6780. shiftCount = 0x433 - aExp;
  6781. if (shiftCount <= 0) {
  6782. if (0x43E < aExp) {
  6783. float_raise(float_flag_invalid, status);
  6784. return LIT64(0xFFFFFFFFFFFFFFFF);
  6785. }
  6786. aSigExtra = 0;
  6787. aSig <<= -shiftCount;
  6788. } else {
  6789. shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
  6790. }
  6791. return roundAndPackUint64(aSign, aSig, aSigExtra, status);
  6792. }
  6793. uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
  6794. {
  6795. signed char current_rounding_mode = status->float_rounding_mode;
  6796. set_float_rounding_mode(float_round_to_zero, status);
  6797. uint64_t v = float64_to_uint64(a, status);
  6798. set_float_rounding_mode(current_rounding_mode, status);
  6799. return v;
  6800. }
  6801. #define COMPARE(s, nan_exp) \
  6802. static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
  6803. int is_quiet, float_status *status) \
  6804. { \
  6805. flag aSign, bSign; \
  6806. uint ## s ## _t av, bv; \
  6807. a = float ## s ## _squash_input_denormal(a, status); \
  6808. b = float ## s ## _squash_input_denormal(b, status); \
  6809. \
  6810. if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
  6811. extractFloat ## s ## Frac( a ) ) || \
  6812. ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
  6813. extractFloat ## s ## Frac( b ) )) { \
  6814. if (!is_quiet || \
  6815. float ## s ## _is_signaling_nan(a, status) || \
  6816. float ## s ## _is_signaling_nan(b, status)) { \
  6817. float_raise(float_flag_invalid, status); \
  6818. } \
  6819. return float_relation_unordered; \
  6820. } \
  6821. aSign = extractFloat ## s ## Sign( a ); \
  6822. bSign = extractFloat ## s ## Sign( b ); \
  6823. av = float ## s ## _val(a); \
  6824. bv = float ## s ## _val(b); \
  6825. if ( aSign != bSign ) { \
  6826. if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
  6827. /* zero case */ \
  6828. return float_relation_equal; \
  6829. } else { \
  6830. return 1 - (2 * aSign); \
  6831. } \
  6832. } else { \
  6833. if (av == bv) { \
  6834. return float_relation_equal; \
  6835. } else { \
  6836. return 1 - 2 * (aSign ^ ( av < bv )); \
  6837. } \
  6838. } \
  6839. } \
  6840. \
  6841. int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
  6842. { \
  6843. return float ## s ## _compare_internal(a, b, 0, status); \
  6844. } \
  6845. \
  6846. int float ## s ## _compare_quiet(float ## s a, float ## s b, \
  6847. float_status *status) \
  6848. { \
  6849. return float ## s ## _compare_internal(a, b, 1, status); \
  6850. }
  6851. COMPARE(32, 0xff)
  6852. COMPARE(64, 0x7ff)
  6853. static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
  6854. int is_quiet, float_status *status)
  6855. {
  6856. flag aSign, bSign;
  6857. if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
  6858. float_raise(float_flag_invalid, status);
  6859. return float_relation_unordered;
  6860. }
  6861. if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
  6862. ( extractFloatx80Frac( a )<<1 ) ) ||
  6863. ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
  6864. ( extractFloatx80Frac( b )<<1 ) )) {
  6865. if (!is_quiet ||
  6866. floatx80_is_signaling_nan(a, status) ||
  6867. floatx80_is_signaling_nan(b, status)) {
  6868. float_raise(float_flag_invalid, status);
  6869. }
  6870. return float_relation_unordered;
  6871. }
  6872. aSign = extractFloatx80Sign( a );
  6873. bSign = extractFloatx80Sign( b );
  6874. if ( aSign != bSign ) {
  6875. if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
  6876. ( ( a.low | b.low ) == 0 ) ) {
  6877. /* zero case */
  6878. return float_relation_equal;
  6879. } else {
  6880. return 1 - (2 * aSign);
  6881. }
  6882. } else {
  6883. if (a.low == b.low && a.high == b.high) {
  6884. return float_relation_equal;
  6885. } else {
  6886. return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
  6887. }
  6888. }
  6889. }
  6890. int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
  6891. {
  6892. return floatx80_compare_internal(a, b, 0, status);
  6893. }
  6894. int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
  6895. {
  6896. return floatx80_compare_internal(a, b, 1, status);
  6897. }
  6898. static inline int float128_compare_internal(float128 a, float128 b,
  6899. int is_quiet, float_status *status)
  6900. {
  6901. flag aSign, bSign;
  6902. if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
  6903. ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
  6904. ( ( extractFloat128Exp( b ) == 0x7fff ) &&
  6905. ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
  6906. if (!is_quiet ||
  6907. float128_is_signaling_nan(a, status) ||
  6908. float128_is_signaling_nan(b, status)) {
  6909. float_raise(float_flag_invalid, status);
  6910. }
  6911. return float_relation_unordered;
  6912. }
  6913. aSign = extractFloat128Sign( a );
  6914. bSign = extractFloat128Sign( b );
  6915. if ( aSign != bSign ) {
  6916. if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
  6917. /* zero case */
  6918. return float_relation_equal;
  6919. } else {
  6920. return 1 - (2 * aSign);
  6921. }
  6922. } else {
  6923. if (a.low == b.low && a.high == b.high) {
  6924. return float_relation_equal;
  6925. } else {
  6926. return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
  6927. }
  6928. }
  6929. }
  6930. int float128_compare(float128 a, float128 b, float_status *status)
  6931. {
  6932. return float128_compare_internal(a, b, 0, status);
  6933. }
  6934. int float128_compare_quiet(float128 a, float128 b, float_status *status)
  6935. {
  6936. return float128_compare_internal(a, b, 1, status);
  6937. }
  6938. /* min() and max() functions. These can't be implemented as
  6939. * 'compare and pick one input' because that would mishandle
  6940. * NaNs and +0 vs -0.
  6941. *
  6942. * minnum() and maxnum() functions. These are similar to the min()
  6943. * and max() functions but if one of the arguments is a QNaN and
  6944. * the other is numerical then the numerical argument is returned.
  6945. * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
  6946. * and maxNum() operations. min() and max() are the typical min/max
  6947. * semantics provided by many CPUs which predate that specification.
  6948. *
  6949. * minnummag() and maxnummag() functions correspond to minNumMag()
  6950. * and maxNumMag() from IEEE 754-2008.
  6951. */
  6952. #define MINMAX(s) \
  6953. static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
  6954. int ismin, int isieee, \
  6955. int ismag, \
  6956. float_status *status) \
  6957. { \
  6958. flag aSign, bSign; \
  6959. uint ## s ## _t av, bv, aav, abv; \
  6960. a = float ## s ## _squash_input_denormal(a, status); \
  6961. b = float ## s ## _squash_input_denormal(b, status); \
  6962. if (float ## s ## _is_any_nan(a) || \
  6963. float ## s ## _is_any_nan(b)) { \
  6964. if (isieee) { \
  6965. if (float ## s ## _is_quiet_nan(a, status) && \
  6966. !float ## s ##_is_any_nan(b)) { \
  6967. return b; \
  6968. } else if (float ## s ## _is_quiet_nan(b, status) && \
  6969. !float ## s ## _is_any_nan(a)) { \
  6970. return a; \
  6971. } \
  6972. } \
  6973. return propagateFloat ## s ## NaN(a, b, status); \
  6974. } \
  6975. aSign = extractFloat ## s ## Sign(a); \
  6976. bSign = extractFloat ## s ## Sign(b); \
  6977. av = float ## s ## _val(a); \
  6978. bv = float ## s ## _val(b); \
  6979. if (ismag) { \
  6980. aav = float ## s ## _abs(av); \
  6981. abv = float ## s ## _abs(bv); \
  6982. if (aav != abv) { \
  6983. if (ismin) { \
  6984. return (aav < abv) ? a : b; \
  6985. } else { \
  6986. return (aav < abv) ? b : a; \
  6987. } \
  6988. } \
  6989. } \
  6990. if (aSign != bSign) { \
  6991. if (ismin) { \
  6992. return aSign ? a : b; \
  6993. } else { \
  6994. return aSign ? b : a; \
  6995. } \
  6996. } else { \
  6997. if (ismin) { \
  6998. return (aSign ^ (av < bv)) ? a : b; \
  6999. } else { \
  7000. return (aSign ^ (av < bv)) ? b : a; \
  7001. } \
  7002. } \
  7003. } \
  7004. \
  7005. float ## s float ## s ## _min(float ## s a, float ## s b, \
  7006. float_status *status) \
  7007. { \
  7008. return float ## s ## _minmax(a, b, 1, 0, 0, status); \
  7009. } \
  7010. \
  7011. float ## s float ## s ## _max(float ## s a, float ## s b, \
  7012. float_status *status) \
  7013. { \
  7014. return float ## s ## _minmax(a, b, 0, 0, 0, status); \
  7015. } \
  7016. \
  7017. float ## s float ## s ## _minnum(float ## s a, float ## s b, \
  7018. float_status *status) \
  7019. { \
  7020. return float ## s ## _minmax(a, b, 1, 1, 0, status); \
  7021. } \
  7022. \
  7023. float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
  7024. float_status *status) \
  7025. { \
  7026. return float ## s ## _minmax(a, b, 0, 1, 0, status); \
  7027. } \
  7028. \
  7029. float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
  7030. float_status *status) \
  7031. { \
  7032. return float ## s ## _minmax(a, b, 1, 1, 1, status); \
  7033. } \
  7034. \
  7035. float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
  7036. float_status *status) \
  7037. { \
  7038. return float ## s ## _minmax(a, b, 0, 1, 1, status); \
  7039. }
  7040. MINMAX(32)
  7041. MINMAX(64)
  7042. /* Multiply A by 2 raised to the power N. */
  7043. float32 float32_scalbn(float32 a, int n, float_status *status)
  7044. {
  7045. flag aSign;
  7046. int16_t aExp;
  7047. uint32_t aSig;
  7048. a = float32_squash_input_denormal(a, status);
  7049. aSig = extractFloat32Frac( a );
  7050. aExp = extractFloat32Exp( a );
  7051. aSign = extractFloat32Sign( a );
  7052. if ( aExp == 0xFF ) {
  7053. if ( aSig ) {
  7054. return propagateFloat32NaN(a, a, status);
  7055. }
  7056. return a;
  7057. }
  7058. if (aExp != 0) {
  7059. aSig |= 0x00800000;
  7060. } else if (aSig == 0) {
  7061. return a;
  7062. } else {
  7063. aExp++;
  7064. }
  7065. if (n > 0x200) {
  7066. n = 0x200;
  7067. } else if (n < -0x200) {
  7068. n = -0x200;
  7069. }
  7070. aExp += n - 1;
  7071. aSig <<= 7;
  7072. return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
  7073. }
  7074. float64 float64_scalbn(float64 a, int n, float_status *status)
  7075. {
  7076. flag aSign;
  7077. int16_t aExp;
  7078. uint64_t aSig;
  7079. a = float64_squash_input_denormal(a, status);
  7080. aSig = extractFloat64Frac( a );
  7081. aExp = extractFloat64Exp( a );
  7082. aSign = extractFloat64Sign( a );
  7083. if ( aExp == 0x7FF ) {
  7084. if ( aSig ) {
  7085. return propagateFloat64NaN(a, a, status);
  7086. }
  7087. return a;
  7088. }
  7089. if (aExp != 0) {
  7090. aSig |= LIT64( 0x0010000000000000 );
  7091. } else if (aSig == 0) {
  7092. return a;
  7093. } else {
  7094. aExp++;
  7095. }
  7096. if (n > 0x1000) {
  7097. n = 0x1000;
  7098. } else if (n < -0x1000) {
  7099. n = -0x1000;
  7100. }
  7101. aExp += n - 1;
  7102. aSig <<= 10;
  7103. return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
  7104. }
  7105. floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
  7106. {
  7107. flag aSign;
  7108. int32_t aExp;
  7109. uint64_t aSig;
  7110. if (floatx80_invalid_encoding(a)) {
  7111. float_raise(float_flag_invalid, status);
  7112. return floatx80_default_nan(status);
  7113. }
  7114. aSig = extractFloatx80Frac( a );
  7115. aExp = extractFloatx80Exp( a );
  7116. aSign = extractFloatx80Sign( a );
  7117. if ( aExp == 0x7FFF ) {
  7118. if ( aSig<<1 ) {
  7119. return propagateFloatx80NaN(a, a, status);
  7120. }
  7121. return a;
  7122. }
  7123. if (aExp == 0) {
  7124. if (aSig == 0) {
  7125. return a;
  7126. }
  7127. aExp++;
  7128. }
  7129. if (n > 0x10000) {
  7130. n = 0x10000;
  7131. } else if (n < -0x10000) {
  7132. n = -0x10000;
  7133. }
  7134. aExp += n;
  7135. return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
  7136. aSign, aExp, aSig, 0, status);
  7137. }
  7138. float128 float128_scalbn(float128 a, int n, float_status *status)
  7139. {
  7140. flag aSign;
  7141. int32_t aExp;
  7142. uint64_t aSig0, aSig1;
  7143. aSig1 = extractFloat128Frac1( a );
  7144. aSig0 = extractFloat128Frac0( a );
  7145. aExp = extractFloat128Exp( a );
  7146. aSign = extractFloat128Sign( a );
  7147. if ( aExp == 0x7FFF ) {
  7148. if ( aSig0 | aSig1 ) {
  7149. return propagateFloat128NaN(a, a, status);
  7150. }
  7151. return a;
  7152. }
  7153. if (aExp != 0) {
  7154. aSig0 |= LIT64( 0x0001000000000000 );
  7155. } else if (aSig0 == 0 && aSig1 == 0) {
  7156. return a;
  7157. } else {
  7158. aExp++;
  7159. }
  7160. if (n > 0x10000) {
  7161. n = 0x10000;
  7162. } else if (n < -0x10000) {
  7163. n = -0x10000;
  7164. }
  7165. aExp += n - 1;
  7166. return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
  7167. , status);
  7168. }