0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031 #include <asm/div64.h>
0032
0033 #include "fpa11.h"
0034
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044 #include "softfloat-macros"
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056 #include "softfloat-specialize"
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
/*
 * Round the 64-bit fixed-point value `absZ' (7 fraction bits in its low
 * bits) to a 32-bit signed integer according to roundData->mode, giving
 * the result the sign `zSign'.  On overflow float_flag_invalid is raised
 * and the saturated value is returned; any discarded nonzero fraction
 * raises float_flag_inexact.
 */
static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
{
    int8 roundingMode;
    flag roundNearestEven;
    int8 roundIncrement, roundBits;
    int32 z;

    roundingMode = roundData->mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    roundIncrement = 0x40;    /* half of the 0x80 integer step */
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            roundIncrement = 0;
        }
        else {
            /* Directed rounding: add just under one step, but only when
               the mode rounds away from zero for this sign. */
            roundIncrement = 0x7F;
            if ( zSign ) {
                if ( roundingMode == float_round_up ) roundIncrement = 0;
            }
            else {
                if ( roundingMode == float_round_down ) roundIncrement = 0;
            }
        }
    }
    roundBits = absZ & 0x7F;
    absZ = ( absZ + roundIncrement )>>7;
    /* Ties-to-even: clear the result LSB when the discarded bits were
       exactly one half. */
    absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    z = absZ;
    if ( zSign ) z = - z;
    /* Out of int32 range: high bits survive, or the sign came out wrong. */
    if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
        roundData->exception |= float_flag_invalid;
        return zSign ? 0x80000000 : 0x7FFFFFFF;
    }
    if ( roundBits ) roundData->exception |= float_flag_inexact;
    return z;

}
0107
0108
0109
0110
0111
0112
0113 INLINE bits32 extractFloat32Frac( float32 a )
0114 {
0115
0116 return a & 0x007FFFFF;
0117
0118 }
0119
0120
0121
0122
0123
0124
0125 INLINE int16 extractFloat32Exp( float32 a )
0126 {
0127
0128 return ( a>>23 ) & 0xFF;
0129
0130 }
0131
0132
0133
0134
0135
0136
#if 0
/*
 * Return the sign bit of the single-precision value `a'.
 * NOTE(review): compiled out — callers in this file use a definition
 * presumably supplied by a header; confirm before re-enabling this copy.
 */
INLINE flag extractFloat32Sign( float32 a )
{

    return a>>31;

}
#endif
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154 static void
0155 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
0156 {
0157 int8 shiftCount;
0158
0159 shiftCount = countLeadingZeros32( aSig ) - 8;
0160 *zSigPtr = aSig<<shiftCount;
0161 *zExpPtr = 1 - shiftCount;
0162
0163 }
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
/*
 * Pack sign `zSign', biased exponent `zExp' and significand `zSig' into a
 * single-precision value.  The fields are combined by addition, so a
 * significand carrying the implicit-1 bit bumps the exponent — the
 * rounding routines rely on this behaviour.
 */
INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
{
#if 0
    /* Disabled hand-written ARM assembly variant of the C expression
       below.  NOTE(review): the constraint list looks wrong (the output
       `f' appears only as an input) — verify before ever enabling. */
    float32 f;
    __asm__("@ packFloat32 \n\
    mov %0, %1, asl #31 \n\
    orr %0, %2, asl #23 \n\
    orr %0, %3"
    :
    : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
    : "cc");
    return f;
#else
    return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
#endif
}
0193
0194
0195
0196
0197
0198
0199
0200
0201
0202
0203
0204
0205
0206
0207
0208
0209
0210
0211
0212
0213
0214
0215
0216
/*
 * Round and pack sign `zSign', exponent `zExp' and significand `zSig'
 * (carrying 7 extra rounding bits at the bottom) into a single-precision
 * value, honouring roundData->mode and raising overflow/underflow/inexact
 * exceptions as required.
 */
static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
{
    int8 roundingMode;
    flag roundNearestEven;
    int8 roundIncrement, roundBits;
    flag isTiny;

    roundingMode = roundData->mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    roundIncrement = 0x40;    /* half of the 0x80 LSB step */
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            roundIncrement = 0;
        }
        else {
            /* Directed rounding toward the sign's direction only. */
            roundIncrement = 0x7F;
            if ( zSign ) {
                if ( roundingMode == float_round_up ) roundIncrement = 0;
            }
            else {
                if ( roundingMode == float_round_down ) roundIncrement = 0;
            }
        }
    }
    roundBits = zSig & 0x7F;
    /* The unsigned compare catches both zExp >= 0xFD and negative zExp. */
    if ( 0xFD <= (bits16) zExp ) {
        if ( ( 0xFD < zExp )
             || ( ( zExp == 0xFD )
                  && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: infinity, or the largest finite value when the
               rounding mode does not round away (roundIncrement == 0). */
            roundData->exception |= float_flag_overflow | float_flag_inexact;
            return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
        }
        if ( zExp < 0 ) {
            /* Subnormal result: shift into place before rounding. */
            isTiny =
                   ( float_detect_tininess == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ( zSig + roundIncrement < 0x80000000 );
            shift32RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x7F;
            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
        }
    }
    if ( roundBits ) roundData->exception |= float_flag_inexact;
    zSig = ( zSig + roundIncrement )>>7;
    /* Ties-to-even: clear the LSB on an exact halfway case. */
    zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat32( zSign, zExp, zSig );

}
0268
0269
0270
0271
0272
0273
0274
0275
0276
0277
0278
0279 static float32
0280 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
0281 {
0282 int8 shiftCount;
0283
0284 shiftCount = countLeadingZeros32( zSig ) - 1;
0285 return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
0286
0287 }
0288
0289
0290
0291
0292
0293
0294 INLINE bits64 extractFloat64Frac( float64 a )
0295 {
0296
0297 return a & LIT64( 0x000FFFFFFFFFFFFF );
0298
0299 }
0300
0301
0302
0303
0304
0305
0306 INLINE int16 extractFloat64Exp( float64 a )
0307 {
0308
0309 return ( a>>52 ) & 0x7FF;
0310
0311 }
0312
0313
0314
0315
0316
0317
#if 0
/*
 * Return the sign bit of the double-precision value `a'.
 * NOTE(review): compiled out — callers presumably pick up a definition
 * from a header; confirm before re-enabling this copy.
 */
INLINE flag extractFloat64Sign( float64 a )
{

    return a>>63;

}
#endif
0326
0327
0328
0329
0330
0331
0332
0333
0334
0335 static void
0336 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
0337 {
0338 int8 shiftCount;
0339
0340 shiftCount = countLeadingZeros64( aSig ) - 11;
0341 *zSigPtr = aSig<<shiftCount;
0342 *zExpPtr = 1 - shiftCount;
0343
0344 }
0345
0346
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357
0358 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
0359 {
0360
0361 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
0362
0363 }
0364
0365
0366
0367
0368
0369
0370
0371
0372
0373
0374
0375
0376
0377
0378
0379
0380
0381
0382
0383
0384
0385
0386
0387
/*
 * Round and pack sign `zSign', exponent `zExp' and significand `zSig'
 * (carrying 10 extra rounding bits at the bottom) into a double-precision
 * value, honouring roundData->mode and raising overflow/underflow/inexact
 * exceptions as required.
 */
static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
{
    int8 roundingMode;
    flag roundNearestEven;
    int16 roundIncrement, roundBits;
    flag isTiny;

    roundingMode = roundData->mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    roundIncrement = 0x200;    /* half of the 0x400 LSB step */
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            roundIncrement = 0;
        }
        else {
            /* Directed rounding toward the sign's direction only. */
            roundIncrement = 0x3FF;
            if ( zSign ) {
                if ( roundingMode == float_round_up ) roundIncrement = 0;
            }
            else {
                if ( roundingMode == float_round_down ) roundIncrement = 0;
            }
        }
    }
    roundBits = zSig & 0x3FF;
    /* The unsigned compare catches both zExp >= 0x7FD and negative zExp. */
    if ( 0x7FD <= (bits16) zExp ) {
        if ( ( 0x7FD < zExp )
             || ( ( zExp == 0x7FD )
                  && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
           ) {
            /* Overflow: infinity, or the largest finite value when the
               rounding mode does not round away (roundIncrement == 0). */
            roundData->exception |= float_flag_overflow | float_flag_inexact;
            return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
        }
        if ( zExp < 0 ) {
            /* Subnormal result: shift into place before rounding. */
            isTiny =
                   ( float_detect_tininess == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
            shift64RightJamming( zSig, - zExp, &zSig );
            zExp = 0;
            roundBits = zSig & 0x3FF;
            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
        }
    }
    if ( roundBits ) roundData->exception |= float_flag_inexact;
    zSig = ( zSig + roundIncrement )>>10;
    /* Ties-to-even: clear the LSB on an exact halfway case. */
    zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
    if ( zSig == 0 ) zExp = 0;
    return packFloat64( zSign, zExp, zSig );

}
0441
0442
0443
0444
0445
0446
0447
0448
0449
0450
0451
0452 static float64
0453 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
0454 {
0455 int8 shiftCount;
0456
0457 shiftCount = countLeadingZeros64( zSig ) - 1;
0458 return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
0459
0460 }
0461
0462 #ifdef FLOATX80
0463
0464
0465
0466
0467
0468
0469
/*
 * Return the 64-bit significand of the extended-precision value `a'
 * (the whole low word; extended precision has no implicit-1 bit).
 */
INLINE bits64 extractFloatx80Frac( floatx80 a )
{

    return a.low;

}
0476
0477
0478
0479
0480
0481
0482
/*
 * Return the 15-bit biased exponent field of the extended-precision
 * value `a' (the high word with the sign bit masked off).
 */
INLINE int32 extractFloatx80Exp( floatx80 a )
{

    return a.high & 0x7FFF;

}
0489
0490
0491
0492
0493
0494
0495
/*
 * Return the sign bit of the extended-precision value `a'
 * (bit 15 of the high word).
 */
INLINE flag extractFloatx80Sign( floatx80 a )
{

    return a.high>>15;

}
0502
0503
0504
0505
0506
0507
0508
0509
0510
0511 static void
0512 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
0513 {
0514 int8 shiftCount;
0515
0516 shiftCount = countLeadingZeros64( aSig );
0517 *zSigPtr = aSig<<shiftCount;
0518 *zExpPtr = 1 - shiftCount;
0519
0520 }
0521
0522
0523
0524
0525
0526
0527
0528 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
0529 {
0530 floatx80 z;
0531
0532 z.low = zSig;
0533 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
0534 z.__padding = 0;
0535 return z;
0536
0537 }
0538
0539
0540
0541
0542
0543
0544
0545
0546
0547
0548
0549
0550
0551
0552
0553
0554
0555
0556
0557
0558
0559
0560
0561
0562
0563
/*
 * Round and pack an extended-precision result from sign `zSign', exponent
 * `zExp' and the 128-bit significand `zSig0':`zSig1'.  roundData->precision
 * selects 32-, 64- or 80-bit rounding precision (anything else falls back
 * to 80); roundData->mode selects the rounding direction.  Overflow,
 * underflow and inexact exceptions are raised as required.
 */
static floatx80
roundAndPackFloatx80(
    struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
)
{
    int8 roundingMode, roundingPrecision;
    flag roundNearestEven, increment, isTiny;
    int64 roundIncrement, roundMask, roundBits;

    roundingMode = roundData->mode;
    roundingPrecision = roundData->precision;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    /* Reduced precision: round within the 64-bit significand at the
       double- or single-precision bit position. */
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    zSig0 |= ( zSig1 != 0 );    /* fold the low word into the sticky bit */
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            roundIncrement = 0;
        }
        else {
            /* Directed rounding toward the sign's direction only. */
            roundIncrement = roundMask;
            if ( zSign ) {
                if ( roundingMode == float_round_up ) roundIncrement = 0;
            }
            else {
                if ( roundingMode == float_round_down ) roundIncrement = 0;
            }
        }
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned compare catches zExp <= 0 as well as zExp >= 0x7FFE. */
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            /* Subnormal result at reduced precision. */
            isTiny =
                   ( float_detect_tininess == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
            if ( roundBits ) roundData->exception |= float_flag_inexact;
            zSig0 += roundIncrement;
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
            /* Ties-to-even at the reduced-precision LSB. */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( roundBits ) roundData->exception |= float_flag_inexact;
    zSig0 += roundIncrement;
    /* Carry out of the significand: renormalize to 1.0 * 2^(zExp+1). */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full 80-bit precision: zSig1 holds the round and sticky bits. */
    increment = ( (sbits64) zSig1 < 0 );
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            increment = 0;
        }
        else {
            if ( zSign ) {
                increment = ( roundingMode == float_round_down ) && zSig1;
            }
            else {
                increment = ( roundingMode == float_round_up ) && zSig1;
            }
        }
    }
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
        if ( ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            roundData->exception |= float_flag_overflow | float_flag_inexact;
            /* Modes that round toward zero in this direction deliver the
               largest finite value instead of infinity. */
            if ( ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* Subnormal result at full precision. */
            isTiny =
                   ( float_detect_tininess == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
            if ( zSig1 ) roundData->exception |= float_flag_inexact;
            /* Recompute the increment after the denormalizing shift. */
            if ( roundNearestEven ) {
                increment = ( (sbits64) zSig1 < 0 );
            }
            else {
                if ( zSign ) {
                    increment = ( roundingMode == float_round_down ) && zSig1;
                }
                else {
                    increment = ( roundingMode == float_round_up ) && zSig1;
                }
            }
            if ( increment ) {
                ++zSig0;
                /* Ties-to-even: clear the LSB on an exact halfway case. */
                zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( zSig1 ) roundData->exception |= float_flag_inexact;
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: renormalize upward. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }

    return packFloatx80( zSign, zExp, zSig0 );
}
0722
0723
0724
0725
0726
0727
0728
0729
0730
0731
0732
0733 static floatx80
0734 normalizeRoundAndPackFloatx80(
0735 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
0736 )
0737 {
0738 int8 shiftCount;
0739
0740 if ( zSig0 == 0 ) {
0741 zSig0 = zSig1;
0742 zSig1 = 0;
0743 zExp -= 64;
0744 }
0745 shiftCount = countLeadingZeros64( zSig0 );
0746 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
0747 zExp -= shiftCount;
0748 return
0749 roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
0750
0751 }
0752
0753 #endif
0754
0755
0756
0757
0758
0759
0760
0761
0762 float32 int32_to_float32(struct roundingData *roundData, int32 a)
0763 {
0764 flag zSign;
0765
0766 if ( a == 0 ) return 0;
0767 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
0768 zSign = ( a < 0 );
0769 return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
0770
0771 }
0772
0773
0774
0775
0776
0777
0778
0779
0780 float64 int32_to_float64( int32 a )
0781 {
0782 flag aSign;
0783 uint32 absA;
0784 int8 shiftCount;
0785 bits64 zSig;
0786
0787 if ( a == 0 ) return 0;
0788 aSign = ( a < 0 );
0789 absA = aSign ? - a : a;
0790 shiftCount = countLeadingZeros32( absA ) + 21;
0791 zSig = absA;
0792 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
0793
0794 }
0795
0796 #ifdef FLOATX80
0797
0798
0799
0800
0801
0802
0803
0804
0805
0806 floatx80 int32_to_floatx80( int32 a )
0807 {
0808 flag zSign;
0809 uint32 absA;
0810 int8 shiftCount;
0811 bits64 zSig;
0812
0813 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
0814 zSign = ( a < 0 );
0815 absA = zSign ? - a : a;
0816 shiftCount = countLeadingZeros32( absA ) + 32;
0817 zSig = absA;
0818 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
0819
0820 }
0821
0822 #endif
0823
0824
0825
0826
0827
0828
0829
0830
0831
0832
0833
0834
0835 int32 float32_to_int32( struct roundingData *roundData, float32 a )
0836 {
0837 flag aSign;
0838 int16 aExp, shiftCount;
0839 bits32 aSig;
0840 bits64 zSig;
0841
0842 aSig = extractFloat32Frac( a );
0843 aExp = extractFloat32Exp( a );
0844 aSign = extractFloat32Sign( a );
0845 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
0846 if ( aExp ) aSig |= 0x00800000;
0847 shiftCount = 0xAF - aExp;
0848 zSig = aSig;
0849 zSig <<= 32;
0850 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
0851 return roundAndPackInt32( roundData, aSign, zSig );
0852
0853 }
0854
0855
0856
0857
0858
0859
0860
0861
0862
0863
0864
0865
/*
 * Convert the single-precision value `a' to a 32-bit signed integer,
 * always rounding toward zero.  NaNs and out-of-range values raise
 * float_flag_invalid and return a saturated result; a discarded nonzero
 * fraction raises float_flag_inexact.
 */
int32 float32_to_int32_round_to_zero( float32 a )
{
    flag aSign;
    int16 aExp, shiftCount;
    bits32 aSig;
    int32 z;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    shiftCount = aExp - 0x9E;
    if ( 0 <= shiftCount ) {
        /* |a| >= 2^31: only -2^31 itself (0xCF000000) is representable. */
        if ( a == 0xCF000000 ) return 0x80000000;
        float_raise( float_flag_invalid );
        /* Positive values and NaNs saturate high, negatives low. */
        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
        return 0x80000000;
    }
    else if ( aExp <= 0x7E ) {
        /* |a| < 1 truncates to zero; nonzero values are inexact. */
        if ( aExp | aSig ) float_raise( float_flag_inexact );
        return 0;
    }
    aSig = ( aSig | 0x00800000 )<<8;
    z = aSig>>( - shiftCount );
    /* Any bits shifted off mean the truncation lost information. */
    if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
        float_raise( float_flag_inexact );
    }
    return aSign ? - z : z;

}
0895
0896
0897
0898
0899
0900
0901
0902
0903
/*
 * Convert the single-precision value `a' to double precision.  The
 * conversion is exact except for NaN payload translation.
 */
float64 float32_to_float64( float32 a )
{
    flag aSign;
    int16 aExp;
    bits32 aSig;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        /* NaN propagates via the common-NaN format; infinity maps to
           infinity. */
        if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
        return packFloat64( aSign, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
        /* Subnormal: normalize, then pre-decrement the exponent because
           packFloat64 will re-add the implicit-1 bit carried in aSig. */
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
        --aExp;
    }
    /* Rebias (0x3FF - 0x7F = 0x380) and widen the fraction 23 -> 52 bits. */
    return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );

}
0925
0926 #ifdef FLOATX80
0927
0928
0929
0930
0931
0932
0933
0934
0935
/*
 * Convert the single-precision value `a' to extended precision.  The
 * conversion is exact except for NaN payload translation.
 */
floatx80 float32_to_floatx80( float32 a )
{
    flag aSign;
    int16 aExp;
    bits32 aSig;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        /* NaN propagates via the common-NaN format; infinity maps to
           infinity. */
        if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Extended precision has an explicit leading-1 bit: set it, rebias
       (0x3FFF - 0x7F = 0x3F80), and left-align the 24-bit significand. */
    aSig |= 0x00800000;
    return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );

}
0957
0958 #endif
0959
0960
0961
0962
0963
0964
0965
0966
0967
/*
 * Round the single-precision value `a' to an integer-valued float
 * according to roundData->mode.  float_flag_inexact is raised whenever
 * the result differs from the operand; NaNs are propagated.
 */
float32 float32_round_to_int( struct roundingData *roundData, float32 a )
{
    flag aSign;
    int16 aExp;
    bits32 lastBitMask, roundBitsMask;
    int8 roundingMode;
    float32 z;

    aExp = extractFloat32Exp( a );
    /* Exponent >= 0x96: |a| >= 2^23, so `a' is already an integer. */
    if ( 0x96 <= aExp ) {
        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
            return propagateFloat32NaN( a, a );
        }
        return a;
    }
    roundingMode = roundData->mode;
    /* Exponent <= 0x7E: |a| < 1, so the result is 0 or +/-1. */
    if ( aExp <= 0x7E ) {
        if ( (bits32) ( a<<1 ) == 0 ) return a;    /* +/-0 stays as is */
        roundData->exception |= float_flag_inexact;
        aSign = extractFloat32Sign( a );
        switch ( roundingMode ) {
         case float_round_nearest_even:
            /* Values in [0.5, 1) round to +/-1. */
            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
                return packFloat32( aSign, 0x7F, 0 );
            }
            break;
         case float_round_down:
            return aSign ? 0xBF800000 : 0;
         case float_round_up:
            return aSign ? 0x80000000 : 0x3F800000;
        }
        return packFloat32( aSign, 0, 0 );
    }
    /* General case: round at the bit representing 1 in this exponent. */
    lastBitMask = 1;
    lastBitMask <<= 0x96 - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    if ( roundingMode == float_round_nearest_even ) {
        z += lastBitMask>>1;
        /* Ties-to-even: clear the integer LSB on an exact halfway case. */
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
    }
    else if ( roundingMode != float_round_to_zero ) {
        if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
            z += roundBitsMask;
        }
    }
    z &= ~ roundBitsMask;
    if ( z != a ) roundData->exception |= float_flag_inexact;
    return z;

}
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
/*
 * Add the magnitudes of the single-precision values `a' and `b' and give
 * the result the sign `zSign'.  NaNs are propagated.  Significands are
 * kept with 6 guard bits so roundAndPackFloat32 receives its expected
 * 7-bit rounding field after the final left shift.
 */
static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits32 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    expDiff = aExp - bExp;
    aSig <<= 6;
    bSig <<= 6;
    if ( 0 < expDiff ) {
        /* `a' dominates: align `b' down to `a''s exponent. */
        if ( aExp == 0xFF ) {
            if ( aSig ) return propagateFloat32NaN( a, b );
            return a;    /* infinity + finite = infinity */
        }
        if ( bExp == 0 ) {
            --expDiff;    /* subnormal exponent is effectively 1 */
        }
        else {
            bSig |= 0x20000000;    /* implicit 1 at the 6-bit-shifted position */
        }
        shift32RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' dominates: align `a' down to `b''s exponent. */
        if ( bExp == 0xFF ) {
            if ( bSig ) return propagateFloat32NaN( a, b );
            return packFloat32( zSign, 0xFF, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig |= 0x20000000;
        }
        shift32RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0xFF ) {
            if ( aSig | bSig ) return propagateFloat32NaN( a, b );
            return a;
        }
        /* Both subnormal (or zero): the sum cannot carry past the field. */
        if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
        /* Both implicit-1 bits add to 0x40000000; no pre-shift needed. */
        zSig = 0x40000000 + aSig + bSig;
        zExp = aExp;
        goto roundAndPack;
    }
    aSig |= 0x20000000;
    /* Assume no carry out; if the top bit shows a carry, undo the shift. */
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (sbits32) zSig < 0 ) {
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat32( roundData, zSign, zExp, zSig );

}
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
/*
 * Subtract the magnitudes of the single-precision values `a' and `b' and
 * give the result the sign `zSign' (flipped when `b' dominates).  NaNs
 * are propagated; infinity - infinity raises float_flag_invalid.
 * Significands carry 7 guard bits for the final normalization.
 */
static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits32 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    expDiff = aExp - bExp;
    aSig <<= 7;
    bSig <<= 7;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0xFF ) {
        if ( aSig | bSig ) return propagateFloat32NaN( a, b );
        /* infinity - infinity is invalid. */
        roundData->exception |= float_flag_invalid;
        return float32_default_nan;
    }
    if ( aExp == 0 ) {
        /* Both subnormal: treat them as exponent 1 for the subtraction. */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero's sign depends on the rounding mode. */
    return packFloat32( roundData->mode == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0xFF ) {
        if ( bSig ) return propagateFloat32NaN( a, b );
        return packFloat32( zSign ^ 1, 0xFF, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig |= 0x40000000;    /* implicit 1 at the 7-bit-shifted position */
    }
    shift32RightJamming( aSig, - expDiff, &aSig );
    bSig |= 0x40000000;
 bBigger:
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;    /* the larger operand's sign wins */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0xFF ) {
        if ( aSig ) return propagateFloat32NaN( a, b );
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig |= 0x40000000;
    }
    shift32RightJamming( bSig, expDiff, &bSig );
    aSig |= 0x40000000;
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );

}
1167
1168
1169
1170
1171
1172
1173
1174
1175 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1176 {
1177 flag aSign, bSign;
1178
1179 aSign = extractFloat32Sign( a );
1180 bSign = extractFloat32Sign( b );
1181 if ( aSign == bSign ) {
1182 return addFloat32Sigs( roundData, a, b, aSign );
1183 }
1184 else {
1185 return subFloat32Sigs( roundData, a, b, aSign );
1186 }
1187
1188 }
1189
1190
1191
1192
1193
1194
1195
1196
1197 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1198 {
1199 flag aSign, bSign;
1200
1201 aSign = extractFloat32Sign( a );
1202 bSign = extractFloat32Sign( b );
1203 if ( aSign == bSign ) {
1204 return subFloat32Sigs( roundData, a, b, aSign );
1205 }
1206 else {
1207 return addFloat32Sigs( roundData, a, b, aSign );
1208 }
1209
1210 }
1211
1212
1213
1214
1215
1216
1217
1218
/*
 * Single-precision multiplication per IEC/IEEE 754.  NaNs are
 * propagated; infinity * zero raises float_flag_invalid and returns the
 * default NaN.
 */
float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, zExp;
    bits32 aSig, bSig;
    bits64 zSig64;
    bits32 zSig;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    bSign = extractFloat32Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN( a, b );
        }
        /* infinity * zero is invalid. */
        if ( ( bExp | bSig ) == 0 ) {
            roundData->exception |= float_flag_invalid;
            return float32_default_nan;
        }
        return packFloat32( zSign, 0xFF, 0 );
    }
    if ( bExp == 0xFF ) {
        if ( bSig ) return propagateFloat32NaN( a, b );
        if ( ( aExp | aSig ) == 0 ) {
            roundData->exception |= float_flag_invalid;
            return float32_default_nan;
        }
        return packFloat32( zSign, 0xFF, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x7F;
    /* Left-align both 24-bit significands (one spare bit on aSig for the
       possible carry of the product). */
    aSig = ( aSig | 0x00800000 )<<7;
    bSig = ( bSig | 0x00800000 )<<8;
    /* 64-bit product; keep the top 32 bits with sticky jamming. */
    shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
    zSig = zSig64;
    /* Normalize when the product had no carry into the top bit. */
    if ( 0 <= (sbits32) ( zSig<<1 ) ) {
        zSig <<= 1;
        --zExp;
    }
    return roundAndPackFloat32( roundData, zSign, zExp, zSig );

}
1272
1273
1274
1275
1276
1277
1278
1279
/*
 * Single-precision division per IEC/IEEE 754.  NaNs are propagated;
 * infinity/infinity and 0/0 raise float_flag_invalid; finite/0 raises
 * float_flag_divbyzero and returns infinity.
 */
float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, zExp;
    bits32 aSig, bSig, zSig;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    bSign = extractFloat32Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0xFF ) {
        if ( aSig ) return propagateFloat32NaN( a, b );
        if ( bExp == 0xFF ) {
            if ( bSig ) return propagateFloat32NaN( a, b );
            /* infinity / infinity is invalid. */
            roundData->exception |= float_flag_invalid;
            return float32_default_nan;
        }
        return packFloat32( zSign, 0xFF, 0 );
    }
    if ( bExp == 0xFF ) {
        if ( bSig ) return propagateFloat32NaN( a, b );
        return packFloat32( zSign, 0, 0 );    /* finite / infinity = 0 */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* 0 / 0 is invalid; anything else / 0 is a signed infinity. */
            if ( ( aExp | aSig ) == 0 ) {
                roundData->exception |= float_flag_invalid;
                return float32_default_nan;
            }
            roundData->exception |= float_flag_divbyzero;
            return packFloat32( zSign, 0xFF, 0 );
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x7D;
    aSig = ( aSig | 0x00800000 )<<7;
    bSig = ( bSig | 0x00800000 )<<8;
    /* Keep the quotient in [0.5, 1) so it fills 31 bits. */
    if ( bSig <= ( aSig + aSig ) ) {
        aSig >>= 1;
        ++zExp;
    }
    {
        bits64 tmp = ( (bits64) aSig )<<32;
        do_div( tmp, bSig );
        zSig = tmp;
    }
    /* If the low rounding bits came out zero, recover the sticky bit by
       checking whether the division was exact. */
    if ( ( zSig & 0x3F ) == 0 ) {
        zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
    }
    return roundAndPackFloat32( roundData, zSign, zExp, zSig );

}
1339
1340
1341
1342
1343
1344
1345
1346
/*
 * IEC/IEEE 754 single-precision remainder: a REM b, with the quotient
 * rounded to nearest even.  NaNs are propagated; a infinite or b zero
 * raises float_flag_invalid and returns the default NaN.
 */
float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, expDiff;
    bits32 aSig, bSig;
    bits32 q;
    bits64 aSig64, bSig64, q64;
    bits32 alternateASig;
    sbits32 sigMean;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    bSign = extractFloat32Sign( b );
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN( a, b );
        }
        /* infinity REM anything is invalid. */
        roundData->exception |= float_flag_invalid;
        return float32_default_nan;
    }
    if ( bExp == 0xFF ) {
        if ( bSig ) return propagateFloat32NaN( a, b );
        return a;    /* finite REM infinity = the finite operand */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* anything REM 0 is invalid. */
            roundData->exception |= float_flag_invalid;
            return float32_default_nan;
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: one 32/64-bit division suffices. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            if ( expDiff < -1 ) return a;    /* |a| < |b|/2: remainder is a */
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            bits64 tmp = ( (bits64) aSig )<<32;
            do_div( tmp, bSig );
            q = tmp;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: iterate with 64-bit estimated quotients,
           reducing expDiff by 62 bits per pass. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (bits64) aSig )<<40;
        bSig64 = ( (bits64) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;    /* under-estimate to stay safe */
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Final correction: step past zero, then pick whichever of the last
       two residues is nearer (ties go to the even quotient). */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (sbits32) aSig );
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (sbits32) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );

}
1442
1443
1444
1445
1446
1447
1448
1449
/*
 * Single-precision square root per IEC/IEEE 754.  NaNs are propagated;
 * a negative nonzero operand raises float_flag_invalid and returns the
 * default NaN; +/-0 and +infinity are returned unchanged.
 */
float32 float32_sqrt( struct roundingData *roundData, float32 a )
{
    flag aSign;
    int16 aExp, zExp;
    bits32 aSig, zSig;
    bits64 rem, term;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    if ( aExp == 0xFF ) {
        if ( aSig ) return propagateFloat32NaN( a, 0 );
        if ( ! aSign ) return a;    /* sqrt(+inf) = +inf */
        roundData->exception |= float_flag_invalid;
        return float32_default_nan;
    }
    if ( aSign ) {
        if ( ( aExp | aSig ) == 0 ) return a;    /* sqrt(-0) = -0 */
        roundData->exception |= float_flag_invalid;
        return float32_default_nan;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return 0;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    /* Halve the unbiased exponent; the parity is folded into aSig below. */
    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
    aSig = ( aSig | 0x00800000 )<<8;
    zSig = estimateSqrt32( aExp, aSig ) + 2;
    /* If the estimate sits near a rounding boundary, refine it exactly. */
    if ( ( zSig & 0x7F ) <= 5 ) {
        if ( zSig < 2 ) {
            zSig = 0xFFFFFFFF;
        }
        else {
            aSig >>= aExp & 1;
            term = ( (bits64) zSig ) * zSig;
            rem = ( ( (bits64) aSig )<<32 ) - term;
            /* Decrement until the square no longer exceeds the operand. */
            while ( (sbits64) rem < 0 ) {
                --zSig;
                rem += ( ( (bits64) zSig )<<1 ) | 1;
            }
            zSig |= ( rem != 0 );    /* sticky bit for inexactness */
        }
    }
    shift32RightJamming( zSig, 1, &zSig );
    return roundAndPackFloat32( roundData, 0, zExp, zSig );

}
1497
1498
1499
1500
1501
1502
1503
1504
1505 flag float32_eq( float32 a, float32 b )
1506 {
1507
1508 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1509 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1510 ) {
1511 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1512 float_raise( float_flag_invalid );
1513 }
1514 return 0;
1515 }
1516 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1517
1518 }
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528 flag float32_le( float32 a, float32 b )
1529 {
1530 flag aSign, bSign;
1531
1532 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1533 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1534 ) {
1535 float_raise( float_flag_invalid );
1536 return 0;
1537 }
1538 aSign = extractFloat32Sign( a );
1539 bSign = extractFloat32Sign( b );
1540 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1541 return ( a == b ) || ( aSign ^ ( a < b ) );
1542
1543 }
1544
1545
1546
1547
1548
1549
1550
1551
1552 flag float32_lt( float32 a, float32 b )
1553 {
1554 flag aSign, bSign;
1555
1556 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1557 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1558 ) {
1559 float_raise( float_flag_invalid );
1560 return 0;
1561 }
1562 aSign = extractFloat32Sign( a );
1563 bSign = extractFloat32Sign( b );
1564 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1565 return ( a != b ) && ( aSign ^ ( a < b ) );
1566
1567 }
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577 flag float32_eq_signaling( float32 a, float32 b )
1578 {
1579
1580 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1581 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1582 ) {
1583 float_raise( float_flag_invalid );
1584 return 0;
1585 }
1586 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1587
1588 }
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598 flag float32_le_quiet( float32 a, float32 b )
1599 {
1600 flag aSign, bSign;
1601
1602
1603 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1604 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1605 ) {
1606
1607 return 0;
1608 }
1609 aSign = extractFloat32Sign( a );
1610 bSign = extractFloat32Sign( b );
1611 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1612 return ( a == b ) || ( aSign ^ ( a < b ) );
1613
1614 }
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624 flag float32_lt_quiet( float32 a, float32 b )
1625 {
1626 flag aSign, bSign;
1627
1628 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1629 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1630 ) {
1631
1632 return 0;
1633 }
1634 aSign = extractFloat32Sign( a );
1635 bSign = extractFloat32Sign( b );
1636 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637 return ( a != b ) && ( aSign ^ ( a < b ) );
1638
1639 }
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
/*
-------------------------------------------------------------------------------
Converts the double-precision value `a' to a 32-bit signed integer using the
rounding mode in `roundData'.  A NaN input has its sign cleared so that
roundAndPackInt32 saturates it to the largest positive integer while raising
the invalid exception.
-------------------------------------------------------------------------------
*/
int32 float64_to_int32( struct roundingData *roundData, float64 a )
{
    flag aSign;
    int16 aExp, shiftCount;
    bits64 aSig;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;    /* NaN: treat as positive */
    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );    /* implicit 1 bit */
    shiftCount = 0x42C - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32( roundData, aSign, aSig );

}
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
/*
-------------------------------------------------------------------------------
Converts the double-precision value `a' to a 32-bit signed integer, always
rounding toward zero (the current rounding mode is ignored).  Out-of-range
values and NaNs raise the invalid exception and saturate; discarded fraction
bits raise the inexact exception.
-------------------------------------------------------------------------------
*/
int32 float64_to_int32_round_to_zero( float64 a )
{
    flag aSign;
    int16 aExp, shiftCount;
    bits64 aSig, savedASig;
    int32 z;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    shiftCount = 0x433 - aExp;
    if ( shiftCount < 21 ) {
        /* |a| >= 2^31 (or NaN/inf): cannot fit in an int32. */
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;    /* NaN saturates positive */
        goto invalid;
    }
    else if ( 52 < shiftCount ) {
        /* |a| < 1: truncates to zero; inexact unless a is exactly +-0. */
        if ( aExp || aSig ) float_raise( float_flag_inexact );
        return 0;
    }
    aSig |= LIT64( 0x0010000000000000 );    /* implicit 1 bit */
    savedASig = aSig;
    aSig >>= shiftCount;    /* truncate fraction (round toward zero) */
    z = aSig;
    if ( aSign ) z = - z;
    if ( ( z < 0 ) ^ aSign ) {
        /* Sign flipped on negation: magnitude overflowed int32. */
 invalid:
        float_raise( float_flag_invalid );
        return aSign ? 0x80000000 : 0x7FFFFFFF;
    }
    if ( ( aSig<<shiftCount ) != savedASig ) {
        /* Some fraction bits were discarded. */
        float_raise( float_flag_inexact );
    }
    return z;

}
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
/*
-------------------------------------------------------------------------------
Converts the double-precision value `a' to a 32-bit unsigned integer using
the rounding mode in `roundData'.
NOTE(review): the sign of `a' is discarded (aSign is forced to 0) and the
result is produced by the signed helper roundAndPackInt32, which saturates
at 0x7FFFFFFF -- so negative inputs are treated by magnitude and values in
[0x80000000, 0xFFFFFFFF] cannot be represented.  This looks like a known
limitation of this emulator rather than a full uint32 conversion; confirm
against the FPA semantics before relying on it.
-------------------------------------------------------------------------------
*/
int32 float64_to_uint32( struct roundingData *roundData, float64 a )
{
    flag aSign;
    int16 aExp, shiftCount;
    bits64 aSig;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = 0;    /* sign intentionally ignored -- see NOTE above */

    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );    /* implicit 1 bit */
    shiftCount = 0x42C - aExp;
    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32( roundData, aSign, aSig );
}
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
/*
-------------------------------------------------------------------------------
Converts the double-precision value `a' to a 32-bit unsigned integer,
rounding toward zero.
NOTE(review): despite the name this body is identical to
float64_to_int32_round_to_zero -- it honours the sign, negates, and
saturates at 0x7FFFFFFF/0x80000000 -- so the full uint32 range is not
produced.  Presumably a known emulator limitation; confirm before use.
-------------------------------------------------------------------------------
*/
int32 float64_to_uint32_round_to_zero( float64 a )
{
    flag aSign;
    int16 aExp, shiftCount;
    bits64 aSig, savedASig;
    int32 z;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    shiftCount = 0x433 - aExp;
    if ( shiftCount < 21 ) {
        /* Magnitude too large (or NaN/inf). */
        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
        goto invalid;
    }
    else if ( 52 < shiftCount ) {
        /* |a| < 1: truncates to zero; inexact unless exactly +-0. */
        if ( aExp || aSig ) float_raise( float_flag_inexact );
        return 0;
    }
    aSig |= LIT64( 0x0010000000000000 );    /* implicit 1 bit */
    savedASig = aSig;
    aSig >>= shiftCount;    /* truncate fraction */
    z = aSig;
    if ( aSign ) z = - z;
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise( float_flag_invalid );
        return aSign ? 0x80000000 : 0x7FFFFFFF;
    }
    if ( ( aSig<<shiftCount ) != savedASig ) {
        float_raise( float_flag_inexact );
    }
    return z;
}
1787
1788
1789
1790
1791
1792
1793
1794
1795
/*
-------------------------------------------------------------------------------
Converts the double-precision value `a' to single precision, rounding per
`roundData'.  NaNs are converted through the common NaN format; infinities
keep their sign.
-------------------------------------------------------------------------------
*/
float32 float64_to_float32( struct roundingData *roundData, float64 a )
{
    flag aSign;
    int16 aExp;
    bits64 aSig;
    bits32 zSig;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
        return packFloat32( aSign, 0xFF, 0 );    /* infinity */
    }
    /* Narrow the 52-bit fraction to fit single precision, jamming the
       discarded bits into the sticky bit. */
    shift64RightJamming( aSig, 22, &aSig );
    zSig = aSig;
    if ( aExp || zSig ) {
        zSig |= 0x40000000;    /* restore the implicit/leading bit position */
        aExp -= 0x381;         /* rebias from double to single exponent */
    }
    return roundAndPackFloat32( roundData, aSign, aExp, zSig );

}
1819
1820 #ifdef FLOATX80
1821
1822
1823
1824
1825
1826
1827
1828
1829
/*
-------------------------------------------------------------------------------
Converts the double-precision value `a' to the extended (80-bit) format.
The conversion is exact, so no rounding data is needed; NaNs pass through
the common NaN format.
-------------------------------------------------------------------------------
*/
floatx80 float64_to_floatx80( float64 a )
{
    flag aSign;
    int16 aExp;
    bits64 aSig;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
        return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Rebias exponent and left-align the significand with its explicit
       integer bit. */
    return
        packFloatx80(
            aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );

}
1852
1853 #endif
1854
1855
1856
1857
1858
1859
1860
1861
1862
/*
-------------------------------------------------------------------------------
Rounds the double-precision value `a' to an integer-valued double, using the
rounding mode in `roundData'.  The inexact exception is raised whenever the
result differs from the input.
-------------------------------------------------------------------------------
*/
float64 float64_round_to_int( struct roundingData *roundData, float64 a )
{
    flag aSign;
    int16 aExp;
    bits64 lastBitMask, roundBitsMask;
    int8 roundingMode;
    float64 z;

    aExp = extractFloat64Exp( a );
    if ( 0x433 <= aExp ) {
        /* Already integral (or NaN/inf). */
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
            return propagateFloat64NaN( a, a );
        }
        return a;
    }
    if ( aExp <= 0x3FE ) {
        /* |a| < 1: result is 0 or +-1 depending on the rounding mode. */
        if ( (bits64) ( a<<1 ) == 0 ) return a;    /* +-0 stays exact */
        roundData->exception |= float_flag_inexact;
        aSign = extractFloat64Sign( a );
        switch ( roundData->mode ) {
         case float_round_nearest_even:
            /* Ties at exactly 0.5 go to +-1 (the even neighbour of 0.5
               fraction is handled by the exponent test). */
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
                return packFloat64( aSign, 0x3FF, 0 );
            }
            break;
         case float_round_down:
            return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
         case float_round_up:
            return
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
        }
        return packFloat64( aSign, 0, 0 );
    }
    /* General case: mask off the fraction bits below the units place,
       working directly on the raw bit pattern. */
    lastBitMask = 1;
    lastBitMask <<= 0x433 - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    roundingMode = roundData->mode;
    if ( roundingMode == float_round_nearest_even ) {
        z += lastBitMask>>1;
        if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;    /* tie: to even */
    }
    else if ( roundingMode != float_round_to_zero ) {
        /* Round up/down: bump magnitude only when the sign matches the mode. */
        if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
            z += roundBitsMask;
        }
    }
    z &= ~ roundBitsMask;
    if ( z != a ) roundData->exception |= float_flag_inexact;
    return z;

}
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
/*
-------------------------------------------------------------------------------
Adds the magnitudes of the double-precision values `a' and `b', attaching
the sign `zSign' to the result.  Used by float64_add/float64_sub for the
like-signed case.  Significands are pre-shifted left by 9 so that carries
and guard bits fit in 64 bits.
-------------------------------------------------------------------------------
*/
static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    aSig <<= 9;
    bSig <<= 9;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b to a. */
        if ( aExp == 0x7FF ) {
            if ( aSig ) return propagateFloat64NaN( a, b );
            return a;    /* inf + finite = inf */
        }
        if ( bExp == 0 ) {
            --expDiff;    /* subnormal b has no implicit bit, exponent off by 1 */
        }
        else {
            bSig |= LIT64( 0x2000000000000000 );    /* implicit 1 (shifted by 9) */
        }
        shift64RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a to b. */
        if ( bExp == 0x7FF ) {
            if ( bSig ) return propagateFloat64NaN( a, b );
            return packFloat64( zSign, 0x7FF, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponents: the sum needs no alignment shift. */
        if ( aExp == 0x7FF ) {
            if ( aSig | bSig ) return propagateFloat64NaN( a, b );
            return a;
        }
        if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;    /* both implicit 1s */
        zExp = aExp;
        goto roundAndPack;
    }
    aSig |= LIT64( 0x2000000000000000 );
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (sbits64) zSig < 0 ) {
        /* Sum carried into the next bit: undo the extra normalization shift. */
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat64( roundData, zSign, zExp, zSig );

}
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
/*
-------------------------------------------------------------------------------
Subtracts the magnitudes of the double-precision values `a' and `b'; the
result carries sign `zSign' if a's magnitude is larger, the opposite sign
otherwise.  Used by float64_add/float64_sub for the unlike-signed case.
Significands are pre-shifted left by 10 to keep guard bits.
-------------------------------------------------------------------------------
*/
static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
{
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    int16 expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    aSig <<= 10;
    bSig <<= 10;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FF ) {
        if ( aSig | bSig ) return propagateFloat64NaN( a, b );
        /* inf - inf is invalid. */
        roundData->exception |= float_flag_invalid;
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        /* Both subnormal: treat exponents as 1 (no implicit bit). */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero, negative only in round-down mode. */
    return packFloat64( roundData->mode == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b );
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;    /* subnormal exponent adjustment */
    }
    else {
        aSig |= LIT64( 0x4000000000000000 );    /* implicit 1 (shifted by 10) */
    }
    shift64RightJamming( aSig, - expDiff, &aSig );
    bSig |= LIT64( 0x4000000000000000 );
 bBigger:
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;    /* result takes b's effective sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FF ) {
        if ( aSig ) return propagateFloat64NaN( a, b );
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig |= LIT64( 0x4000000000000000 );
    }
    shift64RightJamming( bSig, expDiff, &bSig );
    aSig |= LIT64( 0x4000000000000000 );
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;    /* compensate the <<10 alignment before normalizing */
    return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );

}
2063
2064
2065
2066
2067
2068
2069
2070
2071 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2072 {
2073 flag aSign, bSign;
2074
2075 aSign = extractFloat64Sign( a );
2076 bSign = extractFloat64Sign( b );
2077 if ( aSign == bSign ) {
2078 return addFloat64Sigs( roundData, a, b, aSign );
2079 }
2080 else {
2081 return subFloat64Sigs( roundData, a, b, aSign );
2082 }
2083
2084 }
2085
2086
2087
2088
2089
2090
2091
2092
2093 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2094 {
2095 flag aSign, bSign;
2096
2097 aSign = extractFloat64Sign( a );
2098 bSign = extractFloat64Sign( b );
2099 if ( aSign == bSign ) {
2100 return subFloat64Sigs( roundData, a, b, aSign );
2101 }
2102 else {
2103 return addFloat64Sigs( roundData, a, b, aSign );
2104 }
2105
2106 }
2107
2108
2109
2110
2111
2112
2113
2114
/*
-------------------------------------------------------------------------------
Returns the product of the double-precision values `a' and `b', rounded per
`roundData'.  inf * 0 raises the invalid exception and returns the default
NaN.
-------------------------------------------------------------------------------
*/
float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    bSign = extractFloat64Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN( a, b );
        }
        if ( ( bExp | bSig ) == 0 ) {
            /* inf * 0 */
            roundData->exception |= float_flag_invalid;
            return float64_default_nan;
        }
        return packFloat64( zSign, 0x7FF, 0 );
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b );
        if ( ( aExp | aSig ) == 0 ) {
            /* 0 * inf */
            roundData->exception |= float_flag_invalid;
            return float64_default_nan;
        }
        return packFloat64( zSign, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FF;
    /* Left-align both significands (with implicit bit) so the 128-bit
       product's high half holds the result significand. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    zSig0 |= ( zSig1 != 0 );    /* fold low half into sticky bit */
    if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
        /* Product in [1,2): shift left one to normalize. */
        zSig0 <<= 1;
        --zExp;
    }
    return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );

}
2166
2167
2168
2169
2170
2171
2172
2173
/*
-------------------------------------------------------------------------------
Returns the quotient of the double-precision values `a' and `b', rounded
per `roundData'.  0/0 and inf/inf raise invalid; finite/0 raises
divide-by-zero and returns a signed infinity.
-------------------------------------------------------------------------------
*/
float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig;
    bits64 rem0, rem1;
    bits64 term0, term1;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    bSign = extractFloat64Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FF ) {
        if ( aSig ) return propagateFloat64NaN( a, b );
        if ( bExp == 0x7FF ) {
            if ( bSig ) return propagateFloat64NaN( a, b );
            /* inf / inf */
            roundData->exception |= float_flag_invalid;
            return float64_default_nan;
        }
        return packFloat64( zSign, 0x7FF, 0 );
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b );
        return packFloat64( zSign, 0, 0 );    /* finite / inf = 0 */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
                /* 0 / 0 */
                roundData->exception |= float_flag_invalid;
                return float64_default_nan;
            }
            roundData->exception |= float_flag_divbyzero;
            return packFloat64( zSign, 0x7FF, 0 );
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FD;
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( bSig <= ( aSig + aSig ) ) {
        /* Keep the quotient estimate below 2. */
        aSig >>= 1;
        ++zExp;
    }
    zSig = estimateDiv128To64( aSig, 0, bSig );
    if ( ( zSig & 0x1FF ) <= 2 ) {
        /* Estimate may straddle a rounding boundary: compute the exact
           remainder and correct, setting the sticky bit if inexact. */
        mul64To128( bSig, zSig, &term0, &term1 );
        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
        while ( (sbits64) rem0 < 0 ) {
            --zSig;
            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
        }
        zSig |= ( rem1 != 0 );
    }
    return roundAndPackFloat64( roundData, zSign, zExp, zSig );

}
2237
2238
2239
2240
2241
2242
2243
2244
/*
-------------------------------------------------------------------------------
Returns the IEC/IEEE remainder of the double-precision values `a' with
respect to `b' (result = a - n*b with n the integer nearest a/b, ties to
even).  inf REM x and x REM 0 raise the invalid exception.
-------------------------------------------------------------------------------
*/
float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
{
    flag aSign, bSign, zSign;
    int16 aExp, bExp, expDiff;
    bits64 aSig, bSig;
    bits64 q, alternateASig;
    sbits64 sigMean;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    bSign = extractFloat64Sign( b );
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN( a, b );
        }
        /* inf REM anything is invalid. */
        roundData->exception |= float_flag_invalid;
        return float64_default_nan;
    }
    if ( bExp == 0x7FF ) {
        if ( bSig ) return propagateFloat64NaN( a, b );
        return a;    /* finite REM inf = a */
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* x REM 0 is invalid. */
            roundData->exception |= float_flag_invalid;
            return float64_default_nan;
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        if ( expDiff < -1 ) return a;    /* |a| < |b|/2: a is already the remainder */
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce 62 quotient bits per iteration until the exponents meet. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;    /* under-estimate to keep remainder >= 0 */
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Step past zero so aSig and alternateASig bracket the true remainder. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (sbits64) aSig );
    sigMean = aSig + alternateASig;
    /* Pick the candidate nearer zero; on a tie choose the even quotient. */
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    zSign = ( (sbits64) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );

}
2323
2324
2325
2326
2327
2328
2329
2330
/*
-------------------------------------------------------------------------------
Returns the square root of the double-precision value `a', rounded per
`roundData'.  NaNs propagate, sqrt(+inf) = +inf, sqrt(+-0) = +-0, and any
other negative input raises the invalid exception and returns the default
NaN.
-------------------------------------------------------------------------------
*/
float64 float64_sqrt( struct roundingData *roundData, float64 a )
{
    flag aSign;
    int16 aExp, zExp;
    bits64 aSig, zSig;
    bits64 rem0, rem1, term0, term1;


    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    if ( aExp == 0x7FF ) {
        if ( aSig ) return propagateFloat64NaN( a, a );
        if ( ! aSign ) return a;    /* sqrt(+inf) = +inf */
        roundData->exception |= float_flag_invalid;
        return float64_default_nan;
    }
    if ( aSign ) {
        if ( ( aExp | aSig ) == 0 ) return a;    /* sqrt(-0) = -0 */
        roundData->exception |= float_flag_invalid;
        return float64_default_nan;
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return 0;    /* sqrt(+0) = +0 */
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    /* Result exponent is half the unbiased input exponent, rebiased. */
    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
    aSig |= LIT64( 0x0010000000000000 );
    /* One Newton-like refinement of a 32-bit sqrt estimate gives ~64 bits. */
    zSig = estimateSqrt32( aExp, aSig>>21 );
    zSig <<= 31;
    aSig <<= 9 - ( aExp & 1 );    /* align for odd exponents */
    zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
    if ( ( zSig & 0x3FF ) <= 5 ) {
        /* Estimate too close to a rounding boundary; refine it exactly. */
        if ( zSig < 2 ) {
            zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
        }
        else {
            aSig <<= 2;
            mul64To128( zSig, zSig, &term0, &term1 );
            sub128( aSig, 0, term0, term1, &rem0, &rem1 );
            while ( (sbits64) rem0 < 0 ) {
                /* Estimate one too high: step down and fix the remainder. */
                --zSig;
                shortShift128Left( 0, zSig, 1, &term0, &term1 );
                term1 |= 1;
                add128( rem0, rem1, term0, term1, &rem0, &rem1 );
            }
            zSig |= ( ( rem0 | rem1 ) != 0 );    /* sticky bit for inexactness */
        }
    }
    shift64RightJamming( zSig, 1, &zSig );
    return roundAndPackFloat64( roundData, 0, zExp, zSig );

}
2384
2385
2386
2387
2388
2389
2390
2391
2392 flag float64_eq( float64 a, float64 b )
2393 {
2394
2395 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2397 ) {
2398 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2399 float_raise( float_flag_invalid );
2400 }
2401 return 0;
2402 }
2403 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2404
2405 }
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415 flag float64_le( float64 a, float64 b )
2416 {
2417 flag aSign, bSign;
2418
2419 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2421 ) {
2422 float_raise( float_flag_invalid );
2423 return 0;
2424 }
2425 aSign = extractFloat64Sign( a );
2426 bSign = extractFloat64Sign( b );
2427 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428 return ( a == b ) || ( aSign ^ ( a < b ) );
2429
2430 }
2431
2432
2433
2434
2435
2436
2437
2438
2439 flag float64_lt( float64 a, float64 b )
2440 {
2441 flag aSign, bSign;
2442
2443 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2445 ) {
2446 float_raise( float_flag_invalid );
2447 return 0;
2448 }
2449 aSign = extractFloat64Sign( a );
2450 bSign = extractFloat64Sign( b );
2451 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452 return ( a != b ) && ( aSign ^ ( a < b ) );
2453
2454 }
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464 flag float64_eq_signaling( float64 a, float64 b )
2465 {
2466
2467 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2469 ) {
2470 float_raise( float_flag_invalid );
2471 return 0;
2472 }
2473 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2474
2475 }
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485 flag float64_le_quiet( float64 a, float64 b )
2486 {
2487 flag aSign, bSign;
2488
2489
2490 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2492 ) {
2493
2494 return 0;
2495 }
2496 aSign = extractFloat64Sign( a );
2497 bSign = extractFloat64Sign( b );
2498 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2499 return ( a == b ) || ( aSign ^ ( a < b ) );
2500
2501 }
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511 flag float64_lt_quiet( float64 a, float64 b )
2512 {
2513 flag aSign, bSign;
2514
2515 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2516 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2517 ) {
2518
2519 return 0;
2520 }
2521 aSign = extractFloat64Sign( a );
2522 bSign = extractFloat64Sign( b );
2523 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2524 return ( a != b ) && ( aSign ^ ( a < b ) );
2525
2526 }
2527
2528 #ifdef FLOATX80
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
/*
-------------------------------------------------------------------------------
Converts the extended-precision value `a' to a 32-bit signed integer using
the rounding mode in `roundData'.  NaNs have their sign cleared so they
saturate positive via roundAndPackInt32.
-------------------------------------------------------------------------------
*/
int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
{
    flag aSign;
    int32 aExp, shiftCount;
    bits64 aSig;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* NaN (exponent all ones, nonzero fraction below the integer bit). */
    if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
    shiftCount = 0x4037 - aExp;
    if ( shiftCount <= 0 ) shiftCount = 1;    /* huge values still overflow in rounding */
    shift64RightJamming( aSig, shiftCount, &aSig );
    return roundAndPackInt32( roundData, aSign, aSig );

}
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
/*
-------------------------------------------------------------------------------
Converts the extended-precision value `a' to a 32-bit signed integer,
always rounding toward zero.  Out-of-range values and NaNs raise the
invalid exception and saturate; discarded bits raise inexact.
-------------------------------------------------------------------------------
*/
int32 floatx80_to_int32_round_to_zero( floatx80 a )
{
    flag aSign;
    int32 aExp, shiftCount;
    bits64 aSig, savedASig;
    int32 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    shiftCount = 0x403E - aExp;
    if ( shiftCount < 32 ) {
        /* |a| >= 2^31 (or NaN/inf): cannot fit. */
        if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
        goto invalid;
    }
    else if ( 63 < shiftCount ) {
        /* |a| < 1: truncates to zero; inexact unless exactly +-0. */
        if ( aExp || aSig ) float_raise( float_flag_inexact );
        return 0;
    }
    savedASig = aSig;
    aSig >>= shiftCount;    /* truncate fraction */
    z = aSig;
    if ( aSign ) z = - z;
    if ( ( z < 0 ) ^ aSign ) {
        /* Negation changed the sign: overflowed int32. */
 invalid:
        float_raise( float_flag_invalid );
        return aSign ? 0x80000000 : 0x7FFFFFFF;
    }
    if ( ( aSig<<shiftCount ) != savedASig ) {
        float_raise( float_flag_inexact );
    }
    return z;

}
2603
2604
2605
2606
2607
2608
2609
2610
2611
/*
-------------------------------------------------------------------------------
Converts the extended-precision value `a' to single precision, rounding per
`roundData'.  NaNs pass through the common NaN format.
-------------------------------------------------------------------------------
*/
float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
{
    flag aSign;
    int32 aExp;
    bits64 aSig;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig<<1 ) ) {
            return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
        }
        return packFloat32( aSign, 0xFF, 0 );    /* infinity */
    }
    /* Narrow the 64-bit significand, jamming discarded bits sticky. */
    shift64RightJamming( aSig, 33, &aSig );
    if ( aExp || aSig ) aExp -= 0x3F81;    /* rebias extended -> single */
    return roundAndPackFloat32( roundData, aSign, aExp, aSig );

}
2632
2633
2634
2635
2636
2637
2638
2639
2640
/*
-------------------------------------------------------------------------------
Converts the extended-precision value `a' to double precision, rounding per
`roundData'.  NaNs pass through the common NaN format.
-------------------------------------------------------------------------------
*/
float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
{
    flag aSign;
    int32 aExp;
    bits64 aSig, zSig;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig<<1 ) ) {
            return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
        }
        return packFloat64( aSign, 0x7FF, 0 );    /* infinity */
    }
    /* Shift out the explicit integer bit, keeping a sticky bit. */
    shift64RightJamming( aSig, 1, &zSig );
    if ( aExp || aSig ) aExp -= 0x3C01;    /* rebias extended -> double */
    return roundAndPackFloat64( roundData, aSign, aExp, zSig );

}
2661
2662
2663
2664
2665
2666
2667
2668
2669
/*
-------------------------------------------------------------------------------
Rounds the extended-precision value `a' to an integer-valued extended
float, using the rounding mode in `roundData'.  Raises inexact whenever the
result differs from the input.
-------------------------------------------------------------------------------
*/
floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
{
    flag aSign;
    int32 aExp;
    bits64 lastBitMask, roundBitsMask;
    int8 roundingMode;
    floatx80 z;

    aExp = extractFloatx80Exp( a );
    if ( 0x403E <= aExp ) {
        /* Already integral (or NaN/inf). */
        if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
            return propagateFloatx80NaN( a, a );
        }
        return a;
    }
    if ( aExp <= 0x3FFE ) {
        /* |a| < 1: result is 0 or +-1 depending on the rounding mode. */
        if ( ( aExp == 0 )
             && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
            return a;    /* +-0 stays exact */
        }
        roundData->exception |= float_flag_inexact;
        aSign = extractFloatx80Sign( a );
        switch ( roundData->mode ) {
         case float_round_nearest_even:
            if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
               ) {
                return
                    packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
            }
            break;
         case float_round_down:
            return
                  aSign ?
                      packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
                : packFloatx80( 0, 0, 0 );
         case float_round_up:
            return
                  aSign ? packFloatx80( 1, 0, 0 )
                : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
        }
        return packFloatx80( aSign, 0, 0 );
    }
    /* General case: mask fraction bits below the units place in z.low. */
    lastBitMask = 1;
    lastBitMask <<= 0x403E - aExp;
    roundBitsMask = lastBitMask - 1;
    z = a;
    roundingMode = roundData->mode;
    if ( roundingMode == float_round_nearest_even ) {
        z.low += lastBitMask>>1;
        if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;    /* tie: even */
    }
    else if ( roundingMode != float_round_to_zero ) {
        if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
            z.low += roundBitsMask;
        }
    }
    z.low &= ~ roundBitsMask;
    if ( z.low == 0 ) {
        /* Significand overflowed to zero: carry into the exponent and
           restore the explicit integer bit. */
        ++z.high;
        z.low = LIT64( 0x8000000000000000 );
    }
    if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
    return z;

}
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
/*
-------------------------------------------------------------------------------
Adds the magnitudes of the extended-precision values `a' and `b', attaching
sign `zSign'.  Used by floatx80_add/floatx80_sub for the like-signed case.
The extended significand carries an explicit integer bit, so overflow is
detected via the top bit of the 64-bit sum.
-------------------------------------------------------------------------------
*/
static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
{
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    int32 expDiff;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* a has the larger exponent: align b, spilling into zSig1. */
        if ( aExp == 0x7FFF ) {
            if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
            return a;
        }
        if ( bExp == 0 ) --expDiff;    /* subnormal exponent adjustment */
        shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: align a. */
        if ( bExp == 0x7FFF ) {
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( aExp == 0 ) ++expDiff;
        shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
                return propagateFloatx80NaN( a, b );
            }
            return a;
        }
        zSig1 = 0;
        zSig0 = aSig + bSig;
        if ( aExp == 0 ) {
            normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
            goto roundAndPack;
        }
        zExp = aExp;
        goto shiftRight1;
    }

    zSig0 = aSig + bSig;

    if ( (sbits64) zSig0 < 0 ) goto roundAndPack;    /* no carry out */
 shiftRight1:
    /* Sum carried past the integer bit: shift right one and bump exponent. */
    shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= LIT64( 0x8000000000000000 );
    ++zExp;
 roundAndPack:
    return
        roundAndPackFloatx80(
            roundData, zSign, zExp, zSig0, zSig1 );

}
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
/*
-------------------------------------------------------------------------------
Subtracts the magnitudes of the extended-precision values `a' and `b'; the
result carries sign `zSign' if a's magnitude is larger, the opposite sign
otherwise.  Used by floatx80_add/floatx80_sub for the unlike-signed case.
-------------------------------------------------------------------------------
*/
static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
{
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    int32 expDiff;
    floatx80 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
            return propagateFloatx80NaN( a, b );
        }
        /* inf - inf is invalid: build the default NaN by hand. */
        roundData->exception |= float_flag_invalid;
        z.low = floatx80_default_nan_low;
        z.high = floatx80_default_nan_high;
        z.__padding = 0;
        return z;
    }
    if ( aExp == 0 ) {
        /* Both subnormal: treat exponents as 1. */
        aExp = 1;
        bExp = 1;
    }
    zSig1 = 0;
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: zero, negative only in round-down mode. */
    return packFloatx80( roundData->mode == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( aExp == 0 ) ++expDiff;
    shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
 bBigger:
    sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;    /* result takes b's effective sign */
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
        return a;
    }
    if ( bExp == 0 ) --expDiff;
    shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
 aBigger:
    sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    return
        normalizeRoundAndPackFloatx80(
            roundData, zSign, zExp, zSig0, zSig1 );

}
2874
2875
2876
2877
2878
2879
2880
2881
2882 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2883 {
2884 flag aSign, bSign;
2885
2886 aSign = extractFloatx80Sign( a );
2887 bSign = extractFloatx80Sign( b );
2888 if ( aSign == bSign ) {
2889 return addFloatx80Sigs( roundData, a, b, aSign );
2890 }
2891 else {
2892 return subFloatx80Sigs( roundData, a, b, aSign );
2893 }
2894
2895 }
2896
2897
2898
2899
2900
2901
2902
2903
2904 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2905 {
2906 flag aSign, bSign;
2907
2908 aSign = extractFloatx80Sign( a );
2909 bSign = extractFloatx80Sign( b );
2910 if ( aSign == bSign ) {
2911 return subFloatx80Sigs( roundData, a, b, aSign );
2912 }
2913 else {
2914 return addFloatx80Sigs( roundData, a, b, aSign );
2915 }
2916
2917 }
2918
2919
2920
2921
2922
2923
2924
2925
/*
-------------------------------------------------------------------------------
Returns the result of multiplying the extended double-precision floating-
point values `a' and `b'.  The operation is performed according to the
IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    floatx80 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN or infinity (the <<1 drops the explicit integer
           bit, so a nonzero result means a NaN fraction). */
        if ( (bits64) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN( a, b );
        }
        /* infinity * 0 is invalid. */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            /* 0 * infinity: raise invalid and return the default NaN. */
            roundData->exception |= float_flag_invalid;
            z.low = floatx80_default_nan_low;
            z.high = floatx80_default_nan_high;
            z.__padding = 0;
            return z;
        }
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Both significands are normalized (top bit set), so the 128-bit
       product needs at most a one-bit left shift to normalize. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    if ( 0 < (sbits64) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return
        roundAndPackFloatx80(
            roundData, zSign, zExp, zSig0, zSig1 );

}
2979
2980
2981
2982
2983
2984
2985
2986
/*
-------------------------------------------------------------------------------
Returns the result of dividing the extended double-precision floating-point
value `a' by the corresponding value `b'.  The operation is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    bits64 aSig, bSig, zSig0, zSig1;
    bits64 rem0, rem1, rem2, term0, term1, term2;
    floatx80 z;

    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
        if ( bExp == 0x7FFF ) {
            if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
            /* infinity / infinity is invalid. */
            goto invalid;
        }
        return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
    }
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        /* finite / infinity yields a (signed) zero. */
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                /* 0/0 or inf/inf: raise invalid, return the default NaN. */
                roundData->exception |= float_flag_invalid;
                z.low = floatx80_default_nan_low;
                z.high = floatx80_default_nan_high;
                z.__padding = 0;
                return z;
            }
            /* Nonzero / 0: raise divide-by-zero and return infinity. */
            roundData->exception |= float_flag_divbyzero;
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    if ( bSig <= aSig ) {
        /* Make the dividend smaller than the divisor so each 64-bit
           quotient estimate below cannot overflow. */
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High 64 quotient bits, with downward correction until the partial
       remainder is nonnegative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (sbits64) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low 64 quotient bits; refined only when they are small enough to
       possibly affect rounding. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    if ( (bits64) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (sbits64) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        /* Fold any leftover remainder into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return
        roundAndPackFloatx80(
            roundData, zSign, zExp, zSig0, zSig1 );

}
3061
3062
3063
3064
3065
3066
3067
3068
/*
-------------------------------------------------------------------------------
Returns the remainder of the extended double-precision floating-point value
`a' with respect to the corresponding value `b'.  The operation is performed
according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic
(round-to-nearest remainder, so the result may have either sign).
-------------------------------------------------------------------------------
*/
floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, expDiff;
    bits64 aSig0, aSig1, bSig;
    bits64 q, term0, term1, alternateASig0, alternateASig1;
    floatx80 z;

    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    /* NOTE(review): bSign is never read below; the remainder's sign is
       derived from `a' alone. */
    bSign = extractFloatx80Sign( b );
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig0<<1 )
             || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN( a, b );
        }
        /* remainder of infinity is invalid. */
        goto invalid;
    }
    if ( bExp == 0x7FFF ) {
        if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
        /* finite rem infinity is `a' unchanged. */
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
 invalid:
            /* rem by zero (or of infinity): raise invalid, default NaN. */
            roundData->exception |= float_flag_invalid;
            z.low = floatx80_default_nan_low;
            z.high = floatx80_default_nan_high;
            z.__padding = 0;
            return z;
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Make the divisor's integer bit explicit. */
    bSig |= LIT64( 0x8000000000000000 );
    zSign = aSign;
    expDiff = aExp - bExp;
    aSig1 = 0;
    if ( expDiff < 0 ) {
        /* |a| < |b|/2: `a' is already the remainder. */
        if ( expDiff < -1 ) return a;
        shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
        expDiff = 0;
    }
    /* First quotient bit. */
    q = ( bSig <= aSig0 );
    if ( q ) aSig0 -= bSig;
    expDiff -= 64;
    /* Reduce the dividend 62 quotient bits at a time (the estimate is
       backed off by 2 so it can never exceed the true quotient). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        mul64To128( bSig, q, &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
        expDiff -= 62;
    }
    /* Final partial quotient of `expDiff' (0..64) bits. */
    expDiff += 64;
    if ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
        sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
        /* Correct any under-estimate of q. */
        while ( le128( term0, term1, aSig0, aSig1 ) ) {
            ++q;
            sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
        }
    }
    else {
        term1 = 0;
        term0 = bSig;
    }
    /* Round to nearest: if the complementary remainder is smaller (or
       equal with q odd), use it and flip the sign. */
    sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
    if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
         || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
              && ( q & 1 ) )
       ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
        zSign = ! zSign;
    }

    return
        normalizeRoundAndPackFloatx80(
            roundData, zSign, bExp + expDiff, aSig0, aSig1 );

}
3161
3162
3163
3164
3165
3166
3167
3168
/*
-------------------------------------------------------------------------------
Returns the square root of the extended double-precision floating-point
value `a'.  The operation is performed according to the IEC/IEEE Standard
for Binary Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
{
    flag aSign;
    int32 aExp, zExp;
    bits64 aSig0, aSig1, zSig0, zSig1;
    bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    bits64 shiftedRem0, shiftedRem1;
    floatx80 z;

    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    if ( aExp == 0x7FFF ) {
        if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
        /* sqrt(+infinity) is +infinity. */
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* sqrt(-0) is -0; any other negative operand is invalid. */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        roundData->exception |= float_flag_invalid;
        z.low = floatx80_default_nan_low;
        z.high = floatx80_default_nan_high;
        z.__padding = 0;
        return z;
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent for the result. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Initial 32-bit root estimate, widened to a 64-bit approximation
       via one quotient-based refinement step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    zSig0 <<= 31;
    aSig1 = 0;
    shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
    if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
    shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
    /* Correct the high root word downward until rem = a - zSig0^2 >= 0. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (sbits64) rem0 < 0 ) {
        --zSig0;
        shortShift128Left( 0, zSig0, 1, &term0, &term1 );
        term1 |= 1;
        add128( rem0, rem1, term0, term1, &rem0, &rem1 );
    }
    /* Estimate the low 64 root bits from the remainder. */
    shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
    zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
    /* Refine the low word only when it is near a rounding boundary. */
    if ( (bits64) ( zSig1<<1 ) <= 10 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( zSig0, zSig1, &term1, &term2 );
        shortShift128Left( term1, term2, 1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (sbits64) rem1 < 0 ) {
            --zSig1;
            shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
            term3 |= 1;
            add192(
                rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero residual into the sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    return
        roundAndPackFloatx80(
            roundData, 0, zExp, zSig0, zSig1 );

}
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247 flag floatx80_eq( floatx80 a, floatx80 b )
3248 {
3249
3250 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3251 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3252 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3253 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3254 ) {
3255 if ( floatx80_is_signaling_nan( a )
3256 || floatx80_is_signaling_nan( b ) ) {
3257 float_raise( float_flag_invalid );
3258 }
3259 return 0;
3260 }
3261 return
3262 ( a.low == b.low )
3263 && ( ( a.high == b.high )
3264 || ( ( a.low == 0 )
3265 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3266 );
3267
3268 }
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278 flag floatx80_le( floatx80 a, floatx80 b )
3279 {
3280 flag aSign, bSign;
3281
3282 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3283 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3284 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3285 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3286 ) {
3287 float_raise( float_flag_invalid );
3288 return 0;
3289 }
3290 aSign = extractFloatx80Sign( a );
3291 bSign = extractFloatx80Sign( b );
3292 if ( aSign != bSign ) {
3293 return
3294 aSign
3295 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3296 == 0 );
3297 }
3298 return
3299 aSign ? le128( b.high, b.low, a.high, a.low )
3300 : le128( a.high, a.low, b.high, b.low );
3301
3302 }
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312 flag floatx80_lt( floatx80 a, floatx80 b )
3313 {
3314 flag aSign, bSign;
3315
3316 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3317 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3318 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3319 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3320 ) {
3321 float_raise( float_flag_invalid );
3322 return 0;
3323 }
3324 aSign = extractFloatx80Sign( a );
3325 bSign = extractFloatx80Sign( b );
3326 if ( aSign != bSign ) {
3327 return
3328 aSign
3329 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3330 != 0 );
3331 }
3332 return
3333 aSign ? lt128( b.high, b.low, a.high, a.low )
3334 : lt128( a.high, a.low, b.high, b.low );
3335
3336 }
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3347 {
3348
3349 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3350 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3351 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3352 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3353 ) {
3354 float_raise( float_flag_invalid );
3355 return 0;
3356 }
3357 return
3358 ( a.low == b.low )
3359 && ( ( a.high == b.high )
3360 || ( ( a.low == 0 )
3361 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3362 );
3363
3364 }
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3375 {
3376 flag aSign, bSign;
3377
3378 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3379 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3380 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3381 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3382 ) {
3383
3384 return 0;
3385 }
3386 aSign = extractFloatx80Sign( a );
3387 bSign = extractFloatx80Sign( b );
3388 if ( aSign != bSign ) {
3389 return
3390 aSign
3391 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3392 == 0 );
3393 }
3394 return
3395 aSign ? le128( b.high, b.low, a.high, a.low )
3396 : le128( a.high, a.low, b.high, b.low );
3397
3398 }
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3409 {
3410 flag aSign, bSign;
3411
3412 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3413 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3414 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3415 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3416 ) {
3417
3418 return 0;
3419 }
3420 aSign = extractFloatx80Sign( a );
3421 bSign = extractFloatx80Sign( b );
3422 if ( aSign != bSign ) {
3423 return
3424 aSign
3425 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3426 != 0 );
3427 }
3428 return
3429 aSign ? lt128( b.high, b.low, a.high, a.low )
3430 : lt128( a.high, a.low, b.high, b.low );
3431
3432 }
3433
3434 #endif
3435