mips/math-emu/dp_sqrt.c

0001 // SPDX-License-Identifier: GPL-2.0-only
0002 /* IEEE754 floating point arithmetic
0003  * double precision square root
0004  */
0005 /*
0006  * MIPS floating point support
0007  * Copyright (C) 1994-2000 Algorithmics Ltd.
0008  */
0009
0010 #include "ieee754dp.h"
0011
0012 static const unsigned int table[] = {
0013     0, 1204, 3062, 5746, 9193, 13348, 18162, 23592,
0014     29598, 36145, 43202, 50740, 58733, 67158, 75992,
0015     85215, 83599, 71378, 60428, 50647, 41945, 34246,
0016     27478, 21581, 16499, 12183, 8588, 5674, 3403,
0017     1742, 661, 130
0018 };
0019
0020 union ieee754dp ieee754dp_sqrt(union ieee754dp x)
0021 {
0022     struct _ieee754_csr oldcsr;
0023     union ieee754dp y, z, t;
0024     unsigned int scalx, yh;
0025     COMPXDP;
0026
0027     EXPLODEXDP;
0028     ieee754_clearcx();
0029     FLUSHXDP;
0030
0031     /* x == INF or NAN? */
0032     switch (xc) {
0033     case IEEE754_CLASS_SNAN:
0034         return ieee754dp_nanxcpt(x);
0035
0036     case IEEE754_CLASS_QNAN:
0037         /* sqrt(Nan) = Nan */
0038         return x;
0039
0040     case IEEE754_CLASS_ZERO:
0041         /* sqrt(0) = 0 */
0042         return x;
0043
0044     case IEEE754_CLASS_INF:
0045         if (xs) {
0046             /* sqrt(-Inf) = Nan */
0047             ieee754_setcx(IEEE754_INVALID_OPERATION);
0048             return ieee754dp_indef();
0049         }
0050         /* sqrt(+Inf) = Inf */
0051         return x;
0052
0053     case IEEE754_CLASS_DNORM:
0054         DPDNORMX;
0055         fallthrough;
0056     case IEEE754_CLASS_NORM:
0057         if (xs) {
0058             /* sqrt(-x) = Nan */
0059             ieee754_setcx(IEEE754_INVALID_OPERATION);
0060             return ieee754dp_indef();
0061         }
0062         break;
0063     }
0064
0065     /* save old csr; switch off INX enable & flag; set RN rounding */
0066     oldcsr = ieee754_csr;
0067     ieee754_csr.mx &= ~IEEE754_INEXACT;
0068     ieee754_csr.sx &= ~IEEE754_INEXACT;
0069     ieee754_csr.rm = FPU_CSR_RN;
0070
0071     /* adjust exponent to prevent overflow */
0072     scalx = 0;
0073     if (xe > 512) {     /* x > 2**-512? */
0074         xe -= 512;  /* x = x / 2**512 */
0075         scalx += 256;
0076     } else if (xe < -512) { /* x < 2**-512? */
0077         xe += 512;  /* x = x * 2**512 */
0078         scalx -= 256;
0079     }
0080
0081     x = builddp(0, xe + DP_EBIAS, xm & ~DP_HIDDEN_BIT);
0082     y = x;
0083
0084     /* magic initial approximation to almost 8 sig. bits */
0085     yh = y.bits >> 32;
0086     yh = (yh >> 1) + 0x1ff80000;
0087     yh = yh - table[(yh >> 15) & 31];
0088     y.bits = ((u64) yh << 32) | (y.bits & 0xffffffff);
0089
0090     /* Heron's rule once with correction to improve to ~18 sig. bits */
0091     /* t=x/y; y=y+t; py[n0]=py[n0]-0x00100006; py[n1]=0; */
0092     t = ieee754dp_div(x, y);
0093     y = ieee754dp_add(y, t);
0094     y.bits -= 0x0010000600000000LL;
0095     y.bits &= 0xffffffff00000000LL;
0096
0097     /* triple to almost 56 sig. bits: y ~= sqrt(x) to within 1 ulp */
0098     /* t=y*y; z=t;  pt[n0]+=0x00100000; t+=z; z=(x-z)*y; */
0099     t = ieee754dp_mul(y, y);
0100     z = t;
0101     t.bexp += 0x001;
0102     t = ieee754dp_add(t, z);
0103     z = ieee754dp_mul(ieee754dp_sub(x, z), y);
0104
0105     /* t=z/(t+x) ;  pt[n0]+=0x00100000; y+=t; */
0106     t = ieee754dp_div(z, ieee754dp_add(t, x));
0107     t.bexp += 0x001;
0108     y = ieee754dp_add(y, t);
0109
0110     /* twiddle last bit to force y correctly rounded */
0111
0112     /* set RZ, clear INEX flag */
0113     ieee754_csr.rm = FPU_CSR_RZ;
0114     ieee754_csr.sx &= ~IEEE754_INEXACT;
0115
0116     /* t=x/y; ...chopped quotient, possibly inexact */
0117     t = ieee754dp_div(x, y);
0118
0119     if (ieee754_csr.sx & IEEE754_INEXACT || t.bits != y.bits) {
0120
0121         if (!(ieee754_csr.sx & IEEE754_INEXACT))
0122             /* t = t-ulp */
0123             t.bits -= 1;
0124
0125         /* add inexact to result status */
0126         oldcsr.cx |= IEEE754_INEXACT;
0127         oldcsr.sx |= IEEE754_INEXACT;
0128
0129         switch (oldcsr.rm) {
0130         case FPU_CSR_RU:
0131             y.bits += 1;
0132             fallthrough;
0133         case FPU_CSR_RN:
0134             t.bits += 1;
0135             break;
0136         }
0137
0138         /* y=y+t; ...chopped sum */
0139         y = ieee754dp_add(y, t);
0140
0141         /* adjust scalx for correctly rounded sqrt(x) */
0142         scalx -= 1;
0143     }
0144
0145     /* py[n0]=py[n0]+scalx; ...scale back y */
0146     y.bexp += scalx;
0147
0148     /* restore rounding mode, possibly set inexact */
0149     ieee754_csr = oldcsr;
0150
0151     return y;
0152 }