include/linux/reciprocal_div.h

0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #ifndef _LINUX_RECIPROCAL_DIV_H
0003 #define _LINUX_RECIPROCAL_DIV_H
0004
0005 #include <linux/types.h>
0006
0007 /*
0008  * This algorithm is based on the paper "Division by Invariant
0009  * Integers Using Multiplication" by Torbjörn Granlund and Peter
0010  * L. Montgomery.
0011  *
0012  * The assembler implementation from Agner Fog, which this code is
0013  * based on, can be found here:
0014  * http://www.agner.org/optimize/asmlib.zip
0015  *
0016  * This optimization for A/B is helpful if the divisor B is mostly
0017  * runtime invariant. The reciprocal of B is calculated in the
0018  * slow-path with reciprocal_value(). The fast-path can then just use
0019  * a much faster multiplication operation with a variable dividend A
0020  * to calculate the division A/B.
0021  */
0022
0023 struct reciprocal_value {
0024     u32 m;
0025     u8 sh1, sh2;
0026 };
0027
/*
 * "reciprocal_value" and "reciprocal_divide" together implement the basic
 * version of the algorithm described in Figure 4.1 of the paper "Division by
 * Invariant Integers Using Multiplication" (Granlund and Montgomery).
 *
 * reciprocal_value() is the slow path: for a divisor d it precomputes the
 * multiplier and shift counts that reciprocal_divide() later uses to divide
 * by d with a multiplication instead of a division.
 *
 * NOTE(review): the behaviour for d == 0 is not visible from this header;
 * confirm against the implementation before relying on it.
 */
struct reciprocal_value reciprocal_value(u32 d);
0032
0033 static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
0034 {
0035     u32 t = (u32)(((u64)a * R.m) >> 32);
0036     return (t + ((a - t) >> R.sh1)) >> R.sh2;
0037 }
0038
0039 struct reciprocal_value_adv {
0040     u32 m;
0041     u8 sh, exp;
0042     bool is_wide_m;
0043 };
0044
/*
 * "reciprocal_value_adv" implements the advanced version of the algorithm
 * described in Figure 4.2 of the paper "Division by Invariant Integers Using
 * Multiplication" (Granlund and Montgomery), except when
 * "divisor > (1U << 31)", whose ceil(log2(d)) result will be 32, which then
 * requires a u128 divide on the host. The exception case can easily be handled
 * before calling "reciprocal_value_adv".
 *
 * The advanced version requires a more complex calculation to get the
 * reciprocal multiplier and other control variables, but can then reduce the
 * number of required emulation operations.
 *
 * It makes no sense to use this advanced version for host divide emulation:
 * the extra complexity of calculating the multiplier etc. could completely
 * cancel out what we save on emulation operations.
 *
 * However, it makes sense to use it for JIT divide code generation, for which
 * we are willing to trade host performance for performance of the JITed code.
 * As shown by the following pseudo code, the required emulation operations
 * could go down from 6 (the basic version) to 3 or 4.
 *
 * In the pseudo code, d is the divisor and n the dividend. The second argument
 * is passed as 32, or as 32 - pre_shift once the divisor has been pre-shifted
 * (presumably the number of significant dividend bits - confirm in the
 * implementation).
 *
 * To use the result of "reciprocal_value_adv", suppose we want to calculate
 * n/d, the pseudo C code will be:
 *
 *   struct reciprocal_value_adv rvalue;
 *   u8 pre_shift, exp;
 *
 *   // handle exception case.
 *   if (d >= (1U << 31)) {
 *     result = n >= d;
 *     return;
 *   }
 *
 *   rvalue = reciprocal_value_adv(d, 32);
 *   exp = rvalue.exp;
 *   if (rvalue.is_wide_m && !(d & 1)) {
 *     // floor(log2(d & (2^32 - d)))
 *     pre_shift = fls(d & -d) - 1;
 *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
 *   } else {
 *     pre_shift = 0;
 *   }
 *
 *   // code generation starts.
 *   if (d == 1U << exp) {
 *     result = n >> exp;
 *   } else if (rvalue.is_wide_m) {
 *     // pre_shift must be zero when reached here.
 *     t = ((u64)n * rvalue.m) >> 32;
 *     result = n - t;
 *     result >>= 1;
 *     result += t;
 *     result >>= rvalue.sh - 1;
 *   } else {
 *     result = n;
 *     // the pre-shift is only emitted when it is needed.
 *     if (pre_shift)
 *       result >>= pre_shift;
 *     result = ((u64)result * rvalue.m) >> 32;
 *     result >>= rvalue.sh;
 *   }
 */
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
0103
0104 #endif /* _LINUX_RECIPROCAL_DIV_H */