![]() |
|
|||
0001 /* SPDX-License-Identifier: GPL-2.0 */ 0002 #ifndef _LINUX_RECIPROCAL_DIV_H 0003 #define _LINUX_RECIPROCAL_DIV_H 0004 0005 #include <linux/types.h> 0006 0007 /* 0008 * This algorithm is based on the paper "Division by Invariant 0009 * Integers Using Multiplication" by Torbjörn Granlund and Peter 0010 * L. Montgomery. 0011 * 0012 * The assembler implementation from Agner Fog, which this code is 0013 * based on, can be found here: 0014 * http://www.agner.org/optimize/asmlib.zip 0015 * 0016 * This optimization for A/B is helpful if the divisor B is mostly 0017 * runtime invariant. The reciprocal of B is calculated in the 0018 * slow-path with reciprocal_value(). The fast-path can then just use 0019 * a much faster multiplication operation with a variable dividend A 0020 * to calculate the division A/B. 0021 */ 0022 0023 struct reciprocal_value { 0024 u32 m; 0025 u8 sh1, sh2; 0026 }; 0027 0028 /* "reciprocal_value" and "reciprocal_divide" together implement the basic 0029 * version of the algorithm described in Figure 4.1 of the paper. 0030 */ 0031 struct reciprocal_value reciprocal_value(u32 d); 0032 0033 static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) 0034 { 0035 u32 t = (u32)(((u64)a * R.m) >> 32); 0036 return (t + ((a - t) >> R.sh1)) >> R.sh2; 0037 } 0038 0039 struct reciprocal_value_adv { 0040 u32 m; 0041 u8 sh, exp; 0042 bool is_wide_m; 0043 }; 0044 0045 /* "reciprocal_value_adv" implements the advanced version of the algorithm 0046 * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose 0047 * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The 0048 * exception case could be easily handled before calling "reciprocal_value_adv". 0049 * 0050 * The advanced version requires more complex calculation to get the reciprocal 0051 * multiplier and other control variables, but then could reduce the required 0052 * emulation operations. 0053 * 0054 * It makes no sense to use this advanced version for host divide emulation, 0055 * those extra complexities for calculating multiplier etc could completely 0056 * waive our saving on emulation operations. 0057 * 0058 * However, it makes sense to use it for JIT divide code generation for which 0059 * we are willing to trade performance of JITed code with that of host. As shown 0060 * by the following pseudo code, the required emulation operations could go down 0061 * from 6 (the basic version) to 3 or 4. 0062 * 0063 * To use the result of "reciprocal_value_adv", suppose we want to calculate 0064 * n/d, the pseudo C code will be: 0065 * 0066 * struct reciprocal_value_adv rvalue; 0067 * u8 pre_shift, exp; 0068 * 0069 * // handle exception case. 0070 * if (d >= (1U << 31)) { 0071 * result = n >= d; 0072 * return; 0073 * } 0074 * 0075 * rvalue = reciprocal_value_adv(d, 32) 0076 * exp = rvalue.exp; 0077 * if (rvalue.is_wide_m && !(d & 1)) { 0078 * // floor(log2(d & (2^32 -d))) 0079 * pre_shift = fls(d & -d) - 1; 0080 * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift); 0081 * } else { 0082 * pre_shift = 0; 0083 * } 0084 * 0085 * // code generation starts. 0086 * if (imm == 1U << exp) { 0087 * result = n >> exp; 0088 * } else if (rvalue.is_wide_m) { 0089 * // pre_shift must be zero when reached here. 0090 * t = (n * rvalue.m) >> 32; 0091 * result = n - t; 0092 * result >>= 1; 0093 * result += t; 0094 * result >>= rvalue.sh - 1; 0095 * } else { 0096 * if (pre_shift) 0097 * result = n >> pre_shift; 0098 * result = ((u64)result * rvalue.m) >> 32; 0099 * result >>= rvalue.sh; 0100 * } 0101 */ 0102 struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec); 0103 0104 #endif /* _LINUX_RECIPROCAL_DIV_H */
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.1.0 LXR engine. The LXR team |
![]() ![]() |