lib: reciprocal_div: implement the improved algorithm on the paper mentioned

author Jiong Wang <jiong.wang@netronome.com>

Fri, 6 Jul 2018 22:13:18 +0000 (15:13 -0700)

committer Daniel Borkmann <daniel@iogearbox.net>

Fri, 6 Jul 2018 23:45:31 +0000 (01:45 +0200)
author Jiong Wang <jiong.wang@netronome.com>
Fri, 6 Jul 2018 22:13:18 +0000 (15:13 -0700)
committer Daniel Borkmann <daniel@iogearbox.net>
Fri, 6 Jul 2018 23:45:31 +0000 (01:45 +0200)
diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h

index e031e9f2f9d85f1330a9f38b1a82c7f322f00f3e..585ce89c0f336d0d5ab6895be6ccf9fe1b0a1c02 100644 (file)
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -25,6 +25,9 @@ struct reciprocal_value {
         u8 sh1, sh2;
  };
  
+/* "reciprocal_value" and "reciprocal_divide" together implement the basic
+ * version of the algorithm described in Figure 4.1 of the paper.
+ */
  struct reciprocal_value reciprocal_value(u32 d);
  
  static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
@@ -33,4 +36,69 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
         return (t + ((a - t) >> R.sh1)) >> R.sh2;
  }
  
+struct reciprocal_value_adv {
+       u32 m;
+       u8 sh, exp;
+       bool is_wide_m;
+};
+
+/* "reciprocal_value_adv" implements the advanced version of the algorithm
+ * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
+ * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
+ * exception case could be easily handled before calling "reciprocal_value_adv".
+ *
+ * The advanced version requires more complex calculation to get the reciprocal
+ * multiplier and other control variables, but then could reduce the required
+ * emulation operations.
+ *
+ * It makes no sense to use this advanced version for host divide emulation,
+ * those extra complexities for calculating multiplier etc could completely
+ * waive our saving on emulation operations.
+ *
+ * However, it makes sense to use it for JIT divide code generation for which
+ * we are willing to trade performance of JITed code with that of host. As shown
+ * by the following pseudo code, the required emulation operations could go down
+ * from 6 (the basic version) to 3 or 4.
+ *
+ * To use the result of "reciprocal_value_adv", suppose we want to calculate
+ * n/d, the pseudo C code will be:
+ *
+ *   struct reciprocal_value_adv rvalue;
+ *   u8 pre_shift, exp;
+ *
+ *   // handle exception case.
+ *   if (d >= (1U << 31)) {
+ *     result = n >= d;
+ *     return;
+ *   }
+ *
+ *   rvalue = reciprocal_value_adv(d, 32)
+ *   exp = rvalue.exp;
+ *   if (rvalue.is_wide_m && !(d & 1)) {
+ *     // floor(log2(d & (2^32 -d)))
+ *     pre_shift = fls(d & -d) - 1;
+ *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
+ *   } else {
+ *     pre_shift = 0;
+ *   }
+ *
+ *   // code generation starts.
+ *   if (imm == 1U << exp) {
+ *     result = n >> exp;
+ *   } else if (rvalue.is_wide_m) {
+ *     // pre_shift must be zero when reached here.
+ *     t = (n * rvalue.m) >> 32;
+ *     result = n - t;
+ *     result >>= 1;
+ *     result += t;
+ *     result >>= rvalue.sh - 1;
+ *   } else {
+ *     if (pre_shift)
+ *       result = n >> pre_shift;
+ *     result = ((u64)result * rvalue.m) >> 32;
+ *     result >>= rvalue.sh;
+ *   }
+ */
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
+
  #endif /* _LINUX_RECIPROCAL_DIV_H */
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c

index fcb4ce682c6fae37d71aa6138007b51cd0d8a2b3..bf043258fa0082b4cc1100a982d2ed38e21b1c0e 100644 (file)
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -1,4 +1,5 @@
  // SPDX-License-Identifier: GPL-2.0
+#include <linux/bug.h>
  #include <linux/kernel.h>
  #include <asm/div64.h>
  #include <linux/reciprocal_div.h>
@@ -26,3 +27,43 @@ struct reciprocal_value reciprocal_value(u32 d)
         return R;
  }
  EXPORT_SYMBOL(reciprocal_value);
+
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
+{
+       struct reciprocal_value_adv R;
+       u32 l, post_shift;
+       u64 mhigh, mlow;
+
+       /* ceil(log2(d)) */
+       l = fls(d - 1);
+       /* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to
+        * be handled before calling "reciprocal_value_adv", please see the
+        * comment at include/linux/reciprocal_div.h.
+        */
+       WARN(l == 32,
+            "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor",
+            d, __func__);
+       post_shift = l;
+       mlow = 1ULL << (32 + l);
+       do_div(mlow, d);
+       mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
+       do_div(mhigh, d);
+
+       for (; post_shift > 0; post_shift--) {
+               u64 lo = mlow >> 1, hi = mhigh >> 1;
+
+               if (lo >= hi)
+                       break;
+
+               mlow = lo;
+               mhigh = hi;
+       }
+
+       R.m = (u32)mhigh;
+       R.sh = post_shift;
+       R.exp = l;
+       R.is_wide_m = mhigh > U32_MAX;
+
+       return R;
+}
+EXPORT_SYMBOL(reciprocal_value_adv);
author	Jiong Wang <jiong.wang@netronome.com>
	Fri, 6 Jul 2018 22:13:18 +0000 (15:13 -0700)
committer	Daniel Borkmann <daniel@iogearbox.net>
	Fri, 6 Jul 2018 23:45:31 +0000 (01:45 +0200)
include/linux/reciprocal_div.h		patch \| blob \| history
lib/reciprocal_div.c		patch \| blob \| history