* [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459]
@ 2020-11-28 7:54 Jakub Jelinek
2020-11-28 14:55 ` Thomas Koenig
2020-11-30 9:24 ` Richard Biener
0 siblings, 2 replies; 4+ messages in thread
From: Jakub Jelinek @ 2020-11-28 7:54 UTC (permalink / raw)
To: Richard Biener, Jeff Law; +Cc: gcc-patches, Thomas Koenig
Hi!
As discussed in the PR, e.g. on x86_64 (both -m32 and -m64) there is no
double-word modulo and so we expand it to a __{,u}mod[dt]i3 call.
For certain constant divisors we can do better. E.g. consider
32-bit word-size, 0x100000000ULL % 3 == 1, so we can use partly the Hacker's
delight modulo by summing digits approach and optimize
unsigned long long foo (unsigned long long x) { return x % 3; }
as
unsigned long long foo (unsigned long long x) {
unsigned int sum, carry;
carry = __builtin_add_overflow ((unsigned int) x, (unsigned int) (x >> 32), &sum);
sum += carry;
return sum % 3;
}
Similarly, 0x10000000ULL % 5 == 1 (note, 1 << 28), so
unsigned long long bar (unsigned long long x) { return x % 5; }
as
unsigned long long bar (unsigned long long x) {
unsigned int sum = x & ((1 << 28) - 1);
sum += (x >> 28) & ((1 << 28) - 1);
sum += (x >> 56);
return sum % 5;
}
etc.
And we can do also signed modulo,
long long baz (long long x) { return x % 5; }
as
long long baz (long long x) {
unsigned int sum = x & ((1 << 28) - 1);
sum += ((unsigned long long) x >> 28) & ((1 << 28) - 1);
sum += ((unsigned long long) x >> 56);
/* Sum adjustment for negative x. */
sum += (x >> 63) & 3;
unsigned int rem = sum % 5;
/* And finally adjust it to the right interval for negative values. */
return (int) (rem + ((x >> 63) & -4));
}
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
2020-11-28 Jakub Jelinek <jakub@redhat.com>
PR rtl-optimization/97459
* internal-fn.h (expand_addsub_overflow): Declare.
* internal-fn.c (expand_addsub_overflow): No longer static.
* optabs.c (expand_doubleword_mod): New function.
(expand_binop): Optimize double-word mod with constant divisor.
* gcc.dg/pr97459-1.c: New test.
* gcc.dg/pr97459-2.c: New test.
--- gcc/internal-fn.h.jj 2020-11-27 11:19:37.950190425 +0100
+++ gcc/internal-fn.h 2020-11-27 13:18:13.116798464 +0100
@@ -224,6 +224,8 @@ extern bool internal_gather_scatter_fn_s
extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
poly_uint64, unsigned int);
+extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
+ bool, bool, bool, bool, tree *);
extern void expand_internal_call (gcall *);
extern void expand_internal_call (internal_fn, gcall *);
extern void expand_PHI (internal_fn, gcall *);
--- gcc/internal-fn.c.jj 2020-11-27 11:19:37.950190425 +0100
+++ gcc/internal-fn.c 2020-11-27 13:18:13.117798453 +0100
@@ -798,7 +798,7 @@ expand_ubsan_result_store (rtx target, r
/* Add sub/add overflow checking to the statement STMT.
CODE says whether the operation is +, or -. */
-static void
+void
expand_addsub_overflow (location_t loc, tree_code code, tree lhs,
tree arg0, tree arg1, bool unsr_p, bool uns0_p,
bool uns1_p, bool is_ubsan, tree *datap)
--- gcc/optabs.c.jj 2020-11-27 11:19:38.000189859 +0100
+++ gcc/optabs.c 2020-11-27 16:06:30.971435747 +0100
@@ -44,6 +44,8 @@ along with GCC; see the file COPYING3.
#include "expr.h"
#include "optabs-tree.h"
#include "libfuncs.h"
+#include "internal-fn.h"
+#include "langhooks.h"
static void prepare_float_lib_cmp (rtx, rtx, enum rtx_code, rtx *,
machine_mode *);
@@ -926,6 +928,196 @@ expand_doubleword_mult (machine_mode mod
emit_move_insn (product_high, adjust);
return product;
}
+
+/* Subroutine of expand_binop. Optimize unsigned double-word OP0 % OP1 for
+ constant OP1. If for some bit in [BITS_PER_WORD / 2, BITS_PER_WORD] range
+ (prefer higher bits) ((1w << bit) % OP1) == 1, then the modulo can be
+ computed in word-mode as ((OP0 & (bit - 1)) + ((OP0 >> bit) & (bit - 1))
+ + (OP0 >> (2 * bit))) % OP1. Whether we need to sum 2, 3 or 4 values
+ depends on the bit value, if 2, then carry from the addition needs to be
+ added too, i.e. like:
+ sum += __builtin_add_overflow (low, high, &sum)
+
+ Optimize signed double-word OP0 % OP1 similarly, just apply some correction
+ factor to the sum before doing unsigned remainder, in the form of
+ sum += (((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & const);
+ then perform unsigned
+ remainder = sum % OP1;
+ and finally
+ remainder += ((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1); */
+
+static rtx
+expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
+{
+ if (INTVAL (op1) <= 1)
+ return NULL_RTX;
+
+ rtx_insn *last = get_last_insn ();
+ for (int bit = BITS_PER_WORD; bit >= BITS_PER_WORD / 2; bit--)
+ {
+ wide_int w = wi::shifted_mask (bit, 1, false, 2 * BITS_PER_WORD);
+ if (wi::ne_p (wi::umod_trunc (w, INTVAL (op1)), 1))
+ continue;
+ rtx sum = NULL_RTX, mask = NULL_RTX;
+ if (bit == BITS_PER_WORD)
+ {
+ /* For signed modulo we need to add correction to the sum
+ and that might again overflow. */
+ if (!unsignedp)
+ continue;
+ if (optab_handler (uaddv4_optab, word_mode) == CODE_FOR_nothing)
+ continue;
+ tree wtype = lang_hooks.types.type_for_mode (word_mode, 1);
+ if (wtype == NULL_TREE)
+ continue;
+ tree ctype = build_complex_type (wtype);
+ if (TYPE_MODE (ctype) != GET_MODE_COMPLEX_MODE (word_mode))
+ continue;
+ machine_mode cmode = TYPE_MODE (ctype);
+ rtx op00 = operand_subword_force (op0, 0, mode);
+ rtx op01 = operand_subword_force (op0, 1, mode);
+ rtx cres = gen_rtx_CONCAT (cmode, gen_reg_rtx (word_mode),
+ gen_reg_rtx (word_mode));
+ tree lhs = make_tree (ctype, cres);
+ tree arg0 = make_tree (wtype, op00);
+ tree arg1 = make_tree (wtype, op01);
+ expand_addsub_overflow (UNKNOWN_LOCATION, PLUS_EXPR, lhs, arg0,
+ arg1, true, true, true, false, NULL);
+ sum = expand_simple_binop (word_mode, PLUS, XEXP (cres, 0),
+ XEXP (cres, 1), NULL_RTX, 1,
+ OPTAB_DIRECT);
+ if (sum == NULL_RTX)
+ return NULL_RTX;
+ }
+ else
+ {
+ /* Code below uses GEN_INT, so we need the masks to be representable
+ in HOST_WIDE_INTs. */
+ if (bit >= HOST_BITS_PER_WIDE_INT)
+ continue;
+ /* If op0 is e.g. -1 or -2 unsigned, then the 2 additions might
+ overflow. Consider 64-bit -1ULL for word size 32, if we add
+ 0x7fffffffU + 0x7fffffffU + 3U, it wraps around to 1. */
+ if (bit == BITS_PER_WORD - 1)
+ continue;
+
+ int count = (2 * BITS_PER_WORD + bit - 1) / bit;
+ rtx sum_corr = NULL_RTX;
+
+ if (!unsignedp)
+ {
+ /* For signed modulo, compute it as unsigned modulo of
+ sum with a correction added to it if OP0 is negative,
+ such that the result can be computed as unsigned
+ remainder + ((OP1 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1). */
+ w = wi::min_value (2 * BITS_PER_WORD, SIGNED);
+ wide_int wmod1 = wi::umod_trunc (w, INTVAL (op1));
+ wide_int wmod2 = wi::smod_trunc (w, INTVAL (op1));
+ /* wmod2 == -wmod1. */
+ wmod2 = wmod2 + (INTVAL (op1) - 1);
+ if (wi::ne_p (wmod1, wmod2))
+ {
+ wide_int wcorr = wmod2 - wmod1;
+ if (wi::neg_p (w))
+ wcorr = wcorr + INTVAL (op1);
+ /* Now verify if the count sums can't overflow, and punt
+ if they could. */
+ w = wi::mask (bit, false, 2 * BITS_PER_WORD);
+ w = w * (count - 1);
+ w = w + wi::mask (2 * BITS_PER_WORD - (count - 1) * bit,
+ false, 2 * BITS_PER_WORD);
+ w = w + wcorr;
+ w = wi::lrshift (w, BITS_PER_WORD);
+ if (wi::ne_p (w, 0))
+ continue;
+
+ mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
+ mode);
+ mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
+ GEN_INT (BITS_PER_WORD - 1),
+ NULL_RTX, 0, OPTAB_DIRECT);
+ if (mask == NULL_RTX)
+ return NULL_RTX;
+ sum_corr = immed_wide_int_const (wcorr, word_mode);
+ sum_corr = expand_simple_binop (word_mode, AND, mask,
+ sum_corr, NULL_RTX, 1,
+ OPTAB_DIRECT);
+ if (sum_corr == NULL_RTX)
+ return NULL_RTX;
+ }
+ }
+
+ for (int i = 0; i < count; i++)
+ {
+ rtx v = op0;
+ if (i)
+ v = expand_simple_binop (mode, LSHIFTRT, v, GEN_INT (i * bit),
+ NULL_RTX, 1, OPTAB_DIRECT);
+ if (v == NULL_RTX)
+ return NULL_RTX;
+ v = lowpart_subreg (word_mode, v, mode);
+ if (v == NULL_RTX)
+ return NULL_RTX;
+ if (i != count - 1)
+ v = expand_simple_binop (word_mode, AND, v,
+ GEN_INT ((HOST_WIDE_INT_1U << bit)
+ - 1), NULL_RTX, 1,
+ OPTAB_DIRECT);
+ if (v == NULL_RTX)
+ return NULL_RTX;
+ if (sum == NULL_RTX)
+ sum = v;
+ else
+ sum = expand_simple_binop (word_mode, PLUS, sum, v, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ if (sum == NULL_RTX)
+ return NULL_RTX;
+ }
+ if (sum_corr)
+ {
+ sum = expand_simple_binop (word_mode, PLUS, sum, sum_corr,
+ NULL_RTX, 1, OPTAB_DIRECT);
+ if (sum == NULL_RTX)
+ return NULL_RTX;
+ }
+ }
+ rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum, op1,
+ NULL_RTX, 1);
+ if (remainder == NULL_RTX)
+ return NULL_RTX;
+
+ if (!unsignedp)
+ {
+ if (mask == NULL_RTX)
+ {
+ mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
+ mode);
+ mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
+ GEN_INT (BITS_PER_WORD - 1),
+ NULL_RTX, 0, OPTAB_DIRECT);
+ if (mask == NULL_RTX)
+ return NULL_RTX;
+ }
+ mask = expand_simple_binop (word_mode, AND, mask,
+ GEN_INT (1 - INTVAL (op1)),
+ NULL_RTX, 1, OPTAB_DIRECT);
+ if (mask == NULL_RTX)
+ return NULL_RTX;
+ remainder = expand_simple_binop (word_mode, PLUS, remainder,
+ mask, NULL_RTX, 1, OPTAB_DIRECT);
+ if (remainder == NULL_RTX)
+ return NULL_RTX;
+ }
+
+ remainder = convert_modes (mode, word_mode, remainder, unsignedp);
+ /* Punt if we need any library calls. */
+ for (; last; last = NEXT_INSN (last))
+ if (CALL_P (last))
+ return NULL_RTX;
+ return remainder;
+ }
+ return NULL_RTX;
+}
\f
/* Wrapper around expand_binop which takes an rtx code to specify
the operation to perform, not an optab pointer. All other
@@ -1806,6 +1998,37 @@ expand_binop (machine_mode mode, optab b
}
}
+ /* Attempt to synthetize double word modulo by constant divisor. */
+ if ((binoptab == umod_optab || binoptab == smod_optab)
+ && optimize
+ && CONST_INT_P (op1)
+ && is_int_mode (mode, &int_mode)
+ && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
+ && optab_handler (lshr_optab, int_mode) != CODE_FOR_nothing
+ && optab_handler (and_optab, word_mode) != CODE_FOR_nothing
+ && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
+ && optimize_insn_for_speed_p ())
+ {
+ rtx remainder = expand_doubleword_mod (int_mode, op0, op1,
+ binoptab == umod_optab);
+ if (remainder != NULL_RTX)
+ {
+ if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
+ {
+ rtx_insn *move = emit_move_insn (target ? target : remainder,
+ remainder);
+ set_dst_reg_note (move,
+ REG_EQUAL,
+ gen_rtx_fmt_ee (UMOD, int_mode,
+ copy_rtx (op0), op1),
+ target ? target : remainder);
+ }
+ return remainder;
+ }
+ else
+ delete_insns_since (last);
+ }
+
/* It can't be open-coded in this mode.
Use a library call if one is available and caller says that's ok. */
--- gcc/testsuite/gcc.dg/pr97459-1.c.jj 2020-11-27 14:16:50.735828637 +0100
+++ gcc/testsuite/gcc.dg/pr97459-1.c 2020-11-27 14:16:12.212259188 +0100
@@ -0,0 +1,54 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t T;
+#else
+typedef unsigned long long T;
+#endif
+
+T __attribute__((noipa)) foo (T x, T n) { return x % n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); }
+
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+ C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+ C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+ C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1)
+
+TESTS
+
+struct S { T x; T (*foo) (T); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+
+struct S tests[] = {
+TESTS
+ { 0, 0 }
+};
+
+int
+main ()
+{
+ int i, j, k;
+ for (k = 0; tests[k].x; k++)
+ for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+ for (j = -5; j <= 5; j++)
+ {
+ T x = ((T) 1 << i) + j;
+ if (foo (x, tests[k].x) != tests[k].foo (x))
+ __builtin_abort ();
+ }
+ return 0;
+}
--- gcc/testsuite/gcc.dg/pr97459-2.c.jj 2020-11-27 15:53:36.831080388 +0100
+++ gcc/testsuite/gcc.dg/pr97459-2.c 2020-11-27 15:50:20.763269826 +0100
@@ -0,0 +1,57 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __int128_t T;
+typedef __uint128_t U;
+#else
+typedef long long T;
+typedef unsigned long long U;
+#endif
+
+T __attribute__((noipa)) foo (T x, T n) { return x % n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); }
+
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+ C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+ C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+ C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1)
+
+TESTS
+
+struct S { T x; T (*foo) (T); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+
+struct S tests[] = {
+TESTS
+ { 0, 0 }
+};
+
+int
+main ()
+{
+ int i, j, k;
+ for (k = 0; tests[k].x; k++)
+ for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+ for (j = -5; j <= 5; j++)
+ {
+ U x = ((U) 1 << i) + j;
+ if (foo ((T) x, tests[k].x) != tests[k].foo ((T) x)
+ || foo ((T) -x, tests[k].x) != tests[k].foo ((T) -x))
+ __builtin_abort ();
+ }
+ return 0;
+}
Jakub
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459]
2020-11-28 7:54 [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459] Jakub Jelinek
@ 2020-11-28 14:55 ` Thomas Koenig
2020-11-30 9:21 ` Richard Biener
2020-11-30 9:24 ` Richard Biener
1 sibling, 1 reply; 4+ messages in thread
From: Thomas Koenig @ 2020-11-28 14:55 UTC (permalink / raw)
To: Jakub Jelinek, Richard Biener, Jeff Law; +Cc: gcc-patches
[-- Attachment #1: Type: text/plain, Size: 1466 bytes --]
Hello Jakub,
thanks a lot for taking this on!
As far as I can tell, the patch works very well, and really speeds up
things.
As (somewhat confusingly) discussed in the PR, there are a couple of
things that could still be done incrementally with this method.
First, it can be combined with, or even used for, the calculation
of the quotient. Let a be a number for which your patch works,
for example 5.
If you want to calculate n / 5, you can do
rem = n % 5; /* Fast, with your patch. */
quot = (n - rem) * magic;
in a fast way, where magic is the multiplicative inverse of 5
modulo 2^128 (for a 128-bit number) or 2^64 (for a 64-bit number),
which can be calculated using gmp_invert. The multiplicative inverse
for division works because n - rem is divisible by 5.
Second, you can also use this for the quotient and/or remainder
by 2*a (for example 10), with a slight adjustment:
rem5 = n % 5;
quot5 = (n - rem5) * magic;
rem10 = (quot5 % 2) * 5 + rem5;
quot10 = quot5 / 2;
This would cover the important use case of getting the quotient and
remainder for division by 10.
However, a benchmark (source attached) indicates that this method
is much faster even when only one of quotient and remainder
of division by 10 is needed. Numbers I got indicate that this
method is faster by about a factor of five than calling the
library version.
I hope this clears up the somewhat confusing string of comments
in the PR.
Best regards
Thomas
[-- Attachment #2: bench.c --]
[-- Type: text/x-csrc, Size: 1145 bytes --]
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>
#include <stdlib.h>
typedef __uint128_t mytype;
/* Return the current wall-clock time in seconds, with microsecond
   resolution, as a double.  Used to time the benchmark loops.  */
double this_time ()
{
  struct timeval now;

  gettimeofday (&now, NULL);
  return (double) now.tv_sec + 1e-6 * (double) now.tv_usec;
}
#define ONE ((__uint128_t) 1)
#define TWO_64 (ONE << 64)
/* Baseline: remainder of N divided by 10 using the compiler's own
   double-word modulo (a libgcc call without the patch under test).  */
unsigned rem10_v1 (mytype n)
{
  return (unsigned) (n % 10);
}
/* Remainder of N divided by 10, computed via the fast remainder by 5:
   n - n % 5 is divisible by 5, so multiplying it by the multiplicative
   inverse of 5 modulo 2^128 yields the exact quotient n / 5.  Then
   n % 10 == n % 5 + (n / 5 % 2) * 5.  */
unsigned rem10_v2 (mytype n)
{
  /* Multiplicative inverse of 5 modulo 2^128, i.e. (4 * 2^128 + 1) / 5.  */
  const mytype magic = (0xCCCCCCCCCCCCCCCC * TWO_64 + 0xCCCCCCCCCCCCCCCD * ONE);
  unsigned rem5 = (unsigned) (n % 5);
  mytype quot5 = (n - rem5) * magic;

  return rem5 + (unsigned) (quot5 % 2) * 5;
}
#define N 10000000
/* Fill an array with random 128-bit values and time rem10_v1 vs
   rem10_v2 over it, printing the checksum S so the computation cannot
   be optimized away.  Returns nonzero if setup fails.  */
int main()
{
  mytype *a;
  unsigned long s;
  int fd;
  double t1, t2;

  a = malloc (sizeof (*a) * N);
  if (a == NULL)
    {
      perror ("malloc");
      return EXIT_FAILURE;
    }

  fd = open ("/dev/urandom", O_RDONLY);
  if (fd < 0)
    {
      perror ("open /dev/urandom");
      free (a);
      return EXIT_FAILURE;
    }
  /* A short read would leave part of the array uninitialized.  */
  if (read (fd, a, sizeof (*a) * N) != (ssize_t) (sizeof (*a) * N))
    {
      perror ("read /dev/urandom");
      close (fd);
      free (a);
      return EXIT_FAILURE;
    }
  close (fd);

  s = 0;
  t1 = this_time();
  for (int i=0; i<N; i++)
    s += rem10_v1 (a[i]);
  t2 = this_time();
  printf ("s = %lu rem10_v1: %f s\n", s, (t2-t1));

  s = 0;
  t1 = this_time();
  for (int i=0; i<N; i++)
    s += rem10_v2 (a[i]);
  t2 = this_time();
  /* Was mislabeled "rem10_v1" in the original; this loop times rem10_v2.  */
  printf ("s = %lu rem10_v2: %f s\n", s, (t2-t1));

  free (a);
  return 0;
}
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459]
2020-11-28 14:55 ` Thomas Koenig
@ 2020-11-30 9:21 ` Richard Biener
0 siblings, 0 replies; 4+ messages in thread
From: Richard Biener @ 2020-11-30 9:21 UTC (permalink / raw)
To: Thomas Koenig; +Cc: Jakub Jelinek, Jeff Law, gcc-patches
On Sat, 28 Nov 2020, Thomas Koenig wrote:
> Hello Jakub,
>
> thanks a lot for taking this on!
>
> As far as I can tell, the patch works very well, and really speeds up
> things.
>
> As (somewhat confusingly) discussed in the PR, there are a couple of
> things that could still be done incrementally with this method.
>
> First, it can be combined with, or even used for, the calculation
> of the quotient. Let a be a number for which your patch works,
> for example 5.
>
> If you want to calculate n / 5, you can do
>
> rem = n % 5; /* Fast, with your patch. */
> quot = (n - rem) * magic;
>
> in a fast way, where magic is the multiplicative inverse of 5
> modulo 2^128 (for a 128-bit number) or 2^64 (for a 64-bit number),
> which can be calculated using gmp_invert. The multiplicative inverse
> for division works because n - rem is divisible by 5.
>
> Second, you can also use this for the quotient and/or remainder
> by 2*a (for example 10), with a slight adjustment:
>
> rem5 = n % 5;
> quot5 = (n - rem5) * magic;
> rem10 = (quot5 % 2) * 5 + rem5;
> quot10 = quot5 / 2;
>
> This would cover the important use case of getting the quotient and
> remainder for division by 10.
>
> However, a benchmark (source attached) indicates that this method
> is much faster even when only one of quotient and remainder
> of division by 10 is needed. Numbers I got indicate that this
> method is faster by about a factor of five than calling the
> library version.
Hmm, the benchmark measures throughput of integer division/modulo
which is _much_ worse than just latency since division/modulo is
usually not pipelined so there can be only one division/modulo op
in-flight.
So not sure how relevant the benchmark is - a benchmark measuring
only the latency difference would be more useful, but that's of
course harder to get at. Maybe add a data dependence on each of
the loop iteration computations.
Richard.
> I hope this clears up the somewhat confusing string of comments
> in the PR.
>
> Best regards
>
> Thomas
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459]
2020-11-28 7:54 [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459] Jakub Jelinek
2020-11-28 14:55 ` Thomas Koenig
@ 2020-11-30 9:24 ` Richard Biener
1 sibling, 0 replies; 4+ messages in thread
From: Richard Biener @ 2020-11-30 9:24 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Jeff Law, gcc-patches, Thomas Koenig
On Sat, 28 Nov 2020, Jakub Jelinek wrote:
> Hi!
>
> As discussed in the PR, e.g. on x86_64 (both -m32 and -m64) there is no
> double-word modulo and so we expand it to a __{,u}mod[dt]i3 call.
> For certain constant divisors we can do better. E.g. consider
> 32-bit word-size, 0x100000000ULL % 3 == 1, so we can use partly the Hacker's
> delight modulo by summing digits approach and optimize
> unsigned long long foo (unsigned long long x) { return x % 3; }
> as
> unsigned long long foo (unsigned long long x) {
> unsigned int sum, carry;
> carry = __builtin_add_overflow ((unsigned int) x, (unsigned int) (x >> 32), &sum);
> sum += carry;
> return sum % 3;
> }
> Similarly, 0x10000000ULL % 5 == 1 (note, 1 << 28), so
> unsigned long long bar (unsigned long long x) { return x % 5; }
> as
> unsigned long long bar (unsigned long long x) {
> unsigned int sum = x & ((1 << 28) - 1);
> sum += (x >> 28) & ((1 << 28) - 1);
> sum += (x >> 56);
> return sum % 5;
> }
> etc.
> And we can do also signed modulo,
> long long baz (long long x) { return x % 5; }
> as
> long long baz (long long x) {
> unsigned int sum = x & ((1 << 28) - 1);
> sum += ((unsigned long long) x >> 28) & ((1 << 28) - 1);
> sum += ((unsigned long long) x >> 56);
> /* Sum adjustment for negative x. */
> sum += (x >> 63) & 3;
> unsigned int rem = sum % 5;
> /* And finally adjust it to the right interval for negative values. */
> return (int) (rem + ((x >> 63) & -4));
> }
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK.
Thanks,
Richard.
> 2020-11-28 Jakub Jelinek <jakub@redhat.com>
>
> PR rtl-optimization/97459
> * internal-fn.h (expand_addsub_overflow): Declare.
> * internal-fn.c (expand_addsub_overflow): No longer static.
> * optabs.c (expand_doubleword_mod): New function.
> (expand_binop): Optimize double-word mod with constant divisor.
>
> * gcc.dg/pr97459-1.c: New test.
> * gcc.dg/pr97459-2.c: New test.
>
> --- gcc/internal-fn.h.jj 2020-11-27 11:19:37.950190425 +0100
> +++ gcc/internal-fn.h 2020-11-27 13:18:13.116798464 +0100
> @@ -224,6 +224,8 @@ extern bool internal_gather_scatter_fn_s
> extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
> poly_uint64, unsigned int);
>
> +extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
> + bool, bool, bool, bool, tree *);
> extern void expand_internal_call (gcall *);
> extern void expand_internal_call (internal_fn, gcall *);
> extern void expand_PHI (internal_fn, gcall *);
> --- gcc/internal-fn.c.jj 2020-11-27 11:19:37.950190425 +0100
> +++ gcc/internal-fn.c 2020-11-27 13:18:13.117798453 +0100
> @@ -798,7 +798,7 @@ expand_ubsan_result_store (rtx target, r
> /* Add sub/add overflow checking to the statement STMT.
> CODE says whether the operation is +, or -. */
>
> -static void
> +void
> expand_addsub_overflow (location_t loc, tree_code code, tree lhs,
> tree arg0, tree arg1, bool unsr_p, bool uns0_p,
> bool uns1_p, bool is_ubsan, tree *datap)
> --- gcc/optabs.c.jj 2020-11-27 11:19:38.000189859 +0100
> +++ gcc/optabs.c 2020-11-27 16:06:30.971435747 +0100
> @@ -44,6 +44,8 @@ along with GCC; see the file COPYING3.
> #include "expr.h"
> #include "optabs-tree.h"
> #include "libfuncs.h"
> +#include "internal-fn.h"
> +#include "langhooks.h"
>
> static void prepare_float_lib_cmp (rtx, rtx, enum rtx_code, rtx *,
> machine_mode *);
> @@ -926,6 +928,196 @@ expand_doubleword_mult (machine_mode mod
> emit_move_insn (product_high, adjust);
> return product;
> }
> +
> +/* Subroutine of expand_binop. Optimize unsigned double-word OP0 % OP1 for
> + constant OP1. If for some bit in [BITS_PER_WORD / 2, BITS_PER_WORD] range
> + (prefer higher bits) ((1w << bit) % OP1) == 1, then the modulo can be
> + computed in word-mode as ((OP0 & (bit - 1)) + ((OP0 >> bit) & (bit - 1))
> + + (OP0 >> (2 * bit))) % OP1. Whether we need to sum 2, 3 or 4 values
> + depends on the bit value, if 2, then carry from the addition needs to be
> + added too, i.e. like:
> + sum += __builtin_add_overflow (low, high, &sum)
> +
> + Optimize signed double-word OP0 % OP1 similarly, just apply some correction
> + factor to the sum before doing unsigned remainder, in the form of
> + sum += (((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & const);
> + then perform unsigned
> + remainder = sum % OP1;
> + and finally
> + remainder += ((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1); */
> +
> +static rtx
> +expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
> +{
> + if (INTVAL (op1) <= 1)
> + return NULL_RTX;
> +
> + rtx_insn *last = get_last_insn ();
> + for (int bit = BITS_PER_WORD; bit >= BITS_PER_WORD / 2; bit--)
> + {
> + wide_int w = wi::shifted_mask (bit, 1, false, 2 * BITS_PER_WORD);
> + if (wi::ne_p (wi::umod_trunc (w, INTVAL (op1)), 1))
> + continue;
> + rtx sum = NULL_RTX, mask = NULL_RTX;
> + if (bit == BITS_PER_WORD)
> + {
> + /* For signed modulo we need to add correction to the sum
> + and that might again overflow. */
> + if (!unsignedp)
> + continue;
> + if (optab_handler (uaddv4_optab, word_mode) == CODE_FOR_nothing)
> + continue;
> + tree wtype = lang_hooks.types.type_for_mode (word_mode, 1);
> + if (wtype == NULL_TREE)
> + continue;
> + tree ctype = build_complex_type (wtype);
> + if (TYPE_MODE (ctype) != GET_MODE_COMPLEX_MODE (word_mode))
> + continue;
> + machine_mode cmode = TYPE_MODE (ctype);
> + rtx op00 = operand_subword_force (op0, 0, mode);
> + rtx op01 = operand_subword_force (op0, 1, mode);
> + rtx cres = gen_rtx_CONCAT (cmode, gen_reg_rtx (word_mode),
> + gen_reg_rtx (word_mode));
> + tree lhs = make_tree (ctype, cres);
> + tree arg0 = make_tree (wtype, op00);
> + tree arg1 = make_tree (wtype, op01);
> + expand_addsub_overflow (UNKNOWN_LOCATION, PLUS_EXPR, lhs, arg0,
> + arg1, true, true, true, false, NULL);
> + sum = expand_simple_binop (word_mode, PLUS, XEXP (cres, 0),
> + XEXP (cres, 1), NULL_RTX, 1,
> + OPTAB_DIRECT);
> + if (sum == NULL_RTX)
> + return NULL_RTX;
> + }
> + else
> + {
> + /* Code below uses GEN_INT, so we need the masks to be representable
> + in HOST_WIDE_INTs. */
> + if (bit >= HOST_BITS_PER_WIDE_INT)
> + continue;
> + /* If op0 is e.g. -1 or -2 unsigned, then the 2 additions might
> + overflow. Consider 64-bit -1ULL for word size 32, if we add
> + 0x7fffffffU + 0x7fffffffU + 3U, it wraps around to 1. */
> + if (bit == BITS_PER_WORD - 1)
> + continue;
> +
> + int count = (2 * BITS_PER_WORD + bit - 1) / bit;
> + rtx sum_corr = NULL_RTX;
> +
> + if (!unsignedp)
> + {
> + /* For signed modulo, compute it as unsigned modulo of
> + sum with a correction added to it if OP0 is negative,
> + such that the result can be computed as unsigned
> + remainder + ((OP1 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1). */
> + w = wi::min_value (2 * BITS_PER_WORD, SIGNED);
> + wide_int wmod1 = wi::umod_trunc (w, INTVAL (op1));
> + wide_int wmod2 = wi::smod_trunc (w, INTVAL (op1));
> + /* wmod2 == -wmod1. */
> + wmod2 = wmod2 + (INTVAL (op1) - 1);
> + if (wi::ne_p (wmod1, wmod2))
> + {
> + wide_int wcorr = wmod2 - wmod1;
> + if (wi::neg_p (w))
> + wcorr = wcorr + INTVAL (op1);
> + /* Now verify if the count sums can't overflow, and punt
> + if they could. */
> + w = wi::mask (bit, false, 2 * BITS_PER_WORD);
> + w = w * (count - 1);
> + w = w + wi::mask (2 * BITS_PER_WORD - (count - 1) * bit,
> + false, 2 * BITS_PER_WORD);
> + w = w + wcorr;
> + w = wi::lrshift (w, BITS_PER_WORD);
> + if (wi::ne_p (w, 0))
> + continue;
> +
> + mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
> + mode);
> + mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
> + GEN_INT (BITS_PER_WORD - 1),
> + NULL_RTX, 0, OPTAB_DIRECT);
> + if (mask == NULL_RTX)
> + return NULL_RTX;
> + sum_corr = immed_wide_int_const (wcorr, word_mode);
> + sum_corr = expand_simple_binop (word_mode, AND, mask,
> + sum_corr, NULL_RTX, 1,
> + OPTAB_DIRECT);
> + if (sum_corr == NULL_RTX)
> + return NULL_RTX;
> + }
> + }
> +
> + for (int i = 0; i < count; i++)
> + {
> + rtx v = op0;
> + if (i)
> + v = expand_simple_binop (mode, LSHIFTRT, v, GEN_INT (i * bit),
> + NULL_RTX, 1, OPTAB_DIRECT);
> + if (v == NULL_RTX)
> + return NULL_RTX;
> + v = lowpart_subreg (word_mode, v, mode);
> + if (v == NULL_RTX)
> + return NULL_RTX;
> + if (i != count - 1)
> + v = expand_simple_binop (word_mode, AND, v,
> + GEN_INT ((HOST_WIDE_INT_1U << bit)
> + - 1), NULL_RTX, 1,
> + OPTAB_DIRECT);
> + if (v == NULL_RTX)
> + return NULL_RTX;
> + if (sum == NULL_RTX)
> + sum = v;
> + else
> + sum = expand_simple_binop (word_mode, PLUS, sum, v, NULL_RTX,
> + 1, OPTAB_DIRECT);
> + if (sum == NULL_RTX)
> + return NULL_RTX;
> + }
> + if (sum_corr)
> + {
> + sum = expand_simple_binop (word_mode, PLUS, sum, sum_corr,
> + NULL_RTX, 1, OPTAB_DIRECT);
> + if (sum == NULL_RTX)
> + return NULL_RTX;
> + }
> + }
> + rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum, op1,
> + NULL_RTX, 1);
> + if (remainder == NULL_RTX)
> + return NULL_RTX;
> +
> + if (!unsignedp)
> + {
> + if (mask == NULL_RTX)
> + {
> + mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
> + mode);
> + mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
> + GEN_INT (BITS_PER_WORD - 1),
> + NULL_RTX, 0, OPTAB_DIRECT);
> + if (mask == NULL_RTX)
> + return NULL_RTX;
> + }
> + mask = expand_simple_binop (word_mode, AND, mask,
> + GEN_INT (1 - INTVAL (op1)),
> + NULL_RTX, 1, OPTAB_DIRECT);
> + if (mask == NULL_RTX)
> + return NULL_RTX;
> + remainder = expand_simple_binop (word_mode, PLUS, remainder,
> + mask, NULL_RTX, 1, OPTAB_DIRECT);
> + if (remainder == NULL_RTX)
> + return NULL_RTX;
> + }
> +
> + remainder = convert_modes (mode, word_mode, remainder, unsignedp);
> + /* Punt if we need any library calls. */
> + for (; last; last = NEXT_INSN (last))
> + if (CALL_P (last))
> + return NULL_RTX;
> + return remainder;
> + }
> + return NULL_RTX;
> +}
> \f
> /* Wrapper around expand_binop which takes an rtx code to specify
> the operation to perform, not an optab pointer. All other
> @@ -1806,6 +1998,37 @@ expand_binop (machine_mode mode, optab b
> }
> }
>
> + /* Attempt to synthetize double word modulo by constant divisor. */
> + if ((binoptab == umod_optab || binoptab == smod_optab)
> + && optimize
> + && CONST_INT_P (op1)
> + && is_int_mode (mode, &int_mode)
> + && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> + && optab_handler (lshr_optab, int_mode) != CODE_FOR_nothing
> + && optab_handler (and_optab, word_mode) != CODE_FOR_nothing
> + && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
> + && optimize_insn_for_speed_p ())
> + {
> + rtx remainder = expand_doubleword_mod (int_mode, op0, op1,
> + binoptab == umod_optab);
> + if (remainder != NULL_RTX)
> + {
> + if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
> + {
> + rtx_insn *move = emit_move_insn (target ? target : remainder,
> + remainder);
> + set_dst_reg_note (move,
> + REG_EQUAL,
> + gen_rtx_fmt_ee (UMOD, int_mode,
> + copy_rtx (op0), op1),
> + target ? target : remainder);
> + }
> + return remainder;
> + }
> + else
> + delete_insns_since (last);
> + }
> +
> /* It can't be open-coded in this mode.
> Use a library call if one is available and caller says that's ok. */
>
> --- gcc/testsuite/gcc.dg/pr97459-1.c.jj 2020-11-27 14:16:50.735828637 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-1.c 2020-11-27 14:16:12.212259188 +0100
> @@ -0,0 +1,54 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __uint128_t T;
> +#else
> +typedef unsigned long long T;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n) { return x % n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> + C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> + C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> + C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> + { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> + int i, j, k;
> + for (k = 0; tests[k].x; k++)
> + for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> + for (j = -5; j <= 5; j++)
> + {
> + T x = ((T) 1 << i) + j;
> + if (foo (x, tests[k].x) != tests[k].foo (x))
> + __builtin_abort ();
> + }
> + return 0;
> +}
> --- gcc/testsuite/gcc.dg/pr97459-2.c.jj 2020-11-27 15:53:36.831080388 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-2.c 2020-11-27 15:50:20.763269826 +0100
> @@ -0,0 +1,57 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __int128_t T;
> +typedef __uint128_t U;
> +#else
> +typedef long long T;
> +typedef unsigned long long U;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n) { return x % n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> + C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> + C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> + C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> + { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> + int i, j, k;
> + for (k = 0; tests[k].x; k++)
> + for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> + for (j = -5; j <= 5; j++)
> + {
> + U x = ((U) 1 << i) + j;
> + if (foo ((T) x, tests[k].x) != tests[k].foo ((T) x)
> + || foo ((T) -x, tests[k].x) != tests[k].foo ((T) -x))
> + __builtin_abort ();
> + }
> + return 0;
> +}
>
> Jakub
>
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2020-11-30 9:24 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-28 7:54 [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459] Jakub Jelinek
2020-11-28 14:55 ` Thomas Koenig
2020-11-30 9:21 ` Richard Biener
2020-11-30 9:24 ` Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).