public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] expansion: Further improve double-word modulo, division and divmod [PR97459]
@ 2020-12-01 21:00 Jakub Jelinek
  2020-12-01 23:40 ` Jeff Law
  2020-12-02  7:52 ` Richard Biener
  0 siblings, 2 replies; 3+ messages in thread
From: Jakub Jelinek @ 2020-12-01 21:00 UTC (permalink / raw)
  To: Richard Biener, Jeff Law; +Cc: gcc-patches, Thomas Koenig

Hi!

The following patch implements what Thomas wrote about, in particular
that we can handle also double-word divison by the constants for which
the earlier patch optimized modulo (if it would be otherwise a library
call) and that we can also easily handle such constants shifted to the left.
Unfortunately, seems CSE isn't able to optimize away the two almost
identical sequences (one to compute remainder, one to compute quotient),
probably because of the ADD_OVERFLOW introduced jumps, so the patch also
adjusts expand_DIVMOD.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2020-12-01  Jakub Jelinek  <jakub@redhat.com>

	PR rtl-optimization/97459
	* optabs.h (expand_doubleword_divmod): Declare.
	* optabs.c (expand_doubleword_divmod): New function.
	(expand_binop): Use it.
	* internal-fn.c (expand_DIVMOD): Likewise.

	* gcc.target/i386/pr97282.c (foo): Use 123456 divisor instead of
	10.
	* gcc.dg/pr97459-1.c (TESTS): Add tests for 10, 12 and
	6144.
	* gcc.dg/pr97459-2.c (TESTS): Likewise.
	* gcc.dg/pr97459-3.c: New test.
	* gcc.dg/pr97459-4.c: New test.
	* gcc.dg/pr97459-5.c: New test.
	* gcc.dg/pr97459-6.c: New test.

--- gcc/optabs.h.jj	2020-12-01 13:19:12.831127718 +0100
+++ gcc/optabs.h	2020-12-01 17:40:08.783137431 +0100
@@ -183,6 +183,8 @@ extern bool force_expand_binop (machine_
 				enum optab_methods);
 extern rtx expand_vector_broadcast (machine_mode, rtx);
 
+extern rtx expand_doubleword_divmod (machine_mode, rtx, rtx, rtx *, bool);
+
 /* Generate code for a simple binary or unary operation.  "Simple" in
    this case means "can be unambiguously described by a (mode, code)
    pair and mapped to a single optab."  */
--- gcc/optabs.c.jj	2020-12-01 16:25:01.567733333 +0100
+++ gcc/optabs.c	2020-12-01 18:11:12.878230572 +0100
@@ -1118,6 +1118,99 @@ expand_doubleword_mod (machine_mode mode
     }
   return NULL_RTX;
 }
+
+/* Similarly to the above function, but compute both quotient and remainder.
+   Quotient can be computed from the remainder as:
+   rem = op0 % op1;  // Handled using expand_doubleword_mod
+   quot = (op0 - rem) * inv; // inv is multiplicative inverse of op1 modulo
+			     // 2 * BITS_PER_WORD
+
+   We can also handle cases where op1 is a multiple of power of two constant
+   and constant handled by expand_doubleword_mod.
+   op11 = 1 << __builtin_ctz (op1);
+   op12 = op1 / op11;
+   rem1 = op0 % op12;  // Handled using expand_doubleword_mod
+   quot1 = (op0 - rem1) * inv; // inv is multiplicative inverse of op12 modulo
+			       // 2 * BITS_PER_WORD
+   rem = (quot1 % op11) * op12 + rem1;
+   quot = quot1 / op11;  */
+
+rtx
+expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
+			  bool unsignedp)
+{
+  *rem = NULL_RTX;
+
+  /* Negative dividend should have been optimized into positive,
+     similarly modulo by 1 and modulo by power of two is optimized
+     differently too.  */
+  if (INTVAL (op1) <= 1 || pow2p_hwi (INTVAL (op1)))
+    return NULL_RTX;
+
+  rtx op11 = const1_rtx;
+  rtx op12 = op1;
+  if ((INTVAL (op1) & 1) == 0)
+    {
+      int bit = ctz_hwi (INTVAL (op1));
+      op11 = GEN_INT (HOST_WIDE_INT_1 << bit);
+      op12 = GEN_INT (INTVAL (op1) >> bit);
+    }
+
+  rtx rem1 = expand_doubleword_mod (mode, op0, op12, unsignedp);
+  if (rem1 == NULL_RTX)
+    return NULL_RTX;
+
+  int prec = 2 * BITS_PER_WORD;
+  wide_int a = wide_int::from (INTVAL (op12), prec + 1, UNSIGNED);
+  wide_int b = wi::shifted_mask (prec, 1, false, prec + 1);
+  wide_int m = wide_int::from (wi::mod_inv (a, b), prec, UNSIGNED);
+  rtx inv = immed_wide_int_const (m, mode);
+
+  rtx_insn *last = get_last_insn ();
+  rtx quot1 = expand_simple_binop (mode, MINUS, op0, rem1,
+				   NULL_RTX, unsignedp, OPTAB_DIRECT);
+  if (quot1 == NULL_RTX)
+    return NULL_RTX;
+
+  quot1 = expand_simple_binop (mode, MULT, quot1, inv,
+			       NULL_RTX, unsignedp, OPTAB_DIRECT);
+  if (quot1 == NULL_RTX)
+    return NULL_RTX;
+
+  if (op11 != const1_rtx)
+    {
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
+				NULL_RTX, unsignedp);
+      if (rem2 == NULL_RTX)
+	return NULL_RTX;
+
+      rem2 = expand_simple_binop (mode, MULT, rem2, op12, NULL_RTX,
+				  unsignedp, OPTAB_DIRECT);
+      if (rem2 == NULL_RTX)
+	return NULL_RTX;
+
+      rem2 = expand_simple_binop (mode, PLUS, rem2, rem1, NULL_RTX,
+				  unsignedp, OPTAB_DIRECT);
+      if (rem2 == NULL_RTX)
+	return NULL_RTX;
+
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
+				 NULL_RTX, unsignedp);
+      if (quot2 == NULL_RTX)
+	return NULL_RTX;
+
+      rem1 = rem2;
+      quot1 = quot2;
+    }
+
+  /* Punt if we need any library calls.  */
+  for (; last; last = NEXT_INSN (last))
+    if (CALL_P (last))
+      return NULL_RTX;
+
+  *rem = rem1;
+  return quot1;
+}
 \f
 /* Wrapper around expand_binop which takes an rtx code to specify
    the operation to perform, not an optab pointer.  All other
@@ -1999,7 +2092,10 @@ expand_binop (machine_mode mode, optab b
     }
 
   /* Attempt to synthetize double word modulo by constant divisor.  */
-  if ((binoptab == umod_optab || binoptab == smod_optab)
+  if ((binoptab == umod_optab
+       || binoptab == smod_optab
+       || binoptab == udiv_optab
+       || binoptab == sdiv_optab)
       && optimize
       && CONST_INT_P (op1)
       && is_int_mode (mode, &int_mode)
@@ -2008,21 +2104,33 @@ expand_binop (machine_mode mode, optab b
       && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
       && optimize_insn_for_speed_p ())
     {
-      rtx remainder = expand_doubleword_mod (int_mode, op0, op1,
-					     binoptab == umod_optab);
-      if (remainder != NULL_RTX)
+      rtx res = NULL_RTX;
+      if ((binoptab == umod_optab || binoptab == smod_optab)
+	  && (INTVAL (op1) & 1) == 0)
+	res = expand_doubleword_mod (int_mode, op0, op1,
+				     binoptab == umod_optab);
+      else
+	{
+	  rtx quot = expand_doubleword_divmod (int_mode, op0, op1, &res,
+					       binoptab == umod_optab
+					       || binoptab == udiv_optab);
+	  if (quot == NULL_RTX)
+	    res = NULL_RTX;
+	  else if (binoptab == udiv_optab || binoptab == sdiv_optab)
+	    res = quot;
+	}
+      if (res != NULL_RTX)
 	{
 	  if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
 	    {
-	      rtx_insn *move = emit_move_insn (target ? target : remainder,
-					       remainder);
-	      set_dst_reg_note (move,
-				REG_EQUAL,
-				gen_rtx_fmt_ee (UMOD, int_mode,
-						copy_rtx (op0), op1),
-				target ? target : remainder);
+	      rtx_insn *move = emit_move_insn (target ? target : res,
+					       res);
+	      set_dst_reg_note (move, REG_EQUAL,
+				gen_rtx_fmt_ee (optab_to_code (binoptab),
+						int_mode, copy_rtx (op0), op1),
+				target ? target : res);
 	    }
-	  return remainder;
+	  return res;
 	}
       else
 	delete_insns_since (last);
--- gcc/internal-fn.c.jj	2020-11-30 10:55:33.134963320 +0100
+++ gcc/internal-fn.c	2020-12-01 18:18:20.964436077 +0100
@@ -3230,27 +3230,68 @@ expand_DIVMOD (internal_fn, gcall *call_
 	 the division and modulo and if it emits any library calls or any
 	 {,U}{DIV,MOD} rtxes throw it away and use a divmod optab or
 	 divmod libcall.  */
-      struct separate_ops ops;
-      ops.code = TRUNC_DIV_EXPR;
-      ops.type = type;
-      ops.op0 = make_tree (ops.type, op0);
-      ops.op1 = arg1;
-      ops.op2 = NULL_TREE;
-      ops.location = gimple_location (call_stmt);
-      start_sequence ();
-      quotient = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
-      if (contains_call_div_mod (get_insns ()))
-	quotient = NULL_RTX;
-      else
+      scalar_int_mode int_mode;
+      if (remainder == NULL_RTX
+	  && optimize
+	  && CONST_INT_P (op1)
+	  && !pow2p_hwi (INTVAL (op1))
+	  && is_int_mode (TYPE_MODE (type), &int_mode)
+	  && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
+	  && optab_handler (and_optab, word_mode) != CODE_FOR_nothing
+	  && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
+	  && optimize_insn_for_speed_p ())
 	{
-	  ops.code = TRUNC_MOD_EXPR;
-	  remainder = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
+	  rtx_insn *last = get_last_insn ();
+	  remainder = NULL_RTX;
+	  quotient = expand_doubleword_divmod (int_mode, op0, op1, &remainder,
+					       TYPE_UNSIGNED (type));
+	  if (quotient != NULL_RTX)
+	    {
+	      if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
+		{
+		  rtx_insn *move = emit_move_insn (quotient, quotient);
+		  set_dst_reg_note (move, REG_EQUAL,
+				    gen_rtx_fmt_ee (TYPE_UNSIGNED (type)
+						    ? UDIV : DIV, int_mode,
+						    copy_rtx (op0), op1),
+				    quotient);
+		  move = emit_move_insn (remainder, remainder);
+		  set_dst_reg_note (move, REG_EQUAL,
+				    gen_rtx_fmt_ee (TYPE_UNSIGNED (type)
+						    ? UMOD : MOD, int_mode,
+						    copy_rtx (op0), op1),
+				    quotient);
+		}
+	    }
+	  else
+	    delete_insns_since (last);
+	}
+
+      if (remainder == NULL_RTX)
+	{
+	  struct separate_ops ops;
+	  ops.code = TRUNC_DIV_EXPR;
+	  ops.type = type;
+	  ops.op0 = make_tree (ops.type, op0);
+	  ops.op1 = arg1;
+	  ops.op2 = NULL_TREE;
+	  ops.location = gimple_location (call_stmt);
+	  start_sequence ();
+	  quotient = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
 	  if (contains_call_div_mod (get_insns ()))
-	    remainder = NULL_RTX;
+	    quotient = NULL_RTX;
+	  else
+	    {
+	      ops.code = TRUNC_MOD_EXPR;
+	      remainder = expand_expr_real_2 (&ops, NULL_RTX, mode,
+					      EXPAND_NORMAL);
+	      if (contains_call_div_mod (get_insns ()))
+		remainder = NULL_RTX;
+	    }
+	  if (remainder)
+	    insns = get_insns ();
+	  end_sequence ();
 	}
-      if (remainder)
-	insns = get_insns ();
-      end_sequence ();
     }
 
   if (remainder)
--- gcc/testsuite/gcc.target/i386/pr97282.c.jj	2020-10-06 10:32:14.769771587 +0200
+++ gcc/testsuite/gcc.target/i386/pr97282.c	2020-12-01 21:52:57.901708048 +0100
@@ -18,8 +18,8 @@ foo (T x)
   unsigned long ret = 0;
   while (x > 0)
     {
-      ret = ret + x % 10;
-      x = x / 10;
+      ret = ret + x % 123456;
+      x = x / 123456;
     }
   return ret;
 }
--- gcc/testsuite/gcc.dg/pr97459-1.c.jj	2020-11-30 10:55:33.135963309 +0100
+++ gcc/testsuite/gcc.dg/pr97459-1.c	2020-12-01 18:23:25.243031163 +0100
@@ -24,7 +24,7 @@ T __attribute__((noipa)) foo (T x, T n)
 #define C3(n) C2(n##0) C2(n##4) C2(n##9)
 #define C4(n) C3(n##0) C3(n##3) C3(n##7)
 #endif
-#define TESTS C4(1)
+#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
 
 TESTS
 
--- gcc/testsuite/gcc.dg/pr97459-2.c.jj	2020-11-30 10:55:33.136963298 +0100
+++ gcc/testsuite/gcc.dg/pr97459-2.c	2020-12-01 18:23:51.423738268 +0100
@@ -26,7 +26,7 @@ T __attribute__((noipa)) foo (T x, T n)
 #define C3(n) C2(n##0) C2(n##4) C2(n##9)
 #define C4(n) C3(n##0) C3(n##3) C3(n##7)
 #endif
-#define TESTS C4(1)
+#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
 
 TESTS
 
--- gcc/testsuite/gcc.dg/pr97459-3.c.jj	2020-12-01 18:25:38.452540896 +0100
+++ gcc/testsuite/gcc.dg/pr97459-3.c	2020-12-01 18:26:00.662292428 +0100
@@ -0,0 +1,54 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t T;
+#else
+typedef unsigned long long T;
+#endif
+
+T __attribute__((noipa)) foo (T x, T n) { return x / n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x) { return x / (n - 10000); }
+
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
+
+TESTS
+
+struct S { T x; T (*foo) (T); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+
+struct S tests[] = {
+TESTS
+  { 0, 0 }
+};
+
+int
+main ()
+{
+  int i, j, k;
+  for (k = 0; tests[k].x; k++)
+    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+      for (j = -5; j <= 5; j++)
+	{
+	  T x = ((T) 1 << i) + j;
+	  if (foo (x, tests[k].x) != tests[k].foo (x))
+	    __builtin_abort ();
+	}
+  return 0;
+}
--- gcc/testsuite/gcc.dg/pr97459-4.c.jj	2020-12-01 18:26:10.272184915 +0100
+++ gcc/testsuite/gcc.dg/pr97459-4.c	2020-12-01 18:26:27.345993905 +0100
@@ -0,0 +1,57 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __int128_t T;
+typedef __uint128_t U;
+#else
+typedef long long T;
+typedef unsigned long long U;
+#endif
+
+T __attribute__((noipa)) foo (T x, T n) { return x / n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x) { return x / (n - 10000); }
+
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
+
+TESTS
+
+struct S { T x; T (*foo) (T); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+
+struct S tests[] = {
+TESTS
+  { 0, 0 }
+};
+
+int
+main ()
+{
+  int i, j, k;
+  for (k = 0; tests[k].x; k++)
+    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+      for (j = -5; j <= 5; j++)
+	{
+	  U x = ((U) 1 << i) + j;
+	  if (foo ((T) x, tests[k].x) != tests[k].foo ((T) x)
+	      || foo ((T) -x, tests[k].x) != tests[k].foo ((T) -x))
+	    __builtin_abort ();
+	}
+  return 0;
+}
--- gcc/testsuite/gcc.dg/pr97459-5.c.jj	2020-12-01 18:27:03.386590701 +0100
+++ gcc/testsuite/gcc.dg/pr97459-5.c	2020-12-01 18:28:30.324618095 +0100
@@ -0,0 +1,56 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t T;
+#else
+typedef unsigned long long T;
+#endif
+
+T __attribute__((noipa)) foo (T x, T n, T *r) { *r = x % n; return x / n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x, T *r) { *r = x % (n - 10000); return x / (n - 10000); }
+
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
+
+TESTS
+
+struct S { T x; T (*foo) (T, T *); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+
+struct S tests[] = {
+TESTS
+  { 0, 0 }
+};
+
+int
+main ()
+{
+  int i, j, k;
+  for (k = 0; tests[k].x; k++)
+    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+      for (j = -5; j <= 5; j++)
+	{
+	  T x = ((T) 1 << i) + j;
+	  T r1, r2;
+	  if (foo (x, tests[k].x, &r1) != tests[k].foo (x, &r2)
+	      || r1 != r2)
+	    __builtin_abort ();
+	}
+  return 0;
+}
--- gcc/testsuite/gcc.dg/pr97459-6.c.jj	2020-12-01 18:28:55.452336978 +0100
+++ gcc/testsuite/gcc.dg/pr97459-6.c	2020-12-01 18:31:50.667376785 +0100
@@ -0,0 +1,62 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __int128_t T;
+typedef __uint128_t U;
+#else
+typedef long long T;
+typedef unsigned long long U;
+#endif
+
+T __attribute__((noipa)) foo (T x, T n, T *r) { *r = x % n; return x / n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x, T *r) { *r = x % (n - 10000); return x / (n - 10000); }
+
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
+
+TESTS
+
+struct S { T x; T (*foo) (T, T *); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+
+struct S tests[] = {
+TESTS
+  { 0, 0 }
+};
+
+int
+main ()
+{
+  int i, j, k;
+  for (k = 0; tests[k].x; k++)
+    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+      for (j = -5; j <= 5; j++)
+	{
+	  U x = ((U) 1 << i) + j;
+	  T r1 = 0, r2 = 0;
+	  if (foo ((T) x, tests[k].x, &r1) != tests[k].foo ((T) x, &r2)
+	      || r1 != r2)
+	    __builtin_abort ();
+	  r1 = 0; r2 = 0;
+	  if (foo ((T) -x, tests[k].x, &r1) != tests[k].foo ((T) -x, &r2)
+	      || r1 != r2)
+	    __builtin_abort ();
+	}
+  return 0;
+}

	Jakub


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] expansion: Further improve double-word modulo, division and divmod [PR97459]
  2020-12-01 21:00 [PATCH] expansion: Further improve double-word modulo, division and divmod [PR97459] Jakub Jelinek
@ 2020-12-01 23:40 ` Jeff Law
  2020-12-02  7:52 ` Richard Biener
  1 sibling, 0 replies; 3+ messages in thread
From: Jeff Law @ 2020-12-01 23:40 UTC (permalink / raw)
  To: Jakub Jelinek, Richard Biener; +Cc: gcc-patches, Thomas Koenig



On 12/1/20 2:00 PM, Jakub Jelinek wrote:
> Hi!
>
> The following patch implements what Thomas wrote about, in particular
> that we can handle also double-word divison by the constants for which
> the earlier patch optimized modulo (if it would be otherwise a library
> call) and that we can also easily handle such constants shifted to the left.
> Unfortunately, seems CSE isn't able to optimize away the two almost
> identical sequences (one to compute remainder, one to compute quotient),
> probably because of the ADD_OVERFLOW introduced jumps, so the patch also
> adjusts expand_DIVMOD.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-12-01  Jakub Jelinek  <jakub@redhat.com>
>
> 	PR rtl-optimization/97459
> 	* optabs.h (expand_doubleword_divmod): Declare.
> 	* optabs.c (expand_doubleword_divmod): New function.
> 	(expand_binop): Use it.
> 	* internal-fn.c (expand_DIVMOD): Likewise.
>
> 	* gcc.target/i386/pr97282.c (foo): Use 123456 divisor instead of
> 	10.
> 	* gcc.dg/pr97459-1.c (TESTS): Add tests for 10, 12 and
> 	6144.
> 	* gcc.dg/pr97459-2.c (TESTS): Likewise.
> 	* gcc.dg/pr97459-3.c: New test.
> 	* gcc.dg/pr97459-4.c: New test.
> 	* gcc.dg/pr97459-5.c: New test.
> 	* gcc.dg/pr97459-6.c: New test.
OK
jeff


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] expansion: Further improve double-word modulo, division and divmod [PR97459]
  2020-12-01 21:00 [PATCH] expansion: Further improve double-word modulo, division and divmod [PR97459] Jakub Jelinek
  2020-12-01 23:40 ` Jeff Law
@ 2020-12-02  7:52 ` Richard Biener
  1 sibling, 0 replies; 3+ messages in thread
From: Richard Biener @ 2020-12-02  7:52 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Jeff Law, gcc-patches, Thomas Koenig

On Tue, 1 Dec 2020, Jakub Jelinek wrote:

> Hi!
> 
> The following patch implements what Thomas wrote about, in particular
> that we can handle also double-word divison by the constants for which
> the earlier patch optimized modulo (if it would be otherwise a library
> call) and that we can also easily handle such constants shifted to the left.
> Unfortunately, seems CSE isn't able to optimize away the two almost
> identical sequences (one to compute remainder, one to compute quotient),
> probably because of the ADD_OVERFLOW introduced jumps, so the patch also
> adjusts expand_DIVMOD.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Richard.

> 2020-12-01  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR rtl-optimization/97459
> 	* optabs.h (expand_doubleword_divmod): Declare.
> 	* optabs.c (expand_doubleword_divmod): New function.
> 	(expand_binop): Use it.
> 	* internal-fn.c (expand_DIVMOD): Likewise.
> 
> 	* gcc.target/i386/pr97282.c (foo): Use 123456 divisor instead of
> 	10.
> 	* gcc.dg/pr97459-1.c (TESTS): Add tests for 10, 12 and
> 	6144.
> 	* gcc.dg/pr97459-2.c (TESTS): Likewise.
> 	* gcc.dg/pr97459-3.c: New test.
> 	* gcc.dg/pr97459-4.c: New test.
> 	* gcc.dg/pr97459-5.c: New test.
> 	* gcc.dg/pr97459-6.c: New test.
> 
> --- gcc/optabs.h.jj	2020-12-01 13:19:12.831127718 +0100
> +++ gcc/optabs.h	2020-12-01 17:40:08.783137431 +0100
> @@ -183,6 +183,8 @@ extern bool force_expand_binop (machine_
>  				enum optab_methods);
>  extern rtx expand_vector_broadcast (machine_mode, rtx);
>  
> +extern rtx expand_doubleword_divmod (machine_mode, rtx, rtx, rtx *, bool);
> +
>  /* Generate code for a simple binary or unary operation.  "Simple" in
>     this case means "can be unambiguously described by a (mode, code)
>     pair and mapped to a single optab."  */
> --- gcc/optabs.c.jj	2020-12-01 16:25:01.567733333 +0100
> +++ gcc/optabs.c	2020-12-01 18:11:12.878230572 +0100
> @@ -1118,6 +1118,99 @@ expand_doubleword_mod (machine_mode mode
>      }
>    return NULL_RTX;
>  }
> +
> +/* Similarly to the above function, but compute both quotient and remainder.
> +   Quotient can be computed from the remainder as:
> +   rem = op0 % op1;  // Handled using expand_doubleword_mod
> +   quot = (op0 - rem) * inv; // inv is multiplicative inverse of op1 modulo
> +			     // 2 * BITS_PER_WORD
> +
> +   We can also handle cases where op1 is a multiple of power of two constant
> +   and constant handled by expand_doubleword_mod.
> +   op11 = 1 << __builtin_ctz (op1);
> +   op12 = op1 / op11;
> +   rem1 = op0 % op12;  // Handled using expand_doubleword_mod
> +   quot1 = (op0 - rem1) * inv; // inv is multiplicative inverse of op12 modulo
> +			       // 2 * BITS_PER_WORD
> +   rem = (quot1 % op11) * op12 + rem1;
> +   quot = quot1 / op11;  */
> +
> +rtx
> +expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
> +			  bool unsignedp)
> +{
> +  *rem = NULL_RTX;
> +
> +  /* Negative dividend should have been optimized into positive,
> +     similarly modulo by 1 and modulo by power of two is optimized
> +     differently too.  */
> +  if (INTVAL (op1) <= 1 || pow2p_hwi (INTVAL (op1)))
> +    return NULL_RTX;
> +
> +  rtx op11 = const1_rtx;
> +  rtx op12 = op1;
> +  if ((INTVAL (op1) & 1) == 0)
> +    {
> +      int bit = ctz_hwi (INTVAL (op1));
> +      op11 = GEN_INT (HOST_WIDE_INT_1 << bit);
> +      op12 = GEN_INT (INTVAL (op1) >> bit);
> +    }
> +
> +  rtx rem1 = expand_doubleword_mod (mode, op0, op12, unsignedp);
> +  if (rem1 == NULL_RTX)
> +    return NULL_RTX;
> +
> +  int prec = 2 * BITS_PER_WORD;
> +  wide_int a = wide_int::from (INTVAL (op12), prec + 1, UNSIGNED);
> +  wide_int b = wi::shifted_mask (prec, 1, false, prec + 1);
> +  wide_int m = wide_int::from (wi::mod_inv (a, b), prec, UNSIGNED);
> +  rtx inv = immed_wide_int_const (m, mode);
> +
> +  rtx_insn *last = get_last_insn ();
> +  rtx quot1 = expand_simple_binop (mode, MINUS, op0, rem1,
> +				   NULL_RTX, unsignedp, OPTAB_DIRECT);
> +  if (quot1 == NULL_RTX)
> +    return NULL_RTX;
> +
> +  quot1 = expand_simple_binop (mode, MULT, quot1, inv,
> +			       NULL_RTX, unsignedp, OPTAB_DIRECT);
> +  if (quot1 == NULL_RTX)
> +    return NULL_RTX;
> +
> +  if (op11 != const1_rtx)
> +    {
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> +				NULL_RTX, unsignedp);
> +      if (rem2 == NULL_RTX)
> +	return NULL_RTX;
> +
> +      rem2 = expand_simple_binop (mode, MULT, rem2, op12, NULL_RTX,
> +				  unsignedp, OPTAB_DIRECT);
> +      if (rem2 == NULL_RTX)
> +	return NULL_RTX;
> +
> +      rem2 = expand_simple_binop (mode, PLUS, rem2, rem1, NULL_RTX,
> +				  unsignedp, OPTAB_DIRECT);
> +      if (rem2 == NULL_RTX)
> +	return NULL_RTX;
> +
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> +				 NULL_RTX, unsignedp);
> +      if (quot2 == NULL_RTX)
> +	return NULL_RTX;
> +
> +      rem1 = rem2;
> +      quot1 = quot2;
> +    }
> +
> +  /* Punt if we need any library calls.  */
> +  for (; last; last = NEXT_INSN (last))
> +    if (CALL_P (last))
> +      return NULL_RTX;
> +
> +  *rem = rem1;
> +  return quot1;
> +}
>  \f
>  /* Wrapper around expand_binop which takes an rtx code to specify
>     the operation to perform, not an optab pointer.  All other
> @@ -1999,7 +2092,10 @@ expand_binop (machine_mode mode, optab b
>      }
>  
>    /* Attempt to synthetize double word modulo by constant divisor.  */
> -  if ((binoptab == umod_optab || binoptab == smod_optab)
> +  if ((binoptab == umod_optab
> +       || binoptab == smod_optab
> +       || binoptab == udiv_optab
> +       || binoptab == sdiv_optab)
>        && optimize
>        && CONST_INT_P (op1)
>        && is_int_mode (mode, &int_mode)
> @@ -2008,21 +2104,33 @@ expand_binop (machine_mode mode, optab b
>        && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
>        && optimize_insn_for_speed_p ())
>      {
> -      rtx remainder = expand_doubleword_mod (int_mode, op0, op1,
> -					     binoptab == umod_optab);
> -      if (remainder != NULL_RTX)
> +      rtx res = NULL_RTX;
> +      if ((binoptab == umod_optab || binoptab == smod_optab)
> +	  && (INTVAL (op1) & 1) == 0)
> +	res = expand_doubleword_mod (int_mode, op0, op1,
> +				     binoptab == umod_optab);
> +      else
> +	{
> +	  rtx quot = expand_doubleword_divmod (int_mode, op0, op1, &res,
> +					       binoptab == umod_optab
> +					       || binoptab == udiv_optab);
> +	  if (quot == NULL_RTX)
> +	    res = NULL_RTX;
> +	  else if (binoptab == udiv_optab || binoptab == sdiv_optab)
> +	    res = quot;
> +	}
> +      if (res != NULL_RTX)
>  	{
>  	  if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
>  	    {
> -	      rtx_insn *move = emit_move_insn (target ? target : remainder,
> -					       remainder);
> -	      set_dst_reg_note (move,
> -				REG_EQUAL,
> -				gen_rtx_fmt_ee (UMOD, int_mode,
> -						copy_rtx (op0), op1),
> -				target ? target : remainder);
> +	      rtx_insn *move = emit_move_insn (target ? target : res,
> +					       res);
> +	      set_dst_reg_note (move, REG_EQUAL,
> +				gen_rtx_fmt_ee (optab_to_code (binoptab),
> +						int_mode, copy_rtx (op0), op1),
> +				target ? target : res);
>  	    }
> -	  return remainder;
> +	  return res;
>  	}
>        else
>  	delete_insns_since (last);
> --- gcc/internal-fn.c.jj	2020-11-30 10:55:33.134963320 +0100
> +++ gcc/internal-fn.c	2020-12-01 18:18:20.964436077 +0100
> @@ -3230,27 +3230,68 @@ expand_DIVMOD (internal_fn, gcall *call_
>  	 the division and modulo and if it emits any library calls or any
>  	 {,U}{DIV,MOD} rtxes throw it away and use a divmod optab or
>  	 divmod libcall.  */
> -      struct separate_ops ops;
> -      ops.code = TRUNC_DIV_EXPR;
> -      ops.type = type;
> -      ops.op0 = make_tree (ops.type, op0);
> -      ops.op1 = arg1;
> -      ops.op2 = NULL_TREE;
> -      ops.location = gimple_location (call_stmt);
> -      start_sequence ();
> -      quotient = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
> -      if (contains_call_div_mod (get_insns ()))
> -	quotient = NULL_RTX;
> -      else
> +      scalar_int_mode int_mode;
> +      if (remainder == NULL_RTX
> +	  && optimize
> +	  && CONST_INT_P (op1)
> +	  && !pow2p_hwi (INTVAL (op1))
> +	  && is_int_mode (TYPE_MODE (type), &int_mode)
> +	  && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> +	  && optab_handler (and_optab, word_mode) != CODE_FOR_nothing
> +	  && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
> +	  && optimize_insn_for_speed_p ())
>  	{
> -	  ops.code = TRUNC_MOD_EXPR;
> -	  remainder = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
> +	  rtx_insn *last = get_last_insn ();
> +	  remainder = NULL_RTX;
> +	  quotient = expand_doubleword_divmod (int_mode, op0, op1, &remainder,
> +					       TYPE_UNSIGNED (type));
> +	  if (quotient != NULL_RTX)
> +	    {
> +	      if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
> +		{
> +		  rtx_insn *move = emit_move_insn (quotient, quotient);
> +		  set_dst_reg_note (move, REG_EQUAL,
> +				    gen_rtx_fmt_ee (TYPE_UNSIGNED (type)
> +						    ? UDIV : DIV, int_mode,
> +						    copy_rtx (op0), op1),
> +				    quotient);
> +		  move = emit_move_insn (remainder, remainder);
> +		  set_dst_reg_note (move, REG_EQUAL,
> +				    gen_rtx_fmt_ee (TYPE_UNSIGNED (type)
> +						    ? UMOD : MOD, int_mode,
> +						    copy_rtx (op0), op1),
> +				    quotient);
> +		}
> +	    }
> +	  else
> +	    delete_insns_since (last);
> +	}
> +
> +      if (remainder == NULL_RTX)
> +	{
> +	  struct separate_ops ops;
> +	  ops.code = TRUNC_DIV_EXPR;
> +	  ops.type = type;
> +	  ops.op0 = make_tree (ops.type, op0);
> +	  ops.op1 = arg1;
> +	  ops.op2 = NULL_TREE;
> +	  ops.location = gimple_location (call_stmt);
> +	  start_sequence ();
> +	  quotient = expand_expr_real_2 (&ops, NULL_RTX, mode, EXPAND_NORMAL);
>  	  if (contains_call_div_mod (get_insns ()))
> -	    remainder = NULL_RTX;
> +	    quotient = NULL_RTX;
> +	  else
> +	    {
> +	      ops.code = TRUNC_MOD_EXPR;
> +	      remainder = expand_expr_real_2 (&ops, NULL_RTX, mode,
> +					      EXPAND_NORMAL);
> +	      if (contains_call_div_mod (get_insns ()))
> +		remainder = NULL_RTX;
> +	    }
> +	  if (remainder)
> +	    insns = get_insns ();
> +	  end_sequence ();
>  	}
> -      if (remainder)
> -	insns = get_insns ();
> -      end_sequence ();
>      }
>  
>    if (remainder)
> --- gcc/testsuite/gcc.target/i386/pr97282.c.jj	2020-10-06 10:32:14.769771587 +0200
> +++ gcc/testsuite/gcc.target/i386/pr97282.c	2020-12-01 21:52:57.901708048 +0100
> @@ -18,8 +18,8 @@ foo (T x)
>    unsigned long ret = 0;
>    while (x > 0)
>      {
> -      ret = ret + x % 10;
> -      x = x / 10;
> +      ret = ret + x % 123456;
> +      x = x / 123456;
>      }
>    return ret;
>  }
> --- gcc/testsuite/gcc.dg/pr97459-1.c.jj	2020-11-30 10:55:33.135963309 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-1.c	2020-12-01 18:23:25.243031163 +0100
> @@ -24,7 +24,7 @@ T __attribute__((noipa)) foo (T x, T n)
>  #define C3(n) C2(n##0) C2(n##4) C2(n##9)
>  #define C4(n) C3(n##0) C3(n##3) C3(n##7)
>  #endif
> -#define TESTS C4(1)
> +#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
>  
>  TESTS
>  
> --- gcc/testsuite/gcc.dg/pr97459-2.c.jj	2020-11-30 10:55:33.136963298 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-2.c	2020-12-01 18:23:51.423738268 +0100
> @@ -26,7 +26,7 @@ T __attribute__((noipa)) foo (T x, T n)
>  #define C3(n) C2(n##0) C2(n##4) C2(n##9)
>  #define C4(n) C3(n##0) C3(n##3) C3(n##7)
>  #endif
> -#define TESTS C4(1)
> +#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
>  
>  TESTS
>  
> --- gcc/testsuite/gcc.dg/pr97459-3.c.jj	2020-12-01 18:25:38.452540896 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-3.c	2020-12-01 18:26:00.662292428 +0100
> @@ -0,0 +1,54 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __uint128_t T;
> +#else
> +typedef unsigned long long T;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n) { return x / n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x / (n - 10000); }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> +	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> +	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> +	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> +  { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> +  int i, j, k;
> +  for (k = 0; tests[k].x; k++)
> +    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> +      for (j = -5; j <= 5; j++)
> +	{
> +	  T x = ((T) 1 << i) + j;
> +	  if (foo (x, tests[k].x) != tests[k].foo (x))
> +	    __builtin_abort ();
> +	}
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/pr97459-4.c.jj	2020-12-01 18:26:10.272184915 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-4.c	2020-12-01 18:26:27.345993905 +0100
> @@ -0,0 +1,57 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __int128_t T;
> +typedef __uint128_t U;
> +#else
> +typedef long long T;
> +typedef unsigned long long U;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n) { return x / n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x / (n - 10000); }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> +	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> +	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> +	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> +  { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> +  int i, j, k;
> +  for (k = 0; tests[k].x; k++)
> +    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> +      for (j = -5; j <= 5; j++)
> +	{
> +	  U x = ((U) 1 << i) + j;
> +	  if (foo ((T) x, tests[k].x) != tests[k].foo ((T) x)
> +	      || foo ((T) -x, tests[k].x) != tests[k].foo ((T) -x))
> +	    __builtin_abort ();
> +	}
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/pr97459-5.c.jj	2020-12-01 18:27:03.386590701 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-5.c	2020-12-01 18:28:30.324618095 +0100
> @@ -0,0 +1,56 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __uint128_t T;
> +#else
> +typedef unsigned long long T;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n, T *r) { *r = x % n; return x / n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x, T *r) { *r = x % (n - 10000); return x / (n - 10000); }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> +	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> +	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> +	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T, T *); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> +  { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> +  int i, j, k;
> +  for (k = 0; tests[k].x; k++)
> +    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> +      for (j = -5; j <= 5; j++)
> +	{
> +	  T x = ((T) 1 << i) + j;
> +	  T r1, r2;
> +	  if (foo (x, tests[k].x, &r1) != tests[k].foo (x, &r2)
> +	      || r1 != r2)
> +	    __builtin_abort ();
> +	}
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/pr97459-6.c.jj	2020-12-01 18:28:55.452336978 +0100
> +++ gcc/testsuite/gcc.dg/pr97459-6.c	2020-12-01 18:31:50.667376785 +0100
> @@ -0,0 +1,62 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __int128_t T;
> +typedef __uint128_t U;
> +#else
> +typedef long long T;
> +typedef unsigned long long U;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n, T *r) { *r = x % n; return x / n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x, T *r) { *r = x % (n - 10000); return x / (n - 10000); }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> +	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> +	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> +	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1) C1(10010) C1(10012) C1(16144)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T, T *); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> +  { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> +  int i, j, k;
> +  for (k = 0; tests[k].x; k++)
> +    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> +      for (j = -5; j <= 5; j++)
> +	{
> +	  U x = ((U) 1 << i) + j;
> +	  T r1 = 0, r2 = 0;
> +	  if (foo ((T) x, tests[k].x, &r1) != tests[k].foo ((T) x, &r2)
> +	      || r1 != r2)
> +	    __builtin_abort ();
> +	  r1 = 0; r2 = 0;
> +	  if (foo ((T) -x, tests[k].x, &r1) != tests[k].foo ((T) -x, &r2)
> +	      || r1 != r2)
> +	    __builtin_abort ();
> +	}
> +  return 0;
> +}
> 
> 	Jakub
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-12-02  7:52 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-01 21:00 [PATCH] expansion: Further improve double-word modulo, division and divmod [PR97459] Jakub Jelinek
2020-12-01 23:40 ` Jeff Law
2020-12-02  7:52 ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).