public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-6619] middle-end: Implement preferred_div_as_shifts_over_mult [PR108583]
@ 2023-03-12 18:44 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2023-03-12 18:44 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:81fd62d1378b7ddc1fa0967cbddcdcdcdd2d8d8c

commit r13-6619-g81fd62d1378b7ddc1fa0967cbddcdcdcdd2d8d8c
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Sun Mar 12 18:42:04 2023 +0000

    middle-end: Implement preferred_div_as_shifts_over_mult [PR108583]
    
    This now implements a hook
    preferred_div_as_shifts_over_mult that indicates whether a target prefers that
    the vectorizer decomposes division as shifts rather than multiplication when
    possible.
    
    In order to be able to use this we need to check whether the current precision
    has enough bits to do the operation without any of the additions overflowing.
    
    We use range information to determine this and only do the operation if we're
    sure am overflow won't occur. This now uses ranger to do this range check.
    
    This seems to work better than vect_get_range_info which uses range_query, but I
    have not switched the interface of vect_get_range_info over in this PR fix.
    
    As Andy said before initializing a ranger instance is cheap but not free, and if
    the intention is to call it often during a pass it should be instantiated at
    pass startup and passed along to the places that need it.  This is a big
    refactoring and doesn't seem right to do in this PR.  But we should in GCC 14.
    
    Currently we only instantiate it after a long series of much cheaper checks.
    
    gcc/ChangeLog:
    
            PR target/108583
            * target.def (preferred_div_as_shifts_over_mult): New.
            * doc/tm.texi.in: Document it.
            * doc/tm.texi: Regenerate.
            * targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
            * targhooks.h (default_preferred_div_as_shifts_over_mult): New.
            * tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.
    
    gcc/testsuite/ChangeLog:
    
            PR target/108583
            * gcc.dg/vect/vect-div-bitmask-4.c: New test.
            * gcc.dg/vect/vect-div-bitmask-5.c: New test.

Diff:
---
 gcc/doc/tm.texi                                |  6 ++
 gcc/doc/tm.texi.in                             |  1 +
 gcc/target.def                                 | 12 ++++
 gcc/targhooks.cc                               |  9 +++
 gcc/targhooks.h                                |  2 +
 gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c | 25 +++++++++
 gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c | 58 +++++++++++++++++++
 gcc/tree-vect-patterns.cc                      | 77 ++++++++++++++++++++++++++
 8 files changed, 190 insertions(+)

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 144862e00c4..c4a92a5ebee 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6146,6 +6146,12 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+Sometimes it is possible to implement a vector division using a sequence
+of two addition-shift pairs, giving four instructions in total.
+Return true if taking this approach for @var{vectype} is likely
+to be better than using a sequence involving highpart multiplication.
+Default is false if @code{can_mult_highpart_p}, otherwise true.
 @end deftypefn
 
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 146f461ddad..4075e71624c 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4175,6 +4175,7 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
diff --git a/gcc/target.def b/gcc/target.def
index 049dab64f68..f401fe148ee 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,18 @@ correct for most targets.",
  poly_uint64, (const_tree type),
  default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "Sometimes it is possible to implement a vector division using a sequence\n\
+of two addition-shift pairs, giving four instructions in total.\n\
+Return true if taking this approach for @var{vectype} is likely\n\
+to be better than using a sequence involving highpart multiplication.\n\
+Default is false if @code{can_mult_highpart_p}, otherwise true.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N
    iterations) for the given scalar type.  */
 DEFHOOK
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 8b7452ce336..51bf3fb7a82 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1488,6 +1488,15 @@ default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree type)
+{
+  return !can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
 bool
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 555160da48a..cf3d3107a0d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask
   (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+  (const_tree);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 
 extern tree default_stack_protect_guard (void);
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 00000000000..c81f8946922
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 00000000000..b4eb1a4dacb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 298fd295a73..887f02bf336 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3934,6 +3934,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
       return pattern_stmt;
     }
 
+  if ((cst = uniform_integer_cst_p (oprnd1))
+      && TYPE_UNSIGNED (itype)
+      && rhs_code == TRUNC_DIV_EXPR
+      && vectype
+      && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+    {
+      /* We can use the relationship:
+
+	   x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)
+
+	 to optimize cases where N+1 is a power of 2, and where // (N+1)
+	 is therefore a shift right.  When operating in modes that are
+	 multiples of a byte in size, there are two cases:
+
+	 (1) N(N+3) is not representable, in which case the question
+	     becomes whether the replacement expression overflows.
+	     It is enough to test that x+N+2 does not overflow,
+	     i.e. that x < MAX-(N+1).
+
+	 (2) N(N+3) is representable, in which case it is the (only)
+	     bound that we need to check.
+
+	 ??? For now we just handle the case where // (N+1) is a shift
+	 right by half the precision, since some architectures can
+	 optimize the associated addition and shift combinations
+	 into single instructions.  */
+
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == prec / 2)
+	{
+	  gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+	  gimple_ranger ranger;
+	  int_range_max r;
+
+	  /* Check that no overflow will occur.  If we don't have range
+	     information we can't perform the optimization.  */
+
+	  if (ranger.range_of_expr (r, oprnd0, stmt))
+	    {
+	      wide_int max = r.upper_bound ();
+	      wide_int one = wi::shwi (1, prec);
+	      wide_int adder = wi::add (one, wi::lshift (one, pow));
+	      wi::overflow_type ovf;
+	      wi::add (max, adder, UNSIGNED, &ovf);
+	      if (ovf == wi::OVF_NONE)
+		{
+		  *type_out = vectype;
+		  tree tadder = wide_int_to_tree (itype, adder);
+		  tree rshift = wide_int_to_tree (itype, pow);
+
+		  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+		  gassign *patt1
+		    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+					       rshift);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+					       oprnd0);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+		  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+						      new_lhs3, rshift);
+
+		  return pattern_stmt;
+		}
+	    }
+	}
+    }
+
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
     return NULL;

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-03-12 18:44 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-12 18:44 [gcc r13-6619] middle-end: Implement preferred_div_as_shifts_over_mult [PR108583] Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).