* [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration @ 2024-06-11 9:02 Richard Biener 0 siblings, 0 replies; 6+ messages in thread From: Richard Biener @ 2024-06-11 9:02 UTC (permalink / raw) To: gcc-patches; +Cc: richard.sandiford The following makes peeling of a single scalar iteration handle more gaps, including non-power-of-two cases. This can be done by rounding up the remaining access to the next power-of-two which ensures that the next scalar iteration will pick at least the number of excess elements we access. I've added a correctness testcase and one x86 specific scanning for the optimization. Bootstrapped and tested on x86_64-unknown-linux-gnu, I plan to push this tomorrow after eying the CI. Checking SPEC CPU2017 on x86 shows we have no case left (from 33 before) where peeling for gaps is insufficient. This of course relies on sufficient vec_init optab coverage. Richard. PR tree-optimization/115385 * tree-vect-stmts.cc (get_group_load_store_type): Peeling of a single scalar iteration is sufficient if we can narrow the access to the next power of two of the bits in the last access. (vectorizable_load): Ensure that the last access is narrowed. * gcc.dg/vect/pr115385.c: New testcase. * gcc.target/i386/vect-pr115385.c: Likewise. 
--- gcc/testsuite/gcc.dg/vect/pr115385.c | 88 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/vect-pr115385.c | 53 +++++++++++ gcc/tree-vect-stmts.cc | 34 ++++++- 3 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr115385.c create mode 100644 gcc/testsuite/gcc.target/i386/vect-pr115385.c diff --git a/gcc/testsuite/gcc.dg/vect/pr115385.c b/gcc/testsuite/gcc.dg/vect/pr115385.c new file mode 100644 index 00000000000..a18cd665d7d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr115385.c @@ -0,0 +1,88 @@ +/* { dg-require-effective-target mmap } */ + +#include <sys/mman.h> +#include <stdio.h> + +#define COUNT 511 +#define MMAP_SIZE 0x20000 +#define ADDRESS 0x1122000000 +#define TYPE unsigned char + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +void __attribute__((noipa)) foo(TYPE * __restrict x, + TYPE *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[3*i+0]; + x[16*i+1] = y[3*i+1]; + x[16*i+2] = y[3*i+2]; + x[16*i+3] = y[3*i+0]; + x[16*i+4] = y[3*i+1]; + x[16*i+5] = y[3*i+2]; + x[16*i+6] = y[3*i+0]; + x[16*i+7] = y[3*i+1]; + x[16*i+8] = y[3*i+2]; + x[16*i+9] = y[3*i+0]; + x[16*i+10] = y[3*i+1]; + x[16*i+11] = y[3*i+2]; + x[16*i+12] = y[3*i+0]; + x[16*i+13] = y[3*i+1]; + x[16*i+14] = y[3*i+2]; + x[16*i+15] = y[3*i+0]; + } +} + +void __attribute__((noipa)) bar(TYPE * __restrict x, + TYPE *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[5*i+0]; + x[16*i+1] = y[5*i+1]; + x[16*i+2] = y[5*i+2]; + x[16*i+3] = y[5*i+3]; + x[16*i+4] = y[5*i+4]; + x[16*i+5] = y[5*i+0]; + x[16*i+6] = y[5*i+1]; + x[16*i+7] = y[5*i+2]; + x[16*i+8] = y[5*i+3]; + x[16*i+9] = y[5*i+4]; + x[16*i+10] = y[5*i+0]; + x[16*i+11] = y[5*i+1]; + x[16*i+12] = y[5*i+2]; + x[16*i+13] = y[5*i+3]; + x[16*i+14] = y[5*i+4]; + x[16*i+15] = y[5*i+0]; + } +} + +TYPE x[COUNT * 16]; + +int +main (void) +{ + void *y; + TYPE *end_y; + + y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 
-1, 0); + if (y == MAP_FAILED) + { + perror ("mmap"); + return 1; + } + + end_y = (TYPE *) ((char *) y + MMAP_SIZE); + + foo (x, end_y - COUNT * 3, COUNT); + bar (x, end_y - COUNT * 5, COUNT); + + return 0; +} + +/* We always require a scalar epilogue here but we don't know which + targets support vector composition this way. */ diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115385.c b/gcc/testsuite/gcc.target/i386/vect-pr115385.c new file mode 100644 index 00000000000..a6be9ce4e54 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-pr115385.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */ + +void __attribute__((noipa)) foo(unsigned char * __restrict x, + unsigned char *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[3*i+0]; + x[16*i+1] = y[3*i+1]; + x[16*i+2] = y[3*i+2]; + x[16*i+3] = y[3*i+0]; + x[16*i+4] = y[3*i+1]; + x[16*i+5] = y[3*i+2]; + x[16*i+6] = y[3*i+0]; + x[16*i+7] = y[3*i+1]; + x[16*i+8] = y[3*i+2]; + x[16*i+9] = y[3*i+0]; + x[16*i+10] = y[3*i+1]; + x[16*i+11] = y[3*i+2]; + x[16*i+12] = y[3*i+0]; + x[16*i+13] = y[3*i+1]; + x[16*i+14] = y[3*i+2]; + x[16*i+15] = y[3*i+0]; + } +} + +void __attribute__((noipa)) bar(unsigned char * __restrict x, + unsigned char *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[5*i+0]; + x[16*i+1] = y[5*i+1]; + x[16*i+2] = y[5*i+2]; + x[16*i+3] = y[5*i+3]; + x[16*i+4] = y[5*i+4]; + x[16*i+5] = y[5*i+0]; + x[16*i+6] = y[5*i+1]; + x[16*i+7] = y[5*i+2]; + x[16*i+8] = y[5*i+3]; + x[16*i+9] = y[5*i+4]; + x[16*i+10] = y[5*i+0]; + x[16*i+11] = y[5*i+1]; + x[16*i+12] = y[5*i+2]; + x[16*i+13] = y[5*i+3]; + x[16*i+14] = y[5*i+4]; + x[16*i+15] = y[5*i+0]; + } +} + +/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect"} } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect"} } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 54106a2be37..8aa41833433 100644 --- 
a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2142,7 +2142,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, "Peeling for outer loop is not supported\n"); return false; } - unsigned HOST_WIDE_INT cnunits, cvf; + unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size; if (overrun_p && (!nunits.is_constant (&cnunits) || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf) @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, access excess elements. ??? Enhancements include peeling multiple iterations or using masked loads with a static mask. */ - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) + || ((group_size * cvf) % cnunits + group_size - gap < cnunits + /* But peeling a single scalar iteration is enough if + we can use the next power-of-two sized partial + access. */ + && ((cremain = (group_size * cvf - gap) % cnunits), true + && ((cpart_size = (1 << ceil_log2 (cremain))) + != cnunits) + && vector_vector_composition_type + (vectype, cnunits / cpart_size, + &half_vtype) == NULL_TREE)))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, gcc_assert (new_vtype || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); + /* But still reduce the access size to the next + required power-of-two so peeling a single + scalar iteration is sufficient. */ + unsigned HOST_WIDE_INT cremain; + if (remain.is_constant (&cremain)) + { + unsigned HOST_WIDE_INT cpart_size + = 1 << ceil_log2 (cremain); + if (known_gt (nunits, cpart_size) + && constant_multiple_p (nunits, cpart_size, + &num)) + { + tree ptype; + new_vtype + = vector_vector_composition_type (vectype, + num, + &ptype); + if (new_vtype) + ltype = ptype; + } + } } } tree offset -- 2.35.3 ^ permalink raw reply [flat|nested] 6+ messages in thread
[parent not found: <11ebbcd8-66c1-4843-8e33-504fcd055d95@DB1PEPF00039232.eurprd03.prod.outlook.com>]
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration [not found] <11ebbcd8-66c1-4843-8e33-504fcd055d95@DB1PEPF00039232.eurprd03.prod.outlook.com> @ 2024-06-11 21:44 ` Richard Sandiford 2024-06-12 7:03 ` Richard Biener 0 siblings, 1 reply; 6+ messages in thread From: Richard Sandiford @ 2024-06-11 21:44 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches Don't think it makes any difference, but: Richard Biener <rguenther@suse.de> writes: > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > access excess elements. > ??? Enhancements include peeling multiple iterations > or using masked loads with a static mask. */ > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > + /* But peeling a single scalar iteration is enough if > + we can use the next power-of-two sized partial > + access. */ > + && ((cremain = (group_size * cvf - gap) % cnunits), true ...this might be less surprising as: && ((cremain = (group_size * cvf - gap) % cnunits, true) in terms of how the &&s line up. Thanks, Richard > + && ((cpart_size = (1 << ceil_log2 (cremain))) > + != cnunits) > + && vector_vector_composition_type > + (vectype, cnunits / cpart_size, > + &half_vtype) == NULL_TREE)))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > gcc_assert (new_vtype > || LOOP_VINFO_PEELING_FOR_GAPS > (loop_vinfo)); > + /* But still reduce the access size to the next > + required power-of-two so peeling a single > + scalar iteration is sufficient. 
*/ > + unsigned HOST_WIDE_INT cremain; > + if (remain.is_constant (&cremain)) > + { > + unsigned HOST_WIDE_INT cpart_size > + = 1 << ceil_log2 (cremain); > + if (known_gt (nunits, cpart_size) > + && constant_multiple_p (nunits, cpart_size, > + &num)) > + { > + tree ptype; > + new_vtype > + = vector_vector_composition_type (vectype, > + num, > + &ptype); > + if (new_vtype) > + ltype = ptype; > + } > + } > } > } > tree offset ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-11 21:44 ` Richard Sandiford @ 2024-06-12 7:03 ` Richard Biener 2024-06-12 9:06 ` Richard Biener 0 siblings, 1 reply; 6+ messages in thread From: Richard Biener @ 2024-06-12 7:03 UTC (permalink / raw) To: Richard Sandiford; +Cc: gcc-patches On Tue, 11 Jun 2024, Richard Sandiford wrote: > Don't think it makes any difference, but: > > Richard Biener <rguenther@suse.de> writes: > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > > access excess elements. > > ??? Enhancements include peeling multiple iterations > > or using masked loads with a static mask. */ > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > > + /* But peeling a single scalar iteration is enough if > > + we can use the next power-of-two sized partial > > + access. */ > > + && ((cremain = (group_size * cvf - gap) % cnunits), true > > ...this might be less surprising as: > > && ((cremain = (group_size * cvf - gap) % cnunits, true) > > in terms of how the &&s line up. Yeah - I'll fix before pushing. Thanks, Richard. > Thanks, > Richard > > > + && ((cpart_size = (1 << ceil_log2 (cremain))) > > + != cnunits) > > + && vector_vector_composition_type > > + (vectype, cnunits / cpart_size, > > + &half_vtype) == NULL_TREE)))) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > > gcc_assert (new_vtype > > || LOOP_VINFO_PEELING_FOR_GAPS > > (loop_vinfo)); > > + /* But still reduce the access size to the next > > + required power-of-two so peeling a single > > + scalar iteration is sufficient. 
*/ > > + unsigned HOST_WIDE_INT cremain; > > + if (remain.is_constant (&cremain)) > > + { > > + unsigned HOST_WIDE_INT cpart_size > > + = 1 << ceil_log2 (cremain); > > + if (known_gt (nunits, cpart_size) > > + && constant_multiple_p (nunits, cpart_size, > > + &num)) > > + { > > + tree ptype; > > + new_vtype > > + = vector_vector_composition_type (vectype, > > + num, > > + &ptype); > > + if (new_vtype) > > + ltype = ptype; > > + } > > + } > > } > > } > > tree offset > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-12 7:03 ` Richard Biener @ 2024-06-12 9:06 ` Richard Biener 2024-06-12 9:38 ` Richard Sandiford 0 siblings, 1 reply; 6+ messages in thread From: Richard Biener @ 2024-06-12 9:06 UTC (permalink / raw) To: Richard Sandiford; +Cc: gcc-patches On Wed, 12 Jun 2024, Richard Biener wrote: > On Tue, 11 Jun 2024, Richard Sandiford wrote: > > > Don't think it makes any difference, but: > > > > Richard Biener <rguenther@suse.de> writes: > > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > > > access excess elements. > > > ??? Enhancements include peeling multiple iterations > > > or using masked loads with a static mask. */ > > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > > > + /* But peeling a single scalar iteration is enough if > > > + we can use the next power-of-two sized partial > > > + access. */ > > > + && ((cremain = (group_size * cvf - gap) % cnunits), true > > > > ...this might be less surprising as: > > > > && ((cremain = (group_size * cvf - gap) % cnunits, true) > > > > in terms of how the &&s line up. > > Yeah - I'll fix before pushing. The aarch64 CI shows that a few testcases no longer use SVE (gcc.target/aarch64/sve/slp_perm_{4,7,8}.c) because peeling for gaps is deemed insufficient. Formerly we had if (loop_vinfo && *memory_access_type == VMAT_CONTIGUOUS && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo), nunits)) { unsigned HOST_WIDE_INT cnunits, cvf; if (!can_overrun_p || !nunits.is_constant (&cnunits) || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf) /* Peeling for gaps assumes that a single scalar iteration is enough to make sure the last vector iteration doesn't access excess elements. ??? 
Enhancements include peeling multiple iterations or using masked loads with a static mask. */ || (group_size * cvf) % cnunits + group_size - gap < cnunits) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "peeling for gaps insufficient for " "access\n"); and in all cases multiple_p (group_size * LOOP_VINFO_VECT_FACTOR, nunits) is true so we didn't check for whether peeling one iteration is sufficient. But after the refactoring the outer checks merely indicates there's overrun (which is there already because gap != 0). That is, we never verified, for the "regular" gap case, whether peeling for a single iteration is sufficient. But now of course we run into the inner check which will always trigger if earlier checks didn't work out to set overrun_p to false. For slp_perm_8.c we have a group_size of two, nunits is {16, 16} and VF is {8, 8} and gap is one. Given we know the multiple_p we know that (group_size * cvf) % cnunits is zero, so what remains is group_size - gap < nunits but 1 is probably always less than {16, 16}. The new logic I added in the later patch that peeling a single iteration is OK when we use a smaller, rounded-up to power-of-two sized access is || ((group_size * cvf) % cnunits + group_size - gap < cnunits /* But peeling a single scalar iteration is enough if we can use the next power-of-two sized partial access. */ && (cremain = (group_size * cvf - gap) % cnunits, true) && (cpart_size = (1 << ceil_log2 (cremain))) != cnunits && vector_vector_composition_type (vectype, cnunits / cpart_size, &half_vtype) == NULL_TREE))) again knowing the multiple we know cremain is nunits - gap and with gap == 1 rounding this size up will yield nunits and thus the existing peeling is OK. Something is inconsistent here and the pre-existing (group_size * cvf) % cnunits + group_size - gap < cnunits check looks suspicious for a general check. 
(group_size * cvf - gap) should be the number of elements we can access without touching excess elements. Peeling a single iteration will make sure group_size * cvf + group_size - gap is accessed (that's group_size * (cvf + 1) - gap). The excess elements touched in the vector loop are cnunits - (group_size * cvf - gap) % cnunits I think that number needs to be less or equal to group_size, so the correct check should be (group_size * cvf - gap) % cnunits + group_size < cnunits for the SVE case that's (nunits - 1) + 2 < nunits which should simplify to false. Now the question is how to formulate this with poly-ints in a way that it works out, for the case in question doing the overrun check as if (overrun_p && can_div_trunc_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap, nunits, &tem, &remain) && maybe_lt (remain + group_size, nunits)) seems to do the trick. I'm going to test this, will post an updated series for the CI. Richard. > Thanks, > Richard. > > > Thanks, > > Richard > > > > > + && ((cpart_size = (1 << ceil_log2 (cremain))) > > > + != cnunits) > > > + && vector_vector_composition_type > > > + (vectype, cnunits / cpart_size, > > > + &half_vtype) == NULL_TREE)))) > > > { > > > if (dump_enabled_p ()) > > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > > > gcc_assert (new_vtype > > > || LOOP_VINFO_PEELING_FOR_GAPS > > > (loop_vinfo)); > > > + /* But still reduce the access size to the next > > > + required power-of-two so peeling a single > > > + scalar iteration is sufficient. 
*/ > > > + unsigned HOST_WIDE_INT cremain; > > > + if (remain.is_constant (&cremain)) > > > + { > > > + unsigned HOST_WIDE_INT cpart_size > > > + = 1 << ceil_log2 (cremain); > > > + if (known_gt (nunits, cpart_size) > > > + && constant_multiple_p (nunits, cpart_size, > > > + &num)) > > > + { > > > + tree ptype; > > > + new_vtype > > > + = vector_vector_composition_type (vectype, > > > + num, > > > + &ptype); > > > + if (new_vtype) > > > + ltype = ptype; > > > + } > > > + } > > > } > > > } > > > tree offset > > > > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-12 9:06 ` Richard Biener @ 2024-06-12 9:38 ` Richard Sandiford 2024-06-12 10:43 ` Richard Biener 0 siblings, 1 reply; 6+ messages in thread From: Richard Sandiford @ 2024-06-12 9:38 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches Richard Biener <rguenther@suse.de> writes: > On Wed, 12 Jun 2024, Richard Biener wrote: > >> On Tue, 11 Jun 2024, Richard Sandiford wrote: >> >> > Don't think it makes any difference, but: >> > >> > Richard Biener <rguenther@suse.de> writes: >> > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, >> > > access excess elements. >> > > ??? Enhancements include peeling multiple iterations >> > > or using masked loads with a static mask. */ >> > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) >> > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits >> > > + /* But peeling a single scalar iteration is enough if >> > > + we can use the next power-of-two sized partial >> > > + access. */ >> > > + && ((cremain = (group_size * cvf - gap) % cnunits), true >> > >> > ...this might be less surprising as: >> > >> > && ((cremain = (group_size * cvf - gap) % cnunits, true) >> > >> > in terms of how the &&s line up. >> >> Yeah - I'll fix before pushing. > > The aarch64 CI shows that a few testcases no longer use SVE > (gcc.target/aarch64/sve/slp_perm_{4,7,8}.c) because peeling > for gaps is deemed isufficient. 
Formerly we had > > if (loop_vinfo > && *memory_access_type == VMAT_CONTIGUOUS > && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () > && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR > (loop_vinfo), > nunits)) > { > unsigned HOST_WIDE_INT cnunits, cvf; > if (!can_overrun_p > || !nunits.is_constant (&cnunits) > || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant > (&cvf) > /* Peeling for gaps assumes that a single scalar > iteration > is enough to make sure the last vector iteration > doesn't > access excess elements. > ??? Enhancements include peeling multiple iterations > or using masked loads with a static mask. */ > || (group_size * cvf) % cnunits + group_size - gap < > cnunits) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, > vect_location, > "peeling for gaps insufficient for " > "access\n"); > > and in all cases multiple_p (group_size * LOOP_VINFO_VECT_FACTOR, nunits) > is true so we didn't check for whether peeling one iteration is > sufficient. But after the refactoring the outer checks merely > indicates there's overrun (which is there already because gap != 0). > > That is, we never verified, for the "regular" gap case, whether peeling > for a single iteration is sufficient. But now of course we run into > the inner check which will always trigger if earlier checks didn't > work out to set overrun_p to false. > > For slp_perm_8.c we have a group_size of two, nunits is {16, 16} > and VF is {8, 8} and gap is one. Given we know the > multiple_p we know that (group_size * cvf) % cnunits is zero, > so what remains is group_size - gap < nunits but 1 is probably > always less than {16, 16}. I thought the idea was that the size of the gap was immaterial for VMAT_CONTIGUOUS, on the assumption that it would never be bigger than a page. That is, any gap loaded by the final unpeeled iteration would belong to the same page as the non-gap data from either the same vector iteration or the subsequent peeled scalar iteration. 
Will have to think more about this if that doesn't affect the rest of the message, but FWIW... > The new logic I added in the later patch that peeling a single > iteration is OK when we use a smaller, rounded-up to power-of-two > sized access is > > || ((group_size * cvf) % cnunits + group_size - gap < > cnunits > /* But peeling a single scalar iteration is enough > if > we can use the next power-of-two sized partial > access. */ > && (cremain = (group_size * cvf - gap) % cnunits, > true) > && (cpart_size = (1 << ceil_log2 (cremain))) != > cnunits > && vector_vector_composition_type > (vectype, cnunits / cpart_size, > &half_vtype) == NULL_TREE))) > > again knowing the multiple we know cremain is nunits - gap and with > gap == 1 rounding this size up will yield nunits and thus the existing > peeling is OK. Something is inconsistent here and the pre-existing > > (group_size * cvf) % cnunits + group_size - gap < cnunits > > check looks suspicious for a general check. > > (group_size * cvf - gap) > > should be the number of elements we can access without touching > excess elements. Peeling a single iteration will make sure > group_size * cvf + group_size - gap is accessed > (that's group_size * (cvf + 1) - gap). The excess elements > touched in the vector loop are > > cnunits - (group_size * cvf - gap) % cnunits > > I think that number needs to be less or equal to group_size, so > the correct check should be > > (group_size * cvf - gap) % cnunits + group_size < cnunits > > for the SVE case that's (nunits - 1) + 2 < nunits which should > simplify to false. Now the question is how to formulate this > with poly-ints in a way that it works out, for the case in > question doing the overrun check as > > if (overrun_p > && can_div_trunc_p (group_size > * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - > gap, > nunits, &tem, &remain) > && maybe_lt (remain + group_size, nunits)) ...it looks like this should be known_lt, if we're relying on it for correctness. 
Thanks, Richard > seems to do the trick. I'm going to test this, will post an updated > series for the CI. > > Richard. > >> Thanks, >> Richard. >> >> > Thanks, >> > Richard >> > >> > > + && ((cpart_size = (1 << ceil_log2 (cremain))) >> > > + != cnunits) >> > > + && vector_vector_composition_type >> > > + (vectype, cnunits / cpart_size, >> > > + &half_vtype) == NULL_TREE)))) >> > > { >> > > if (dump_enabled_p ()) >> > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, >> > > gcc_assert (new_vtype >> > > || LOOP_VINFO_PEELING_FOR_GAPS >> > > (loop_vinfo)); >> > > + /* But still reduce the access size to the next >> > > + required power-of-two so peeling a single >> > > + scalar iteration is sufficient. */ >> > > + unsigned HOST_WIDE_INT cremain; >> > > + if (remain.is_constant (&cremain)) >> > > + { >> > > + unsigned HOST_WIDE_INT cpart_size >> > > + = 1 << ceil_log2 (cremain); >> > > + if (known_gt (nunits, cpart_size) >> > > + && constant_multiple_p (nunits, cpart_size, >> > > + &num)) >> > > + { >> > > + tree ptype; >> > > + new_vtype >> > > + = vector_vector_composition_type (vectype, >> > > + num, >> > > + &ptype); >> > > + if (new_vtype) >> > > + ltype = ptype; >> > > + } >> > > + } >> > > } >> > > } >> > > tree offset >> > >> >> ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-12 9:38 ` Richard Sandiford @ 2024-06-12 10:43 ` Richard Biener 0 siblings, 0 replies; 6+ messages in thread From: Richard Biener @ 2024-06-12 10:43 UTC (permalink / raw) To: Richard Sandiford; +Cc: gcc-patches On Wed, 12 Jun 2024, Richard Sandiford wrote: > Richard Biener <rguenther@suse.de> writes: > > On Wed, 12 Jun 2024, Richard Biener wrote: > > > >> On Tue, 11 Jun 2024, Richard Sandiford wrote: > >> > >> > Don't think it makes any difference, but: > >> > > >> > Richard Biener <rguenther@suse.de> writes: > >> > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > >> > > access excess elements. > >> > > ??? Enhancements include peeling multiple iterations > >> > > or using masked loads with a static mask. */ > >> > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > >> > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > >> > > + /* But peeling a single scalar iteration is enough if > >> > > + we can use the next power-of-two sized partial > >> > > + access. */ > >> > > + && ((cremain = (group_size * cvf - gap) % cnunits), true > >> > > >> > ...this might be less surprising as: > >> > > >> > && ((cremain = (group_size * cvf - gap) % cnunits, true) > >> > > >> > in terms of how the &&s line up. > >> > >> Yeah - I'll fix before pushing. > > > > The aarch64 CI shows that a few testcases no longer use SVE > > (gcc.target/aarch64/sve/slp_perm_{4,7,8}.c) because peeling > > for gaps is deemed isufficient. 
Formerly we had > > > > if (loop_vinfo > > && *memory_access_type == VMAT_CONTIGUOUS > > && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () > > && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR > > (loop_vinfo), > > nunits)) > > { > > unsigned HOST_WIDE_INT cnunits, cvf; > > if (!can_overrun_p > > || !nunits.is_constant (&cnunits) > > || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant > > (&cvf) > > /* Peeling for gaps assumes that a single scalar > > iteration > > is enough to make sure the last vector iteration > > doesn't > > access excess elements. > > ??? Enhancements include peeling multiple iterations > > or using masked loads with a static mask. */ > > || (group_size * cvf) % cnunits + group_size - gap < > > cnunits) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, > > vect_location, > > "peeling for gaps insufficient for " > > "access\n"); > > > > and in all cases multiple_p (group_size * LOOP_VINFO_VECT_FACTOR, nunits) > > is true so we didn't check for whether peeling one iteration is > > sufficient. But after the refactoring the outer checks merely > > indicates there's overrun (which is there already because gap != 0). > > > > That is, we never verified, for the "regular" gap case, whether peeling > > for a single iteration is sufficient. But now of course we run into > > the inner check which will always trigger if earlier checks didn't > > work out to set overrun_p to false. > > > > For slp_perm_8.c we have a group_size of two, nunits is {16, 16} > > and VF is {8, 8} and gap is one. Given we know the > > multiple_p we know that (group_size * cvf) % cnunits is zero, > > so what remains is group_size - gap < nunits but 1 is probably > > always less than {16, 16}. > > I thought the idea was that the size of the gap was immaterial > for VMAT_CONTIGUOUS, on the assumption that it would never be > bigger than a page. 
That is, any gap loaded by the final > unpeeled iteration would belong to the same page as the non-gap > data from either the same vector iteration or the subsequent > peeled scalar iteration. The subsequent scalar iteration might be on the same page as the last vector iteration but that accessing elements beyond those touched by the subsequent scalar iteration (which could be on the next page). That's what this is supposed to check - that in fact the next scalar iteration covers all elements accessed in excess in the last vector iteration. So the size of the gap matters if it is larger than the group_size or the size of a vector (the former happens with group_size being smaller than the vector size which can happen with permutes). > > Will have to think more about this if that doesn't affect the > rest of the message, but FWIW... > > > The new logic I added in the later patch that peeling a single > > iteration is OK when we use a smaller, rounded-up to power-of-two > > sized access is > > > > || ((group_size * cvf) % cnunits + group_size - gap < > > cnunits > > /* But peeling a single scalar iteration is enough > > if > > we can use the next power-of-two sized partial > > access. */ > > && (cremain = (group_size * cvf - gap) % cnunits, > > true) > > && (cpart_size = (1 << ceil_log2 (cremain))) != > > cnunits > > && vector_vector_composition_type > > (vectype, cnunits / cpart_size, > > &half_vtype) == NULL_TREE))) > > > > again knowing the multiple we know cremain is nunits - gap and with > > gap == 1 rounding this size up will yield nunits and thus the existing > > peeling is OK. Something is inconsistent here and the pre-existing > > > > (group_size * cvf) % cnunits + group_size - gap < cnunits > > > > check looks suspicious for a general check. > > > > (group_size * cvf - gap) > > > > should be the number of elements we can access without touching > > excess elements. 
Peeling a single iteration will make sure > > group_size * cvf + group_size - gap is accessed > > (that's group_size * (cvf + 1) - gap). The excess elements > > touched in the vector loop are > > > > cnunits - (group_size * cvf - gap) % cnunits > > > > I think that number needs to be less or equal to group_size, so > > the correct check should be > > > > (group_size * cvf - gap) % cnunits + group_size < cnunits > > > > for the SVE case that's (nunits - 1) + 2 < nunits which should > > simplify to false. Now the question is how to formulate this > > with poly-ints in a way that it works out, for the case in > > question doing the overrun check as > > > > if (overrun_p > > && can_div_trunc_p (group_size > > * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - > > gap, > > nunits, &tem, &remain) > > && maybe_lt (remain + group_size, nunits)) > > ...it looks like this should be known_lt, if we're relying on it > for correctness. In fact the above was slightly wrong and should have been if (overrun_p && (!can_div_trunc_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap, nunits, &tem, &remain) || maybe_lt (remain + group_size, nunits))) thus it's a negative test and maybe_lt should be correct. Richard. > > Thanks, > Richard > > > seems to do the trick. I'm going to test this, will post an updated > > series for the CI. > > > > Richard. > > > >> Thanks, > >> Richard. 
> >> > >> > Thanks, > >> > Richard > >> > > >> > > + && ((cpart_size = (1 << ceil_log2 (cremain))) > >> > > + != cnunits) > >> > > + && vector_vector_composition_type > >> > > + (vectype, cnunits / cpart_size, > >> > > + &half_vtype) == NULL_TREE)))) > >> > > { > >> > > if (dump_enabled_p ()) > >> > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > >> > > gcc_assert (new_vtype > >> > > || LOOP_VINFO_PEELING_FOR_GAPS > >> > > (loop_vinfo)); > >> > > + /* But still reduce the access size to the next > >> > > + required power-of-two so peeling a single > >> > > + scalar iteration is sufficient. */ > >> > > + unsigned HOST_WIDE_INT cremain; > >> > > + if (remain.is_constant (&cremain)) > >> > > + { > >> > > + unsigned HOST_WIDE_INT cpart_size > >> > > + = 1 << ceil_log2 (cremain); > >> > > + if (known_gt (nunits, cpart_size) > >> > > + && constant_multiple_p (nunits, cpart_size, > >> > > + &num)) > >> > > + { > >> > > + tree ptype; > >> > > + new_vtype > >> > > + = vector_vector_composition_type (vectype, > >> > > + num, > >> > > + &ptype); > >> > > + if (new_vtype) > >> > > + ltype = ptype; > >> > > + } > >> > > + } > >> > > } > >> > > } > >> > > tree offset > >> > > >> > >> > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-06-12 10:43 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-06-11 9:02 [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration Richard Biener [not found] <11ebbcd8-66c1-4843-8e33-504fcd055d95@DB1PEPF00039232.eurprd03.prod.outlook.com> 2024-06-11 21:44 ` Richard Sandiford 2024-06-12 7:03 ` Richard Biener 2024-06-12 9:06 ` Richard Biener 2024-06-12 9:38 ` Richard Sandiford 2024-06-12 10:43 ` Richard Biener
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).