* [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration @ 2024-06-11 9:02 Richard Biener 0 siblings, 0 replies; 6+ messages in thread From: Richard Biener @ 2024-06-11 9:02 UTC (permalink / raw) To: gcc-patches; +Cc: richard.sandiford The following makes peeling of a single scalar iteration handle more gaps, including non-power-of-two cases. This can be done by rounding up the remaining access to the next power-of-two which ensures that the next scalar iteration will pick at least the number of excess elements we access. I've added a correctness testcase and one x86 specific scanning for the optimization. Bootstrapped and tested on x86_64-unknown-linux-gnu, I plan to push this tomorrow after eying the CI. Checking SPEC CPU2017 on x86 shows we have no case left (from 33 before) where peeling for gaps is insufficient. This of course relies on sufficient vec_init optab coverage. Richard. PR tree-optimization/115385 * tree-vect-stmts.cc (get_group_load_store_type): Peeling of a single scalar iteration is sufficient if we can narrow the access to the next power of two of the bits in the last access. (vectorizable_load): Ensure that the last access is narrowed. * gcc.dg/vect/pr115385.c: New testcase. * gcc.target/i386/vect-pr115385.c: Likewise. 
--- gcc/testsuite/gcc.dg/vect/pr115385.c | 88 +++++++++++++++++++ gcc/testsuite/gcc.target/i386/vect-pr115385.c | 53 +++++++++++ gcc/tree-vect-stmts.cc | 34 ++++++- 3 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr115385.c create mode 100644 gcc/testsuite/gcc.target/i386/vect-pr115385.c diff --git a/gcc/testsuite/gcc.dg/vect/pr115385.c b/gcc/testsuite/gcc.dg/vect/pr115385.c new file mode 100644 index 00000000000..a18cd665d7d --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr115385.c @@ -0,0 +1,88 @@ +/* { dg-require-effective-target mmap } */ + +#include <sys/mman.h> +#include <stdio.h> + +#define COUNT 511 +#define MMAP_SIZE 0x20000 +#define ADDRESS 0x1122000000 +#define TYPE unsigned char + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +void __attribute__((noipa)) foo(TYPE * __restrict x, + TYPE *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[3*i+0]; + x[16*i+1] = y[3*i+1]; + x[16*i+2] = y[3*i+2]; + x[16*i+3] = y[3*i+0]; + x[16*i+4] = y[3*i+1]; + x[16*i+5] = y[3*i+2]; + x[16*i+6] = y[3*i+0]; + x[16*i+7] = y[3*i+1]; + x[16*i+8] = y[3*i+2]; + x[16*i+9] = y[3*i+0]; + x[16*i+10] = y[3*i+1]; + x[16*i+11] = y[3*i+2]; + x[16*i+12] = y[3*i+0]; + x[16*i+13] = y[3*i+1]; + x[16*i+14] = y[3*i+2]; + x[16*i+15] = y[3*i+0]; + } +} + +void __attribute__((noipa)) bar(TYPE * __restrict x, + TYPE *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[5*i+0]; + x[16*i+1] = y[5*i+1]; + x[16*i+2] = y[5*i+2]; + x[16*i+3] = y[5*i+3]; + x[16*i+4] = y[5*i+4]; + x[16*i+5] = y[5*i+0]; + x[16*i+6] = y[5*i+1]; + x[16*i+7] = y[5*i+2]; + x[16*i+8] = y[5*i+3]; + x[16*i+9] = y[5*i+4]; + x[16*i+10] = y[5*i+0]; + x[16*i+11] = y[5*i+1]; + x[16*i+12] = y[5*i+2]; + x[16*i+13] = y[5*i+3]; + x[16*i+14] = y[5*i+4]; + x[16*i+15] = y[5*i+0]; + } +} + +TYPE x[COUNT * 16]; + +int +main (void) +{ + void *y; + TYPE *end_y; + + y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 
-1, 0); + if (y == MAP_FAILED) + { + perror ("mmap"); + return 1; + } + + end_y = (TYPE *) ((char *) y + MMAP_SIZE); + + foo (x, end_y - COUNT * 3, COUNT); + bar (x, end_y - COUNT * 5, COUNT); + + return 0; +} + +/* We always require a scalar epilogue here but we don't know which + targets support vector composition this way. */ diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115385.c b/gcc/testsuite/gcc.target/i386/vect-pr115385.c new file mode 100644 index 00000000000..a6be9ce4e54 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-pr115385.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */ + +void __attribute__((noipa)) foo(unsigned char * __restrict x, + unsigned char *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[3*i+0]; + x[16*i+1] = y[3*i+1]; + x[16*i+2] = y[3*i+2]; + x[16*i+3] = y[3*i+0]; + x[16*i+4] = y[3*i+1]; + x[16*i+5] = y[3*i+2]; + x[16*i+6] = y[3*i+0]; + x[16*i+7] = y[3*i+1]; + x[16*i+8] = y[3*i+2]; + x[16*i+9] = y[3*i+0]; + x[16*i+10] = y[3*i+1]; + x[16*i+11] = y[3*i+2]; + x[16*i+12] = y[3*i+0]; + x[16*i+13] = y[3*i+1]; + x[16*i+14] = y[3*i+2]; + x[16*i+15] = y[3*i+0]; + } +} + +void __attribute__((noipa)) bar(unsigned char * __restrict x, + unsigned char *y, int n) +{ + for (int i = 0; i < n; ++i) + { + x[16*i+0] = y[5*i+0]; + x[16*i+1] = y[5*i+1]; + x[16*i+2] = y[5*i+2]; + x[16*i+3] = y[5*i+3]; + x[16*i+4] = y[5*i+4]; + x[16*i+5] = y[5*i+0]; + x[16*i+6] = y[5*i+1]; + x[16*i+7] = y[5*i+2]; + x[16*i+8] = y[5*i+3]; + x[16*i+9] = y[5*i+4]; + x[16*i+10] = y[5*i+0]; + x[16*i+11] = y[5*i+1]; + x[16*i+12] = y[5*i+2]; + x[16*i+13] = y[5*i+3]; + x[16*i+14] = y[5*i+4]; + x[16*i+15] = y[5*i+0]; + } +} + +/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect"} } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect"} } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 54106a2be37..8aa41833433 100644 --- 
a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2142,7 +2142,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, "Peeling for outer loop is not supported\n"); return false; } - unsigned HOST_WIDE_INT cnunits, cvf; + unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size; if (overrun_p && (!nunits.is_constant (&cnunits) || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf) @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, access excess elements. ??? Enhancements include peeling multiple iterations or using masked loads with a static mask. */ - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) + || ((group_size * cvf) % cnunits + group_size - gap < cnunits + /* But peeling a single scalar iteration is enough if + we can use the next power-of-two sized partial + access. */ + && ((cremain = (group_size * cvf - gap) % cnunits), true + && ((cpart_size = (1 << ceil_log2 (cremain))) + != cnunits) + && vector_vector_composition_type + (vectype, cnunits / cpart_size, + &half_vtype) == NULL_TREE)))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, gcc_assert (new_vtype || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); + /* But still reduce the access size to the next + required power-of-two so peeling a single + scalar iteration is sufficient. */ + unsigned HOST_WIDE_INT cremain; + if (remain.is_constant (&cremain)) + { + unsigned HOST_WIDE_INT cpart_size + = 1 << ceil_log2 (cremain); + if (known_gt (nunits, cpart_size) + && constant_multiple_p (nunits, cpart_size, + &num)) + { + tree ptype; + new_vtype + = vector_vector_composition_type (vectype, + num, + &ptype); + if (new_vtype) + ltype = ptype; + } + } } } tree offset -- 2.35.3 ^ permalink raw reply [flat|nested] 6+ messages in thread
[parent not found: <11ebbcd8-66c1-4843-8e33-504fcd055d95@DB1PEPF00039232.eurprd03.prod.outlook.com>]
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration [not found] <11ebbcd8-66c1-4843-8e33-504fcd055d95@DB1PEPF00039232.eurprd03.prod.outlook.com> @ 2024-06-11 21:44 ` Richard Sandiford 2024-06-12 7:03 ` Richard Biener 0 siblings, 1 reply; 6+ messages in thread From: Richard Sandiford @ 2024-06-11 21:44 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches Don't think it makes any difference, but: Richard Biener <rguenther@suse.de> writes: > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > access excess elements. > ??? Enhancements include peeling multiple iterations > or using masked loads with a static mask. */ > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > + /* But peeling a single scalar iteration is enough if > + we can use the next power-of-two sized partial > + access. */ > + && ((cremain = (group_size * cvf - gap) % cnunits), true ...this might be less surprising as: && ((cremain = (group_size * cvf - gap) % cnunits, true) in terms of how the &&s line up. Thanks, Richard > + && ((cpart_size = (1 << ceil_log2 (cremain))) > + != cnunits) > + && vector_vector_composition_type > + (vectype, cnunits / cpart_size, > + &half_vtype) == NULL_TREE)))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > gcc_assert (new_vtype > || LOOP_VINFO_PEELING_FOR_GAPS > (loop_vinfo)); > + /* But still reduce the access size to the next > + required power-of-two so peeling a single > + scalar iteration is sufficient. 
*/ > + unsigned HOST_WIDE_INT cremain; > + if (remain.is_constant (&cremain)) > + { > + unsigned HOST_WIDE_INT cpart_size > + = 1 << ceil_log2 (cremain); > + if (known_gt (nunits, cpart_size) > + && constant_multiple_p (nunits, cpart_size, > + &num)) > + { > + tree ptype; > + new_vtype > + = vector_vector_composition_type (vectype, > + num, > + &ptype); > + if (new_vtype) > + ltype = ptype; > + } > + } > } > } > tree offset ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-11 21:44 ` Richard Sandiford @ 2024-06-12 7:03 ` Richard Biener 2024-06-12 9:06 ` Richard Biener 0 siblings, 1 reply; 6+ messages in thread From: Richard Biener @ 2024-06-12 7:03 UTC (permalink / raw) To: Richard Sandiford; +Cc: gcc-patches On Tue, 11 Jun 2024, Richard Sandiford wrote: > Don't think it makes any difference, but: > > Richard Biener <rguenther@suse.de> writes: > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > > access excess elements. > > ??? Enhancements include peeling multiple iterations > > or using masked loads with a static mask. */ > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > > + /* But peeling a single scalar iteration is enough if > > + we can use the next power-of-two sized partial > > + access. */ > > + && ((cremain = (group_size * cvf - gap) % cnunits), true > > ...this might be less surprising as: > > && ((cremain = (group_size * cvf - gap) % cnunits, true) > > in terms of how the &&s line up. Yeah - I'll fix before pushing. Thanks, Richard. > Thanks, > Richard > > > + && ((cpart_size = (1 << ceil_log2 (cremain))) > > + != cnunits) > > + && vector_vector_composition_type > > + (vectype, cnunits / cpart_size, > > + &half_vtype) == NULL_TREE)))) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > > gcc_assert (new_vtype > > || LOOP_VINFO_PEELING_FOR_GAPS > > (loop_vinfo)); > > + /* But still reduce the access size to the next > > + required power-of-two so peeling a single > > + scalar iteration is sufficient. 
*/ > > + unsigned HOST_WIDE_INT cremain; > > + if (remain.is_constant (&cremain)) > > + { > > + unsigned HOST_WIDE_INT cpart_size > > + = 1 << ceil_log2 (cremain); > > + if (known_gt (nunits, cpart_size) > > + && constant_multiple_p (nunits, cpart_size, > > + &num)) > > + { > > + tree ptype; > > + new_vtype > > + = vector_vector_composition_type (vectype, > > + num, > > + &ptype); > > + if (new_vtype) > > + ltype = ptype; > > + } > > + } > > } > > } > > tree offset > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-12 7:03 ` Richard Biener @ 2024-06-12 9:06 ` Richard Biener 2024-06-12 9:38 ` Richard Sandiford 0 siblings, 1 reply; 6+ messages in thread From: Richard Biener @ 2024-06-12 9:06 UTC (permalink / raw) To: Richard Sandiford; +Cc: gcc-patches On Wed, 12 Jun 2024, Richard Biener wrote: > On Tue, 11 Jun 2024, Richard Sandiford wrote: > > > Don't think it makes any difference, but: > > > > Richard Biener <rguenther@suse.de> writes: > > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > > > access excess elements. > > > ??? Enhancements include peeling multiple iterations > > > or using masked loads with a static mask. */ > > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > > > + /* But peeling a single scalar iteration is enough if > > > + we can use the next power-of-two sized partial > > > + access. */ > > > + && ((cremain = (group_size * cvf - gap) % cnunits), true > > > > ...this might be less surprising as: > > > > && ((cremain = (group_size * cvf - gap) % cnunits, true) > > > > in terms of how the &&s line up. > > Yeah - I'll fix before pushing. The aarch64 CI shows that a few testcases no longer use SVE (gcc.target/aarch64/sve/slp_perm_{4,7,8}.c) because peeling for gaps is deemed insufficient. Formerly we had if (loop_vinfo && *memory_access_type == VMAT_CONTIGUOUS && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo), nunits)) { unsigned HOST_WIDE_INT cnunits, cvf; if (!can_overrun_p || !nunits.is_constant (&cnunits) || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf) /* Peeling for gaps assumes that a single scalar iteration is enough to make sure the last vector iteration doesn't access excess elements. ??? 
Enhancements include peeling multiple iterations or using masked loads with a static mask. */ || (group_size * cvf) % cnunits + group_size - gap < cnunits) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "peeling for gaps insufficient for " "access\n"); and in all cases multiple_p (group_size * LOOP_VINFO_VECT_FACTOR, nunits) is true so we didn't check for whether peeling one iteration is sufficient. But after the refactoring the outer checks merely indicates there's overrun (which is there already because gap != 0). That is, we never verified, for the "regular" gap case, whether peeling for a single iteration is sufficient. But now of course we run into the inner check which will always trigger if earlier checks didn't work out to set overrun_p to false. For slp_perm_8.c we have a group_size of two, nunits is {16, 16} and VF is {8, 8} and gap is one. Given we know the multiple_p we know that (group_size * cvf) % cnunits is zero, so what remains is group_size - gap < nunits but 1 is probably always less than {16, 16}. The new logic I added in the later patch that peeling a single iteration is OK when we use a smaller, rounded-up to power-of-two sized access is || ((group_size * cvf) % cnunits + group_size - gap < cnunits /* But peeling a single scalar iteration is enough if we can use the next power-of-two sized partial access. */ && (cremain = (group_size * cvf - gap) % cnunits, true) && (cpart_size = (1 << ceil_log2 (cremain))) != cnunits && vector_vector_composition_type (vectype, cnunits / cpart_size, &half_vtype) == NULL_TREE))) again knowing the multiple we know cremain is nunits - gap and with gap == 1 rounding this size up will yield nunits and thus the existing peeling is OK. Something is inconsistent here and the pre-existing (group_size * cvf) % cnunits + group_size - gap < cnunits check looks suspicious for a general check. 
(group_size * cvf - gap) should be the number of elements we can access without touching excess elements. Peeling a single iteration will make sure group_size * cvf + group_size - gap is accessed (that's group_size * (cvf + 1) - gap). The excess elements touched in the vector loop are cnunits - (group_size * cvf - gap) % cnunits I think that number needs to be less or equal to group_size, so the correct check should be (group_size * cvf - gap) % cnunits + group_size < cnunits for the SVE case that's (nunits - 1) + 2 < nunits which should simplify to false. Now the question is how to formulate this with poly-ints in a way that it works out, for the case in question doing the overrun check as if (overrun_p && can_div_trunc_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap, nunits, &tem, &remain) && maybe_lt (remain + group_size, nunits)) seems to do the trick. I'm going to test this, will post an updated series for the CI. Richard. > Thanks, > Richard. > > > Thanks, > > Richard > > > > > + && ((cpart_size = (1 << ceil_log2 (cremain))) > > > + != cnunits) > > > + && vector_vector_composition_type > > > + (vectype, cnunits / cpart_size, > > > + &half_vtype) == NULL_TREE)))) > > > { > > > if (dump_enabled_p ()) > > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > > > gcc_assert (new_vtype > > > || LOOP_VINFO_PEELING_FOR_GAPS > > > (loop_vinfo)); > > > + /* But still reduce the access size to the next > > > + required power-of-two so peeling a single > > > + scalar iteration is sufficient. 
*/ > > > + unsigned HOST_WIDE_INT cremain; > > > + if (remain.is_constant (&cremain)) > > > + { > > > + unsigned HOST_WIDE_INT cpart_size > > > + = 1 << ceil_log2 (cremain); > > > + if (known_gt (nunits, cpart_size) > > > + && constant_multiple_p (nunits, cpart_size, > > > + &num)) > > > + { > > > + tree ptype; > > > + new_vtype > > > + = vector_vector_composition_type (vectype, > > > + num, > > > + &ptype); > > > + if (new_vtype) > > > + ltype = ptype; > > > + } > > > + } > > > } > > > } > > > tree offset > > > > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-12 9:06 ` Richard Biener @ 2024-06-12 9:38 ` Richard Sandiford 2024-06-12 10:43 ` Richard Biener 0 siblings, 1 reply; 6+ messages in thread From: Richard Sandiford @ 2024-06-12 9:38 UTC (permalink / raw) To: Richard Biener; +Cc: gcc-patches Richard Biener <rguenther@suse.de> writes: > On Wed, 12 Jun 2024, Richard Biener wrote: > >> On Tue, 11 Jun 2024, Richard Sandiford wrote: >> >> > Don't think it makes any difference, but: >> > >> > Richard Biener <rguenther@suse.de> writes: >> > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, >> > > access excess elements. >> > > ??? Enhancements include peeling multiple iterations >> > > or using masked loads with a static mask. */ >> > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) >> > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits >> > > + /* But peeling a single scalar iteration is enough if >> > > + we can use the next power-of-two sized partial >> > > + access. */ >> > > + && ((cremain = (group_size * cvf - gap) % cnunits), true >> > >> > ...this might be less surprising as: >> > >> > && ((cremain = (group_size * cvf - gap) % cnunits, true) >> > >> > in terms of how the &&s line up. >> >> Yeah - I'll fix before pushing. > > The aarch64 CI shows that a few testcases no longer use SVE > (gcc.target/aarch64/sve/slp_perm_{4,7,8}.c) because peeling > for gaps is deemed isufficient. 
Formerly we had > > if (loop_vinfo > && *memory_access_type == VMAT_CONTIGUOUS > && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () > && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR > (loop_vinfo), > nunits)) > { > unsigned HOST_WIDE_INT cnunits, cvf; > if (!can_overrun_p > || !nunits.is_constant (&cnunits) > || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant > (&cvf) > /* Peeling for gaps assumes that a single scalar > iteration > is enough to make sure the last vector iteration > doesn't > access excess elements. > ??? Enhancements include peeling multiple iterations > or using masked loads with a static mask. */ > || (group_size * cvf) % cnunits + group_size - gap < > cnunits) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, > vect_location, > "peeling for gaps insufficient for " > "access\n"); > > and in all cases multiple_p (group_size * LOOP_VINFO_VECT_FACTOR, nunits) > is true so we didn't check for whether peeling one iteration is > sufficient. But after the refactoring the outer checks merely > indicates there's overrun (which is there already because gap != 0). > > That is, we never verified, for the "regular" gap case, whether peeling > for a single iteration is sufficient. But now of course we run into > the inner check which will always trigger if earlier checks didn't > work out to set overrun_p to false. > > For slp_perm_8.c we have a group_size of two, nunits is {16, 16} > and VF is {8, 8} and gap is one. Given we know the > multiple_p we know that (group_size * cvf) % cnunits is zero, > so what remains is group_size - gap < nunits but 1 is probably > always less than {16, 16}. I thought the idea was that the size of the gap was immaterial for VMAT_CONTIGUOUS, on the assumption that it would never be bigger than a page. That is, any gap loaded by the final unpeeled iteration would belong to the same page as the non-gap data from either the same vector iteration or the subsequent peeled scalar iteration. 
Will have to think more about this if that doesn't affect the rest of the message, but FWIW... > The new logic I added in the later patch that peeling a single > iteration is OK when we use a smaller, rounded-up to power-of-two > sized access is > > || ((group_size * cvf) % cnunits + group_size - gap < > cnunits > /* But peeling a single scalar iteration is enough > if > we can use the next power-of-two sized partial > access. */ > && (cremain = (group_size * cvf - gap) % cnunits, > true) > && (cpart_size = (1 << ceil_log2 (cremain))) != > cnunits > && vector_vector_composition_type > (vectype, cnunits / cpart_size, > &half_vtype) == NULL_TREE))) > > again knowing the multiple we know cremain is nunits - gap and with > gap == 1 rounding this size up will yield nunits and thus the existing > peeling is OK. Something is inconsistent here and the pre-existing > > (group_size * cvf) % cnunits + group_size - gap < cnunits > > check looks suspicious for a general check. > > (group_size * cvf - gap) > > should be the number of elements we can access without touching > excess elements. Peeling a single iteration will make sure > group_size * cvf + group_size - gap is accessed > (that's group_size * (cvf + 1) - gap). The excess elements > touched in the vector loop are > > cnunits - (group_size * cvf - gap) % cnunits > > I think that number needs to be less or equal to group_size, so > the correct check should be > > (group_size * cvf - gap) % cnunits + group_size < cnunits > > for the SVE case that's (nunits - 1) + 2 < nunits which should > simplify to false. Now the question is how to formulate this > with poly-ints in a way that it works out, for the case in > question doing the overrun check as > > if (overrun_p > && can_div_trunc_p (group_size > * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - > gap, > nunits, &tem, &remain) > && maybe_lt (remain + group_size, nunits)) ...it looks like this should be known_lt, if we're relying on it for correctness. 
Thanks, Richard > seems to do the trick. I'm going to test this, will post an updated > series for the CI. > > Richard. > >> Thanks, >> Richard. >> >> > Thanks, >> > Richard >> > >> > > + && ((cpart_size = (1 << ceil_log2 (cremain))) >> > > + != cnunits) >> > > + && vector_vector_composition_type >> > > + (vectype, cnunits / cpart_size, >> > > + &half_vtype) == NULL_TREE)))) >> > > { >> > > if (dump_enabled_p ()) >> > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, >> > > gcc_assert (new_vtype >> > > || LOOP_VINFO_PEELING_FOR_GAPS >> > > (loop_vinfo)); >> > > + /* But still reduce the access size to the next >> > > + required power-of-two so peeling a single >> > > + scalar iteration is sufficient. */ >> > > + unsigned HOST_WIDE_INT cremain; >> > > + if (remain.is_constant (&cremain)) >> > > + { >> > > + unsigned HOST_WIDE_INT cpart_size >> > > + = 1 << ceil_log2 (cremain); >> > > + if (known_gt (nunits, cpart_size) >> > > + && constant_multiple_p (nunits, cpart_size, >> > > + &num)) >> > > + { >> > > + tree ptype; >> > > + new_vtype >> > > + = vector_vector_composition_type (vectype, >> > > + num, >> > > + &ptype); >> > > + if (new_vtype) >> > > + ltype = ptype; >> > > + } >> > > + } >> > > } >> > > } >> > > tree offset >> > >> >> ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration 2024-06-12 9:38 ` Richard Sandiford @ 2024-06-12 10:43 ` Richard Biener 0 siblings, 0 replies; 6+ messages in thread From: Richard Biener @ 2024-06-12 10:43 UTC (permalink / raw) To: Richard Sandiford; +Cc: gcc-patches On Wed, 12 Jun 2024, Richard Sandiford wrote: > Richard Biener <rguenther@suse.de> writes: > > On Wed, 12 Jun 2024, Richard Biener wrote: > > > >> On Tue, 11 Jun 2024, Richard Sandiford wrote: > >> > >> > Don't think it makes any difference, but: > >> > > >> > Richard Biener <rguenther@suse.de> writes: > >> > > @@ -2151,7 +2151,16 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, > >> > > access excess elements. > >> > > ??? Enhancements include peeling multiple iterations > >> > > or using masked loads with a static mask. */ > >> > > - || (group_size * cvf) % cnunits + group_size - gap < cnunits)) > >> > > + || ((group_size * cvf) % cnunits + group_size - gap < cnunits > >> > > + /* But peeling a single scalar iteration is enough if > >> > > + we can use the next power-of-two sized partial > >> > > + access. */ > >> > > + && ((cremain = (group_size * cvf - gap) % cnunits), true > >> > > >> > ...this might be less surprising as: > >> > > >> > && ((cremain = (group_size * cvf - gap) % cnunits, true) > >> > > >> > in terms of how the &&s line up. > >> > >> Yeah - I'll fix before pushing. > > > > The aarch64 CI shows that a few testcases no longer use SVE > > (gcc.target/aarch64/sve/slp_perm_{4,7,8}.c) because peeling > > for gaps is deemed isufficient. 
Formerly we had > > > > if (loop_vinfo > > && *memory_access_type == VMAT_CONTIGUOUS > > && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () > > && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR > > (loop_vinfo), > > nunits)) > > { > > unsigned HOST_WIDE_INT cnunits, cvf; > > if (!can_overrun_p > > || !nunits.is_constant (&cnunits) > > || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant > > (&cvf) > > /* Peeling for gaps assumes that a single scalar > > iteration > > is enough to make sure the last vector iteration > > doesn't > > access excess elements. > > ??? Enhancements include peeling multiple iterations > > or using masked loads with a static mask. */ > > || (group_size * cvf) % cnunits + group_size - gap < > > cnunits) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, > > vect_location, > > "peeling for gaps insufficient for " > > "access\n"); > > > > and in all cases multiple_p (group_size * LOOP_VINFO_VECT_FACTOR, nunits) > > is true so we didn't check for whether peeling one iteration is > > sufficient. But after the refactoring the outer checks merely > > indicates there's overrun (which is there already because gap != 0). > > > > That is, we never verified, for the "regular" gap case, whether peeling > > for a single iteration is sufficient. But now of course we run into > > the inner check which will always trigger if earlier checks didn't > > work out to set overrun_p to false. > > > > For slp_perm_8.c we have a group_size of two, nunits is {16, 16} > > and VF is {8, 8} and gap is one. Given we know the > > multiple_p we know that (group_size * cvf) % cnunits is zero, > > so what remains is group_size - gap < nunits but 1 is probably > > always less than {16, 16}. > > I thought the idea was that the size of the gap was immaterial > for VMAT_CONTIGUOUS, on the assumption that it would never be > bigger than a page. 
That is, any gap loaded by the final > unpeeled iteration would belong to the same page as the non-gap > data from either the same vector iteration or the subsequent > peeled scalar iteration. The subsequent scalar iteration might be on the same page as the last vector iteration but that accessing elements beyond those touched by the subsequent scalar iteration (which could be on the next page). That's what this is supposed to check - that in fact the next scalar iteration covers all elements accessed in excess in the last vector iteration. So the size of the gap matters if it is larger than the group_size or the size of a vector (the former happens with group_size being smaller than the vector size which can happen with permutes). > > Will have to think more about this if that doesn't affect the > rest of the message, but FWIW... > > > The new logic I added in the later patch that peeling a single > > iteration is OK when we use a smaller, rounded-up to power-of-two > > sized access is > > > > || ((group_size * cvf) % cnunits + group_size - gap < > > cnunits > > /* But peeling a single scalar iteration is enough > > if > > we can use the next power-of-two sized partial > > access. */ > > && (cremain = (group_size * cvf - gap) % cnunits, > > true) > > && (cpart_size = (1 << ceil_log2 (cremain))) != > > cnunits > > && vector_vector_composition_type > > (vectype, cnunits / cpart_size, > > &half_vtype) == NULL_TREE))) > > > > again knowing the multiple we know cremain is nunits - gap and with > > gap == 1 rounding this size up will yield nunits and thus the existing > > peeling is OK. Something is inconsistent here and the pre-existing > > > > (group_size * cvf) % cnunits + group_size - gap < cnunits > > > > check looks suspicious for a general check. > > > > (group_size * cvf - gap) > > > > should be the number of elements we can access without touching > > excess elements. 
Peeling a single iteration will make sure > > group_size * cvf + group_size - gap is accessed > > (that's group_size * (cvf + 1) - gap). The excess elements > > touched in the vector loop are > > > > cnunits - (group_size * cvf - gap) % cnunits > > > > I think that number needs to be less or equal to group_size, so > > the correct check should be > > > > (group_size * cvf - gap) % cnunits + group_size < cnunits > > > > for the SVE case that's (nunits - 1) + 2 < nunits which should > > simplify to false. Now the question is how to formulate this > > with poly-ints in a way that it works out, for the case in > > question doing the overrun check as > > > > if (overrun_p > > && can_div_trunc_p (group_size > > * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - > > gap, > > nunits, &tem, &remain) > > && maybe_lt (remain + group_size, nunits)) > > ...it looks like this should be known_lt, if we're relying on it > for correctness. In fact the above was slightly wrong and should have been if (overrun_p && (!can_div_trunc_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap, nunits, &tem, &remain) || maybe_lt (remain + group_size, nunits))) thus it's a negative test and maybe_lt should be correct. Richard. > > Thanks, > Richard > > > seems to do the trick. I'm going to test this, will post an updated > > series for the CI. > > > > Richard. > > > >> Thanks, > >> Richard. 
> >> > >> > Thanks, > >> > Richard > >> > > >> > > + && ((cpart_size = (1 << ceil_log2 (cremain))) > >> > > + != cnunits) > >> > > + && vector_vector_composition_type > >> > > + (vectype, cnunits / cpart_size, > >> > > + &half_vtype) == NULL_TREE)))) > >> > > { > >> > > if (dump_enabled_p ()) > >> > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> > > @@ -11599,6 +11608,27 @@ vectorizable_load (vec_info *vinfo, > >> > > gcc_assert (new_vtype > >> > > || LOOP_VINFO_PEELING_FOR_GAPS > >> > > (loop_vinfo)); > >> > > + /* But still reduce the access size to the next > >> > > + required power-of-two so peeling a single > >> > > + scalar iteration is sufficient. */ > >> > > + unsigned HOST_WIDE_INT cremain; > >> > > + if (remain.is_constant (&cremain)) > >> > > + { > >> > > + unsigned HOST_WIDE_INT cpart_size > >> > > + = 1 << ceil_log2 (cremain); > >> > > + if (known_gt (nunits, cpart_size) > >> > > + && constant_multiple_p (nunits, cpart_size, > >> > > + &num)) > >> > > + { > >> > > + tree ptype; > >> > > + new_vtype > >> > > + = vector_vector_composition_type (vectype, > >> > > + num, > >> > > + &ptype); > >> > > + if (new_vtype) > >> > > + ltype = ptype; > >> > > + } > >> > > + } > >> > > } > >> > > } > >> > > tree offset > >> > > >> > >> > -- Richard Biener <rguenther@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg) ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-06-12 10:43 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-06-11 9:02 [PATCH] tree-optimization/115385 - handle more gaps with peeling of a single iteration Richard Biener [not found] <11ebbcd8-66c1-4843-8e33-504fcd055d95@DB1PEPF00039232.eurprd03.prod.outlook.com> 2024-06-11 21:44 ` Richard Sandiford 2024-06-12 7:03 ` Richard Biener 2024-06-12 9:06 ` Richard Biener 2024-06-12 9:38 ` Richard Sandiford 2024-06-12 10:43 ` Richard Biener
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).