* [PATCH 2/2] [i386] Adjust costing of emulated vectorized gather/scatter
@ 2023-03-24 13:04 Richard Biener
2023-03-24 13:12 ` Jan Hubicka
From: Richard Biener @ 2023-03-24 13:04 UTC (permalink / raw)
To: gcc-patches; +Cc: hongtao.liu, Jan Hubicka
Emulated gather/scatter behave similarly to strided elementwise
accesses in that they need to decompose the offset vector
and construct or decompose the data vector, so handle them
the same way, pessimizing the cases with many elements.
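For reference, the kind of loop at issue looks roughly like this
(a minimal sketch in the spirit of the pr88531-2a.c testcase, not
its actual contents):

  void f (float * restrict dst, float * restrict src,
          int * restrict idx, int n)
  {
    for (int i = 0; i < n; ++i)
      /* src[idx[i]] becomes an emulated gather when the target has
         no native gather for the chosen vector mode: the offset
         vector is decomposed into scalars and the data vector is
         built up from individual scalar loads.  */
      dst[i] = src[idx[i]] * 3.f;
  }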
For pr88531-2c.c instead of
.L4:
        leaq (%r15,%rcx), %rdx
        incl %edi
        movl 16(%rdx), %r13d
        movl 24(%rdx), %r14d
        movl (%rdx), %r10d
        movl 4(%rdx), %r9d
        movl 8(%rdx), %ebx
        movl 12(%rdx), %r11d
        movl 20(%rdx), %r12d
        vmovss (%rax,%r14,4), %xmm2
        movl 28(%rdx), %edx
        vmovss (%rax,%r13,4), %xmm1
        vmovss (%rax,%r10,4), %xmm0
        vinsertps $0x10, (%rax,%rdx,4), %xmm2, %xmm2
        vinsertps $0x10, (%rax,%r12,4), %xmm1, %xmm1
        vinsertps $0x10, (%rax,%r9,4), %xmm0, %xmm0
        vmovlhps %xmm2, %xmm1, %xmm1
        vmovss (%rax,%rbx,4), %xmm2
        vinsertps $0x10, (%rax,%r11,4), %xmm2, %xmm2
        vmovlhps %xmm2, %xmm0, %xmm0
        vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
        vmulps %ymm3, %ymm0, %ymm0
        vmovups %ymm0, (%r8,%rcx)
        addq $32, %rcx
        cmpl %esi, %edi
        jb .L4
we now prefer
.L4:
        leaq 0(%rbp,%rdx,8), %rcx
        movl (%rcx), %r10d
        movl 4(%rcx), %ecx
        vmovss (%rsi,%r10,4), %xmm0
        vinsertps $0x10, (%rsi,%rcx,4), %xmm0, %xmm0
        vmulps %xmm1, %xmm0, %xmm0
        vmovlps %xmm0, (%rbx,%rdx,8)
        incq %rdx
        cmpl %edi, %edx
        jb .L4
which vectorizes with SSE instead of AVX2, which looks like an
improvement.
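Counting the overhead in the two loops above: the AVX2 variant issues
eight scalar offset loads, eight scalar element loads/inserts and three
vector merges per eight lanes, while the SSE variant issues two offset
loads and two element loads/inserts per two lanes, so the narrow
variant avoids the merge shuffles and the heavy register pressure at
comparable per-lane cost.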
When testing this on SPEC CPU 2017 with -Ofast -flto -march=znver4
there are quite a few cases where we now prefer SSE vectorization
over AVX512 with an AVX2 epilogue, and some cases where we now
reject vectorization. At runtime the changes are in the noise, with
the off-noise candidates better after the patch.
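To recap what the hunk below does: for the element extracts and scalar
loads of an emulated gather/scatter the statement cost is now
multiplied by TYPE_VECTOR_SUBPARTS (vectype) + 1, as we already do for
elementwise accesses with a non-constant step. As a rough illustration,
that scales a V8SF emulated gather by 9 but a V4SF one only by 5, which
is what tips the cost comparison toward the narrower vectors in the
example above.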
Bootstrapped and tested on x86_64-unknown-linux-gnu.
OK for stage1?
Thanks,
Richard.
        * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
        Tame down element extracts and scalar loads for gather/scatter
        similar to elementwise strided accesses.

        * gcc.target/i386/pr89618-2.c: New testcase.
        * gcc.target/i386/pr88531-2b.c: Adjust.
        * gcc.target/i386/pr88531-2c.c: Likewise.
---
 gcc/config/i386/i386.cc                    |  6 ++++--
 gcc/testsuite/gcc.target/i386/pr88531-2b.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr88531-2c.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr89618-2.c  | 23 ++++++++++++++++++++++
 4 files changed, 29 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr89618-2.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6a8734c2346..7a0b48c62c5 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23555,8 +23555,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
       && stmt_info
       && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
           || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
-      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
-      && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
+      && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+           && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+               != INTEGER_CST))
+          || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
     {
       stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
       stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2b.c b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
index 011607c3d54..cdefff2ce8e 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2b.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
@@ -3,4 +3,4 @@
 
 #include "pr88531-2a.c"
 
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2c.c b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
index 0f7ec3832f8..17b24c0dacc 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2c.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
@@ -3,4 +3,4 @@
 
 #include "pr88531-2a.c"
 
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c b/gcc/testsuite/gcc.target/i386/pr89618-2.c
new file mode 100644
index 00000000000..0b7dcfd8806
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+
+void foo (int n, int *off, double *a)
+{
+  const int m = 32;
+
+  for (int j = 0; j < n/m; ++j)
+    {
+      int const start = j*m;
+      int const end = (j+1)*m;
+
+#pragma GCC ivdep
+      for (int i = start; i < end; ++i)
+        {
+          a[off[i]] = a[i] < 0 ? a[i] : 0;
+        }
+    }
+}
+
+/* Make sure the cost model selects SSE vectors rather than AVX to avoid
+   too many scalar ops for the address computes in the loop.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
--
2.35.3
* Re: [PATCH 2/2] [i386] Adjust costing of emulated vectorized gather/scatter
2023-03-24 13:04 [PATCH 2/2] [i386] Adjust costing of emulated vectorized gather/scatter Richard Biener
@ 2023-03-24 13:12 ` Jan Hubicka
From: Jan Hubicka @ 2023-03-24 13:12 UTC (permalink / raw)
To: Richard Biener; +Cc: gcc-patches, hongtao.liu
> Emulated gather/scatter behave similarly to strided elementwise
> accesses in that they need to decompose the offset vector
> and construct or decompose the data vector, so handle them
> the same way, pessimizing the cases with many elements.
>
> For pr88531-2c.c instead of
>
> .L4:
>         leaq (%r15,%rcx), %rdx
>         incl %edi
>         movl 16(%rdx), %r13d
>         movl 24(%rdx), %r14d
>         movl (%rdx), %r10d
>         movl 4(%rdx), %r9d
>         movl 8(%rdx), %ebx
>         movl 12(%rdx), %r11d
>         movl 20(%rdx), %r12d
>         vmovss (%rax,%r14,4), %xmm2
>         movl 28(%rdx), %edx
>         vmovss (%rax,%r13,4), %xmm1
>         vmovss (%rax,%r10,4), %xmm0
>         vinsertps $0x10, (%rax,%rdx,4), %xmm2, %xmm2
>         vinsertps $0x10, (%rax,%r12,4), %xmm1, %xmm1
>         vinsertps $0x10, (%rax,%r9,4), %xmm0, %xmm0
>         vmovlhps %xmm2, %xmm1, %xmm1
>         vmovss (%rax,%rbx,4), %xmm2
>         vinsertps $0x10, (%rax,%r11,4), %xmm2, %xmm2
>         vmovlhps %xmm2, %xmm0, %xmm0
>         vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
>         vmulps %ymm3, %ymm0, %ymm0
>         vmovups %ymm0, (%r8,%rcx)
>         addq $32, %rcx
>         cmpl %esi, %edi
>         jb .L4
>
> we now prefer
>
> .L4:
>         leaq 0(%rbp,%rdx,8), %rcx
>         movl (%rcx), %r10d
>         movl 4(%rcx), %ecx
>         vmovss (%rsi,%r10,4), %xmm0
>         vinsertps $0x10, (%rsi,%rcx,4), %xmm0, %xmm0
>         vmulps %xmm1, %xmm0, %xmm0
>         vmovlps %xmm0, (%rbx,%rdx,8)
>         incq %rdx
>         cmpl %edi, %edx
>         jb .L4
>
> which vectorizes with SSE instead of AVX2, which looks like an
> improvement.
>
> When testing this on SPEC CPU 2017 with -Ofast -flto -march=znver4
> there are quite a few cases where we now prefer SSE vectorization
> over AVX512 with an AVX2 epilogue, and some cases where we now
> reject vectorization. At runtime the changes are in the noise, with
> the off-noise candidates better after the patch.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> OK for stage1?
>
> Thanks,
> Richard.
>
>         * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
>         Tame down element extracts and scalar loads for gather/scatter
>         similar to elementwise strided accesses.
>
>         * gcc.target/i386/pr89618-2.c: New testcase.
>         * gcc.target/i386/pr88531-2b.c: Adjust.
>         * gcc.target/i386/pr88531-2c.c: Likewise.
OK.
Honza