public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-332] Adjust costing of emulated vectorized gather/scatter
@ 2023-04-28 12:42 Richard Biener
0 siblings, 0 replies; only message in thread
From: Richard Biener @ 2023-04-28 12:42 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:24905a4bd1375ccd99c02510b9f9529015a48315
commit r14-332-g24905a4bd1375ccd99c02510b9f9529015a48315
Author: Richard Biener <rguenther@suse.de>
Date: Wed Jan 18 11:04:49 2023 +0100
Adjust costing of emulated vectorized gather/scatter
Emulated gather/scatter behave similarly to strided elementwise
accesses in that they need to decompose the offset vector
and construct or decompose the data vector, so handle them
the same way, pessimizing the cases with many elements.
For pr88531-2c.c instead of
.L4:
leaq (%r15,%rcx), %rdx
incl %edi
movl 16(%rdx), %r13d
movl 24(%rdx), %r14d
movl (%rdx), %r10d
movl 4(%rdx), %r9d
movl 8(%rdx), %ebx
movl 12(%rdx), %r11d
movl 20(%rdx), %r12d
vmovss (%rax,%r14,4), %xmm2
movl 28(%rdx), %edx
vmovss (%rax,%r13,4), %xmm1
vmovss (%rax,%r10,4), %xmm0
vinsertps $0x10, (%rax,%rdx,4), %xmm2, %xmm2
vinsertps $0x10, (%rax,%r12,4), %xmm1, %xmm1
vinsertps $0x10, (%rax,%r9,4), %xmm0, %xmm0
vmovlhps %xmm2, %xmm1, %xmm1
vmovss (%rax,%rbx,4), %xmm2
vinsertps $0x10, (%rax,%r11,4), %xmm2, %xmm2
vmovlhps %xmm2, %xmm0, %xmm0
vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
vmulps %ymm3, %ymm0, %ymm0
vmovups %ymm0, (%r8,%rcx)
addq $32, %rcx
cmpl %esi, %edi
jb .L4
we now prefer
.L4:
leaq 0(%rbp,%rdx,8), %rcx
movl (%rcx), %r10d
movl 4(%rcx), %ecx
vmovss (%rsi,%r10,4), %xmm0
vinsertps $0x10, (%rsi,%rcx,4), %xmm0, %xmm0
vmulps %xmm1, %xmm0, %xmm0
vmovlps %xmm0, (%rbx,%rdx,8)
incq %rdx
cmpl %edi, %edx
jb .L4
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Tame down element extracts and scalar loads for gather/scatter
similar to elementwise strided accesses.
* gcc.target/i386/pr89618-2.c: New testcase.
* gcc.target/i386/pr88531-2b.c: Adjust.
* gcc.target/i386/pr88531-2c.c: Likewise.
Diff:
---
gcc/config/i386/i386.cc | 6 ++++--
gcc/testsuite/gcc.target/i386/pr88531-2b.c | 2 +-
gcc/testsuite/gcc.target/i386/pr88531-2c.c | 2 +-
gcc/testsuite/gcc.target/i386/pr89618-2.c | 23 +++++++++++++++++++++++
4 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index cf1cfb722a9..b1d08ecdb3d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23576,8 +23576,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
&& stmt_info
&& (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
|| STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
- && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
- && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
+ && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ != INTEGER_CST))
+ || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
{
stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2b.c b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
index 011607c3d54..cdefff2ce8e 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2b.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
@@ -3,4 +3,4 @@
#include "pr88531-2a.c"
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2c.c b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
index 0f7ec3832f8..17b24c0dacc 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2c.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
@@ -3,4 +3,4 @@
#include "pr88531-2a.c"
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c b/gcc/testsuite/gcc.target/i386/pr89618-2.c
new file mode 100644
index 00000000000..0b7dcfd8806
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+
+void foo (int n, int *off, double *a)
+{
+ const int m = 32;
+
+ for (int j = 0; j < n/m; ++j)
+ {
+ int const start = j*m;
+ int const end = (j+1)*m;
+
+#pragma GCC ivdep
+ for (int i = start; i < end; ++i)
+ {
+ a[off[i]] = a[i] < 0 ? a[i] : 0;
+ }
+ }
+}
+
+/* Make sure the cost model selects SSE vectors rather than AVX to avoid
+ too many scalar ops for the address computes in the loop. */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-04-28 12:42 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-28 12:42 [gcc r14-332] Adjust costing of emulated vectorized gather/scatter Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).