[PATCH 2/2] [i386] Adjust costing of emulated vectorized gather/scatter

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Richard Biener <rguenther@suse.de>
To: gcc-patches@gcc.gnu.org
Cc: hongtao.liu@intel.com, Jan Hubicka <hubicka@ucw.cz>
Subject: [PATCH 2/2] [i386] Adjust costing of emulated vectorized gather/scatter
Date: Fri, 24 Mar 2023 14:04:03 +0100 (CET)	[thread overview]
Message-ID: <20230324130404.2C4ED138ED@imap2.suse-dmz.suse.de> (raw)

Emulated gather/scatter behave similar to strided elementwise
accesses in that they need to decompose the offset vector
and construct or decompose the data vector so handle them
the same way, pessimizing the cases with may elements.

For pr88531-2c.c instead of

.L4:
        leaq    (%r15,%rcx), %rdx
        incl    %edi
        movl    16(%rdx), %r13d
        movl    24(%rdx), %r14d
        movl    (%rdx), %r10d
        movl    4(%rdx), %r9d
        movl    8(%rdx), %ebx
        movl    12(%rdx), %r11d
        movl    20(%rdx), %r12d
        vmovss  (%rax,%r14,4), %xmm2
        movl    28(%rdx), %edx
        vmovss  (%rax,%r13,4), %xmm1
        vmovss  (%rax,%r10,4), %xmm0
        vinsertps       $0x10, (%rax,%rdx,4), %xmm2, %xmm2
        vinsertps       $0x10, (%rax,%r12,4), %xmm1, %xmm1
        vinsertps       $0x10, (%rax,%r9,4), %xmm0, %xmm0
        vmovlhps        %xmm2, %xmm1, %xmm1
        vmovss  (%rax,%rbx,4), %xmm2
        vinsertps       $0x10, (%rax,%r11,4), %xmm2, %xmm2
        vmovlhps        %xmm2, %xmm0, %xmm0
        vinsertf128     $0x1, %xmm1, %ymm0, %ymm0
        vmulps  %ymm3, %ymm0, %ymm0
        vmovups %ymm0, (%r8,%rcx)
        addq    $32, %rcx
        cmpl    %esi, %edi
        jb      .L4

we now prefer

.L4:
        leaq    0(%rbp,%rdx,8), %rcx
        movl    (%rcx), %r10d
        movl    4(%rcx), %ecx
        vmovss  (%rsi,%r10,4), %xmm0
        vinsertps       $0x10, (%rsi,%rcx,4), %xmm0, %xmm0
        vmulps  %xmm1, %xmm0, %xmm0
        vmovlps %xmm0, (%rbx,%rdx,8)
        incq    %rdx
        cmpl    %edi, %edx
        jb      .L4

which vectorizes with SSE instead of AVX2 which looks like an
improvement.

When testing this on SPEC CPU 2017 with -Ofast -flto -march=znver4
there are quite some cases where we now prefer SSE vectorization
over AVX512 + AVX2 epilogue and some cases where we now reject
vectorization.  Runtime the changes are noise with the off-noise
candidates better after the patch.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

OK for stage1?

Thanks,
Richard.

	* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
	Tame down element extracts and scalar loads for gather/scatter
	similar to elementwise strided accesses.

	* gcc.target/i386/pr89618-2.c: New testcase.
	* gcc.target/i386/pr88531-2b.c: Adjust.
	* gcc.target/i386/pr88531-2c.c: Likewise.
---
 gcc/config/i386/i386.cc                    |  6 ++++--
 gcc/testsuite/gcc.target/i386/pr88531-2b.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr88531-2c.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr89618-2.c  | 23 ++++++++++++++++++++++
 4 files changed, 29 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89618-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6a8734c2346..7a0b48c62c5 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23555,8 +23555,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
       && stmt_info
       && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
 	  || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
-      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
-      && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
+      && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+	   && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+	       != INTEGER_CST))
+	  || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
     {
       stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
       stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2b.c b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
index 011607c3d54..cdefff2ce8e 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2b.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
@@ -3,4 +3,4 @@
 
 #include "pr88531-2a.c"
 
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2c.c b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
index 0f7ec3832f8..17b24c0dacc 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2c.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
@@ -3,4 +3,4 @@
 
 #include "pr88531-2a.c"
 
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c b/gcc/testsuite/gcc.target/i386/pr89618-2.c
new file mode 100644
index 00000000000..0b7dcfd8806
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+
+void foo (int n, int *off, double *a)
+{
+  const int m = 32;
+
+  for (int j = 0; j < n/m; ++j)
+    {
+      int const start = j*m;
+      int const end = (j+1)*m;
+
+#pragma GCC ivdep
+      for (int i = start; i < end; ++i)
+	{
+	  a[off[i]] = a[i] < 0 ? a[i] : 0;
+	}
+    }
+}
+
+/* Make sure the cost model selects SSE vectors rather than AVX to avoid
+   too many scalar ops for the address computes in the loop.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */
-- 
2.35.3

next             reply	other threads:[~2023-03-24 13:13 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-24 13:04 Richard Biener [this message]
2023-03-24 13:12 ` Jan Hubicka

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230324130404.2C4ED138ED@imap2.suse-dmz.suse.de \
    --to=rguenther@suse.de \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=hongtao.liu@intel.com \
    --cc=hubicka@ucw.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).