From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <rguenth@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1666)
	id 720CD383468B; Fri, 28 Apr 2023 12:42:29 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 720CD383468B
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1682685749;
	bh=ihwwSYR9yO8y6DtSkWfMRXya4n8no7sSR0i20eyOZuM=;
	h=From:To:Subject:Date:From;
	b=jzH5hF3ATq/bJQqTPYv0VW3jo7nPV24XBFQL1dFuj6S/RlQHMrZpVOA2t5m15tenr
	 WmoykL/MvJGXbaZzcVCw2KK1YQ1fVjDpV6A4zmuO0W7So6tXSBpRmnMYeV+re1sbSg
	 AIWnq8RFUI3dxTr3KDffO7U1LkBohsOJ+T17z9m0=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Richard Biener <rguenth@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-332] Adjust costing of emulated vectorized gather/scatter
X-Act-Checkin: gcc
X-Git-Author: Richard Biener <rguenther@suse.de>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 8b84d87969ef2443516a79a80c22d2b6dba04630
X-Git-Newrev: 24905a4bd1375ccd99c02510b9f9529015a48315
Message-Id: <20230428124229.720CD383468B@sourceware.org>
Date: Fri, 28 Apr 2023 12:42:29 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:24905a4bd1375ccd99c02510b9f9529015a48315

commit r14-332-g24905a4bd1375ccd99c02510b9f9529015a48315
Author: Richard Biener <rguenther@suse.de>
Date:   Wed Jan 18 11:04:49 2023 +0100

    Adjust costing of emulated vectorized gather/scatter
    
    Emulated gather/scatter behave similarly to strided elementwise
    accesses in that they need to decompose the offset vector
    and construct or decompose the data vector, so handle them
    the same way, pessimizing the cases with many elements.
    
    For pr88531-2c.c instead of
    
    .L4:
            leaq    (%r15,%rcx), %rdx
            incl    %edi
            movl    16(%rdx), %r13d
            movl    24(%rdx), %r14d
            movl    (%rdx), %r10d
            movl    4(%rdx), %r9d
            movl    8(%rdx), %ebx
            movl    12(%rdx), %r11d
            movl    20(%rdx), %r12d
            vmovss  (%rax,%r14,4), %xmm2
            movl    28(%rdx), %edx
            vmovss  (%rax,%r13,4), %xmm1
            vmovss  (%rax,%r10,4), %xmm0
            vinsertps       $0x10, (%rax,%rdx,4), %xmm2, %xmm2
            vinsertps       $0x10, (%rax,%r12,4), %xmm1, %xmm1
            vinsertps       $0x10, (%rax,%r9,4), %xmm0, %xmm0
            vmovlhps        %xmm2, %xmm1, %xmm1
            vmovss  (%rax,%rbx,4), %xmm2
            vinsertps       $0x10, (%rax,%r11,4), %xmm2, %xmm2
            vmovlhps        %xmm2, %xmm0, %xmm0
            vinsertf128     $0x1, %xmm1, %ymm0, %ymm0
            vmulps  %ymm3, %ymm0, %ymm0
            vmovups %ymm0, (%r8,%rcx)
            addq    $32, %rcx
            cmpl    %esi, %edi
            jb      .L4
    
    we now prefer
    
    .L4:
            leaq    0(%rbp,%rdx,8), %rcx
            movl    (%rcx), %r10d
            movl    4(%rcx), %ecx
            vmovss  (%rsi,%r10,4), %xmm0
            vinsertps       $0x10, (%rsi,%rcx,4), %xmm0, %xmm0
            vmulps  %xmm1, %xmm0, %xmm0
            vmovlps %xmm0, (%rbx,%rdx,8)
            incq    %rdx
            cmpl    %edi, %edx
            jb      .L4
    
            * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
            Tame down element extracts and scalar loads for gather/scatter
            similar to elementwise strided accesses.
    
            * gcc.target/i386/pr89618-2.c: New testcase.
            * gcc.target/i386/pr88531-2b.c: Adjust.
            * gcc.target/i386/pr88531-2c.c: Likewise.

Diff:
---
 gcc/config/i386/i386.cc                    |  6 ++++--
 gcc/testsuite/gcc.target/i386/pr88531-2b.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr88531-2c.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr89618-2.c  | 23 +++++++++++++++++++++++
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index cf1cfb722a9..b1d08ecdb3d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23576,8 +23576,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
       && stmt_info
       && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type
 	  || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type)
-      && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
-      && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST)
+      && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE
+	   && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+	       != INTEGER_CST))
+	  || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
     {
       stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
       stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2b.c b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
index 011607c3d54..cdefff2ce8e 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2b.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2b.c
@@ -3,4 +3,4 @@
 
 #include "pr88531-2a.c"
 
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2c.c b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
index 0f7ec3832f8..17b24c0dacc 100644
--- a/gcc/testsuite/gcc.target/i386/pr88531-2c.c
+++ b/gcc/testsuite/gcc.target/i386/pr88531-2c.c
@@ -3,4 +3,4 @@
 
 #include "pr88531-2a.c"
 
-/* { dg-final { scan-assembler-times "vmulps" 2 } } */
+/* { dg-final { scan-assembler-times "vmulps" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c b/gcc/testsuite/gcc.target/i386/pr89618-2.c
new file mode 100644
index 00000000000..0b7dcfd8806
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */
+
+void foo (int n, int *off, double *a)
+{
+  const int m = 32;
+
+  for (int j = 0; j < n/m; ++j)
+    {
+      int const start = j*m;
+      int const end = (j+1)*m;
+
+#pragma GCC ivdep
+      for (int i = start; i < end; ++i)
+	{
+	  a[off[i]] = a[i] < 0 ? a[i] : 0;
+	}
+    }
+}
+
+/* Make sure the cost model selects SSE vectors rather than AVX to avoid
+   too many scalar ops for the address computes in the loop.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */