From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1666) id 720CD383468B; Fri, 28 Apr 2023 12:42:29 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 720CD383468B DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1682685749; bh=ihwwSYR9yO8y6DtSkWfMRXya4n8no7sSR0i20eyOZuM=; h=From:To:Subject:Date:From; b=jzH5hF3ATq/bJQqTPYv0VW3jo7nPV24XBFQL1dFuj6S/RlQHMrZpVOA2t5m15tenr WmoykL/MvJGXbaZzcVCw2KK1YQ1fVjDpV6A4zmuO0W7So6tXSBpRmnMYeV+re1sbSg AIWnq8RFUI3dxTr3KDffO7U1LkBohsOJ+T17z9m0= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Richard Biener To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-332] Adjust costing of emulated vectorized gather/scatter X-Act-Checkin: gcc X-Git-Author: Richard Biener X-Git-Refname: refs/heads/master X-Git-Oldrev: 8b84d87969ef2443516a79a80c22d2b6dba04630 X-Git-Newrev: 24905a4bd1375ccd99c02510b9f9529015a48315 Message-Id: <20230428124229.720CD383468B@sourceware.org> Date: Fri, 28 Apr 2023 12:42:29 +0000 (GMT) List-Id: https://gcc.gnu.org/g:24905a4bd1375ccd99c02510b9f9529015a48315 commit r14-332-g24905a4bd1375ccd99c02510b9f9529015a48315 Author: Richard Biener Date: Wed Jan 18 11:04:49 2023 +0100 Adjust costing of emulated vectorized gather/scatter Emulated gather/scatter behave similarly to strided elementwise accesses in that they need to decompose the offset vector and construct or decompose the data vector, so handle them the same way, pessimizing the cases with many elements. 
For pr88531-2c.c instead of .L4: leaq (%r15,%rcx), %rdx incl %edi movl 16(%rdx), %r13d movl 24(%rdx), %r14d movl (%rdx), %r10d movl 4(%rdx), %r9d movl 8(%rdx), %ebx movl 12(%rdx), %r11d movl 20(%rdx), %r12d vmovss (%rax,%r14,4), %xmm2 movl 28(%rdx), %edx vmovss (%rax,%r13,4), %xmm1 vmovss (%rax,%r10,4), %xmm0 vinsertps $0x10, (%rax,%rdx,4), %xmm2, %xmm2 vinsertps $0x10, (%rax,%r12,4), %xmm1, %xmm1 vinsertps $0x10, (%rax,%r9,4), %xmm0, %xmm0 vmovlhps %xmm2, %xmm1, %xmm1 vmovss (%rax,%rbx,4), %xmm2 vinsertps $0x10, (%rax,%r11,4), %xmm2, %xmm2 vmovlhps %xmm2, %xmm0, %xmm0 vinsertf128 $0x1, %xmm1, %ymm0, %ymm0 vmulps %ymm3, %ymm0, %ymm0 vmovups %ymm0, (%r8,%rcx) addq $32, %rcx cmpl %esi, %edi jb .L4 we now prefer .L4: leaq 0(%rbp,%rdx,8), %rcx movl (%rcx), %r10d movl 4(%rcx), %ecx vmovss (%rsi,%r10,4), %xmm0 vinsertps $0x10, (%rsi,%rcx,4), %xmm0, %xmm0 vmulps %xmm1, %xmm0, %xmm0 vmovlps %xmm0, (%rbx,%rdx,8) incq %rdx cmpl %edi, %edx jb .L4 * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Tame down element extracts and scalar loads for gather/scatter similar to elementwise strided accesses. * gcc.target/i386/pr89618-2.c: New testcase. * gcc.target/i386/pr88531-2b.c: Adjust. * gcc.target/i386/pr88531-2c.c: Likewise. 
Diff: --- gcc/config/i386/i386.cc | 6 ++++-- gcc/testsuite/gcc.target/i386/pr88531-2b.c | 2 +- gcc/testsuite/gcc.target/i386/pr88531-2c.c | 2 +- gcc/testsuite/gcc.target/i386/pr89618-2.c | 23 +++++++++++++++++++++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index cf1cfb722a9..b1d08ecdb3d 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -23576,8 +23576,10 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, && stmt_info && (STMT_VINFO_TYPE (stmt_info) == load_vec_info_type || STMT_VINFO_TYPE (stmt_info) == store_vec_info_type) - && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE - && TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) != INTEGER_CST) + && ((STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_ELEMENTWISE + && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF (stmt_info))) + != INTEGER_CST)) + || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)) { stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign); stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1); diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2b.c b/gcc/testsuite/gcc.target/i386/pr88531-2b.c index 011607c3d54..cdefff2ce8e 100644 --- a/gcc/testsuite/gcc.target/i386/pr88531-2b.c +++ b/gcc/testsuite/gcc.target/i386/pr88531-2b.c @@ -3,4 +3,4 @@ #include "pr88531-2a.c" -/* { dg-final { scan-assembler-times "vmulps" 2 } } */ +/* { dg-final { scan-assembler-times "vmulps" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr88531-2c.c b/gcc/testsuite/gcc.target/i386/pr88531-2c.c index 0f7ec3832f8..17b24c0dacc 100644 --- a/gcc/testsuite/gcc.target/i386/pr88531-2c.c +++ b/gcc/testsuite/gcc.target/i386/pr88531-2c.c @@ -3,4 +3,4 @@ #include "pr88531-2a.c" -/* { dg-final { scan-assembler-times "vmulps" 2 } } */ +/* { dg-final { scan-assembler-times "vmulps" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr89618-2.c 
b/gcc/testsuite/gcc.target/i386/pr89618-2.c new file mode 100644 index 00000000000..0b7dcfd8806 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr89618-2.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx2 -fdump-tree-vect-details" } */ + +void foo (int n, int *off, double *a) +{ + const int m = 32; + + for (int j = 0; j < n/m; ++j) + { + int const start = j*m; + int const end = (j+1)*m; + +#pragma GCC ivdep + for (int i = start; i < end; ++i) + { + a[off[i]] = a[i] < 0 ? a[i] : 0; + } + } +} + +/* Make sure the cost model selects SSE vectors rather than AVX to avoid + too many scalar ops for the address computes in the loop. */ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" } } */