From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1251) id 20CB2385841B; Mon, 29 Nov 2021 10:48:49 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 20CB2385841B MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Roger Sayle To: gcc-cvs@gcc.gnu.org Subject: [gcc r12-5577] x86_64: Improved V1TImode rotations by non-constant amounts. X-Act-Checkin: gcc X-Git-Author: Roger Sayle X-Git-Refname: refs/heads/master X-Git-Oldrev: a3b31fe3692894e80de16b4059a89a309e409687 X-Git-Newrev: a5d269f0c1cda545a86da960e8989bea862dd75e Message-Id: <20211129104849.20CB2385841B@sourceware.org> Date: Mon, 29 Nov 2021 10:48:49 +0000 (GMT) X-BeenThere: gcc-cvs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 29 Nov 2021 10:48:49 -0000 https://gcc.gnu.org/g:a5d269f0c1cda545a86da960e8989bea862dd75e commit r12-5577-ga5d269f0c1cda545a86da960e8989bea862dd75e Author: Roger Sayle Date: Mon Nov 29 10:45:11 2021 +0000 x86_64: Improved V1TImode rotations by non-constant amounts. This patch builds on the recent improvements to TImode rotations (and Jakub's fixes to shldq/shrdq patterns). Now that expanding a TImode rotation can never fail, it is safe to allow general_operand constraints on the QImode shift amounts in rotlv1ti3 and rotrv1ti3 patterns. I've also made an additional tweak to ix86_expand_v1ti_to_ti to use vec_extract via V2DImode, which avoids using memory and takes advantage of vpextrq on recent hardware. 
For the following test case: typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); } GCC with -O2 -mavx2 would previously generate: rotr: vmovdqa %xmm0, -24(%rsp) movq -16(%rsp), %rdx movl %edi, %ecx xorl %esi, %esi movq -24(%rsp), %rax shrdq %rdx, %rax shrq %cl, %rdx testb $64, %dil cmovne %rdx, %rax cmovne %rsi, %rdx negl %ecx xorl %edi, %edi andl $127, %ecx vmovq %rax, %xmm2 movq -24(%rsp), %rax vpinsrq $1, %rdx, %xmm2, %xmm1 movq -16(%rsp), %rdx shldq %rax, %rdx salq %cl, %rax testb $64, %cl cmovne %rax, %rdx cmovne %rdi, %rax vmovq %rax, %xmm3 vpinsrq $1, %rdx, %xmm3, %xmm0 vpor %xmm1, %xmm0, %xmm0 ret with this patch, we now generate: rotr: movl %edi, %ecx vpextrq $1, %xmm0, %rax vmovq %xmm0, %rdx shrdq %rax, %rdx vmovq %xmm0, %rsi shrdq %rsi, %rax andl $64, %ecx movq %rdx, %rsi cmovne %rax, %rsi cmove %rax, %rdx vmovq %rsi, %xmm0 vpinsrq $1, %rdx, %xmm0, %xmm0 ret 2021-11-29 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.c (ix86_expand_v1ti_to_ti): Perform the conversion via V2DImode using vec_extractv2didi on TARGET_SSE2. * config/i386/sse.md (rotlv1ti3, rotrv1ti3): Change constraint on QImode shift amounts from const_int_operand to general_operand. gcc/testsuite/ChangeLog * gcc.target/i386/sse2-v1ti-rotate.c: New test case. 
Diff: --- gcc/config/i386/i386-expand.c | 12 +++++++++++- gcc/config/i386/sse.md | 4 ++-- gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c | 11 +++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 0d5d1a0e205..354a9a713b7 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -6162,7 +6162,17 @@ static rtx ix86_expand_v1ti_to_ti (rtx x) { rtx result = gen_reg_rtx (TImode); - emit_move_insn (result, gen_lowpart (TImode, x)); + if (TARGET_SSE2) + { + rtx temp = gen_reg_rtx (V2DImode); + emit_move_insn (temp, gen_lowpart (V2DImode, x)); + rtx lo = gen_lowpart (DImode, result); + emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx)); + rtx hi = gen_highpart (DImode, result); + emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx)); + } + else + emit_move_insn (result, gen_lowpart (TImode, x)); return result; } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b371b140eb1..b6d03b89a04 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -15296,7 +15296,7 @@ [(set (match_operand:V1TI 0 "register_operand") (rotate:V1TI (match_operand:V1TI 1 "register_operand") - (match_operand:QI 2 "const_int_operand")))] + (match_operand:QI 2 "general_operand")))] "TARGET_SSE2 && TARGET_64BIT" { ix86_expand_v1ti_rotate (ROTATE, operands); @@ -15307,7 +15307,7 @@ [(set (match_operand:V1TI 0 "register_operand") (rotatert:V1TI (match_operand:V1TI 1 "register_operand") - (match_operand:QI 2 "const_int_operand")))] + (match_operand:QI 2 "general_operand")))] "TARGET_SSE2 && TARGET_64BIT" { ix86_expand_v1ti_rotate (ROTATERT, operands); diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c new file mode 100644 index 00000000000..b4b2814a3d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options 
"-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); + +uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); } +uv1ti rotl(uv1ti x, unsigned int i) { return (x << i) | (x >> (128-i)); } + +/* { dg-final { scan-assembler-not "shrq" } } */ +/* { dg-final { scan-assembler-not "salq" } } */