public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-4873] PR target/107548: Handle vec_select in STV on x86.
@ 2022-12-23  9:58 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2022-12-23  9:58 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:0b2c1369d035e92847cca81fd9f7b4e9ab9da710

commit r13-4873-g0b2c1369d035e92847cca81fd9f7b4e9ab9da710
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Fri Dec 23 09:56:30 2022 +0000

    PR target/107548: Handle vec_select in STV on x86.
    
    This patch enhances x86's STV pass to handle VEC_SELECT during general
    scalar chain conversion, performing SImode scalar extraction from V4SI
    and DImode scalar extraction from V2DI in vector registers.
    
    The motivating test case from bugzilla is:
    
    typedef unsigned int v4si __attribute__((vector_size(16)));
    
    unsigned int f (v4si a, v4si b)
    {
      a[0] += b[0];
      return a[0] + a[1];
    }
    
    currently with -O2 -march=znver2 this generates:
    
            vpextrd $1, %xmm0, %edx
            vmovd   %xmm0, %eax
            addl    %edx, %eax
            vmovd   %xmm1, %edx
            addl    %edx, %eax
            ret
    
    which performs three transfers from the vector unit to the scalar unit,
    and performs the two additions there.  With this patch, we now generate:
    
            vmovdqa %xmm0, %xmm2
            vpshufd $85, %xmm0, %xmm0
            vpaddd  %xmm0, %xmm2, %xmm0
            vpaddd  %xmm1, %xmm0, %xmm0
            vmovd   %xmm0, %eax
            ret
    
    which performs the two additions in the vector unit, and then transfers
    the result to the scalar unit.  Technically the (cheap) movdqa isn't
    needed with better register allocation (or this could be cleaned up
    during peephole2), but even so this transform is still a win.
    
    2022-12-23  Roger Sayle  <roger@nextmovesoftware.com>
    
    gcc/ChangeLog
            PR target/107548
            * config/i386/i386-features.cc (scalar_chain::add_insn): The
            operands of a VEC_SELECT don't need to added to the scalar chain.
            (general_scalar_chain::compute_convert_gain) <case VEC_SELECT>:
            Provide gains for performing STV on a VEC_SELECT.
            (general_scalar_chain::convert_insn): Convert VEC_SELECT to pshufd,
            psrldq or no-op.
            (general_scalar_to_vector_candidate_p): Handle VEC_SELECT of a
            single element from a vector register to a scalar register.
    
    gcc/testsuite/ChangeLog
            PR target/107548
            * gcc.target/i386/pr107548-1.c: New test V4SI case.
            * gcc.target/i386/pr107548-2.c: New test V2DI case.

Diff:
---
 gcc/config/i386/i386-features.cc           | 50 ++++++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107548-1.c | 25 +++++++++++++++
 gcc/testsuite/gcc.target/i386/pr107548-2.c | 13 ++++++++
 3 files changed, 88 insertions(+)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4bf8bb35804..0f59be0b6fa 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -429,6 +429,11 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
   for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
     if (!HARD_REGISTER_P (DF_REF_REG (ref)))
       analyze_register_chain (candidates, ref);
+
+  /* The operand(s) of VEC_SELECT don't need to be converted/convertible.  */
+  if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
+    return;
+
   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
     if (!DF_REF_REG_MEM_P (ref))
       analyze_register_chain (candidates, ref);
@@ -629,6 +634,23 @@ general_scalar_chain::compute_convert_gain ()
 	      }
 	    break;
 
+	  case VEC_SELECT:
+	    if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
+	      {
+		// movd (4 bytes) replaced with movdqa (4 bytes).
+		if (!optimize_insn_for_size_p ())
+		  igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
+	      }
+	    else
+	      {
+		// pshufd; movd replaced with pshufd.
+		if (optimize_insn_for_size_p ())
+		  igain += COSTS_N_BYTES (4);
+		else
+		  igain += ix86_cost->sse_to_integer;
+	      }
+	    break;
+
 	  default:
 	    gcc_unreachable ();
 	  }
@@ -1167,6 +1189,24 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
       convert_op (&src, insn);
       break;
 
+    case VEC_SELECT:
+      if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
+	src = XEXP (src, 0);
+      else if (smode == DImode)
+	{
+	  rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
+	  dst = gen_lowpart (V1TImode, dst);
+	  src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
+	}
+      else
+	{
+	  rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
+	  rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
+	  rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
+	  src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
+	}
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -1930,6 +1970,16 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
     case CONST_INT:
       return REG_P (dst);
 
+    case VEC_SELECT:
+      /* Excluding MEM_P (dst) avoids intefering with vpextr[dq].  */
+      return REG_P (dst)
+	     && REG_P (XEXP (src, 0))
+	     && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
+							    : V4SImode)
+	     && GET_CODE (XEXP (src, 1)) == PARALLEL
+	     && XVECLEN (XEXP (src, 1), 0) == 1
+	     && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
+
     default:
       return false;
     }
diff --git a/gcc/testsuite/gcc.target/i386/pr107548-1.c b/gcc/testsuite/gcc.target/i386/pr107548-1.c
new file mode 100644
index 00000000000..da78f758fe2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107548-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mstv -mno-stackrealign" } */
+typedef unsigned int v4si __attribute__((vector_size(16)));
+
+unsigned int foo1 (v4si a, v4si b)
+{
+  a[0] += b[0];
+  return a[0] + a[1];
+}
+
+unsigned int foo2 (v4si a, v4si b)
+{
+  a[0] += b[0];
+  return a[0] + a[2];
+}
+
+unsigned int foo3 (v4si a, v4si b)
+{
+  a[0] += b[0];
+  return a[0] + a[3];
+}
+
+/* { dg-final { scan-assembler-times "\tmovd\t" 3 } } */
+/* { dg-final { scan-assembler-times "paddd" 6 } } */
+/* { dg-final { scan-assembler-not "addl" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr107548-2.c b/gcc/testsuite/gcc.target/i386/pr107548-2.c
new file mode 100644
index 00000000000..b57594e8821
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107548-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mstv -mno-stackrealign" } */
+typedef unsigned long long v2di __attribute__((vector_size(16)));
+
+unsigned long long foo(v2di a, v2di b)
+{
+  a[0] += b[0];
+  return a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-not "\taddq\t" } } */
+/* { dg-final { scan-assembler-times "paddq" 2 } } */
+/* { dg-final { scan-assembler "psrldq" } } */

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-12-23  9:58 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-23  9:58 [gcc r13-4873] PR target/107548: Handle vec_select in STV on x86 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).