public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-4889] Use movss/movsd to implement V4SI/V2DI VEC_PERM on x86.
@ 2022-12-25 12:04 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2022-12-25 12:04 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:febb58d28bfa4b544ec7ffec2d61f46d25205ff0

commit r13-4889-gfebb58d28bfa4b544ec7ffec2d61f46d25205ff0
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Sun Dec 25 11:57:12 2022 +0000

    Use movss/movsd to implement V4SI/V2DI VEC_PERM on x86.
    
    This patch tweaks the x86 backend to use the movss and movsd instructions
    to perform some vector permutations on integer vectors (V4SI and V2DI) in
    the same way they are used for floating point vectors (V4SF and V2DF).
    
    As a motivating example, consider:
    
    typedef unsigned int v4si __attribute__((vector_size(16)));
    typedef float v4sf __attribute__((vector_size(16)));
    v4si foo(v4si x,v4si y) { return (v4si){y[0],x[1],x[2],x[3]}; }
    v4sf bar(v4sf x,v4sf y) { return (v4sf){y[0],x[1],x[2],x[3]}; }
    
    which is currently compiled with -O2 to:
    
    foo:    movdqa  %xmm0, %xmm2
            shufps  $80, %xmm0, %xmm1
            movdqa  %xmm1, %xmm0
            shufps  $232, %xmm2, %xmm0
            ret
    
    bar:    movss   %xmm1, %xmm0
            ret
    
    with this patch both functions compile to the same form.
    Likewise for the V2DI case:
    
    typedef unsigned long v2di __attribute__((vector_size(16)));
    typedef double v2df __attribute__((vector_size(16)));
    
    v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; }
    v2df bar(v2df x,v2df y) { return (v2df){y[0],x[1]}; }
    
    which currently generates:
    
    foo:    shufpd  $2, %xmm0, %xmm1
            movdqa  %xmm1, %xmm0
            ret
    
    bar:    movsd   %xmm1, %xmm0
            ret
    
    2022-12-25  Roger Sayle  <roger@nextmovesoftware.com>
                Uroš Bizjak  <ubizjak@gmail.com>
    
    gcc/ChangeLog
            * config/i386/i386-builtin.def (__builtin_ia32_movss): Update
            CODE_FOR_sse_movss to CODE_FOR_sse_movss_v4sf.
            (__builtin_ia32_movsd): Likewise, update CODE_FOR_sse2_movsd to
            CODE_FOR_sse2_movsd_v2df.
            * config/i386/i386-expand.cc (split_convert_uns_si_sse): Update
            gen_sse_movss call to gen_sse_movss_v4sf, and gen_sse2_movsd call
            to gen_sse2_movsd_v2df.
            (expand_vec_perm_movs): Also allow V4SImode with TARGET_SSE and
            V2DImode with TARGET_SSE2.
            * config/i386/sse.md
            (avx512fp16_fcmaddcsh_v8hf_mask3<round_expand_name>): Update
            gen_sse_movss call to gen_sse_movss_v4sf.
            (avx512fp16_fmaddcsh_v8hf_mask3<round_expand_name>): Likewise.
            (sse_movss_<mode>): Renamed from sse_movss using VI4F_128 mode
            iterator to handle both V4SF and V4SI.
            (sse2_movsd_<mode>): Likewise, renamed from sse2_movsd using
            VI8F_128 mode iterator to handle both V2DF and V2DI.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/sse-movss-4.c: New test case.
            * gcc.target/i386/sse2-movsd-3.c: New test case.

Diff:
---
 gcc/config/i386/i386-builtin.def             |  4 ++--
 gcc/config/i386/i386-expand.cc               |  8 +++++---
 gcc/config/i386/sse.md                       | 24 ++++++++++++------------
 gcc/testsuite/gcc.target/i386/sse-movss-4.c  | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-movsd-3.c | 15 +++++++++++++++
 5 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index d85b1753039..0d1fc342e61 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -679,7 +679,7 @@ BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_xorv4sf3,  "__builtin_ia32_xorps", IX86_
 
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_copysignv4sf3,  "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 
-BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
+BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movss_v4sf,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movhlps_exp,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movlhps_exp,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
@@ -781,7 +781,7 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_xorv2df3,  "__builtin_ia32_xorpd", IX86
 
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_copysignv2df3,  "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 
-BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
+BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_sse2_movsd_v2df,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 0b887e74515..b6cc76d8a9c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1774,9 +1774,9 @@ ix86_split_convert_uns_si_sse (rtx operands[])
       input = gen_rtx_REG (vecmode, REGNO (input));
       emit_move_insn (value, CONST0_RTX (vecmode));
       if (vecmode == V4SFmode)
-	emit_insn (gen_sse_movss (value, value, input));
+	emit_insn (gen_sse_movss_v4sf (value, value, input));
       else
-	emit_insn (gen_sse2_movsd (value, value, input));
+	emit_insn (gen_sse2_movsd_v2df (value, value, input));
     }
 
   emit_move_insn (large, two31);
@@ -18905,8 +18905,10 @@ expand_vec_perm_movs (struct expand_vec_perm_d *d)
     return false;
 
   if (!(TARGET_SSE && vmode == V4SFmode)
+      && !(TARGET_SSE && vmode == V4SImode)
       && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
-      && !(TARGET_SSE2 && vmode == V2DFmode))
+      && !(TARGET_SSE2 && vmode == V2DFmode)
+      && !(TARGET_SSE2 && vmode == V2DImode))
     return false;
 
   /* Only the first element is changed.  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index de632b2d8fe..d50627a7d6c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6825,7 +6825,7 @@
   if (!MEM_P (operands[3]))
     operands[3] = force_reg (V8HFmode, operands[3]);
   op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
-  emit_insn (gen_sse_movss (dest, op1, op0));
+  emit_insn (gen_sse_movss_v4sf (dest, op1, op0));
   emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode));
   DONE;
 })
@@ -6855,7 +6855,7 @@
   if (!MEM_P (operands[3]))
     operands[3] = force_reg (V8HFmode, operands[3]);
   op1 = lowpart_subreg (V4SFmode, operands[3], V8HFmode);
-  emit_insn (gen_sse_movss (dest, op1, op0));
+  emit_insn (gen_sse_movss_v4sf (dest, op1, op0));
   emit_move_insn (operands[0], lowpart_subreg (V8HFmode, dest, V4SFmode));
   DONE;
 })
@@ -10498,11 +10498,11 @@
    (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF,V2SF")])
 
-(define_insn "sse_movss"
-  [(set (match_operand:V4SF 0 "register_operand"   "=x,v")
-	(vec_merge:V4SF
-	  (match_operand:V4SF 2 "register_operand" " x,v")
-	  (match_operand:V4SF 1 "register_operand" " 0,v")
+(define_insn "sse_movss_<mode>"
+  [(set (match_operand:VI4F_128 0 "register_operand"   "=x,v")
+	(vec_merge:VI4F_128
+	  (match_operand:VI4F_128 2 "register_operand" " x,v")
+	  (match_operand:VI4F_128 1 "register_operand" " 0,v")
 	  (const_int 1)))]
   "TARGET_SSE"
   "@
@@ -13481,11 +13481,11 @@
   [(set (match_dup 0) (match_dup 1))]
   "operands[0] = adjust_address (operands[0], DFmode, 0);")
 
-(define_insn "sse2_movsd"
-  [(set (match_operand:V2DF 0 "nonimmediate_operand"   "=x,v,x,v,m,x,x,v,o")
-	(vec_merge:V2DF
-	  (match_operand:V2DF 2 "nonimmediate_operand" " x,v,m,m,v,0,0,v,0")
-	  (match_operand:V2DF 1 "nonimmediate_operand" " 0,v,0,v,0,x,o,o,v")
+(define_insn "sse2_movsd_<mode>"
+  [(set (match_operand:VI8F_128 0 "nonimmediate_operand"   "=x,v,x,v,m,x,x,v,o")
+	(vec_merge:VI8F_128
+	  (match_operand:VI8F_128 2 "nonimmediate_operand" " x,v,m,m,v,0,0,v,0")
+	  (match_operand:VI8F_128 1 "nonimmediate_operand" " 0,v,0,v,0,x,o,o,v")
 	  (const_int 1)))]
   "TARGET_SSE2"
   "@
diff --git a/gcc/testsuite/gcc.target/i386/sse-movss-4.c b/gcc/testsuite/gcc.target/i386/sse-movss-4.c
new file mode 100644
index 00000000000..ec3019c8e54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse-movss-4.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+typedef float v4sf __attribute__((vector_size(16)));
+
+v4si foo(v4si x,v4si y) { return (v4si){y[0],x[1],x[2],x[3]}; }
+v4sf bar(v4sf x,v4sf y) { return (v4sf){y[0],x[1],x[2],x[3]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movss\t" 2 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "shufps" } } */
+/* { dg-final { scan-assembler-not "vpblendw" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c b/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c
new file mode 100644
index 00000000000..fadbe2bf2f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-movsd-3.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned long long v2di __attribute__((vector_size(16)));
+typedef double v2df __attribute__((vector_size(16)));
+
+v2di foo(v2di x,v2di y) { return (v2di){y[0],x[1]}; }
+v2df bar(v2df x,v2df y) { return (v2df){y[0],x[1]}; }
+
+/* { dg-final { scan-assembler-times "\tv?movsd\t" 2 } } */
+/* { dg-final { scan-assembler-not "v?shufpd" } } */
+/* { dg-final { scan-assembler-not "movdqa" } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-not "v?punpckldq" } } */
+/* { dg-final { scan-assembler-not "v?movq" } } */

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-12-25 12:04 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-25 12:04 [gcc r13-4889] Use movss/movsd to implement V4SI/V2DI VEC_PERM on x86 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).