public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r13-1508] UNSPEC_PALIGNR optimizations and clean-ups on x86.
@ 2022-07-05 17:02 Roger Sayle
  0 siblings, 0 replies; only message in thread
From: Roger Sayle @ 2022-07-05 17:02 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:02e2e15ec4b610c0f5c73e1db424b1bbc65dd39a

commit r13-1508-g02e2e15ec4b610c0f5c73e1db424b1bbc65dd39a
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Tue Jul 5 18:00:00 2022 +0100

    UNSPEC_PALIGNR optimizations and clean-ups on x86.
    
    This patch is a follow-up to Hongtao's fix for PR target/105854.  That
    fix is perfectly correct, but the thing that caught my eye was why the
    compiler is generating a shift by zero at all.  Digging deeper, it
    turns out that we can easily optimize __builtin_ia32_palignr for
    alignments of 0 and 64, which may be simplified to moves of the
    lowpart and highpart respectively.
    
    After adding optimizations to simplify the 64-bit DImode palignr, I
    started to add the corresponding optimizations for vpalignr (i.e.
    128-bit).  The first oddity is that sse.md uses TImode and a special
    SSESCALARMODE iterator, rather than V1TImode, and indeed the comment
    above SSESCALARMODE hints that this should be "dropped in favor of
    VIMAX_AVX2_AVX512BW".  Hence this patch includes the migration of
    <ssse3_avx2>_palignr<mode> to use VIMAX_AVX2_AVX512BW, basically
    using V1TImode instead of TImode for 128-bit palignr.
    
    This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
    and make -k check, both with and without --target_board=unix{-m32},
    with no new failures.  Ok for mainline?
    
    2022-07-05  Roger Sayle  <roger@nextmovesoftware.com>
                Hongtao Liu  <hongtao.liu@intel.com>
    
    gcc/ChangeLog
            * config/i386/i386-builtin.def (__builtin_ia32_palignr128): Change
            CODE_FOR_ssse3_palignrti to CODE_FOR_ssse3_palignrv1ti.
            * config/i386/i386-expand.cc (expand_vec_perm_palignr): Use V1TImode
            and gen_ssse3_palignrv1ti instead of TImode.
            * config/i386/sse.md (SSESCALARMODE): Delete.
            (define_mode_attr ssse3_avx2): Handle V1TImode instead of TImode.
            (<ssse3_avx2>_palignr<mode>): Use VIMAX_AVX2_AVX512BW as a mode
            iterator instead of SSESCALARMODE.
            (ssse3_palignrdi): Optimize cases where operands[3] is 0 or 64,
            using a single move instruction (if required).
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/ssse3-palignr-2.c: New test case.

Diff:
---
 gcc/config/i386/i386-builtin.def                |  2 +-
 gcc/config/i386/i386-expand.cc                  |  8 ++++--
 gcc/config/i386/sse.md                          | 37 +++++++++++++++++--------
 gcc/testsuite/gcc.target/i386/ssse3-palignr-2.c | 21 ++++++++++++++
 4 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index e6daad47692..fd160935e67 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -900,7 +900,7 @@ BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psig
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI)
 
 /* SSSE3.  */
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_palignrv1ti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT)
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT)
 
 /* SSE4.1 */
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8bc54304da8..6a3fcde5738 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19548,9 +19548,11 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
   shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
   if (GET_MODE_SIZE (d->vmode) == 16)
     {
-      target = gen_reg_rtx (TImode);
-      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
-				      gen_lowpart (TImode, dcopy.op0), shift));
+      target = gen_reg_rtx (V1TImode);
+      emit_insn (gen_ssse3_palignrv1ti (target,
+					gen_lowpart (V1TImode, dcopy.op1),
+					gen_lowpart (V1TImode, dcopy.op0),
+					shift));
     }
   else
     {
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3396ff748da..81d32a88785 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -575,10 +575,6 @@
 (define_mode_iterator VIMAX_AVX2
   [(V2TI "TARGET_AVX2") V1TI])
 
-;; ??? This should probably be dropped in favor of VIMAX_AVX2_AVX512BW.
-(define_mode_iterator SSESCALARMODE
-  [(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI])
-
 (define_mode_iterator VI12_AVX2
   [(V32QI "TARGET_AVX2") V16QI
    (V16HI "TARGET_AVX2") V8HI])
@@ -712,7 +708,7 @@
     (V4HI "ssse3") (V8HI "ssse3") (V16HI "avx2") (V32HI "avx512bw")
     (V4SI "ssse3") (V8SI "avx2")
     (V2DI "ssse3") (V4DI "avx2")
-    (TI "ssse3") (V2TI "avx2") (V4TI "avx512bw")])
+    (V1TI "ssse3") (V2TI "avx2") (V4TI "avx512bw")])
 
 (define_mode_attr sse4_1_avx2
    [(V16QI "sse4_1") (V32QI "avx2") (V64QI "avx512bw")
@@ -21108,10 +21104,10 @@
    (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<ssse3_avx2>_palignr<mode>"
-  [(set (match_operand:SSESCALARMODE 0 "register_operand" "=x,<v_Yw>")
-	(unspec:SSESCALARMODE
-	  [(match_operand:SSESCALARMODE 1 "register_operand" "0,<v_Yw>")
-	   (match_operand:SSESCALARMODE 2 "vector_operand" "xBm,<v_Yw>m")
+  [(set (match_operand:VIMAX_AVX2_AVX512BW 0 "register_operand" "=x,<v_Yw>")
+	(unspec:VIMAX_AVX2_AVX512BW
+	  [(match_operand:VIMAX_AVX2_AVX512BW 1 "register_operand" "0,<v_Yw>")
+	   (match_operand:VIMAX_AVX2_AVX512BW 2 "vector_operand" "xBm,<v_Yw>m")
 	   (match_operand:SI 3 "const_0_to_255_mul_8_operand")]
 	  UNSPEC_PALIGNR))]
   "TARGET_SSSE3"
@@ -21157,11 +21153,30 @@
       gcc_unreachable ();
     }
 }
-  "TARGET_SSSE3 && reload_completed
-   && SSE_REGNO_P (REGNO (operands[0]))"
+  "(TARGET_SSSE3 && reload_completed
+    && SSE_REGNO_P (REGNO (operands[0])))
+   || operands[3] == const0_rtx
+   || INTVAL (operands[3]) == 64"
   [(set (match_dup 0)
 	(lshiftrt:V1TI (match_dup 0) (match_dup 3)))]
 {
+  if (operands[3] == const0_rtx)
+    {
+      if (!rtx_equal_p (operands[0], operands[2]))
+	emit_move_insn (operands[0], operands[2]);
+      else
+	emit_note (NOTE_INSN_DELETED);
+      DONE;
+    }
+  else if (INTVAL (operands[3]) == 64)
+    {
+      if (!rtx_equal_p (operands[0], operands[1]))
+	emit_move_insn (operands[0], operands[1]);
+      else
+	emit_note (NOTE_INSN_DELETED);
+      DONE;
+    }
+
   /* Emulate MMX palignrdi with SSE psrldq.  */
   rtx op0 = lowpart_subreg (V2DImode, operands[0],
 			    GET_MODE (operands[0]));
diff --git a/gcc/testsuite/gcc.target/i386/ssse3-palignr-2.c b/gcc/testsuite/gcc.target/i386/ssse3-palignr-2.c
new file mode 100644
index 00000000000..791222ddd78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/ssse3-palignr-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mssse3" } */
+
+typedef long long __attribute__ ((__vector_size__ (8))) T;
+
+T x;
+T y;
+T z;
+
+void foo()
+{
+  z = __builtin_ia32_palignr (x, y, 0);
+}
+
+void bar()
+{
+  z = __builtin_ia32_palignr (x, y, 64);
+}
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-not "pshufd" } } */
+/* { dg-final { scan-assembler-not "psrldq" } } */


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-07-05 17:02 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-05 17:02 [gcc r13-1508] UNSPEC_PALIGNR optimizations and clean-ups on x86 Roger Sayle

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).