public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r11-4731] rs6000: Use direct move for char/short vector CTOR [PR96933]
@ 2020-11-05  8:09 Kewen Lin
  0 siblings, 0 replies; only message in thread
From: Kewen Lin @ 2020-11-05  8:09 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:025f434a87336e38bf5140fba2005081876aa911

commit r11-4731-g025f434a87336e38bf5140fba2005081876aa911
Author: Kewen Lin <linkw@linux.ibm.com>
Date:   Thu Nov 5 00:04:10 2020 -0600

    rs6000: Use direct move for char/short vector CTOR [PR96933]
    
    This patch is to make vector CTOR with char/short leverage direct
    move instructions when they are available.  With one constructed
    test case, it can speed up 145% for char and 190% for short on P9.
    
    Tested SPEC2017 x264_r at -Ofast on P9, it gets 1.61% speedup
    (but based on unexpected SLP see PR96789).
    
    Bootstrapped/regtested on powerpc64{,le}-linux-gnu P8 and
    powerpc64le-linux-gnu P9.
    
    gcc/ChangeLog:
    
            PR target/96933
            * config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
            instructions for vector construction with char/short types.
            * config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
            (p8_mtvsrd_v16qidi2): Likewise.
    
    gcc/testsuite/ChangeLog:
    
            PR target/96933
            * gcc.target/powerpc/pr96933-1.c: New test.
            * gcc.target/powerpc/pr96933-2.c: New test.
            * gcc.target/powerpc/pr96933-3.c: New test.
            * gcc.target/powerpc/pr96933-4.c: New test.
            * gcc.target/powerpc/pr96933.h: New test.
            * gcc.target/powerpc/pr96933-run.h: New test.

Diff:
---
 gcc/config/rs6000/rs6000.c                     | 179 ++++++++++++++++++++++++-
 gcc/config/rs6000/rs6000.md                    |  16 +++
 gcc/testsuite/gcc.target/powerpc/pr96933-1.c   |  15 +++
 gcc/testsuite/gcc.target/powerpc/pr96933-2.c   |  15 +++
 gcc/testsuite/gcc.target/powerpc/pr96933-3.c   |  10 ++
 gcc/testsuite/gcc.target/powerpc/pr96933-4.c   |  10 ++
 gcc/testsuite/gcc.target/powerpc/pr96933-run.h |  56 ++++++++
 gcc/testsuite/gcc.target/powerpc/pr96933.h     |  50 +++++++
 8 files changed, 349 insertions(+), 2 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 1e506b83762..e613353d0c9 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6522,11 +6522,11 @@ rs6000_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
-  int n_elts = GET_MODE_NUNITS (mode);
+  unsigned int n_elts = GET_MODE_NUNITS (mode);
   int n_var = 0, one_var = -1;
   bool all_same = true, all_const_zero = true;
   rtx x, mem;
-  int i;
+  unsigned int i;
 
   for (i = 0; i < n_elts; ++i)
     {
@@ -6792,6 +6792,181 @@ rs6000_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
+    {
+      rtx op[16];
+      /* Force the values into word_mode registers.  */
+      for (i = 0; i < n_elts; i++)
+	{
+	  rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+	  if (TARGET_64BIT)
+	    {
+	      op[i] = gen_reg_rtx (DImode);
+	      emit_insn (gen_zero_extendqidi2 (op[i], tmp));
+	    }
+	  else
+	    {
+	      op[i] = gen_reg_rtx (SImode);
+	      emit_insn (gen_zero_extendqisi2 (op[i], tmp));
+	    }
+	}
+
+      /* Take unsigned char big endianness on 64bit as example for below
+	 construction, the input values are: A, B, C, D, ..., O, P.  */
+
+      if (TARGET_DIRECT_MOVE_128)
+	{
+	  /* Move to VSX register with vec_concat, each has 2 values.
+	     eg: vr1[0] = { xxxxxxxA, xxxxxxxB };
+		 vr1[1] = { xxxxxxxC, xxxxxxxD };
+		 ...
+		 vr1[7] = { xxxxxxxO, xxxxxxxP };  */
+	  rtx vr1[8];
+	  for (i = 0; i < n_elts / 2; i++)
+	    {
+	      vr1[i] = gen_reg_rtx (V2DImode);
+	      emit_insn (gen_vsx_concat_v2di (vr1[i], op[i * 2],
+					      op[i * 2 + 1]));
+	    }
+
+	  /* Pack vectors with 2 values into vectors with 4 values.
+	     eg: vr2[0] = { xxxAxxxB, xxxCxxxD };
+		 vr2[1] = { xxxExxxF, xxxGxxxH };
+		 vr2[1] = { xxxIxxxJ, xxxKxxxL };
+		 vr2[3] = { xxxMxxxN, xxxOxxxP };  */
+	  rtx vr2[4];
+	  for (i = 0; i < n_elts / 4; i++)
+	    {
+	      vr2[i] = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_altivec_vpkudum (vr2[i], vr1[i * 2],
+					      vr1[i * 2 + 1]));
+	    }
+
+	  /* Pack vectors with 4 values into vectors with 8 values.
+	     eg: vr3[0] = { xAxBxCxD, xExFxGxH };
+		 vr3[1] = { xIxJxKxL, xMxNxOxP };  */
+	  rtx vr3[2];
+	  for (i = 0; i < n_elts / 8; i++)
+	    {
+	      vr3[i] = gen_reg_rtx (V8HImode);
+	      emit_insn (gen_altivec_vpkuwum (vr3[i], vr2[i * 2],
+					      vr2[i * 2 + 1]));
+	    }
+
+	  /* If it's V8HImode, it's done and return it. */
+	  if (mode == V8HImode)
+	    {
+	      emit_insn (gen_rtx_SET (target, vr3[0]));
+	      return;
+	    }
+
+	  /* Pack vectors with 8 values into 16 values.  */
+	  rtx res = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_altivec_vpkuhum (res, vr3[0], vr3[1]));
+	  emit_insn (gen_rtx_SET (target, res));
+	}
+      else
+	{
+	  rtx (*merge_v16qi) (rtx, rtx, rtx) = NULL;
+	  rtx (*merge_v8hi) (rtx, rtx, rtx) = NULL;
+	  rtx (*merge_v4si) (rtx, rtx, rtx) = NULL;
+	  rtx perm_idx;
+
+	  /* Set up some common gen routines and values.  */
+	  if (BYTES_BIG_ENDIAN)
+	    {
+	      if (mode == V16QImode)
+		{
+		  merge_v16qi = gen_altivec_vmrghb;
+		  merge_v8hi = gen_altivec_vmrglh;
+		}
+	      else
+		merge_v8hi = gen_altivec_vmrghh;
+
+	      merge_v4si = gen_altivec_vmrglw;
+	      perm_idx = GEN_INT (3);
+	    }
+	  else
+	    {
+	      if (mode == V16QImode)
+		{
+		  merge_v16qi = gen_altivec_vmrglb;
+		  merge_v8hi = gen_altivec_vmrghh;
+		}
+	      else
+		merge_v8hi = gen_altivec_vmrglh;
+
+	      merge_v4si = gen_altivec_vmrghw;
+	      perm_idx = GEN_INT (0);
+	    }
+
+	  /* Move to VSX register with direct move.
+	     eg: vr_qi[0] = { xxxxxxxA, xxxxxxxx };
+		 vr_qi[1] = { xxxxxxxB, xxxxxxxx };
+		 ...
+		 vr_qi[15] = { xxxxxxxP, xxxxxxxx };  */
+	  rtx vr_qi[16];
+	  for (i = 0; i < n_elts; i++)
+	    {
+	      vr_qi[i] = gen_reg_rtx (V16QImode);
+	      if (TARGET_64BIT)
+		emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
+	      else
+		emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
+	    }
+
+	  /* Merge/move to vector short.
+	     eg: vr_hi[0] = { xxxxxxxx, xxxxxxAB };
+		 vr_hi[1] = { xxxxxxxx, xxxxxxCD };
+		 ...
+		 vr_hi[7] = { xxxxxxxx, xxxxxxOP };  */
+	  rtx vr_hi[8];
+	  for (i = 0; i < 8; i++)
+	    {
+	      rtx tmp = vr_qi[i];
+	      if (mode == V16QImode)
+		{
+		  tmp = gen_reg_rtx (V16QImode);
+		  emit_insn (merge_v16qi (tmp, vr_qi[2 * i], vr_qi[2 * i + 1]));
+		}
+	      vr_hi[i] = gen_reg_rtx (V8HImode);
+	      emit_move_insn (vr_hi[i], gen_lowpart (V8HImode, tmp));
+	    }
+
+	  /* Merge vector short to vector int.
+	     eg: vr_si[0] = { xxxxxxxx, xxxxABCD };
+		 vr_si[1] = { xxxxxxxx, xxxxEFGH };
+		 ...
+		 vr_si[3] = { xxxxxxxx, xxxxMNOP };  */
+	  rtx vr_si[4];
+	  for (i = 0; i < 4; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V8HImode);
+	      emit_insn (merge_v8hi (tmp, vr_hi[2 * i], vr_hi[2 * i + 1]));
+	      vr_si[i] = gen_reg_rtx (V4SImode);
+	      emit_move_insn (vr_si[i], gen_lowpart (V4SImode, tmp));
+	    }
+
+	  /* Merge vector int to vector long.
+	     eg: vr_di[0] = { xxxxxxxx, ABCDEFGH };
+		 vr_di[1] = { xxxxxxxx, IJKLMNOP };  */
+	  rtx vr_di[2];
+	  for (i = 0; i < 2; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V4SImode);
+	      emit_insn (merge_v4si (tmp, vr_si[2 * i], vr_si[2 * i + 1]));
+	      vr_di[i] = gen_reg_rtx (V2DImode);
+	      emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+	    }
+
+	  rtx res = gen_reg_rtx (V2DImode);
+	  emit_insn (gen_vsx_xxpermdi_v2di (res, vr_di[0], vr_di[1], perm_idx));
+	  emit_insn (gen_rtx_SET (target, gen_lowpart (mode, res)));
+	}
+
+      return;
+    }
+
   /* Construct the vector in memory one field at a time
      and load the whole vector.  */
   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index dc060143104..acbf1303ab2 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8713,6 +8713,22 @@
   "mtvsrwz %x0,%1"
   [(set_attr "type" "mtvsr")])
 
+(define_insn "p8_mtvsrwz_v16qisi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRWZ))]
+  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrwz %x0,%1"
+  [(set_attr "type" "mtvsr")])
+
+(define_insn "p8_mtvsrd_v16qidi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRD))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrd %x0,%1"
+  [(set_attr "type" "mtvsr")])
+
 (define_insn_and_split "reload_fpr_from_gpr<mode>"
   [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
 	(unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-1.c b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
new file mode 100644
index 00000000000..3b63865b3b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+/* Test vector constructions with char/short type values whether use 128bit
+   direct move instructions mtvsrdd on Power9 or later, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 24 } } */
+/* { dg-final { scan-assembler-times {\mvpkudum\M} 12 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-2.c b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
new file mode 100644
index 00000000000..cef8fbd4f35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short type values whether use direct
+   move instructions like mtvsrd/mtvsrwz on Power8, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 48 {target lp64 } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 48 {target {! lp64 } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-3.c b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
new file mode 100644
index 00000000000..3e5709ab0d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
@@ -0,0 +1,10 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short run successfully on Power8.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+#include "pr96933-run.h"
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-4.c b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c
new file mode 100644
index 00000000000..5a1c3d0c857
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c
@@ -0,0 +1,10 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p9vector_hw } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+/* Test vector constructions with char/short run successfully on Power9.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+#include "pr96933-run.h"
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-run.h b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h
new file mode 100644
index 00000000000..7fa8dac639c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h
@@ -0,0 +1,56 @@
+/* Test function for pr96933-{3,4}.c run result verification.  */
+
+int
+main ()
+{
+  unsigned char uc[16];
+  signed char sc[16];
+
+  for (int i = 0; i < 16; i++)
+    {
+      uc[i] = (unsigned char) (i * 2 + 1);
+      sc[i] = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned char ucv
+    = test_uchar (uc[0], uc[1], uc[2], uc[3], uc[4], uc[5], uc[6], uc[7], uc[8],
+		  uc[9], uc[10], uc[11], uc[12], uc[13], uc[14], uc[15]);
+  vector signed char scv
+    = test_schar (sc[0], sc[1], sc[2], sc[3], sc[4], sc[5], sc[6], sc[7], sc[8],
+		  sc[9], sc[10], sc[11], sc[12], sc[13], sc[14], sc[15]);
+
+  for (int i = 0; i < 16; i++)
+    {
+      unsigned char uexp = (unsigned char) (i * 2 + 1);
+      signed char sexp = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+      if (ucv[i] != uexp)
+	abort ();
+      if (scv[i] != sexp)
+	abort ();
+    }
+
+  unsigned short us[8];
+  signed short ss[8];
+  for (int i = 0; i < 8; i++)
+    {
+      us[i] = (unsigned short) (i * 2 + 1);
+      ss[i] = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned short usv
+    = test_ushort (us[0], us[1], us[2], us[3], us[4], us[5], us[6], us[7]);
+  vector signed short ssv
+    = test_sshort (ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7]);
+
+  for (int i = 0; i < 8; i++)
+    {
+      unsigned short uexp = (unsigned short) (i * 2 + 1);
+      signed short sexp = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+      if (usv[i] != uexp)
+	abort ();
+      if (ssv[i] != sexp)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933.h b/gcc/testsuite/gcc.target/powerpc/pr96933.h
new file mode 100644
index 00000000000..4bc2b941e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933.h
@@ -0,0 +1,50 @@
+/* Source file for pr96933-*.c testing, this mainly contains 4
+   functions as below:
+
+     - test_uchar  // vector unsigned char
+     - test_schar  // vector signed char
+     - test_ushort // vector unsigned short
+     - test_sshort // vector signed short
+*/
+
+__attribute__ ((noipa)) vector unsigned char
+test_uchar (unsigned char c1, unsigned char c2, unsigned char c3,
+	    unsigned char c4, unsigned char c5, unsigned char c6,
+	    unsigned char c7, unsigned char c8, unsigned char c9,
+	    unsigned char c10, unsigned char c11, unsigned char c12,
+	    unsigned char c13, unsigned char c14, unsigned char c15,
+	    unsigned char c16)
+{
+  vector unsigned char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed char
+test_schar (signed char c1, signed char c2, signed char c3, signed char c4,
+	    signed char c5, signed char c6, signed char c7, signed char c8,
+	    signed char c9, signed char c10, signed char c11, signed char c12,
+	    signed char c13, signed char c14, signed char c15, signed char c16)
+{
+  vector signed char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector unsigned short
+test_ushort (unsigned short c1, unsigned short c2, unsigned short c3,
+	     unsigned short c4, unsigned short c5, unsigned short c6,
+	     unsigned short c7, unsigned short c8)
+{
+  vector unsigned short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed short
+test_sshort (signed short c1, signed short c2, signed short c3,
+	     signed short c4, signed short c5, signed short c6,
+	     signed short c7, signed short c8)
+{
+  vector signed short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2020-11-05  8:09 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-05  8:09 [gcc r11-4731] rs6000: Use direct move for char/short vector CTOR [PR96933] Kewen Lin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).