public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] rs6000: Use direct move for char/short vector CTOR [PR96933]
@ 2020-09-08  7:55 Kewen.Lin
  2020-09-10  3:19 ` [PATCH v2] " Kewen.Lin
  0 siblings, 1 reply; 8+ messages in thread
From: Kewen.Lin @ 2020-09-08  7:55 UTC (permalink / raw)
  To: GCC Patches; +Cc: Segher Boessenkool, Bill Schmidt, David Edelsohn

[-- Attachment #1: Type: text/plain, Size: 928 bytes --]

Hi,

This patch is to make vector CTOR with char/short leverage direct
move instructions when they are available.  With one constructed
test case, it can speed up 145% for char and 190% for short on P9.

Tested SPEC2017 x264_r at -Ofast on P9, it gets 1.61% speedup
(but based on unexpected SLP see PR96789).

Bootstrapped/regtested on powerpc64{,le}-linux-gnu P8 and
powerpc64le-linux-gnu P9.

Is it ok for trunk?

BR,
Kewen
------------

gcc/ChangeLog:

	PR target/96933
	* config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
	instructions for vector construction with char/short types.
	* config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
	(p8_mtvsrd_v16qidi2): Likewise. 

gcc/testsuite/ChangeLog:

	PR target/96933
	* gcc.target/powerpc/pr96933-1.c: New test.
	* gcc.target/powerpc/pr96933-2.c: New test.
	* gcc.target/powerpc/pr96933-3.c: New test.
	* gcc.target/powerpc/pr96933.h: New test.

[-- Attachment #2: vec_CTOR.diff --]
[-- Type: text/plain, Size: 13514 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index ca5b71ecdd3..39d7e2e9451 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6411,11 +6411,11 @@ rs6000_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
-  int n_elts = GET_MODE_NUNITS (mode);
+  unsigned int n_elts = GET_MODE_NUNITS (mode);
   int n_var = 0, one_var = -1;
   bool all_same = true, all_const_zero = true;
   rtx x, mem;
-  int i;
+  unsigned int i;
 
   for (i = 0; i < n_elts; ++i)
     {
@@ -6681,6 +6681,207 @@ rs6000_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
+    {
+      rtx op[16];
+      /* Force the values into word_mode registers.  */
+      for (i = 0; i < n_elts; i++)
+	{
+	  rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+	  if (TARGET_POWERPC64)
+	    {
+	      op[i] = gen_reg_rtx (DImode);
+	      emit_insn (gen_zero_extendqidi2 (op[i], tmp));
+	    }
+	  else
+	    {
+	      op[i] = gen_reg_rtx (SImode);
+	      emit_insn (gen_zero_extendqisi2 (op[i], tmp));
+	    }
+	}
+
+      rtx vr_qi[16];
+      rtx vr_hi[8];
+      rtx vr_si[4];
+      rtx vr_di[2];
+
+      rtx (*merge_v16qi) (rtx, rtx, rtx) = NULL;
+      rtx (*merge_v8hi) (rtx, rtx, rtx) = NULL;
+      rtx (*merge_v4si) (rtx, rtx, rtx) = NULL;
+      rtx perm_idx;
+
+      /* Set up some common gen routines and values according to endianness.  */
+      if (BYTES_BIG_ENDIAN)
+	{
+	  if (mode == V16QImode)
+	    {
+	      merge_v16qi = gen_altivec_vmrghb;
+	      merge_v8hi = gen_altivec_vmrglh;
+	    }
+	  else
+	    merge_v8hi = gen_altivec_vmrghh;
+
+	  merge_v4si = gen_altivec_vmrglw;
+	  perm_idx = GEN_INT (3);
+	}
+      else
+	{
+	  if (mode == V16QImode)
+	    {
+	      merge_v16qi = gen_altivec_vmrglb;
+	      merge_v8hi = gen_altivec_vmrghh;
+	    }
+	  else
+	    merge_v8hi = gen_altivec_vmrglh;
+
+	  merge_v4si = gen_altivec_vmrghw;
+	  perm_idx = GEN_INT (0);
+	}
+
+      if (TARGET_DIRECT_MOVE_128)
+	{
+	  /* Take unsigned char big endianness as example for below comments,
+	     the input values are: c1, c2, c3, c4, ..., c15, c16.  */
+
+	  rtx vr_mrg1[8];
+	  /* Move to VSX register with vec_concat, each has 2 values.
+	     eg: vr_mrg1[0] = { ... c1, ... c2 };
+		 vr_mrg1[1] = { ... c3, ... c4 };
+		 ...  */
+	  for (i = 0; i < n_elts / 2; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V2DImode);
+	      emit_insn (gen_vsx_concat_v2di (tmp, op[i * 2], op[i * 2 + 1]));
+	      vr_mrg1[i] = gen_reg_rtx (V16QImode);
+	      emit_move_insn (vr_mrg1[i], gen_lowpart (V16QImode, tmp));
+	    }
+
+	  /* Obtain the control vector for further merging.  */
+	  rtx vr_ctrl = gen_reg_rtx (V16QImode);
+	  if (mode == V16QImode)
+	    {
+	      rtx val;
+	      if (BYTES_BIG_ENDIAN)
+		val = gen_int_mode (0x070f171f, SImode);
+	      else
+		val = gen_int_mode (0x18100800, SImode);
+	      rtvec v = gen_rtvec (4, val, val, val, val);
+	      rtx tmp = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_vec_initv4sisi (tmp,
+					     gen_rtx_PARALLEL (V4SImode, v)));
+	      emit_move_insn (vr_ctrl, gen_lowpart (V16QImode, tmp));
+	    }
+	  else
+	    {
+	      rtx val;
+	      if (BYTES_BIG_ENDIAN)
+		val = gen_int_mode (0x06070e0f16171e1f, DImode);
+	      else
+		val = gen_int_mode (0x1918111009080100, DImode);
+	      rtvec v = gen_rtvec (2, val, val);
+	      rtx tmp = gen_reg_rtx (V2DImode);
+	      emit_insn (gen_vec_initv2didi (tmp,
+					     gen_rtx_PARALLEL (V2DImode, v)));
+	      emit_move_insn (vr_ctrl, gen_lowpart (V16QImode, tmp));
+	    }
+
+	  rtx vr_mrg2[4];
+	  /* Merge vectors with 2 values into vectors with 4 values.
+	     eg: vr_mrg1[0] = { ... c1, ... c2 };
+		 vr_mrg1[1] = { ... c3, ... c4 };
+		 ...
+	       =>
+		 vr_mrg2[0] = { ... c1, c2, c3, c4 };
+		 ... */
+	  for (i = 0; i < n_elts / 4; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V16QImode);
+	      emit_insn (gen_altivec_vperm_v16qi (tmp, vr_mrg1[i * 2],
+						  vr_mrg1[i * 2 + 1],
+						  gen_lowpart (V16QImode,
+							       vr_ctrl)));
+	      if (mode == V16QImode)
+		{
+		  vr_mrg2[i] = gen_reg_rtx (V4SImode);
+		  emit_move_insn (vr_mrg2[i], gen_lowpart (V4SImode, tmp));
+		}
+	      else
+		{
+		  vr_di[i] = gen_reg_rtx (V2DImode);
+		  emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+		}
+	    }
+
+	  /* Merge vectors with 4 values into vectors with 8 values.
+	     eg: vr_mrg2[0] = { ... c1, c2, c3, c4 };
+		 vr_mrg2[1] = { ... c5, c6, c7, c8 };
+		 ...
+	       =>
+		 vr_di[0] = { ... c1, c2, c3, c4, c5, c6, c7, c8 };
+		 ... */
+	  if (mode == V16QImode)
+	    {
+	      for (i = 0; i < 2; i++)
+		{
+		  rtx tmp = gen_reg_rtx (V4SImode);
+		  emit_insn (merge_v4si (tmp, vr_mrg2[2 * i],
+					 vr_mrg2[2 * i + 1]));
+		  vr_di[i] = gen_reg_rtx (V2DImode);
+		  emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+		}
+	    }
+	}
+      else
+	{
+	  /* Move to VSX register.  */
+	  for (i = 0; i < n_elts; i++)
+	    {
+	      vr_qi[i] = gen_reg_rtx (V16QImode);
+	      if (TARGET_POWERPC64)
+		emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
+	      else
+		emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
+	    }
+
+	  /* Merge/move into vector short.  */
+	  for (i = 0; i < 8; i++)
+	    {
+	      rtx tmp = vr_qi[i];
+	      if (mode == V16QImode)
+		{
+		  tmp = gen_reg_rtx (V16QImode);
+		  emit_insn (merge_v16qi (tmp, vr_qi[2 * i], vr_qi[2 * i + 1]));
+		}
+	      vr_hi[i] = gen_reg_rtx (V8HImode);
+	      emit_move_insn (vr_hi[i], gen_lowpart (V8HImode, tmp));
+	    }
+
+	  /* Merge vector short into vector int.  */
+	  for (i = 0; i < 4; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V8HImode);
+	      emit_insn (merge_v8hi (tmp, vr_hi[2 * i], vr_hi[2 * i + 1]));
+	      vr_si[i] = gen_reg_rtx (V4SImode);
+	      emit_move_insn (vr_si[i], gen_lowpart (V4SImode, tmp));
+	    }
+
+	  /* Merge vector int into vector long long.  */
+	  for (i = 0; i < 2; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V4SImode);
+	      emit_insn (merge_v4si (tmp, vr_si[2 * i], vr_si[2 * i + 1]));
+	      vr_di[i] = gen_reg_rtx (V2DImode);
+	      emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+	    }
+	}
+
+      rtx res = gen_reg_rtx (V2DImode);
+      emit_insn (gen_vsx_xxpermdi_v2di (res, vr_di[0], vr_di[1], perm_idx));
+      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, res)));
+
+      return;
+    }
+
   /* Construct the vector in memory one field at a time
      and load the whole vector.  */
   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 43b620ae1c0..4d6bd193a10 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8713,6 +8713,22 @@ (define_insn "p8_mtvsrwz"
   "mtvsrwz %x0,%1"
   [(set_attr "type" "mftgpr")])
 
+(define_insn "p8_mtvsrwz_v16qisi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRWZ))]
+  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrwz %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
+(define_insn "p8_mtvsrd_v16qidi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRD))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrd %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
 (define_insn_and_split "reload_fpr_from_gpr<mode>"
   [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
 	(unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-1.c b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
new file mode 100644
index 00000000000..eaa4ca5cd5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short type values whether use 128bit
+   direct move instructions mtvsrdd on Power9 or later, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 24 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-2.c b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
new file mode 100644
index 00000000000..e8d57d67aaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target powerpc_p8vector_ok } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short type values whether use direct
+   move instructions like mtvsrd/mtvsrwz on Power8, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 48 {target lp64 } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 48 {target {! lp64 } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-3.c b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
new file mode 100644
index 00000000000..fd0c6926c5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short run successfully.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+
+int
+main ()
+{
+  unsigned char uc[16];
+  signed char sc[16];
+
+  for (int i = 0; i < 16; i++)
+    {
+      uc[i] = (unsigned char) (i * 2 + 1);
+      sc[i] = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned char ucv
+    = test_uchar (uc[0], uc[1], uc[2], uc[3], uc[4], uc[5], uc[6], uc[7], uc[8],
+		  uc[9], uc[10], uc[11], uc[12], uc[13], uc[14], uc[15]);
+  vector signed char scv
+    = test_schar (sc[0], sc[1], sc[2], sc[3], sc[4], sc[5], sc[6], sc[7], sc[8],
+		  sc[9], sc[10], sc[11], sc[12], sc[13], sc[14], sc[15]);
+
+  for (int i = 0; i < 16; i++)
+    {
+      unsigned char uexp = (unsigned char) (i * 2 + 1);
+      signed char sexp = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+      if (ucv[i] != uexp)
+	abort ();
+      if (scv[i] != sexp)
+	abort ();
+    }
+
+  unsigned short us[8];
+  signed short ss[8];
+  for (int i = 0; i < 8; i++)
+    {
+      us[i] = (unsigned short) (i * 2 + 1);
+      ss[i] = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned short usv
+    = test_ushort (us[0], us[1], us[2], us[3], us[4], us[5], us[6], us[7]);
+  vector signed short ssv
+    = test_sshort (ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7]);
+
+  for (int i = 0; i < 8; i++)
+    {
+      unsigned short uexp = (unsigned short) (i * 2 + 1);
+      signed short sexp = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+      if (usv[i] != uexp)
+	abort ();
+      if (ssv[i] != sexp)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933.h b/gcc/testsuite/gcc.target/powerpc/pr96933.h
new file mode 100644
index 00000000000..4bc2b941e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933.h
@@ -0,0 +1,50 @@
+/* Source file for pr96933-*.c testing, this mainly contains 4
+   functions as below:
+
+     - test_uchar  // vector unsigned char
+     - test_schar  // vector signed char
+     - test_ushort // vector unsigned short
+     - test_sshort // vector signed short
+*/
+
+__attribute__ ((noipa)) vector unsigned char
+test_uchar (unsigned char c1, unsigned char c2, unsigned char c3,
+	    unsigned char c4, unsigned char c5, unsigned char c6,
+	    unsigned char c7, unsigned char c8, unsigned char c9,
+	    unsigned char c10, unsigned char c11, unsigned char c12,
+	    unsigned char c13, unsigned char c14, unsigned char c15,
+	    unsigned char c16)
+{
+  vector unsigned char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed char
+test_schar (signed char c1, signed char c2, signed char c3, signed char c4,
+	    signed char c5, signed char c6, signed char c7, signed char c8,
+	    signed char c9, signed char c10, signed char c11, signed char c12,
+	    signed char c13, signed char c14, signed char c15, signed char c16)
+{
+  vector signed char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector unsigned short
+test_ushort (unsigned short c1, unsigned short c2, unsigned short c3,
+	     unsigned short c4, unsigned short c5, unsigned short c6,
+	     unsigned short c7, unsigned short c8)
+{
+  vector unsigned short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed short
+test_sshort (signed short c1, signed short c2, signed short c3,
+	     signed short c4, signed short c5, signed short c6,
+	     signed short c7, signed short c8)
+{
+  vector signed short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] rs6000: Use direct move for char/short vector CTOR [PR96933]
  2020-09-08  7:55 [PATCH] rs6000: Use direct move for char/short vector CTOR [PR96933] Kewen.Lin
@ 2020-09-10  3:19 ` Kewen.Lin
  2020-10-13  6:59   ` PING^1 " Kewen.Lin
  0 siblings, 1 reply; 8+ messages in thread
From: Kewen.Lin @ 2020-09-10  3:19 UTC (permalink / raw)
  To: GCC Patches; +Cc: Bill Schmidt, David Edelsohn, Segher Boessenkool

[-- Attachment #1: Type: text/plain, Size: 916 bytes --]

Hi,

As Segher's suggestion in the PR, for 128bit_direct_move, this new
version leverages vector pack insns instead of vector perms with
one control vector.  The performance evaluation shows that it's on
par with the previous version for char, while it's better than the
previous for short.

Bootstrapped/regtested again on powerpc64{,le}-linux-gnu P8 and
powerpc64le-linux-gnu P9.

Is it ok for trunk?

BR,
Kewen
------------

gcc/ChangeLog:

	PR target/96933
	* config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
	instructions for vector construction with char/short types.
	* config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
	(p8_mtvsrd_v16qidi2): Likewise. 

gcc/testsuite/ChangeLog:

	PR target/96933
	* gcc.target/powerpc/pr96933-1.c: New test.
	* gcc.target/powerpc/pr96933-2.c: New test.
	* gcc.target/powerpc/pr96933-3.c: New test.
	* gcc.target/powerpc/pr96933.h: New test.

[-- Attachment #2: vec_CTORv2.diff --]
[-- Type: text/plain, Size: 12752 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index ca5b71ecdd3..b641ee49a49 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6411,11 +6411,11 @@ rs6000_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
-  int n_elts = GET_MODE_NUNITS (mode);
+  unsigned int n_elts = GET_MODE_NUNITS (mode);
   int n_var = 0, one_var = -1;
   bool all_same = true, all_const_zero = true;
   rtx x, mem;
-  int i;
+  unsigned int i;
 
   for (i = 0; i < n_elts; ++i)
     {
@@ -6681,6 +6681,181 @@ rs6000_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
+    {
+      rtx op[16];
+      /* Force the values into word_mode registers.  */
+      for (i = 0; i < n_elts; i++)
+	{
+	  rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+	  if (TARGET_POWERPC64)
+	    {
+	      op[i] = gen_reg_rtx (DImode);
+	      emit_insn (gen_zero_extendqidi2 (op[i], tmp));
+	    }
+	  else
+	    {
+	      op[i] = gen_reg_rtx (SImode);
+	      emit_insn (gen_zero_extendqisi2 (op[i], tmp));
+	    }
+	}
+
+      /* Take unsigned char big endianness on 64bit as example for below
+	 construction, the input values are: A, B, C, D, ..., O, P.  */
+
+      if (TARGET_DIRECT_MOVE_128)
+	{
+	  /* Move to VSX register with vec_concat, each has 2 values.
+	     eg: vr1[0] = { xxxxxxxA, xxxxxxxB };
+		 vr1[1] = { xxxxxxxC, xxxxxxxD };
+		 ...
+		 vr1[7] = { xxxxxxxO, xxxxxxxP };  */
+	  rtx vr1[8];
+	  for (i = 0; i < n_elts / 2; i++)
+	    {
+	      vr1[i] = gen_reg_rtx (V2DImode);
+	      emit_insn (gen_vsx_concat_v2di (vr1[i], op[i * 2],
+					      op[i * 2 + 1]));
+	    }
+
+	  /* Pack vectors with 2 values into vectors with 4 values.
+	     eg: vr2[0] = { xxxAxxxB, xxxCxxxD };
+		 vr2[1] = { xxxExxxF, xxxGxxxH };
+		 vr2[1] = { xxxIxxxJ, xxxKxxxL };
+		 vr2[3] = { xxxMxxxN, xxxOxxxP };  */
+	  rtx vr2[4];
+	  for (i = 0; i < n_elts / 4; i++)
+	    {
+	      vr2[i] = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_altivec_vpkudum (vr2[i], vr1[i * 2],
+					      vr1[i * 2 + 1]));
+	    }
+
+	  /* Pack vectors with 4 values into vectors with 8 values.
+	     eg: vr3[0] = { xAxBxCxD, xExFxGxH };
+		 vr3[1] = { xIxJxKxL, xMxNxOxP };  */
+	  rtx vr3[2];
+	  for (i = 0; i < n_elts / 8; i++)
+	    {
+	      vr3[i] = gen_reg_rtx (V8HImode);
+	      emit_insn (gen_altivec_vpkuwum (vr3[i], vr2[i * 2],
+					      vr2[i * 2 + 1]));
+	    }
+
+	  /* If it's V8HImode, it's done and return it. */
+	  if (mode == V8HImode)
+	    {
+	      emit_insn (gen_rtx_SET (target, vr3[0]));
+	      return;
+	    }
+
+	  /* Pack vectors with 8 values into 16 values.  */
+	  rtx res = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_altivec_vpkuhum (res, vr3[0], vr3[1]));
+	  emit_insn (gen_rtx_SET (target, res));
+	}
+      else
+	{
+	  rtx (*merge_v16qi) (rtx, rtx, rtx) = NULL;
+	  rtx (*merge_v8hi) (rtx, rtx, rtx) = NULL;
+	  rtx (*merge_v4si) (rtx, rtx, rtx) = NULL;
+	  rtx perm_idx;
+
+	  /* Set up some common gen routines and values.  */
+	  if (BYTES_BIG_ENDIAN)
+	    {
+	      if (mode == V16QImode)
+		{
+		  merge_v16qi = gen_altivec_vmrghb;
+		  merge_v8hi = gen_altivec_vmrglh;
+		}
+	      else
+		merge_v8hi = gen_altivec_vmrghh;
+
+	      merge_v4si = gen_altivec_vmrglw;
+	      perm_idx = GEN_INT (3);
+	    }
+	  else
+	    {
+	      if (mode == V16QImode)
+		{
+		  merge_v16qi = gen_altivec_vmrglb;
+		  merge_v8hi = gen_altivec_vmrghh;
+		}
+	      else
+		merge_v8hi = gen_altivec_vmrglh;
+
+	      merge_v4si = gen_altivec_vmrghw;
+	      perm_idx = GEN_INT (0);
+	    }
+
+	  /* Move to VSX register with direct move.
+	     eg: vr_qi[0] = { xxxxxxxA, xxxxxxxx };
+		 vr_qi[1] = { xxxxxxxB, xxxxxxxx };
+		 ...
+		 vr_qi[15] = { xxxxxxxP, xxxxxxxx };  */
+	  rtx vr_qi[16];
+	  for (i = 0; i < n_elts; i++)
+	    {
+	      vr_qi[i] = gen_reg_rtx (V16QImode);
+	      if (TARGET_POWERPC64)
+		emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
+	      else
+		emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
+	    }
+
+	  /* Merge/move to vector short.
+	     eg: vr_hi[0] = { xxxxxxxx, xxxxxxAB };
+		 vr_hi[1] = { xxxxxxxx, xxxxxxCD };
+		 ...
+		 vr_hi[7] = { xxxxxxxx, xxxxxxOP };  */
+	  rtx vr_hi[8];
+	  for (i = 0; i < 8; i++)
+	    {
+	      rtx tmp = vr_qi[i];
+	      if (mode == V16QImode)
+		{
+		  tmp = gen_reg_rtx (V16QImode);
+		  emit_insn (merge_v16qi (tmp, vr_qi[2 * i], vr_qi[2 * i + 1]));
+		}
+	      vr_hi[i] = gen_reg_rtx (V8HImode);
+	      emit_move_insn (vr_hi[i], gen_lowpart (V8HImode, tmp));
+	    }
+
+	  /* Merge vector short to vector int.
+	     eg: vr_si[0] = { xxxxxxxx, xxxxABCD };
+		 vr_si[1] = { xxxxxxxx, xxxxEFGH };
+		 ...
+		 vr_si[3] = { xxxxxxxx, xxxxMNOP };  */
+	  rtx vr_si[4];
+	  for (i = 0; i < 4; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V8HImode);
+	      emit_insn (merge_v8hi (tmp, vr_hi[2 * i], vr_hi[2 * i + 1]));
+	      vr_si[i] = gen_reg_rtx (V4SImode);
+	      emit_move_insn (vr_si[i], gen_lowpart (V4SImode, tmp));
+	    }
+
+	  /* Merge vector int to vector long.
+	     eg: vr_di[0] = { xxxxxxxx, ABCDEFGH };
+		 vr_di[1] = { xxxxxxxx, IJKLMNOP };  */
+	  rtx vr_di[2];
+	  for (i = 0; i < 2; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V4SImode);
+	      emit_insn (merge_v4si (tmp, vr_si[2 * i], vr_si[2 * i + 1]));
+	      vr_di[i] = gen_reg_rtx (V2DImode);
+	      emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+	    }
+
+	  rtx res = gen_reg_rtx (V2DImode);
+	  emit_insn (gen_vsx_xxpermdi_v2di (res, vr_di[0], vr_di[1], perm_idx));
+	  emit_insn (gen_rtx_SET (target, gen_lowpart (mode, res)));
+	}
+
+      return;
+    }
+
   /* Construct the vector in memory one field at a time
      and load the whole vector.  */
   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 43b620ae1c0..4d6bd193a10 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8713,6 +8713,22 @@ (define_insn "p8_mtvsrwz"
   "mtvsrwz %x0,%1"
   [(set_attr "type" "mftgpr")])
 
+(define_insn "p8_mtvsrwz_v16qisi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRWZ))]
+  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrwz %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
+(define_insn "p8_mtvsrd_v16qidi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRD))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrd %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
 (define_insn_and_split "reload_fpr_from_gpr<mode>"
   [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
 	(unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-1.c b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
new file mode 100644
index 00000000000..27ff251e28c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short type values whether use 128bit
+   direct move instructions mtvsrdd on Power9 or later, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 24 } } */
+/* { dg-final { scan-assembler-times {\mvpkudum\M} 12 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-2.c b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
new file mode 100644
index 00000000000..e8d57d67aaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target powerpc_p8vector_ok } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short type values whether use direct
+   move instructions like mtvsrd/mtvsrwz on Power8, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 48 {target lp64 } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 48 {target {! lp64 } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-3.c b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
new file mode 100644
index 00000000000..fd0c6926c5f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short run successfully.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+
+int
+main ()
+{
+  unsigned char uc[16];
+  signed char sc[16];
+
+  for (int i = 0; i < 16; i++)
+    {
+      uc[i] = (unsigned char) (i * 2 + 1);
+      sc[i] = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned char ucv
+    = test_uchar (uc[0], uc[1], uc[2], uc[3], uc[4], uc[5], uc[6], uc[7], uc[8],
+		  uc[9], uc[10], uc[11], uc[12], uc[13], uc[14], uc[15]);
+  vector signed char scv
+    = test_schar (sc[0], sc[1], sc[2], sc[3], sc[4], sc[5], sc[6], sc[7], sc[8],
+		  sc[9], sc[10], sc[11], sc[12], sc[13], sc[14], sc[15]);
+
+  for (int i = 0; i < 16; i++)
+    {
+      unsigned char uexp = (unsigned char) (i * 2 + 1);
+      signed char sexp = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+      if (ucv[i] != uexp)
+	abort ();
+      if (scv[i] != sexp)
+	abort ();
+    }
+
+  unsigned short us[8];
+  signed short ss[8];
+  for (int i = 0; i < 8; i++)
+    {
+      us[i] = (unsigned short) (i * 2 + 1);
+      ss[i] = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned short usv
+    = test_ushort (us[0], us[1], us[2], us[3], us[4], us[5], us[6], us[7]);
+  vector signed short ssv
+    = test_sshort (ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7]);
+
+  for (int i = 0; i < 8; i++)
+    {
+      unsigned short uexp = (unsigned short) (i * 2 + 1);
+      signed short sexp = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+      if (usv[i] != uexp)
+	abort ();
+      if (ssv[i] != sexp)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933.h b/gcc/testsuite/gcc.target/powerpc/pr96933.h
new file mode 100644
index 00000000000..4bc2b941e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933.h
@@ -0,0 +1,50 @@
+/* Source file for pr96933-*.c testing, this mainly contains 4
+   functions as below:
+
+     - test_uchar  // vector unsigned char
+     - test_schar  // vector signed char
+     - test_ushort // vector unsigned short
+     - test_sshort // vector signed short
+*/
+
+__attribute__ ((noipa)) vector unsigned char
+test_uchar (unsigned char c1, unsigned char c2, unsigned char c3,
+	    unsigned char c4, unsigned char c5, unsigned char c6,
+	    unsigned char c7, unsigned char c8, unsigned char c9,
+	    unsigned char c10, unsigned char c11, unsigned char c12,
+	    unsigned char c13, unsigned char c14, unsigned char c15,
+	    unsigned char c16)
+{
+  vector unsigned char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed char
+test_schar (signed char c1, signed char c2, signed char c3, signed char c4,
+	    signed char c5, signed char c6, signed char c7, signed char c8,
+	    signed char c9, signed char c10, signed char c11, signed char c12,
+	    signed char c13, signed char c14, signed char c15, signed char c16)
+{
+  vector signed char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector unsigned short
+test_ushort (unsigned short c1, unsigned short c2, unsigned short c3,
+	     unsigned short c4, unsigned short c5, unsigned short c6,
+	     unsigned short c7, unsigned short c8)
+{
+  vector unsigned short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed short
+test_sshort (signed short c1, signed short c2, signed short c3,
+	     signed short c4, signed short c5, signed short c6,
+	     signed short c7, signed short c8)
+{
+  vector signed short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* PING^1 [PATCH v2] rs6000: Use direct move for char/short vector CTOR [PR96933]
  2020-09-10  3:19 ` [PATCH v2] " Kewen.Lin
@ 2020-10-13  6:59   ` Kewen.Lin
  2020-11-02  9:11     ` PING^2 " Kewen.Lin
  0 siblings, 1 reply; 8+ messages in thread
From: Kewen.Lin @ 2020-10-13  6:59 UTC (permalink / raw)
  To: gcc-patches; +Cc: Segher Boessenkool, Bill Schmidt, David Edelsohn

Hi,

I'd like to gentle ping this patch:

https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553555.html

BR,
Kewen

on 2020/9/10 上午11:19, Kewen.Lin via Gcc-patches wrote:
> Hi,
> 
> As Segher's suggestion in the PR, for 128bit_direct_move, this new
> version leverages vector pack insns instead of vector perms with
> one control vector.  The performance evaluation shows that it's on
> par with the previous version for char, while it's better than the
> previous for short.
> 
> Bootstrapped/regtested again on powerpc64{,le}-linux-gnu P8 and
> powerpc64le-linux-gnu P9.
> 
> Is it ok for trunk?
> 
> BR,
> Kewen
> ------------
> 
> gcc/ChangeLog:
> 
> 	PR target/96933
> 	* config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
> 	instructions for vector construction with char/short types.
> 	* config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
> 	(p8_mtvsrd_v16qidi2): Likewise. 
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR target/96933
> 	* gcc.target/powerpc/pr96933-1.c: New test.
> 	* gcc.target/powerpc/pr96933-2.c: New test.
> 	* gcc.target/powerpc/pr96933-3.c: New test.
> 	* gcc.target/powerpc/pr96933.h: New test.
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* PING^2 [PATCH v2] rs6000: Use direct move for char/short vector CTOR [PR96933]
  2020-10-13  6:59   ` PING^1 " Kewen.Lin
@ 2020-11-02  9:11     ` Kewen.Lin
  2020-11-02 13:44       ` David Edelsohn
  0 siblings, 1 reply; 8+ messages in thread
From: Kewen.Lin @ 2020-11-02  9:11 UTC (permalink / raw)
  To: gcc-patches; +Cc: Bill Schmidt, David Edelsohn, Segher Boessenkool

Hi,

Gentle ping for the patch:
  
https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553555.html

BR,
Kewen

on 2020/10/13 下午2:59, Kewen.Lin via Gcc-patches wrote:
> Hi,
> 
> I'd like to gentle ping this patch:
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553555.html
> 
> BR,
> Kewen
> 
> on 2020/9/10 上午11:19, Kewen.Lin via Gcc-patches wrote:
>> Hi,
>>
>> As Segher's suggestion in the PR, for 128bit_direct_move, this new
>> version leverages vector pack insns instead of vector perms with
>> one control vector.  The performance evaluation shows that it's on
>> par with the previous version for char, while it's better than the
>> previous for short.
>>
>> Bootstrapped/regtested again on powerpc64{,le}-linux-gnu P8 and
>> powerpc64le-linux-gnu P9.
>>
>> Is it ok for trunk?
>>
>> BR,
>> Kewen
>> ------------
>>
>> gcc/ChangeLog:
>>
>> 	PR target/96933
>> 	* config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
>> 	instructions for vector construction with char/short types.
>> 	* config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
>> 	(p8_mtvsrd_v16qidi2): Likewise. 
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	PR target/96933
>> 	* gcc.target/powerpc/pr96933-1.c: New test.
>> 	* gcc.target/powerpc/pr96933-2.c: New test.
>> 	* gcc.target/powerpc/pr96933-3.c: New test.
>> 	* gcc.target/powerpc/pr96933.h: New test.
>>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: PING^2 [PATCH v2] rs6000: Use direct move for char/short vector CTOR [PR96933]
  2020-11-02  9:11     ` PING^2 " Kewen.Lin
@ 2020-11-02 13:44       ` David Edelsohn
  2020-11-03  7:25         ` [PATCH v3] " Kewen.Lin
  0 siblings, 1 reply; 8+ messages in thread
From: David Edelsohn @ 2020-11-02 13:44 UTC (permalink / raw)
  To: Kewen.Lin; +Cc: GCC Patches, Bill Schmidt, Segher Boessenkool

On Mon, Nov 2, 2020 at 4:11 AM Kewen.Lin <linkw@linux.ibm.com> wrote:

> >> Hi,
> >>
> >> As Segher's suggestion in the PR, for 128bit_direct_move, this new
> >> version leverages vector pack insns instead of vector perms with
> >> one control vector.  The performance evaluation shows that it's on
> >> par with the previous version for char, while it's better than the
> >> previous for short.
> >>
> >> Bootstrapped/regtested again on powerpc64{,le}-linux-gnu P8 and
> >> powerpc64le-linux-gnu P9.
> >>
> >> Is it ok for trunk?

The patch looks fine to me, but I'll let Segher decide if it addresses
his requested changes.

I'm trying to be stricter about the test cases.

+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2" } */

Why does this test has_arch_pwr9 instead of adding -mdejagnu-cpu=power9?

+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target powerpc_p8vector_ok } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */

Please place powerpc_p8vector_ok on a separate dg-require-effective-target line.

+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
@@ -0,0 +1,63 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-O2" } */

Doesn't this need -mdejagnu-cpu=power8?

Thanks, David

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v3] rs6000: Use direct move for char/short vector CTOR [PR96933]
  2020-11-02 13:44       ` David Edelsohn
@ 2020-11-03  7:25         ` Kewen.Lin
  2020-11-03 16:45           ` Segher Boessenkool
  0 siblings, 1 reply; 8+ messages in thread
From: Kewen.Lin @ 2020-11-03  7:25 UTC (permalink / raw)
  To: David Edelsohn; +Cc: GCC Patches, Bill Schmidt, Segher Boessenkool

[-- Attachment #1: Type: text/plain, Size: 2345 bytes --]

Hi David,

Thanks for the review!

> The patch looks fine to me, but I'll let Segher decide if it addresses
> his requested changes.
> 
> I'm trying to be stricter about the test cases.
> 
> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-O2" } */
> 
> Why does this test has_arch_pwr9 instead of adding -mdejagnu-cpu=power9?
> 

I thought using -mdejagnu-cpu=power9 would force the case run with
power9 cpu all the time, while using has_arch_pwr9 seems to be more
flexible, it can be compiled with power9 or later (like -mcpu=power10),
we can check whether we generate unexpected code on power10 or later.
Does it sound good?

> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile { target powerpc_p8vector_ok } } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
> 
> Please place powerpc_p8vector_ok on a separate dg-require-effective-target line.
> 

Done.

> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
> @@ -0,0 +1,63 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target p8vector_hw } */
> +/* { dg-options "-O2" } */
> 
> Doesn't this need -mdejagnu-cpu=power8?

Thanks for catching!  Yes, it needs.  I was thinking to use one
case for both Power8 and Power9 runs, it passed the testings on
both machines.  But your question made me realize that it's
incorrect when we are doing testing on Power8 but pass some
external option like -mcpu=power9, it can generate power9 insns
which are illegal on the machine.

The new version leaves this case for Power8 run test and add one
more pr96933-4.c for Power9 run test.

BR,
Kewen
-----
gcc/ChangeLog:

	PR target/96933
	* config/rs6000/rs6000.c (rs6000_expand_vector_init): Use direct move
	instructions for vector construction with char/short types.
	* config/rs6000/rs6000.md (p8_mtvsrwz_v16qisi2): New define_insn.
	(p8_mtvsrd_v16qidi2): Likewise. 

gcc/testsuite/ChangeLog:

	PR target/96933
	* gcc.target/powerpc/pr96933-1.c: New test.
	* gcc.target/powerpc/pr96933-2.c: New test.
	* gcc.target/powerpc/pr96933-3.c: New test.
	* gcc.target/powerpc/pr96933-4.c: New test.
	* gcc.target/powerpc/pr96933.h: New test.
	* gcc.target/powerpc/pr96933-run.h: New test.

[-- Attachment #2: vec_CTORv3.diff --]
[-- Type: text/plain, Size: 13638 bytes --]

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index ca5b71ecdd3..b641ee49a49 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6411,11 +6411,11 @@ rs6000_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
-  int n_elts = GET_MODE_NUNITS (mode);
+  unsigned int n_elts = GET_MODE_NUNITS (mode);
   int n_var = 0, one_var = -1;
   bool all_same = true, all_const_zero = true;
   rtx x, mem;
-  int i;
+  unsigned int i;
 
   for (i = 0; i < n_elts; ++i)
     {
@@ -6681,6 +6681,181 @@ rs6000_expand_vector_init (rtx target, rtx vals)
       return;
     }
 
+  if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
+    {
+      rtx op[16];
+      /* Force the values into word_mode registers.  */
+      for (i = 0; i < n_elts; i++)
+	{
+	  rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
+	  if (TARGET_POWERPC64)
+	    {
+	      op[i] = gen_reg_rtx (DImode);
+	      emit_insn (gen_zero_extendqidi2 (op[i], tmp));
+	    }
+	  else
+	    {
+	      op[i] = gen_reg_rtx (SImode);
+	      emit_insn (gen_zero_extendqisi2 (op[i], tmp));
+	    }
+	}
+
+      /* Take unsigned char big endianness on 64bit as example for below
+	 construction, the input values are: A, B, C, D, ..., O, P.  */
+
+      if (TARGET_DIRECT_MOVE_128)
+	{
+	  /* Move to VSX register with vec_concat, each has 2 values.
+	     eg: vr1[0] = { xxxxxxxA, xxxxxxxB };
+		 vr1[1] = { xxxxxxxC, xxxxxxxD };
+		 ...
+		 vr1[7] = { xxxxxxxO, xxxxxxxP };  */
+	  rtx vr1[8];
+	  for (i = 0; i < n_elts / 2; i++)
+	    {
+	      vr1[i] = gen_reg_rtx (V2DImode);
+	      emit_insn (gen_vsx_concat_v2di (vr1[i], op[i * 2],
+					      op[i * 2 + 1]));
+	    }
+
+	  /* Pack vectors with 2 values into vectors with 4 values.
+	     eg: vr2[0] = { xxxAxxxB, xxxCxxxD };
+		 vr2[1] = { xxxExxxF, xxxGxxxH };
+		 vr2[1] = { xxxIxxxJ, xxxKxxxL };
+		 vr2[3] = { xxxMxxxN, xxxOxxxP };  */
+	  rtx vr2[4];
+	  for (i = 0; i < n_elts / 4; i++)
+	    {
+	      vr2[i] = gen_reg_rtx (V4SImode);
+	      emit_insn (gen_altivec_vpkudum (vr2[i], vr1[i * 2],
+					      vr1[i * 2 + 1]));
+	    }
+
+	  /* Pack vectors with 4 values into vectors with 8 values.
+	     eg: vr3[0] = { xAxBxCxD, xExFxGxH };
+		 vr3[1] = { xIxJxKxL, xMxNxOxP };  */
+	  rtx vr3[2];
+	  for (i = 0; i < n_elts / 8; i++)
+	    {
+	      vr3[i] = gen_reg_rtx (V8HImode);
+	      emit_insn (gen_altivec_vpkuwum (vr3[i], vr2[i * 2],
+					      vr2[i * 2 + 1]));
+	    }
+
+	  /* If it's V8HImode, it's done and return it. */
+	  if (mode == V8HImode)
+	    {
+	      emit_insn (gen_rtx_SET (target, vr3[0]));
+	      return;
+	    }
+
+	  /* Pack vectors with 8 values into 16 values.  */
+	  rtx res = gen_reg_rtx (V16QImode);
+	  emit_insn (gen_altivec_vpkuhum (res, vr3[0], vr3[1]));
+	  emit_insn (gen_rtx_SET (target, res));
+	}
+      else
+	{
+	  rtx (*merge_v16qi) (rtx, rtx, rtx) = NULL;
+	  rtx (*merge_v8hi) (rtx, rtx, rtx) = NULL;
+	  rtx (*merge_v4si) (rtx, rtx, rtx) = NULL;
+	  rtx perm_idx;
+
+	  /* Set up some common gen routines and values.  */
+	  if (BYTES_BIG_ENDIAN)
+	    {
+	      if (mode == V16QImode)
+		{
+		  merge_v16qi = gen_altivec_vmrghb;
+		  merge_v8hi = gen_altivec_vmrglh;
+		}
+	      else
+		merge_v8hi = gen_altivec_vmrghh;
+
+	      merge_v4si = gen_altivec_vmrglw;
+	      perm_idx = GEN_INT (3);
+	    }
+	  else
+	    {
+	      if (mode == V16QImode)
+		{
+		  merge_v16qi = gen_altivec_vmrglb;
+		  merge_v8hi = gen_altivec_vmrghh;
+		}
+	      else
+		merge_v8hi = gen_altivec_vmrglh;
+
+	      merge_v4si = gen_altivec_vmrghw;
+	      perm_idx = GEN_INT (0);
+	    }
+
+	  /* Move to VSX register with direct move.
+	     eg: vr_qi[0] = { xxxxxxxA, xxxxxxxx };
+		 vr_qi[1] = { xxxxxxxB, xxxxxxxx };
+		 ...
+		 vr_qi[15] = { xxxxxxxP, xxxxxxxx };  */
+	  rtx vr_qi[16];
+	  for (i = 0; i < n_elts; i++)
+	    {
+	      vr_qi[i] = gen_reg_rtx (V16QImode);
+	      if (TARGET_POWERPC64)
+		emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
+	      else
+		emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
+	    }
+
+	  /* Merge/move to vector short.
+	     eg: vr_hi[0] = { xxxxxxxx, xxxxxxAB };
+		 vr_hi[1] = { xxxxxxxx, xxxxxxCD };
+		 ...
+		 vr_hi[7] = { xxxxxxxx, xxxxxxOP };  */
+	  rtx vr_hi[8];
+	  for (i = 0; i < 8; i++)
+	    {
+	      rtx tmp = vr_qi[i];
+	      if (mode == V16QImode)
+		{
+		  tmp = gen_reg_rtx (V16QImode);
+		  emit_insn (merge_v16qi (tmp, vr_qi[2 * i], vr_qi[2 * i + 1]));
+		}
+	      vr_hi[i] = gen_reg_rtx (V8HImode);
+	      emit_move_insn (vr_hi[i], gen_lowpart (V8HImode, tmp));
+	    }
+
+	  /* Merge vector short to vector int.
+	     eg: vr_si[0] = { xxxxxxxx, xxxxABCD };
+		 vr_si[1] = { xxxxxxxx, xxxxEFGH };
+		 ...
+		 vr_si[3] = { xxxxxxxx, xxxxMNOP };  */
+	  rtx vr_si[4];
+	  for (i = 0; i < 4; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V8HImode);
+	      emit_insn (merge_v8hi (tmp, vr_hi[2 * i], vr_hi[2 * i + 1]));
+	      vr_si[i] = gen_reg_rtx (V4SImode);
+	      emit_move_insn (vr_si[i], gen_lowpart (V4SImode, tmp));
+	    }
+
+	  /* Merge vector int to vector long.
+	     eg: vr_di[0] = { xxxxxxxx, ABCDEFGH };
+		 vr_di[1] = { xxxxxxxx, IJKLMNOP };  */
+	  rtx vr_di[2];
+	  for (i = 0; i < 2; i++)
+	    {
+	      rtx tmp = gen_reg_rtx (V4SImode);
+	      emit_insn (merge_v4si (tmp, vr_si[2 * i], vr_si[2 * i + 1]));
+	      vr_di[i] = gen_reg_rtx (V2DImode);
+	      emit_move_insn (vr_di[i], gen_lowpart (V2DImode, tmp));
+	    }
+
+	  rtx res = gen_reg_rtx (V2DImode);
+	  emit_insn (gen_vsx_xxpermdi_v2di (res, vr_di[0], vr_di[1], perm_idx));
+	  emit_insn (gen_rtx_SET (target, gen_lowpart (mode, res)));
+	}
+
+      return;
+    }
+
   /* Construct the vector in memory one field at a time
      and load the whole vector.  */
   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 43b620ae1c0..4d6bd193a10 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8713,6 +8713,22 @@
   "mtvsrwz %x0,%1"
   [(set_attr "type" "mftgpr")])
 
+(define_insn "p8_mtvsrwz_v16qisi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRWZ))]
+  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrwz %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
+(define_insn "p8_mtvsrd_v16qidi2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+    (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
+		  UNSPEC_P8V_MTVSRD))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrd %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
 (define_insn_and_split "reload_fpr_from_gpr<mode>"
   [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
 	(unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-1.c b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
new file mode 100644
index 00000000000..27ff251e28c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2" } */
+
+/* Test vector constructions with char/short type values whether use 128bit
+   direct move instructions mtvsrdd on Power9 or later, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrdd\M} 24 } } */
+/* { dg-final { scan-assembler-times {\mvpkudum\M} 12 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-2.c b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
new file mode 100644
index 00000000000..cef8fbd4f35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short type values whether use direct
+   move instructions like mtvsrd/mtvsrwz on Power8, rather than transfering
+   with memory store/load with stb/sth and vector load.  */
+
+#include "pr96933.h"
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 48 {target lp64 } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrwz\M} 48 {target {! lp64 } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 4 } } */
+/* { dg-final { scan-assembler-not {\mstb\M} } } */
+/* { dg-final { scan-assembler-not {\msth\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-3.c b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
new file mode 100644
index 00000000000..3e5709ab0d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
@@ -0,0 +1,10 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p8vector_hw } */
+/* { dg-options "-O2 -mdejagnu-cpu=power8" } */
+
+/* Test vector constructions with char/short run successfully on Power8.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+#include "pr96933-run.h"
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-4.c b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c
new file mode 100644
index 00000000000..5a1c3d0c857
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-4.c
@@ -0,0 +1,10 @@
+/* { dg-do run } */
+/* { dg-require-effective-target p9vector_hw } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+/* Test vector constructions with char/short run successfully on Power9.  */
+
+#include <stdlib.h>
+#include "pr96933.h"
+#include "pr96933-run.h"
+
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933-run.h b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h
new file mode 100644
index 00000000000..7fa8dac639c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933-run.h
@@ -0,0 +1,56 @@
+/* Test function for pr96933-{3,4}.c run result verification.  */
+
+int
+main ()
+{
+  unsigned char uc[16];
+  signed char sc[16];
+
+  for (int i = 0; i < 16; i++)
+    {
+      uc[i] = (unsigned char) (i * 2 + 1);
+      sc[i] = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned char ucv
+    = test_uchar (uc[0], uc[1], uc[2], uc[3], uc[4], uc[5], uc[6], uc[7], uc[8],
+		  uc[9], uc[10], uc[11], uc[12], uc[13], uc[14], uc[15]);
+  vector signed char scv
+    = test_schar (sc[0], sc[1], sc[2], sc[3], sc[4], sc[5], sc[6], sc[7], sc[8],
+		  sc[9], sc[10], sc[11], sc[12], sc[13], sc[14], sc[15]);
+
+  for (int i = 0; i < 16; i++)
+    {
+      unsigned char uexp = (unsigned char) (i * 2 + 1);
+      signed char sexp = (signed char) ((i % 2 == 0) ? (i + 1) : -i);
+      if (ucv[i] != uexp)
+	abort ();
+      if (scv[i] != sexp)
+	abort ();
+    }
+
+  unsigned short us[8];
+  signed short ss[8];
+  for (int i = 0; i < 8; i++)
+    {
+      us[i] = (unsigned short) (i * 2 + 1);
+      ss[i] = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+    }
+
+  vector unsigned short usv
+    = test_ushort (us[0], us[1], us[2], us[3], us[4], us[5], us[6], us[7]);
+  vector signed short ssv
+    = test_sshort (ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7]);
+
+  for (int i = 0; i < 8; i++)
+    {
+      unsigned short uexp = (unsigned short) (i * 2 + 1);
+      signed short sexp = (signed short) ((i % 2 == 0) ? (i + 1) : -i);
+      if (usv[i] != uexp)
+	abort ();
+      if (ssv[i] != sexp)
+	abort ();
+    }
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr96933.h b/gcc/testsuite/gcc.target/powerpc/pr96933.h
new file mode 100644
index 00000000000..4bc2b941e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr96933.h
@@ -0,0 +1,50 @@
+/* Source file for pr96933-*.c testing, this mainly contains 4
+   functions as below:
+
+     - test_uchar  // vector unsigned char
+     - test_schar  // vector signed char
+     - test_ushort // vector unsigned short
+     - test_sshort // vector signed short
+*/
+
+__attribute__ ((noipa)) vector unsigned char
+test_uchar (unsigned char c1, unsigned char c2, unsigned char c3,
+	    unsigned char c4, unsigned char c5, unsigned char c6,
+	    unsigned char c7, unsigned char c8, unsigned char c9,
+	    unsigned char c10, unsigned char c11, unsigned char c12,
+	    unsigned char c13, unsigned char c14, unsigned char c15,
+	    unsigned char c16)
+{
+  vector unsigned char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed char
+test_schar (signed char c1, signed char c2, signed char c3, signed char c4,
+	    signed char c5, signed char c6, signed char c7, signed char c8,
+	    signed char c9, signed char c10, signed char c11, signed char c12,
+	    signed char c13, signed char c14, signed char c15, signed char c16)
+{
+  vector signed char v
+    = {c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16};
+  return v;
+}
+
+__attribute__ ((noipa)) vector unsigned short
+test_ushort (unsigned short c1, unsigned short c2, unsigned short c3,
+	     unsigned short c4, unsigned short c5, unsigned short c6,
+	     unsigned short c7, unsigned short c8)
+{
+  vector unsigned short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}
+
+__attribute__ ((noipa)) vector signed short
+test_sshort (signed short c1, signed short c2, signed short c3,
+	     signed short c4, signed short c5, signed short c6,
+	     signed short c7, signed short c8)
+{
+  vector signed short v = {c1, c2, c3, c4, c5, c6, c7, c8};
+  return v;
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] rs6000: Use direct move for char/short vector CTOR [PR96933]
  2020-11-03  7:25         ` [PATCH v3] " Kewen.Lin
@ 2020-11-03 16:45           ` Segher Boessenkool
  2020-11-05  8:27             ` Kewen.Lin
  0 siblings, 1 reply; 8+ messages in thread
From: Segher Boessenkool @ 2020-11-03 16:45 UTC (permalink / raw)
  To: Kewen.Lin; +Cc: David Edelsohn, GCC Patches, Bill Schmidt

Hi!

On Tue, Nov 03, 2020 at 03:25:19PM +0800, Kewen.Lin wrote:
> > I'm trying to be stricter about the test cases.
> > 
> > +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
> > @@ -0,0 +1,14 @@
> > +/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
> > +/* { dg-require-effective-target powerpc_p9vector_ok } */
> > +/* { dg-options "-O2" } */
> > 
> > Why does this test has_arch_pwr9 instead of adding -mdejagnu-cpu=power9?
> 
> I thought using -mdejagnu-cpu=power9 would force the case run with
> power9 cpu all the time, while using has_arch_pwr9 seems to be more
> flexible, it can be compiled with power9 or later (like -mcpu=power10),
> we can check whether we generate unexpected code on power10 or later.
> Does it sound good?

It will not run at all if your compiler (or testsuite invocation) does
not use at least power9.  Since the default for powerpc64-linux is
power4, and that for powerpc64le-linux is power8, this will happen for
many people (not to mention that it is extra important to test the
default setup, of course).

It probably would be useful if there was some convenient way to say
"use at least -mcpu=power9 for this, but some later cpu is fine too" --
but there is no such thing yet.

Using something like that might cause more maintenance issues later, see
"pstb" below for example, but that is not really an argument against
fixing this.

> > +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-3.c
> > @@ -0,0 +1,63 @@
> > +/* { dg-do run } */
> > +/* { dg-require-effective-target p8vector_hw } */
> > +/* { dg-options "-O2" } */
> > 
> > Doesn't this need -mdejagnu-cpu=power8?
> 
> Thanks for catching!  Yes, it needs.  I was thinking to use one
> case for both Power8 and Power9 runs, it passed the testings on
> both machines.  But your question made me realize that it's
> incorrect when we are doing testing on Power8 but pass some
> external option like -mcpu=power9, it can generate power9 insns
> which are illegal on the machine.

If the compiler defaults to (say) -mcpu=power7, it will generate code
for that the way the testcase is set up now (and it will not run on
machines before power8, but that is separate).

> +  if (TARGET_DIRECT_MOVE && (mode == V16QImode || mode == V8HImode))
> +    {
> +      rtx op[16];
> +      /* Force the values into word_mode registers.  */
> +      for (i = 0; i < n_elts; i++)
> +	{
> +	  rtx tmp = force_reg (GET_MODE_INNER (mode), XVECEXP (vals, 0, i));
> +	  if (TARGET_POWERPC64)
> +	    {
> +	      op[i] = gen_reg_rtx (DImode);
> +	      emit_insn (gen_zero_extendqidi2 (op[i], tmp));
> +	    }
> +	  else
> +	    {
> +	      op[i] = gen_reg_rtx (SImode);
> +	      emit_insn (gen_zero_extendqisi2 (op[i], tmp));
> +	    }
> +	}

TARGET_POWERPC64 should be TARGET_64BIT afaics?  (See below.)

You can use Pmode then, too.  The zero_extend thing can be handled by
changing
  (define_insn "zero_extendqi<mode>2"
to
  (define_insn "@zero_extendqi<mode>2"
(and no other changes needed), and then calling
  emit_insn (gen_zero_extendqi2 (Pmode, op[i], tmp));
(or so is the theory.  This might need some other changes, and also all
other gen_zero_extendqi* callers need to change, so that is a separate
patch if you want to try.  This isn't so bad right now.)

> +	  for (i = 0; i < n_elts; i++)
> +	    {
> +	      vr_qi[i] = gen_reg_rtx (V16QImode);
> +	      if (TARGET_POWERPC64)
> +		emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
> +	      else
> +		emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
> +	    }

TARGET_64BIT here as well.

TARGET_POWERPC64 means the current machine has the 64-bit insns.  It
does not mean the code will run in 64-bit mode (e.g. -m32 -mpowerpc64 is
just fine, and can be useful), but it also does not mean the OS (libc,
kernel, etc.) will actually save the full 64-bit registers -- making it
only useful on Darwin currently.

(You *can* run all of the testsuite flawlessly on Linux with those
options, but that only works because those are small, short-running
programs.  More "real", bigger and more complex programs fail in strange
and exciting ways!)

It's a pity the pre-p9 code cannot reuse most of what we do for p9.

> +(define_insn "p8_mtvsrwz_v16qisi2"
> +  [(set (match_operand:V16QI 0 "register_operand" "=wa")
> +    (unspec:V16QI [(match_operand:SI 1 "register_operand" "r")]
> +		  UNSPEC_P8V_MTVSRWZ))]
> +  "!TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> +  "mtvsrwz %x0,%1"
> +  [(set_attr "type" "mftgpr")])
> +
> +(define_insn "p8_mtvsrd_v16qidi2"
> +  [(set (match_operand:V16QI 0 "register_operand" "=wa")
> +    (unspec:V16QI [(match_operand:DI 1 "register_operand" "r")]
> +		  UNSPEC_P8V_MTVSRD))]
> +  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> +  "mtvsrd %x0,%1"
> +  [(set_attr "type" "mftgpr")])

TARGET_POWERPC64 is fine for these, btw.  You just cannot decide to put
a DImode in a register based on only this -- but if that has been
decided already, it is just fine.

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-O2" } */

As David said:

/* { dg-do compile } */
/* { dg-require-effective-target lp64 } */
/* { dg-require-effective-target has_arch_pwr9 } */
/* { dg-require-effective-target powerpc_p9vector_ok } */
/* { dg-options "-O2 -mdejagnu-cpu=power9" } */

But, you probably don't want that has_arch_pwr9 line at all, this is a
compile test?

> +/* { dg-final { scan-assembler-not {\mstb\M} } } */
> +/* { dg-final { scan-assembler-not {\msth\M} } } */

Btw, if this didn't for -mcpu=power9, you probably would need to allow
prefixed stores here, like  {\mp?stb\M} .

So, okay for trunk with those TARGET_POWERPC64 fixed, and that one
remaining testcase.  Thanks!


Segher

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v3] rs6000: Use direct move for char/short vector CTOR [PR96933]
  2020-11-03 16:45           ` Segher Boessenkool
@ 2020-11-05  8:27             ` Kewen.Lin
  0 siblings, 0 replies; 8+ messages in thread
From: Kewen.Lin @ 2020-11-05  8:27 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: David Edelsohn, GCC Patches, Bill Schmidt

Hi Segher,

Thanks for the review!

>>> Why does this test has_arch_pwr9 instead of adding -mdejagnu-cpu=power9?
>>
>> I thought using -mdejagnu-cpu=power9 would force the case run with
>> power9 cpu all the time, while using has_arch_pwr9 seems to be more
>> flexible, it can be compiled with power9 or later (like -mcpu=power10),
>> we can check whether we generate unexpected code on power10 or later.
>> Does it sound good?
> 
> It will not run at all if your compiler (or testsuite invocation) does
> not use at least power9.  Since the default for powerpc64-linux is
> power4, and that for powerpc64le-linux is power8, this will happen for
> many people (not to mention that it is extra important to test the
> default setup, of course).
> 

Good point!  has_arch_pwr9 can cause fewer test coverage if the default
arch is less than power9.

> It probably would be useful if there was some convenient way to say
> "use at least -mcpu=power9 for this, but some later cpu is fine too" --
> but there is no such thing yet.
> 
> Using something like that might cause more maintenance issues later, see
> "pstb" below for example, but that is not really an argument against
> fixing this.

Yeah, thanks for the good example!

>> +	  if (TARGET_POWERPC64)
>> +	    {
>> +	      op[i] = gen_reg_rtx (DImode);
>> +	      emit_insn (gen_zero_extendqidi2 (op[i], tmp));
>> +	    }
>> +	  else
>> +	    {
>> +	      op[i] = gen_reg_rtx (SImode);
>> +	      emit_insn (gen_zero_extendqisi2 (op[i], tmp));
>> +	    }
>> +	}
> 
> TARGET_POWERPC64 should be TARGET_64BIT afaics?  (See below.)

Yes, fixed.

> 
> You can use Pmode then, too.  The zero_extend thing can be handled by
> changing
>   (define_insn "zero_extendqi<mode>2"
> to
>   (define_insn "@zero_extendqi<mode>2"
> (and no other changes needed), and then calling
>   emit_insn (gen_zero_extendqi2 (Pmode, op[i], tmp));
> (or so is the theory.  This might need some other changes, and also all
> other gen_zero_extendqi* callers need to change, so that is a separate
> patch if you want to try.  This isn't so bad right now.)

Will deal with this in a separate patch.

> 
>> +	  for (i = 0; i < n_elts; i++)
>> +	    {
>> +	      vr_qi[i] = gen_reg_rtx (V16QImode);
>> +	      if (TARGET_POWERPC64)
>> +		emit_insn (gen_p8_mtvsrd_v16qidi2 (vr_qi[i], op[i]));
>> +	      else
>> +		emit_insn (gen_p8_mtvsrwz_v16qisi2 (vr_qi[i], op[i]));
>> +	    }
> 
> TARGET_64BIT here as well.
> 
> TARGET_POWERPC64 means the current machine has the 64-bit insns.  It
> does not mean the code will run in 64-bit mode (e.g. -m32 -mpowerpc64 is
> just fine, and can be useful), but it also does not mean the OS (libc,
> kernel, etc.) will actually save the full 64-bit registers -- making it
> only useful on Darwin currently.
> 
> (You *can* run all of the testsuite flawlessly on Linux with those
> options, but that only works because those are small, short-running
> programs.  More "real", bigger and more complex programs fail in strange
> and exciting ways!)

Fixed as well.  Thanks for the detailed explanation!

>> +++ b/gcc/testsuite/gcc.target/powerpc/pr96933-1.c
>> @@ -0,0 +1,14 @@
>> +/* { dg-do compile { target { lp64 && has_arch_pwr9 } } } */
>> +/* { dg-require-effective-target powerpc_p9vector_ok } */
>> +/* { dg-options "-O2" } */
> 
> As David said:
> 
> /* { dg-do compile } */
> /* { dg-require-effective-target lp64 } */
> /* { dg-require-effective-target has_arch_pwr9 } */
> /* { dg-require-effective-target powerpc_p9vector_ok } */
> /* { dg-options "-O2 -mdejagnu-cpu=power9" } */
> 

Updated.  But I guessed the reason why we recommend to use single
line effective-target: more clear for people to read?  easier to
find the check result in test debugging verbose dump content?
Anything else I missed?

> But, you probably don't want that has_arch_pwr9 line at all, this is a
> compile test?

Yeah, removed.

> 
> So, okay for trunk with those TARGET_POWERPC64 fixed, and that one
> remaining testcase.  Thanks!

Thanks!  Bootstrapped/regress-tested on powerpc64le-linux-gnu P8/P9
and powerpc64-linux-gnu P8 again and committed in r11-4731.

BR,
Kewen

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2020-11-05  8:27 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-08  7:55 [PATCH] rs6000: Use direct move for char/short vector CTOR [PR96933] Kewen.Lin
2020-09-10  3:19 ` [PATCH v2] " Kewen.Lin
2020-10-13  6:59   ` PING^1 " Kewen.Lin
2020-11-02  9:11     ` PING^2 " Kewen.Lin
2020-11-02 13:44       ` David Edelsohn
2020-11-03  7:25         ` [PATCH v3] " Kewen.Lin
2020-11-03 16:45           ` Segher Boessenkool
2020-11-05  8:27             ` Kewen.Lin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).