From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <meissner@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1005)
	id 36033386102E; Wed, 14 Feb 2024 23:12:55 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 36033386102E
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1707952375;
	bh=I53O2eAalfohwVWh0HsfUmwLlmHQ3uorrysXEdCwIzE=;
	h=From:To:Subject:Date:From;
	b=rvwFZ1cPvO8KNDYqoJaL/VOSAjTSaLKE/OaV7b/ZD7NnumamFkzS/DNgoFjEUyoxb
	 NA4YCdMjR5gMHir4Njx0MMxf3OzAaZkUd6RQCtLezcNxFRumAEZI0YF8O0PJU/tYHq
	 ChSo019dANjBkNkOkrkbFWDBO2WQK6snNqM4lfws=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Michael Meissner <meissner@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/meissner/heads/work154-dmf)] Support load/store
 vector with right length.
X-Act-Checkin: gcc
X-Git-Author: Michael Meissner <meissner@linux.ibm.com>
X-Git-Refname: refs/users/meissner/heads/work154-dmf
X-Git-Oldrev: 6ada24f5406c585d0672ae58609f54529c2922db
X-Git-Newrev: d9032d1942d08a9549b49b84ca0965dc993d33ea
Message-Id: <20240214231255.36033386102E@sourceware.org>
Date: Wed, 14 Feb 2024 23:12:55 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:d9032d1942d08a9549b49b84ca0965dc993d33ea

commit d9032d1942d08a9549b49b84ca0965dc993d33ea
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Tue Jan 23 00:19:45 2024 -0500

    Support load/store vector with right length.
    
    This patch adds support for new instructions that may be added to the PowerPC
    architecture in the future to enhance the load and store vector with length
    instructions.
    
    The current instructions (lxvl, lxvll, stxvl, and stxvll) are inconvient to use
    since the count for the number of bytes must be in the top 8 bits of the GPR
    register, instead of the bottom 8 bits.  This meant that code generating these
    instructions typically had to do a shift left by 56 bits to get the count into
    the right position.  In a future version of the PowerPC architecture, new
    variants of these instructions might be added that expect the count to be in
    the bottom 8 bits of the GPR register.  These patches add this support to GCC
    if the user uses the -mcpu=future option.
    
    I discovered that the code in rs6000-string.cc to generate ISA 3.1 lxvl/stxvl
    future lxvll/stxvll instructions would generate these instructions on 32-bit.
    However the patterns for these instructions is only done on 64-bit systems.  So
    I added a check for 64-bit support before generating the instructions.
    
    The patches have been tested on both little and big endian systems.  Can I check
    it into the master branch?
    
    2024-01-23   Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            * config/rs6000/rs6000-string.cc (expand_block_move): Do not generate
            lxvl and stxvl on 32-bit.
            * config/rs6000/vsx.md (lxvl): If -mcpu=future, generate the lxvl with
            the shift count automaticaly used in the insn.
            (lxvrl): New insn for -mcpu=future.
            (lxvrll): Likewise.
            (stxvl): If -mcpu=future, generate the stxvl with the shift count
            automaticaly used in the insn.
            (stxvrl): New insn for -mcpu=future.
            (stxvrll): Likewise.
    
    gcc/testsuite/
    
            * gcc.target/powerpc/lxvrl.c: New test.
            * lib/target-supports.exp (check_effective_target_powerpc_future_ok):
            New effective target.

Diff:
---
 gcc/config/rs6000/rs6000-string.cc       |   1 +
 gcc/config/rs6000/vsx.md                 | 122 +++++++++++++++++++++++++------
 gcc/testsuite/gcc.target/powerpc/lxvrl.c |  32 ++++++++
 gcc/testsuite/lib/target-supports.exp    |  16 +++-
 4 files changed, 148 insertions(+), 23 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
index f707bb2727ee..1845860d7d04 100644
--- a/gcc/config/rs6000/rs6000-string.cc
+++ b/gcc/config/rs6000/rs6000-string.cc
@@ -2783,6 +2783,7 @@ expand_block_move (rtx operands[], bool might_overlap)
 
       if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
 	  && TARGET_BLOCK_OPS_VECTOR_PAIR
+	  && TARGET_POWERPC64
 	  && bytes >= 32
 	  && (align >= 256 || !STRICT_ALIGNMENT))
 	{
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 6111cc90eb74..42e9976e95d3 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -5629,20 +5629,32 @@
   DONE;
 })
 
-;; Load VSX Vector with Length
+;; Load VSX Vector with Length.  If we have lxvrl, we don't have to do an
+;; explicit shift left into a pseudo.
 (define_expand "lxvl"
-  [(set (match_dup 3)
-        (ashift:DI (match_operand:DI 2 "register_operand")
-                   (const_int 56)))
-   (set (match_operand:V16QI 0 "vsx_register_operand")
-	(unspec:V16QI
-	 [(match_operand:DI 1 "gpc_reg_operand")
-          (mem:V16QI (match_dup 1))
-	  (match_dup 3)]
-	 UNSPEC_LXVL))]
+  [(use (match_operand:V16QI 0 "vsx_register_operand"))
+   (use (match_operand:DI 1 "gpc_reg_operand"))
+   (use (match_operand:DI 2 "gpc_reg_operand"))]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
-  operands[3] = gen_reg_rtx (DImode);
+  rtx shift_len = gen_rtx_ASHIFT (DImode, operands[2], GEN_INT (56));
+  rtx len;
+
+  if (TARGET_FUTURE)
+    len = shift_len;
+  else
+    {
+      len = gen_reg_rtx (DImode);
+      emit_insn (gen_rtx_SET (len, shift_len));
+    }
+
+  rtx dest = operands[0];
+  rtx addr = operands[1];
+  rtx mem = gen_rtx_MEM (V16QImode, addr);
+  rtvec rv = gen_rtvec (3, addr, mem, len);
+  rtx lxvl = gen_rtx_UNSPEC (V16QImode, rv, UNSPEC_LXVL);
+  emit_insn (gen_rtx_SET (dest, lxvl));
+  DONE;
 })
 
 (define_insn "*lxvl"
@@ -5666,6 +5678,34 @@
   "lxvll %x0,%1,%2"
   [(set_attr "type" "vecload")])
 
+;; For lxvrl and lxvrll, use the combiner to eliminate the shift.  The
+;; define_expand for lxvl will already incorporate the shift in generating the
+;; insn.  The lxvll buitl-in function required the user to have already done
+;; the shift.  Defining lxvrll this way, will optimize cases where the user has
+;; done the shift immediately before the built-in.
+(define_insn "*lxvrl"
+  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
+	(unspec:V16QI
+	 [(match_operand:DI 1 "gpc_reg_operand" "b")
+	  (mem:V16QI (match_dup 1))
+	  (ashift:DI (match_operand:DI 2 "register_operand" "r")
+		     (const_int 56))]
+	 UNSPEC_LXVL))]
+  "TARGET_FUTURE && TARGET_64BIT"
+  "lxvrl %x0,%1,%2"
+  [(set_attr "type" "vecload")])
+
+(define_insn "*lxvrll"
+  [(set (match_operand:V16QI 0 "vsx_register_operand" "=wa")
+	(unspec:V16QI [(match_operand:DI 1 "gpc_reg_operand" "b")
+                       (mem:V16QI (match_dup 1))
+		       (ashift:DI (match_operand:DI 2 "register_operand" "r")
+				  (const_int 56))]
+		      UNSPEC_LXVLL))]
+  "TARGET_FUTURE"
+  "lxvrll %x0,%1,%2"
+  [(set_attr "type" "vecload")])
+
 ;; Expand for builtin xl_len_r
 (define_expand "xl_len_r"
   [(match_operand:V16QI 0 "vsx_register_operand")
@@ -5697,18 +5737,29 @@
 
 ;; Store VSX Vector with Length
 (define_expand "stxvl"
-  [(set (match_dup 3)
-	(ashift:DI (match_operand:DI 2 "register_operand")
-		   (const_int 56)))
-   (set (mem:V16QI (match_operand:DI 1 "gpc_reg_operand"))
-	(unspec:V16QI
-	 [(match_operand:V16QI 0 "vsx_register_operand")
-	  (mem:V16QI (match_dup 1))
-	  (match_dup 3)]
-	 UNSPEC_STXVL))]
+  [(use (match_operand:V16QI 0 "vsx_register_operand"))
+   (use (match_operand:DI 1 "gpc_reg_operand"))
+   (use (match_operand:DI 2 "gpc_reg_operand"))]
   "TARGET_P9_VECTOR && TARGET_64BIT"
 {
-  operands[3] = gen_reg_rtx (DImode);
+  rtx shift_len = gen_rtx_ASHIFT (DImode, operands[2], GEN_INT (56));
+  rtx len;
+
+  if (TARGET_FUTURE)
+    len = shift_len;
+  else
+    {
+      len = gen_reg_rtx (DImode);
+      emit_insn (gen_rtx_SET (len, shift_len));
+    }
+
+  rtx src = operands[0];
+  rtx addr = operands[1];
+  rtx mem = gen_rtx_MEM (V16QImode, addr);
+  rtvec rv = gen_rtvec (3, src, mem, len);
+  rtx stxvl = gen_rtx_UNSPEC (V16QImode, rv, UNSPEC_STXVL);
+  emit_insn (gen_rtx_SET (mem, stxvl));
+  DONE;
 })
 
 ;; Define optab for vector access with length vectorization exploitation.
@@ -5752,6 +5803,35 @@
   "stxvl %x0,%1,%2"
   [(set_attr "type" "vecstore")])
 
+;; For stxvrl and stxvrll, use the combiner to eliminate the shift.  The
+;; define_expand for stxvl will already incorporate the shift in generating the
+;; insn.  The stxvll buitl-in function required the user to have already done
+;; the shift.  Defining stxvrll this way, will optimize cases where the user
+;; has done the shift immediately before the built-in.
+
+(define_insn "*stxvrl"
+  [(set (mem:V16QI (match_operand:DI 1 "gpc_reg_operand" "b"))
+	(unspec:V16QI
+	 [(match_operand:V16QI 0 "vsx_register_operand" "wa")
+	  (mem:V16QI (match_dup 1))
+	  (ashift:DI (match_operand:DI 2 "register_operand" "r")
+		     (const_int 56))]
+	 UNSPEC_STXVL))]
+  "TARGET_FUTURE && TARGET_64BIT"
+  "stxvrl %x0,%1,%2"
+  [(set_attr "type" "vecstore")])
+
+(define_insn "*stxvrll"
+  [(set (mem:V16QI (match_operand:DI 1 "gpc_reg_operand" "b"))
+	(unspec:V16QI [(match_operand:V16QI 0 "vsx_register_operand" "wa")
+		       (mem:V16QI (match_dup 1))
+		       (ashift:DI (match_operand:DI 2 "register_operand" "r")
+				  (const_int 56))]
+	              UNSPEC_STXVLL))]
+  "TARGET_FUTURE"
+  "stxvrll %x0,%1,%2"
+  [(set_attr "type" "vecstore")])
+
 ;; Expand for builtin xst_len_r
 (define_expand "xst_len_r"
   [(match_operand:V16QI 0 "vsx_register_operand" "=wa")
diff --git a/gcc/testsuite/gcc.target/powerpc/lxvrl.c b/gcc/testsuite/gcc.target/powerpc/lxvrl.c
new file mode 100644
index 000000000000..71854c50c911
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/lxvrl.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_future_ok } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-mdejagnu-cpu=future -O2" } */
+
+/* Test whether the lxvrl and stxvrl instructions are generated for
+   -mcpu=future on memory copy operations.  */
+
+#ifndef VSIZE
+#define VSIZE 2
+#endif
+
+#ifndef LSIZE
+#define LSIZE 5
+#endif
+
+struct foo {
+  vector unsigned char vc[VSIZE];
+  unsigned char leftover[LSIZE];
+};
+
+void memcpy_ptr (struct foo *p, struct foo *q)
+{
+  __builtin_memcpy ((void *) p,		/* lxvrl and stxvrl.  */
+		    (void *) q,
+		    (sizeof (vector unsigned char) * VSIZE) + LSIZE);
+}
+
+/* { dg-final { scan-assembler     {\mlxvrl\M}  } } */
+/* { dg-final { scan-assembler     {\mstxvrl\M} } } */
+/* { dg-final { scan-assembler-not {\mlxvl\M}   } } */
+/* { dg-final { scan-assembler-not {\mstxvl\M}  } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 256ba41baf8c..8c08cad86677 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -7149,8 +7149,8 @@ proc check_effective_target_power10_ok { } {
     }
 }
 
-# Return 1 if this is a PowerPC target supporting -mcpu=future or -mdense-math
-# which enables the dense math operations.
+# Return 1 if this is a PowerPC target supporting -mcpu=future which enables
+# the dense math operations.
 proc check_effective_target_powerpc_dense_math_ok { } {
 	return [check_no_compiler_messages_nocache powerpc_dense_math_ok assembly {
 		__vector_quad vq;
@@ -7168,6 +7168,18 @@ proc check_effective_target_powerpc_dense_math_ok { } {
 	} "-mcpu=future"]
 }
 
+# Return 1 if this is a PowerPC target supporting -mcpu=future which enables
+# the saturating subtract instruction.
+proc check_effective_target_powerpc_future_ok { } {
+	return [check_no_compiler_messages powerpc_future_ok object {
+	    #ifndef _ARCH_PWR_FUTURE
+	    #error "not -mcpu=future"
+	    #else
+	    int dummy;
+	    #endif
+	} "-mcpu=future"]
+}
+
 # Return 1 if this is a PowerPC target supporting -mfloat128 via either
 # software emulation on power7/power8 systems or hardware support on power9.