From: Michael Meissner
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/meissner/heads/work121)] PR target/89213 - Optimize vector shift by a constant.
X-Act-Checkin: gcc
X-Git-Author: Michael Meissner
X-Git-Refname: refs/users/meissner/heads/work121
X-Git-Oldrev: 1542a6653d3961b658bd9baeb16f30d2892a3f10
X-Git-Newrev: 2f37c136aafb0c819d0af5e127193a2155fc9b1e
Message-Id: <20230602191338.8826E3858D3C@sourceware.org>
Date: Fri, 2 Jun 2023 19:13:38 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:2f37c136aafb0c819d0af5e127193a2155fc9b1e

commit 2f37c136aafb0c819d0af5e127193a2155fc9b1e
Author: Michael Meissner
Date:   Fri Jun 2 15:13:20 2023 -0400

    PR target/89213 - Optimize vector shift by a constant.

    Optimize vector shifts by a constant, taking advantage of the fact that
    the shift instructions only look at the bits within the element.

    The PowerPC doesn't have a VSPLTISD instruction.  This meant that if we
    were doing a V2DI shift by 0..15, we had to use VSPLTISW and VEXTSW2D
    instructions to load the constant into the vector register.  Similarly,
    for V4SI and V2DI shifts by more than 15 bits, we would generate XXSPLTIB
    and VEXTSB2D or VEXTSB2W instructions to load the constant into the
    vector register.

    Given that the vector shift instructions only look at the bottom 5 or 6
    bits of the shift value, we can load the constant with a single VSPLTISW
    or XXSPLTIB instruction and eliminate the sign extend instructions
    (VEXTSW2D, VEXTSB2D, and VEXTSB2W).

    2023-06-02  Michael Meissner

    gcc/

    	PR target/89213
    	* config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec.
    	(V4SI_V2DI): New mode iterator.
    	(vshift_code): New code iterator.
    	(altivec_<code>_const_<mode>): New insns.
    	* config/rs6000/predicates.md (vector_shift_constant): New
    	predicate.

    gcc/testsuite/

    	PR target/89213
    	* gcc.target/powerpc/pr89213.c: New test.

Diff:
---
 gcc/config/rs6000/altivec.md               |  53 ++++++++++++++
 gcc/config/rs6000/predicates.md            |  56 +++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 +++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index ad1224e0b57..e982d968e3b 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -171,6 +171,7 @@
    UNSPEC_VSTRIL
    UNSPEC_SLDB
    UNSPEC_SRDB
+   UNSPEC_VECTOR_SHIFT
 ])
 
 (define_c_enum "unspecv"
@@ -239,6 +240,10 @@
 ;; Vector negate
 (define_mode_iterator VNEG [V4SI V2DI])
 
+;; Vector shift by constant vector optimizations
+(define_mode_iterator V4SI_V2DI [V4SI V2DI])
+(define_code_iterator vshift_code [ashift ashiftrt lshiftrt])
+
 ;; Vector move instructions.
 (define_insn "*altivec_mov<mode>"
   [(set (match_operand:VM2 0 "nonimmediate_operand" "=Z,v,v,?Y,?*r,?*r,v,v,?*r")
@@ -2077,6 +2082,54 @@
   "vsro %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
+;; Optimize V2DI/V4SI shifts by constants.  We don't have a VSPLTISD
+;; instruction, and VSPLTISW can only load constants in the range -16..15,
+;; but we can use XXSPLTIB to load shift constants that would otherwise need
+;; a sign extend.  The shift instructions only look at the bits needed to do
+;; the shift.
+
+(define_insn_and_split "*altivec_<code>_const_<mode>"
+  [(set (match_operand:V4SI_V2DI 0 "register_operand" "=v")
+	(vshift_code:V4SI_V2DI
+	 (match_operand:V4SI_V2DI 1 "register_operand" "v")
+	 (match_operand:V4SI_V2DI 2 "vector_shift_constant" "")))
+   (clobber (match_scratch:V4SI_V2DI 3 "=&v"))]
+  "TARGET_P8_VECTOR"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+	(unspec:V4SI_V2DI [(match_dup 4)] UNSPEC_VECTOR_SHIFT))
+   (set (match_dup 0)
+	(vshift_code:V4SI_V2DI (match_dup 1)
+			       (match_dup 3)))]
+{
+  rtx vec_const = operands[2];
+
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (<MODE>mode);
+
+  if (GET_CODE (vec_const) == CONST_VECTOR)
+    operands[4] = CONST_VECTOR_ELT (vec_const, 0);
+
+  else if (GET_CODE (vec_const) == VEC_DUPLICATE)
+    operands[4] = XEXP (vec_const, 0);
+
+  else
+    gcc_unreachable ();
+})
+
+(define_insn "*altivec_shift_const_<mode>"
+  [(set (match_operand:V4SI_V2DI 0 "register_operand" "=v")
+	(unspec:V4SI_V2DI [(match_operand 1 "const_int_operand" "n")]
+			  UNSPEC_VECTOR_SHIFT))]
+  "(TARGET_P8_VECTOR && UINTVAL (operands[1]) <= 15)
+   || (TARGET_P9_VECTOR && UINTVAL (operands[1]) <= 63)"
+{
+  return (UINTVAL (operands[1]) <= 15
+	  ? "vspltisw %0,%1"
+	  : "xxspltib %x0,%1");
+}
+  [(set_attr "type" "vecperm")])
+
 (define_insn "altivec_vsum4ubs"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index a16ee30f0c0..ad1d64fbe48 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -850,6 +850,62 @@
   return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
 })
 
+;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element
+;; is the same constant, and the constant can be used for a shift operation.
+;; This prevents sub-optimal code that would otherwise load the constant and
+;; then sign extend it to 32-bit or 64-bit elements, or load the constant
+;; from the literal pool.
+;;
+;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts
+;; by 1..15 can be handled by the normal VSPLTISW and vector shift
+;; instructions.  For V2DImode, we do this all of the time, since there is no
+;; convenient instruction to load up a splatted vector long long constant.
+;;
+;; If we can use XXSPLTIB, then allow constants up to 63.  If not, we restrict
+;; the constant to 0..15, which can be loaded with VSPLTISW.  V4SI shifts are
+;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31.  Values
+;; between 0 and 15 can use a normal VSPLTISW to load the value, so they don't
+;; need this optimization.
+(define_predicate "vector_shift_constant"
+  (match_code "const_vector,vec_duplicate")
+{
+  unsigned HOST_WIDE_INT min_value;
+
+  if (mode == V2DImode)
+    min_value = 0;
+  else if (mode == V4SImode)
+    {
+      min_value = 16;
+      if (!TARGET_P9_VECTOR)
+	return 0;
+    }
+  else
+    return 0;
+
+  unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15;
+
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      rtx first = CONST_VECTOR_ELT (op, 0);
+      unsigned nunits = GET_MODE_NUNITS (mode);
+
+      if (!IN_RANGE (UINTVAL (first), min_value, max_value))
+	return 0;
+
+      for (unsigned i = 1; i < nunits; i++)
+	if (!rtx_equal_p (first, CONST_VECTOR_ELT (op, i)))
+	  return 0;
+
+      return 1;
+    }
+
+  else if (GET_CODE (op) == VEC_DUPLICATE
+	   && CONST_INT_P (XEXP (op, 0)))
+    return IN_RANGE (UINTVAL (XEXP (op, 0)), min_value, max_value);
+
+  return 0;
+})
+
 ;; Return 1 if operand is 0.0.
 (define_predicate "zero_fp_constant"
   (and (match_code "const_double")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89213.c b/gcc/testsuite/gcc.target/powerpc/pr89213.c
new file mode 100644
index 00000000000..601f9166d6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89213.c
@@ -0,0 +1,107 @@
+/* { dg-do compile { target { powerpc64*-*-* && lp64 } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Optimize vector shifts by constants.  */
+
+#include <altivec.h>
+
+typedef vector long long vi64_t;
+typedef vector unsigned long long vui64_t;
+
+typedef vector int vi32_t;
+typedef vector unsigned int vui32_t;
+
+vi64_t
+shiftra_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi64_t
+shiftra_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi32_t
+shiftra_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+vi32_t
+shiftra_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+/* { dg-final { scan-assembler-times {\mxxspltib\M} 6 } } */
+/* { dg-final { scan-assembler-times {\mvsld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvslw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M} 6 } } */
+/* { dg-final { scan-assembler-times {\mvsrd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsrw\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsrad\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvsraw\M} 2 } } */
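
For anyone who wants to try the change outside of the dejagnu testsuite, something
along the lines of the sketch below should exercise the new patterns when compiled
with -mcpu=power9 -O2.  This is only an illustration, not part of the patch: the
function names are made up, and using the generic vec_sl/vec_sra intrinsics
(rather than the vec_vsld/vec_vsrad builtins the testcase uses) is an assumption
about what ends up as the same splatted-constant vector shifts in RTL.

#include <altivec.h>

/* Shift each 64-bit element left by 4.  The count fits in 5 bits, so the
   splat is expected to use VSPLTISW with no sign extend.  */
vector long long
shift_left_4 (vector long long a)
{
  vector unsigned long long x = {4, 4};
  return vec_sl (a, x);
}

/* Shift each 64-bit element right algebraically by 29.  The count is larger
   than 15, so the splat is expected to use XXSPLTIB instead.  */
vector long long
shift_right_29 (vector long long a)
{
  vector unsigned long long x = {29, 29};
  return vec_sra (a, x);
}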