public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc(refs/users/meissner/heads/work121)] PR target/89213 - Optimize vector shift by a constant.
@ 2023-06-03  4:06 Michael Meissner
  0 siblings, 0 replies; 3+ messages in thread
From: Michael Meissner @ 2023-06-03  4:06 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:67d9bb820024821b617fb24f41d4eb4e2afbe7dc

commit 67d9bb820024821b617fb24f41d4eb4e2afbe7dc
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Sat Jun 3 00:06:10 2023 -0400

    PR target/89213 - Optimize vector shift by a constant.
    
    Optimize vector shifts by a constant, taking advantage that the shift
    instructions only look at the bits within the element.
    
    The PowerPC doesn't have a VSPLTID instruction.  This meant that if we are doing
    a V2DI shift of 0..15, we had to do VSPLTIW and VEXTSW2D instructions to load
    the constant into the vector register.
    
    Similarly for V4SI and V2DI, if we wanted to shift more than 15 bits, we would
    generate XXSPLTIB and VEXTSB2D or VEXTSB2W instructions to load the constant
    into the vector register.
    
    Given the vector shift instructions only look at the bottom 5 or 6 bits of the
    shift value, we can load the constant via VSPLTISW or XXSPLTIB instructions and
    eliminate the sign extend instructions (VEXTSW2D, VEXTSB2D, and VEXTSB2W).
    
    2023-06-03  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            PR target/89213
            * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec.
            (V4SI_V2DI): New mode iterator.
            (vshift_code): New code iterator.
            (altivec_<code>_const_<mode>): New insns.
            (altivec_shift_const_<mode>): New insns.
            * config/rs6000/predicates.md (vector_shift_constant): New
            predicate.
    
    gcc/testsuite/
    
            PR target/89213
            * gcc.target/powerpc/pr89213.c: New test.
            * gcc.target/powerpc/vec-rlmi-rlnm.c: Update insn count.

Diff:
---
 gcc/config/rs6000/altivec.md                     |  53 +++++++++++
 gcc/config/rs6000/predicates.md                  |  56 ++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr89213.c       | 107 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c |   4 +-
 4 files changed, 218 insertions(+), 2 deletions(-)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index ad1224e0b57..e982d968e3b 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -171,6 +171,7 @@
    UNSPEC_VSTRIL
    UNSPEC_SLDB
    UNSPEC_SRDB
+   UNSPEC_VECTOR_SHIFT
 ])
 
 (define_c_enum "unspecv"
@@ -239,6 +240,10 @@
 ;; Vector negate
 (define_mode_iterator VNEG [V4SI V2DI])
 
+;; Vector shift by constant vector optimizations
+(define_mode_iterator V4SI_V2DI		[V4SI V2DI])
+(define_code_iterator vshift_code	[ashift ashiftrt lshiftrt])
+
 ;; Vector move instructions.
 (define_insn "*altivec_mov<mode>"
   [(set (match_operand:VM2 0 "nonimmediate_operand" "=Z,v,v,?Y,?*r,?*r,v,v,?*r")
@@ -2077,6 +2082,54 @@
   "vsro %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
+;; Optimize V2DI/V4SI shifts by constants.  We don't have a VSPLTISD or
+;; VSPLTISW instruction, but we can use XXSPLTIB to load constants that would
+;; be used by shifts.  The shift instructions only look at the bits needed
+;; to do the shift.
+
+(define_insn_and_split "*altivec_<code>_const_<mode>"
+  [(set (match_operand:V4SI_V2DI 0 "register_operand" "=v")
+	(vshift_code:V4SI_V2DI
+	 (match_operand:V4SI_V2DI 1 "register_operand" "v")
+	 (match_operand:V4SI_V2DI 2 "vector_shift_constant" "")))
+   (clobber (match_scratch:V4SI_V2DI 3 "=&v"))]
+  "TARGET_P8_VECTOR"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+	(unspec:V4SI_V2DI [(match_dup 4)] UNSPEC_VECTOR_SHIFT))
+   (set (match_dup 0)
+	(vshift_code:V4SI_V2DI (match_dup 1)
+			       (match_dup 3)))]
+{
+  rtx vec_const = operands[2];
+
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (<MODE>mode);
+
+  if (GET_CODE (vec_const) == CONST_VECTOR)
+    operands[4] = CONST_VECTOR_ELT (vec_const, 0);
+
+  else if (GET_CODE (vec_const) == VEC_DUPLICATE)
+    operands[4] = XEXP (vec_const, 0);
+
+  else
+    gcc_unreachable ();
+})
+
+(define_insn "*altivec_shift_const_<mode>"
+  [(set (match_operand:V4SI_V2DI 0 "register_operand" "=v")
+	(unspec:V4SI_V2DI [(match_operand 1 "const_int_operand" "n")]
+			    UNSPEC_VECTOR_SHIFT))]
+  "(TARGET_P8_VECTOR && UINTVAL (operands[1]) <= 15)
+    || (TARGET_P9_VECTOR && UINTVAL (operands[1]) <= 63)"
+{
+  return (UINTVAL (operands[1]) <= 15
+	  ? "vspltisw %0,%1"
+	  : "xxspltib %x0,%1");
+}
+  [(set_attr "type" "vecperm")])
+
 (define_insn "altivec_vsum4ubs"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index a16ee30f0c0..ad1d64fbe48 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -850,6 +850,62 @@
     return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
 })
 
+;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element
+;; is the same constant, and the constant can be used for a shift operation.
+;; This is to prevent sub-optimal code that needs to load up the constant and
+;; then zero extend it to 32 or 64-bit vectors, or load up the constant from
+;; the literal pool.
+;;
+;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by
+;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction.
+;; For V2DImode, we do this all of the time, since there is no convenient
+;; instruction to load up a vector long long splatted constant.
+;;
+;; If we can use XXSPLTIB, then allow constants up to 63.  If not, we restrict
+;; the constant to 0..15 that can be loaded with VSPLTISW.  V4SI shifts are
+;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31.  Values
+;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't
+;; need this optimization.
+(define_predicate "vector_shift_constant"
+  (match_code "const_vector,vec_duplicate")
+{
+  unsigned HOST_WIDE_INT min_value;
+
+  if (mode == V2DImode)
+    min_value = 0;
+  else if (mode == V4SImode)
+    {
+      min_value = 16;
+      if (!TARGET_P9_VECTOR)
+	return 0;
+    }
+  else
+    return 0;
+
+  unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15;
+
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      rtx first = CONST_VECTOR_ELT (op, 0);
+      unsigned nunits = GET_MODE_NUNITS (mode);
+
+      if (!IN_RANGE (UINTVAL (first), min_value, max_value))
+	return 0;
+
+      for (unsigned i = 1; i < nunits; i++)
+	if (!rtx_equal_p (first, CONST_VECTOR_ELT (op, i)))
+	  return 0;
+
+      return 1;
+    }
+
+  else if (GET_CODE (op) == VEC_DUPLICATE
+	   && CONST_INT_P (XEXP (op, 0)))
+    return IN_RANGE (UINTVAL (XEXP (op, 0)), min_value, max_value);
+
+  return 0;
+})
+
 ;; Return 1 if operand is 0.0.
 (define_predicate "zero_fp_constant"
   (and (match_code "const_double")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89213.c b/gcc/testsuite/gcc.target/powerpc/pr89213.c
new file mode 100644
index 00000000000..601f9166d6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89213.c
@@ -0,0 +1,107 @@
+/* { dg-do compile { target { powerpc64*-*-* && lp64 } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Optimize vector shifts by constants.  */
+
+#include <altivec.h>
+
+typedef vector long long vi64_t;
+typedef vector unsigned long long vui64_t;
+
+typedef vector int vi32_t;
+typedef vector unsigned int vui32_t;
+
+vi64_t
+shiftra_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi64_t
+shiftra_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi32_t
+shiftra_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+vi32_t
+shiftra_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+/* { dg-final { scan-assembler-times {\mxxspltib\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsld\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvslw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsrd\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrad\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mvsraw\M}     2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c b/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
index 6834733b1bf..0f42dc58aae 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-rlmi-rlnm.c
@@ -54,12 +54,12 @@ rlnm_test_2 (vector unsigned long long x, vector unsigned long long y,
     - For rlnm_test_1: vspltisw, vslw, xxlor, vrlwnm.
     - For rlnm_test_2: xxspltib, vextsb2d, vsld, xxlor, vrldnm.
    There is a choice of splat instructions in both cases, so we
-   just check for "splt".  */
+   just check for "splt".  Eliminate checking for vector sign
+   extend after the splat.  */
 
 /* { dg-final { scan-assembler-times "vrlwmi" 1 } } */
 /* { dg-final { scan-assembler-times "vrldmi" 1 } } */
 /* { dg-final { scan-assembler-times "splt" 2 } } */
-/* { dg-final { scan-assembler-times "vextsb2d" 1 } } */
 /* { dg-final { scan-assembler-times "vslw" 1 } } */
 /* { dg-final { scan-assembler-times "vsld" 1 } } */
 /* { dg-final { scan-assembler-times "xxlor" 4 } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [gcc(refs/users/meissner/heads/work121)] PR target/89213 - Optimize vector shift by a constant.
@ 2023-06-02 19:13 Michael Meissner
  0 siblings, 0 replies; 3+ messages in thread
From: Michael Meissner @ 2023-06-02 19:13 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:2f37c136aafb0c819d0af5e127193a2155fc9b1e

commit 2f37c136aafb0c819d0af5e127193a2155fc9b1e
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Fri Jun 2 15:13:20 2023 -0400

    PR target/89213 - Optimize vector shift by a constant.
    
    Optimize vector shifts by a constant, taking advantage that the shift
    instructions only look at the bits within the element.
    
    The PowerPC doesn't have a VSPLTID instruction.  This meant that if we are doing
    a V2DI shift of 0..15, we had to do VSPLTIW and VEXTSW2D instructions to load
    the constant into the vector register.
    
    Similarly for V4SI and V2DI, if we wanted to shift more than 15 bits, we would
    generate XXSPLTIB and VEXTSB2D or VEXTSB2W instructions to load the constant
    into the vector register.
    
    Given the vector shift instructions only look at the bottom 5 or 6 bits of the
    shift value, we can load the constant via VSPLTISW or XXSPLTIB instructions and
    eliminate the sign extend instructions (VEXTSW2D, VEXTSB2D, and VEXTSB2W).
    
    2023-06-02  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            PR target/89213
            * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec.
            (V4SI_V2DI): New mode iterator.
            (vshift_code): New code iterator.
            (altivec_<code>_const_<mode>): New insns.
            (altivec_shift_const_<mode>): New insns.
            * config/rs6000/predicates.md (vector_shift_constant): New
            predicate.
    
    gcc/testsuite/
    
            PR target/89213
            * gcc.target/powerpc/pr89213.c: New test.

Diff:
---
 gcc/config/rs6000/altivec.md               |  53 ++++++++++++++
 gcc/config/rs6000/predicates.md            |  56 +++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 +++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index ad1224e0b57..e982d968e3b 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -171,6 +171,7 @@
    UNSPEC_VSTRIL
    UNSPEC_SLDB
    UNSPEC_SRDB
+   UNSPEC_VECTOR_SHIFT
 ])
 
 (define_c_enum "unspecv"
@@ -239,6 +240,10 @@
 ;; Vector negate
 (define_mode_iterator VNEG [V4SI V2DI])
 
+;; Vector shift by constant vector optimizations
+(define_mode_iterator V4SI_V2DI		[V4SI V2DI])
+(define_code_iterator vshift_code	[ashift ashiftrt lshiftrt])
+
 ;; Vector move instructions.
 (define_insn "*altivec_mov<mode>"
   [(set (match_operand:VM2 0 "nonimmediate_operand" "=Z,v,v,?Y,?*r,?*r,v,v,?*r")
@@ -2077,6 +2082,54 @@
   "vsro %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
+;; Optimize V2DI/V4SI shifts by constants.  We don't have a VSPLTISD or
+;; VSPLTISW instruction, but we can use XXSPLTIB to load constants that would
+;; be used by shifts.  The shift instructions only look at the bits needed
+;; to do the shift.
+
+(define_insn_and_split "*altivec_<code>_const_<mode>"
+  [(set (match_operand:V4SI_V2DI 0 "register_operand" "=v")
+	(vshift_code:V4SI_V2DI
+	 (match_operand:V4SI_V2DI 1 "register_operand" "v")
+	 (match_operand:V4SI_V2DI 2 "vector_shift_constant" "")))
+   (clobber (match_scratch:V4SI_V2DI 3 "=&v"))]
+  "TARGET_P8_VECTOR"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+	(unspec:V4SI_V2DI [(match_dup 4)] UNSPEC_VECTOR_SHIFT))
+   (set (match_dup 0)
+	(vshift_code:V4SI_V2DI (match_dup 1)
+			       (match_dup 3)))]
+{
+  rtx vec_const = operands[2];
+
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (<MODE>mode);
+
+  if (GET_CODE (vec_const) == CONST_VECTOR)
+    operands[4] = CONST_VECTOR_ELT (vec_const, 0);
+
+  else if (GET_CODE (vec_const) == VEC_DUPLICATE)
+    operands[4] = XEXP (vec_const, 0);
+
+  else
+    gcc_unreachable ();
+})
+
+(define_insn "*altivec_shift_const_<mode>"
+  [(set (match_operand:V4SI_V2DI 0 "register_operand" "=v")
+	(unspec:V4SI_V2DI [(match_operand 1 "const_int_operand" "n")]
+			    UNSPEC_VECTOR_SHIFT))]
+  "(TARGET_P8_VECTOR && UINTVAL (operands[1]) <= 15)
+    || (TARGET_P9_VECTOR && UINTVAL (operands[1]) <= 63)"
+{
+  return (UINTVAL (operands[1]) <= 15
+	  ? "vspltisw %0,%1"
+	  : "xxspltib %x0,%1");
+}
+  [(set_attr "type" "vecperm")])
+
 (define_insn "altivec_vsum4ubs"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index a16ee30f0c0..ad1d64fbe48 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -850,6 +850,62 @@
     return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
 })
 
+;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element
+;; is the same constant, and the constant can be used for a shift operation.
+;; This is to prevent sub-optimal code that needs to load up the constant and
+;; then zero extend it to 32 or 64-bit vectors, or load up the constant from
+;; the literal pool.
+;;
+;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by
+;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction.
+;; For V2DImode, we do this all of the time, since there is no convenient
+;; instruction to load up a vector long long splatted constant.
+;;
+;; If we can use XXSPLTIB, then allow constants up to 63.  If not, we restrict
+;; the constant to 0..15 that can be loaded with VSPLTISW.  V4SI shifts are
+;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31.  Values
+;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't
+;; need this optimization.
+(define_predicate "vector_shift_constant"
+  (match_code "const_vector,vec_duplicate")
+{
+  unsigned HOST_WIDE_INT min_value;
+
+  if (mode == V2DImode)
+    min_value = 0;
+  else if (mode == V4SImode)
+    {
+      min_value = 16;
+      if (!TARGET_P9_VECTOR)
+	return 0;
+    }
+  else
+    return 0;
+
+  unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15;
+
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      rtx first = CONST_VECTOR_ELT (op, 0);
+      unsigned nunits = GET_MODE_NUNITS (mode);
+
+      if (!IN_RANGE (UINTVAL (first), min_value, max_value))
+	return 0;
+
+      for (unsigned i = 1; i < nunits; i++)
+	if (!rtx_equal_p (first, CONST_VECTOR_ELT (op, i)))
+	  return 0;
+
+      return 1;
+    }
+
+  else if (GET_CODE (op) == VEC_DUPLICATE
+	   && CONST_INT_P (XEXP (op, 0)))
+    return IN_RANGE (UINTVAL (XEXP (op, 0)), min_value, max_value);
+
+  return 0;
+})
+
 ;; Return 1 if operand is 0.0.
 (define_predicate "zero_fp_constant"
   (and (match_code "const_double")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89213.c b/gcc/testsuite/gcc.target/powerpc/pr89213.c
new file mode 100644
index 00000000000..601f9166d6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89213.c
@@ -0,0 +1,107 @@
+/* { dg-do compile { target { powerpc64*-*-* && lp64 } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Optimize vector shifts by constants.  */
+
+#include <altivec.h>
+
+typedef vector long long vi64_t;
+typedef vector unsigned long long vui64_t;
+
+typedef vector int vi32_t;
+typedef vector unsigned int vui32_t;
+
+vi64_t
+shiftra_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi64_t
+shiftra_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi32_t
+shiftra_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+vi32_t
+shiftra_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+/* { dg-final { scan-assembler-times {\mxxspltib\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsld\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvslw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsrd\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrad\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mvsraw\M}     2 } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [gcc(refs/users/meissner/heads/work121)] PR target/89213 - Optimize vector shift by a constant.
@ 2023-06-02  4:13 Michael Meissner
  0 siblings, 0 replies; 3+ messages in thread
From: Michael Meissner @ 2023-06-02  4:13 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:0f0f0f1dc1da642c776154027b7a3f0882415551

commit 0f0f0f1dc1da642c776154027b7a3f0882415551
Author: Michael Meissner <meissner@linux.ibm.com>
Date:   Fri Jun 2 00:13:25 2023 -0400

    PR target/89213 - Optimize vector shift by a constant.
    
    On power9 and power10 systems, optimize vector shifts by a constant, taking
    advantage that the shift instructions only look at the bits within the element.
    
    2023-06-02  Michael Meissner  <meissner@linux.ibm.com>
    
    gcc/
    
            PR target/89213
            * config/rs6000/altivec.md (UNSPEC_VECTOR_SHIFT): New unspec.
            (VSHIFT_MODE): New mode iterator.
            (vshift_code): New code iterator.
            (vshift_attr): New code attribute.
            (altivec_<mode>_<vshift_attr>_const): New pattern to optimize
            vector long long/int shifts by a constant.
            (altivec_<mode>_shift_const): New helper insn to load up a
            constant used by the shift operation.
            * config/rs6000/predicates.md (vector_shift_constant): New
            predicate.
    
    gcc/testsuite/
    
            PR target/89213
            * gcc.target/powerpc/pr89213.c: New test.

Diff:
---
 gcc/config/rs6000/altivec.md               |  51 ++++++++++++++
 gcc/config/rs6000/predicates.md            |  63 +++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr89213.c | 107 +++++++++++++++++++++++++++++
 3 files changed, 221 insertions(+)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index ad1224e0b57..e9f432d4812 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -171,6 +171,7 @@
    UNSPEC_VSTRIL
    UNSPEC_SLDB
    UNSPEC_SRDB
+   UNSPEC_VECTOR_SHIFT
 ])
 
 (define_c_enum "unspecv"
@@ -2077,6 +2078,56 @@
   "vsro %0,%1,%2"
   [(set_attr "type" "vecperm")])
 
+;; Optimize V2DI shifts by constants.  This relies on the shift instructions
+;; only looking at the bits needed to do the shift.  This means we can use
+;; VSPLTISW or XXSPLTIB to load up the constant, and not worry about the bits
+;; that the vector shift instructions will not use.
+(define_mode_iterator VSHIFT_MODE	[(V4SI "TARGET_P9_VECTOR")
+					 (V2DI "TARGET_P8_VECTOR")])
+
+(define_code_iterator vshift_code	[ashift ashiftrt lshiftrt])
+(define_code_attr vshift_attr		[(ashift   "ashift")
+					 (ashiftrt "ashiftrt")
+					 (lshiftrt "lshiftrt")])
+
+(define_insn_and_split "*altivec_<mode>_<vshift_attr>_const"
+  [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+	(vshift_code:VSHIFT_MODE
+	 (match_operand:VSHIFT_MODE 1 "register_operand" "v")
+	 (match_operand:VSHIFT_MODE 2 "vector_shift_constant" "")))
+   (clobber (match_scratch:VSHIFT_MODE 3 "=&v"))]
+  "((<MODE>mode == V2DImode && TARGET_P8_VECTOR)
+    || (<MODE>mode == V4SImode && TARGET_P9_VECTOR))"
+  "#"
+  "&& 1"
+  [(set (match_dup 3)
+	(unspec:VSHIFT_MODE [(match_dup 4)] UNSPEC_VECTOR_SHIFT))
+   (set (match_dup 0)
+	(vshift_code:VSHIFT_MODE (match_dup 1)
+				 (match_dup 3)))]
+{
+  if (GET_CODE (operands[3]) == SCRATCH)
+    operands[3] = gen_reg_rtx (<MODE>mode);
+
+  operands[4] = ((GET_CODE (operands[2]) == CONST_VECTOR)
+		 ? CONST_VECTOR_ELT (operands[2], 0)
+		 : XEXP (operands[2], 0));
+})
+
+(define_insn "*altivec_<mode>_shift_const"
+  [(set (match_operand:VSHIFT_MODE 0 "register_operand" "=v")
+	(unspec:VSHIFT_MODE [(match_operand 1 "const_int_operand" "n")]
+			    UNSPEC_VECTOR_SHIFT))]
+  "TARGET_P8_VECTOR"
+{
+  if (UINTVAL (operands[1]) <= 15)
+    return "vspltisw %0,%1";
+  else if (TARGET_P9_VECTOR)
+    return "xxspltib %x0,%1";
+  else
+    gcc_unreachable ();
+})
+
 (define_insn "altivec_vsum4ubs"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V16QI 1 "register_operand" "v")
diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index a16ee30f0c0..f1e93453c18 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -850,6 +850,69 @@
     return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
 })
 
+;; Return 1 if the operand is a V2DI or V4SI const_vector, where each element
+;; is the same constant, and the constant can be used for a shift operation.
+;; This is to prevent sub-optimal code that needs to load up the constant and
+;; then zero extend it to 32 or 64-bit vectors, or load up the constant from
+;; the literal pool.
+;;
+;; For V4SImode, we only recognize shifts by 16..31 on ISA 3.0, since shifts by
+;; 1..15 can be handled by the normal VSPLTISW and vector shift instruction.
+;; For V2DImode, we do this all of the time, since there is no convenient
+;; instruction to load up a vector long long splatted constant.
+;;
+;; If we can use XXSPLTIB, then allow constants up to 63.  If not, we restrict
+;; the constant to 0..15 that can be loaded with VSPLTISW.  V4SI shifts are
+;; only optimized for ISA 3.0 when the shift value is >= 16 and <= 31.  Values
+;; between 0 and 15 can use a normal VSPLTISW to load the value, and it doesn't
+;; need this optimization.
+(define_predicate "vector_shift_constant"
+  (match_code "const_vector,vec_duplicate")
+{
+  unsigned HOST_WIDE_INT min_value;
+
+  if (mode == V2DImode)
+    {
+      min_value = 0;
+      if (!TARGET_P8_VECTOR)
+	return 0;
+    }
+  else if (mode == V4SImode)
+    {
+      min_value = 16;
+      if (!TARGET_P9_VECTOR)
+	return 0;
+    }
+  else
+    return 0;
+
+  unsigned HOST_WIDE_INT max_value = TARGET_P9_VECTOR ? 63 : 15;
+
+  if (GET_CODE (op) == CONST_VECTOR)
+    {
+      unsigned HOST_WIDE_INT first = UINTVAL (CONST_VECTOR_ELT (op, 0));
+      unsigned nunits = GET_MODE_NUNITS (mode);
+      unsigned i;
+
+      if (!IN_RANGE (first, min_value, max_value))
+	return 0;
+
+      for (i = 1; i < nunits; i++)
+	if (first != UINTVAL (CONST_VECTOR_ELT (op, i)))
+	  return 0;
+
+      return 1;
+    }
+  else
+    {
+      rtx op0 = XEXP (op, 0);
+      if (!CONST_INT_P (op0))
+	return 0;
+
+      return IN_RANGE (UINTVAL (op0), min_value, max_value);
+    }
+})
+
 ;; Return 1 if operand is 0.0.
 (define_predicate "zero_fp_constant"
   (and (match_code "const_double")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr89213.c b/gcc/testsuite/gcc.target/powerpc/pr89213.c
new file mode 100644
index 00000000000..601f9166d6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr89213.c
@@ -0,0 +1,107 @@
+/* { dg-do compile { target { powerpc64*-*-* && lp64 } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mcpu=power9 -O2" } */
+
+/* Optimize vector shifts by constants.  */
+
+#include <altivec.h>
+
+typedef vector long long vi64_t;
+typedef vector unsigned long long vui64_t;
+
+typedef vector int vi32_t;
+typedef vector unsigned int vui32_t;
+
+vi64_t
+shiftra_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_4 (vi64_t a)
+{
+  vui64_t x = {4, 4};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi64_t
+shiftra_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrad (a, x);
+}
+
+vi64_t
+shiftrl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsrd (a, x);
+}
+
+vi64_t
+shiftl_test64_29 (vi64_t a)
+{
+  vui64_t x = {29, 29};
+  return (vi64_t) vec_vsld (a, x);
+}
+
+vi32_t
+shiftra_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_4 (vi32_t a)
+{
+  vui32_t x = {4, 4, 4, 4};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+vi32_t
+shiftra_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsraw (a, x);
+}
+
+vi32_t
+shiftrl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vsrw (a, x);
+}
+
+vi32_t
+shiftl_test32_29 (vi32_t a)
+{
+  vui32_t x = {29, 29, 29, 29};
+  return (vi32_t) vec_vslw (a, x);
+}
+
+/* { dg-final { scan-assembler-times {\mxxspltib\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsld\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvslw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvspltisw\M}  6 } } */
+/* { dg-final { scan-assembler-times {\mvsrd\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrw\M}      2 } } */
+/* { dg-final { scan-assembler-times {\mvsrad\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mvsraw\M}     2 } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-06-03  4:06 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-03  4:06 [gcc(refs/users/meissner/heads/work121)] PR target/89213 - Optimize vector shift by a constant Michael Meissner
  -- strict thread matches above, loose matches on Subject: below --
2023-06-02 19:13 Michael Meissner
2023-06-02  4:13 Michael Meissner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).