[PATCHv2, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCHv2, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]
@ 2023-07-21  1:32 HAO CHEN GUI
  2023-07-24  2:43 ` Kewen.Lin
  0 siblings, 1 reply; 4+ messages in thread
From: HAO CHEN GUI @ 2023-07-21  1:32 UTC (permalink / raw)
  To: gcc-patches; +Cc: Segher Boessenkool, David, Kewen.Lin, Peter Bergner

Hi,
  This patch modifies the vsx extract expand and generates mfvsrwz/stxsiwx
for all subtargets when the mode is V4SI and the index of the extracted
element is 1 for BE and 2 for LE. This patch also adds an insn pattern for
mfvsrwz, which helps eliminate redundant zero extensions.

  Compared to the last version, the main change is to add a new expand for
V4SI and to split "vsx_extract_si" into two insn patterns.
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/622101.html

  Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.

Thanks
Gui Haochen


ChangeLog
rs6000: Generate mfvsrwz for all subtargets and remove redundant zero extend

mfvsrwz has lower latency than xxextractuw or vextuw[lr]x, so it should be
generated even with p9 vector enabled.  Also, the instruction already
zero-extends its result, so a combine pattern is needed to eliminate
redundant zero extend instructions.

gcc/
	PR target/106769
	* config/rs6000/vsx.md (expand vsx_extract_<mode>): Set it only
	for V8HI and V16QI.
	(vsx_extract_v4si): New expand for V4SI.
	(*vsx_extract_<mode>_di_p9): Do not generate the insn when it can
	be generated by mfvsrwz.
	(mfvsrwz): New insn pattern for zero extended vsx_extract_v4si.
	(*vsx_extract_si): Remove.
	(vsx_extract_v4si_0): New insn pattern to deal with V4SI extract
	when the index of extracted element is 1 with BE and 2 with LE.
	(vsx_extract_v4si_1): New insn and split pattern which deals with
	the cases not handled by vsx_extract_v4si_0.

gcc/testsuite/
	PR target/106769
	* gcc.target/powerpc/pr106769.h: New.
	* gcc.target/powerpc/pr106769-p8.c: New.
	* gcc.target/powerpc/pr106769-p9.c: New.

patch.diff
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 0a34ceebeb5..ad249441bcf 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2_<mode>_1"
 (define_expand  "vsx_extract_<mode>"
   [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand")
 		   (vec_select:<VEC_base>
-		    (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
+		    (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
 		    (parallel [(match_operand:QI 2 "const_int_operand")])))
-	      (clobber (match_scratch:VSX_EXTRACT_I 3))])]
+	      (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
   "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
 {
   /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
@@ -3736,6 +3736,23 @@ (define_expand  "vsx_extract_<mode>"
     }
 })

+(define_expand  "vsx_extract_v4si"
+  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
+		   (vec_select:SI
+		    (match_operand:V4SI 1 "gpc_reg_operand")
+		    (parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
+	      (clobber (match_scratch:V4SI 3))])]
+  "TARGET_DIRECT_MOVE_64BIT"
+{
+  if (TARGET_P9_VECTOR
+      && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
+    {
+      emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
+					  operands[2]));
+      DONE;
+    }
+})
+
 (define_insn "vsx_extract_<mode>_p9"
   [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>")
 	(vec_select:<VEC_base>
@@ -3798,7 +3815,9 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9"
 	  (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>")
 	  (parallel [(match_operand:QI 2 "const_int_operand" "n,n")]))))
    (clobber (match_scratch:SI 3 "=r,X"))]
-  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB"
+  "TARGET_VEXTRACTUB
+   && (<MODE>mode != V4SImode
+       || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))"
   "#"
   "&& reload_completed"
   [(parallel [(set (match_dup 4)
@@ -3830,58 +3849,78 @@ (define_insn_and_split "*vsx_extract_<mode>_store_p9"
    (set (match_dup 0)
 	(match_dup 3))])

-(define_insn_and_split  "*vsx_extract_si"
+(define_insn "mfvsrwz"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "vsx_register_operand" "wa")
+	    (parallel [(match_operand:QI 2 "const_int_operand" "n")]))))
+   (clobber (match_scratch:V4SI 3 "=v"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
+  "mfvsrwz %0,%x1"
+  [(set_attr "type" "mfvsr")
+   (set_attr "isa" "p8v")])
+
+(define_insn "vsx_extract_v4si_0"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
+	(vec_select:SI
+	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
+	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
+   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && (!TARGET_P9_VECTOR || INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2))"
+{
+   if (which_alternative == 0)
+     return "mfvsrwz %0,%x1";
+
+   if (which_alternative == 1)
+     return "xxlor %x0,%x1,%x1";
+
+   if (which_alternative == 2)
+     return "stxsiwx %x1,%y0";
+
+   return ASM_COMMENT_START " vec_extract to same register";
+}
+  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
+   (set_attr "length" "4,4,4,0")
+   (set_attr "isa" "p8v,*,p8v,*")])
+
+(define_insn_and_split "vsx_extract_v4si_1"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z")
 	(vec_select:SI
 	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v")
 	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")])))
    (clobber (match_scratch:V4SI 3 "=v,v,v"))]
-  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && !TARGET_P9_VECTOR"
+  "TARGET_DIRECT_MOVE_64BIT
+   && !TARGET_P9_VECTOR
+   && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)"
   "#"
-  "&& reload_completed"
+  "&& 1"
   [(const_int 0)]
 {
   rtx dest = operands[0];
   rtx src = operands[1];
   rtx element = operands[2];
-  rtx vec_tmp = operands[3];
-  int value;
+  rtx vec_tmp;
+
+  if (GET_CODE (operands[3]) == SCRATCH)
+    vec_tmp = gen_reg_rtx (V4SImode);
+  else
+    vec_tmp = operands[3];

   /* Adjust index for LE element ordering, the below minuend 3 is computed by
      GET_MODE_NUNITS (V4SImode) - 1.  */
   if (!BYTES_BIG_ENDIAN)
     element = GEN_INT (3 - INTVAL (element));

-  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
-     instruction.  */
-  value = INTVAL (element);
-  if (value != 1)
-    emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));
-  else
-    vec_tmp = src;
+  emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));

-  if (MEM_P (operands[0]))
-    {
-      if (can_create_pseudo_p ())
-	dest = rs6000_force_indexed_or_indirect_mem (dest);
-
-      if (TARGET_P8_VECTOR)
-	emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
-      else
-	emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp))));
-    }
-
-  else if (TARGET_P8_VECTOR)
-    emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
-  else
-    emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)),
-		    gen_rtx_REG (DImode, REGNO (vec_tmp)));
+  int value = BYTES_BIG_ENDIAN ? 1 : 2;
+  emit_insn (gen_vsx_extract_v4si_0 (dest, vec_tmp, GEN_INT (value)));

   DONE;
-}
-  [(set_attr "type" "mfvsr,vecperm,fpstore")
-   (set_attr "length" "8")
-   (set_attr "isa" "*,p8v,*")])
+})

 (define_insn_and_split  "*vsx_extract_<mode>_p8"
   [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
new file mode 100644
index 00000000000..e7cdbc76298
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+#include "pr106769.h"
+
+/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
+/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
+/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
new file mode 100644
index 00000000000..2205e434a86
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+#include "pr106769.h"
+
+/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
+/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
+/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
+/* { dg-final { scan-assembler-not {\mxxextractuw\M} } } */
+/* { dg-final { scan-assembler-not "vextuw\[rl\]x" } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769.h b/gcc/testsuite/gcc.target/powerpc/pr106769.h
new file mode 100644
index 00000000000..1c8c8a024f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr106769.h
@@ -0,0 +1,17 @@
+#include <altivec.h>
+
+#ifdef __BIG_ENDIAN__
+#define LANE 1
+#else
+#define LANE 2
+#endif
+
+unsigned int foo1 (vector unsigned int v)
+{
+   return vec_extract(v, LANE);
+}
+
+void foo2 (vector unsigned int v, unsigned int* p)
+{
+   *p = vec_extract(v, LANE);
+}

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCHv2, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]
  2023-07-21  1:32 [PATCHv2, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769] HAO CHEN GUI
@ 2023-07-24  2:43 ` Kewen.Lin
  0 siblings, 0 replies; 4+ messages in thread
From: Kewen.Lin @ 2023-07-24  2:43 UTC (permalink / raw)
  To: HAO CHEN GUI; +Cc: Segher Boessenkool, David, Peter Bergner, gcc-patches

Hi Haochen,

on 2023/7/21 09:32, HAO CHEN GUI wrote:
> Hi,
>   This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx
> for all subtargets when the mode is V4SI and the index of extracted element
> is 1 for BE and 2 for LE. Also this patch adds a insn pattern for mfvsrwz
> which can help eliminate redundant zero extend.
> 
>   Compared to last version, the main change is to add a new expand for V4SI
> and separate "vsx_extract_si" to 2 insn patterns.
> https://gcc.gnu.org/pipermail/gcc-patches/2023-June/622101.html
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> 
> Thanks
> Gui Haochen
> 
> 
> ChangeLog
> rs6000: Generate mfvsrwz for all subtargets and remove redundant zero extend
> 
> mfvsrwz has lower latency than xxextractuw or vextuw[lr]x.  So it should be
> generated even with p9 vector enabled.  Also the instruction is already
> zero extended.  A combine pattern is needed to eliminate redundant zero
> extend instructions.
> 
> gcc/
> 	PR target/106769
> 	* config/rs6000/vsx.md (expand vsx_extract_<mode>): Set it only
> 	for V8HI and V16QI.
> 	(vsx_extract_v4si): New expand for V4SI.
> 	(*vsx_extract_<mode>_di_p9): Not generate the insn when it can
> 	be generated by mfvsrwz.
> 	(mfvsrwz): New insn pattern for zero extended vsx_extract_v4si.
> 	(*vsx_extract_si): Removed.
> 	(vsx_extract_v4si_0): New insn pattern to deal with V4SI extract
> 	when the index of extracted element is 1 with BE and 2 with LE.
> 	(vsx_extract_v4si_1): New insn and split pattern which deals with
> 	the cases not handled by vsx_extract_v4si_0.
> 
> gcc/testsuite/
> 	PR target/106769
> 	* gcc.target/powerpc/pr106769.h: New.
> 	* gcc.target/powerpc/pr106769-p8.c: New.
> 	* gcc.target/powerpc/pr106769-p9.c: New.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 0a34ceebeb5..ad249441bcf 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2_<mode>_1"
>  (define_expand  "vsx_extract_<mode>"
>    [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand")
>  		   (vec_select:<VEC_base>
> -		    (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
> +		    (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
>  		    (parallel [(match_operand:QI 2 "const_int_operand")])))
> -	      (clobber (match_scratch:VSX_EXTRACT_I 3))])]
> +	      (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
>    "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
>  {
>    /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
> @@ -3736,6 +3736,23 @@ (define_expand  "vsx_extract_<mode>"
>      }
>  })
> 
> +(define_expand  "vsx_extract_v4si"
> +  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
> +		   (vec_select:SI
> +		    (match_operand:V4SI 1 "gpc_reg_operand")
> +		    (parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
> +	      (clobber (match_scratch:V4SI 3))])]
> +  "TARGET_DIRECT_MOVE_64BIT"
> +{

Nit: Maybe add a comment here for why we special-case op2.

> +  if (TARGET_P9_VECTOR
> +      && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
> +    {
> +      emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
> +					  operands[2]));
> +      DONE;
> +    }
> +})
> +

Nit: Move "(define_insn \"vsx_extract_v4si_0\"..." up here to ensure
it takes the first priority in matching.

>  (define_insn "vsx_extract_<mode>_p9"
>    [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>")
>  	(vec_select:<VEC_base>
> @@ -3798,7 +3815,9 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9"
>  	  (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>")
>  	  (parallel [(match_operand:QI 2 "const_int_operand" "n,n")]))))
>     (clobber (match_scratch:SI 3 "=r,X"))]
> -  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB"
> +  "TARGET_VEXTRACTUB
> +   && (<MODE>mode != V4SImode
> +       || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))"

I'd expect that, under the condition TARGET_VEXTRACTUB, we no longer get this
kind of pattern with V4SI and op2 being 1 (BE) or 2 (LE)?  Instead of adding a
condition to exclude it, IMHO it's better to assert in its splitter that op2
isn't 1 (BE) or 2 (LE).

>    "#"
>    "&& reload_completed"
>    [(parallel [(set (match_dup 4)
> @@ -3830,58 +3849,78 @@ (define_insn_and_split "*vsx_extract_<mode>_store_p9"
>     (set (match_dup 0)
>  	(match_dup 3))])
> 
> -(define_insn_and_split  "*vsx_extract_si"
> +(define_insn "mfvsrwz"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +	(zero_extend:DI
> +	  (vec_select:SI
> +	    (match_operand:V4SI 1 "vsx_register_operand" "wa")
> +	    (parallel [(match_operand:QI 2 "const_int_operand" "n")]))))
> +   (clobber (match_scratch:V4SI 3 "=v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
> +  "mfvsrwz %0,%x1"
> +  [(set_attr "type" "mfvsr")
> +   (set_attr "isa" "p8v")])
Nit: there is no call to gen_mfvsrwz, so make it "*mfvsrwz"?

> +
> +(define_insn "vsx_extract_v4si_0"

Nit: since it cares about word 1 (ISA order, i.e. BE order) of the vector, it
may be better to name it "vsx_extract_v4si_w1" and add a comment saying so.

> +  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
> +	(vec_select:SI
> +	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
> +	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
> +   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && (!TARGET_P9_VECTOR || INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2))"

Could we remove the "!TARGET_P9_VECTOR"?

It looks clearer to make this pattern focus just on op2 == 1 or 2 ...

> +{
> +   if (which_alternative == 0)
> +     return "mfvsrwz %0,%x1";
> +
> +   if (which_alternative == 1)
> +     return "xxlor %x0,%x1,%x1";
> +
> +   if (which_alternative == 2)
> +     return "stxsiwx %x1,%y0";
> +
> +   return ASM_COMMENT_START " vec_extract to same register";
> +}
> +  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
> +   (set_attr "length" "4,4,4,0")
> +   (set_attr "isa" "p8v,*,p8v,*")])
> +
> +(define_insn_and_split "vsx_extract_v4si_1"

... and use this for the others ...

>    [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z")
>  	(vec_select:SI
>  	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v")
>  	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")])))
>     (clobber (match_scratch:V4SI 3 "=v,v,v"))]
> -  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && !TARGET_P9_VECTOR"
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && !TARGET_P9_VECTOR

I'd expect that this kind of pattern is never used with TARGET_P9_VECTOR?  If
so, as above, it's better to add an assert below instead.

BR,
Kewen

> +   && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)"
>    "#"
> -  "&& reload_completed"
> +  "&& 1"
>    [(const_int 0)]
>  {
>    rtx dest = operands[0];
>    rtx src = operands[1];
>    rtx element = operands[2];
> -  rtx vec_tmp = operands[3];
> -  int value;
> +  rtx vec_tmp;
> +
> +  if (GET_CODE (operands[3]) == SCRATCH)
> +    vec_tmp = gen_reg_rtx (V4SImode);
> +  else
> +    vec_tmp = operands[3];
> 
>    /* Adjust index for LE element ordering, the below minuend 3 is computed by
>       GET_MODE_NUNITS (V4SImode) - 1.  */
>    if (!BYTES_BIG_ENDIAN)
>      element = GEN_INT (3 - INTVAL (element));
> 
> -  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
> -     instruction.  */
> -  value = INTVAL (element);
> -  if (value != 1)
> -    emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));
> -  else
> -    vec_tmp = src;
> +  emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));
> 
> -  if (MEM_P (operands[0]))
> -    {
> -      if (can_create_pseudo_p ())
> -	dest = rs6000_force_indexed_or_indirect_mem (dest);
> -
> -      if (TARGET_P8_VECTOR)
> -	emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
> -      else
> -	emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp))));
> -    }
> -
> -  else if (TARGET_P8_VECTOR)
> -    emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
> -  else
> -    emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)),
> -		    gen_rtx_REG (DImode, REGNO (vec_tmp)));
> +  int value = BYTES_BIG_ENDIAN ? 1 : 2;
> +  emit_insn (gen_vsx_extract_v4si_0 (dest, vec_tmp, GEN_INT (value)));
> 
>    DONE;
> -}
> -  [(set_attr "type" "mfvsr,vecperm,fpstore")
> -   (set_attr "length" "8")
> -   (set_attr "isa" "*,p8v,*")])
> +})
> 
>  (define_insn_and_split  "*vsx_extract_<mode>_p8"
>    [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r")
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
> new file mode 100644
> index 00000000000..e7cdbc76298
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-require-effective-target powerpc_p8vector_ok } */
> +/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +#include "pr106769.h"
> +
> +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
> +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
> +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
> new file mode 100644
> index 00000000000..2205e434a86
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +#include "pr106769.h"
> +
> +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
> +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
> +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
> +/* { dg-final { scan-assembler-not {\mxxextractuw\M} } } */
> +/* { dg-final { scan-assembler-not "vextuw\[rl\]x" } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769.h b/gcc/testsuite/gcc.target/powerpc/pr106769.h
> new file mode 100644
> index 00000000000..1c8c8a024f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769.h
> @@ -0,0 +1,17 @@
> +#include <altivec.h>
> +
> +#ifdef __BIG_ENDIAN__
> +#define LANE 1
> +#else
> +#define LANE 2
> +#endif
> +
> +unsigned int foo1 (vector unsigned int v)
> +{
> +   return vec_extract(v, LANE);
> +}
> +
> +void foo2 (vector unsigned int v, unsigned int* p)
> +{
> +   *p = vec_extract(v, LANE);
> +}

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCHv2, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]
  2023-07-25  2:10 HAO CHEN GUI
@ 2023-07-31  5:43 ` Kewen.Lin
  0 siblings, 0 replies; 4+ messages in thread
From: Kewen.Lin @ 2023-07-31  5:43 UTC (permalink / raw)
  To: HAO CHEN GUI; +Cc: Segher Boessenkool, David, Peter Bergner, gcc-patches

Hi Haochen,

on 2023/7/25 10:10, HAO CHEN GUI wrote:
> Hi,
>   This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx
> for all subtargets when the mode is V4SI and the index of extracted element
> is 1 for BE and 2 for LE. Also this patch adds a insn pattern for mfvsrwz
> which helps eliminate redundant zero extend.
> 
>   Compared to last version, the main change is to move "vsx_extract_v4si_w1"
> and "*mfvsrwz" to the front of "*vsx_extract_<mode>_di_p9". Also some insn
> conditions are changed to assertions.
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625128.html

Since the previous one is v2, this is actually v3. ;-)

> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> rs6000: Generate mfvsrwz for all platform and remove redundant zero extend
> 
> mfvsrwz has lower latency than xxextractuw or vextuw[lr]x.  So it should be
> generated even with p9 vector enabled.  Also the instruction is already
> zero extended.  A combine pattern is needed to eliminate redundant zero
> extend instructions.
> 
> gcc/
> 	PR target/106769
> 	* config/rs6000/vsx.md (expand vsx_extract_<mode>): Set it only
> 	for V8HI and V16QI.
> 	(vsx_extract_v4si): New expand for V4SI extraction.
> 	(vsx_extract_v4si_w1): New insn pattern for V4SI extraction
> 	when the index of extracted element is 1 with BE and 2 with LE.

Nit: Maybe better to match the name with " ... for V4SI extraction on
word 1 from BE order."

> 	(*mfvsrwz): New insn pattern.
> 	(*vsx_extract_<mode>_di_p9): Not generate the insn when the index
> 	of extracted element is 1 with BE and 2 with LE.
> 	(*vsx_extract_si): Removed.

Nit: s/Removed/Remove/

> 	(*vsx_extract_v4si_not_w1): New insn and split pattern which deals
> 	with the cases not handled by vsx_extract_v4si_w1.
> 
> gcc/testsuite/
> 	PR target/106769
> 	* gcc.target/powerpc/pr106769.h: New.
> 	* gcc.target/powerpc/pr106769-p8.c: New.
> 	* gcc.target/powerpc/pr106769-p9.c: New.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 0a34ceebeb5..0065b76fef8 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2_<mode>_1"
>  (define_expand  "vsx_extract_<mode>"
>    [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand")
>  		   (vec_select:<VEC_base>
> -		    (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
> +		    (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
>  		    (parallel [(match_operand:QI 2 "const_int_operand")])))
> -	      (clobber (match_scratch:VSX_EXTRACT_I 3))])]
> +	      (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
>    "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
>  {
>    /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
> @@ -3736,6 +3736,63 @@ (define_expand  "vsx_extract_<mode>"
>      }
>  })
> 
> +(define_expand  "vsx_extract_v4si"
> +  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
> +		   (vec_select:SI
> +		    (match_operand:V4SI 1 "gpc_reg_operand")
> +		    (parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
> +	      (clobber (match_scratch:V4SI 3))])]
> +  "TARGET_DIRECT_MOVE_64BIT"
> +{
> +  /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx.  So just
> +     fall through to vsx_extract_v4si_w1.  */
> +  if (TARGET_P9_VECTOR
> +      && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
> +    {
> +      emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
> +					  operands[2]));
> +      DONE;
> +    }
> +})
> +
> +/* Extract from word 1 (BE order).  */

Nit: Use semicolon ";" for comments to keep consistent with the others
and what the doc says.

> +(define_insn "vsx_extract_v4si_w1"
> +  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
> +	(vec_select:SI
> +	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
> +	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
> +   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
> +{
> +   if (which_alternative == 0)
> +     return "mfvsrwz %0,%x1";
> +
> +   if (which_alternative == 1)
> +     return "xxlor %x0,%x1,%x1";
> +
> +   if (which_alternative == 2)
> +     return "stxsiwx %x1,%y0";
> +
> +   return ASM_COMMENT_START " vec_extract to same register";
> +}
> +  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
> +   (set_attr "length" "4,4,4,0")
> +   (set_attr "isa" "p8v,*,p8v,*")])
> +
> +(define_insn "*mfvsrwz"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +	(zero_extend:DI
> +	  (vec_select:SI
> +	    (match_operand:V4SI 1 "vsx_register_operand" "wa")
> +	    (parallel [(match_operand:QI 2 "const_int_operand" "n")]))))
> +   (clobber (match_scratch:V4SI 3 "=v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
> +  "mfvsrwz %0,%x1"
> +  [(set_attr "type" "mfvsr")
> +   (set_attr "isa" "p8v")])
> +
>  (define_insn "vsx_extract_<mode>_p9"
>    [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>")
>  	(vec_select:<VEC_base>
> @@ -3798,7 +3855,7 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9"
>  	  (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>")
>  	  (parallel [(match_operand:QI 2 "const_int_operand" "n,n")]))))
>     (clobber (match_scratch:SI 3 "=r,X"))]
> -  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB"
> +  "TARGET_VEXTRACTUB"

Nit: I understand this change is based on TARGET_VEXTRACTUB implying
the condition VECTOR_MEM_VSX_P (<MODE>mode) always holds, but this line
change isn't necessary in this patch and we have a few conditions like
this, so I prefer to just leave this alone.

>    "#"
>    "&& reload_completed"
>    [(parallel [(set (match_dup 4)
> @@ -3807,6 +3864,9 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9"
>  		    (parallel [(match_dup 2)])))
>  	      (clobber (match_dup 3))])]
>  {
> +  gcc_assert (<MODE>mode != V4SImode
> +	      || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2));
> +
>    operands[4] = gen_rtx_REG (<VEC_base>mode, REGNO (operands[0]));
>  }
>    [(set_attr "isa" "p9v,*")])
> @@ -3830,58 +3890,43 @@ (define_insn_and_split "*vsx_extract_<mode>_store_p9"
>     (set (match_dup 0)
>  	(match_dup 3))])
> 
> -(define_insn_and_split  "*vsx_extract_si"
> +/* Extract from word 0, 2, 3 (BE order).  */

Nit: Use ";".

> +(define_insn_and_split "*vsx_extract_v4si_not_w1"

Nit: Maybe name it "...v4si_w023" to match the comment?


>    [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z")
>  	(vec_select:SI
>  	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v")
>  	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")])))
>     (clobber (match_scratch:V4SI 3 "=v,v,v"))]
> -  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && !TARGET_P9_VECTOR"
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)"

I think the case "INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
should be already matched by vsx_extract_v4si_w1 above, so move this
...

>    "#"
> -  "&& reload_completed"
> +  "&& 1"
>    [(const_int 0)]
>  {
> +  gcc_assert (!TARGET_P9_VECTOR);

... here as an assertion too.

> +
>    rtx dest = operands[0];
>    rtx src = operands[1];
>    rtx element = operands[2];
> -  rtx vec_tmp = operands[3];
> -  int value;
> +  rtx vec_tmp;
> +
> +  if (GET_CODE (operands[3]) == SCRATCH)
> +    vec_tmp = gen_reg_rtx (V4SImode);
> +  else
> +    vec_tmp = operands[3];
> 
>    /* Adjust index for LE element ordering, the below minuend 3 is computed by
>       GET_MODE_NUNITS (V4SImode) - 1.  */
>    if (!BYTES_BIG_ENDIAN)
>      element = GEN_INT (3 - INTVAL (element));
> 
> -  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
> -     instruction.  */
> -  value = INTVAL (element);
> -  if (value != 1)
> -    emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));
> -  else
> -    vec_tmp = src;
> -
> -  if (MEM_P (operands[0]))
> -    {
> -      if (can_create_pseudo_p ())
> -	dest = rs6000_force_indexed_or_indirect_mem (dest);
> -
> -      if (TARGET_P8_VECTOR)
> -	emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
> -      else
> -	emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp))));
> -    }
> +  emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));
> 
> -  else if (TARGET_P8_VECTOR)
> -    emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
> -  else
> -    emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)),
> -		    gen_rtx_REG (DImode, REGNO (vec_tmp)));
> +  int value = BYTES_BIG_ENDIAN ? 1 : 2;
> +  emit_insn (gen_vsx_extract_v4si_w1 (dest, vec_tmp, GEN_INT (value)));
> 
>    DONE;
> -}
> -  [(set_attr "type" "mfvsr,vecperm,fpstore")
> -   (set_attr "length" "8")
> -   (set_attr "isa" "*,p8v,*")])
> +})
> 
>  (define_insn_and_split  "*vsx_extract_<mode>_p8"
>    [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r")
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
> new file mode 100644
> index 00000000000..e7cdbc76298
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-require-effective-target powerpc_p8vector_ok } */
> +/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +#include "pr106769.h"
> +
> +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
> +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
> +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
> new file mode 100644
> index 00000000000..2205e434a86
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +#include "pr106769.h"
> +
> +/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
> +/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
> +/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
> +/* { dg-final { scan-assembler-not {\mxxextractuw\M} } } */
> +/* { dg-final { scan-assembler-not "vextuw\[rl\]x" } } */

Nit: I think the below with \m and \M should work:

/* { dg-final { scan-assembler-not {\mvextuw[rl]x\M} } } */

Okay for trunk with all the nits above tweaked, thanks!

BR,
Kewen

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCHv2, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]
@ 2023-07-25  2:10 HAO CHEN GUI
  2023-07-31  5:43 ` Kewen.Lin
  0 siblings, 1 reply; 4+ messages in thread
From: HAO CHEN GUI @ 2023-07-25  2:10 UTC (permalink / raw)
  To: gcc-patches; +Cc: Segher Boessenkool, David, Kewen.Lin, Peter Bergner

Hi,
  This patch modifies the vsx extract expander to generate mfvsrwz/stxsiwx
for all subtargets when the mode is V4SI and the index of the extracted
element is 1 for BE and 2 for LE.  It also adds an insn pattern for mfvsrwz
which helps eliminate the redundant zero extension.

  Compared to the last version, the main change is to move
"vsx_extract_v4si_w1" and "*mfvsrwz" in front of "*vsx_extract_<mode>_di_p9".
Also, some insn conditions are changed to assertions.
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625128.html

  Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.

Thanks
Gui Haochen

ChangeLog
rs6000: Generate mfvsrwz for all subtargets and remove redundant zero extend

mfvsrwz has lower latency than xxextractuw or vextuw[lr]x, so it should be
generated even with p9 vector enabled.  Also, the result of the instruction
is already zero extended, so a combine pattern is needed to eliminate the
redundant zero extend instructions.

gcc/
	PR target/106769
	* config/rs6000/vsx.md (expand vsx_extract_<mode>): Set it only
	for V8HI and V16QI.
	(vsx_extract_v4si): New expand for V4SI extraction.
	(vsx_extract_v4si_w1): New insn pattern for V4SI extraction
	when the index of extracted element is 1 with BE and 2 with LE.
	(*mfvsrwz): New insn pattern.
	(*vsx_extract_<mode>_di_p9): Do not generate the insn when the index
	of extracted element is 1 with BE and 2 with LE.
	(*vsx_extract_si): Remove.
	(*vsx_extract_v4si_not_w1): New insn and split pattern which deals
	with the cases not handled by vsx_extract_v4si_w1.

gcc/testsuite/
	PR target/106769
	* gcc.target/powerpc/pr106769.h: New.
	* gcc.target/powerpc/pr106769-p8.c: New.
	* gcc.target/powerpc/pr106769-p9.c: New.

patch.diff
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 0a34ceebeb5..0065b76fef8 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2_<mode>_1"
 (define_expand  "vsx_extract_<mode>"
   [(parallel [(set (match_operand:<VEC_base> 0 "gpc_reg_operand")
 		   (vec_select:<VEC_base>
-		    (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
+		    (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
 		    (parallel [(match_operand:QI 2 "const_int_operand")])))
-	      (clobber (match_scratch:VSX_EXTRACT_I 3))])]
+	      (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
   "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_DIRECT_MOVE_64BIT"
 {
   /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
@@ -3736,6 +3736,63 @@ (define_expand  "vsx_extract_<mode>"
     }
 })

+(define_expand  "vsx_extract_v4si"
+  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
+		   (vec_select:SI
+		    (match_operand:V4SI 1 "gpc_reg_operand")
+		    (parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
+	      (clobber (match_scratch:V4SI 3))])]
+  "TARGET_DIRECT_MOVE_64BIT"
+{
+  /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx.  So just
+     fall through to vsx_extract_v4si_w1.  */
+  if (TARGET_P9_VECTOR
+      && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
+    {
+      emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
+					  operands[2]));
+      DONE;
+    }
+})
+
+/* Extract from word 1 (BE order).  */
+(define_insn "vsx_extract_v4si_w1"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
+	(vec_select:SI
+	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
+	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
+   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
+{
+   if (which_alternative == 0)
+     return "mfvsrwz %0,%x1";
+
+   if (which_alternative == 1)
+     return "xxlor %x0,%x1,%x1";
+
+   if (which_alternative == 2)
+     return "stxsiwx %x1,%y0";
+
+   return ASM_COMMENT_START " vec_extract to same register";
+}
+  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
+   (set_attr "length" "4,4,4,0")
+   (set_attr "isa" "p8v,*,p8v,*")])
+
+(define_insn "*mfvsrwz"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "vsx_register_operand" "wa")
+	    (parallel [(match_operand:QI 2 "const_int_operand" "n")]))))
+   (clobber (match_scratch:V4SI 3 "=v"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
+  "mfvsrwz %0,%x1"
+  [(set_attr "type" "mfvsr")
+   (set_attr "isa" "p8v")])
+
 (define_insn "vsx_extract_<mode>_p9"
   [(set (match_operand:<VEC_base> 0 "gpc_reg_operand" "=r,<VSX_EX>")
 	(vec_select:<VEC_base>
@@ -3798,7 +3855,7 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9"
 	  (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand" "v,<VSX_EX>")
 	  (parallel [(match_operand:QI 2 "const_int_operand" "n,n")]))))
    (clobber (match_scratch:SI 3 "=r,X"))]
-  "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_VEXTRACTUB"
+  "TARGET_VEXTRACTUB"
   "#"
   "&& reload_completed"
   [(parallel [(set (match_dup 4)
@@ -3807,6 +3864,9 @@ (define_insn_and_split "*vsx_extract_<mode>_di_p9"
 		    (parallel [(match_dup 2)])))
 	      (clobber (match_dup 3))])]
 {
+  gcc_assert (<MODE>mode != V4SImode
+	      || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2));
+
   operands[4] = gen_rtx_REG (<VEC_base>mode, REGNO (operands[0]));
 }
   [(set_attr "isa" "p9v,*")])
@@ -3830,58 +3890,43 @@ (define_insn_and_split "*vsx_extract_<mode>_store_p9"
    (set (match_dup 0)
 	(match_dup 3))])

-(define_insn_and_split  "*vsx_extract_si"
+/* Extract from word 0, 2, 3 (BE order).  */
+(define_insn_and_split "*vsx_extract_v4si_not_w1"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z")
 	(vec_select:SI
 	 (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v")
 	 (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n")])))
    (clobber (match_scratch:V4SI 3 "=v,v,v"))]
-  "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT && !TARGET_P9_VECTOR"
+  "TARGET_DIRECT_MOVE_64BIT
+   && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2)"
   "#"
-  "&& reload_completed"
+  "&& 1"
   [(const_int 0)]
 {
+  gcc_assert (!TARGET_P9_VECTOR);
+
   rtx dest = operands[0];
   rtx src = operands[1];
   rtx element = operands[2];
-  rtx vec_tmp = operands[3];
-  int value;
+  rtx vec_tmp;
+
+  if (GET_CODE (operands[3]) == SCRATCH)
+    vec_tmp = gen_reg_rtx (V4SImode);
+  else
+    vec_tmp = operands[3];

   /* Adjust index for LE element ordering, the below minuend 3 is computed by
      GET_MODE_NUNITS (V4SImode) - 1.  */
   if (!BYTES_BIG_ENDIAN)
     element = GEN_INT (3 - INTVAL (element));

-  /* If the value is in the correct position, we can avoid doing the VSPLT<x>
-     instruction.  */
-  value = INTVAL (element);
-  if (value != 1)
-    emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));
-  else
-    vec_tmp = src;
-
-  if (MEM_P (operands[0]))
-    {
-      if (can_create_pseudo_p ())
-	dest = rs6000_force_indexed_or_indirect_mem (dest);
-
-      if (TARGET_P8_VECTOR)
-	emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
-      else
-	emit_insn (gen_stfiwx (dest, gen_rtx_REG (DImode, REGNO (vec_tmp))));
-    }
+  emit_insn (gen_altivec_vspltw_direct (vec_tmp, src, element));

-  else if (TARGET_P8_VECTOR)
-    emit_move_insn (dest, gen_rtx_REG (SImode, REGNO (vec_tmp)));
-  else
-    emit_move_insn (gen_rtx_REG (DImode, REGNO (dest)),
-		    gen_rtx_REG (DImode, REGNO (vec_tmp)));
+  int value = BYTES_BIG_ENDIAN ? 1 : 2;
+  emit_insn (gen_vsx_extract_v4si_w1 (dest, vec_tmp, GEN_INT (value)));

   DONE;
-}
-  [(set_attr "type" "mfvsr,vecperm,fpstore")
-   (set_attr "length" "8")
-   (set_attr "isa" "*,p8v,*")])
+})

 (define_insn_and_split  "*vsx_extract_<mode>_p8"
   [(set (match_operand:<VEC_base> 0 "nonimmediate_operand" "=r")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
new file mode 100644
index 00000000000..e7cdbc76298
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p8.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+#include "pr106769.h"
+
+/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
+/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
+/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
new file mode 100644
index 00000000000..2205e434a86
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr106769-p9.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+#include "pr106769.h"
+
+/* { dg-final { scan-assembler {\mmfvsrwz\M} } } */
+/* { dg-final { scan-assembler {\mstxsiwx\M} } } */
+/* { dg-final { scan-assembler-not {\mrldicl\M} } } */
+/* { dg-final { scan-assembler-not {\mxxextractuw\M} } } */
+/* { dg-final { scan-assembler-not "vextuw\[rl\]x" } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr106769.h b/gcc/testsuite/gcc.target/powerpc/pr106769.h
new file mode 100644
index 00000000000..1c8c8a024f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr106769.h
@@ -0,0 +1,17 @@
+#include <altivec.h>
+
+#ifdef __BIG_ENDIAN__
+#define LANE 1
+#else
+#define LANE 2
+#endif
+
+unsigned int foo1 (vector unsigned int v)
+{
+   return vec_extract(v, LANE);
+}
+
+void foo2 (vector unsigned int v, unsigned int* p)
+{
+   *p = vec_extract(v, LANE);
+}

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-07-31  5:44 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-21  1:32 [PATCHv2, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769] HAO CHEN GUI
2023-07-24  2:43 ` Kewen.Lin
2023-07-25  2:10 HAO CHEN GUI
2023-07-31  5:43 ` Kewen.Lin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).