public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868]
@ 2021-10-25  2:50 Xionghu Luo
  2021-10-27 13:24 ` David Edelsohn
  0 siblings, 1 reply; 4+ messages in thread
From: Xionghu Luo @ 2021-10-25  2:50 UTC (permalink / raw)
  To: gcc-patches; +Cc: segher, dje.gcc, wschmidt, guojiufu, linkw, Xionghu Luo

If the second operand of __builtin_shuffle is a const zero vector, and it is
used with a specific mask, it can be optimized to vspltisw+xxpermdi instead of lxv.

gcc/ChangeLog:

	* config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
	pattern matching and emission for VSX xxpermdi.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/pr102868.c: New test.
---
 gcc/config/rs6000/rs6000.c                  | 47 ++++++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr102868.c | 53 +++++++++++++++++++++
 2 files changed, 97 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102868.c

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index d0730253bcc..5d802c1fa96 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -23046,7 +23046,23 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
     {OPTION_MASK_P8_VECTOR,
      BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgow_v4sf_direct
 		      : CODE_FOR_p8_vmrgew_v4sf_direct,
-     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}}};
+     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}},
+    {OPTION_MASK_VSX,
+     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+		       : CODE_FOR_vsx_xxpermdi_v16qi),
+     {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}},
+    {OPTION_MASK_VSX,
+     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+		       : CODE_FOR_vsx_xxpermdi_v16qi),
+     {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+    {OPTION_MASK_VSX,
+     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+		       : CODE_FOR_vsx_xxpermdi_v16qi),
+     {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}},
+    {OPTION_MASK_VSX,
+     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
+		       : CODE_FOR_vsx_xxpermdi_v16qi),
+     {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}}};
 
   unsigned int i, j, elt, which;
   unsigned char perm[16];
@@ -23169,6 +23185,27 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
 	  machine_mode omode = insn_data[icode].operand[0].mode;
 	  machine_mode imode = insn_data[icode].operand[1].mode;
 
+	  rtx perm_idx = GEN_INT (0);
+	  if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
+	    {
+	      int perm_val = 0;
+	      if (one_vec)
+		{
+		  if (perm[0] == 8)
+		    perm_val |= 2;
+		  if (perm[8] == 8)
+		    perm_val |= 1;
+		}
+	      else
+		{
+		  if (perm[0] != 0)
+		    perm_val |= 2;
+		  if (perm[8] != 16)
+		    perm_val |= 1;
+		}
+	      perm_idx = GEN_INT (perm_val);
+	    }
+
 	  /* For little-endian, don't use vpkuwum and vpkuhum if the
 	     underlying vector type is not V4SI and V8HI, respectively.
 	     For example, using vpkuwum with a V8HI picks up the even
@@ -23192,7 +23229,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
           /* For little-endian, the two input operands must be swapped
              (or swapped back) to ensure proper right-to-left numbering
              from 0 to 2N-1.  */
-	  if (swapped ^ !BYTES_BIG_ENDIAN)
+	  if (swapped ^ !BYTES_BIG_ENDIAN
+	      && icode != CODE_FOR_vsx_xxpermdi_v16qi)
 	    std::swap (op0, op1);
 	  if (imode != V16QImode)
 	    {
@@ -23203,7 +23241,10 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
 	    x = target;
 	  else
 	    x = gen_reg_rtx (omode);
-	  emit_insn (GEN_FCN (icode) (x, op0, op1));
+	  if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
+	    emit_insn (GEN_FCN (icode) (x, op0, op1, perm_idx));
+	  else
+	    emit_insn (GEN_FCN (icode) (x, op0, op1));
 	  if (omode != V16QImode)
 	    emit_move_insn (target, gen_lowpart (V16QImode, x));
 	  return true;
diff --git a/gcc/testsuite/gcc.target/powerpc/pr102868.c b/gcc/testsuite/gcc.target/powerpc/pr102868.c
new file mode 100644
index 00000000000..eb45d193f66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr102868.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+#include <altivec.h>
+vector float b = {0.0f, 0.0f, 0.0f, 0.0f};
+
+
+vector float foo1 (vector float x)
+{
+  vector int c = {0, 1, 4, 5};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector float foo2 (vector float x)
+{
+  vector int c = {2, 3, 4, 5};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector float foo3 (vector float x)
+{
+  vector int c = {0, 1, 6, 7};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector float foo4 (vector float x)
+{
+  vector int c = {2, 3, 6, 7};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector unsigned char foo5 (vector unsigned char x)
+{
+  vector unsigned char c = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+  return __builtin_shuffle (x, c);
+}
+
+vector unsigned char foo6 (vector unsigned char x)
+{
+  vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
+  return __builtin_shuffle (x, c);
+}
+
+vector unsigned char foo7 (vector unsigned char x)
+{
+  vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+  return __builtin_shuffle (x, c);
+}
+
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target has_arch_pwr9 } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target { {! has_arch_pwr9} && be } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 11 { target { {! has_arch_pwr9} && le } } } } */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868]
  2021-10-25  2:50 [PATCH] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868] Xionghu Luo
@ 2021-10-27 13:24 ` David Edelsohn
  2021-10-28  5:38   ` [PATCH v2] " Xionghu Luo
  0 siblings, 1 reply; 4+ messages in thread
From: David Edelsohn @ 2021-10-27 13:24 UTC (permalink / raw)
  To: Xionghu Luo
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, guojiufu, linkw

On Sun, Oct 24, 2021 at 10:51 PM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>
> If the second operand of __builtin_shuffle is const vector 0, and with
> specific mask, it can be optimized to vspltisw+xxpermdi instead of lxv.
>
> gcc/ChangeLog:
>
>         * config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
>         patterns match and emit for VSX xxpermdi.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/powerpc/pr102868.c: New test.
> ---
>  gcc/config/rs6000/rs6000.c                  | 47 ++++++++++++++++--
>  gcc/testsuite/gcc.target/powerpc/pr102868.c | 53 +++++++++++++++++++++
>  2 files changed, 97 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102868.c
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index d0730253bcc..5d802c1fa96 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -23046,7 +23046,23 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>      {OPTION_MASK_P8_VECTOR,
>       BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgow_v4sf_direct
>                       : CODE_FOR_p8_vmrgew_v4sf_direct,
> -     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}}};
> +     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}},
> +    {OPTION_MASK_VSX,
> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> +     {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}},
> +    {OPTION_MASK_VSX,
> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> +     {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
> +    {OPTION_MASK_VSX,
> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> +     {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}},
> +    {OPTION_MASK_VSX,
> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> +     {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}}};

If the insn_code is the same for big endian and little endian, why
does the new code test BYTES_BIG_ENDIAN to set the same value
(CODE_FOR_vsx_xxpermdi_v16qi)?

Thanks, David

>
>    unsigned int i, j, elt, which;
>    unsigned char perm[16];
> @@ -23169,6 +23185,27 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>           machine_mode omode = insn_data[icode].operand[0].mode;
>           machine_mode imode = insn_data[icode].operand[1].mode;
>
> +         rtx perm_idx = GEN_INT (0);
> +         if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
> +           {
> +             int perm_val = 0;
> +             if (one_vec)
> +               {
> +                 if (perm[0] == 8)
> +                   perm_val |= 2;
> +                 if (perm[8] == 8)
> +                   perm_val |= 1;
> +               }
> +             else
> +               {
> +                 if (perm[0] != 0)
> +                   perm_val |= 2;
> +                 if (perm[8] != 16)
> +                   perm_val |= 1;
> +               }
> +             perm_idx = GEN_INT (perm_val);
> +           }
> +
>           /* For little-endian, don't use vpkuwum and vpkuhum if the
>              underlying vector type is not V4SI and V8HI, respectively.
>              For example, using vpkuwum with a V8HI picks up the even
> @@ -23192,7 +23229,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>            /* For little-endian, the two input operands must be swapped
>               (or swapped back) to ensure proper right-to-left numbering
>               from 0 to 2N-1.  */
> -         if (swapped ^ !BYTES_BIG_ENDIAN)
> +         if (swapped ^ !BYTES_BIG_ENDIAN
> +             && icode != CODE_FOR_vsx_xxpermdi_v16qi)
>             std::swap (op0, op1);
>           if (imode != V16QImode)
>             {
> @@ -23203,7 +23241,10 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>             x = target;
>           else
>             x = gen_reg_rtx (omode);
> -         emit_insn (GEN_FCN (icode) (x, op0, op1));
> +         if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
> +           emit_insn (GEN_FCN (icode) (x, op0, op1, perm_idx));
> +         else
> +           emit_insn (GEN_FCN (icode) (x, op0, op1));
>           if (omode != V16QImode)
>             emit_move_insn (target, gen_lowpart (V16QImode, x));
>           return true;
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr102868.c b/gcc/testsuite/gcc.target/powerpc/pr102868.c
> new file mode 100644
> index 00000000000..eb45d193f66
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr102868.c
> @@ -0,0 +1,53 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +/* { dg-options "-O2 -mvsx" } */
> +
> +#include <altivec.h>
> +vector float b = {0.0f, 0.0f, 0.0f, 0.0f};
> +
> +
> +vector float foo1 (vector float x)
> +{
> +  vector int c = {0, 1, 4, 5};
> +  return __builtin_shuffle (x, b, c);
> +}
> +
> +vector float foo2 (vector float x)
> +{
> +  vector int c = {2, 3, 4, 5};
> +  return __builtin_shuffle (x, b, c);
> +}
> +
> +vector float foo3 (vector float x)
> +{
> +  vector int c = {0, 1, 6, 7};
> +  return __builtin_shuffle (x, b, c);
> +}
> +
> +vector float foo4 (vector float x)
> +{
> +  vector int c = {2, 3, 6, 7};
> +  return __builtin_shuffle (x, b, c);
> +}
> +
> +vector unsigned char foo5 (vector unsigned char x)
> +{
> +  vector unsigned char c = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
> +  return __builtin_shuffle (x, c);
> +}
> +
> +vector unsigned char foo6 (vector unsigned char x)
> +{
> +  vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
> +  return __builtin_shuffle (x, c);
> +}
> +
> +vector unsigned char foo7 (vector unsigned char x)
> +{
> +  vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
> +  return __builtin_shuffle (x, c);
> +}
> +
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target has_arch_pwr9 } } } */
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target { {! has_arch_pwr9} && be } } } } */
> +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 11 { target { {! has_arch_pwr9} && le } } } } */
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH v2] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868]
  2021-10-27 13:24 ` David Edelsohn
@ 2021-10-28  5:38   ` Xionghu Luo
  2021-10-28 15:00     ` David Edelsohn
  0 siblings, 1 reply; 4+ messages in thread
From: Xionghu Luo @ 2021-10-28  5:38 UTC (permalink / raw)
  To: David Edelsohn
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, guojiufu, linkw



On 2021/10/27 21:24, David Edelsohn wrote:
> On Sun, Oct 24, 2021 at 10:51 PM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>>
>> If the second operand of __builtin_shuffle is const vector 0, and with
>> specific mask, it can be optimized to vspltisw+xxpermdi instead of lxv.
>>
>> gcc/ChangeLog:
>>
>>         * config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
>>         patterns match and emit for VSX xxpermdi.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         * gcc.target/powerpc/pr102868.c: New test.
>> ---
>>  gcc/config/rs6000/rs6000.c                  | 47 ++++++++++++++++--
>>  gcc/testsuite/gcc.target/powerpc/pr102868.c | 53 +++++++++++++++++++++
>>  2 files changed, 97 insertions(+), 3 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102868.c
>>
>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> index d0730253bcc..5d802c1fa96 100644
>> --- a/gcc/config/rs6000/rs6000.c
>> +++ b/gcc/config/rs6000/rs6000.c
>> @@ -23046,7 +23046,23 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
>>      {OPTION_MASK_P8_VECTOR,
>>       BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgow_v4sf_direct
>>                       : CODE_FOR_p8_vmrgew_v4sf_direct,
>> -     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}}};
>> +     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}},
>> +    {OPTION_MASK_VSX,
>> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
>> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
>> +     {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}},
>> +    {OPTION_MASK_VSX,
>> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
>> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
>> +     {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
>> +    {OPTION_MASK_VSX,
>> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
>> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
>> +     {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}},
>> +    {OPTION_MASK_VSX,
>> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
>> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
>> +     {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}}};
> 
> If the insn_code is the same for big endian and little endian, why
> does the new code test BYTES_BIG_ENDIAN to set the same value
> (CODE_FOR_vsx_xxpermdi_v16qi)?
> 

Thanks for the catch, updated the patch as below:


[PATCH v2] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868]

If the second operand of __builtin_shuffle is a const zero vector, and it is
used with a specific mask, it can be optimized to vspltisw+xxpermdi instead of lxv.

gcc/ChangeLog:

	* config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
	pattern matching and emission for VSX xxpermdi.

gcc/testsuite/ChangeLog:

	* gcc.target/powerpc/pr102868.c: New test.
---
 gcc/config/rs6000/rs6000.c                  | 39 +++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr102868.c | 53 +++++++++++++++++++++
 2 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102868.c

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index d0730253bcc..533560bb9ba 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -23046,7 +23046,15 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
     {OPTION_MASK_P8_VECTOR,
      BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgow_v4sf_direct
 		      : CODE_FOR_p8_vmrgew_v4sf_direct,
-     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}}};
+     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}},
+    {OPTION_MASK_VSX, CODE_FOR_vsx_xxpermdi_v16qi,
+     {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}},
+    {OPTION_MASK_VSX, CODE_FOR_vsx_xxpermdi_v16qi,
+     {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+    {OPTION_MASK_VSX, CODE_FOR_vsx_xxpermdi_v16qi,
+     {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}},
+    {OPTION_MASK_VSX, CODE_FOR_vsx_xxpermdi_v16qi,
+     {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}}};
 
   unsigned int i, j, elt, which;
   unsigned char perm[16];
@@ -23169,6 +23177,27 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
 	  machine_mode omode = insn_data[icode].operand[0].mode;
 	  machine_mode imode = insn_data[icode].operand[1].mode;
 
+	  rtx perm_idx = GEN_INT (0);
+	  if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
+	    {
+	      int perm_val = 0;
+	      if (one_vec)
+		{
+		  if (perm[0] == 8)
+		    perm_val |= 2;
+		  if (perm[8] == 8)
+		    perm_val |= 1;
+		}
+	      else
+		{
+		  if (perm[0] != 0)
+		    perm_val |= 2;
+		  if (perm[8] != 16)
+		    perm_val |= 1;
+		}
+	      perm_idx = GEN_INT (perm_val);
+	    }
+
 	  /* For little-endian, don't use vpkuwum and vpkuhum if the
 	     underlying vector type is not V4SI and V8HI, respectively.
 	     For example, using vpkuwum with a V8HI picks up the even
@@ -23192,7 +23221,8 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
           /* For little-endian, the two input operands must be swapped
              (or swapped back) to ensure proper right-to-left numbering
              from 0 to 2N-1.  */
-	  if (swapped ^ !BYTES_BIG_ENDIAN)
+	  if (swapped ^ !BYTES_BIG_ENDIAN
+	      && icode != CODE_FOR_vsx_xxpermdi_v16qi)
 	    std::swap (op0, op1);
 	  if (imode != V16QImode)
 	    {
@@ -23203,7 +23233,10 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
 	    x = target;
 	  else
 	    x = gen_reg_rtx (omode);
-	  emit_insn (GEN_FCN (icode) (x, op0, op1));
+	  if (icode == CODE_FOR_vsx_xxpermdi_v16qi)
+	    emit_insn (GEN_FCN (icode) (x, op0, op1, perm_idx));
+	  else
+	    emit_insn (GEN_FCN (icode) (x, op0, op1));
 	  if (omode != V16QImode)
 	    emit_move_insn (target, gen_lowpart (V16QImode, x));
 	  return true;
diff --git a/gcc/testsuite/gcc.target/powerpc/pr102868.c b/gcc/testsuite/gcc.target/powerpc/pr102868.c
new file mode 100644
index 00000000000..eb45d193f66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr102868.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+#include <altivec.h>
+vector float b = {0.0f, 0.0f, 0.0f, 0.0f};
+
+
+vector float foo1 (vector float x)
+{
+  vector int c = {0, 1, 4, 5};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector float foo2 (vector float x)
+{
+  vector int c = {2, 3, 4, 5};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector float foo3 (vector float x)
+{
+  vector int c = {0, 1, 6, 7};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector float foo4 (vector float x)
+{
+  vector int c = {2, 3, 6, 7};
+  return __builtin_shuffle (x, b, c);
+}
+
+vector unsigned char foo5 (vector unsigned char x)
+{
+  vector unsigned char c = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+  return __builtin_shuffle (x, c);
+}
+
+vector unsigned char foo6 (vector unsigned char x)
+{
+  vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
+  return __builtin_shuffle (x, c);
+}
+
+vector unsigned char foo7 (vector unsigned char x)
+{
+  vector unsigned char c = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+  return __builtin_shuffle (x, c);
+}
+
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target has_arch_pwr9 } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 7 { target { {! has_arch_pwr9} && be } } } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 11 { target { {! has_arch_pwr9} && le } } } } */
-- 
2.25.1

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868]
  2021-10-28  5:38   ` [PATCH v2] " Xionghu Luo
@ 2021-10-28 15:00     ` David Edelsohn
  0 siblings, 0 replies; 4+ messages in thread
From: David Edelsohn @ 2021-10-28 15:00 UTC (permalink / raw)
  To: Xionghu Luo
  Cc: GCC Patches, Segher Boessenkool, Bill Schmidt, guojiufu, linkw

On Thu, Oct 28, 2021 at 1:39 AM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
>
> On 2021/10/27 21:24, David Edelsohn wrote:
> > On Sun, Oct 24, 2021 at 10:51 PM Xionghu Luo <luoxhu@linux.ibm.com> wrote:
> >>
> >> If the second operand of __builtin_shuffle is const vector 0, and with
> >> specific mask, it can be optimized to vspltisw+xxpermdi instead of lxv.
> >>
> >> gcc/ChangeLog:
> >>
> >>         * config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
> >>         patterns match and emit for VSX xxpermdi.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >>         * gcc.target/powerpc/pr102868.c: New test.
> >> ---
> >>  gcc/config/rs6000/rs6000.c                  | 47 ++++++++++++++++--
> >>  gcc/testsuite/gcc.target/powerpc/pr102868.c | 53 +++++++++++++++++++++
> >>  2 files changed, 97 insertions(+), 3 deletions(-)
> >>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr102868.c
> >>
> >> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> >> index d0730253bcc..5d802c1fa96 100644
> >> --- a/gcc/config/rs6000/rs6000.c
> >> +++ b/gcc/config/rs6000/rs6000.c
> >> @@ -23046,7 +23046,23 @@ altivec_expand_vec_perm_const (rtx target, rtx op0, rtx op1,
> >>      {OPTION_MASK_P8_VECTOR,
> >>       BYTES_BIG_ENDIAN ? CODE_FOR_p8_vmrgow_v4sf_direct
> >>                       : CODE_FOR_p8_vmrgew_v4sf_direct,
> >> -     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}}};
> >> +     {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31}},
> >> +    {OPTION_MASK_VSX,
> >> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> >> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> >> +     {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23}},
> >> +    {OPTION_MASK_VSX,
> >> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> >> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> >> +     {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
> >> +    {OPTION_MASK_VSX,
> >> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> >> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> >> +     {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}},
> >> +    {OPTION_MASK_VSX,
> >> +     (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_xxpermdi_v16qi
> >> +                      : CODE_FOR_vsx_xxpermdi_v16qi),
> >> +     {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31}}};
> >
> > If the insn_code is the same for big endian and little endian, why
> > does the new code test BYTES_BIG_ENDIAN to set the same value
> > (CODE_FOR_vsx_xxpermdi_v16qi)?
> >
>
> Thanks for the catch, updated the patch as below:
>
> [PATCH v2] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868]
>
> If the second operand of __builtin_shuffle is const vector 0, and with
> specific mask, it can be optimized to vspltisw+xxpermdi instead of lxv.
>
> gcc/ChangeLog:
>
>         * config/rs6000/rs6000.c (altivec_expand_vec_perm_const): Add
>         patterns match and emit for VSX xxpermdi.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/powerpc/pr102868.c: New test.

Okay.

Thanks, David

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2021-10-28 15:00 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-25  2:50 [PATCH] rs6000: Optimize __builtin_shuffle when it's used to zero the upper bits [PR102868] Xionghu Luo
2021-10-27 13:24 ` David Edelsohn
2021-10-28  5:38   ` [PATCH v2] " Xionghu Luo
2021-10-28 15:00     ` David Edelsohn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).