[PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.
@ 2024-05-21  5:12 liuhongt
  2024-05-21  5:12 ` [PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication liuhongt
  2024-05-24  2:25 ` [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode Hongtao Liu
  0 siblings, 2 replies; 8+ messages in thread
From: liuhongt @ 2024-05-21  5:12 UTC (permalink / raw)
  To: gcc-patches; +Cc: crazylht, hjl.tools

When mask is ((1 << (prec - imm)) - 1), which clears the upper imm bits
of each element of the shift result, the AND can be simplified to LSHIFTRT.

i.e. simplify
(and:v8hi
  (ashiftrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshiftrt:v8hi A 8)

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/114428
	* simplify-rtx.cc
	(simplify_context::simplify_binary_operation_1):
	Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
	specific mask.
---
 gcc/simplify-rtx.cc | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 53f54d1d392..6c91409200e 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4021,6 +4021,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	    return tem;
 	}
 
+      /* (and:v4si
+	   (ashiftrt:v4si A 16)
+	   (const_vector: 0xffff x4))
+	 is just (lshiftrt:v4si A 16).  */
+      if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+	  && (CONST_INT_P (XEXP (op0, 1))
+	      || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+		  && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))))
+	  && GET_CODE (op1) == CONST_VECTOR
+	  && CONST_VECTOR_DUPLICATE_P (op1))
+	{
+	  unsigned HOST_WIDE_INT shift_count
+	    = (CONST_INT_P (XEXP (op0, 1))
+	       ? UINTVAL (XEXP (op0, 1))
+	       : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+	  unsigned HOST_WIDE_INT inner_prec
+	    = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+	  /* Avoid UD shift count.  */
+	  if (shift_count < inner_prec
+	      && (UINTVAL (XVECEXP (op1, 0, 0))
+		  == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+	    return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
+	}
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
-- 
2.31.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication
  2024-05-21  5:12 [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode liuhongt
@ 2024-05-21  5:12 ` liuhongt
  2024-05-21  6:50   ` Uros Bizjak
  2024-05-24  2:25 ` [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode Hongtao Liu
  1 sibling, 1 reply; 8+ messages in thread
From: liuhongt @ 2024-05-21  5:12 UTC (permalink / raw)
  To: gcc-patches; +Cc: crazylht, hjl.tools

For a CONST_VECTOR_DUPLICATE_P in the constant pool, the load is just a
broadcast or one of the variants in ix86_vector_duplicate_simode_const.
Adjust the cost to COSTS_N_INSNS (2) + speed, which should be a little
bit larger than the cost of a broadcast.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:
	PR target/114428
	* config/i386/i386.cc (ix86_rtx_costs): Adjust cost for
	CONST_VECTOR_DUPLICATE_P in constant_pool.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr114428.c: New test.
---
 gcc/config/i386/i386-expand.cc           |  2 +-
 gcc/config/i386/i386-protos.h            |  1 +
 gcc/config/i386/i386.cc                  | 13 +++++++++++++
 gcc/testsuite/gcc.target/i386/pr114428.c | 18 ++++++++++++++++++
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114428.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4e16aedc5c1..d96c365e144 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -588,7 +588,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
 
 /* OP is a memref of CONST_VECTOR, return scalar constant mem
    if CONST_VECTOR is a vec_duplicate, else return NULL.  */
-static rtx
+rtx
 ix86_broadcast_from_constant (machine_mode mode, rtx op)
 {
   int nunits = GET_MODE_NUNITS (mode);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..90712769200 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -107,6 +107,7 @@ extern void ix86_expand_clear (rtx);
 extern void ix86_expand_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move_misalign (machine_mode, rtx[]);
+extern rtx ix86_broadcast_from_constant (machine_mode, rtx);
 extern rtx ix86_fixup_binary_operands (enum rtx_code, machine_mode,
 				       rtx[], bool = false);
 extern void ix86_fixup_binary_operands_no_copy (enum rtx_code, machine_mode,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b4838b7939e..fdd9343e47a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22197,6 +22197,19 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
       return true;
 
     case MEM:
+      /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
+	 or variants in ix86_vector_duplicate_simode_const.  */
+
+      if (GET_MODE_SIZE (mode) >= 16
+	  && VECTOR_MODE_P (mode)
+	  && SYMBOL_REF_P (XEXP (x, 0))
+	  && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))
+	  && ix86_broadcast_from_constant (mode, x))
+	{
+	  *total = COSTS_N_INSNS (2) + speed;
+	  return true;
+	}
+
       /* An insn that accesses memory is slightly more expensive
          than one that does not.  */
       if (speed)
diff --git a/gcc/testsuite/gcc.target/i386/pr114428.c b/gcc/testsuite/gcc.target/i386/pr114428.c
new file mode 100644
index 00000000000..bbbc5a080f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-not "vpsra[dw]" } } */
+
+void
+foo2 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 32; i++)
+    a[i] = b[i] >> (short)8;
+}
+
+void
+foo3 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 16; i++)
+    a[i] = b[i] >> (short)8;
+}
+
-- 
2.31.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication
  2024-05-21  5:12 ` [PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication liuhongt
@ 2024-05-21  6:50   ` Uros Bizjak
  0 siblings, 0 replies; 8+ messages in thread
From: Uros Bizjak @ 2024-05-21  6:50 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches, crazylht, hjl.tools

On Tue, May 21, 2024 at 7:13 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> For CONST_VECTOR_DUPLICATE_P in constant_pool, it is just broadcast or
> variants in ix86_vector_duplicate_simode_const.
> Adjust the cost to COSTS_N_INSNS (2) + speed which should be a little
> bit larger than broadcast.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>         PR target/114428
>         * config/i386/i386.cc (ix86_rtx_costs): Adjust cost for
>         CONST_VECTOR_DUPLICATE_P in constant_pool.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr114428.c: New test.

LGTM.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc           |  2 +-
>  gcc/config/i386/i386-protos.h            |  1 +
>  gcc/config/i386/i386.cc                  | 13 +++++++++++++
>  gcc/testsuite/gcc.target/i386/pr114428.c | 18 ++++++++++++++++++
>  4 files changed, 33 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr114428.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 4e16aedc5c1..d96c365e144 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -588,7 +588,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
>
>  /* OP is a memref of CONST_VECTOR, return scalar constant mem
>     if CONST_VECTOR is a vec_duplicate, else return NULL.  */
> -static rtx
> +rtx
>  ix86_broadcast_from_constant (machine_mode mode, rtx op)
>  {
>    int nunits = GET_MODE_NUNITS (mode);
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index dbc861fb1ea..90712769200 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -107,6 +107,7 @@ extern void ix86_expand_clear (rtx);
>  extern void ix86_expand_move (machine_mode, rtx[]);
>  extern void ix86_expand_vector_move (machine_mode, rtx[]);
>  extern void ix86_expand_vector_move_misalign (machine_mode, rtx[]);
> +extern rtx ix86_broadcast_from_constant (machine_mode, rtx);
>  extern rtx ix86_fixup_binary_operands (enum rtx_code, machine_mode,
>                                        rtx[], bool = false);
>  extern void ix86_fixup_binary_operands_no_copy (enum rtx_code, machine_mode,
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index b4838b7939e..fdd9343e47a 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -22197,6 +22197,19 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
>        return true;
>
>      case MEM:
> +      /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
> +        or variants in ix86_vector_duplicate_simode_const.  */
> +
> +      if (GET_MODE_SIZE (mode) >= 16
> +         && VECTOR_MODE_P (mode)
> +         && SYMBOL_REF_P (XEXP (x, 0))
> +         && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))
> +         && ix86_broadcast_from_constant (mode, x))
> +       {
> +         *total = COSTS_N_INSNS (2) + speed;
> +         return true;
> +       }
> +
>        /* An insn that accesses memory is slightly more expensive
>           than one that does not.  */
>        if (speed)
> diff --git a/gcc/testsuite/gcc.target/i386/pr114428.c b/gcc/testsuite/gcc.target/i386/pr114428.c
> new file mode 100644
> index 00000000000..bbbc5a080f6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr114428.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v3 -mno-avx512f -O2" } */
> +/* { dg-final { scan-assembler-not "vpsra[dw]" } } */
> +
> +void
> +foo2 (char* __restrict a, short* b)
> +{
> +  for (int i = 0; i != 32; i++)
> +    a[i] = b[i] >> (short)8;
> +}
> +
> +void
> +foo3 (char* __restrict a, short* b)
> +{
> +  for (int i = 0; i != 16; i++)
> +    a[i] = b[i] >> (short)8;
> +}
> +
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.
  2024-05-21  5:12 [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode liuhongt
  2024-05-21  5:12 ` [PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication liuhongt
@ 2024-05-24  2:25 ` Hongtao Liu
  2024-06-04 13:50   ` Jeff Law
  1 sibling, 1 reply; 8+ messages in thread
From: Hongtao Liu @ 2024-05-24  2:25 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Sandiford, Segher Boessenkool

CC for review.

On Tue, May 21, 2024 at 1:12 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> When mask is (1 << (prec - imm) - 1) which is used to clear upper bits
> of A, then it can be simplified to LSHIFTRT.
>
> i.e Simplify
> (and:v8hi
>   (ashifrt:v8hi A 8)
>   (const_vector 0xff x8))
> to
> (lshifrt:v8hi A 8)
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok of trunk?
>
> gcc/ChangeLog:
>
>         PR target/114428
>         * simplify-rtx.cc
>         (simplify_context::simplify_binary_operation_1):
>         Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
>         specific mask.
> ---
>  gcc/simplify-rtx.cc | 25 +++++++++++++++++++++++++
>  1 file changed, 25 insertions(+)
>
> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
> index 53f54d1d392..6c91409200e 100644
> --- a/gcc/simplify-rtx.cc
> +++ b/gcc/simplify-rtx.cc
> @@ -4021,6 +4021,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
>             return tem;
>         }
>
> +      /* (and:v4si
> +          (ashiftrt:v4si A 16)
> +          (const_vector: 0xffff x4))
> +        is just (lshiftrt:v4si A 16).  */
> +      if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
> +         && (CONST_INT_P (XEXP (op0, 1))
> +             || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
> +                 && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))))
> +         && GET_CODE (op1) == CONST_VECTOR
> +         && CONST_VECTOR_DUPLICATE_P (op1))
> +       {
> +         unsigned HOST_WIDE_INT shift_count
> +           = (CONST_INT_P (XEXP (op0, 1))
> +              ? UINTVAL (XEXP (op0, 1))
> +              : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
> +         unsigned HOST_WIDE_INT inner_prec
> +           = GET_MODE_PRECISION (GET_MODE_INNER (mode));
> +
> +         /* Avoid UD shift count.  */
> +         if (shift_count < inner_prec
> +             && (UINTVAL (XVECEXP (op1, 0, 0))
> +                 == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
> +           return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
> +       }
> +
>        tem = simplify_byte_swapping_operation (code, mode, op0, op1);
>        if (tem)
>         return tem;
> --
> 2.31.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.
  2024-05-24  2:25 ` [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode Hongtao Liu
@ 2024-06-04 13:50   ` Jeff Law
  2024-06-05  4:22     ` [V2 PATCH] " liuhongt
  0 siblings, 1 reply; 8+ messages in thread
From: Jeff Law @ 2024-06-04 13:50 UTC (permalink / raw)
  To: Hongtao Liu, gcc-patches; +Cc: Richard Sandiford, Segher Boessenkool

On 5/23/24 8:25 PM, Hongtao Liu wrote:
> CC for review.
> 
> On Tue, May 21, 2024 at 1:12 PM liuhongt <hongtao.liu@intel.com> wrote:
>>
>> When mask is (1 << (prec - imm) - 1) which is used to clear upper bits
>> of A, then it can be simplified to LSHIFTRT.
>>
>> i.e Simplify
>> (and:v8hi
>>    (ashifrt:v8hi A 8)
>>    (const_vector 0xff x8))
>> to
>> (lshifrt:v8hi A 8)
>>
>> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>> Ok of trunk?
>>
>> gcc/ChangeLog:
>>
>>          PR target/114428
>>          * simplify-rtx.cc
>>          (simplify_context::simplify_binary_operation_1):
>>          Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
>>          specific mask.

Can you add a testcase for this?  I don't mind if it's x86 specific and 
does a bit of asm scanning.

Also note that the context for this patch has changed, so it won't 
automatically apply.  So be extra careful when updating so that it goes 
into the right place (all the more reason to have a testcase validating 
that the optimization works correctly).

I think the patch itself is fine.  So further review is just for the 
testcase and should be easy.

jeff

ps.  It seems to help RISC-V as well :-)

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [V2 PATCH] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.
  2024-06-04 13:50   ` Jeff Law
@ 2024-06-05  4:22     ` liuhongt
  2024-06-05 14:43       ` Jeff Law
  0 siblings, 1 reply; 8+ messages in thread
From: liuhongt @ 2024-06-05  4:22 UTC (permalink / raw)
  To: gcc-patches; +Cc: jeffreyalaw

> Can you add a testcase for this?  I don't mind if it's x86 specific and
> does a bit of asm scanning.
>
> Also note that the context for this patch has changed, so it won't
> automatically apply.  So be extra careful when updating so that it goes
> into the right place (all the more reason to have a testcase validating
> that the optimization works correctly).
>
>
> I think the patch itself is fine.  So further review is just for the
> testcase and should be easy.
Rebased and added a testcase.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


When mask is ((1 << (prec - imm)) - 1), which clears the upper imm bits
of each element of the shift result, the AND can be simplified to LSHIFTRT.

i.e. simplify
(and:v8hi
  (ashiftrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshiftrt:v8hi A 8)

gcc/ChangeLog:

	PR target/114428
	* simplify-rtx.cc
	(simplify_context::simplify_binary_operation_1):
	Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
	specific mask.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr114428-1.c: New test.
---
 gcc/simplify-rtx.cc                        | 25 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr114428-1.c | 39 ++++++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114428-1.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 5caf1dfd957..05d410898b3 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4050,6 +4050,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	    return tem;
 	}
 
+      /* (and:v4si
+	   (ashiftrt:v4si A 16)
+	   (const_vector: 0xffff x4))
+	 is just (lshiftrt:v4si A 16).  */
+      if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+	  && (CONST_INT_P (XEXP (op0, 1))
+	      || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+		  && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))))
+	  && GET_CODE (op1) == CONST_VECTOR
+	  && CONST_VECTOR_DUPLICATE_P (op1))
+	{
+	  unsigned HOST_WIDE_INT shift_count
+	    = (CONST_INT_P (XEXP (op0, 1))
+	       ? UINTVAL (XEXP (op0, 1))
+	       : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+	  unsigned HOST_WIDE_INT inner_prec
+	    = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+	  /* Avoid UD shift count.  */
+	  if (shift_count < inner_prec
+	      && (UINTVAL (XVECEXP (op1, 0, 0))
+		  == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+	    return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
+	}
+
       tem = simplify_byte_swapping_operation (code, mode, op0, op1);
       if (tem)
 	return tem;
diff --git a/gcc/testsuite/gcc.target/i386/pr114428-1.c b/gcc/testsuite/gcc.target/i386/pr114428-1.c
new file mode 100644
index 00000000000..927476f2269
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428-1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psrld" 1 } } */
+/* { dg-final { scan-assembler-times "psrlq" 1 { target { ! ia32 } } } } */
+
+
+#define SHIFTC 12
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+
+v8hi
+foo1 (v8hi a)
+{
+  return
+    (a >> (16 - SHIFTC)) & (__extension__(v8hi){(1<<SHIFTC)-1, (1<<SHIFTC)-1,
+						(1<<SHIFTC)-1,(1<<SHIFTC)-1,
+						(1<<SHIFTC)-1, (1<<SHIFTC)-1,
+						(1<<SHIFTC)-1,(1<<SHIFTC)-1});
+}
+
+v4si
+foo2 (v4si a)
+{
+  return
+    (a >> (32 - SHIFTC)) & (__extension__(v4si){(1<<SHIFTC)-1, (1<<SHIFTC)-1,
+						(1<<SHIFTC)-1,(1<<SHIFTC)-1});
+}
+
+v2di
+__attribute__((target("avx512vl")))
+foo3 (v2di a)
+{
+  return
+    (a >> (long long)(64 - SHIFTC)) & (__extension__(v2di){(1ULL<<SHIFTC)-1,
+							   (1ULL<<SHIFTC)-1});
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [V2 PATCH] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.
  2024-06-05  4:22     ` [V2 PATCH] " liuhongt
@ 2024-06-05 14:43       ` Jeff Law
  2024-06-06  0:32         ` Hongtao Liu
  0 siblings, 1 reply; 8+ messages in thread
From: Jeff Law @ 2024-06-05 14:43 UTC (permalink / raw)
  To: liuhongt, gcc-patches



On 6/4/24 10:22 PM, liuhongt wrote:
>> Can you add a testcase for this?  I don't mind if it's x86 specific and
>> does a bit of asm scanning.
>>
>> Also note that the context for this patch has changed, so it won't
>> automatically apply.  So be extra careful when updating so that it goes
>> into the right place (all the more reason to have a testcase validating
>> that the optimization works correctly).
>>
>>
>> I think the patch itself is fine.  So further review is just for the
>> testcase and should be easy.
> rebased and add a testcase.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
> 
> 
> When mask is (1 << (prec - imm) - 1) which is used to clear upper bits
> of A, then it can be simplified to LSHIFTRT.
> 
> i.e Simplify
> (and:v8hi
>    (ashifrt:v8hi A 8)
>    (const_vector 0xff x8))
> to
> (lshifrt:v8hi A 8)
> 
> gcc/ChangeLog:
> 
> 	PR target/114428
> 	* simplify-rtx.cc
> 	(simplify_context::simplify_binary_operation_1):
> 	Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
> 	specific mask.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/i386/pr114428-1.c: New test.
OK.

Being x264 related, I took a quick glance at RISC-V before/after and 
it seems to be slightly better as well.

Jeff

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [V2 PATCH] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.
  2024-06-05 14:43       ` Jeff Law
@ 2024-06-06  0:32         ` Hongtao Liu
  0 siblings, 0 replies; 8+ messages in thread
From: Hongtao Liu @ 2024-06-06  0:32 UTC (permalink / raw)
  To: Jeff Law; +Cc: liuhongt, gcc-patches

On Wed, Jun 5, 2024 at 10:44 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 6/4/24 10:22 PM, liuhongt wrote:
> >> Can you add a testcase for this?  I don't mind if it's x86 specific and
> >> does a bit of asm scanning.
> >>
> >> Also note that the context for this patch has changed, so it won't
> >> automatically apply.  So be extra careful when updating so that it goes
> >> into the right place (all the more reason to have a testcase validating
> >> that the optimization works correctly).
> >>
> >>
> >> I think the patch itself is fine.  So further review is just for the
> >> testcase and should be easy.
> > rebased and add a testcase.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> >
> > When mask is (1 << (prec - imm) - 1) which is used to clear upper bits
> > of A, then it can be simplified to LSHIFTRT.
> >
> > i.e Simplify
> > (and:v8hi
> >    (ashifrt:v8hi A 8)
> >    (const_vector 0xff x8))
> > to
> > (lshifrt:v8hi A 8)
> >
> > gcc/ChangeLog:
> >
> >       PR target/114428
> >       * simplify-rtx.cc
> >       (simplify_context::simplify_binary_operation_1):
> >       Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
> >       specific mask.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/i386/pr114428-1.c: New test.
> OK.
>
> Being x264 related, I took a quick glance at RISC-V before/after and
> seems to be slightly better as well.
Great, thanks for the review :)
>
> Jeff



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-06-06  0:32 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-21  5:12 [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode liuhongt
2024-05-21  5:12 ` [PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication liuhongt
2024-05-21  6:50   ` Uros Bizjak
2024-05-24  2:25 ` [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode Hongtao Liu
2024-06-04 13:50   ` Jeff Law
2024-06-05  4:22     ` [V2 PATCH] " liuhongt
2024-06-05 14:43       ` Jeff Law
2024-06-06  0:32         ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).