public inbox for gcc-patches@gcc.gnu.org
* [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
@ 2022-09-23 11:42 Tamar Christina
  2022-09-23 11:43 ` [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand Tamar Christina
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Tamar Christina @ 2022-09-23 11:42 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 2146 bytes --]

Hi All,

This adds a match.pd rule that can fold a right shift of a bit_field_ref of
an integer into just a bit_field_ref, by adjusting the offset and the size of
the extract and adding an extension back to the previous size.

Concretely turns:

#include <arm_neon.h>

unsigned int foor (uint32x4_t x)
{
    return x[1] >> 16;
}

which used to generate:

  _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
  _3 = _1 >> 16;

into

  _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
  _2 = (unsigned int) _4;

I currently limit the rewrite to cases where the resulting extract is in a mode
the target supports, i.e. it won't rewrite the extract to, say, 13 bits, because
I worry that for targets without a bitfield extract instruction this may be a
de-optimization.
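
As an aside, a minimal C sketch of the equivalence the rule relies on (not
part of the patch; the function names are made up and a little-endian lane
layout is assumed): taking the top 16 bits of lane 1 is the same as reading
a 16-bit field at bit offset 48 of the 128-bit vector and zero-extending it.

  #include <stdint.h>
  #include <string.h>

  unsigned int via_shift (const uint32_t v[4])
  {
    return v[1] >> 16;
  }

  unsigned int via_field (const uint32_t v[4])
  {
    uint16_t field;
    /* Bits [48, 64) of the vector start at byte 48 / 8 == 6.  */
    memcpy (&field, (const char *) v + 6, sizeof field);
    return field;  /* Zero-extends to unsigned int.  */
  }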

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
with no issues.

Testcases are added in patch 2/2.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: Add bitfield and shift folding.

--- inline copy of patch -- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 1d407414bee278c64c00d425d9f025c1c58d853d..b225d36dc758f1581502c8d03761544bfd499c01 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7245,6 +7245,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
       && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P (TREE_TYPE(@0)))
   (IFN_REDUC_PLUS_WIDEN @0)))
 
+/* Canonicalize BIT_FIELD_REFS and shifts to BIT_FIELD_REFS.  */
+(for shift (rshift)
+     op (plus)
+ (simplify
+  (shift (BIT_FIELD_REF @0 @1 @2) integer_pow2p@3)
+  (if (INTEGRAL_TYPE_P (type))
+   (with { /* Can't use wide-int here as the precision differs between
+	      @1 and @3.  */
+	   unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
+	   unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
+	   unsigned HOST_WIDE_INT newsize = size - shiftc;
+	   tree nsize = wide_int_to_tree (bitsizetype, newsize);
+	   tree ntype
+	     = build_nonstandard_integer_type (newsize, 1); }
+    (if (ntype)
+     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (op @2 @3))))))))
+
 (simplify
  (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
  (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))




-- 

[-- Attachment #2: rb15776.patch --]
[-- Type: text/plain, Size: 1163 bytes --]

diff --git a/gcc/match.pd b/gcc/match.pd
index 1d407414bee278c64c00d425d9f025c1c58d853d..b225d36dc758f1581502c8d03761544bfd499c01 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7245,6 +7245,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
       && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P (TREE_TYPE(@0)))
   (IFN_REDUC_PLUS_WIDEN @0)))
 
+/* Canonicalize BIT_FIELD_REFS and shifts to BIT_FIELD_REFS.  */
+(for shift (rshift)
+     op (plus)
+ (simplify
+  (shift (BIT_FIELD_REF @0 @1 @2) integer_pow2p@3)
+  (if (INTEGRAL_TYPE_P (type))
+   (with { /* Can't use wide-int here as the precision differs between
+	      @1 and @3.  */
+	   unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
+	   unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
+	   unsigned HOST_WIDE_INT newsize = size - shiftc;
+	   tree nsize = wide_int_to_tree (bitsizetype, newsize);
+	   tree ntype
+	     = build_nonstandard_integer_type (newsize, 1); }
+    (if (ntype)
+     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (op @2 @3))))))))
+
 (simplify
  (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
  (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))





* [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand
  2022-09-23 11:42 [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Tamar Christina
@ 2022-09-23 11:43 ` Tamar Christina
  2022-09-23 14:32   ` Richard Sandiford
  2022-09-24 18:38 ` [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Jeff Law
  2022-09-24 18:57 ` Andrew Pinski
  2 siblings, 1 reply; 19+ messages in thread
From: Tamar Christina @ 2022-09-23 11:43 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 6448 bytes --]

Hi All,

Similar to patch 1/2, but adds additional back-end-specific folding for cases
where the register sequence was created as a result of RTL optimizations.

Concretely:

#include <arm_neon.h>

unsigned int foor (uint32x4_t x)
{
    return x[1] >> 16;
}

generates:

foor:
        umov    w0, v0.h[3]
        ret

instead of

foor:
        umov    w0, v0.s[1]
        lsr     w0, w0, 16
        ret

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
	left and right ones.
	* config/aarch64/constraints.md (Usl): New.
	* config/aarch64/iterators.md (SHIFT_NL, LSHIFTRT): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/shift-read.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c333fb1f72725992bb304c560f1245a242d5192d..6aa1fb4be003f2027d63ac69fd314c2bbc876258 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5493,7 +5493,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5528,6 +5528,60 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_uxtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
+	(zero_extend:DI (LSHIFTRT:SI
+	 (match_operand:SI 1 "register_operand" "w,r,r")
+	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  ""
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  machine_mode dest, vec_mode;
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+	  if (size == 16)
+	    dest = HImode;
+	  else if (size == 8)
+	    dest = QImode;
+	  else
+	    gcc_unreachable ();
+
+	  /* Get nearest 64-bit vector mode.  */
+	  int nunits = 64 / size;
+	  auto vector_mode
+	    = mode_for_vector (as_a <scalar_mode> (dest), nunits);
+	  if (!vector_mode.exists (&vec_mode))
+	    gcc_unreachable ();
+	  operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
+	  operands[2] = gen_int_mode (val / size, SImode);
+
+	  /* Ideally we just call aarch64_get_lane_zero_extend but reload gets
+	     into a weird loop due to a mov of w -> r being present most time
+	     this instruction applies.  */
+	  switch (dest)
+	  {
+	    case QImode:
+	      return "umov\\t%w0, %1.b[%2]";
+	    case HImode:
+	      return "umov\\t%w0, %1.h[%2]";
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -166,6 +166,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,11 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
+
+;; Singleton code iterator for only logical right shift.
+(define_code_iterator LSHIFTRT [lshiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}




-- 

[-- Attachment #2: rb15777.patch --]
[-- Type: text/plain, Size: 5634 bytes --]

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c333fb1f72725992bb304c560f1245a242d5192d..6aa1fb4be003f2027d63ac69fd314c2bbc876258 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5493,7 +5493,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5528,6 +5528,60 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_uxtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
+	(zero_extend:DI (LSHIFTRT:SI
+	 (match_operand:SI 1 "register_operand" "w,r,r")
+	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  ""
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  machine_mode dest, vec_mode;
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+	  if (size == 16)
+	    dest = HImode;
+	  else if (size == 8)
+	    dest = QImode;
+	  else
+	    gcc_unreachable ();
+
+	  /* Get nearest 64-bit vector mode.  */
+	  int nunits = 64 / size;
+	  auto vector_mode
+	    = mode_for_vector (as_a <scalar_mode> (dest), nunits);
+	  if (!vector_mode.exists (&vec_mode))
+	    gcc_unreachable ();
+	  operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
+	  operands[2] = gen_int_mode (val / size, SImode);
+
+	  /* Ideally we just call aarch64_get_lane_zero_extend but reload gets
+	     into a weird loop due to a mov of w -> r being present most time
+	     this instruction applies.  */
+	  switch (dest)
+	  {
+	    case QImode:
+	      return "umov\\t%w0, %1.b[%2]";
+	    case HImode:
+	      return "umov\\t%w0, %1.h[%2]";
+	    default:
+	      gcc_unreachable ();
+	  }
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -166,6 +166,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,11 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
+
+;; Singleton code iterator for only logical right shift.
+(define_code_iterator LSHIFTRT [lshiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}





* Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand
  2022-09-23 11:43 ` [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand Tamar Christina
@ 2022-09-23 14:32   ` Richard Sandiford
  2022-10-31 11:48     ` Tamar Christina
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Sandiford @ 2022-09-23 14:32 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> Similar to the 1/2 patch but adds additional back-end specific folding for if
> the register sequence was created as a result of RTL optimizations.
>
> Concretely:
>
> #include <arm_neon.h>
>
> unsigned int foor (uint32x4_t x)
> {
>     return x[1] >> 16;
> }
>
> generates:
>
> foor:
>         umov    w0, v0.h[3]
>         ret
>
> instead of
>
> foor:
>         umov    w0, v0.s[1]
>         lsr     w0, w0, 16
>         ret

The same thing ought to work for smov, so it would be good to do both.
That would also make the split between the original and new patterns
more obvious: left shift for the old pattern, right shift for the new
pattern.
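
For instance (a hypothetical example, not taken from the patch, using the
little-endian lane numbering of the tests), the signed analogue should
become a single smov in the same way:

  int foor_s (int32x4_t x)
  {
      return x[1] >> 16;  /* arithmetic shift of a signed lane */
  }

  foor_s:
          smov    w0, v0.h[3]
          ret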

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
> 	left and right ones.
> 	* config/aarch64/constraints.md (Usl): New.
> 	* config/aarch64/iterators.md (SHIFT_NL, LSHIFTRT): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/shift-read.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c333fb1f72725992bb304c560f1245a242d5192d..6aa1fb4be003f2027d63ac69fd314c2bbc876258 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5493,7 +5493,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -	(zero_extend:DI (SHIFT_no_rotate:SI
> +	(zero_extend:DI (SHIFT_arith:SI
>  	 (match_operand:SI 1 "register_operand" "r,r")
>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5528,6 +5528,60 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>  
> +(define_insn "*<optab>si3_insn2_uxtw"
> +  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")

Is the "?" justified?  It seems odd to penalise a native,
single-instruction r->r operation in favour of a w->r operation.

> +	(zero_extend:DI (LSHIFTRT:SI
> +	 (match_operand:SI 1 "register_operand" "w,r,r")
> +	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  ""
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  machine_mode dest, vec_mode;
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +	  if (size == 16)
> +	    dest = HImode;
> +	  else if (size == 8)
> +	    dest = QImode;
> +	  else
> +	    gcc_unreachable ();
> +
> +	  /* Get nearest 64-bit vector mode.  */
> +	  int nunits = 64 / size;
> +	  auto vector_mode
> +	    = mode_for_vector (as_a <scalar_mode> (dest), nunits);
> +	  if (!vector_mode.exists (&vec_mode))
> +	    gcc_unreachable ();
> +	  operands[1] = gen_rtx_REG (vec_mode, REGNO (operands[1]));
> +	  operands[2] = gen_int_mode (val / size, SImode);
> +
> +	  /* Ideally we just call aarch64_get_lane_zero_extend but reload gets
> +	     into a weird loop due to a mov of w -> r being present most time
> +	     this instruction applies.  */
> +	  switch (dest)
> +	  {
> +	    case QImode:
> +	      return "umov\\t%w0, %1.b[%2]";
> +	    case HImode:
> +	      return "umov\\t%w0, %1.h[%2]";
> +	    default:
> +	      gcc_unreachable ();
> +	  }

Doesn't this reduce to something like:

  if (size == 16)
    return "umov\\t%w0, %1.h[1]";
  if (size == 8)
    return "umov\\t%w0, %1.b[3]";
  gcc_unreachable ();

?  We should print %1 correctly as vN even with its original type.

Thanks,
Richard

> +	}
> +      case 1:
> +	return "<shift>\\t%w0, %w1, %2";
> +      case 2:
> +	return "<shift>\\t%w0, %w1, %w2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>  
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>   (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index e904407b2169e589b7007ff966b2d9347a6d0fd2..bf16207225e3a4f1f20ed6f54321bccbbf15d73f 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,11 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>  
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
> +
> +;; Singleton code iterator for only logical right shift.
> +(define_code_iterator LSHIFTRT [lshiftrt])
>  
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read.c b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +** 	umov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +** 	umov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**	...
> +** 	umov	w0, v[0-8]+.h\[1\]
> +** 	ret
> +*/
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}


* Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-23 11:42 [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Tamar Christina
  2022-09-23 11:43 ` [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand Tamar Christina
@ 2022-09-24 18:38 ` Jeff Law
  2022-09-28 13:19   ` Tamar Christina
  2022-09-24 18:57 ` Andrew Pinski
  2 siblings, 1 reply; 19+ messages in thread
From: Jeff Law @ 2022-09-24 18:38 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches; +Cc: nd, rguenther


On 9/23/22 05:42, Tamar Christina wrote:
> Hi All,
>
> This adds a match.pd rule that can fold right shifts and bit_field_refs of
> integers into just a bit_field_ref by adjusting the offset and the size of the
> extract and adds an extend to the previous size.
>
> Concretely turns:
>
> #include <arm_neon.h>
>
> unsigned int foor (uint32x4_t x)
> {
>      return x[1] >> 16;
> }
>
> which used to generate:
>
>    _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
>    _3 = _1 >> 16;
>
> into
>
>    _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
>    _2 = (unsigned int) _4;
>
> I currently limit the rewrite to only doing it if the resulting extract is in
> a mode the target supports. i.e. it won't rewrite it to extract say 13-bits
> because I worry that for targets that won't have a bitfield extract instruction
> this may be a de-optimization.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Testcase are added in patch 2/2.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* match.pd: Add bitfield and shift folding.

Were you planning to handle left shifts as well?  It looks like it, since
you've got iterators for the shift opcode and a corresponding adjustment
to the field, but they currently only handle rshift/plus.


Jeff




* Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-23 11:42 [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Tamar Christina
  2022-09-23 11:43 ` [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand Tamar Christina
  2022-09-24 18:38 ` [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Jeff Law
@ 2022-09-24 18:57 ` Andrew Pinski
  2022-09-26  4:55   ` Tamar Christina
  2 siblings, 1 reply; 19+ messages in thread
From: Andrew Pinski @ 2022-09-24 18:57 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, rguenther

On Fri, Sep 23, 2022 at 4:43 AM Tamar Christina via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi All,
>
> This adds a match.pd rule that can fold right shifts and bit_field_refs of
> integers into just a bit_field_ref by adjusting the offset and the size of the
> extract and adds an extend to the previous size.
>
> Concretely turns:
>
> #include <arm_neon.h>
>
> unsigned int foor (uint32x4_t x)
> {
>     return x[1] >> 16;
> }
>
> which used to generate:
>
>   _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
>   _3 = _1 >> 16;
>
> into
>
>   _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
>   _2 = (unsigned int) _4;
>
> I currently limit the rewrite to only doing it if the resulting extract is in
> a mode the target supports. i.e. it won't rewrite it to extract say 13-bits
> because I worry that for targets that won't have a bitfield extract instruction
> this may be a de-optimization.

It is only a de-optimization for the following case:
* vector extraction

All other cases should be handled correctly in the middle-end when
expanding to RTL because they need to be handled for bit-fields
anyways.
Plus SIGN_EXTRACT and ZERO_EXTRACT would be used in the integer case
for the RTL.
Getting SIGN_EXTRACT/ZERO_EXTRACT early on in the RTL is better than
waiting until combine, really.
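
For illustration, a rough sketch of the RTL shape meant here, with a
made-up 13-bit field and register numbers; the zero_extract operands are
value, field size and bit position:

  (set (reg:SI 100)
       (zero_extract:SI (reg:SI 101)
                        (const_int 13)     ; field size
                        (const_int 16)))   ; bit position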


>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Testcase are added in patch 2/2.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * match.pd: Add bitfield and shift folding.
>
> --- inline copy of patch --
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 1d407414bee278c64c00d425d9f025c1c58d853d..b225d36dc758f1581502c8d03761544bfd499c01 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -7245,6 +7245,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>        && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P (TREE_TYPE(@0)))
>    (IFN_REDUC_PLUS_WIDEN @0)))
>
> +/* Canonicalize BIT_FIELD_REFS and shifts to BIT_FIELD_REFS.  */
> +(for shift (rshift)
> +     op (plus)
> + (simplify
> +  (shift (BIT_FIELD_REF @0 @1 @2) integer_pow2p@3)
> +  (if (INTEGRAL_TYPE_P (type))
> +   (with { /* Can't use wide-int here as the precision differs between
> +             @1 and @3.  */
> +          unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
> +          unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
> +          unsigned HOST_WIDE_INT newsize = size - shiftc;
> +          tree nsize = wide_int_to_tree (bitsizetype, newsize);
> +          tree ntype
> +            = build_nonstandard_integer_type (newsize, 1); }

Maybe use `build_nonstandard_integer_type (newsize, /* unsignedp = */ true);`
or better yet `build_nonstandard_integer_type (newsize, UNSIGNED);`

I had started to convert some of the unsignedp into enum signop but I
never finished or submitted the patch.

Thanks,
Andrew Pinski


> +    (if (ntype)
> +     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (op @2 @3))))))))
> +
>  (simplify
>   (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
>   (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))
>
>
>
>
> --


* RE: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-24 18:57 ` Andrew Pinski
@ 2022-09-26  4:55   ` Tamar Christina
  2022-09-26  8:05     ` Richard Biener
  2022-09-26 15:24     ` Andrew Pinski
  0 siblings, 2 replies; 19+ messages in thread
From: Tamar Christina @ 2022-09-26  4:55 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: gcc-patches, nd, rguenther

> -----Original Message-----
> From: Andrew Pinski <pinskia@gmail.com>
> Sent: Saturday, September 24, 2022 8:57 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de
> Subject: Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into
> BIT_FIELD_REFs alone
> 
> On Fri, Sep 23, 2022 at 4:43 AM Tamar Christina via Gcc-patches <gcc-
> patches@gcc.gnu.org> wrote:
> >
> > Hi All,
> >
> > This adds a match.pd rule that can fold right shifts and
> > bit_field_refs of integers into just a bit_field_ref by adjusting the
> > offset and the size of the extract and adds an extend to the previous size.
> >
> > Concretely turns:
> >
> > #include <arm_neon.h>
> >
> > unsigned int foor (uint32x4_t x)
> > {
> >     return x[1] >> 16;
> > }
> >
> > which used to generate:
> >
> >   _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
> >   _3 = _1 >> 16;
> >
> > into
> >
> >   _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
> >   _2 = (unsigned int) _4;
> >
> > I currently limit the rewrite to only doing it if the resulting
> > extract is in a mode the target supports. i.e. it won't rewrite it to
> > extract say 13-bits because I worry that for targets that won't have a
> > bitfield extract instruction this may be a de-optimization.
> 
> It is only a de-optimization for the following case:
> * vector extraction
> 
> All other cases should be handled correctly in the middle-end when
> expanding to RTL because they need to be handled for bit-fields anyways.
> Plus SIGN_EXTRACT and ZERO_EXTRACT would be used in the integer case
> for the RTL.
> Getting SIGN_EXTRACT/ZERO_EXTRACT early on in the RTL is better than
> waiting until combine really.
> 

Fair enough, I've dropped the constraint.

> 
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Testcase are added in patch 2/2.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >         * match.pd: Add bitfield and shift folding.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/match.pd b/gcc/match.pd index
> >
> 1d407414bee278c64c00d425d9f025c1c58d853d..b225d36dc758f1581502c8d03
> 761
> > 544bfd499c01 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -7245,6 +7245,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >        && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P
> (TREE_TYPE(@0)))
> >    (IFN_REDUC_PLUS_WIDEN @0)))
> >
> > +/* Canonicalize BIT_FIELD_REFS and shifts to BIT_FIELD_REFS.  */ (for
> > +shift (rshift)
> > +     op (plus)
> > + (simplify
> > +  (shift (BIT_FIELD_REF @0 @1 @2) integer_pow2p@3)
> > +  (if (INTEGRAL_TYPE_P (type))
> > +   (with { /* Can't use wide-int here as the precision differs between
> > +             @1 and @3.  */
> > +          unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
> > +          unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
> > +          unsigned HOST_WIDE_INT newsize = size - shiftc;
> > +          tree nsize = wide_int_to_tree (bitsizetype, newsize);
> > +          tree ntype
> > +            = build_nonstandard_integer_type (newsize, 1); }
> 
> Maybe use `build_nonstandard_integer_type (newsize, /* unsignedp = */
> true);` or better yet `build_nonstandard_integer_type (newsize,
> UNSIGNED);`

Ah, will do,
Tamar.

> 
> I had started to convert some of the unsignedp into enum signop but I never
> finished or submitted the patch.
> 
> Thanks,
> Andrew Pinski
> 
> 
> > +    (if (ntype)
> > +     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (op @2
> > + @3))))))))
> > +
> >  (simplify
> >   (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
> >   (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4);
> > }))
> >
> >
> >
> >
> > --


* RE: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-26  4:55   ` Tamar Christina
@ 2022-09-26  8:05     ` Richard Biener
  2022-09-26 15:24     ` Andrew Pinski
  1 sibling, 0 replies; 19+ messages in thread
From: Richard Biener @ 2022-09-26  8:05 UTC (permalink / raw)
  To: Tamar Christina; +Cc: Andrew Pinski, gcc-patches, nd

On Mon, 26 Sep 2022, Tamar Christina wrote:

> > -----Original Message-----
> > From: Andrew Pinski <pinskia@gmail.com>
> > Sent: Saturday, September 24, 2022 8:57 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de
> > Subject: Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into
> > BIT_FIELD_REFs alone
> > 
> > On Fri, Sep 23, 2022 at 4:43 AM Tamar Christina via Gcc-patches <gcc-
> > patches@gcc.gnu.org> wrote:
> > >
> > > Hi All,
> > >
> > > This adds a match.pd rule that can fold right shifts and
> > > bit_field_refs of integers into just a bit_field_ref by adjusting the
> > > offset and the size of the extract and adds an extend to the previous size.
> > >
> > > Concretely turns:
> > >
> > > #include <arm_neon.h>
> > >
> > > unsigned int foor (uint32x4_t x)
> > > {
> > >     return x[1] >> 16;
> > > }
> > >
> > > which used to generate:
> > >
> > >   _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
> > >   _3 = _1 >> 16;
> > >
> > > into
> > >
> > >   _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
> > >   _2 = (unsigned int) _4;
> > >
> > > I currently limit the rewrite to only doing it if the resulting
> > > extract is in a mode the target supports. i.e. it won't rewrite it to
> > > extract say 13-bits because I worry that for targets that won't have a
> > > bitfield extract instruction this may be a de-optimization.
> > 
> > It is only a de-optimization for the following case:
> > * vector extraction
> > 
> > All other cases should be handled correctly in the middle-end when
> > expanding to RTL because they need to be handled for bit-fields anyways.
> > Plus SIGN_EXTRACT and ZERO_EXTRACT would be used in the integer case
> > for the RTL.
> > Getting SIGN_EXTRACT/ZERO_EXTRACT early on in the RTL is better than
> > waiting until combine really.
> > 
> 
> Fair enough, I've dropped the constraint.
> 
> > 
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Testcase are added in patch 2/2.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Add bitfield and shift folding.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > >
> > 1d407414bee278c64c00d425d9f025c1c58d853d..b225d36dc758f1581502c8d03
> > 761
> > > 544bfd499c01 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -7245,6 +7245,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >        && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P
> > (TREE_TYPE(@0)))
> > >    (IFN_REDUC_PLUS_WIDEN @0)))
> > >
> > > +/* Canonicalize BIT_FIELD_REFS and shifts to BIT_FIELD_REFS.  */
> > > (for
> > > +shift (rshift)
> > > +     op (plus)

why have a for when you only iterate over a single operation?!  And 'op'
seems unused?

> > > + (simplify
> > > +  (shift (BIT_FIELD_REF @0 @1 @2) integer_pow2p@3)
> > > +  (if (INTEGRAL_TYPE_P (type))
> > > +   (with { /* Can't use wide-int here as the precision differs between
> > > +             @1 and @3.  */
> > > +          unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
> > > +          unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);

But you should then test tree_fits_uhwi_p.

> > > +          unsigned HOST_WIDE_INT newsize = size - shiftc;
> > > +          tree nsize = wide_int_to_tree (bitsizetype, newsize);
> > > +          tree ntype
> > > +            = build_nonstandard_integer_type (newsize, 1); }

build_nonstandard_integer_type never fails so I don't see how
you "limit" this to extractions fitting a mode.
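
If the intent really is to restrict this to sizes the target has a scalar
mode for, something along the following lines would be needed instead
(just a sketch, assuming the usual mode query hooks):

  scalar_int_mode imode;
  bool mode_ok = int_mode_for_size (newsize, /*limit=*/1).exists (&imode)
                 && targetm.scalar_mode_supported_p (imode);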

I'm quite sure this breaks with BYTES_BIG_ENDIAN.  Please try
BIT_FIELD_REF _offsets_ that make the extraction cross byte
boundaries.

Also I'm missing a testcase?

Thanks,
Richard.

> > Maybe use `build_nonstandard_integer_type (newsize, /* unsignedp = */
> > true);` or better yet `build_nonstandard_integer_type (newsize,
> > UNSIGNED);`
> 
> Ah, will do,
> Tamar.
> 
> > 
> > I had started to convert some of the unsignedp into enum signop but I never
> > finished or submitted the patch.
> > 
> > Thanks,
> > Andrew Pinski
> > 
> > 
> > > +    (if (ntype)
> > > +     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (op @2
> > > + @3))))))))
> > > +
> > >  (simplify
> > >   (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
> > >   (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4);
> > > }))
> > >
> > >
> > >
> > >
> > > --
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


* Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-26  4:55   ` Tamar Christina
  2022-09-26  8:05     ` Richard Biener
@ 2022-09-26 15:24     ` Andrew Pinski
  2022-09-27 12:40       ` Richard Biener
  1 sibling, 1 reply; 19+ messages in thread
From: Andrew Pinski @ 2022-09-26 15:24 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, rguenther

[-- Attachment #1: Type: text/plain, Size: 4281 bytes --]

On Sun, Sep 25, 2022 at 9:56 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: Andrew Pinski <pinskia@gmail.com>
> > Sent: Saturday, September 24, 2022 8:57 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de
> > Subject: Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into
> > BIT_FIELD_REFs alone
> >
> > On Fri, Sep 23, 2022 at 4:43 AM Tamar Christina via Gcc-patches <gcc-
> > patches@gcc.gnu.org> wrote:
> > >
> > > Hi All,
> > >
> > > This adds a match.pd rule that can fold right shifts and
> > > bit_field_refs of integers into just a bit_field_ref by adjusting the
> > > offset and the size of the extract and adds an extend to the previous size.
> > >
> > > Concretely turns:
> > >
> > > #include <arm_neon.h>
> > >
> > > unsigned int foor (uint32x4_t x)
> > > {
> > >     return x[1] >> 16;
> > > }
> > >
> > > which used to generate:
> > >
> > >   _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
> > >   _3 = _1 >> 16;
> > >
> > > into
> > >
> > >   _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
> > >   _2 = (unsigned int) _4;
> > >
> > > I currently limit the rewrite to only doing it if the resulting
> > > extract is in a mode the target supports. i.e. it won't rewrite it to
> > > extract say 13-bits because I worry that for targets that won't have a
> > > bitfield extract instruction this may be a de-optimization.
> >
> > It is only a de-optimization for the following case:
> > * vector extraction
> >
> > All other cases should be handled correctly in the middle-end when
> > expanding to RTL because they need to be handled for bit-fields anyways.
> > Plus SIGN_EXTRACT and ZERO_EXTRACT would be used in the integer case
> > for the RTL.
> > Getting SIGN_EXTRACT/ZERO_EXTRACT early on in the RTL is better than
> > waiting until combine really.
> >
>
> Fair enough, I've dropped the constraint.

Well, the constraint should still be kept for VECTOR_TYPE I think.
Attached is what I had done for left shift for integer types.
Note the BYTES_BIG_ENDIAN part which you missed for the right shift case.

Thanks,
Andrew Pinski

>
> >
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Testcase are added in patch 2/2.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > >         * match.pd: Add bitfield and shift folding.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > >
> > 1d407414bee278c64c00d425d9f025c1c58d853d..b225d36dc758f1581502c8d03
> > 761
> > > 544bfd499c01 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -7245,6 +7245,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >        && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P
> > (TREE_TYPE(@0)))
> > >    (IFN_REDUC_PLUS_WIDEN @0)))
> > >
> > > +/* Canonicalize BIT_FIELD_REFS and shifts to BIT_FIELD_REFS.  */ (for
> > > +shift (rshift)
> > > +     op (plus)
> > > + (simplify
> > > +  (shift (BIT_FIELD_REF @0 @1 @2) integer_pow2p@3)
> > > +  (if (INTEGRAL_TYPE_P (type))
> > > +   (with { /* Can't use wide-int here as the precision differs between
> > > +             @1 and @3.  */
> > > +          unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
> > > +          unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
> > > +          unsigned HOST_WIDE_INT newsize = size - shiftc;
> > > +          tree nsize = wide_int_to_tree (bitsizetype, newsize);
> > > +          tree ntype
> > > +            = build_nonstandard_integer_type (newsize, 1); }
> >
> > Maybe use `build_nonstandard_integer_type (newsize, /* unsignedp = */
> > true);` or better yet `build_nonstandard_integer_type (newsize,
> > UNSIGNED);`
>
> Ah, will do,
> Tamar.
>
> >
> > I had started to convert some of the unsignedp into enum signop but I never
> > finished or submitted the patch.
> >
> > Thanks,
> > Andrew Pinski
> >
> >
> > > +    (if (ntype)
> > > +     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (op @2
> > > + @3))))))))
> > > +
> > >  (simplify
> > >   (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
> > >   (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4);
> > > }))
> > >
> > >
> > >
> > >
> > > --

[-- Attachment #2: ed7c08c.diff --]
[-- Type: text/plain, Size: 2057 bytes --]

From ed7c08c4d565bd4418cf2dce3bbfecc18fdd42a2 Mon Sep 17 00:00:00 2001
From: Andrew Pinski <apinski@marvell.com>
Date: Wed, 25 Dec 2019 01:20:13 +0000
Subject: [PATCH] Add simplification of shift of a bit_field.

We can simplify a shift of a bit_field_ref to
a shift of an and (note sometimes the shift can
be removed).

Change-Id: I1a9f3fc87889ecd7cf569272405b6ee7dd5f8d7b
Signed-off-by: Andrew Pinski <apinski@marvell.com>
---

diff --git a/gcc/match.pd b/gcc/match.pd
index cb981ec..e4f6d47 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6071,6 +6071,34 @@
     (cmp (bit_and @0 { wide_int_to_tree (type1, mask); })
          { wide_int_to_tree (type1, cst); })))))
 
+/* lshift<bitfield<>> -> shift(bit_and(@0, mask)) */
+(simplify
+ (lshift (convert (BIT_FIELD_REF@bit @0 @bitsize @bitpos)) INTEGER_CST@1)
+ (if (INTEGRAL_TYPE_P (type)
+      && INTEGRAL_TYPE_P (TREE_TYPE (@0))
+      && tree_fits_uhwi_p (@1)
+      && (tree_nop_conversion_p (type, TREE_TYPE (@0))
+	  || (TYPE_UNSIGNED (TREE_TYPE (@0))
+	      && TYPE_UNSIGNED (TREE_TYPE (@bit))
+	      && TYPE_UNSIGNED (type)
+	      && TYPE_PRECISION (type) > tree_to_uhwi (@bitsize))))
+  (with
+   {
+     unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (@bitpos);
+     unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (@bitsize);
+     if (BYTES_BIG_ENDIAN)
+       bitpos = TYPE_PRECISION (TREE_TYPE (@0)) - bitpos - bitsize;
+     wide_int wmask = wi::shifted_mask (bitpos, bitsize, false, TYPE_PRECISION (type));
+   }
+   (switch
+    (if (tree_to_uhwi (@1) == bitpos)
+     (bit_and (convert @0) { wide_int_to_tree (type, wmask); }))
+    (if (tree_to_uhwi (@1) > bitpos)
+     (lshift (bit_and (convert @0) { wide_int_to_tree (type, wmask); })
+	     { wide_int_to_tree (integer_type_node, tree_to_uhwi (@1) - bitpos); } ))
+    (if (tree_to_uhwi (@1) < bitpos)
+     (rshift (bit_and (convert @0) { wide_int_to_tree (type, wmask); })
+	     { wide_int_to_tree (integer_type_node, bitpos - tree_to_uhwi (@1)); } ))))))
 
 (if (canonicalize_math_after_vectorization_p ())
  (for fmas (FMA)


* Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-26 15:24     ` Andrew Pinski
@ 2022-09-27 12:40       ` Richard Biener
  2022-10-31 11:51         ` Tamar Christina
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Biener @ 2022-09-27 12:40 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Tamar Christina, rguenther, nd, gcc-patches

On Mon, Sep 26, 2022 at 5:25 PM Andrew Pinski via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Sun, Sep 25, 2022 at 9:56 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
> >
> > > -----Original Message-----
> > > From: Andrew Pinski <pinskia@gmail.com>
> > > Sent: Saturday, September 24, 2022 8:57 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de
> > > Subject: Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into
> > > BIT_FIELD_REFs alone
> > >
> > > On Fri, Sep 23, 2022 at 4:43 AM Tamar Christina via Gcc-patches <gcc-
> > > patches@gcc.gnu.org> wrote:
> > > >
> > > > Hi All,
> > > >
> > > > This adds a match.pd rule that can fold right shifts and
> > > > bit_field_refs of integers into just a bit_field_ref by adjusting the
> > > > offset and the size of the extract and adds an extend to the previous size.
> > > >
> > > > Concretely turns:
> > > >
> > > > #include <arm_neon.h>
> > > >
> > > > unsigned int foor (uint32x4_t x)
> > > > {
> > > >     return x[1] >> 16;
> > > > }
> > > >
> > > > which used to generate:
> > > >
> > > >   _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
> > > >   _3 = _1 >> 16;
> > > >
> > > > into
> > > >
> > > >   _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
> > > >   _2 = (unsigned int) _4;
> > > >
> > > > I currently limit the rewrite to only doing it if the resulting
> > > > extract is in a mode the target supports. i.e. it won't rewrite it to
> > > > extract say 13-bits because I worry that for targets that won't have a
> > > > bitfield extract instruction this may be a de-optimization.
> > >
> > > It is only a de-optimization for the following case:
> > > * vector extraction
> > >
> > > All other cases should be handled correctly in the middle-end when
> > > expanding to RTL because they need to be handled for bit-fields anyways.
> > > Plus SIGN_EXTRACT and ZERO_EXTRACT would be used in the integer case
> > > for the RTL.
> > > Getting SIGN_EXTRACT/ZERO_EXTRACT early on in the RTL is better than
> > > waiting until combine really.
> > >
> >
> > Fair enough, I've dropped the constraint.
>
> Well the constraint should be done still for VECTOR_TYPE I think.
> Attached is what I had done for left shift for integer types.
> Note the BYTES_BIG_ENDIAN part which you missed for the right shift case.

Note we formerly had BIT_FIELD_REF_UNSIGNED and allowed the precision
of the TREE_TYPE of the BIT_FIELD_REF to not match the extracted size.  That
might have mapped directly to zero/sign_extract.

Now that this is no more we should think of a canonical way to express this
and make sure we can synthesize those early.
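
For the vector case the obvious thing to synthesize early would be
something like the following RTL (illustration only, register numbers
made up):

  (set (reg:SI 100)
       (zero_extend:SI
         (vec_select:HI (reg:V8HI 101)
                        (parallel [(const_int 3)]))))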

Richard.

> Thanks,
> Andrew Pinski
>
> >
> > >
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > > and no issues.
> > > >
> > > > Testcase are added in patch 2/2.
> > > >
> > > > Ok for master?
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         * match.pd: Add bitfield and shift folding.
> > > >
> > > > --- inline copy of patch --
> > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > >
> > > 1d407414bee278c64c00d425d9f025c1c58d853d..b225d36dc758f1581502c8d03
> > > 761
> > > > 544bfd499c01 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -7245,6 +7245,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > > >        && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P
> > > (TREE_TYPE(@0)))
> > > >    (IFN_REDUC_PLUS_WIDEN @0)))
> > > >
> > > > +/* Canonicalize BIT_FIELD_REFS and shifts to BIT_FIELD_REFS.  */ (for
> > > > +shift (rshift)
> > > > +     op (plus)
> > > > + (simplify
> > > > +  (shift (BIT_FIELD_REF @0 @1 @2) integer_pow2p@3)
> > > > +  (if (INTEGRAL_TYPE_P (type))
> > > > +   (with { /* Can't use wide-int here as the precision differs between
> > > > +             @1 and @3.  */
> > > > +          unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
> > > > +          unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
> > > > +          unsigned HOST_WIDE_INT newsize = size - shiftc;
> > > > +          tree nsize = wide_int_to_tree (bitsizetype, newsize);
> > > > +          tree ntype
> > > > +            = build_nonstandard_integer_type (newsize, 1); }
> > >
> > > Maybe use `build_nonstandard_integer_type (newsize, /* unsignedp = */
> > > true);` or better yet `build_nonstandard_integer_type (newsize,
> > > UNSIGNED);`
> >
> > Ah, will do,
> > Tamar.
> >
> > >
> > > I had started to convert some of the unsignedp into enum signop but I never
> > > finished or submitted the patch.
> > >
> > > Thanks,
> > > Andrew Pinski
> > >
> > >
> > > > +    (if (ntype)
> > > > +     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (op @2
> > > > + @3))))))))
> > > > +
> > > >  (simplify
> > > >   (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
> > > >   (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4);
> > > > }))
> > > >
> > > >
> > > >
> > > >
> > > > --


* RE: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-24 18:38 ` [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Jeff Law
@ 2022-09-28 13:19   ` Tamar Christina
  2022-09-28 17:25     ` Jeff Law
  0 siblings, 1 reply; 19+ messages in thread
From: Tamar Christina @ 2022-09-28 13:19 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: nd, rguenther

> -----Original Message-----
> From: Jeff Law <jeffreyalaw@gmail.com>
> Sent: Saturday, September 24, 2022 8:38 PM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; rguenther@suse.de
> Subject: Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into
> BIT_FIELD_REFs alone
> 
> 
> On 9/23/22 05:42, Tamar Christina wrote:
> > Hi All,
> >
> > This adds a match.pd rule that can fold right shifts and
> > bit_field_refs of integers into just a bit_field_ref by adjusting the
> > offset and the size of the extract and adds an extend to the previous size.
> >
> > Concretely turns:
> >
> > #include <arm_neon.h>
> >
> > unsigned int foor (uint32x4_t x)
> > {
> >      return x[1] >> 16;
> > }
> >
> > which used to generate:
> >
> >    _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
> >    _3 = _1 >> 16;
> >
> > into
> >
> >    _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
> >    _2 = (unsigned int) _4;
> >
> > I currently limit the rewrite to only doing it if the resulting
> > extract is in a mode the target supports. i.e. it won't rewrite it to
> > extract say 13-bits because I worry that for targets that won't have a
> > bitfield extract instruction this may be a de-optimization.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Testcase are added in patch 2/2.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* match.pd: Add bitfield and shift folding.
> 
> Were you planning to handle left shifts as well?  It looks like it since you've
> got iterations for the shift opcode and corresponding adjustment to the field,
> but they currently only handle rshift/plus.
> 

Hmm, do left shifts work here?  A left shift would increase the size of the
resulting value by adding zeros to the end of the number, so you can't grow
the bitfield extract to get the same effect.

I did, however, realize that a truncating cast can be folded in the same way
as a right shift, so I have added that now.
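
For example (a rough sketch with a made-up function name, little-endian
lane numbering assumed), the truncating-cast form

  unsigned short foos (uint32x4_t x)
  {
      return x[1];
  }

goes from

  _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
  _3 = (unsigned short) _1;

to just

  _3 = BIT_FIELD_REF <x_2(D), 16, 32>;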

Tamar

> 
> Jeff
> 



* Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-28 13:19   ` Tamar Christina
@ 2022-09-28 17:25     ` Jeff Law
  0 siblings, 0 replies; 19+ messages in thread
From: Jeff Law @ 2022-09-28 17:25 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches; +Cc: nd, rguenther


On 9/28/22 07:19, Tamar Christina wrote:
>> -----Original Message-----
>> From: Jeff Law <jeffreyalaw@gmail.com>
>> Sent: Saturday, September 24, 2022 8:38 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
>> Cc: nd <nd@arm.com>; rguenther@suse.de
>> Subject: Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into
>> BIT_FIELD_REFs alone
>>
>>
>> On 9/23/22 05:42, Tamar Christina wrote:
>>> Hi All,
>>>
>>> This adds a match.pd rule that can fold right shifts and
>>> bit_field_refs of integers into just a bit_field_ref by adjusting the
>>> offset and the size of the extract and adds an extend to the previous size.
>>>
>>> Concretely turns:
>>>
>>> #include <arm_neon.h>
>>>
>>> unsigned int foor (uint32x4_t x)
>>> {
>>>       return x[1] >> 16;
>>> }
>>>
>>> which used to generate:
>>>
>>>     _1 = BIT_FIELD_REF <x_2(D), 32, 32>;
>>>     _3 = _1 >> 16;
>>>
>>> into
>>>
>>>     _4 = BIT_FIELD_REF <x_1(D), 16, 48>;
>>>     _2 = (unsigned int) _4;
>>>
>>> I currently limit the rewrite to only doing it if the resulting
>>> extract is in a mode the target supports. i.e. it won't rewrite it to
>>> extract say 13-bits because I worry that for targets that won't have a
>>> bitfield extract instruction this may be a de-optimization.
>>>
>>> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
>>> and no issues.
>>>
>>> Testcase are added in patch 2/2.
>>>
>>> Ok for master?
>>>
>>> Thanks,
>>> Tamar
>>>
>>> gcc/ChangeLog:
>>>
>>> 	* match.pd: Add bitfield and shift folding.
>> Were you planning to handle left shifts as well?  It looks like it since you've
>> got iterations for the shift opcode and corresponding adjustment to the field,
>> but they currently only handle rshift/plus.
>>
> Hmm do left shifts work here? Since a left shift would increase the size of the
> resulting value by adding zeros to the end of the number, so you can't increase
> the size of the bitfield to do the same.

Dunno, I hadn't really thought about it.  It just looked like you were 
prepared to handle more cases with those iterators.


>
> I did however realize that truncating casts have the same effect as a right shift,
> so I have added that now.

ACK.

jeff


^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand
  2022-09-23 14:32   ` Richard Sandiford
@ 2022-10-31 11:48     ` Tamar Christina
  2022-11-14 21:54       ` Richard Sandiford
  0 siblings, 1 reply; 19+ messages in thread
From: Tamar Christina @ 2022-10-31 11:48 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 11475 bytes --]

> 
> The same thing ought to work for smov, so it would be good to do both.
> That would also make the split between the original and new patterns more
> obvious: left shift for the old pattern, right shift for the new pattern.
> 

Done, though because umov can do multilevel extensions I couldn't combine
them into a single pattern.
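
For instance (function names are mine; the expected code follows the new
patterns and the shift-read tests below), the w-form of umov already
zero-extends to 64 bits, whereas smov has to encode the destination width:

#include <arm_neon.h>
#include <stdint.h>

/* Expected (per the new patterns / shift-read tests):
   u32: umov w0, v0.h[3]
   u64: umov w0, v0.h[3]   (write to w0 already zeroes bits 63:32)
   s32: smov w0, v0.h[3]
   s64: smov x0, v0.h[3]   */

unsigned int u32 (uint32x4_t x) { return x[1] >> 16; }
uint64_t     u64 (uint32x4_t x) { return x[1] >> 16; }
int32_t      s32 (int32x4_t x)  { return x[1] >> 16; }
int64_t      s64 (int32x4_t x)  { return x[1] >> 16; }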

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
	left and right ones.
	(*aarch64_ashr_sisd_or_int_<mode>3, *<optab>si3_insn2_sxtw): Support
	smov.
	* config/aarch64/constraints.md (Usl): New.
	* config/aarch64/iterators.md (LSHIFTRT_ONLY, ASHIFTRT_ONLY): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/shift-read_1.c: New test.
	* gcc.target/aarch64/shift-read_2.c: New test.
	* gcc.target/aarch64/shift-read_3.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c333fb1f72725992bb304c560f1245a242d5192d..2bc2684b82c35a44e0a2cea6e3aaf32d939f8cdf 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5370,20 +5370,42 @@ (define_split
 
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
 	(ashiftrt:GPI
-	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
+	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
-			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
+			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
   ""
-  "@
-   asr\t%<w>0, %<w>1, %2
-   asr\t%<w>0, %<w>1, %<w>2
-   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
-   #
-   #"
-  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
-   (set_attr "arch" "*,*,simd,simd,simd")]
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	return "asr\t%<w>0, %<w>1, %2";
+      case 1:
+	return "asr\t%<w>0, %<w>1, %<w>2";
+      case 2:
+	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
+      case 3:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "smov\\t%w0, %1.h[1]";
+	  if (size == 8)
+	    return "smov\\t%w0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 4:
+	return "#";
+      case 5:
+	return "#";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd,simd,simd")]
 )
 
 (define_split
@@ -5493,7 +5515,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5528,6 +5550,68 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_sxtw"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
+	(sign_extend:GPI (ASHIFTRT_ONLY:SI
+	  (match_operand:SI 1 "register_operand" "w,r,r")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  "<MODE>mode != DImode || satisfies_constraint_Usl (operands[2])"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "smov\\t%<w>0, %1.h[1]";
+	  if (size == 8)
+	    return "smov\\t%<w>0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 1:
+	return "<shift>\\t%<w>0, %<w>1, %2";
+      case 2:
+	return "<shift>\\t%<w>0, %<w>1, %<w>2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
+(define_insn "*<optab>si3_insn2_uxtw"
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
+	(zero_extend:GPI (LSHIFTRT_ONLY:SI
+	  (match_operand:SI 1 "register_operand" "w,r,r")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  ""
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "umov\\t%w0, %1.h[1]";
+	  if (size == 8)
+	    return "umov\\t%w0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -166,6 +166,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e904407b2169e589b7007ff966b2d9347a6d0fd2..b2682acb3bb12d584613d395200c3b39c0e94d8d 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,14 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
+
+;; Singleton code iterator for only logical right shift.
+(define_code_iterator LSHIFTRT_ONLY [lshiftrt])
+
+;; Singleton code iterator for only arithmetic right shift.
+(define_code_iterator ASHIFTRT_ONLY [ashiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..541dce9303382e047c3931ad58a1cbd8b3e182fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
@@ -0,0 +1,96 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor_1:
+** 	smov	w0, v0.h\[3\]
+** 	ret
+*/
+int32_t foor_1 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** foor_2:
+** 	smov	x0, v0.h\[3\]
+** 	ret
+*/
+int64_t foor_2 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+
+/*
+** fool:
+** 	[su]mov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool (int32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+short foor2 (int32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool2 (int32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+short foo (int x)
+{
+  return x >> 16;
+}
+
+/*
+** foo2:
+**	...
+** 	umov	w0, v[0-8]+.h\[1\]
+** 	ret
+*/
+short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_3.c b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ea81ff5b5af7794e062e471f46b433e1d7d87ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** ufoo:
+**	...
+** 	umov	w0, v0.h\[1\]
+** 	ret
+*/
+uint64_t ufoo (uint32x4_t x)
+{
+  return (x[0] + x[1]) >> 16;
+}
+
+/* 
+** sfoo:
+**	...
+** 	smov	x0, v0.h\[1\]
+** 	ret
+*/
+int64_t sfoo (int32x4_t x)
+{
+  return (x[0] + x[1]) >> 16;
+}
+
+/* 
+** sfoo2:
+**	...
+** 	smov	w0, v0.h\[1\]
+** 	ret
+*/
+int32_t sfoo2 (int32x4_t x)
+{
+  return (x[0] + x[1]) >> 16;
+}
+
+/* 
+** ubar:
+**	...
+** 	umov	w0, v0.b\[3\]
+** 	ret
+*/
+uint64_t ubar (uint32x4_t x)
+{
+  return (x[0] + x[1]) >> 24;
+}
+
+/* 
+** sbar:
+**	...
+** 	smov	x0, v0.b\[3\]
+** 	ret
+*/
+int64_t sbar (int32x4_t x)
+{
+  return (x[0] + x[1]) >> 24;
+}

^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-09-27 12:40       ` Richard Biener
@ 2022-10-31 11:51         ` Tamar Christina
  2022-10-31 16:24           ` Jeff Law
  2022-11-07 13:29           ` Richard Biener
  0 siblings, 2 replies; 19+ messages in thread
From: Tamar Christina @ 2022-10-31 11:51 UTC (permalink / raw)
  To: Richard Biener, Andrew Pinski; +Cc: rguenther, nd, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 5942 bytes --]

Hi All,

Here's a respin addressing review comments.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: Add bitfield and shift folding.

gcc/testsuite/ChangeLog:

	* gcc.dg/bitshift_1.c: New.
	* gcc.dg/bitshift_2.c: New.

--- inline copy of patch ---

diff --git a/gcc/match.pd b/gcc/match.pd
index 70e90cdbfa902830e6b58be84e114e86ff7b4dff..a4ad465b2b074b21835be74732dce295f8db03bc 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7245,6 +7245,45 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
       && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P (TREE_TYPE(@0)))
   (IFN_REDUC_PLUS_WIDEN @0)))
 
+/* Canonicalize BIT_FIELD_REFS and right shift to BIT_FIELD_REFS.  */
+(simplify
+ (rshift (BIT_FIELD_REF @0 @1 @2) INTEGER_CST@3)
+ (if (INTEGRAL_TYPE_P (type)
+      && tree_fits_uhwi_p (@1)
+      && tree_fits_uhwi_p (@3))
+  (with { /* Can't use wide-int here as the precision differs between
+	     @1 and @3.  */
+	  unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
+	  unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
+	  unsigned HOST_WIDE_INT newsize = size - shiftc;
+	  tree nsize = wide_int_to_tree (bitsizetype, newsize);
+	  tree ntype
+	    = build_nonstandard_integer_type (newsize, TYPE_UNSIGNED (type)); }
+   (switch
+    (if (INTEGRAL_TYPE_P (ntype) && !BYTES_BIG_ENDIAN)
+     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (plus @2 @3))))
+    (if (INTEGRAL_TYPE_P (ntype) && BYTES_BIG_ENDIAN)
+     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (minus @2 @3))))))))
+
+/* Canonicalize BIT_FIELD_REFS and converts to BIT_FIELD_REFS.  */
+(simplify
+ (convert (BIT_FIELD_REF@3 @0 @1 @2))
+ (if (INTEGRAL_TYPE_P (type)
+      && INTEGRAL_TYPE_P (TREE_TYPE (@3)))
+  (with { unsigned int size_inner = element_precision (TREE_TYPE (@3));
+	  unsigned int size_outer  = element_precision (type); }
+   (if (size_inner > size_outer)
+    /* Truncating convert, we can shrink the bit field similar to the
+        shift case.  */
+    (with {
+	    tree nsize = wide_int_to_tree (bitsizetype, size_outer);
+	    auto sign = TYPE_UNSIGNED (type);
+	    tree ntype
+	      = build_nonstandard_integer_type (size_outer, sign);
+	    gcc_assert (useless_type_conversion_p (type, ntype)); }
+     (if (INTEGRAL_TYPE_P (ntype))
+      (BIT_FIELD_REF:ntype @0 { nsize; } @2)))))))
+
 (simplify
  (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
  (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))
diff --git a/gcc/testsuite/gcc.dg/bitshift_1.c b/gcc/testsuite/gcc.dg/bitshift_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5995d0746d2301eb48304629cb4b779b079f1270
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bitshift_1.c
@@ -0,0 +1,50 @@
+/* { dg-do compile { target le } } */
+/* { dg-additional-options "-O2 -save-temps -fdump-tree-optimized" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef unsigned int v4usi __attribute__ ((vector_size (16)));
+typedef unsigned short v8uhi __attribute__ ((vector_size (16)));
+
+unsigned int foor (v4usi x)
+{
+    return x[1] >> 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 48>;} "optimized" } } */
+
+unsigned int fool (v4usi x)
+{
+    return x[1] << 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 32>;} "optimized" } } */
+
+unsigned short foor2 (v4usi x)
+{
+    return x[3] >> 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 112>;} "optimized" } } */
+
+unsigned int fool2 (v4usi x)
+{
+    return x[0] << 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 0>;} "optimized" } } */
+
+unsigned char foor3 (v8uhi x)
+{
+    return x[3] >> 9;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 7, 57>;} "optimized" } } */
+
+unsigned short fool3 (v8uhi x)
+{
+    return x[0] << 9;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 0>;} "optimized" } } */
+
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 64, 0>;} "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/bitshift_2.c b/gcc/testsuite/gcc.dg/bitshift_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..406b4def9d4aebbc83bd5bef92dab825b85f2aa4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/bitshift_2.c
@@ -0,0 +1,49 @@
+/* { dg-do compile { target be } } */
+/* { dg-additional-options "-O2 -save-temps -fdump-tree-optimized" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef unsigned int v4usi __attribute__ ((vector_size (16)));
+typedef unsigned short v8uhi __attribute__ ((vector_size (16)));
+
+unsigned int foor (v4usi x)
+{
+    return x[1] >> 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 16>;} "optimized" } } */
+
+unsigned int fool (v4usi x)
+{
+    return x[1] << 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 32>;} "optimized" } } */
+
+unsigned short foor2 (v4usi x)
+{
+    return x[3] >> 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 80>;} "optimized" } } */
+
+unsigned int fool2 (v4usi x)
+{
+    return x[0] << 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 0>;} "optimized" } } */
+
+unsigned char foor3 (v8uhi x)
+{
+    return x[3] >> 9;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 7, 39>;} "optimized" } } */
+
+unsigned short fool3 (v8uhi x)
+{
+    return x[0] << 9;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 0>;} "optimized" } } */
+
+unsigned short foo2 (v4si x)
+{
+  int y = x[0] + x[1];
+  return y >> 16;
+}
+/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 64, 0>;} "optimized" } } */

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-10-31 11:51         ` Tamar Christina
@ 2022-10-31 16:24           ` Jeff Law
  2022-11-07 13:29           ` Richard Biener
  1 sibling, 0 replies; 19+ messages in thread
From: Jeff Law @ 2022-10-31 16:24 UTC (permalink / raw)
  To: Tamar Christina, Richard Biener, Andrew Pinski; +Cc: gcc-patches, nd, rguenther


On 10/31/22 05:51, Tamar Christina via Gcc-patches wrote:
> Hi All,
>
> Here's a respin addressing review comments.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* match.pd: Add bitfield and shift folding.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/bitshift_1.c: New.
> 	* gcc.dg/bitshift_2.c: New.

OK

jeff



^ permalink raw reply	[flat|nested] 19+ messages in thread

* RE: [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone
  2022-10-31 11:51         ` Tamar Christina
  2022-10-31 16:24           ` Jeff Law
@ 2022-11-07 13:29           ` Richard Biener
  1 sibling, 0 replies; 19+ messages in thread
From: Richard Biener @ 2022-11-07 13:29 UTC (permalink / raw)
  To: Tamar Christina; +Cc: Richard Biener, Andrew Pinski, nd, gcc-patches

On Mon, 31 Oct 2022, Tamar Christina wrote:

> Hi All,
> 
> Here's a respin addressing review comments.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* match.pd: Add bitfield and shift folding.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/bitshift_1.c: New.
> 	* gcc.dg/bitshift_2.c: New.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 70e90cdbfa902830e6b58be84e114e86ff7b4dff..a4ad465b2b074b21835be74732dce295f8db03bc 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -7245,6 +7245,45 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>        && ANY_INTEGRAL_TYPE_P (type) && ANY_INTEGRAL_TYPE_P (TREE_TYPE(@0)))
>    (IFN_REDUC_PLUS_WIDEN @0)))
>  
> +/* Canonicalize BIT_FIELD_REFS and right shift to BIT_FIELD_REFS.  */
> +(simplify
> + (rshift (BIT_FIELD_REF @0 @1 @2) INTEGER_CST@3)
> + (if (INTEGRAL_TYPE_P (type)
> +      && tree_fits_uhwi_p (@1)
> +      && tree_fits_uhwi_p (@3))
> +  (with { /* Can't use wide-int here as the precision differs between
> +	     @1 and @3.  */
> +	  unsigned HOST_WIDE_INT size = tree_to_uhwi (@1);
> +	  unsigned HOST_WIDE_INT shiftc = tree_to_uhwi (@3);
> +	  unsigned HOST_WIDE_INT newsize = size - shiftc;
> +	  tree nsize = wide_int_to_tree (bitsizetype, newsize);
> +	  tree ntype
> +	    = build_nonstandard_integer_type (newsize, TYPE_UNSIGNED (type)); }
> +   (switch
> +    (if (INTEGRAL_TYPE_P (ntype) && !BYTES_BIG_ENDIAN)
> +     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (plus @2 @3))))

the :type is not necessary.  Don't you need to verify that (plus/minus @2 
@3) is in bounds?

> +    (if (INTEGRAL_TYPE_P (ntype) && BYTES_BIG_ENDIAN)
> +     (convert:type (BIT_FIELD_REF:ntype @0 { nsize; } (minus @2 @3))))))))
> +
> +/* Canonicalize BIT_FIELD_REFS and converts to BIT_FIELD_REFS.  */
> +(simplify
> + (convert (BIT_FIELD_REF@3 @0 @1 @2))
> + (if (INTEGRAL_TYPE_P (type)
> +      && INTEGRAL_TYPE_P (TREE_TYPE (@3)))
> +  (with { unsigned int size_inner = element_precision (TREE_TYPE (@3));
> +	  unsigned int size_outer  = element_precision (type); }

since you check for INTEGRAL_TYPE_P, using element_precision is odd;
just use TYPE_PRECISION here.

> +   (if (size_inner > size_outer)
> +    /* Truncating convert, we can shrink the bit field similar to the
> +        shift case.  */
> +    (with {
> +	    tree nsize = wide_int_to_tree (bitsizetype, size_outer);

bitsize_int

> +	    auto sign = TYPE_UNSIGNED (type);
> +	    tree ntype
> +	      = build_nonstandard_integer_type (size_outer, sign);
> +	    gcc_assert (useless_type_conversion_p (type, ntype)); }

if it's the same type why re-build it?

> +     (if (INTEGRAL_TYPE_P (ntype))

since you build a nonstandard integer type that's always going
to be INTEGRAL_TYPE_P.

> +      (BIT_FIELD_REF:ntype @0 { nsize; } @2)))))))

so why not simply

  (if (size_inner > size_outer)
   (BIT_FIELD_REF @0 { bitsize_int (size_outer); } @2))

?

> +
>  (simplify
>   (BIT_FIELD_REF (BIT_FIELD_REF @0 @1 @2) @3 @4)
>   (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))
> diff --git a/gcc/testsuite/gcc.dg/bitshift_1.c b/gcc/testsuite/gcc.dg/bitshift_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..5995d0746d2301eb48304629cb4b779b079f1270
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/bitshift_1.c
> @@ -0,0 +1,50 @@
> +/* { dg-do compile { target le } } */
> +/* { dg-additional-options "-O2 -save-temps -fdump-tree-optimized" } */
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef unsigned int v4usi __attribute__ ((vector_size (16)));
> +typedef unsigned short v8uhi __attribute__ ((vector_size (16)));
> +
> +unsigned int foor (v4usi x)
> +{
> +    return x[1] >> 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 48>;} "optimized" } } */
> +
> +unsigned int fool (v4usi x)
> +{
> +    return x[1] << 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 32>;} "optimized" } } */
> +
> +unsigned short foor2 (v4usi x)
> +{
> +    return x[3] >> 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 112>;} "optimized" } } */
> +
> +unsigned int fool2 (v4usi x)
> +{
> +    return x[0] << 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 0>;} "optimized" } } */
> +
> +unsigned char foor3 (v8uhi x)
> +{
> +    return x[3] >> 9;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 7, 57>;} "optimized" } } */
> +
> +unsigned short fool3 (v8uhi x)
> +{
> +    return x[0] << 9;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 0>;} "optimized" } } */
> +
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 64, 0>;} "optimized" } } */
> +
> diff --git a/gcc/testsuite/gcc.dg/bitshift_2.c b/gcc/testsuite/gcc.dg/bitshift_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..406b4def9d4aebbc83bd5bef92dab825b85f2aa4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/bitshift_2.c
> @@ -0,0 +1,49 @@
> +/* { dg-do compile { target be } } */
> +/* { dg-additional-options "-O2 -save-temps -fdump-tree-optimized" } */
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef unsigned int v4usi __attribute__ ((vector_size (16)));
> +typedef unsigned short v8uhi __attribute__ ((vector_size (16)));
> +
> +unsigned int foor (v4usi x)
> +{
> +    return x[1] >> 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 16>;} "optimized" } } */
> +
> +unsigned int fool (v4usi x)
> +{
> +    return x[1] << 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 32>;} "optimized" } } */
> +
> +unsigned short foor2 (v4usi x)
> +{
> +    return x[3] >> 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 80>;} "optimized" } } */
> +
> +unsigned int fool2 (v4usi x)
> +{
> +    return x[0] << 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 32, 0>;} "optimized" } } */
> +
> +unsigned char foor3 (v8uhi x)
> +{
> +    return x[3] >> 9;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 7, 39>;} "optimized" } } */
> +
> +unsigned short fool3 (v8uhi x)
> +{
> +    return x[0] << 9;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 16, 0>;} "optimized" } } */
> +
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
> +/* { dg-final { scan-tree-dump {BIT_FIELD_REF <x_[^,]+, 64, 0>;} "optimized" } } */
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand
  2022-10-31 11:48     ` Tamar Christina
@ 2022-11-14 21:54       ` Richard Sandiford
  2022-11-14 21:59         ` Richard Sandiford
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Sandiford @ 2022-11-14 21:54 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Tamar Christina <Tamar.Christina@arm.com> writes:
>> 
>> The same thing ought to work for smov, so it would be good to do both.
>> That would also make the split between the original and new patterns more
>> obvious: left shift for the old pattern, right shift for the new pattern.
>> 
>
> Done, though because umov can do multilevel extensions I couldn't combine them
> Into a single pattern.

Hmm, but the pattern is:

(define_insn "*<optab>si3_insn2_uxtw"
  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
	(zero_extend:GPI (LSHIFTRT_ONLY:SI
	  (match_operand:SI 1 "register_operand" "w,r,r")
	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]

GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
which isn't a valid operation.  The original patch was just for extending
to DI, which seems correct.  The choice between printing %x for smov and
%w for umov can then depend on the code.

>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
> 	left and right ones.
> 	(*aarch64_ashr_sisd_or_int_<mode>3, *<optab>si3_insn2_sxtw): Support
> 	smov.
> 	* config/aarch64/constraints.md (Usl): New.
> 	* config/aarch64/iterators.md (LSHIFTRT_ONLY, ASHIFTRT_ONLY): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/shift-read_1.c: New test.
> 	* gcc.target/aarch64/shift-read_2.c: New test.
> 	* gcc.target/aarch64/shift-read_3.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c333fb1f72725992bb304c560f1245a242d5192d..2bc2684b82c35a44e0a2cea6e3aaf32d939f8cdf 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5370,20 +5370,42 @@ (define_split
>  
>  ;; Arithmetic right shift using SISD or Integer instruction
>  (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
> -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
>  	(ashiftrt:GPI
> -	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
> +	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
>  	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
> -			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
> +			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
>    ""
> -  "@
> -   asr\t%<w>0, %<w>1, %2
> -   asr\t%<w>0, %<w>1, %<w>2
> -   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
> -   #
> -   #"
> -  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
> -   (set_attr "arch" "*,*,simd,simd,simd")]
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	return "asr\t%<w>0, %<w>1, %2";
> +      case 1:
> +	return "asr\t%<w>0, %<w>1, %<w>2";
> +      case 2:
> +	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
> +      case 3:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "smov\\t%w0, %1.h[1]";
> +	  if (size == 8)
> +	    return "smov\\t%w0, %1.b[3]";

This only looks right for SI, not DI.  (But we can do something
similar for DI.)
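
For the DI case the analogous mapping would presumably be keyed off 64 - val,
e.g. (a hypothetical sketch, function name mine):

#include <arm_neon.h>
#include <stdint.h>

int64_t asr48 (int64x2_t x)
{
  /* Bits 48..63 of the low d-register lane are v0.h[3], so this could in
     principle become a single "smov x0, v0.h[3]", assuming the same
     Usl-style restriction on the shift amount.  */
  return x[0] >> 48;
}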

Thanks,
Richard

> +	  gcc_unreachable ();
> +	}
> +      case 4:
> +	return "#";
> +      case 5:
> +	return "#";
> +      default:
> +	gcc_unreachable ();
> +    }
> +  }
> +  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
> +   (set_attr "arch" "*,*,simd,simd,simd,simd")]
>  )
>  
>  (define_split
> @@ -5493,7 +5515,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -	(zero_extend:DI (SHIFT_no_rotate:SI
> +	(zero_extend:DI (SHIFT_arith:SI
>  	 (match_operand:SI 1 "register_operand" "r,r")
>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5528,6 +5550,68 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>  
> +(define_insn "*<optab>si3_insn2_sxtw"
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> +	(sign_extend:GPI (ASHIFTRT_ONLY:SI
> +	  (match_operand:SI 1 "register_operand" "w,r,r")
> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  "<MODE>mode != DImode || satisfies_constraint_Usl (operands[2])"
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "smov\\t%<w>0, %1.h[1]";
> +	  if (size == 8)
> +	    return "smov\\t%<w>0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 1:
> +	return "<shift>\\t%<w>0, %<w>1, %2";
> +      case 2:
> +	return "<shift>\\t%<w>0, %<w>1, %<w>2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
> +(define_insn "*<optab>si3_insn2_uxtw"
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> +	(zero_extend:GPI (LSHIFTRT_ONLY:SI
> +	  (match_operand:SI 1 "register_operand" "w,r,r")
> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  ""
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "umov\\t%w0, %1.h[1]";
> +	  if (size == 8)
> +	    return "umov\\t%w0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 1:
> +	return "<shift>\\t%w0, %w1, %2";
> +      case 2:
> +	return "<shift>\\t%w0, %w1, %w2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>  
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>   (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index e904407b2169e589b7007ff966b2d9347a6d0fd2..b2682acb3bb12d584613d395200c3b39c0e94d8d 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,14 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>  
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
> +
> +;; Singleton code iterator for only logical right shift.
> +(define_code_iterator LSHIFTRT_ONLY [lshiftrt])
> +
> +;; Singleton code iterator for only arithmetic right shift.
> +(define_code_iterator ASHIFTRT_ONLY [ashiftrt])
>  
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +** 	umov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +** 	umov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**	...
> +** 	umov	w0, v[0-8]+.h\[1\]
> +** 	ret
> +*/
> +unsigned short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..541dce9303382e047c3931ad58a1cbd8b3e182fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> @@ -0,0 +1,96 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor_1:
> +** 	smov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +int32_t foor_1 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** foor_2:
> +** 	smov	x0, v0.h\[3\]
> +** 	ret
> +*/
> +int64_t foor_2 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +
> +/*
> +** fool:
> +** 	[su]mov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool (int32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +short foor2 (int32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool2 (int32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +short foo (int x)
> +{
> +  return x >> 16;
> +}
> +
> +/*
> +** foo2:
> +**	...
> +** 	umov	w0, v[0-8]+.h\[1\]
> +** 	ret
> +*/
> +short foo2 (v4si x)
> +{
> +  int y = x[0] + x[1];
> +  return y >> 16;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_3.c b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2ea81ff5b5af7794e062e471f46b433e1d7d87ee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
> @@ -0,0 +1,60 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** ufoo:
> +**	...
> +** 	umov	w0, v0.h\[1\]
> +** 	ret
> +*/
> +uint64_t ufoo (uint32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 16;
> +}
> +
> +/* 
> +** sfoo:
> +**	...
> +** 	smov	x0, v0.h\[1\]
> +** 	ret
> +*/
> +int64_t sfoo (int32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 16;
> +}
> +
> +/* 
> +** sfoo2:
> +**	...
> +** 	smov	w0, v0.h\[1\]
> +** 	ret
> +*/
> +int32_t sfoo2 (int32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 16;
> +}
> +
> +/* 
> +** ubar:
> +**	...
> +** 	umov	w0, v0.b\[3\]
> +** 	ret
> +*/
> +uint64_t ubar (uint32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 24;
> +}
> +
> +/* 
> +** sbar:
> +**	...
> +** 	smov	x0, v0.b\[3\]
> +** 	ret
> +*/
> +int64_t sbar (int32x4_t x)
> +{
> +  return (x[0] + x[1]) >> 24;
> +}

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand
  2022-11-14 21:54       ` Richard Sandiford
@ 2022-11-14 21:59         ` Richard Sandiford
  2022-12-01 16:25           ` Tamar Christina
  0 siblings, 1 reply; 19+ messages in thread
From: Richard Sandiford @ 2022-11-14 21:59 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

(Sorry, immediately following up to myself for a second time recently.)

Richard Sandiford <richard.sandiford@arm.com> writes:
> Tamar Christina <Tamar.Christina@arm.com> writes:
>>> 
>>> The same thing ought to work for smov, so it would be good to do both.
>>> That would also make the split between the original and new patterns more
>>> obvious: left shift for the old pattern, right shift for the new pattern.
>>> 
>>
>> Done, though because umov can do multilevel extensions I couldn't combine them
>> Into a single pattern.
>
> Hmm, but the pattern is:
>
> (define_insn "*<optab>si3_insn2_uxtw"
>   [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> 	(zero_extend:GPI (LSHIFTRT_ONLY:SI
> 	  (match_operand:SI 1 "register_operand" "w,r,r")
> 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
>
> GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
> which isn't a valid operation.  The original patch was just for extending
> to DI, which seems correct.  The choice between printing %x for smov and
> %w for umov can then depend on the code.

My original comment quoted above was about using smov in the zero-extend
pattern.  I.e. the original:

(define_insn "*<optab>si3_insn2_uxtw"
  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
	(zero_extend:DI (LSHIFTRT:SI
	 (match_operand:SI 1 "register_operand" "w,r,r")
	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]

could instead be:

(define_insn "*<optab>si3_insn2_uxtw"
  [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
	(zero_extend:DI (SHIFTRT:SI
	 (match_operand:SI 1 "register_operand" "w,r,r")
	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]

with the pattern using "smov %w0, ..." for the ashiftrt case.
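
As a purely illustrative example (the function name is mine, not from
the testsuite), a zero-extended arithmetic shift of a vector lane is
the kind of input the Usl/smov alternative of such a combined pattern
is aimed at:

#include <arm_neon.h>

/* Illustrative sketch: zero_extend:DI of (ashiftrt:SI lane0 16).
   smov %w0, v0.h[1] sign-extends the half into w0, and writing w0
   implicitly clears the upper 32 bits of x0.  */
uint64_t zext_of_signed_lane_shift (int32x4_t x)
{
  return (uint32_t) (x[0] >> 16);
}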

Thanks,
Richard

>
>>
>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
>> 	left and right ones.
>> 	(*aarch64_ashr_sisd_or_int_<mode>3, *<optab>si3_insn2_sxtw): Support
>> 	smov.
>> 	* config/aarch64/constraints.md (Usl): New.
>> 	* config/aarch64/iterators.md (LSHIFTRT_ONLY, ASHIFTRT_ONLY): New.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.target/aarch64/shift-read_1.c: New test.
>> 	* gcc.target/aarch64/shift-read_2.c: New test.
>> 	* gcc.target/aarch64/shift-read_3.c: New test.
>>
>> --- inline copy of patch ---
>>
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index c333fb1f72725992bb304c560f1245a242d5192d..2bc2684b82c35a44e0a2cea6e3aaf32d939f8cdf 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -5370,20 +5370,42 @@ (define_split
>>  
>>  ;; Arithmetic right shift using SISD or Integer instruction
>>  (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
>> -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
>>  	(ashiftrt:GPI
>> -	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
>> +	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
>>  	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
>> -			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
>> +			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
>>    ""
>> -  "@
>> -   asr\t%<w>0, %<w>1, %2
>> -   asr\t%<w>0, %<w>1, %<w>2
>> -   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
>> -   #
>> -   #"
>> -  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
>> -   (set_attr "arch" "*,*,simd,simd,simd")]
>> +  {
>> +    switch (which_alternative)
>> +    {
>> +      case 0:
>> +	return "asr\t%<w>0, %<w>1, %2";
>> +      case 1:
>> +	return "asr\t%<w>0, %<w>1, %<w>2";
>> +      case 2:
>> +	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
>> +      case 3:
>> +	{
>> +	  int val = INTVAL (operands[2]);
>> +	  int size = 32 - val;
>> +
>> +	  if (size == 16)
>> +	    return "smov\\t%w0, %1.h[1]";
>> +	  if (size == 8)
>> +	    return "smov\\t%w0, %1.b[3]";
>
> This only looks right for SI, not DI.  (But we can do something
> similar for DI.)
>
> Thanks,
> Richard
>
>> +	  gcc_unreachable ();
>> +	}
>> +      case 4:
>> +	return "#";
>> +      case 5:
>> +	return "#";
>> +      default:
>> +	gcc_unreachable ();
>> +    }
>> +  }
>> +  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
>> +   (set_attr "arch" "*,*,simd,simd,simd,simd")]
>>  )
>>  
>>  (define_split
>> @@ -5493,7 +5515,7 @@ (define_insn "*rol<mode>3_insn"
>>  ;; zero_extend version of shifts
>>  (define_insn "*<optab>si3_insn_uxtw"
>>    [(set (match_operand:DI 0 "register_operand" "=r,r")
>> -	(zero_extend:DI (SHIFT_no_rotate:SI
>> +	(zero_extend:DI (SHIFT_arith:SI
>>  	 (match_operand:SI 1 "register_operand" "r,r")
>>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>>    ""
>> @@ -5528,6 +5550,68 @@ (define_insn "*rolsi3_insn_uxtw"
>>    [(set_attr "type" "rotate_imm")]
>>  )
>>  
>> +(define_insn "*<optab>si3_insn2_sxtw"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
>> +	(sign_extend:GPI (ASHIFTRT_ONLY:SI
>> +	  (match_operand:SI 1 "register_operand" "w,r,r")
>> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
>> +  "<MODE>mode != DImode || satisfies_constraint_Usl (operands[2])"
>> +  {
>> +    switch (which_alternative)
>> +    {
>> +      case 0:
>> +	{
>> +	  int val = INTVAL (operands[2]);
>> +	  int size = 32 - val;
>> +
>> +	  if (size == 16)
>> +	    return "smov\\t%<w>0, %1.h[1]";
>> +	  if (size == 8)
>> +	    return "smov\\t%<w>0, %1.b[3]";
>> +	  gcc_unreachable ();
>> +	}
>> +      case 1:
>> +	return "<shift>\\t%<w>0, %<w>1, %2";
>> +      case 2:
>> +	return "<shift>\\t%<w>0, %<w>1, %<w>2";
>> +      default:
>> +	gcc_unreachable ();
>> +      }
>> +  }
>> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
>> +)
>> +
>> +(define_insn "*<optab>si3_insn2_uxtw"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
>> +	(zero_extend:GPI (LSHIFTRT_ONLY:SI
>> +	  (match_operand:SI 1 "register_operand" "w,r,r")
>> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
>> +  ""
>> +  {
>> +    switch (which_alternative)
>> +    {
>> +      case 0:
>> +	{
>> +	  int val = INTVAL (operands[2]);
>> +	  int size = 32 - val;
>> +
>> +	  if (size == 16)
>> +	    return "umov\\t%w0, %1.h[1]";
>> +	  if (size == 8)
>> +	    return "umov\\t%w0, %1.b[3]";
>> +	  gcc_unreachable ();
>> +	}
>> +      case 1:
>> +	return "<shift>\\t%w0, %w1, %2";
>> +      case 2:
>> +	return "<shift>\\t%w0, %w1, %w2";
>> +      default:
>> +	gcc_unreachable ();
>> +      }
>> +  }
>> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
>> +)
>> +
>>  (define_insn "*<optab><mode>3_insn"
>>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
>> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
>> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..470510d691ee8589aec9b0a71034677534641bea 100644
>> --- a/gcc/config/aarch64/constraints.md
>> +++ b/gcc/config/aarch64/constraints.md
>> @@ -166,6 +166,14 @@ (define_constraint "Uss"
>>    (and (match_code "const_int")
>>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>>  
>> +(define_constraint "Usl"
>> +  "@internal
>> +  A constraint that matches an immediate shift constant in SImode that has an
>> +  exact mode available to use."
>> +  (and (match_code "const_int")
>> +       (and (match_test "satisfies_constraint_Uss (op)")
>> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
>> +
>>  (define_constraint "Usn"
>>   "A constant that can be used with a CCMN operation (once negated)."
>>   (and (match_code "const_int")
>> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
>> index e904407b2169e589b7007ff966b2d9347a6d0fd2..b2682acb3bb12d584613d395200c3b39c0e94d8d 100644
>> --- a/gcc/config/aarch64/iterators.md
>> +++ b/gcc/config/aarch64/iterators.md
>> @@ -2149,8 +2149,14 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>>  ;; This code iterator allows the various shifts supported on the core
>>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>>  
>> -;; This code iterator allows all shifts except for rotates.
>> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
>> +;; This code iterator allows arithmetic shifts
>> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
>> +
>> +;; Singleton code iterator for only logical right shift.
>> +(define_code_iterator LSHIFTRT_ONLY [lshiftrt])
>> +
>> +;; Singleton code iterator for only arithmetic right shift.
>> +(define_code_iterator ASHIFTRT_ONLY [ashiftrt])
>>  
>>  ;; This code iterator allows the shifts supported in arithmetic instructions
>>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
>> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..e6e355224c96344fe1cdabd6b0d3d5d609cd95bd
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
>> @@ -0,0 +1,85 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foor:
>> +** 	umov	w0, v0.h\[3\]
>> +** 	ret
>> +*/
>> +unsigned int foor (uint32x4_t x)
>> +{
>> +    return x[1] >> 16;
>> +}
>> +
>> +/*
>> +** fool:
>> +** 	umov	w0, v0.s\[1\]
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +unsigned int fool (uint32x4_t x)
>> +{
>> +    return x[1] << 16;
>> +}
>> +
>> +/*
>> +** foor2:
>> +** 	umov	w0, v0.h\[7\]
>> +** 	ret
>> +*/
>> +unsigned short foor2 (uint32x4_t x)
>> +{
>> +    return x[3] >> 16;
>> +}
>> +
>> +/*
>> +** fool2:
>> +** 	fmov	w0, s0
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +unsigned int fool2 (uint32x4_t x)
>> +{
>> +    return x[0] << 16;
>> +}
>> +
>> +typedef int v4si __attribute__ ((vector_size (16)));
>> +
>> +/*
>> +** bar:
>> +**	addv	s0, v0.4s
>> +**	fmov	w0, s0
>> +**	lsr	w1, w0, 16
>> +**	add	w0, w1, w0, uxth
>> +**	ret
>> +*/
>> +int bar (v4si x)
>> +{
>> +  unsigned int sum = vaddvq_s32 (x);
>> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
>> +}
>> +
>> +/*
>> +** foo:
>> +** 	lsr	w0, w0, 16
>> +** 	ret
>> +*/
>> +unsigned short foo (unsigned x)
>> +{
>> +  return x >> 16;
>> +}
>> +
>> +/*
>> +** foo2:
>> +**	...
>> +** 	umov	w0, v[0-8]+.h\[1\]
>> +** 	ret
>> +*/
>> +unsigned short foo2 (v4si x)
>> +{
>> +  int y = x[0] + x[1];
>> +  return y >> 16;
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..541dce9303382e047c3931ad58a1cbd8b3e182fb
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
>> @@ -0,0 +1,96 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foor_1:
>> +** 	smov	w0, v0.h\[3\]
>> +** 	ret
>> +*/
>> +int32_t foor_1 (int32x4_t x)
>> +{
>> +    return x[1] >> 16;
>> +}
>> +
>> +/*
>> +** foor_2:
>> +** 	smov	x0, v0.h\[3\]
>> +** 	ret
>> +*/
>> +int64_t foor_2 (int32x4_t x)
>> +{
>> +    return x[1] >> 16;
>> +}
>> +
>> +
>> +/*
>> +** fool:
>> +** 	[su]mov	w0, v0.s\[1\]
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +int fool (int32x4_t x)
>> +{
>> +    return x[1] << 16;
>> +}
>> +
>> +/*
>> +** foor2:
>> +** 	umov	w0, v0.h\[7\]
>> +** 	ret
>> +*/
>> +short foor2 (int32x4_t x)
>> +{
>> +    return x[3] >> 16;
>> +}
>> +
>> +/*
>> +** fool2:
>> +** 	fmov	w0, s0
>> +** 	lsl	w0, w0, 16
>> +** 	ret
>> +*/
>> +int fool2 (int32x4_t x)
>> +{
>> +    return x[0] << 16;
>> +}
>> +
>> +typedef int v4si __attribute__ ((vector_size (16)));
>> +
>> +/*
>> +** bar:
>> +**	addv	s0, v0.4s
>> +**	fmov	w0, s0
>> +**	lsr	w1, w0, 16
>> +**	add	w0, w1, w0, uxth
>> +**	ret
>> +*/
>> +int bar (v4si x)
>> +{
>> +  unsigned int sum = vaddvq_s32 (x);
>> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
>> +}
>> +
>> +/*
>> +** foo:
>> +** 	lsr	w0, w0, 16
>> +** 	ret
>> +*/
>> +short foo (int x)
>> +{
>> +  return x >> 16;
>> +}
>> +
>> +/*
>> +** foo2:
>> +**	...
>> +** 	umov	w0, v[0-8]+.h\[1\]
>> +** 	ret
>> +*/
>> +short foo2 (v4si x)
>> +{
>> +  int y = x[0] + x[1];
>> +  return y >> 16;
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_3.c b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..2ea81ff5b5af7794e062e471f46b433e1d7d87ee
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_3.c
>> @@ -0,0 +1,60 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-O2" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** ufoo:
>> +**	...
>> +** 	umov	w0, v0.h\[1\]
>> +** 	ret
>> +*/
>> +uint64_t ufoo (uint32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 16;
>> +}
>> +
>> +/* 
>> +** sfoo:
>> +**	...
>> +** 	smov	x0, v0.h\[1\]
>> +** 	ret
>> +*/
>> +int64_t sfoo (int32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 16;
>> +}
>> +
>> +/* 
>> +** sfoo2:
>> +**	...
>> +** 	smov	w0, v0.h\[1\]
>> +** 	ret
>> +*/
>> +int32_t sfoo2 (int32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 16;
>> +}
>> +
>> +/* 
>> +** ubar:
>> +**	...
>> +** 	umov	w0, v0.b\[3\]
>> +** 	ret
>> +*/
>> +uint64_t ubar (uint32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 24;
>> +}
>> +
>> +/* 
>> +** sbar:
>> +**	...
>> +** 	smov	x0, v0.b\[3\]
>> +** 	ret
>> +*/
>> +int64_t sbar (int32x4_t x)
>> +{
>> +  return (x[0] + x[1]) >> 24;
>> +}


* RE: [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand
  2022-11-14 21:59         ` Richard Sandiford
@ 2022-12-01 16:25           ` Tamar Christina
  2022-12-01 18:38             ` Richard Sandiford
  0 siblings, 1 reply; 19+ messages in thread
From: Tamar Christina @ 2022-12-01 16:25 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 11934 bytes --]

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, November 14, 2022 9:59 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves
> and shifts which arrive after expand
> 
> (Sorry, immediately following up to myself for a second time recently.)
> 
> Richard Sandiford <richard.sandiford@arm.com> writes:
> > Tamar Christina <Tamar.Christina@arm.com> writes:
> >>>
> >>> The same thing ought to work for smov, so it would be good to do both.
> >>> That would also make the split between the original and new patterns
> >>> more
> >>> obvious: left shift for the old pattern, right shift for the new pattern.
> >>>
> >>
> >> Done, though because umov can do multilevel extensions I couldn't
> >> combine them into a single pattern.
> >
> > Hmm, but the pattern is:
> >
> > (define_insn "*<optab>si3_insn2_uxtw"
> >   [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
> > 	(zero_extend:GPI (LSHIFTRT_ONLY:SI
> > 	  (match_operand:SI 1 "register_operand" "w,r,r")
> > 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
> "Usl,Uss,r"))))]
> >
> > GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
> > which isn't a valid operation.  The original patch was just for
> > extending to DI, which seems correct.  The choice between printing %x
> > for smov and %w for umov can then depend on the code.

You're right, GPI made no sense here.  Fixed.

> 
> My original comment quoted above was about using smov in the zero-
> extend pattern.  I.e. the original:
> 
> (define_insn "*<optab>si3_insn2_uxtw"
>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
> 	(zero_extend:DI (LSHIFTRT:SI
> 	 (match_operand:SI 1 "register_operand" "w,r,r")
> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
> "Usl,Uss,r"))))]
> 
> could instead be:
> 
> (define_insn "*<optab>si3_insn2_uxtw"
>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
> 	(zero_extend:DI (SHIFTRT:SI
> 	 (match_operand:SI 1 "register_operand" "w,r,r")
> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
> "Usl,Uss,r"))))]
> 
> with the pattern using "smov %w0, ..." for the ashiftrt case.

Almost, except the non-immediate cases don't work with shifts,
i.e. a right shift can't be used to sign extend from 32 to 64 bits.

I've merged the cases but added a guard for this.
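
To make the excluded case concrete (example and function name are mine,
not taken from the testsuite): with a register shift amount there is no
single w-register instruction that both shifts and sign-extends to
64 bits, because an asr on the w-register clears the upper half of the
x-register rather than sign-extending into it.

#include <stdint.h>

/* Illustrative sketch: sign_extend:DI of a variable ashiftrt:SI.
   This is the case the new guard keeps out of the pattern; only the
   Usl immediate forms can use smov for the sign-extending variant.  */
int64_t sext_after_variable_shift (int32_t x, int n)
{
  return (int64_t) (x >> n);
}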

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
	left and right ones.
	(*aarch64_ashr_sisd_or_int_<mode>3): Support smov.
	(*<optab>si3_insn2_<sra_op>xtw): New.
	* config/aarch64/constraints.md (Usl): New.
	* config/aarch64/iterators.md (is_zeroE, extend_op): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/shift-read_1.c: New test.
	* gcc.target/aarch64/shift-read_2.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 39e65979528fb7f748ed456399ca38f929dba1d4..4c181a96e555c2a58c59fc991000b2a2fa9bd244 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5425,20 +5425,42 @@ (define_split
 
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
 	(ashiftrt:GPI
-	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
+	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
-			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
+			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
   ""
-  "@
-   asr\t%<w>0, %<w>1, %2
-   asr\t%<w>0, %<w>1, %<w>2
-   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
-   #
-   #"
-  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
-   (set_attr "arch" "*,*,simd,simd,simd")]
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	return "asr\t%<w>0, %<w>1, %2";
+      case 1:
+	return "asr\t%<w>0, %<w>1, %<w>2";
+      case 2:
+	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
+      case 3:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "smov\\t%<w>0, %1.h[1]";
+	  if (size == 8)
+	    return "smov\\t%<w>0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 4:
+	return "#";
+      case 5:
+	return "#";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd,simd,simd")]
 )
 
 (define_split
@@ -5548,7 +5570,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5583,6 +5605,37 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_<sra_op>xtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+	(<extend_op>:DI (SHIFTRT:SI
+	  (match_operand:SI 1 "register_operand" "w,r,r")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  "<is_zeroE> || satisfies_constraint_Usl (operands[2])"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "<sra_op>mov\\t%x0, %1.h[1]";
+	  if (size == 8)
+	    return "<sra_op>mov\\t%x0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 29efb6c0cff7574c9b239ef358acaca96dd75d03..c2a696cb77f49cae23239b0ed8a8aa5168f8898c 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -171,6 +171,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7c69b124f076b4fb2540241f287c6999c32123c1..df72c079f218db9727a96924cab496e91ce6df59 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,8 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
@@ -2378,9 +2378,18 @@ (define_code_attr shift [(ashift "lsl") (ashiftrt "asr")
 (define_code_attr is_rotl [(ashift "0") (ashiftrt "0")
 			   (lshiftrt "0") (rotatert "0") (rotate "1")])
 
+;; True if zero extending operation or not
+(define_code_attr is_zeroE [(ashift "false") (ashiftrt "false")
+			   (lshiftrt "true")])
+
+
 ;; Op prefix for shift right and accumulate.
 (define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")])
 
+;; Extensions that can be performed with Op
+(define_code_attr extend_op [(ashiftrt "sign_extend")
+			     (lshiftrt "zero_extend")])
+
 ;; op prefix for shift right and narrow.
 (define_code_attr srn_op [(ashiftrt "r") (lshiftrt "")])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..864cfcb1650ae6553a18e753c8d8d0e85cd0ba7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..bdc214d1941807ce5aa21c369fcfe23c1927e98b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor_1:
+** 	smov	w0, v0.h\[3\]
+** 	ret
+*/
+int32_t foor_1 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** foor_2:
+** 	smov	x0, v0.h\[3\]
+** 	ret
+*/
+int64_t foor_2 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+
+/*
+** fool:
+** 	[su]mov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool (int32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+short foor2 (int32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool2 (int32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+short foo (int x)
+{
+  return x >> 16;
+}

[-- Attachment #2: rb15777.patch --]
[-- Type: application/octet-stream, Size: 8395 bytes --]

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 39e65979528fb7f748ed456399ca38f929dba1d4..4c181a96e555c2a58c59fc991000b2a2fa9bd244 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5425,20 +5425,42 @@ (define_split
 
 ;; Arithmetic right shift using SISD or Integer instruction
 (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
-  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
+  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
 	(ashiftrt:GPI
-	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
+	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
-			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
+			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
   ""
-  "@
-   asr\t%<w>0, %<w>1, %2
-   asr\t%<w>0, %<w>1, %<w>2
-   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
-   #
-   #"
-  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
-   (set_attr "arch" "*,*,simd,simd,simd")]
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	return "asr\t%<w>0, %<w>1, %2";
+      case 1:
+	return "asr\t%<w>0, %<w>1, %<w>2";
+      case 2:
+	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
+      case 3:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "smov\\t%<w>0, %1.h[1]";
+	  if (size == 8)
+	    return "smov\\t%<w>0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 4:
+	return "#";
+      case 5:
+	return "#";
+      default:
+	gcc_unreachable ();
+    }
+  }
+  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
+   (set_attr "arch" "*,*,simd,simd,simd,simd")]
 )
 
 (define_split
@@ -5548,7 +5570,7 @@ (define_insn "*rol<mode>3_insn"
 ;; zero_extend version of shifts
 (define_insn "*<optab>si3_insn_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
-	(zero_extend:DI (SHIFT_no_rotate:SI
+	(zero_extend:DI (SHIFT_arith:SI
 	 (match_operand:SI 1 "register_operand" "r,r")
 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
   ""
@@ -5583,6 +5605,37 @@ (define_insn "*rolsi3_insn_uxtw"
   [(set_attr "type" "rotate_imm")]
 )
 
+(define_insn "*<optab>si3_insn2_<sra_op>xtw"
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+	(<extend_op>:DI (SHIFTRT:SI
+	  (match_operand:SI 1 "register_operand" "w,r,r")
+	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
+  "<is_zeroE> || satisfies_constraint_Usl (operands[2])"
+  {
+    switch (which_alternative)
+    {
+      case 0:
+	{
+	  int val = INTVAL (operands[2]);
+	  int size = 32 - val;
+
+	  if (size == 16)
+	    return "<sra_op>mov\\t%x0, %1.h[1]";
+	  if (size == 8)
+	    return "<sra_op>mov\\t%x0, %1.b[3]";
+	  gcc_unreachable ();
+	}
+      case 1:
+	return "<shift>\\t%w0, %w1, %2";
+      case 2:
+	return "<shift>\\t%w0, %w1, %w2";
+      default:
+	gcc_unreachable ();
+      }
+  }
+  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
+)
+
 (define_insn "*<optab><mode>3_insn"
   [(set (match_operand:SHORT 0 "register_operand" "=r")
 	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 29efb6c0cff7574c9b239ef358acaca96dd75d03..c2a696cb77f49cae23239b0ed8a8aa5168f8898c 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -171,6 +171,14 @@ (define_constraint "Uss"
   (and (match_code "const_int")
        (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
 
+(define_constraint "Usl"
+  "@internal
+  A constraint that matches an immediate shift constant in SImode that has an
+  exact mode available to use."
+  (and (match_code "const_int")
+       (and (match_test "satisfies_constraint_Uss (op)")
+	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
+
 (define_constraint "Usn"
  "A constant that can be used with a CCMN operation (once negated)."
  (and (match_code "const_int")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7c69b124f076b4fb2540241f287c6999c32123c1..df72c079f218db9727a96924cab496e91ce6df59 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2149,8 +2149,8 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
 ;; This code iterator allows the various shifts supported on the core
 (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
 
-;; This code iterator allows all shifts except for rotates.
-(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
+;; This code iterator allows arithmetic shifts
+(define_code_iterator SHIFT_arith [ashift ashiftrt])
 
 ;; This code iterator allows the shifts supported in arithmetic instructions
 (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
@@ -2378,9 +2378,18 @@ (define_code_attr shift [(ashift "lsl") (ashiftrt "asr")
 (define_code_attr is_rotl [(ashift "0") (ashiftrt "0")
 			   (lshiftrt "0") (rotatert "0") (rotate "1")])
 
+;; True if zero extending operation or not
+(define_code_attr is_zeroE [(ashift "false") (ashiftrt "false")
+			   (lshiftrt "true")])
+
+
 ;; Op prefix for shift right and accumulate.
 (define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")])
 
+;; Extensions that can be performed with Op
+(define_code_attr extend_op [(ashiftrt "sign_extend")
+			     (lshiftrt "zero_extend")])
+
 ;; op prefix for shift right and narrow.
 (define_code_attr srn_op [(ashiftrt "r") (lshiftrt "")])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..864cfcb1650ae6553a18e753c8d8d0e85cd0ba7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor:
+** 	umov	w0, v0.h\[3\]
+** 	ret
+*/
+unsigned int foor (uint32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** fool:
+** 	umov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool (uint32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+unsigned short foor2 (uint32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+unsigned int fool2 (uint32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+unsigned short foo (unsigned x)
+{
+  return x >> 16;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..bdc214d1941807ce5aa21c369fcfe23c1927e98b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
@@ -0,0 +1,84 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+** foor_1:
+** 	smov	w0, v0.h\[3\]
+** 	ret
+*/
+int32_t foor_1 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+/*
+** foor_2:
+** 	smov	x0, v0.h\[3\]
+** 	ret
+*/
+int64_t foor_2 (int32x4_t x)
+{
+    return x[1] >> 16;
+}
+
+
+/*
+** fool:
+** 	[su]mov	w0, v0.s\[1\]
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool (int32x4_t x)
+{
+    return x[1] << 16;
+}
+
+/*
+** foor2:
+** 	umov	w0, v0.h\[7\]
+** 	ret
+*/
+short foor2 (int32x4_t x)
+{
+    return x[3] >> 16;
+}
+
+/*
+** fool2:
+** 	fmov	w0, s0
+** 	lsl	w0, w0, 16
+** 	ret
+*/
+int fool2 (int32x4_t x)
+{
+    return x[0] << 16;
+}
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+** bar:
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
+
+/*
+** foo:
+** 	lsr	w0, w0, 16
+** 	ret
+*/
+short foo (int x)
+{
+  return x >> 16;
+}


* Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand
  2022-12-01 16:25           ` Tamar Christina
@ 2022-12-01 18:38             ` Richard Sandiford
  0 siblings, 0 replies; 19+ messages in thread
From: Richard Sandiford @ 2022-12-01 18:38 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Monday, November 14, 2022 9:59 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH 2/2]AArch64 Perform more late folding of reg moves
>> and shifts which arrive after expand
>> 
>> (Sorry, immediately following up to myself for a second time recently.)
>> 
>> Richard Sandiford <richard.sandiford@arm.com> writes:
>> > Tamar Christina <Tamar.Christina@arm.com> writes:
>> >>>
>> >>> The same thing ought to work for smov, so it would be good to do both.
>> >>> That would also make the split between the original and new patterns
>> >>> more
>> >>> obvious: left shift for the old pattern, right shift for the new pattern.
>> >>>
>> >>
>> >> Done, though because umov can do multilevel extensions I couldn't
>> >> combine them into a single pattern.
>> >
>> > Hmm, but the pattern is:
>> >
>> > (define_insn "*<optab>si3_insn2_uxtw"
>> >   [(set (match_operand:GPI 0 "register_operand" "=r,r,r")
>> > 	(zero_extend:GPI (LSHIFTRT_ONLY:SI
>> > 	  (match_operand:SI 1 "register_operand" "w,r,r")
>> > 	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
>> "Usl,Uss,r"))))]
>> >
>> > GPI is just SI or DI, so in the SI case we're zero-extending SI to SI,
>> > which isn't a valid operation.  The original patch was just for
>> > extending to DI, which seems correct.  The choice between printing %x
>> > for smov and %w for umov can then depend on the code.
>
> You're right, GPI made no sense here.  Fixed.
>
>> 
>> My original comment quoted above was about using smov in the zero-
>> extend pattern.  I.e. the original:
>> 
>> (define_insn "*<optab>si3_insn2_uxtw"
>>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
>> 	(zero_extend:DI (LSHIFTRT:SI
>> 	 (match_operand:SI 1 "register_operand" "w,r,r")
>> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
>> "Usl,Uss,r"))))]
>> 
>> could instead be:
>> 
>> (define_insn "*<optab>si3_insn2_uxtw"
>>   [(set (match_operand:DI 0 "register_operand" "=r,?r,r")
>> 	(zero_extend:DI (SHIFTRT:SI
>> 	 (match_operand:SI 1 "register_operand" "w,r,r")
>> 	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"
>> "Usl,Uss,r"))))]
>> 
>> with the pattern using "smov %w0, ..." for the ashiftrt case.
>
> Almost, except the non-immediate cases don't work with shifts,
> i.e. a right shift can't be used to sign extend from 32 to 64 bits.

Right, but the pattern I quoted above is doing a zero-extend rather than
a sign-extend, even for the ashiftrt case.  That is, I was suggesting that
we keep the zero_extend fixed but allow zero extensions of both lshiftrts
and ashiftrts.  That works because ASR Wx and SMOV Wx zero-extend the Wn
result to Xn.
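
For instance (a sketch of mine, not taken from the patch or testsuite):

#include <stdint.h>

/* zero_extend:DI of an ashiftrt:SI.  An asr on the w-register already
   clears the upper 32 bits of the x-register, so the ashiftrt case can
   share the zero_extend:DI pattern with lshiftrt.  */
uint64_t zext_of_asr (int32_t x)
{
  return (uint32_t) (x >> 16);
}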

I wasn't suggesting that you add support for SI->DI sign extensions,
although obviously the more cases we optimise the better :-)

The original comment was only supposed to be a small tweak, sorry for
not explaining it properly.

Thanks,
Richard

>
> I've merged the cases but added a guard for this.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.md (*<optab>si3_insn_uxtw): Split SHIFT into
> 	left and right ones.
> 	(*aarch64_ashr_sisd_or_int_<mode>3): Support smov.
> 	(*<optab>si3_insn2_<sra_op>xtw): New.
> 	* config/aarch64/constraints.md (Usl): New.
> 	* config/aarch64/iterators.md (is_zeroE, extend_op): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/shift-read_1.c: New test.
> 	* gcc.target/aarch64/shift-read_2.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 39e65979528fb7f748ed456399ca38f929dba1d4..4c181a96e555c2a58c59fc991000b2a2fa9bd244 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5425,20 +5425,42 @@ (define_split
>  
>  ;; Arithmetic right shift using SISD or Integer instruction
>  (define_insn "*aarch64_ashr_sisd_or_int_<mode>3"
> -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,&w,&w")
> +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r,&w,&w")
>  	(ashiftrt:GPI
> -	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w")
> +	  (match_operand:GPI 1 "register_operand" "r,r,w,w,w,w")
>  	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_di"
> -			       "Us<cmode>,r,Us<cmode_simd>,w,0")))]
> +			       "Us<cmode>,r,Us<cmode_simd>,Usl,w,0")))]
>    ""
> -  "@
> -   asr\t%<w>0, %<w>1, %2
> -   asr\t%<w>0, %<w>1, %<w>2
> -   sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2
> -   #
> -   #"
> -  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_shift_reg<q>,neon_shift_reg<q>")
> -   (set_attr "arch" "*,*,simd,simd,simd")]
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	return "asr\t%<w>0, %<w>1, %2";
> +      case 1:
> +	return "asr\t%<w>0, %<w>1, %<w>2";
> +      case 2:
> +	return "sshr\t%<rtn>0<vas>, %<rtn>1<vas>, %2";
> +      case 3:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "smov\\t%<w>0, %1.h[1]";
> +	  if (size == 8)
> +	    return "smov\\t%<w>0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 4:
> +	return "#";
> +      case 5:
> +	return "#";
> +      default:
> +	gcc_unreachable ();
> +    }
> +  }
> +  [(set_attr "type" "bfx,shift_reg,neon_shift_imm<q>,neon_to_gp, neon_shift_reg<q>,neon_shift_reg<q>")
> +   (set_attr "arch" "*,*,simd,simd,simd,simd")]
>  )
>  
>  (define_split
> @@ -5548,7 +5570,7 @@ (define_insn "*rol<mode>3_insn"
>  ;; zero_extend version of shifts
>  (define_insn "*<optab>si3_insn_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=r,r")
> -	(zero_extend:DI (SHIFT_no_rotate:SI
> +	(zero_extend:DI (SHIFT_arith:SI
>  	 (match_operand:SI 1 "register_operand" "r,r")
>  	 (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Uss,r"))))]
>    ""
> @@ -5583,6 +5605,37 @@ (define_insn "*rolsi3_insn_uxtw"
>    [(set_attr "type" "rotate_imm")]
>  )
>  
> +(define_insn "*<optab>si3_insn2_<sra_op>xtw"
> +  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
> +	(<extend_op>:DI (SHIFTRT:SI
> +	  (match_operand:SI 1 "register_operand" "w,r,r")
> +	  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si" "Usl,Uss,r"))))]
> +  "<is_zeroE> || satisfies_constraint_Usl (operands[2])"
> +  {
> +    switch (which_alternative)
> +    {
> +      case 0:
> +	{
> +	  int val = INTVAL (operands[2]);
> +	  int size = 32 - val;
> +
> +	  if (size == 16)
> +	    return "<sra_op>mov\\t%x0, %1.h[1]";
> +	  if (size == 8)
> +	    return "<sra_op>mov\\t%x0, %1.b[3]";
> +	  gcc_unreachable ();
> +	}
> +      case 1:
> +	return "<shift>\\t%w0, %w1, %2";
> +      case 2:
> +	return "<shift>\\t%w0, %w1, %w2";
> +      default:
> +	gcc_unreachable ();
> +      }
> +  }
> +  [(set_attr "type" "neon_to_gp,bfx,shift_reg")]
> +)
> +
>  (define_insn "*<optab><mode>3_insn"
>    [(set (match_operand:SHORT 0 "register_operand" "=r")
>  	(ASHIFT:SHORT (match_operand:SHORT 1 "register_operand" "r")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index 29efb6c0cff7574c9b239ef358acaca96dd75d03..c2a696cb77f49cae23239b0ed8a8aa5168f8898c 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -171,6 +171,14 @@ (define_constraint "Uss"
>    (and (match_code "const_int")
>         (match_test "(unsigned HOST_WIDE_INT) ival < 32")))
>  
> +(define_constraint "Usl"
> +  "@internal
> +  A constraint that matches an immediate shift constant in SImode that has an
> +  exact mode available to use."
> +  (and (match_code "const_int")
> +       (and (match_test "satisfies_constraint_Uss (op)")
> +	    (match_test "(32 - ival == 8) || (32 - ival == 16)"))))
> +
>  (define_constraint "Usn"
>   "A constant that can be used with a CCMN operation (once negated)."
>   (and (match_code "const_int")
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 7c69b124f076b4fb2540241f287c6999c32123c1..df72c079f218db9727a96924cab496e91ce6df59 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2149,8 +2149,8 @@ (define_mode_attr sve_lane_pair_con [(VNx8HF "y") (VNx4SF "x")])
>  ;; This code iterator allows the various shifts supported on the core
>  (define_code_iterator SHIFT [ashift ashiftrt lshiftrt rotatert rotate])
>  
> -;; This code iterator allows all shifts except for rotates.
> -(define_code_iterator SHIFT_no_rotate [ashift ashiftrt lshiftrt])
> +;; This code iterator allows arithmetic shifts
> +(define_code_iterator SHIFT_arith [ashift ashiftrt])
>  
>  ;; This code iterator allows the shifts supported in arithmetic instructions
>  (define_code_iterator ASHIFT [ashift ashiftrt lshiftrt])
> @@ -2378,9 +2378,18 @@ (define_code_attr shift [(ashift "lsl") (ashiftrt "asr")
>  (define_code_attr is_rotl [(ashift "0") (ashiftrt "0")
>  			   (lshiftrt "0") (rotatert "0") (rotate "1")])
>  
> +;; True if zero extending operation or not
> +(define_code_attr is_zeroE [(ashift "false") (ashiftrt "false")
> +			   (lshiftrt "true")])
> +
> +
>  ;; Op prefix for shift right and accumulate.
>  (define_code_attr sra_op [(ashiftrt "s") (lshiftrt "u")])
>  
> +;; Extensions that can be performed with Op
> +(define_code_attr extend_op [(ashiftrt "sign_extend")
> +			     (lshiftrt "zero_extend")])
> +
>  ;; op prefix for shift right and narrow.
>  (define_code_attr srn_op [(ashiftrt "r") (lshiftrt "")])
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_1.c b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..864cfcb1650ae6553a18e753c8d8d0e85cd0ba7b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_1.c
> @@ -0,0 +1,73 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor:
> +** 	umov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +unsigned int foor (uint32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** fool:
> +** 	umov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool (uint32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +unsigned short foor2 (uint32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +unsigned int fool2 (uint32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +unsigned short foo (unsigned x)
> +{
> +  return x >> 16;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/shift-read_2.c b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..bdc214d1941807ce5aa21c369fcfe23c1927e98b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/shift-read_2.c
> @@ -0,0 +1,84 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foor_1:
> +** 	smov	w0, v0.h\[3\]
> +** 	ret
> +*/
> +int32_t foor_1 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +/*
> +** foor_2:
> +** 	smov	x0, v0.h\[3\]
> +** 	ret
> +*/
> +int64_t foor_2 (int32x4_t x)
> +{
> +    return x[1] >> 16;
> +}
> +
> +
> +/*
> +** fool:
> +** 	[su]mov	w0, v0.s\[1\]
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool (int32x4_t x)
> +{
> +    return x[1] << 16;
> +}
> +
> +/*
> +** foor2:
> +** 	umov	w0, v0.h\[7\]
> +** 	ret
> +*/
> +short foor2 (int32x4_t x)
> +{
> +    return x[3] >> 16;
> +}
> +
> +/*
> +** fool2:
> +** 	fmov	w0, s0
> +** 	lsl	w0, w0, 16
> +** 	ret
> +*/
> +int fool2 (int32x4_t x)
> +{
> +    return x[0] << 16;
> +}
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +** bar:
> +**	addv	s0, v0.4s
> +**	fmov	w0, s0
> +**	lsr	w1, w0, 16
> +**	add	w0, w1, w0, uxth
> +**	ret
> +*/
> +int bar (v4si x)
> +{
> +  unsigned int sum = vaddvq_s32 (x);
> +  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
> +}
> +
> +/*
> +** foo:
> +** 	lsr	w0, w0, 16
> +** 	ret
> +*/
> +short foo (int x)
> +{
> +  return x >> 16;
> +}


end of thread, other threads:[~2022-12-01 18:38 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
2022-09-23 11:42 [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Tamar Christina
2022-09-23 11:43 ` [PATCH 2/2]AArch64 Perform more late folding of reg moves and shifts which arrive after expand Tamar Christina
2022-09-23 14:32   ` Richard Sandiford
2022-10-31 11:48     ` Tamar Christina
2022-11-14 21:54       ` Richard Sandiford
2022-11-14 21:59         ` Richard Sandiford
2022-12-01 16:25           ` Tamar Christina
2022-12-01 18:38             ` Richard Sandiford
2022-09-24 18:38 ` [PATCH 1/2]middle-end Fold BIT_FIELD_REF and Shifts into BIT_FIELD_REFs alone Jeff Law
2022-09-28 13:19   ` Tamar Christina
2022-09-28 17:25     ` Jeff Law
2022-09-24 18:57 ` Andrew Pinski
2022-09-26  4:55   ` Tamar Christina
2022-09-26  8:05     ` Richard Biener
2022-09-26 15:24     ` Andrew Pinski
2022-09-27 12:40       ` Richard Biener
2022-10-31 11:51         ` Tamar Christina
2022-10-31 16:24           ` Jeff Law
2022-11-07 13:29           ` Richard Biener
