Subject: [committed] arm: Improve code generation for BFI and BFC [PR105090]
From: Richard Earnshaw
Date: 2022-06-07 11:13 UTC
To: gcc-patches; +Cc: Richard Earnshaw

This patch, in response to PR105090, makes some general improvements
to the code generation when BFI and BFC instructions are available.
Firstly, we handle more cases where the RTL does not form an INSV
operation because there is no tie between the input and the output,
but where we nevertheless want to emit BFI later on; we handle this by
requiring the register allocator to tie the operands.  Secondly, we
handle some cases where we previously emitted BFC but an AND with an
immediate would be better; we do this by converting all BFC patterns
into AND using a split pattern.  Finally, we handle some cases where
we previously emitted multiple BIC operations to clear a value but
could instead use a single BFC instruction.
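
As an illustration (these functions are invented for this write-up and
are not taken from the patch or its testcases), source code of roughly
this shape is what benefits on Thumb-2:

  /* Insert the low four bits of Y into bits [11:8] of X.  This is the
     BFI idiom: with this patch the register allocator is asked to tie
     the input and output so that a single BFI can be emitted.  */
  unsigned int
  insert_nibble (unsigned int x, unsigned int y)
  {
    return (x & ~0xf00u) | ((y << 8) & 0xf00u);
  }

  /* Clear bits [11:8] of X.  This is now canonicalized through AND,
     so the compiler can choose between BFC, BIC and AND with an
     immediate, whichever the cost model prefers.  */
  unsigned int
  clear_nibble (unsigned int x)
  {
    return x & ~0xf00u;
  }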

BFC and BFI express the mask as a pair of values: the number of bits
to clear or insert and the position of the least significant bit.  We
handle these with a single new output modifier letter that prints both
values; the modifier takes the 'inverted' value (the bits to preserve)
so that it can be used directly with the constant from an AND RTL
construct.  We have run out of unused letters, so to do this we re-use
one of the long-obsolete Maverick output modifiers.
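
For reference, the arithmetic behind the new modifier can be sketched
in standalone C as below; this is only an illustration of the
computation (the real code lives in arm_print_operand and uses
exact_log2 on HOST_WIDE_INT values), and it assumes MASK is a 32-bit
constant with exactly one contiguous run of zero bits, as accepted by
the new Dj constraint:

  #include <stdio.h>

  /* MASK is the AND constant, i.e. the bits to preserve.  Print the
     "#lsb, #width" operand pair that BFC/BFI expect for it.  */
  static void
  print_lsb_width (unsigned int mask)
  {
    /* Work in 64 bits so the sum cannot overflow when the run of
       zeros extends up to bit 31.  */
    unsigned long long val = ~mask & 0xffffffffULL; /* bits to clear */
    int lsb = __builtin_ctzll (val);          /* lowest zero bit of MASK */
    int width = __builtin_ctzll (val + (val & -val)) - lsb;
    printf ("#%d, #%d\n", lsb, width);
  }

  int
  main (void)
  {
    print_lsb_width (0xfffffff1u);  /* prints "#1, #3" */
    return 0;
  }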

gcc/ChangeLog:

	PR target/105090
	* config/arm/arm.cc (arm_bfi_1_p): New function.
	(arm_bfi_p): New function.
	(arm_rtx_costs_internal): Add costs for BFI idioms.
	(arm_print_operand [case 'V']): Format output for BFI/BFC masks.
	* config/arm/constraints.md (Dj): New constraint.
	* config/arm/arm.md (arm_andsi3_insn): Add alternative to use BFC.
	(insv_zero): Convert to an insn with a split.
	(*bfi, *bfi_alt1, *bfi_alt2, *bfi_alt3): New patterns.
---
 gcc/config/arm/arm.cc         | 113 ++++++++++++++++++++++++++++++----
 gcc/config/arm/arm.md         | 106 ++++++++++++++++++++++++++++---
 gcc/config/arm/constraints.md |  10 ++-
 3 files changed, 208 insertions(+), 21 deletions(-)


diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 7ecf7b74ffa..2a76c7b9a72 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -10201,6 +10201,61 @@ arm_mem_costs (rtx x, const struct cpu_cost_table *extra_cost,
   return true;
 }
 
+/* Helper for arm_bfi_p.  */
+static bool
+arm_bfi_1_p (rtx op0, rtx op1, rtx *sub0, rtx *sub1)
+{
+  unsigned HOST_WIDE_INT const1;
+  unsigned HOST_WIDE_INT const2 = 0;
+
+  if (!CONST_INT_P (XEXP (op0, 1)))
+    return false;
+
+  const1 = XUINT (XEXP (op0, 1), 0);
+  if (!CONST_INT_P (XEXP (op1, 1))
+      || ~XUINT (XEXP (op1, 1), 0) != const1)
+    return false;
+
+  if (GET_CODE (XEXP (op0, 0)) == ASHIFT
+      && CONST_INT_P (XEXP (XEXP (op0, 0), 1)))
+    {
+      const2 = XUINT (XEXP (XEXP (op0, 0), 1), 0);
+      *sub0 = XEXP (XEXP (op0, 0), 0);
+    }
+  else
+    *sub0 = XEXP (op0, 0);
+
+  if (const2 >= GET_MODE_BITSIZE (GET_MODE (op0)))
+    return false;
+
+  *sub1 = XEXP (op1, 0);
+  return exact_log2 (const1 + (HOST_WIDE_INT_1U << const2)) >= 0;
+}
+
+/* Recognize a BFI idiom.  Helper for arm_rtx_costs_internal.  The
+   format looks something like:
+
+   (IOR (AND (reg1) (~const1))
+	(AND (ASHIFT (reg2) (const2))
+	     (const1)))
+
+   where const1 is a consecutive sequence of 1-bits with the
+   least-significant non-zero bit starting at bit position const2.  If
+   const2 is zero, then the shift will not appear at all, due to
+   canonicalization.  The two arms of the IOR expression may be
+   flipped.  */
+static bool
+arm_bfi_p (rtx x, rtx *sub0, rtx *sub1)
+{
+  if (GET_CODE (x) != IOR)
+    return false;
+  if (GET_CODE (XEXP (x, 0)) != AND
+      || GET_CODE (XEXP (x, 1)) != AND)
+    return false;
+  return (arm_bfi_1_p (XEXP (x, 0), XEXP (x, 1), sub0, sub1)
+	  || arm_bfi_1_p (XEXP (x, 1), XEXP (x, 0), sub1, sub0));
+}
+
 /* RTX costs.  Make an estimate of the cost of executing the operation
    X, which is contained within an operation with code OUTER_CODE.
    SPEED_P indicates whether the cost desired is the performance cost,
@@ -10959,14 +11014,28 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
       *cost = LIBCALL_COST (2);
       return false;
     case IOR:
-      if (mode == SImode && arm_arch6 && aarch_rev16_p (x))
-        {
-          if (speed_p)
-            *cost += extra_cost->alu.rev;
+      {
+	rtx sub0, sub1;
+	if (mode == SImode && arm_arch6 && aarch_rev16_p (x))
+	  {
+	    if (speed_p)
+	      *cost += extra_cost->alu.rev;
 
-          return true;
-        }
-    /* Fall through.  */
+	    return true;
+	  }
+	else if (mode == SImode && arm_arch_thumb2
+		 && arm_bfi_p (x, &sub0, &sub1))
+	  {
+	    *cost += rtx_cost (sub0, mode, ZERO_EXTRACT, 1, speed_p);
+	    *cost += rtx_cost (sub1, mode, ZERO_EXTRACT, 0, speed_p);
+	    if (speed_p)
+	      *cost += extra_cost->alu.bfi;
+
+	    return true;
+	  }
+      }
+
+      /* Fall through.  */
     case AND: case XOR:
       if (mode == SImode)
 	{
@@ -23780,8 +23849,8 @@ arm_print_condition (FILE *stream)
 /* Globally reserved letters: acln
    Puncutation letters currently used: @_|?().!#
    Lower case letters currently used: bcdefhimpqtvwxyz
-   Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTU
-   Letters previously used, but now deprecated/obsolete: sVWXYZ.
+   Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTUV
+   Letters previously used, but now deprecated/obsolete: sWXYZ.
 
    Note that the global reservation for 'c' is only for CONSTANT_ADDRESS_P.
 
@@ -23797,7 +23866,10 @@ arm_print_condition (FILE *stream)
    If CODE is 'N' then X is a floating point operand that must be negated
    before output.
    If CODE is 'B' then output a bitwise inverted value of X (a const int).
-   If X is a REG and CODE is `M', output a ldm/stm style multi-reg.  */
+   If X is a REG and CODE is `M', output a ldm/stm style multi-reg.
+   If CODE is 'V', then the operand must be a CONST_INT representing
+   the bits to preserve in the modified register (Rd) of a BFI or BFC
+   instruction: print out both the width and lsb (shift) fields.  */
 static void
 arm_print_operand (FILE *stream, rtx x, int code)
 {
@@ -24106,8 +24178,27 @@ arm_print_operand (FILE *stream, rtx x, int code)
 	     stream);
       return;
 
-    case 's':
     case 'V':
+      {
+	/* Output the LSB (shift) and width for a bitmask instruction
+	   based on a literal mask.  The LSB is printed first,
+	   followed by the width.
+
+	   Eg. For 0b1...1110001, the result is #1, #3.  */
+	if (!CONST_INT_P (x))
+	  {
+	    output_operand_lossage ("invalid operand for code '%c'", code);
+	    return;
+	  }
+
+	unsigned HOST_WIDE_INT val = ~XUINT (x, 0);
+	int lsb = exact_log2 (val & -val);
+	asm_fprintf (stream, "#%d, #%d", lsb,
+		     (exact_log2 (val + (val & -val)) - lsb));
+      }
+      return;
+
+    case 's':
     case 'W':
     case 'X':
     case 'Y':
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 60468f6182c..69bf343fb0e 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3002,30 +3002,36 @@ (define_expand "andsi3"
 
 ; ??? Check split length for Thumb-2
 (define_insn_and_split "*arm_andsi3_insn"
-  [(set (match_operand:SI         0 "s_register_operand" "=r,l,r,r,r")
-	(and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,r")
-		(match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,?n")))]
+  [(set (match_operand:SI         0 "s_register_operand" "=r,l,r,r,r,r")
+	(and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,0,r")
+		(match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,Dj,?n")))]
   "TARGET_32BIT"
   "@
    and%?\\t%0, %1, %2
    and%?\\t%0, %1, %2
    bic%?\\t%0, %1, #%B2
    and%?\\t%0, %1, %2
+   bfc%?\\t%0, %V2
    #"
   "TARGET_32BIT
    && CONST_INT_P (operands[2])
    && !(const_ok_for_arm (INTVAL (operands[2]))
-	|| const_ok_for_arm (~INTVAL (operands[2])))"
+	|| const_ok_for_arm (~INTVAL (operands[2]))
+	|| (arm_arch_thumb2
+	    && satisfies_constraint_Dj (operands[2])
+	    && (rtx_equal_p (operands[0], operands[1])
+		|| !reload_completed)))"
   [(clobber (const_int 0))]
   "
-  arm_split_constant  (AND, SImode, curr_insn, 
+  arm_split_constant  (AND, SImode, curr_insn,
 	               INTVAL (operands[2]), operands[0], operands[1], 0);
   DONE;
   "
-  [(set_attr "length" "4,4,4,4,16")
+  [(set_attr "length" "4,4,4,4,4,16")
    (set_attr "predicable" "yes")
-   (set_attr "predicable_short_it" "no,yes,no,no,no")
-   (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,logic_imm")]
+   (set_attr "predicable_short_it" "no,yes,no,no,no,no")
+   (set_attr "arch" "*,*,*,*,v6t2,*")
+   (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,bfm,logic_imm")]
 )
 
 (define_insn "*andsi3_compare0"
@@ -3471,13 +3477,25 @@ (define_expand "insv"
   }"
 )
 
-(define_insn "insv_zero"
+(define_insn_and_split "insv_zero"
   [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r")
                          (match_operand:SI 1 "const_int_M_operand" "M")
                          (match_operand:SI 2 "const_int_M_operand" "M"))
         (const_int 0))]
   "arm_arch_thumb2"
   "bfc%?\t%0, %2, %1"
+  ""
+  [(set (match_dup 0) (and:SI (match_dup 0) (match_dup 1)))]
+  {
+    /* Convert back to a normal AND operation, so that we can take advantage
+       of BIC and AND when appropriate; we'll still emit BFC if that's the
+       right thing to do.  */
+    unsigned HOST_WIDE_INT width = UINTVAL (operands[1]);
+    unsigned HOST_WIDE_INT lsb = UINTVAL (operands[2]);
+    unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << width) - 1;
+
+    operands[1] = gen_int_mode (~(mask << lsb), SImode);
+  }
   [(set_attr "length" "4")
    (set_attr "predicable" "yes")
    (set_attr "type" "bfm")]
@@ -3495,6 +3513,76 @@ (define_insn "insv_t2"
    (set_attr "type" "bfm")]
 )
 
+(define_insn "*bfi"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "Dj"))
+		(and:SI (ashift:SI
+			 (match_operand:SI 3 "s_register_operand" "r")
+			 (match_operand 4 "const_int_operand" "i"))
+			(match_operand 5 "const_int_operand" "i"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[4]) < 32
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[5])
+   && (exact_log2 (UINTVAL (operands[5])
+		   + (HOST_WIDE_INT_1U << UINTVAL (operands[4])))
+       >= 0)"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt1"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (ashift:SI
+			 (match_operand:SI 3 "s_register_operand" "r")
+			 (match_operand 4 "const_int_operand" "i"))
+			(match_operand 5 "const_int_operand" "i"))
+		(and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "Dj"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[4]) < 32
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[5])
+   && (exact_log2 (UINTVAL (operands[5])
+		   + (HOST_WIDE_INT_1U << UINTVAL (operands[4])))
+       >= 0)"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt2"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "i"))
+		(and:SI (match_operand:SI 3 "s_register_operand" "r")
+			(match_operand 4 "const_int_operand" "i"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[4])
+   && exact_log2 (UINTVAL (operands[4]) + 1) >= 0"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt3"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+	(ior:SI (and:SI (match_operand:SI 3 "s_register_operand" "r")
+			(match_operand 4 "const_int_operand" "i"))
+		(and:SI (match_operand:SI 1 "s_register_operand" "0")
+			(match_operand 2 "const_int_operand" "i"))))]
+  "arm_arch_thumb2
+   && UINTVAL (operands[2]) == ~UINTVAL (operands[4])
+   && exact_log2 (UINTVAL (operands[4]) + 1) >= 0"
+  "bfi%?\t%0, %3, %V2"
+  [(set_attr "length" "4")
+   (set_attr "predicable" "yes")
+   (set_attr "type" "bfm")]
+)
+
 (define_insn "andsi_notsi_si"
   [(set (match_operand:SI 0 "s_register_operand" "=r")
 	(and:SI (not:SI (match_operand:SI 2 "s_register_operand" "r"))
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 2b411b0cb0f..e5a36d29c71 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -32,7 +32,7 @@
 
 ;; The following multi-letter normal constraints have been used:
 ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, DN, Dm, Dl, DL, Do, Dv, Dy, Di,
-;;			 Ds, Dt, Dp, Dz, Tu, Te
+;;			 Dj, Ds, Dt, Dp, Dz, Tu, Te
 ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
 ;; in Thumb-2 state: Ha, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Rd, Rf, Rb, Ra,
 ;;		     Rg, Ri
@@ -354,6 +354,14 @@ (define_constraint "Di"
  (and (match_code "const_double,const_int")
       (match_test "TARGET_32BIT && arm_const_double_by_immediates (op)")))
 
+(define_constraint "Dj"
+  "@internal
+   In cores with the v6t2 ISA, a constant with exactly one consecutive
+   string of zero bits."
+  (and (match_code "const_int")
+       (match_test "arm_arch_thumb2
+		    && exact_log2 (~ival + (~ival & -~ival)) >= 0")))
+
 (define_constraint "Dm"
  "@internal
   In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov
