[PATCH] xtensa: Optimize boolean evaluation or branching when EQ/NE to zero in S[IF]mode

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] xtensa: Optimize boolean evaluation or branching when EQ/NE to zero in S[IF]mode
       [not found] <a6428a06-f728-06a9-a530-36aa115291dc.ref@yahoo.co.jp>
@ 2023-06-03  9:55 ` Takayuki 'January June' Suwa
  2023-06-04 12:46   ` Max Filippov
  0 siblings, 1 reply; 2+ messages in thread
From: Takayuki 'January June' Suwa @ 2023-06-03  9:55 UTC (permalink / raw)
  To: GCC Patches; +Cc: Max Filippov

This patch optimizes the boolean evaluation of EQ/NE against zero
by adding two insn_and_split patterns similar to SImode conditional
store:

"eq_zero":
	op0 = (op1 == 0) ? 1 : 0;
	op0 = clz(op1) >> 5;  /* optimized (requires TARGET_NSA) */

"movsicc_ne0_reg_zero":
	op0 = (op1 != 0) ? op2 : 0;
	op0 = op2; if (op1 == 0) op0 = op1;  /* optimized */

These also work in SFmode by ignoring their sign bits, and further-
more, the branch if EQ/NE against zero in SFmode is also done in the
same manner.

The reasons for this optimization in SFmode are:

  - Only zero values (positive or negative) have all bits clear in
    both the exponent and the mantissa.
  - EQ/NE comparisons involving NaNs produce no signal even if they
    are signaling.
  - Even if the IEEE 754 single-precision floating-point coprocessor
    is configured (TARGET_HARD_FLOAT is true), the comparison still
    requires the following steps:
	1. Load zero value to FP register
	2. Possibly, additional FP move if the comparison target is
	   an address register
	3. FP equality check instruction
	4. Read the boolean register containing the result, or condi-
	   tional branch
    so a considerable number of instructions are still generated.

gcc/ChangeLog:

	* config/xtensa/predicates.md (const_float_0_operand):
	Rename from obsolete "const_float_1_operand" and change the
	constant to compare.
	(cstoresf_cbranchsf_operand, cstoresf_cbranchsf_operator):
	New.
	* config/xtensa/xtensa.cc (xtensa_expand_conditional_branch):
	Add code for EQ/NE comparison with constant zero in SFmode.
	(xtensa_expand_scc): Add code to derive boolean evaluation
	of EQ/NE with constant zero for comparison in SFmode.
	(xtensa_rtx_costs): Change cost of CONST_DOUBLE with value
	zero inside "cbranchsf4" to 0.
	* config/xtensa/xtensa.md (cbranchsf4, cstoresf4):
	Change "match_operator" and the third "match_operand" to the
	ones mentioned above.
	(movsicc_ne0_reg_zero, eq_zero): New.
---
 gcc/config/xtensa/predicates.md | 19 ++++++++++--
 gcc/config/xtensa/xtensa.cc     | 43 ++++++++++++++++++++++++++
 gcc/config/xtensa/xtensa.md     | 53 +++++++++++++++++++++++++++++----
 3 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/gcc/config/xtensa/predicates.md b/gcc/config/xtensa/predicates.md
index a3575a68892..d3b49e32fa4 100644
--- a/gcc/config/xtensa/predicates.md
+++ b/gcc/config/xtensa/predicates.md
@@ -155,11 +155,11 @@
 			    && CONSTANT_P (op)
 			    && GET_MODE_SIZE (mode) % UNITS_PER_WORD == 0")))))
 
-;; Accept the floating point constant 1 in the appropriate mode.
-(define_predicate "const_float_1_operand"
+;; Accept the floating point constant 0 in the appropriate mode.
+(define_predicate "const_float_0_operand"
   (match_code "const_double")
 {
-  return real_equal (CONST_DOUBLE_REAL_VALUE (op), &dconst1);
+  return real_equal (CONST_DOUBLE_REAL_VALUE (op), &dconst0);
 })
 
 (define_predicate "fpmem_offset_operand"
@@ -179,6 +179,13 @@
   return false;
 })
 
+(define_predicate "cstoresf_cbranchsf_operand"
+  (ior (and (match_test "TARGET_HARD_FLOAT")
+	    (match_operand 0 "register_operand"))
+       (and (match_code "const_double")
+	    (match_test "real_equal (CONST_DOUBLE_REAL_VALUE (op),
+				     &dconst0)"))))
+
 (define_predicate "branch_operator"
   (match_code "eq,ne,lt,ge"))
 
@@ -197,6 +204,12 @@
 (define_predicate "xtensa_cstoresi_operator"
   (match_code "eq,ne,gt,ge,lt,le"))
 
+(define_predicate "cstoresf_cbranchsf_operator"
+  (ior (and (match_test "TARGET_HARD_FLOAT")
+	    (match_operand 0 "comparison_operator"))
+       (and (match_test "!TARGET_HARD_FLOAT")
+	    (match_operand 0 "boolean_operator"))))
+
 (define_predicate "xtensa_shift_per_byte_operator"
   (match_code "ashift,ashiftrt,lshiftrt"))
 
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 3b5d25b660a..fefca3b11cd 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -865,6 +865,16 @@ xtensa_expand_conditional_branch (rtx *operands, machine_mode mode)
   switch (mode)
     {
     case E_SFmode:
+      if ((test_code == EQ || test_code == NE)
+	  && const_float_0_operand (cmp1, SFmode))
+	{
+	  emit_move_insn (cmp1 = gen_reg_rtx (SImode),
+			  gen_rtx_SUBREG (SImode, cmp0, 0));
+	  emit_insn (gen_addsi3 (cmp1, cmp1, cmp1));
+	  cmp = gen_int_relational (test_code, cmp1, const0_rtx);
+	  break;
+	}
+
       if (TARGET_HARD_FLOAT)
 	{
 	  cmp = gen_float_relational (test_code, cmp0, cmp1);
@@ -996,6 +1006,34 @@ xtensa_expand_scc (rtx operands[4], machine_mode cmp_mode)
   rtx one_tmp, zero_tmp;
   rtx (*gen_fn) (rtx, rtx, rtx, rtx, rtx);
 
+  if (cmp_mode == SFmode)
+    {
+      if (const_float_0_operand (operands[3], SFmode))
+	switch (GET_CODE (operands[1]))
+	  {
+	  case EQ:
+	    emit_move_insn (cmp = gen_reg_rtx (SImode),
+			    gen_rtx_SUBREG (SImode, operands[2], 0));
+	    emit_insn (gen_addsi3 (cmp, cmp, cmp));
+	    emit_insn (gen_eq_zero (dest, cmp));
+	    return 1;
+
+	  case NE:
+	    emit_move_insn (cmp = gen_reg_rtx (SImode),
+			    gen_rtx_SUBREG (SImode, operands[2], 0));
+	    emit_insn (gen_addsi3 (cmp, cmp, cmp));
+	    one_tmp = force_reg (SImode, const1_rtx);
+	    emit_insn (gen_movsicc_ne0_reg_zero (dest, cmp, one_tmp));
+	    return 1;
+
+	  default:
+	    gcc_unreachable ();
+	  }
+
+      if (! register_operand (operands[3], SFmode))
+	return 0;
+    }
+
   if (!(cmp = gen_conditional_move (GET_CODE (operands[1]), cmp_mode,
 				    operands[2], operands[3])))
     return 0;
@@ -4438,6 +4476,11 @@ xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code,
       return true;
 
     case CONST_DOUBLE:
+      if (outer_code == COMPARE && const_float_0_operand (x, SFmode))
+	{
+	  *total = 0;
+	  return true;
+	}
       if (TARGET_CONST16)
 	*total = COSTS_N_INSNS (4);
       else
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 21afa747e89..87620934bbe 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -1906,11 +1906,11 @@
 })
 
 (define_expand "cbranchsf4"
-  [(match_operator 0 "comparison_operator"
+  [(match_operator 0 "cstoresf_cbranchsf_operator"
     [(match_operand:SF 1 "register_operand")
-     (match_operand:SF 2 "register_operand")])
+     (match_operand:SF 2 "cstoresf_cbranchsf_operand")])
    (match_operand 3 "")]
-  "TARGET_HARD_FLOAT"
+  ""
 {
   xtensa_expand_conditional_branch (operands, SFmode);
   DONE;
@@ -2364,10 +2364,10 @@
 
 (define_expand "cstoresf4"
   [(match_operand:SI 0 "register_operand")
-   (match_operator:SI 1 "comparison_operator"
+   (match_operator:SI 1 "cstoresf_cbranchsf_operator"
     [(match_operand:SF 2 "register_operand")
-     (match_operand:SF 3 "register_operand")])]
-  "TARGET_HARD_FLOAT"
+     (match_operand:SF 3 "cstoresf_cbranchsf_operand")])]
+  ""
 {
   if (!xtensa_expand_scc (operands, SFmode))
     FAIL;
@@ -2432,6 +2432,30 @@
    (set_attr "mode"	"SI")
    (set_attr "length"	"3,3")])
 
+(define_insn_and_split "movsicc_ne0_reg_zero"
+  [(set (match_operand:SI 0 "register_operand" "=a")
+	(if_then_else:SI (ne (match_operand:SI 1 "register_operand" "r")
+			     (const_int 0))
+			 (match_operand:SI 2 "register_operand" "r")
+			 (const_int 0)))]
+  ""
+  "#"
+  ""
+  [(set (match_dup 0)
+	(match_dup 2))
+   (set (match_dup 0)
+	(if_then_else:SI (ne (match_dup 1)
+			     (const_int 0))
+			 (match_dup 0)
+			 (match_dup 1)))]
+  ""
+  [(set_attr "type"	"move")
+   (set_attr "mode"	"SI")
+   (set (attr "length")
+	(if_then_else (match_test "TARGET_DENSITY")
+		      (const_int 5)
+		      (const_int 6)))])
+
 (define_insn "movsfcc_internal0"
   [(set (match_operand:SF 0 "register_operand" "=a,a,f,f")
 	(if_then_else:SF (match_operator 4 "branch_operator"
@@ -3157,6 +3181,23 @@
 		      (const_int 5)
 		      (const_int 6)))])
 
+(define_insn_and_split "eq_zero"
+  [(set (match_operand:SI 0 "register_operand" "=a")
+	(eq:SI (match_operand:SI 1 "register_operand" "r")
+	       (const_int 0)))]
+  "TARGET_NSA"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(clz:SI (match_dup 1)))
+   (set (match_dup 0)
+	(lshiftrt:SI (match_dup 0)
+		     (const_int 5)))]
+  ""
+  [(set_attr "type"	"move")
+   (set_attr "mode"	"SI")
+   (set_attr "length"	"6")])
+
 (define_peephole2
   [(set (match_operand:SI 0 "register_operand")
 	(match_operand:SI 6 "reload_operand"))
-- 
2.30.2

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] xtensa: Optimize boolean evaluation or branching when EQ/NE to zero in S[IF]mode
  2023-06-03  9:55 ` [PATCH] xtensa: Optimize boolean evaluation or branching when EQ/NE to zero in S[IF]mode Takayuki 'January June' Suwa
@ 2023-06-04 12:46   ` Max Filippov
  0 siblings, 0 replies; 2+ messages in thread
From: Max Filippov @ 2023-06-04 12:46 UTC (permalink / raw)
  To: Takayuki 'January June' Suwa; +Cc: GCC Patches

Hi Suwa-san,

On Sat, Jun 3, 2023 at 2:55 AM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> This patch optimizes the boolean evaluation of EQ/NE against zero
> by adding two insn_and_split patterns similar to SImode conditional
> store:
>
> "eq_zero":
>         op0 = (op1 == 0) ? 1 : 0;
>         op0 = clz(op1) >> 5;  /* optimized (requires TARGET_NSA) */
>
> "movsicc_ne0_reg_0":
>         op0 = (op1 != 0) ? op2 : 0;
>         op0 = op2; if (op1 == 0) ? op0 = op1;  /* optimized */
>
> These also work in SFmode by ignoring their sign bits, and further-
> more, the branch if EQ/NE against zero in SFmode is also done in the
> same manner.
>
> The reasons for this optimization in SFmode are:
>
>   - Only zero values (negative or non-negative) contain no bits of 1
>     with both the exponent and the mantissa.
>   - EQ/NE comparisons involving NaNs produce no signal even if they
>     are signaling.
>   - Even if the use of IEEE 754 single-precision floating-point co-
>     processor is configured (TARGET_HARD_FLOAT is true):
>         1. Load zero value to FP register
>         2. Possibly, additional FP move if the comparison target is
>            an address register
>         3. FP equality check instruction
>         4. Read the boolean register containing the result, or condi-
>            tional branch
>     As noted above, a considerable number of instructions are still
>     generated.
>
> gcc/ChangeLog:
>
>         * config/xtensa/predicates.md (const_float_0_operand):
>         Rename from obsolete "const_float_1_operand" and change the
>         constant to compare.
>         (cstoresf_cbranchsf_operand, cstoresf_cbranchsf_operator):
>         New.
>         * config/xtensa/xtensa.cc (xtensa_expand_conditional_branch):
>         Add code for EQ/NE comparison with constant zero in SFmode.
>         (xtensa_expand_scc): Added code to derive boolean evaluation
>         of EQ/NE with constant zero for comparison in SFmode.
>         (xtensa_rtx_costs): Change cost of CONST_DOUBLE with value
>         zero inside "cbranchsf4" to 0.
>         * config/xtensa/xtensa.md (cbranchsf4, cstoresf4):
>         Change "match_operator" and the third "match_operand" to the
>         ones mentioned above.
>         (movsicc_ne0_reg_zero, eq_zero): New.
> ---
>  gcc/config/xtensa/predicates.md | 19 ++++++++++--
>  gcc/config/xtensa/xtensa.cc     | 43 ++++++++++++++++++++++++++
>  gcc/config/xtensa/xtensa.md     | 53 +++++++++++++++++++++++++++++----
>  3 files changed, 106 insertions(+), 9 deletions(-)

This change results in a bunch of new testsuite failures
on configurations without an FPU, nearly all of which are ICEs:

+FAIL: gcc.c-torture/execute/bitfld-3.c   -O1  execution test
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O1  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O1  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O2  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O2  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O3 -fomit-frame-pointer
-funroll-loops -fpeel-loops -ftracer -finline-functions  (internal
compiler error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O3 -fomit-frame-pointer
-funroll-loops -fpeel-loops -ftracer -finline-functions  (test for
excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O3 -g  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O3 -g  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -Os  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -Os  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  (internal compiler error:
in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O2 -flto
-fuse-linker-plugin -fno-fat-lto-objects  (internal compiler error: in
extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-1.c   -O2 -flto
-fuse-linker-plugin -fno-fat-lto-objects  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O1  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O1  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O2  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O2  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O3 -fomit-frame-pointer
-funroll-loops -fpeel-loops -ftracer -finline-functions  (internal
compiler error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O3 -fomit-frame-pointer
-funroll-loops -fpeel-loops -ftracer -finline-functions  (test for
excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O3 -g  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O3 -g  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -Os  (internal compiler
error: in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -Os  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  (internal compiler error:
in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O2 -flto
-fno-use-linker-plugin -flto-partition=none  (test for excess errors)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O2 -flto
-fuse-linker-plugin -fno-fat-lto-objects  (internal compiler error: in
extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/c11-atomic-exec-3.c   -O2 -flto
-fuse-linker-plugin -fno-fat-lto-objects  (test for excess errors)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O1  (internal compiler error: in
extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O1  (test for excess errors)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O2  (internal compiler error: in
extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O2  (test for excess errors)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O3 -g  (internal compiler error:
in extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O3 -g  (test for excess errors)
+FAIL: gcc.dg/atomic/pr65345-4.c   -Os  (internal compiler error: in
extract_insn, at recog.cc:2791)
+FAIL: gcc.dg/atomic/pr65345-4.c   -Os  (test for excess errors)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O2 -flto -fno-use-linker-plugin
-flto-partition=none  (internal compiler error: in extract_insn, at
recog.cc:2791)
+FAIL: gcc.dg/atomic/pr65345-4.c   -O2 -flto -fno-use-linker-plugin
-flto-partition=none  (test for excess errors)

On configurations with an FPU it results in ICEs during the libgfortran
build, all with a similar diagnostic:

gcc/libgfortran/intrinsics/erfc_scaled_inc.c:179:1: error: unrecognizable insn:
 179 | }
     | ^
(insn 23 22 24 2 (set (reg:CC 18 b0)
       (lt:CC (const_double:SF 0.0 [0x0.0p+0])
           (reg/v:SF 96 [ x ])))
"gcc/libgfortran/intrinsics/erfc_scaled_inc.c":111:18 -1
    (nil))
during RTL pass: vregs

-- 
Thanks.
-- Max

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-06-04 12:46 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <a6428a06-f728-06a9-a530-36aa115291dc.ref@yahoo.co.jp>
2023-06-03  9:55 ` [PATCH] xtensa: Optimize boolean evaluation or branching when EQ/NE to zero in S[IF]mode Takayuki 'January June' Suwa
2023-06-04 12:46   ` Max Filippov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).