public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] RISC-V modified add3 for large stack frame optimization [PR105733]
@ 2022-09-19 16:07 Kevin Lee
  2022-09-19 22:16 ` Kito Cheng
  0 siblings, 1 reply; 3+ messages in thread
From: Kevin Lee @ 2022-09-19 16:07 UTC (permalink / raw)
  To: gcc-patches; +Cc: gnu-toolchain

[-- Attachment #1: Type: text/plain, Size: 9208 bytes --]

Hello GCC,
 Started from Jim Wilson's patch in
https://github.com/riscv-admin/riscv-code-speed-optimization/blob/main/projects/gcc-optimizations.adoc
for the large stack frame optimization problem, this augmented patch
generates less instructions for cases such as
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105733.
Original:
foo:
li t0,-4096
addi t0,t0,2016
li a4,4096
add sp,sp,t0
li a5,-4096
addi a4,a4,-2032
add a4,a4,a5
addi a5,sp,16
add a5,a4,a5
add a0,a5,a0
li t0,4096
sd a5,8(sp)
sb zero,2032(a0)
addi t0,t0,-2016
add sp,sp,t0
jr ra
After Patch:
foo:
li t0,-4096
addi t0,t0,2032
add sp,sp,t0
addi a5,sp,-2032
add a0,a5,a0
li t0,4096
sb zero,2032(a0)
addi t0,t0,-2032
add sp,sp,t0
jr ra

               ========= Summary of gcc testsuite =========
                            | # of unexpected case / # of unique unexpected
case
                            |          gcc |          g++ |     gfortran |
     rv64gc/  lp64d/ medlow |    4 /     4 |   13 /     4 |    0 /     0 |
No additional failures were created from the testsuite.

gcc/ChangeLog:
   Jim Wilson <jim.wilson.gcc@gmail.com>
   Michael Collison <collison@rivosinc.com>
   Kevin Lee <kevinl@rivosinc.com>

* config/riscv/predicates.md (const_lui_operand): New predicate.
(add_operand): Ditto.
(reg_or_const_int_operand): Ditto.
* config/riscv/riscv-protos.h (riscv_eliminable_reg): New
    function.
* config/riscv/riscv.cc (riscv_eliminable_reg): New Function.
(riscv_adjust_libcall_cfi_prologue): Use gen_rtx_SET and
    gen_rtx_fmt_ee instead of gen_add3_insn.
(riscv_adjust_libcall_cfi_epilogue): ditto.
* config/riscv/riscv.md (addsi3): Remove.
(adddi3): ditto.
(add<mode>3): New instruction for large stack frame optimization.
(add<mode>3_internal): ditto
(add<mode>3_internal2): New instruction for insns generated in
the prologue and epilogue pass.
---
 gcc/config/riscv/predicates.md  | 13 +++++
 gcc/config/riscv/riscv-protos.h |  1 +
 gcc/config/riscv/riscv.cc       | 20 ++++++--
 gcc/config/riscv/riscv.md       | 84 ++++++++++++++++++++++++++++-----
 4 files changed, 101 insertions(+), 17 deletions(-)

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 862e72b0983..b98bb5a9768 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -35,6 +35,14 @@ (define_predicate "sfb_alu_operand"
   (ior (match_operand 0 "arith_operand")
        (match_operand 0 "lui_operand")))

+(define_predicate "const_lui_operand"
+  (and (match_code "const_int")
+       (match_test "(INTVAL (op) & 0xFFF) == 0 && INTVAL (op) != 0")))
+
+(define_predicate "add_operand"
+  (ior (match_operand 0 "arith_operand")
+       (match_operand 0 "const_lui_operand")))
+
 (define_predicate "const_csr_operand"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
@@ -59,6 +67,11 @@ (define_predicate "reg_or_0_operand"
   (ior (match_operand 0 "const_0_operand")
        (match_operand 0 "register_operand")))

+;; For use in adds, when adding to an eliminable register.
+(define_predicate "reg_or_const_int_operand"
+  (ior (match_code "const_int")
+       (match_operand 0 "register_operand")))
+
 ;; Only use branch-on-bit sequences when the mask is not an ANDI immediate.
 (define_predicate "branch_on_bit_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/riscv/riscv-protos.h
b/gcc/config/riscv/riscv-protos.h
index 649c5c977e1..8f0aa8114be 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -63,6 +63,7 @@ extern void riscv_expand_conditional_move (rtx, rtx, rtx,
rtx_code, rtx, rtx);
 extern rtx riscv_legitimize_call_address (rtx);
 extern void riscv_set_return_address (rtx, rtx);
 extern bool riscv_expand_block_move (rtx, rtx, rtx);
+extern bool riscv_eliminable_reg (rtx);
 extern rtx riscv_return_addr (int, rtx);
 extern poly_int64 riscv_initial_elimination_offset (int, int);
 extern void riscv_expand_prologue (void);
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 675d92c0961..b5577a4f366 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -4320,6 +4320,16 @@ riscv_initial_elimination_offset (int from, int to)
   return src - dest;
 }

+/* Return true if X is a register that will be eliminated later on.  */
+bool
+riscv_eliminable_reg (rtx x)
+{
+  return REG_P (x) && (REGNO (x) == FRAME_POINTER_REGNUM
+       || REGNO (x) == ARG_POINTER_REGNUM
+       || (REGNO (x) >= FIRST_VIRTUAL_REGISTER
+   && REGNO (x) <= LAST_VIRTUAL_REGISTER));
+}
+
 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
    previous frame.  */

@@ -4521,8 +4531,9 @@ riscv_adjust_libcall_cfi_prologue ()
       }

   /* Debug info for adjust sp.  */
-  adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
- stack_pointer_rtx, GEN_INT (-saved_size));
+  adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
+            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
+          stack_pointer_rtx, GEN_INT (saved_size)));
   dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
   dwarf);
   return dwarf;
@@ -4624,8 +4635,9 @@ riscv_adjust_libcall_cfi_epilogue ()
   int saved_size = cfun->machine->frame.save_libcall_adjustment;

   /* Debug info for adjust sp.  */
-  adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
- stack_pointer_rtx, GEN_INT (saved_size));
+  adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
+            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
+          stack_pointer_rtx, GEN_INT (saved_size)));
   dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
   dwarf);

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 014206fb8bd..0285ac67b2a 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -438,23 +438,80 @@ (define_insn "add<mode>3"
   [(set_attr "type" "fadd")
    (set_attr "mode" "<UNITMODE>")])

-(define_insn "addsi3"
-  [(set (match_operand:SI          0 "register_operand" "=r,r")
- (plus:SI (match_operand:SI 1 "register_operand" " r,r")
- (match_operand:SI 2 "arith_operand"    " r,I")))]
+(define_expand "add<mode>3"
+  [(parallel
+    [(set (match_operand:GPR          0 "register_operand" "")
+  (plus:GPR (match_operand:GPR 1 "register_operand" "")
+   (match_operand:GPR 2 "add_operand" "")))
+    (clobber (match_scratch:GPR 3 ""))])]
   ""
-  "add%i2%~\t%0,%1,%2"
+{
+  if (riscv_eliminable_reg (operands[1]))
+  {
+    if (splittable_const_int_operand (operands[2], <MODE>mode))
+    {
+      /* The idea here is that we emit
+          add op0, op1, %hi(op2)
+          addi op0, op0, %lo(op2)
+        Then when op1, the eliminable reg, gets replaced with sp+offset,
+        we can simplify the constants.  */
+      HOST_WIDE_INT high_part = CONST_HIGH_PART (INTVAL (operands[2]));
+      emit_insn (gen_add<mode>3_internal (operands[0], operands[1],
+                  GEN_INT (high_part)));
+      operands[1] = operands[0];
+      operands[2] = GEN_INT (INTVAL (operands[2]) - high_part);
+    }
+    else if (! const_arith_operand (operands[2], <MODE>mode))
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+  }
+})
+
+(define_insn_and_split "add<mode>3_internal"
+  [(set (match_operand:GPR          0 "register_operand" "=r,r,&r,!&r")
+ (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r,r,0")
+  (match_operand:GPR 2 "add_operand"      " r,I,L,L")))
+  (clobber (match_scratch:GPR 3 "=X,X,X,&r"))]
+  ""
+{
+  if ((which_alternative == 2) || (which_alternative == 3))
+    return "#";
+
+  if (TARGET_64BIT && <MODE>mode == SImode)
+    return "add%i2w\t%0,%1,%2";
+  else
+    return "add%i2\t%0,%1,%2";
+}
+  "&& reload_completed && const_lui_operand (operands[2], <MODE>mode)"
+  [(const_int 0)]
+{
+  if (REGNO (operands[0]) != REGNO (operands[1]))
+    {
+      emit_insn (gen_mov<mode> (operands[0], operands[2]));
+      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
operands[1]));
+    }
+  else
+    {
+      emit_insn (gen_mov<mode> (operands[3], operands[2]));
+      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
operands[3]));
+    }
+  DONE;
+}
   [(set_attr "type" "arith")
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "<MODE>")])

-(define_insn "adddi3"
-  [(set (match_operand:DI          0 "register_operand" "=r,r")
- (plus:DI (match_operand:DI 1 "register_operand" " r,r")
- (match_operand:DI 2 "arith_operand"    " r,I")))]
-  "TARGET_64BIT"
-  "add%i2\t%0,%1,%2"
+(define_insn "add<mode>3_internal2"
+  [(set (match_operand:GPR          0 "register_operand" "=r,r")
+ (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r")
+  (match_operand:GPR 2 "arith_operand"    " r,I")))]
+  ""
+ {
+   if (TARGET_64BIT && <MODE>mode == SImode)
+     return "add%i2w\t%0,%1,%2";
+   else
+     return "add%i2\t%0,%1,%2";
+ }
   [(set_attr "type" "arith")
-   (set_attr "mode" "DI")])
+   (set_attr "mode" "<MODE>")])

 (define_expand "addv<mode>4"
   [(set (match_operand:GPR           0 "register_operand" "=r,r")
@@ -500,6 +557,7 @@ (define_expand "addv<mode>4"
   DONE;
 })

+
 (define_expand "uaddv<mode>4"
   [(set (match_operand:GPR           0 "register_operand" "=r,r")
  (plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
-- 
2.25.1

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] RISC-V modified add3 for large stack frame optimization [PR105733]
  2022-09-19 16:07 [PATCH] RISC-V modified add3 for large stack frame optimization [PR105733] Kevin Lee
@ 2022-09-19 22:16 ` Kito Cheng
  2022-09-21  0:04   ` Kevin Lee
  0 siblings, 1 reply; 3+ messages in thread
From: Kito Cheng @ 2022-09-19 22:16 UTC (permalink / raw)
  To: Kevin Lee; +Cc: gcc-patches, gnu-toolchain

[-- Attachment #1: Type: text/plain, Size: 10242 bytes --]

Could you provide some data including code size and performance? add is
frequently used patten, so we should more careful when changing that.

Kevin Lee <kevinl@rivosinc.com>於 2022年9月19日 週一,18:07寫道:

> Hello GCC,
>  Started from Jim Wilson's patch in
>
> https://github.com/riscv-admin/riscv-code-speed-optimization/blob/main/projects/gcc-optimizations.adoc
> for the large stack frame optimization problem, this augmented patch
> generates less instructions for cases such as
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105733.
> Original:
> foo:
> li t0,-4096
> addi t0,t0,2016
> li a4,4096
> add sp,sp,t0
> li a5,-4096
> addi a4,a4,-2032
> add a4,a4,a5
> addi a5,sp,16
> add a5,a4,a5
> add a0,a5,a0
> li t0,4096
> sd a5,8(sp)
> sb zero,2032(a0)
> addi t0,t0,-2016
> add sp,sp,t0
> jr ra
> After Patch:
> foo:
> li t0,-4096
> addi t0,t0,2032
> add sp,sp,t0
> addi a5,sp,-2032
> add a0,a5,a0
> li t0,4096
> sb zero,2032(a0)
> addi t0,t0,-2032
> add sp,sp,t0
> jr ra
>
>                ========= Summary of gcc testsuite =========
>                             | # of unexpected case / # of unique unexpected
> case
>                             |          gcc |          g++ |     gfortran |
>      rv64gc/  lp64d/ medlow |    4 /     4 |   13 /     4 |    0 /     0 |
> No additional failures were created from the testsuite.
>
> gcc/ChangeLog:
>    Jim Wilson <jim.wilson.gcc@gmail.com>
>    Michael Collison <collison@rivosinc.com>
>    Kevin Lee <kevinl@rivosinc.com>
>
> * config/riscv/predicates.md (const_lui_operand): New predicate.
> (add_operand): Ditto.
> (reg_or_const_int_operand): Ditto.
> * config/riscv/riscv-protos.h (riscv_eliminable_reg): New
>     function.
> * config/riscv/riscv.cc (riscv_eliminable_reg): New Function.
> (riscv_adjust_libcall_cfi_prologue): Use gen_rtx_SET and
>     gen_rtx_fmt_ee instead of gen_add3_insn.
> (riscv_adjust_libcall_cfi_epilogue): ditto.
> * config/riscv/riscv.md (addsi3): Remove.
> (adddi3): ditto.
> (add<mode>3): New instruction for large stack frame optimization.
> (add<mode>3_internal): ditto
> (add<mode>3_internal2): New instruction for insns generated in
> the prologue and epilogue pass.
> ---
>  gcc/config/riscv/predicates.md  | 13 +++++
>  gcc/config/riscv/riscv-protos.h |  1 +
>  gcc/config/riscv/riscv.cc       | 20 ++++++--
>  gcc/config/riscv/riscv.md       | 84 ++++++++++++++++++++++++++++-----
>  4 files changed, 101 insertions(+), 17 deletions(-)
>
> diff --git a/gcc/config/riscv/predicates.md
> b/gcc/config/riscv/predicates.md
> index 862e72b0983..b98bb5a9768 100644
> --- a/gcc/config/riscv/predicates.md
> +++ b/gcc/config/riscv/predicates.md
> @@ -35,6 +35,14 @@ (define_predicate "sfb_alu_operand"
>    (ior (match_operand 0 "arith_operand")
>         (match_operand 0 "lui_operand")))
>
> +(define_predicate "const_lui_operand"
> +  (and (match_code "const_int")
> +       (match_test "(INTVAL (op) & 0xFFF) == 0 && INTVAL (op) != 0")))
> +
> +(define_predicate "add_operand"
> +  (ior (match_operand 0 "arith_operand")
> +       (match_operand 0 "const_lui_operand")))
> +
>  (define_predicate "const_csr_operand"
>    (and (match_code "const_int")
>         (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
> @@ -59,6 +67,11 @@ (define_predicate "reg_or_0_operand"
>    (ior (match_operand 0 "const_0_operand")
>         (match_operand 0 "register_operand")))
>
> +;; For use in adds, when adding to an eliminable register.
> +(define_predicate "reg_or_const_int_operand"
> +  (ior (match_code "const_int")
> +       (match_operand 0 "register_operand")))
> +
>  ;; Only use branch-on-bit sequences when the mask is not an ANDI
> immediate.
>  (define_predicate "branch_on_bit_operand"
>    (and (match_code "const_int")
> diff --git a/gcc/config/riscv/riscv-protos.h
> b/gcc/config/riscv/riscv-protos.h
> index 649c5c977e1..8f0aa8114be 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -63,6 +63,7 @@ extern void riscv_expand_conditional_move (rtx, rtx, rtx,
> rtx_code, rtx, rtx);
>  extern rtx riscv_legitimize_call_address (rtx);
>  extern void riscv_set_return_address (rtx, rtx);
>  extern bool riscv_expand_block_move (rtx, rtx, rtx);
> +extern bool riscv_eliminable_reg (rtx);
>  extern rtx riscv_return_addr (int, rtx);
>  extern poly_int64 riscv_initial_elimination_offset (int, int);
>  extern void riscv_expand_prologue (void);
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 675d92c0961..b5577a4f366 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -4320,6 +4320,16 @@ riscv_initial_elimination_offset (int from, int to)
>    return src - dest;
>  }
>
> +/* Return true if X is a register that will be eliminated later on.  */
> +bool
> +riscv_eliminable_reg (rtx x)
> +{
> +  return REG_P (x) && (REGNO (x) == FRAME_POINTER_REGNUM
> +       || REGNO (x) == ARG_POINTER_REGNUM
> +       || (REGNO (x) >= FIRST_VIRTUAL_REGISTER
> +   && REGNO (x) <= LAST_VIRTUAL_REGISTER));
> +}
> +
>  /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
>     previous frame.  */
>
> @@ -4521,8 +4531,9 @@ riscv_adjust_libcall_cfi_prologue ()
>        }
>
>    /* Debug info for adjust sp.  */
> -  adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
> - stack_pointer_rtx, GEN_INT (-saved_size));
> +  adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
> +            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
> +          stack_pointer_rtx, GEN_INT (saved_size)));
>    dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
>    dwarf);
>    return dwarf;
> @@ -4624,8 +4635,9 @@ riscv_adjust_libcall_cfi_epilogue ()
>    int saved_size = cfun->machine->frame.save_libcall_adjustment;
>
>    /* Debug info for adjust sp.  */
> -  adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
> - stack_pointer_rtx, GEN_INT (saved_size));
> +  adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
> +            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
> +          stack_pointer_rtx, GEN_INT (saved_size)));
>    dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
>    dwarf);
>
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 014206fb8bd..0285ac67b2a 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -438,23 +438,80 @@ (define_insn "add<mode>3"
>    [(set_attr "type" "fadd")
>     (set_attr "mode" "<UNITMODE>")])
>
> -(define_insn "addsi3"
> -  [(set (match_operand:SI          0 "register_operand" "=r,r")
> - (plus:SI (match_operand:SI 1 "register_operand" " r,r")
> - (match_operand:SI 2 "arith_operand"    " r,I")))]
> +(define_expand "add<mode>3"
> +  [(parallel
> +    [(set (match_operand:GPR          0 "register_operand" "")
> +  (plus:GPR (match_operand:GPR 1 "register_operand" "")
> +   (match_operand:GPR 2 "add_operand" "")))
> +    (clobber (match_scratch:GPR 3 ""))])]
>    ""
> -  "add%i2%~\t%0,%1,%2"
> +{
> +  if (riscv_eliminable_reg (operands[1]))
> +  {
> +    if (splittable_const_int_operand (operands[2], <MODE>mode))
> +    {
> +      /* The idea here is that we emit
> +          add op0, op1, %hi(op2)
> +          addi op0, op0, %lo(op2)
> +        Then when op1, the eliminable reg, gets replaced with sp+offset,
> +        we can simplify the constants.  */
> +      HOST_WIDE_INT high_part = CONST_HIGH_PART (INTVAL (operands[2]));
> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[1],
> +                  GEN_INT (high_part)));
> +      operands[1] = operands[0];
> +      operands[2] = GEN_INT (INTVAL (operands[2]) - high_part);
> +    }
> +    else if (! const_arith_operand (operands[2], <MODE>mode))
> +      operands[2] = force_reg (<MODE>mode, operands[2]);
> +  }
> +})
> +
> +(define_insn_and_split "add<mode>3_internal"
> +  [(set (match_operand:GPR          0 "register_operand" "=r,r,&r,!&r")
> + (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r,r,0")
> +  (match_operand:GPR 2 "add_operand"      " r,I,L,L")))
> +  (clobber (match_scratch:GPR 3 "=X,X,X,&r"))]
> +  ""
> +{
> +  if ((which_alternative == 2) || (which_alternative == 3))
> +    return "#";
> +
> +  if (TARGET_64BIT && <MODE>mode == SImode)
> +    return "add%i2w\t%0,%1,%2";
> +  else
> +    return "add%i2\t%0,%1,%2";
> +}
> +  "&& reload_completed && const_lui_operand (operands[2], <MODE>mode)"
> +  [(const_int 0)]
> +{
> +  if (REGNO (operands[0]) != REGNO (operands[1]))
> +    {
> +      emit_insn (gen_mov<mode> (operands[0], operands[2]));
> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
> operands[1]));
> +    }
> +  else
> +    {
> +      emit_insn (gen_mov<mode> (operands[3], operands[2]));
> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
> operands[3]));
> +    }
> +  DONE;
> +}
>    [(set_attr "type" "arith")
> -   (set_attr "mode" "SI")])
> +   (set_attr "mode" "<MODE>")])
>
> -(define_insn "adddi3"
> -  [(set (match_operand:DI          0 "register_operand" "=r,r")
> - (plus:DI (match_operand:DI 1 "register_operand" " r,r")
> - (match_operand:DI 2 "arith_operand"    " r,I")))]
> -  "TARGET_64BIT"
> -  "add%i2\t%0,%1,%2"
> +(define_insn "add<mode>3_internal2"
> +  [(set (match_operand:GPR          0 "register_operand" "=r,r")
> + (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r")
> +  (match_operand:GPR 2 "arith_operand"    " r,I")))]
> +  ""
> + {
> +   if (TARGET_64BIT && <MODE>mode == SImode)
> +     return "add%i2w\t%0,%1,%2";
> +   else
> +     return "add%i2\t%0,%1,%2";
> + }
>    [(set_attr "type" "arith")
> -   (set_attr "mode" "DI")])
> +   (set_attr "mode" "<MODE>")])
>
>  (define_expand "addv<mode>4"
>    [(set (match_operand:GPR           0 "register_operand" "=r,r")
> @@ -500,6 +557,7 @@ (define_expand "addv<mode>4"
>    DONE;
>  })
>
> +
>  (define_expand "uaddv<mode>4"
>    [(set (match_operand:GPR           0 "register_operand" "=r,r")
>   (plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
> --
> 2.25.1
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] RISC-V modified add3 for large stack frame optimization [PR105733]
  2022-09-19 22:16 ` Kito Cheng
@ 2022-09-21  0:04   ` Kevin Lee
  0 siblings, 0 replies; 3+ messages in thread
From: Kevin Lee @ 2022-09-21  0:04 UTC (permalink / raw)
  To: Kito Cheng; +Cc: gcc-patches, gnu-toolchain

[-- Attachment #1: Type: text/plain, Size: 14583 bytes --]

The proposed patch only makes the difference if the operand 1 is an
eliminable register and operand 2 is a splittable const int. Otherwise, it
follows the original add3 pattern.

Besides the example from pr105733 shown on the first post,
#define BUF_SIZE 5012
void saxpy( float a )
{
  volatile float x[BUF_SIZE];
  volatile float y[BUF_SIZE];

  for (int i = 0; i < BUF_SIZE; ++i)
      y[i] = a*x[i] + y[i];
}
generates
Before:
saxpy:
            li t0,-40960
            li a2,40960
            addi t0,t0,848
            add sp,sp,t0
            li a4,-40960
            addi a3,a2,-864
            add a3,a3,a4
            addi a4,sp,16
            add a4,a3,a4
            sd a4,0(sp)
            addi a3,a2,-864
            li a4,-20480
            add a3,a3,a4
            addi a4,sp,16
            add a4,a3,a4
            li a2,4096
            li a5,0
            sd a4,8(sp)
            addi a2,a2,916
.L2:
            ld a4,8(sp)
            ld a3,0(sp)
            sh2add a4,a5,a4
            sh2add a3,a5,a3
            flw fa5,864(a3)
            flw fa4,432(a4)
            addiw a5,a5,1
            fmadd.s fa5,fa5,fa0,fa4
            fsw fa5,432(a4)
            bne a5,a2,.L2
            li t0,40960
            addi t0,t0,-848
            add sp,sp,t0
            jr ra

After:
saxpy:
            li t0,-40960
            addi t0,t0,864
            li a2,4096
            add sp,sp,t0
            li a5,0
            addi a2,a2,916
.L2:
            li a4,20480
            addi a4,a4,-864
            add a4,a4,sp
            addi a3,sp,-864
            sh2add a4,a5,a4
            sh2add a3,a5,a3
            flw fa5,864(a3)
            flw fa4,432(a4)
            addiw a5,a5,1
            fmadd.s fa5,fa5,fa0,fa4
            fsw fa5,432(a4)
            bne a5,a2,.L2
            li t0,40960
            addi t0,t0,-864
            add sp,sp,t0
            jr ra

The number of instructions before .L2 is reduced from 19 to 6 after the
patch.
Moreover, the following example
#define limit 4096
void foo()
{
    volatile int temp = 0;
    volatile int buf[limit];
    for(int i = 0; i < limit; ++i){
        for(int j = 0; j < limit; ++j){
            temp += buf[(i * 1234 + j) % limit];
        }
    }
}
generates
before:
foo:
            li t0,-16384
            addi t0,t0,-32
            li a4,16384
            add sp,sp,t0
            li a5,-16384
            addi a4,a4,16
            add a4,a4,a5
            addi a5,sp,16
            add a5,a4,a5
            li a1,4096
            sd a5,8(sp)
            sw zero,-4(a5)
            li a7,-4096
            addi a0,a1,-1
            li a6,5058560
.L2:
            addw a5,a7,a1
.L3:
            ld a3,8(sp)
            and a4,a5,a0
            addiw a5,a5,1
            sh2add a4,a4,a3
            lw a2,0(a4)
            lw a4,-4(a3)
            addw a4,a4,a2
            ld a2,8(sp)
            sw a4,-4(a2)
            bne a5,a1,.L3
            addiw a1,a5,1234
            bne a1,a6,.L2
            li t0,16384
            addi t0,t0,32
            add sp,sp,t0
            jr ra

After:
foo:
            li t0,-16384
            addi t0,t0,-16
            add sp,sp,t0
            li a1,4096
            sw zero,12(sp)
            li a7,-4096
            addi a0,a1,-1
            li a6,5058560
.L2:
            addw a5,a7,a1
.L3:
            and a4,a5,a0
            addi a3,sp,16
            sh2add a4,a4,a3
            lw a2,0(a4)
            lw a4,12(sp)
            addiw a5,a5,1
            addw a4,a4,a2
            sw a4,12(sp)
            bne a5,a1,.L3
            addiw a1,a5,1234
            bne a1,a6,.L2
            li t0,16384
            addi t0,t0,16
            add sp,sp,t0
            jr ra

This example also shows that the instructions before .L2 is reduced from 15
lines to 8 lines after the patch.

On Mon, Sep 19, 2022 at 3:16 PM Kito Cheng <kito.cheng@gmail.com> wrote:

> Could you provide some data including code size and performance? add is
> frequently used patten, so we should more careful when changing that.
>
> Kevin Lee <kevinl@rivosinc.com>於 2022年9月19日 週一,18:07寫道:
>
>> Hello GCC,
>>  Started from Jim Wilson's patch in
>>
>> https://github.com/riscv-admin/riscv-code-speed-optimization/blob/main/projects/gcc-optimizations.adoc
>> for the large stack frame optimization problem, this augmented patch
>> generates less instructions for cases such as
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105733.
>> Original:
>> foo:
>> li t0,-4096
>> addi t0,t0,2016
>> li a4,4096
>> add sp,sp,t0
>> li a5,-4096
>> addi a4,a4,-2032
>> add a4,a4,a5
>> addi a5,sp,16
>> add a5,a4,a5
>> add a0,a5,a0
>> li t0,4096
>> sd a5,8(sp)
>> sb zero,2032(a0)
>> addi t0,t0,-2016
>> add sp,sp,t0
>> jr ra
>> After Patch:
>> foo:
>> li t0,-4096
>> addi t0,t0,2032
>> add sp,sp,t0
>> addi a5,sp,-2032
>> add a0,a5,a0
>> li t0,4096
>> sb zero,2032(a0)
>> addi t0,t0,-2032
>> add sp,sp,t0
>> jr ra
>>
>>                ========= Summary of gcc testsuite =========
>>                             | # of unexpected case / # of unique
>> unexpected
>> case
>>                             |          gcc |          g++ |     gfortran |
>>      rv64gc/  lp64d/ medlow |    4 /     4 |   13 /     4 |    0 /     0 |
>> No additional failures were created from the testsuite.
>>
>> gcc/ChangeLog:
>>    Jim Wilson <jim.wilson.gcc@gmail.com>
>>    Michael Collison <collison@rivosinc.com>
>>    Kevin Lee <kevinl@rivosinc.com>
>>
>> * config/riscv/predicates.md (const_lui_operand): New predicate.
>> (add_operand): Ditto.
>> (reg_or_const_int_operand): Ditto.
>> * config/riscv/riscv-protos.h (riscv_eliminable_reg): New
>>     function.
>> * config/riscv/riscv.cc (riscv_eliminable_reg): New Function.
>> (riscv_adjust_libcall_cfi_prologue): Use gen_rtx_SET and
>>     gen_rtx_fmt_ee instead of gen_add3_insn.
>> (riscv_adjust_libcall_cfi_epilogue): ditto.
>> * config/riscv/riscv.md (addsi3): Remove.
>> (adddi3): ditto.
>> (add<mode>3): New instruction for large stack frame optimization.
>> (add<mode>3_internal): ditto
>> (add<mode>3_internal2): New instruction for insns generated in
>> the prologue and epilogue pass.
>> ---
>>  gcc/config/riscv/predicates.md  | 13 +++++
>>  gcc/config/riscv/riscv-protos.h |  1 +
>>  gcc/config/riscv/riscv.cc       | 20 ++++++--
>>  gcc/config/riscv/riscv.md       | 84 ++++++++++++++++++++++++++++-----
>>  4 files changed, 101 insertions(+), 17 deletions(-)
>>
>> diff --git a/gcc/config/riscv/predicates.md
>> b/gcc/config/riscv/predicates.md
>> index 862e72b0983..b98bb5a9768 100644
>> --- a/gcc/config/riscv/predicates.md
>> +++ b/gcc/config/riscv/predicates.md
>> @@ -35,6 +35,14 @@ (define_predicate "sfb_alu_operand"
>>    (ior (match_operand 0 "arith_operand")
>>         (match_operand 0 "lui_operand")))
>>
>> +(define_predicate "const_lui_operand"
>> +  (and (match_code "const_int")
>> +       (match_test "(INTVAL (op) & 0xFFF) == 0 && INTVAL (op) != 0")))
>> +
>> +(define_predicate "add_operand"
>> +  (ior (match_operand 0 "arith_operand")
>> +       (match_operand 0 "const_lui_operand")))
>> +
>>  (define_predicate "const_csr_operand"
>>    (and (match_code "const_int")
>>         (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
>> @@ -59,6 +67,11 @@ (define_predicate "reg_or_0_operand"
>>    (ior (match_operand 0 "const_0_operand")
>>         (match_operand 0 "register_operand")))
>>
>> +;; For use in adds, when adding to an eliminable register.
>> +(define_predicate "reg_or_const_int_operand"
>> +  (ior (match_code "const_int")
>> +       (match_operand 0 "register_operand")))
>> +
>>  ;; Only use branch-on-bit sequences when the mask is not an ANDI
>> immediate.
>>  (define_predicate "branch_on_bit_operand"
>>    (and (match_code "const_int")
>> diff --git a/gcc/config/riscv/riscv-protos.h
>> b/gcc/config/riscv/riscv-protos.h
>> index 649c5c977e1..8f0aa8114be 100644
>> --- a/gcc/config/riscv/riscv-protos.h
>> +++ b/gcc/config/riscv/riscv-protos.h
>> @@ -63,6 +63,7 @@ extern void riscv_expand_conditional_move (rtx, rtx,
>> rtx,
>> rtx_code, rtx, rtx);
>>  extern rtx riscv_legitimize_call_address (rtx);
>>  extern void riscv_set_return_address (rtx, rtx);
>>  extern bool riscv_expand_block_move (rtx, rtx, rtx);
>> +extern bool riscv_eliminable_reg (rtx);
>>  extern rtx riscv_return_addr (int, rtx);
>>  extern poly_int64 riscv_initial_elimination_offset (int, int);
>>  extern void riscv_expand_prologue (void);
>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>> index 675d92c0961..b5577a4f366 100644
>> --- a/gcc/config/riscv/riscv.cc
>> +++ b/gcc/config/riscv/riscv.cc
>> @@ -4320,6 +4320,16 @@ riscv_initial_elimination_offset (int from, int to)
>>    return src - dest;
>>  }
>>
>> +/* Return true if X is a register that will be eliminated later on.  */
>> +bool
>> +riscv_eliminable_reg (rtx x)
>> +{
>> +  return REG_P (x) && (REGNO (x) == FRAME_POINTER_REGNUM
>> +       || REGNO (x) == ARG_POINTER_REGNUM
>> +       || (REGNO (x) >= FIRST_VIRTUAL_REGISTER
>> +   && REGNO (x) <= LAST_VIRTUAL_REGISTER));
>> +}
>> +
>>  /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
>>     previous frame.  */
>>
>> @@ -4521,8 +4531,9 @@ riscv_adjust_libcall_cfi_prologue ()
>>        }
>>
>>    /* Debug info for adjust sp.  */
>> -  adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
>> - stack_pointer_rtx, GEN_INT (-saved_size));
>> +  adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
>> +            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
>> +          stack_pointer_rtx, GEN_INT (saved_size)));
>>    dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
>>    dwarf);
>>    return dwarf;
>> @@ -4624,8 +4635,9 @@ riscv_adjust_libcall_cfi_epilogue ()
>>    int saved_size = cfun->machine->frame.save_libcall_adjustment;
>>
>>    /* Debug info for adjust sp.  */
>> -  adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
>> - stack_pointer_rtx, GEN_INT (saved_size));
>> +  adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
>> +            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
>> +          stack_pointer_rtx, GEN_INT (saved_size)));
>>    dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
>>    dwarf);
>>
>> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
>> index 014206fb8bd..0285ac67b2a 100644
>> --- a/gcc/config/riscv/riscv.md
>> +++ b/gcc/config/riscv/riscv.md
>> @@ -438,23 +438,80 @@ (define_insn "add<mode>3"
>>    [(set_attr "type" "fadd")
>>     (set_attr "mode" "<UNITMODE>")])
>>
>> -(define_insn "addsi3"
>> -  [(set (match_operand:SI          0 "register_operand" "=r,r")
>> - (plus:SI (match_operand:SI 1 "register_operand" " r,r")
>> - (match_operand:SI 2 "arith_operand"    " r,I")))]
>> +(define_expand "add<mode>3"
>> +  [(parallel
>> +    [(set (match_operand:GPR          0 "register_operand" "")
>> +  (plus:GPR (match_operand:GPR 1 "register_operand" "")
>> +   (match_operand:GPR 2 "add_operand" "")))
>> +    (clobber (match_scratch:GPR 3 ""))])]
>>    ""
>> -  "add%i2%~\t%0,%1,%2"
>> +{
>> +  if (riscv_eliminable_reg (operands[1]))
>> +  {
>> +    if (splittable_const_int_operand (operands[2], <MODE>mode))
>> +    {
>> +      /* The idea here is that we emit
>> +          add op0, op1, %hi(op2)
>> +          addi op0, op0, %lo(op2)
>> +        Then when op1, the eliminable reg, gets replaced with sp+offset,
>> +        we can simplify the constants.  */
>> +      HOST_WIDE_INT high_part = CONST_HIGH_PART (INTVAL (operands[2]));
>> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[1],
>> +                  GEN_INT (high_part)));
>> +      operands[1] = operands[0];
>> +      operands[2] = GEN_INT (INTVAL (operands[2]) - high_part);
>> +    }
>> +    else if (! const_arith_operand (operands[2], <MODE>mode))
>> +      operands[2] = force_reg (<MODE>mode, operands[2]);
>> +  }
>> +})
>> +
>> +(define_insn_and_split "add<mode>3_internal"
>> +  [(set (match_operand:GPR          0 "register_operand" "=r,r,&r,!&r")
>> + (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r,r,0")
>> +  (match_operand:GPR 2 "add_operand"      " r,I,L,L")))
>> +  (clobber (match_scratch:GPR 3 "=X,X,X,&r"))]
>> +  ""
>> +{
>> +  if ((which_alternative == 2) || (which_alternative == 3))
>> +    return "#";
>> +
>> +  if (TARGET_64BIT && <MODE>mode == SImode)
>> +    return "add%i2w\t%0,%1,%2";
>> +  else
>> +    return "add%i2\t%0,%1,%2";
>> +}
>> +  "&& reload_completed && const_lui_operand (operands[2], <MODE>mode)"
>> +  [(const_int 0)]
>> +{
>> +  if (REGNO (operands[0]) != REGNO (operands[1]))
>> +    {
>> +      emit_insn (gen_mov<mode> (operands[0], operands[2]));
>> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
>> operands[1]));
>> +    }
>> +  else
>> +    {
>> +      emit_insn (gen_mov<mode> (operands[3], operands[2]));
>> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
>> operands[3]));
>> +    }
>> +  DONE;
>> +}
>>    [(set_attr "type" "arith")
>> -   (set_attr "mode" "SI")])
>> +   (set_attr "mode" "<MODE>")])
>>
>> -(define_insn "adddi3"
>> -  [(set (match_operand:DI          0 "register_operand" "=r,r")
>> - (plus:DI (match_operand:DI 1 "register_operand" " r,r")
>> - (match_operand:DI 2 "arith_operand"    " r,I")))]
>> -  "TARGET_64BIT"
>> -  "add%i2\t%0,%1,%2"
>> +(define_insn "add<mode>3_internal2"
>> +  [(set (match_operand:GPR          0 "register_operand" "=r,r")
>> + (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r")
>> +  (match_operand:GPR 2 "arith_operand"    " r,I")))]
>> +  ""
>> + {
>> +   if (TARGET_64BIT && <MODE>mode == SImode)
>> +     return "add%i2w\t%0,%1,%2";
>> +   else
>> +     return "add%i2\t%0,%1,%2";
>> + }
>>    [(set_attr "type" "arith")
>> -   (set_attr "mode" "DI")])
>> +   (set_attr "mode" "<MODE>")])
>>
>>  (define_expand "addv<mode>4"
>>    [(set (match_operand:GPR           0 "register_operand" "=r,r")
>> @@ -500,6 +557,7 @@ (define_expand "addv<mode>4"
>>    DONE;
>>  })
>>
>> +
>>  (define_expand "uaddv<mode>4"
>>    [(set (match_operand:GPR           0 "register_operand" "=r,r")
>>   (plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
>> --
>> 2.25.1
>>
>

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-09-21  0:04 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-19 16:07 [PATCH] RISC-V modified add3 for large stack frame optimization [PR105733] Kevin Lee
2022-09-19 22:16 ` Kito Cheng
2022-09-21  0:04   ` Kevin Lee

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).