From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <kevinl@rivosinc.com>
Received: from mail-ed1-x534.google.com (mail-ed1-x534.google.com [IPv6:2a00:1450:4864:20::534])
	by sourceware.org (Postfix) with ESMTPS id 0CBF53858C55
	for <gcc-patches@gcc.gnu.org>; Wed, 21 Sep 2022 00:04:36 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 0CBF53858C55
Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=rivosinc.com
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=rivosinc.com
Received: by mail-ed1-x534.google.com with SMTP id f20so6223870edf.6
        for <gcc-patches@gcc.gnu.org>; Tue, 20 Sep 2022 17:04:35 -0700 (PDT)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=rivosinc-com.20210112.gappssmtp.com; s=20210112;
        h=cc:to:subject:message-id:date:from:in-reply-to:references
         :mime-version:from:to:cc:subject:date;
        bh=db3FpM5i1vioWSwtwsLzmYjuHY9yY+i/U5iMjuv+S2A=;
        b=qaNTWdJgZzNgkanKc0FhZat6GUHLefJeMlpb231jt1kVI0QCP/vdTIgIQ2bxU1r8Rq
         c3hDm8BHmiop4cTXKQHRW31mHGdsDOl6W9882OTH5j0jmKyaWskhbL/txF217E3fQzkS
         yhtItd4SnTwqw0xljCRRp/oUwA4xhczQMYVL7AoKgoMDpQuJZAbgARk06HzRMFbazKZj
         TUUAfFu1Mk5ERUYfsh7dh+jeqmhpPvC7AYLQ+sTnxJJ1ISlrASzLML+aEO7KQMXYB+br
         k90hXekYkUxMgxq1N78+iyDQ36FUGVPB/3DHzQPtulopC1LbGvGh6G3Wg0l8gCeFoWjP
         uG0g==
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
        d=1e100.net; s=20210112;
        h=cc:to:subject:message-id:date:from:in-reply-to:references
         :mime-version:x-gm-message-state:from:to:cc:subject:date;
        bh=db3FpM5i1vioWSwtwsLzmYjuHY9yY+i/U5iMjuv+S2A=;
        b=sgq5lWmVZMmWbeo/reraHeGfPPPYSaQu7DE0UFfKWTIw/hiucF7rIUFJPyL7Bptju5
         zRj8Ipq4RYGnukpavzdvvmjnTxl5/IR7/9X5mkUAtDjFV/pFyNt7kFL6zfbFK7xov5dH
         KMGrYqz/j6amKmD4QfKQ+4CCzde/LAaZYwzHnn+rK1dYVZh4Ho+i1atQEBjOwfgqUwk7
         zFTr7Uq/D7xwkQT7G6YjyX8LWp6gBVPFu3qZiqrA5xCwwHBoK4pOdowCG6Y6fGIQTvGd
         GvGPIsv4k9TGnSFgKFA/fgMG59dYsjghRp3kSpVDBSL+ggpIX/V2Un5S26lHI6Z73LCl
         wg+g==
X-Gm-Message-State: ACrzQf3l5QlzNiZU0KQxl20iO4DKwYS6ZhNt5xyxm0Kpes1SVvSoZ9A/
	Pjq3hTugAvxhU26mU9JlFZiShzL5SDJfOvsHfdkkOQ==
X-Google-Smtp-Source: AMsMyM7QEFrXR2LtUg0i4hf8dyZuXMqb65o29hpVNj/GIEVfb58IoSIleBjKkbVkD3ctmMCot+zpZrPt0/pGkVs4Mqk=
X-Received: by 2002:a05:6402:d0b:b0:443:df38:9df with SMTP id
 eb11-20020a0564020d0b00b00443df3809dfmr22130357edb.9.1663718674761; Tue, 20
 Sep 2022 17:04:34 -0700 (PDT)
MIME-Version: 1.0
References: <CALkvSf8f0-cAyKR6K4GfW3L+DQ6ghexE=kjR-JkeokUeBQ3mpw@mail.gmail.com>
 <CA+yXCZC9Ne3u7+Qz_Z1RfgZFSn7U-bHos4q25YfUYT_p2nPhvw@mail.gmail.com>
In-Reply-To: <CA+yXCZC9Ne3u7+Qz_Z1RfgZFSn7U-bHos4q25YfUYT_p2nPhvw@mail.gmail.com>
From: Kevin Lee <kevinl@rivosinc.com>
Date: Tue, 20 Sep 2022 17:04:23 -0700
Message-ID: <CALkvSf9eVfH5TSGuHamVE4AebfOFUpJ42HittNof_UZCkt1vpA@mail.gmail.com>
Subject: Re: [PATCH] RISC-V modified add3 for large stack frame optimization [PR105733]
To: Kito Cheng <kito.cheng@gmail.com>
Cc: gcc-patches@gcc.gnu.org, gnu-toolchain@rivosinc.com
Content-Type: multipart/alternative; boundary="0000000000009d9e5f05e924b0a1"
X-Spam-Status: No, score=-7.5 required=5.0 tests=BAYES_00,BODY_8BITS,DKIM_SIGNED,DKIM_VALID,GIT_PATCH_0,HTML_MESSAGE,KAM_SHORT,RCVD_IN_DNSWL_NONE,SCC_5_SHORT_WORD_LINES,SPF_HELO_NONE,SPF_PASS,TXREP autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org
List-Id: <gcc-patches.gcc.gnu.org>

--0000000000009d9e5f05e924b0a1
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable

The proposed patch only makes the difference if the operand 1 is an
eliminable register and operand 2 is a splittable const int. Otherwise, it
follows the original add3 pattern.

Besides the example from pr105733 shown on the first post,
#define BUF_SIZE 5012
void saxpy( float a )
{
  volatile float x[BUF_SIZE];
  volatile float y[BUF_SIZE];

  for (int i =3D 0; i < BUF_SIZE; ++i)
      y[i] =3D a*x[i] + y[i];
}
generates
Before:
saxpy:
            li t0,-40960
            li a2,40960
            addi t0,t0,848
            add sp,sp,t0
            li a4,-40960
            addi a3,a2,-864
            add a3,a3,a4
            addi a4,sp,16
            add a4,a3,a4
            sd a4,0(sp)
            addi a3,a2,-864
            li a4,-20480
            add a3,a3,a4
            addi a4,sp,16
            add a4,a3,a4
            li a2,4096
            li a5,0
            sd a4,8(sp)
            addi a2,a2,916
.L2:
            ld a4,8(sp)
            ld a3,0(sp)
            sh2add a4,a5,a4
            sh2add a3,a5,a3
            flw fa5,864(a3)
            flw fa4,432(a4)
            addiw a5,a5,1
            fmadd.s fa5,fa5,fa0,fa4
            fsw fa5,432(a4)
            bne a5,a2,.L2
            li t0,40960
            addi t0,t0,-848
            add sp,sp,t0
            jr ra

After:
saxpy:
            li t0,-40960
            addi t0,t0,864
            li a2,4096
            add sp,sp,t0
            li a5,0
            addi a2,a2,916
.L2:
            li a4,20480
            addi a4,a4,-864
            add a4,a4,sp
            addi a3,sp,-864
            sh2add a4,a5,a4
            sh2add a3,a5,a3
            flw fa5,864(a3)
            flw fa4,432(a4)
            addiw a5,a5,1
            fmadd.s fa5,fa5,fa0,fa4
            fsw fa5,432(a4)
            bne a5,a2,.L2
            li t0,40960
            addi t0,t0,-864
            add sp,sp,t0
            jr ra

The number of instructions before .L2 is reduced from 19 to 6 after the
patch.
Moreover, the following example
#define limit 4096
void foo()
{
    volatile int temp =3D 0;
    volatile int buf[limit];
    for(int i =3D 0; i < limit; ++i){
        for(int j =3D 0; j < limit; ++j){
            temp +=3D buf[(i * 1234 + j) % limit];
        }
    }
}
generates
before:
foo:
            li t0,-16384
            addi t0,t0,-32
            li a4,16384
            add sp,sp,t0
            li a5,-16384
            addi a4,a4,16
            add a4,a4,a5
            addi a5,sp,16
            add a5,a4,a5
            li a1,4096
            sd a5,8(sp)
            sw zero,-4(a5)
            li a7,-4096
            addi a0,a1,-1
            li a6,5058560
.L2:
            addw a5,a7,a1
.L3:
            ld a3,8(sp)
            and a4,a5,a0
            addiw a5,a5,1
            sh2add a4,a4,a3
            lw a2,0(a4)
            lw a4,-4(a3)
            addw a4,a4,a2
            ld a2,8(sp)
            sw a4,-4(a2)
            bne a5,a1,.L3
            addiw a1,a5,1234
            bne a1,a6,.L2
            li t0,16384
            addi t0,t0,32
            add sp,sp,t0
            jr ra

After:
foo:
            li t0,-16384
            addi t0,t0,-16
            add sp,sp,t0
            li a1,4096
            sw zero,12(sp)
            li a7,-4096
            addi a0,a1,-1
            li a6,5058560
.L2:
            addw a5,a7,a1
.L3:
            and a4,a5,a0
            addi a3,sp,16
            sh2add a4,a4,a3
            lw a2,0(a4)
            lw a4,12(sp)
            addiw a5,a5,1
            addw a4,a4,a2
            sw a4,12(sp)
            bne a5,a1,.L3
            addiw a1,a5,1234
            bne a1,a6,.L2
            li t0,16384
            addi t0,t0,16
            add sp,sp,t0
            jr ra

This example also shows that the instructions before .L2 is reduced from 15
lines to 8 lines after the patch.

On Mon, Sep 19, 2022 at 3:16 PM Kito Cheng <kito.cheng@gmail.com> wrote:

> Could you provide some data including code size and performance? add is
> frequently used patten, so we should more careful when changing that.
>
> Kevin Lee <kevinl@rivosinc.com>=E6=96=BC 2022=E5=B9=B49=E6=9C=8819=E6=97=
=A5 =E9=80=B1=E4=B8=80=EF=BC=8C18:07=E5=AF=AB=E9=81=93=EF=BC=9A
>
>> Hello GCC,
>>  Started from Jim Wilson's patch in
>>
>> https://github.com/riscv-admin/riscv-code-speed-optimization/blob/main/p=
rojects/gcc-optimizations.adoc
>> for the large stack frame optimization problem, this augmented patch
>> generates less instructions for cases such as
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D105733.
>> Original:
>> foo:
>> li t0,-4096
>> addi t0,t0,2016
>> li a4,4096
>> add sp,sp,t0
>> li a5,-4096
>> addi a4,a4,-2032
>> add a4,a4,a5
>> addi a5,sp,16
>> add a5,a4,a5
>> add a0,a5,a0
>> li t0,4096
>> sd a5,8(sp)
>> sb zero,2032(a0)
>> addi t0,t0,-2016
>> add sp,sp,t0
>> jr ra
>> After Patch:
>> foo:
>> li t0,-4096
>> addi t0,t0,2032
>> add sp,sp,t0
>> addi a5,sp,-2032
>> add a0,a5,a0
>> li t0,4096
>> sb zero,2032(a0)
>> addi t0,t0,-2032
>> add sp,sp,t0
>> jr ra
>>
>>                =3D=3D=3D=3D=3D=3D=3D=3D=3D Summary of gcc testsuite =3D=
=3D=3D=3D=3D=3D=3D=3D=3D
>>                             | # of unexpected case / # of unique
>> unexpected
>> case
>>                             |          gcc |          g++ |     gfortran=
 |
>>      rv64gc/  lp64d/ medlow |    4 /     4 |   13 /     4 |    0 /     0=
 |
>> No additional failures were created from the testsuite.
>>
>> gcc/ChangeLog:
>>    Jim Wilson <jim.wilson.gcc@gmail.com>
>>    Michael Collison <collison@rivosinc.com>
>>    Kevin Lee <kevinl@rivosinc.com>
>>
>> * config/riscv/predicates.md (const_lui_operand): New predicate.
>> (add_operand): Ditto.
>> (reg_or_const_int_operand): Ditto.
>> * config/riscv/riscv-protos.h (riscv_eliminable_reg): New
>>     function.
>> * config/riscv/riscv.cc (riscv_eliminable_reg): New Function.
>> (riscv_adjust_libcall_cfi_prologue): Use gen_rtx_SET and
>>     gen_rtx_fmt_ee instead of gen_add3_insn.
>> (riscv_adjust_libcall_cfi_epilogue): ditto.
>> * config/riscv/riscv.md (addsi3): Remove.
>> (adddi3): ditto.
>> (add<mode>3): New instruction for large stack frame optimization.
>> (add<mode>3_internal): ditto
>> (add<mode>3_internal2): New instruction for insns generated in
>> the prologue and epilogue pass.
>> ---
>>  gcc/config/riscv/predicates.md  | 13 +++++
>>  gcc/config/riscv/riscv-protos.h |  1 +
>>  gcc/config/riscv/riscv.cc       | 20 ++++++--
>>  gcc/config/riscv/riscv.md       | 84 ++++++++++++++++++++++++++++-----
>>  4 files changed, 101 insertions(+), 17 deletions(-)
>>
>> diff --git a/gcc/config/riscv/predicates.md
>> b/gcc/config/riscv/predicates.md
>> index 862e72b0983..b98bb5a9768 100644
>> --- a/gcc/config/riscv/predicates.md
>> +++ b/gcc/config/riscv/predicates.md
>> @@ -35,6 +35,14 @@ (define_predicate "sfb_alu_operand"
>>    (ior (match_operand 0 "arith_operand")
>>         (match_operand 0 "lui_operand")))
>>
>> +(define_predicate "const_lui_operand"
>> +  (and (match_code "const_int")
>> +       (match_test "(INTVAL (op) & 0xFFF) =3D=3D 0 && INTVAL (op) !=3D =
0")))
>> +
>> +(define_predicate "add_operand"
>> +  (ior (match_operand 0 "arith_operand")
>> +       (match_operand 0 "const_lui_operand")))
>> +
>>  (define_predicate "const_csr_operand"
>>    (and (match_code "const_int")
>>         (match_test "IN_RANGE (INTVAL (op), 0, 31)")))
>> @@ -59,6 +67,11 @@ (define_predicate "reg_or_0_operand"
>>    (ior (match_operand 0 "const_0_operand")
>>         (match_operand 0 "register_operand")))
>>
>> +;; For use in adds, when adding to an eliminable register.
>> +(define_predicate "reg_or_const_int_operand"
>> +  (ior (match_code "const_int")
>> +       (match_operand 0 "register_operand")))
>> +
>>  ;; Only use branch-on-bit sequences when the mask is not an ANDI
>> immediate.
>>  (define_predicate "branch_on_bit_operand"
>>    (and (match_code "const_int")
>> diff --git a/gcc/config/riscv/riscv-protos.h
>> b/gcc/config/riscv/riscv-protos.h
>> index 649c5c977e1..8f0aa8114be 100644
>> --- a/gcc/config/riscv/riscv-protos.h
>> +++ b/gcc/config/riscv/riscv-protos.h
>> @@ -63,6 +63,7 @@ extern void riscv_expand_conditional_move (rtx, rtx,
>> rtx,
>> rtx_code, rtx, rtx);
>>  extern rtx riscv_legitimize_call_address (rtx);
>>  extern void riscv_set_return_address (rtx, rtx);
>>  extern bool riscv_expand_block_move (rtx, rtx, rtx);
>> +extern bool riscv_eliminable_reg (rtx);
>>  extern rtx riscv_return_addr (int, rtx);
>>  extern poly_int64 riscv_initial_elimination_offset (int, int);
>>  extern void riscv_expand_prologue (void);
>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>> index 675d92c0961..b5577a4f366 100644
>> --- a/gcc/config/riscv/riscv.cc
>> +++ b/gcc/config/riscv/riscv.cc
>> @@ -4320,6 +4320,16 @@ riscv_initial_elimination_offset (int from, int t=
o)
>>    return src - dest;
>>  }
>>
>> +/* Return true if X is a register that will be eliminated later on.  */
>> +bool
>> +riscv_eliminable_reg (rtx x)
>> +{
>> +  return REG_P (x) && (REGNO (x) =3D=3D FRAME_POINTER_REGNUM
>> +       || REGNO (x) =3D=3D ARG_POINTER_REGNUM
>> +       || (REGNO (x) >=3D FIRST_VIRTUAL_REGISTER
>> +   && REGNO (x) <=3D LAST_VIRTUAL_REGISTER));
>> +}
>> +
>>  /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
>>     previous frame.  */
>>
>> @@ -4521,8 +4531,9 @@ riscv_adjust_libcall_cfi_prologue ()
>>        }
>>
>>    /* Debug info for adjust sp.  */
>> -  adjust_sp_rtx =3D gen_add3_insn (stack_pointer_rtx,
>> - stack_pointer_rtx, GEN_INT (-saved_size));
>> +  adjust_sp_rtx =3D gen_rtx_SET (stack_pointer_rtx,
>> +            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
>> +          stack_pointer_rtx, GEN_INT (saved_size)));
>>    dwarf =3D alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
>>    dwarf);
>>    return dwarf;
>> @@ -4624,8 +4635,9 @@ riscv_adjust_libcall_cfi_epilogue ()
>>    int saved_size =3D cfun->machine->frame.save_libcall_adjustment;
>>
>>    /* Debug info for adjust sp.  */
>> -  adjust_sp_rtx =3D gen_add3_insn (stack_pointer_rtx,
>> - stack_pointer_rtx, GEN_INT (saved_size));
>> +  adjust_sp_rtx =3D gen_rtx_SET (stack_pointer_rtx,
>> +            gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
>> +          stack_pointer_rtx, GEN_INT (saved_size)));
>>    dwarf =3D alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
>>    dwarf);
>>
>> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
>> index 014206fb8bd..0285ac67b2a 100644
>> --- a/gcc/config/riscv/riscv.md
>> +++ b/gcc/config/riscv/riscv.md
>> @@ -438,23 +438,80 @@ (define_insn "add<mode>3"
>>    [(set_attr "type" "fadd")
>>     (set_attr "mode" "<UNITMODE>")])
>>
>> -(define_insn "addsi3"
>> -  [(set (match_operand:SI          0 "register_operand" "=3Dr,r")
>> - (plus:SI (match_operand:SI 1 "register_operand" " r,r")
>> - (match_operand:SI 2 "arith_operand"    " r,I")))]
>> +(define_expand "add<mode>3"
>> +  [(parallel
>> +    [(set (match_operand:GPR          0 "register_operand" "")
>> +  (plus:GPR (match_operand:GPR 1 "register_operand" "")
>> +   (match_operand:GPR 2 "add_operand" "")))
>> +    (clobber (match_scratch:GPR 3 ""))])]
>>    ""
>> -  "add%i2%~\t%0,%1,%2"
>> +{
>> +  if (riscv_eliminable_reg (operands[1]))
>> +  {
>> +    if (splittable_const_int_operand (operands[2], <MODE>mode))
>> +    {
>> +      /* The idea here is that we emit
>> +          add op0, op1, %hi(op2)
>> +          addi op0, op0, %lo(op2)
>> +        Then when op1, the eliminable reg, gets replaced with sp+offset,
>> +        we can simplify the constants.  */
>> +      HOST_WIDE_INT high_part =3D CONST_HIGH_PART (INTVAL (operands[2])=
);
>> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[1],
>> +                  GEN_INT (high_part)));
>> +      operands[1] =3D operands[0];
>> +      operands[2] =3D GEN_INT (INTVAL (operands[2]) - high_part);
>> +    }
>> +    else if (! const_arith_operand (operands[2], <MODE>mode))
>> +      operands[2] =3D force_reg (<MODE>mode, operands[2]);
>> +  }
>> +})
>> +
>> +(define_insn_and_split "add<mode>3_internal"
>> +  [(set (match_operand:GPR          0 "register_operand" "=3Dr,r,&r,!&r=
")
>> + (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r,r,0")
>> +  (match_operand:GPR 2 "add_operand"      " r,I,L,L")))
>> +  (clobber (match_scratch:GPR 3 "=3DX,X,X,&r"))]
>> +  ""
>> +{
>> +  if ((which_alternative =3D=3D 2) || (which_alternative =3D=3D 3))
>> +    return "#";
>> +
>> +  if (TARGET_64BIT && <MODE>mode =3D=3D SImode)
>> +    return "add%i2w\t%0,%1,%2";
>> +  else
>> +    return "add%i2\t%0,%1,%2";
>> +}
>> +  "&& reload_completed && const_lui_operand (operands[2], <MODE>mode)"
>> +  [(const_int 0)]
>> +{
>> +  if (REGNO (operands[0]) !=3D REGNO (operands[1]))
>> +    {
>> +      emit_insn (gen_mov<mode> (operands[0], operands[2]));
>> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
>> operands[1]));
>> +    }
>> +  else
>> +    {
>> +      emit_insn (gen_mov<mode> (operands[3], operands[2]));
>> +      emit_insn (gen_add<mode>3_internal (operands[0], operands[0],
>> operands[3]));
>> +    }
>> +  DONE;
>> +}
>>    [(set_attr "type" "arith")
>> -   (set_attr "mode" "SI")])
>> +   (set_attr "mode" "<MODE>")])
>>
>> -(define_insn "adddi3"
>> -  [(set (match_operand:DI          0 "register_operand" "=3Dr,r")
>> - (plus:DI (match_operand:DI 1 "register_operand" " r,r")
>> - (match_operand:DI 2 "arith_operand"    " r,I")))]
>> -  "TARGET_64BIT"
>> -  "add%i2\t%0,%1,%2"
>> +(define_insn "add<mode>3_internal2"
>> +  [(set (match_operand:GPR          0 "register_operand" "=3Dr,r")
>> + (plus:GPR (match_operand:GPR 1 "register_operand" " %r,r")
>> +  (match_operand:GPR 2 "arith_operand"    " r,I")))]
>> +  ""
>> + {
>> +   if (TARGET_64BIT && <MODE>mode =3D=3D SImode)
>> +     return "add%i2w\t%0,%1,%2";
>> +   else
>> +     return "add%i2\t%0,%1,%2";
>> + }
>>    [(set_attr "type" "arith")
>> -   (set_attr "mode" "DI")])
>> +   (set_attr "mode" "<MODE>")])
>>
>>  (define_expand "addv<mode>4"
>>    [(set (match_operand:GPR           0 "register_operand" "=3Dr,r")
>> @@ -500,6 +557,7 @@ (define_expand "addv<mode>4"
>>    DONE;
>>  })
>>
>> +
>>  (define_expand "uaddv<mode>4"
>>    [(set (match_operand:GPR           0 "register_operand" "=3Dr,r")
>>   (plus:GPR (match_operand:GPR 1 "register_operand" " r,r")
>> --
>> 2.25.1
>>
>

--0000000000009d9e5f05e924b0a1--