* [PATCH v1] LoongArch: Optimize implementation of single-precision floating-point approximate division.
@ 2024-01-24 9:44 Li Wei
2024-01-26 8:17 ` [pushed][PATCH " chenglulu
0 siblings, 1 reply; 2+ messages in thread
From: Li Wei @ 2024-01-24 9:44 UTC (permalink / raw)
To: gcc-patches; +Cc: xry111, i, xuchenghua, chenglulu, Li Wei
We found that in the spec17 521.wrf program, some loop-invariant code generated
for the single-precision floating-point approximate division failed to be
hoisted out of the loop. This is because the pseudo-register that stores the
intermediate temporary calculation results is rewritten in the implementation
of single-precision floating-point approximate division, which prevents the
loop2_invariant pass from hoisting the invariants. To fix this, the intermediate
temporary calculation results are stored in new pseudo-registers without
destroying the read-write dependency, so that they can be recognized as loop
invariants in the loop2_invariant pass.
After optimization, the number of instructions of 521.wrf is reduced by 0.18%
compared with before optimization (1716612948501 -> 1713471771364).
gcc/ChangeLog:
* config/loongarch/loongarch.cc (loongarch_emit_swdivsf): Adjust.
gcc/testsuite/ChangeLog:
* gcc.target/loongarch/invariant-recip.c: New test.
---
gcc/config/loongarch/loongarch.cc | 19 +++++++----
.../gcc.target/loongarch/invariant-recip.c | 33 +++++++++++++++++++
2 files changed, 46 insertions(+), 6 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/loongarch/invariant-recip.c
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index 32a0b6f43e8..1b88147fd8c 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -10894,16 +10894,23 @@ void loongarch_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
/* x0 = 1./b estimate. */
emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
unspec)));
- /* 2.0 - b * x0 */
+ /* e0 = 2.0 - b * x0. */
emit_insn (gen_rtx_SET (e0, gen_rtx_FMA (mode,
gen_rtx_NEG (mode, b), x0, mtwo)));
- /* x0 = a * x0 */
if (a != CONST1_RTX (mode))
- emit_insn (gen_rtx_SET (x0, gen_rtx_MULT (mode, a, x0)));
-
- /* res = e0 * x0 */
- emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, x0)));
+ {
+ rtx e1 = gen_reg_rtx (mode);
+ /* e1 = a * x0. */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, a, x0)));
+ /* res = e0 * e1. */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, e1)));
+ }
+ else
+ {
+ /* res = e0 * x0. */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, x0)));
+ }
}
static bool
diff --git a/gcc/testsuite/gcc.target/loongarch/invariant-recip.c b/gcc/testsuite/gcc.target/loongarch/invariant-recip.c
new file mode 100644
index 00000000000..2f64f6ed5e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/invariant-recip.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=loongarch64 -mabi=lp64d -mrecip -mfrecipe -fdump-rtl-loop2_invariant " } */
+/* { dg-final { scan-rtl-dump "Decided to move dependent invariant" "loop2_invariant" } } */
+
+void
+nislfv_rain_plm (int im, int km, float dzl[im][km], float rql[im][km],
+ float dt)
+{
+ int i, k;
+ float con1, decfl;
+ float dz[km], qn[km], wi[km + 1];
+
+ for (i = 0; i < im; i++)
+ {
+ for (k = 0; k < km; k++)
+ {
+ dz[k] = dzl[i][k];
+ }
+ con1 = 0.05;
+ for (k = km - 1; k >= 0; k--)
+ {
+ decfl = (wi[k + 1] - wi[k]) * dt / dz[k];
+ if (decfl > con1)
+ {
+ wi[k] = wi[k + 1] - con1 * dz[k] / dt;
+ }
+ }
+ for (k = 0; k < km; k++)
+ {
+ rql[i][k] = qn[k];
+ }
+ }
+}
--
2.39.3
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [pushed][PATCH v1] LoongArch: Optimize implementation of single-precision floating-point approximate division.
2024-01-24 9:44 [PATCH v1] LoongArch: Optimize implementation of single-precision floating-point approximate division Li Wei
@ 2024-01-26 8:17 ` chenglulu
0 siblings, 0 replies; 2+ messages in thread
From: chenglulu @ 2024-01-26 8:17 UTC (permalink / raw)
To: Li Wei, gcc-patches; +Cc: xry111, i, xuchenghua
Pushed to r14-8444.
在 2024/1/24 下午5:44, Li Wei 写道:
> We found that in the spec17 521.wrf program, some loop-invariant code generated
> for the single-precision floating-point approximate division failed to be
> hoisted out of the loop. This is because the pseudo-register that stores the
> intermediate temporary calculation results is rewritten in the implementation
> of single-precision floating-point approximate division, which prevents the
> loop2_invariant pass from hoisting the invariants. To fix this, the intermediate
> temporary calculation results are stored in new pseudo-registers without
> destroying the read-write dependency, so that they can be recognized as loop
> invariants in the loop2_invariant pass.
> After optimization, the number of instructions of 521.wrf is reduced by 0.18%
> compared with before optimization (1716612948501 -> 1713471771364).
>
> gcc/ChangeLog:
>
> * config/loongarch/loongarch.cc (loongarch_emit_swdivsf): Adjust.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/loongarch/invariant-recip.c: New test.
> ---
> gcc/config/loongarch/loongarch.cc | 19 +++++++----
> .../gcc.target/loongarch/invariant-recip.c | 33 +++++++++++++++++++
> 2 files changed, 46 insertions(+), 6 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/loongarch/invariant-recip.c
>
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index 32a0b6f43e8..1b88147fd8c 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -10894,16 +10894,23 @@ void loongarch_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
> /* x0 = 1./b estimate. */
> emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
> unspec)));
> - /* 2.0 - b * x0 */
> + /* e0 = 2.0 - b * x0. */
> emit_insn (gen_rtx_SET (e0, gen_rtx_FMA (mode,
> gen_rtx_NEG (mode, b), x0, mtwo)));
>
> - /* x0 = a * x0 */
> if (a != CONST1_RTX (mode))
> - emit_insn (gen_rtx_SET (x0, gen_rtx_MULT (mode, a, x0)));
> -
> - /* res = e0 * x0 */
> - emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, x0)));
> + {
> + rtx e1 = gen_reg_rtx (mode);
> + /* e1 = a * x0. */
> + emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, a, x0)));
> + /* res = e0 * e1. */
> + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, e1)));
> + }
> + else
> + {
> + /* res = e0 * x0. */
> + emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, x0)));
> + }
> }
>
> static bool
> diff --git a/gcc/testsuite/gcc.target/loongarch/invariant-recip.c b/gcc/testsuite/gcc.target/loongarch/invariant-recip.c
> new file mode 100644
> index 00000000000..2f64f6ed5e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/invariant-recip.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -march=loongarch64 -mabi=lp64d -mrecip -mfrecipe -fdump-rtl-loop2_invariant " } */
> +/* { dg-final { scan-rtl-dump "Decided to move dependent invariant" "loop2_invariant" } } */
> +
> +void
> +nislfv_rain_plm (int im, int km, float dzl[im][km], float rql[im][km],
> + float dt)
> +{
> + int i, k;
> + float con1, decfl;
> + float dz[km], qn[km], wi[km + 1];
> +
> + for (i = 0; i < im; i++)
> + {
> + for (k = 0; k < km; k++)
> + {
> + dz[k] = dzl[i][k];
> + }
> + con1 = 0.05;
> + for (k = km - 1; k >= 0; k--)
> + {
> + decfl = (wi[k + 1] - wi[k]) * dt / dz[k];
> + if (decfl > con1)
> + {
> + wi[k] = wi[k + 1] - con1 * dz[k] / dt;
> + }
> + }
> + for (k = 0; k < km; k++)
> + {
> + rql[i][k] = qn[k];
> + }
> + }
> +}
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2024-01-26 8:17 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-24 9:44 [PATCH v1] LoongArch: Optimize implementation of single-precision floating-point approximate division Li Wei
2024-01-26 8:17 ` [pushed][PATCH " chenglulu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).