[PATCH] LoongArch: Optimize LSX vector shuffle on floating-point vector

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] LoongArch: Optimize LSX vector shuffle on floating-point vector
@ 2023-11-19  7:01 Xi Ruoyao
  2023-11-22  6:41 ` chenglulu
  0 siblings, 1 reply; 2+ messages in thread
From: Xi Ruoyao @ 2023-11-19  7:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: chenglulu, i, xuchenghua, Xi Ruoyao

The vec_perm expander was wrongly defined.  The GCC Internals manual says:

Operand 3 is the “selector”.  It is an integral mode vector of the same
width and number of elements as mode M.

With this mistake, the generic code manages to work around the wrong
selector mode, but it ends up generating some very nasty code for a
simple __builtin_shuffle (a, b, c) where a and b are V4SF and c is V4SI:

    la.local    $r12,.LANCHOR0
    la.local    $r13,.LANCHOR1
    vld $vr1,$r12,48
    vslli.w $vr1,$vr1,2
    vld $vr2,$r12,16
    vld $vr0,$r13,0
    vld $vr3,$r13,16
    vshuf.b $vr0,$vr1,$vr1,$vr0
    vld $vr1,$r12,32
    vadd.b  $vr0,$vr0,$vr3
    vandi.b $vr0,$vr0,31
    vshuf.b $vr0,$vr1,$vr2,$vr0
    vst $vr0,$r12,0
    jr  $r1

This is clearly suboptimal.  Fix the expander definition and adjust
loongarch_expand_vec_perm to handle the integer selector mode correctly.

gcc/ChangeLog:

	* config/loongarch/lsx.md (vec_perm<mode:LSX>): Make the
	selector VIMODE.
	* config/loongarch/loongarch.cc (loongarch_expand_vec_perm):
	Use the mode of the selector (instead of the shuffled vector)
	for truncating it.  Operate on subregs in the selector mode if
	the shuffled vector has a different mode (i.e. it's a
	floating-point vector).

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/vect-shuf-fp.c: New test.
---

Bootstrapped & regtested on loongarch64-linux-gnu.  Ok for trunk?

 gcc/config/loongarch/loongarch.cc              | 18 ++++++++++--------
 gcc/config/loongarch/lsx.md                    |  2 +-
 .../gcc.target/loongarch/vect-shuf-fp.c        | 16 ++++++++++++++++
 3 files changed, 27 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c

diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index ce601a331f7..33357c670e1 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -8607,8 +8607,9 @@ void
 loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
 {
   machine_mode vmode = GET_MODE (target);
+  machine_mode vimode = GET_MODE (sel);
   auto nelt = GET_MODE_NUNITS (vmode);
-  auto round_reg = gen_reg_rtx (vmode);
+  auto round_reg = gen_reg_rtx (vimode);
   rtx round_data[MAX_VECT_LEN];
 
   for (int i = 0; i < nelt; i += 1)
@@ -8616,9 +8617,16 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
       round_data[i] = GEN_INT (0x1f);
     }
 
-  rtx round_data_rtx = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, round_data));
+  rtx round_data_rtx = gen_rtx_CONST_VECTOR (vimode, gen_rtvec_v (nelt, round_data));
   emit_move_insn (round_reg, round_data_rtx);
 
+  if (vmode != vimode)
+    {
+      target = lowpart_subreg (vimode, target, vmode);
+      op0 = lowpart_subreg (vimode, op0, vmode);
+      op1 = lowpart_subreg (vimode, op1, vmode);
+    }
+
   switch (vmode)
     {
     case E_V16QImode:
@@ -8626,17 +8634,11 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
       emit_insn (gen_lsx_vshuf_b (target, op1, op0, sel));
       break;
     case E_V2DFmode:
-      emit_insn (gen_andv2di3 (sel, sel, round_reg));
-      emit_insn (gen_lsx_vshuf_d_f (target, sel, op1, op0));
-      break;
     case E_V2DImode:
       emit_insn (gen_andv2di3 (sel, sel, round_reg));
       emit_insn (gen_lsx_vshuf_d (target, sel, op1, op0));
       break;
     case E_V4SFmode:
-      emit_insn (gen_andv4si3 (sel, sel, round_reg));
-      emit_insn (gen_lsx_vshuf_w_f (target, sel, op1, op0));
-      break;
     case E_V4SImode:
       emit_insn (gen_andv4si3 (sel, sel, round_reg));
       emit_insn (gen_lsx_vshuf_w (target, sel, op1, op0));
diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
index 8ea41c85b01..5e8d8d74b43 100644
--- a/gcc/config/loongarch/lsx.md
+++ b/gcc/config/loongarch/lsx.md
@@ -837,7 +837,7 @@ (define_expand "vec_perm<mode>"
  [(match_operand:LSX 0 "register_operand")
   (match_operand:LSX 1 "register_operand")
   (match_operand:LSX 2 "register_operand")
-  (match_operand:LSX 3 "register_operand")]
+  (match_operand:<VIMODE> 3 "register_operand")]
   "ISA_HAS_LSX"
 {
   loongarch_expand_vec_perm (operands[0], operands[1],
diff --git a/gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c b/gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c
new file mode 100644
index 00000000000..7acc2113afe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mlasx -O3" } */
+/* { dg-final { scan-assembler "vshuf\.w" } } */
+
+#define V __attribute__ ((vector_size (16)))
+
+int a V;
+float b V;
+float c V;
+float d V;
+
+void
+test (void)
+{
+  d = __builtin_shuffle (b, c, a);
+}
-- 
2.42.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] LoongArch: Optimize LSX vector shuffle on floating-point vector
  2023-11-19  7:01 [PATCH] LoongArch: Optimize LSX vector shuffle on floating-point vector Xi Ruoyao
@ 2023-11-22  6:41 ` chenglulu
  0 siblings, 0 replies; 2+ messages in thread
From: chenglulu @ 2023-11-22  6:41 UTC (permalink / raw)
  To: Xi Ruoyao, gcc-patches; +Cc: i, xuchenghua


在 2023/11/19 下午3:01, Xi Ruoyao 写道:
> The vec_perm expander was wrongly defined.  GCC internal says:
>
> Operand 3 is the “selector”.  It is an integral mode vector of the same
> width and number of elements as mode M.
>
> With this mistake, the generic code manages to work around and it ends
> up creating some very nasty code for a simple __builtin_shuffle (a, b,
> c) where a and b are V4SF, c is V4SI:
>
>      la.local    $r12,.LANCHOR0
>      la.local    $r13,.LANCHOR1
>      vld $vr1,$r12,48
>      vslli.w $vr1,$vr1,2
>      vld $vr2,$r12,16
>      vld $vr0,$r13,0
>      vld $vr3,$r13,16
>      vshuf.b $vr0,$vr1,$vr1,$vr0
>      vld $vr1,$r12,32
>      vadd.b  $vr0,$vr0,$vr3
>      vandi.b $vr0,$vr0,31
>      vshuf.b $vr0,$vr1,$vr2,$vr0
>      vst $vr0,$r12,0
>      jr  $r1
>
> This is obviously stupid.  Fix the expander definition and adjust
> loongarch_expand_vec_perm to handle it correctly.
>
> gcc/ChangeLog:
>
> 	* config/loongarch/lsx.md (vec_perm<mode:LSX>): Make the
> 	selector VIMODE.
> 	* config/loongarch/loongarch.cc (loongarch_expand_vec_perm):
> 	Use the mode of the selector (instead of the shuffled vector)
> 	for truncating it.  Operate on subregs in the selector mode if
> 	the shuffled vector has a different mode (i. e. it's a
> 	floating-point vector).
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/loongarch/vect-shuf-fp.c: New test.
> ---
>
> Bootstrapped & regtested on loongarch64-linux-gnu.  Ok for trunk?
LGTM. Thanks!
>
>   gcc/config/loongarch/loongarch.cc              | 18 ++++++++++--------
>   gcc/config/loongarch/lsx.md                    |  2 +-
>   .../gcc.target/loongarch/vect-shuf-fp.c        | 16 ++++++++++++++++
>   3 files changed, 27 insertions(+), 9 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c
>
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index ce601a331f7..33357c670e1 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -8607,8 +8607,9 @@ void
>   loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
>   {
>     machine_mode vmode = GET_MODE (target);
> +  machine_mode vimode = GET_MODE (sel);
>     auto nelt = GET_MODE_NUNITS (vmode);
> -  auto round_reg = gen_reg_rtx (vmode);
> +  auto round_reg = gen_reg_rtx (vimode);
>     rtx round_data[MAX_VECT_LEN];
>   
>     for (int i = 0; i < nelt; i += 1)
> @@ -8616,9 +8617,16 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
>         round_data[i] = GEN_INT (0x1f);
>       }
>   
> -  rtx round_data_rtx = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, round_data));
> +  rtx round_data_rtx = gen_rtx_CONST_VECTOR (vimode, gen_rtvec_v (nelt, round_data));
>     emit_move_insn (round_reg, round_data_rtx);
>   
> +  if (vmode != vimode)
> +    {
> +      target = lowpart_subreg (vimode, target, vmode);
> +      op0 = lowpart_subreg (vimode, op0, vmode);
> +      op1 = lowpart_subreg (vimode, op1, vmode);
> +    }
> +
>     switch (vmode)
>       {
>       case E_V16QImode:
> @@ -8626,17 +8634,11 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
>         emit_insn (gen_lsx_vshuf_b (target, op1, op0, sel));
>         break;
>       case E_V2DFmode:
> -      emit_insn (gen_andv2di3 (sel, sel, round_reg));
> -      emit_insn (gen_lsx_vshuf_d_f (target, sel, op1, op0));
> -      break;
>       case E_V2DImode:
>         emit_insn (gen_andv2di3 (sel, sel, round_reg));
>         emit_insn (gen_lsx_vshuf_d (target, sel, op1, op0));
>         break;
>       case E_V4SFmode:
> -      emit_insn (gen_andv4si3 (sel, sel, round_reg));
> -      emit_insn (gen_lsx_vshuf_w_f (target, sel, op1, op0));
> -      break;
>       case E_V4SImode:
>         emit_insn (gen_andv4si3 (sel, sel, round_reg));
>         emit_insn (gen_lsx_vshuf_w (target, sel, op1, op0));
> diff --git a/gcc/config/loongarch/lsx.md b/gcc/config/loongarch/lsx.md
> index 8ea41c85b01..5e8d8d74b43 100644
> --- a/gcc/config/loongarch/lsx.md
> +++ b/gcc/config/loongarch/lsx.md
> @@ -837,7 +837,7 @@ (define_expand "vec_perm<mode>"
>    [(match_operand:LSX 0 "register_operand")
>     (match_operand:LSX 1 "register_operand")
>     (match_operand:LSX 2 "register_operand")
> -  (match_operand:LSX 3 "register_operand")]
> +  (match_operand:<VIMODE> 3 "register_operand")]
>     "ISA_HAS_LSX"
>   {
>     loongarch_expand_vec_perm (operands[0], operands[1],
> diff --git a/gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c b/gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c
> new file mode 100644
> index 00000000000..7acc2113afe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/loongarch/vect-shuf-fp.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mlasx -O3" } */
> +/* { dg-final { scan-assembler "vshuf\.w" } } */
> +
> +#define V __attribute__ ((vector_size (16)))
> +
> +int a V;
> +float b V;
> +float c V;
> +float d V;
> +
> +void
> +test (void)
> +{
> +  d = __builtin_shuffle (b, c, a);
> +}


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-11-22  6:41 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-19  7:01 [PATCH] LoongArch: Optimize LSX vector shuffle on floating-point vector Xi Ruoyao
2023-11-22  6:41 ` chenglulu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).