public inbox for gcc-patches@gcc.gnu.org
* [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
@ 2023-08-11  9:01 Lehua Ding
  2023-08-11 15:04 ` Jeff Law
  2023-08-11 23:02 ` Jeff Law
  0 siblings, 2 replies; 8+ messages in thread
From: Lehua Ding @ 2023-08-11  9:01 UTC (permalink / raw)
  To: gcc-patches; +Cc: juzhe.zhong, kito.cheng, rdapp.gcc, palmer, jeffreyalaw

Hi,

This patch reverts the conversion from vmv.s.x to vmv.v.i and adds a new pattern
that optimizes the special case where the scalar operand is zero.

Currently, a broadcast pattern whose scalar operand is an immediate is
converted from vmv.s.x to vmv.v.i, and the mask operand is converted
from 00..01 to 11..11. After discussing with Juzhe offline, we weighed
the advantages and disadvantages on both sides of this conversion and
chose not to do the transform.
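
To make the difference concrete, here is a short sketch in terms of the
RVV intrinsics (the intrinsic spellings follow the current RVV intrinsic
API and are only an illustration, not part of the patch):

```
#include <riscv_vector.h>

/* Writes element 0 only (mask 00..01); for a non-zero immediate this
   needs li + vmv.s.x.  */
vint32m1_t first_element (size_t vl)
{
  return __riscv_vmv_s_x_i32m1 (5, vl);
}

/* Writes every element (mask 11..11); a simm5 immediate can use
   vmv.v.i directly.  */
vint32m1_t whole_vector (size_t vl)
{
  return __riscv_vmv_v_x_i32m1 (5, vl);
}
```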

Before:

  Advantages: The vsetvli info required by vmv.s.x is more compatible, since
  vmv.s.x only requires SEW and that VL be zero or one. That means there
  are more opportunities to combine with other vsetvl infos in the vsetvl pass.

  Disadvantages: For a non-zero scalar immediate, one extra `li rd, imm`
  instruction is needed.

After:

  Advantages: No `li rd, imm` instruction is needed, since vmv.v.i supports an
  immediate operand.

  Disadvantages: The inverse of the advantages above: worse compatibility leads
  to more vsetvl instructions being needed.

Consider the C code below and the asm after autovectorization:
there is an extra insn (vsetivli zero, 1, e32, m1, ta, ma)
after converting vmv.s.x to vmv.v.i.

```
int foo1(int* restrict a, int* restrict b, int *restrict c, int n) {
    int sum = 0;
    for (int i = 0; i < n; i++)
      sum += a[i] * b[i];
    
    return sum;
}
```

asm (Before):

```
foo1:
        ble     a3,zero,.L7
        vsetvli a2,zero,e32,m1,ta,ma
        vmv.v.i v1,0
.L6:
        vsetvli a5,a3,e32,m1,tu,ma
        slli    a4,a5,2
        sub     a3,a3,a5
        vle32.v v2,0(a0)
        vle32.v v3,0(a1)
        add     a0,a0,a4
        add     a1,a1,a4
        vmacc.vv        v1,v3,v2
        bne     a3,zero,.L6
        vsetvli a2,zero,e32,m1,ta,ma
        vmv.s.x v2,zero
        vredsum.vs      v1,v1,v2
        vmv.x.s a0,v1
        ret
.L7:
        li      a0,0
        ret
```

asm (After):

```
foo1:
        ble     a3,zero,.L4
        vsetvli a2,zero,e32,m1,ta,ma
        vmv.v.i v1,0
.L3:
        vsetvli a5,a3,e32,m1,tu,ma
        slli    a4,a5,2
        sub     a3,a3,a5
        vle32.v v2,0(a0)
        vle32.v v3,0(a1)
        add     a0,a0,a4
        add     a1,a1,a4
        vmacc.vv        v1,v3,v2
        bne     a3,zero,.L3
        vsetivli        zero,1,e32,m1,ta,ma
        vmv.v.i v2,0
        vsetvli a2,zero,e32,m1,ta,ma
        vredsum.vs      v1,v1,v2
        vmv.x.s a0,v1
        ret
.L4:
        li      a0,0
        ret
```
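
The reduction tail above can be reproduced with a short intrinsics
sketch (the intrinsic spellings follow the current RVV intrinsic API
and are an assumption, not part of the patch); with this patch the seed
move should stay a single vmv.s.x v2,zero that can share the preceding
vsetvli:

```
#include <riscv_vector.h>

/* Seed the reduction with a single zero element, reduce, then read
   the scalar result back, mirroring the vredsum.vs tail above.  */
int sum_tail (vint32m1_t acc, size_t vl)
{
  vint32m1_t seed = __riscv_vmv_s_x_i32m1 (0, vl);   /* vmv.s.x v2,zero */
  vint32m1_t red = __riscv_vredsum_vs_i32m1_i32m1 (acc, seed, vl);
  return __riscv_vmv_x_s_i32m1_i32 (red);            /* vmv.x.s a0,v1 */
}
```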

Best,
Lehua

Co-Authored-By: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

gcc/ChangeLog:

	* config/riscv/predicates.md (vector_const_0_operand): New.
	* config/riscv/vector.md (*pred_broadcast<mode>_zero): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/base/scalar_move-5.c: Update.
	* gcc.target/riscv/rvv/base/scalar_move-6.c: Ditto.

---
 gcc/config/riscv/predicates.md                |  4 ++
 gcc/config/riscv/vector.md                    | 43 +++++++++++++------
 .../gcc.target/riscv/rvv/base/scalar_move-5.c | 20 +++++++--
 .../gcc.target/riscv/rvv/base/scalar_move-6.c | 22 ++++++++--
 4 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index f2e406c718a..c102489d979 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -300,6 +300,10 @@
        (match_test "satisfies_constraint_vi (op)
                     || satisfies_constraint_Wc0 (op)")))
 
+(define_predicate "vector_const_0_operand"
+  (and (match_code "const_vector")
+       (match_test "satisfies_constraint_Wc0 (op)")))
+
 (define_predicate "vector_move_operand"
   (ior (match_operand 0 "nonimmediate_operand")
        (and (match_code "const_vector")
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 508a3074080..4d98ab6f7e8 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1719,23 +1719,24 @@
 	  (match_operand:V_VLS 2 "vector_merge_operand")))]
   "TARGET_VECTOR"
 {
-  /* Handle vmv.s.x instruction which has memory scalar.  */
-  if (satisfies_constraint_Wdm (operands[3]) || riscv_vector::simm5_p (operands[3])
-      || rtx_equal_p (operands[3], CONST0_RTX (<VEL>mode)))
+  /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar.  */
+  if (satisfies_constraint_Wdm (operands[3]))
     {
       if (satisfies_constraint_Wb1 (operands[1]))
-        {
-          // Case 1: vmv.s.x (TA) ==> vlse.v (TA)
-          if (satisfies_constraint_vu (operands[2]))
-            operands[1] = CONSTM1_RTX (<VM>mode);
-          else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
-            {
-	      // Case 2: vmv.s.x (TU) ==> andi vl + vlse.v (TU) in RV32 system.
+	{
+	  /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA)  */
+	  if (satisfies_constraint_vu (operands[2]))
+	    operands[1] = CONSTM1_RTX (<VM>mode);
+	  else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
+	    {
+	      /* Case 2: vmv.s.x (TU, x == memory) ==>
+			   vl = 0 or 1; + vlse.v (TU) in RV32 system  */
 	      operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
 	      operands[1] = CONSTM1_RTX (<VM>mode);
 	    }
-          else
-            operands[3] = force_reg (<VEL>mode, operands[3]);
+	  else
+	    /* Case 3: load x (memory) to register.  */
+	    operands[3] = force_reg (<VEL>mode, operands[3]);
 	}
     }
   else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
@@ -1885,6 +1886,24 @@
   [(set_attr "type" "vimov,vimov")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*pred_broadcast<mode>_zero"
+  [(set (match_operand:V_VLS 0 "register_operand"                          "=vr,    vr")
+    (if_then_else:V_VLS
+      (unspec:<VM>
+        [(match_operand:<VM> 1 "vector_least_significant_set_mask_operand" "Wb1,   Wb1")
+         (match_operand 4 "vector_length_operand"                          " rK,    rK")
+         (match_operand 5 "const_int_operand"                              "  i,     i")
+         (match_operand 6 "const_int_operand"                              "  i,     i")
+         (match_operand 7 "const_int_operand"                              "  i,     i")
+         (reg:SI VL_REGNUM)
+         (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+      (match_operand:V_VLS 3 "vector_const_0_operand"                      "Wc0,   Wc0")
+      (match_operand:V_VLS 2 "vector_merge_operand"                        " vu,     0")))]
+  "TARGET_VECTOR"
+  "vmv.s.x\t%0,zero"
+  [(set_attr "type" "vimovxv,vimovxv")
+   (set_attr "mode" "<MODE>")])
+
 ;; -------------------------------------------------------------------------------
 ;; ---- Predicated Strided loads/stores
 ;; -------------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
index db6800c8978..2e897a4896f 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
@@ -121,7 +121,7 @@ void foo8 (void *base, void *out, size_t vl, double x)
 /*
 ** foo9:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
 ** ...
 ** ret
 */
@@ -150,7 +150,7 @@ void foo10 (void *base, void *out, size_t vl)
 /*
 ** foo11:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -164,7 +164,7 @@ void foo11 (void *base, void *out, size_t vl)
 /*
 ** foo12:
 ** ...
-** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -174,3 +174,17 @@ void foo12 (void *base, void *out, size_t vl)
     vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0, vl);
     *(vfloat64m2_t*)out = v;
 }
+
+/*
+** foo13:
+** ...
+** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** ...
+** ret
+*/
+void foo13 (void *base, void *out, size_t vl)
+{
+    vfloat64m2_t merge = *(vfloat64m2_t*) (base + 200);
+    vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0.2, vl);
+    *(vfloat64m2_t*)out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
index f27f85cdb58..326cfd8e2ff 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
@@ -119,7 +119,7 @@ void foo8 (void *base, void *out, size_t vl, double x)
 /*
 ** foo9:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
 ** ...
 ** ret
 */
@@ -133,7 +133,7 @@ void foo9 (void *base, void *out, size_t vl)
 /*
 ** foo10:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
 ** ...
 */
 void foo10 (void *base, void *out, size_t vl)
@@ -147,7 +147,7 @@ void foo10 (void *base, void *out, size_t vl)
 /*
 ** foo11:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -161,7 +161,7 @@ void foo11 (void *base, void *out, size_t vl)
 /*
 ** foo12:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -172,6 +172,20 @@ void foo12 (void *base, void *out, size_t vl)
     *(vfloat64m2_t*)out = v;
 }
 
+/*
+** foo12_1:
+** ...
+** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** ...
+** ret
+*/
+void foo12_1 (void *base, void *out, size_t vl)
+{
+    vfloat64m2_t merge = *(vfloat64m2_t*) (base + 200);
+    vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0.2, vl);
+    *(vfloat64m2_t*)out = v;
+}
+
 /*
 ** foo13:
 ** ...
-- 
2.36.3



* Re: [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
  2023-08-11  9:01 [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i Lehua Ding
@ 2023-08-11 15:04 ` Jeff Law
  2023-08-11 15:43   ` Lehua Ding
  2023-08-11 23:02 ` Jeff Law
  1 sibling, 1 reply; 8+ messages in thread
From: Jeff Law @ 2023-08-11 15:04 UTC (permalink / raw)
  To: Lehua Ding, gcc-patches; +Cc: juzhe.zhong, kito.cheng, rdapp.gcc, palmer



On 8/11/23 03:01, Lehua Ding wrote:
> Hi,
> 
> This patch reverts the conversion from vmv.s.x to vmv.v.i and adds a new pattern
> that optimizes the special case where the scalar operand is zero.
> 
> Currently, a broadcast pattern whose scalar operand is an immediate is
> converted from vmv.s.x to vmv.v.i, and the mask operand is converted
> from 00..01 to 11..11. After discussing with Juzhe offline, we weighed
> the advantages and disadvantages on both sides of this conversion and
> chose not to do the transform.
> 
> Before:
> 
>    Advantages: The vsetvli info required by vmv.s.x is more compatible, since
>    vmv.s.x only requires SEW and that VL be zero or one. That means there
>    are more opportunities to combine with other vsetvl infos in the vsetvl pass.
> 
>    Disadvantages: For a non-zero scalar immediate, one extra `li rd, imm`
>    instruction is needed.
> 
> After:
> 
>    Advantages: No `li rd, imm` instruction is needed, since vmv.v.i supports an
>    immediate operand.
> 
>    Disadvantages: The inverse of the advantages above: worse compatibility leads
>    to more vsetvl instructions being needed.
I can't speak for other uarches, but as a guiding principle for Ventana 
we're assuming vsetvl instructions are common and as a result need to be 
very cheap in hardware.   It's likely a good tradeoff for us.

I could see other uarches making different design choices though.  So at 
a high level, do we want this to be driven by cost modeling in some way?

Not a review yet.  Wanted to get that feedback to you now since the rest 
of my day is going to be fairly busy.

jeff


* Re: [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
  2023-08-11 15:04 ` Jeff Law
@ 2023-08-11 15:43   ` Lehua Ding
  2023-08-11 15:48     ` Jeff Law
  0 siblings, 1 reply; 8+ messages in thread
From: Lehua Ding @ 2023-08-11 15:43 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: juzhe.zhong, kito.cheng, rdapp.gcc, palmer

> I can't speak for other uarches, but as a guiding principle for Ventana
> we're assuming vsetvl instructions are common and as a result need to be
> very cheap in hardware.   It's likely a good tradeoff for us.
> 
> I could see other uarches making different design choices though.  So at
> a high level, do we want this to be driven by cost modeling in some way?
> 
> Not a review yet.  Wanted to get that feedback to you now since the rest
> of my day is going to be fairly busy.

Thanks for the feedback. We'll think about it some more.
Just out of curiosity, will the combination of vsetvli + vmv.v.x perform
better than li + vmv.s.x on Ventana's CPU?



* Re: [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
  2023-08-11 15:43   ` Lehua Ding
@ 2023-08-11 15:48     ` Jeff Law
  0 siblings, 0 replies; 8+ messages in thread
From: Jeff Law @ 2023-08-11 15:48 UTC (permalink / raw)
  To: Lehua Ding, gcc-patches; +Cc: juzhe.zhong, kito.cheng, rdapp.gcc, palmer



On 8/11/23 09:43, Lehua Ding wrote:
>  > I can't speak for other uarches, but as a guiding principle for Ventana
>  > we're assuming vsetvl instructions are common and as a result need to be
>  > very cheap in hardware.   It's likely a good tradeoff for us.
> 
>  > I could see other uarches making different design choices though.  So at
>  > a high level, do we want this to be driven by cost modeling in some way?
> 
>  > Not a review yet.  Wanted to get that feedback to you now since the rest
>  > of my day is going to be fairly busy.
> 
> Thanks for the feedback. We'll think about it some more.
> Just out of curiosity, will the combination of vsetvli + vmv.v.x perform
> better than li + vmv.s.x on Ventana's CPU?
It's context dependent, but in general vsetvli + vmv would be 
better than li + vmv.


jeff


* Re: [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
  2023-08-11  9:01 [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i Lehua Ding
  2023-08-11 15:04 ` Jeff Law
@ 2023-08-11 23:02 ` Jeff Law
  2023-08-12  4:09   ` Lehua Ding
  1 sibling, 1 reply; 8+ messages in thread
From: Jeff Law @ 2023-08-11 23:02 UTC (permalink / raw)
  To: Lehua Ding, gcc-patches; +Cc: juzhe.zhong, kito.cheng, rdapp.gcc, palmer



On 8/11/23 03:01, Lehua Ding wrote:
> Hi,
> 
> This patch reverts the conversion from vmv.s.x to vmv.v.i and adds a new pattern
> that optimizes the special case where the scalar operand is zero.
> 
> Currently, a broadcast pattern whose scalar operand is an immediate is
> converted from vmv.s.x to vmv.v.i, and the mask operand is converted
> from 00..01 to 11..11. After discussing with Juzhe offline, we weighed
> the advantages and disadvantages on both sides of this conversion and
> chose not to do the transform.
> 
> Before:
> 
>    Advantages: The vsetvli info required by vmv.s.x is more compatible, since
>    vmv.s.x only requires SEW and that VL be zero or one. That means there
>    are more opportunities to combine with other vsetvl infos in the vsetvl pass.
> 
>    Disadvantages: For a non-zero scalar immediate, one extra `li rd, imm`
>    instruction is needed.
> 
> After:
> 
>    Advantages: No `li rd, imm` instruction is needed, since vmv.v.i supports an
>    immediate operand.
> 
>    Disadvantages: The inverse of the advantages above: worse compatibility leads
>    to more vsetvl instructions being needed.
> 
> Consider the C code below and the asm after autovectorization:
> there is an extra insn (vsetivli zero, 1, e32, m1, ta, ma)
> after converting vmv.s.x to vmv.v.i.
> 
> ```
> int foo1(int* restrict a, int* restrict b, int *restrict c, int n) {
>      int sum = 0;
>      for (int i = 0; i < n; i++)
>        sum += a[i] * b[i];
>      
>      return sum;
> }
> ```
> 
> asm (Before):
> 
> ```
> foo1:
>          ble     a3,zero,.L7
>          vsetvli a2,zero,e32,m1,ta,ma
>          vmv.v.i v1,0
> .L6:
>          vsetvli a5,a3,e32,m1,tu,ma
>          slli    a4,a5,2
>          sub     a3,a3,a5
>          vle32.v v2,0(a0)
>          vle32.v v3,0(a1)
>          add     a0,a0,a4
>          add     a1,a1,a4
>          vmacc.vv        v1,v3,v2
>          bne     a3,zero,.L6
>          vsetvli a2,zero,e32,m1,ta,ma
>          vmv.s.x v2,zero
>          vredsum.vs      v1,v1,v2
>          vmv.x.s a0,v1
>          ret
> .L7:
>          li      a0,0
>          ret
> ```
> 
> asm (After):
> 
> ```
> foo1:
>          ble     a3,zero,.L4
>          vsetvli a2,zero,e32,m1,ta,ma
>          vmv.v.i v1,0
> .L3:
>          vsetvli a5,a3,e32,m1,tu,ma
>          slli    a4,a5,2
>          sub     a3,a3,a5
>          vle32.v v2,0(a0)
>          vle32.v v3,0(a1)
>          add     a0,a0,a4
>          add     a1,a1,a4
>          vmacc.vv        v1,v3,v2
>          bne     a3,zero,.L3
>          vsetivli        zero,1,e32,m1,ta,ma
>          vmv.v.i v2,0
>          vsetvli a2,zero,e32,m1,ta,ma
>          vredsum.vs      v1,v1,v2
>          vmv.x.s a0,v1
>          ret
> .L4:
>          li      a0,0
>          ret
> ```
> 
> Best,
> Lehua
> 
> Co-Authored-By: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> gcc/ChangeLog:
> 
> 	* config/riscv/predicates.md (vector_const_0_operand): New.
> 	* config/riscv/vector.md (*pred_broadcast<mode>_zero): Ditto.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/rvv/base/scalar_move-5.c: Update.
> 	* gcc.target/riscv/rvv/base/scalar_move-6.c: Ditto.
If we encounter a uarch where the other sequence is better, then I think 
we can do something like query costs or the like and select between the 
approaches -- but no need to do that now.

So OK for the trunk.
jeff


* Re: [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
  2023-08-11 23:02 ` Jeff Law
@ 2023-08-12  4:09   ` Lehua Ding
  0 siblings, 0 replies; 8+ messages in thread
From: Lehua Ding @ 2023-08-12  4:09 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: juzhe.zhong, kito.cheng, rdapp.gcc, palmer

> If we encounter a uarch where the other sequence is better, then I think
> we can do something like query costs or the like and select between the
> approaches -- but no need to do that now.
> 
> So OK for the trunk.

Thanks, the patch will be committed soon.



* Re: [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
  2024-02-20  4:15 Alexandre Oliva
@ 2024-02-23  7:37 ` Jeff Law
  0 siblings, 0 replies; 8+ messages in thread
From: Jeff Law @ 2024-02-23  7:37 UTC (permalink / raw)
  To: Alexandre Oliva, gcc-patches
  Cc: Kito Cheng, Palmer Dabbelt, Andrew Waterman, Jim Wilson,
	Lehua Ding, Ju-Zhe Zhong



On 2/19/24 21:15, Alexandre Oliva wrote:
> This backport is the first of two required for the pr111935 testcase,
> already backported to gcc-13, to pass on riscv64-elf and riscv32-elf.
> The V_VLS mode iterator, used in the original patch, is not available in
> gcc-13, and I thought that would be too much to backport (and maybe so
> are these two patches, WDYT?), so I changed it to V, to match the
> preexisting gcc-13 pattern.  Regstrapped on x86_64-linux-gnu, along with
> other backports, and tested manually on riscv64-elf.  Ok to install?
> 
> From: Lehua Ding <lehua.ding@rivai.ai>
> 
> Hi,
> 
> This patch reverts the conversion from vmv.s.x to vmv.v.i and adds a new pattern
> that optimizes the special case where the scalar operand is zero.
> 
> Currently, a broadcast pattern whose scalar operand is an immediate is
> converted from vmv.s.x to vmv.v.i, and the mask operand is converted
> from 00..01 to 11..11. After discussing with Juzhe offline, we weighed
> the advantages and disadvantages on both sides of this conversion and
> chose not to do the transform.
> 
> Before:
> 
>    Advantages: The vsetvli info required by vmv.s.x is more compatible, since
>    vmv.s.x only requires SEW and that VL be zero or one. That means there
>    are more opportunities to combine with other vsetvl infos in the vsetvl pass.
> 
>    Disadvantages: For a non-zero scalar immediate, one extra `li rd, imm`
>    instruction is needed.
> 
> After:
> 
>    Advantages: No `li rd, imm` instruction is needed, since vmv.v.i supports an
>    immediate operand.
> 
>    Disadvantages: The inverse of the advantages above: worse compatibility leads
>    to more vsetvl instructions being needed.
> 
> Consider the C code below and the asm after autovectorization:
> there is an extra insn (vsetivli zero, 1, e32, m1, ta, ma)
> after converting vmv.s.x to vmv.v.i.
> 
> ```
> int foo1(int* restrict a, int* restrict b, int *restrict c, int n) {
>      int sum = 0;
>      for (int i = 0; i < n; i++)
>        sum += a[i] * b[i];
> 
>      return sum;
> }
> ```
> 
> asm (Before):
> 
> ```
> foo1:
>          ble     a3,zero,.L7
>          vsetvli a2,zero,e32,m1,ta,ma
>          vmv.v.i v1,0
> .L6:
>          vsetvli a5,a3,e32,m1,tu,ma
>          slli    a4,a5,2
>          sub     a3,a3,a5
>          vle32.v v2,0(a0)
>          vle32.v v3,0(a1)
>          add     a0,a0,a4
>          add     a1,a1,a4
>          vmacc.vv        v1,v3,v2
>          bne     a3,zero,.L6
>          vsetvli a2,zero,e32,m1,ta,ma
>          vmv.s.x v2,zero
>          vredsum.vs      v1,v1,v2
>          vmv.x.s a0,v1
>          ret
> .L7:
>          li      a0,0
>          ret
> ```
> 
> asm (After):
> 
> ```
> foo1:
>          ble     a3,zero,.L4
>          vsetvli a2,zero,e32,m1,ta,ma
>          vmv.v.i v1,0
> .L3:
>          vsetvli a5,a3,e32,m1,tu,ma
>          slli    a4,a5,2
>          sub     a3,a3,a5
>          vle32.v v2,0(a0)
>          vle32.v v3,0(a1)
>          add     a0,a0,a4
>          add     a1,a1,a4
>          vmacc.vv        v1,v3,v2
>          bne     a3,zero,.L3
>          vsetivli        zero,1,e32,m1,ta,ma
>          vmv.v.i v2,0
>          vsetvli a2,zero,e32,m1,ta,ma
>          vredsum.vs      v1,v1,v2
>          vmv.x.s a0,v1
>          ret
> .L4:
>          li      a0,0
>          ret
> ```
> 
> Best,
> Lehua
> 
> Co-Authored-By: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
> 
> gcc/ChangeLog:
> 
> 	* config/riscv/predicates.md (vector_const_0_operand): New.
> 	* config/riscv/vector.md (*pred_broadcast<mode>_zero): Ditto.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/riscv/rvv/base/scalar_move-5.c: Update.
> 	* gcc.target/riscv/rvv/base/scalar_move-6.c: Ditto.
I wouldn't backport this.  Vector isn't something that's really expected 
to work with gcc-13.  Yea, you can do a bit of intrinsics, but that's it.

Jeff



* [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i
@ 2024-02-20  4:15 Alexandre Oliva
  2024-02-23  7:37 ` Jeff Law
  0 siblings, 1 reply; 8+ messages in thread
From: Alexandre Oliva @ 2024-02-20  4:15 UTC (permalink / raw)
  To: gcc-patches
  Cc: Kito Cheng, Palmer Dabbelt, Andrew Waterman, Jim Wilson,
	Lehua Ding, Ju-Zhe Zhong

This backport is the first of two required for the pr111935 testcase,
already backported to gcc-13, to pass on riscv64-elf and riscv32-elf.
The V_VLS mode iterator, used in the original patch, is not available in
gcc-13, and I thought that would be too much to backport (and maybe so
are these two patches, WDYT?), so I changed it to V, to match the
preexisting gcc-13 pattern.  Regstrapped on x86_64-linux-gnu, along with
other backports, and tested manually on riscv64-elf.  Ok to install?

From: Lehua Ding <lehua.ding@rivai.ai>

Hi,

This patch reverts the conversion from vmv.s.x to vmv.v.i and adds a new pattern
that optimizes the special case where the scalar operand is zero.

Currently, a broadcast pattern whose scalar operand is an immediate is
converted from vmv.s.x to vmv.v.i, and the mask operand is converted
from 00..01 to 11..11. After discussing with Juzhe offline, we weighed
the advantages and disadvantages on both sides of this conversion and
chose not to do the transform.

Before:

  Advantages: The vsetvli info required by vmv.s.x is more compatible, since
  vmv.s.x only requires SEW and that VL be zero or one. That means there
  are more opportunities to combine with other vsetvl infos in the vsetvl pass.

  Disadvantages: For a non-zero scalar immediate, one extra `li rd, imm`
  instruction is needed.

After:

  Advantages: No `li rd, imm` instruction is needed, since vmv.v.i supports an
  immediate operand.

  Disadvantages: The inverse of the advantages above: worse compatibility leads
  to more vsetvl instructions being needed.

Consider the C code below and the asm after autovectorization:
there is an extra insn (vsetivli zero, 1, e32, m1, ta, ma)
after converting vmv.s.x to vmv.v.i.

```
int foo1(int* restrict a, int* restrict b, int *restrict c, int n) {
    int sum = 0;
    for (int i = 0; i < n; i++)
      sum += a[i] * b[i];

    return sum;
}
```

asm (Before):

```
foo1:
        ble     a3,zero,.L7
        vsetvli a2,zero,e32,m1,ta,ma
        vmv.v.i v1,0
.L6:
        vsetvli a5,a3,e32,m1,tu,ma
        slli    a4,a5,2
        sub     a3,a3,a5
        vle32.v v2,0(a0)
        vle32.v v3,0(a1)
        add     a0,a0,a4
        add     a1,a1,a4
        vmacc.vv        v1,v3,v2
        bne     a3,zero,.L6
        vsetvli a2,zero,e32,m1,ta,ma
        vmv.s.x v2,zero
        vredsum.vs      v1,v1,v2
        vmv.x.s a0,v1
        ret
.L7:
        li      a0,0
        ret
```

asm (After):

```
foo1:
        ble     a3,zero,.L4
        vsetvli a2,zero,e32,m1,ta,ma
        vmv.v.i v1,0
.L3:
        vsetvli a5,a3,e32,m1,tu,ma
        slli    a4,a5,2
        sub     a3,a3,a5
        vle32.v v2,0(a0)
        vle32.v v3,0(a1)
        add     a0,a0,a4
        add     a1,a1,a4
        vmacc.vv        v1,v3,v2
        bne     a3,zero,.L3
        vsetivli        zero,1,e32,m1,ta,ma
        vmv.v.i v2,0
        vsetvli a2,zero,e32,m1,ta,ma
        vredsum.vs      v1,v1,v2
        vmv.x.s a0,v1
        ret
.L4:
        li      a0,0
        ret
```

Best,
Lehua

Co-Authored-By: Ju-Zhe Zhong <juzhe.zhong@rivai.ai>

gcc/ChangeLog:

	* config/riscv/predicates.md (vector_const_0_operand): New.
	* config/riscv/vector.md (*pred_broadcast<mode>_zero): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/base/scalar_move-5.c: Update.
	* gcc.target/riscv/rvv/base/scalar_move-6.c: Ditto.

(cherry picked from commit 86d80395cf3c8832b669135b1ca7ea8258790c19)
---
 gcc/config/riscv/predicates.md                     |    4 ++
 gcc/config/riscv/vector.md                         |   43 ++++++++++++++------
 .../gcc.target/riscv/rvv/base/scalar_move-5.c      |   20 ++++++++-
 .../gcc.target/riscv/rvv/base/scalar_move-6.c      |   22 ++++++++--
 4 files changed, 70 insertions(+), 19 deletions(-)

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 8654dbc594354..1707c80cba256 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -276,6 +276,10 @@ (define_predicate "reg_or_int_operand"
   (ior (match_operand 0 "register_operand")
        (match_operand 0 "const_int_operand")))
 
+(define_predicate "vector_const_0_operand"
+  (and (match_code "const_vector")
+       (match_test "satisfies_constraint_Wc0 (op)")))
+
 (define_predicate "vector_move_operand"
   (ior (match_operand 0 "nonimmediate_operand")
        (and (match_code "const_vector")
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index db3a972832aea..fb0caab8da360 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1217,23 +1217,24 @@ (define_expand "@pred_broadcast<mode>"
 	  (match_operand:V 2 "vector_merge_operand")))]
   "TARGET_VECTOR"
 {
-  /* Handle vmv.s.x instruction which has memory scalar.  */
-  if (satisfies_constraint_Wdm (operands[3]) || riscv_vector::simm5_p (operands[3])
-      || rtx_equal_p (operands[3], CONST0_RTX (<VEL>mode)))
+  /* Handle vmv.s.x instruction (Wb1 mask) which has memory scalar.  */
+  if (satisfies_constraint_Wdm (operands[3]))
     {
       if (satisfies_constraint_Wb1 (operands[1]))
-        {
-          // Case 1: vmv.s.x (TA) ==> vlse.v (TA)
-          if (satisfies_constraint_vu (operands[2]))
-            operands[1] = CONSTM1_RTX (<VM>mode);
-          else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
-            {
-	      // Case 2: vmv.s.x (TU) ==> andi vl + vlse.v (TU) in RV32 system.
+	{
+	  /* Case 1: vmv.s.x (TA, x == memory) ==> vlse.v (TA)  */
+	  if (satisfies_constraint_vu (operands[2]))
+	    operands[1] = CONSTM1_RTX (<VM>mode);
+	  else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode))
+	    {
+	      /* Case 2: vmv.s.x (TU, x == memory) ==>
+			   vl = 0 or 1; + vlse.v (TU) in RV32 system  */
 	      operands[4] = riscv_vector::gen_avl_for_scalar_move (operands[4]);
 	      operands[1] = CONSTM1_RTX (<VM>mode);
 	    }
-          else
-            operands[3] = force_reg (<VEL>mode, operands[3]);
+	  else
+	    /* Case 3: load x (memory) to register.  */
+	    operands[3] = force_reg (<VEL>mode, operands[3]);
 	}
     }
   else if (GET_MODE_BITSIZE (<VEL>mode) > GET_MODE_BITSIZE (Pmode)
@@ -1348,6 +1349,24 @@ (define_insn "*pred_broadcast<mode>_extended_scalar"
   [(set_attr "type" "vimov,vimov,vimovxv,vimovxv")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*pred_broadcast<mode>_zero"
+  [(set (match_operand:V 0 "register_operand"                          "=vr,    vr")
+    (if_then_else:V
+      (unspec:<VM>
+        [(match_operand:<VM> 1 "vector_least_significant_set_mask_operand" "Wb1,   Wb1")
+         (match_operand 4 "vector_length_operand"                          " rK,    rK")
+         (match_operand 5 "const_int_operand"                              "  i,     i")
+         (match_operand 6 "const_int_operand"                              "  i,     i")
+         (match_operand 7 "const_int_operand"                              "  i,     i")
+         (reg:SI VL_REGNUM)
+         (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+      (match_operand:V 3 "vector_const_0_operand"                      "Wc0,   Wc0")
+      (match_operand:V 2 "vector_merge_operand"                        " vu,     0")))]
+  "TARGET_VECTOR"
+  "vmv.s.x\t%0,zero"
+  [(set_attr "type" "vimovxv,vimovxv")
+   (set_attr "mode" "<MODE>")])
+
 ;; -------------------------------------------------------------------------------
 ;; ---- Predicated Strided loads/stores
 ;; -------------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
index db6800c89781b..2e897a4896fec 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-5.c
@@ -121,7 +121,7 @@ void foo8 (void *base, void *out, size_t vl, double x)
 /*
 ** foo9:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
 ** ...
 ** ret
 */
@@ -150,7 +150,7 @@ void foo10 (void *base, void *out, size_t vl)
 /*
 ** foo11:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -164,7 +164,7 @@ void foo11 (void *base, void *out, size_t vl)
 /*
 ** foo12:
 ** ...
-** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -174,3 +174,17 @@ void foo12 (void *base, void *out, size_t vl)
     vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0, vl);
     *(vfloat64m2_t*)out = v;
 }
+
+/*
+** foo13:
+** ...
+** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** ...
+** ret
+*/
+void foo13 (void *base, void *out, size_t vl)
+{
+    vfloat64m2_t merge = *(vfloat64m2_t*) (base + 200);
+    vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0.2, vl);
+    *(vfloat64m2_t*)out = v;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
index f27f85cdb5866..326cfd8e2ff4b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/scalar_move-6.c
@@ -119,7 +119,7 @@ void foo8 (void *base, void *out, size_t vl, double x)
 /*
 ** foo9:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
 ** ...
 ** ret
 */
@@ -133,7 +133,7 @@ void foo9 (void *base, void *out, size_t vl)
 /*
 ** foo10:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*-15
+** vmv.s.x\tv[0-9]+,\s*[a-x0-9]+
 ** ...
 */
 void foo10 (void *base, void *out, size_t vl)
@@ -147,7 +147,7 @@ void foo10 (void *base, void *out, size_t vl)
 /*
 ** foo11:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -161,7 +161,7 @@ void foo11 (void *base, void *out, size_t vl)
 /*
 ** foo12:
 ** ...
-** vmv.v.i\tv[0-9]+,\s*0
+** vmv.s.x\tv[0-9]+,\s*zero
 ** ...
 ** ret
 */
@@ -172,6 +172,20 @@ void foo12 (void *base, void *out, size_t vl)
     *(vfloat64m2_t*)out = v;
 }
 
+/*
+** foo12_1:
+** ...
+** vfmv.s.f\tv[0-9]+,\s*[a-x0-9]+
+** ...
+** ret
+*/
+void foo12_1 (void *base, void *out, size_t vl)
+{
+    vfloat64m2_t merge = *(vfloat64m2_t*) (base + 200);
+    vfloat64m2_t v = __riscv_vfmv_s_f_f64m2_tu (merge, 0.2, vl);
+    *(vfloat64m2_t*)out = v;
+}
+
 /*
 ** foo13:
 ** ...

-- 
Alexandre Oliva, happy hacker            https://FSFLA.org/blogs/lxo/
   Free Software Activist                   GNU Toolchain Engineer
More tolerance and less prejudice are key for inclusion and diversity
Excluding neuro-others for not behaving ""normal"" is *not* inclusive


end of thread, newest: 2024-02-23  7:37 UTC

Thread overview: 8+ messages
2023-08-11  9:01 [PATCH] RISC-V: Revert the convert from vmv.s.x to vmv.v.i Lehua Ding
2023-08-11 15:04 ` Jeff Law
2023-08-11 15:43   ` Lehua Ding
2023-08-11 15:48     ` Jeff Law
2023-08-11 23:02 ` Jeff Law
2023-08-12  4:09   ` Lehua Ding
2024-02-20  4:15 Alexandre Oliva
2024-02-23  7:37 ` Jeff Law
