cpymem for RISCV with v extension

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* cpymem for RISCV with v extension
@ 2023-08-04 23:10 钟居哲
  2023-08-04 23:17 ` Jeff Law
  0 siblings, 1 reply; 16+ messages in thread
From: 钟居哲 @ 2023-08-04 23:10 UTC (permalink / raw)
  To: gcc-patches; +Cc: kito.cheng, kito.cheng, Jeff Law, rdapp.gcc

[-- Attachment #1: Type: text/plain, Size: 4537 bytes --]

Could you add testcases for this patch?

+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+	(use (and (match_dup 1) (const_int 127)))
+   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
+   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
+)
+
+(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+		   "vle<sew>.v %3,(%1)\;"
+		   "sub %2,%2,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("add %1,%1,%4\;"
+		   "vse<sew>.v %3,(%0)\;"
+		   "add %0,%0,%4\;"
+		   "bnez %2,0b", operands);
+  return "";
+})
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000 .
+(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_scratch:P 5 "=&r"))
+   (clobber (match_scratch:P 6 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{
+  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+		   "beq %4,%2,1f\;"
+		   "add %5,%0,%2\;"
+		   "sub %6,%5,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
+		   "add %1,%1,%4\;"
+		   "vse<sew>.v %3,(%0)\;"
+		   "add %0,%0,%4\;"
>>  		   "bltu %0,%6,0b\;"
>>  		   "sub %5,%5,%0", operands);
>>   if (<sew> != 8)
>>     {
>>       rtx xop[2];
>>       xop[0] = operands[4];
>>       xop[1] = GEN_INT (exact_log2 (<sew>/8));
>>       output_asm_insn ("srli %0,%0,%1", xop);
>>      }
>>   output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
>>  	    "1:\t" "vle<sew>.v %3,(%1)\;"
>>  		   "vse<sew>.v %3,(%0)", operands);
>>   return "";
>>  })
I don't think they are necessary.

>>      considering that this code is usually memory-constrainted, limit this
>>      to -O3.  ??? It would make sense to differentiate here between in-order
>>     and OOO microarchitectures.  */
>>     else if (!size_p && optimize >= 3)
>>       emit_insn (gen_cpymem_loop_fast (Pmode, vmode, dst, src, end));
>>      else
>>       emit_insn (gen_cpymem_loop (Pmode, vmode, dst, src, end));
Why not just emit the RVV pattern?
>> Just post the update for archival purposes and consider 
>> it pre-approved for the trunk.

I am so sorry, but I disagree with approving this patch so fast.
It should be well tested.

We should at least test the following 2 situations:

1. An unknown number of bytes to be memcpy'd; the codegen should be as follows:
   vsetvl a5,a2,e8,m8,ta,ma
   vle
   vse
   bump counter
   branch

2. A known number of bytes to be memcpy'd, where the number of bytes allows us to find a VLS mode to hold it.
   For example, memcpy 16 bytes QImode.
   Then, we can use V16QImode directly, and the codegen should be:
   vsetvli zero,16,....
   vle
   vse
   Simple 3 instructions are enough.

This patch should be well tested with these 2 situations before being approved, since LLVM does the same thing.
We should be able to have the same behavior as LLVM.


juzhe.zhong@rivai.ai

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: cpymem for RISCV with v extension
  2023-08-04 23:10 cpymem for RISCV with v extension 钟居哲
@ 2023-08-04 23:17 ` Jeff Law
  2023-08-04 23:34   ` 钟居哲
  2023-08-04 23:44   ` 钟居哲
  0 siblings, 2 replies; 16+ messages in thread
From: Jeff Law @ 2023-08-04 23:17 UTC (permalink / raw)
  To: 钟居哲, gcc-patches
  Cc: kito.cheng, kito.cheng, rdapp.gcc, Joern Rennecke



On 8/4/23 17:10, 钟居哲 wrote:
> Could you add testcases for this patch?
Testing what specifically?  Are you asking for correctness tests, 
performance/code quality tests?


> 
> +;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
> +;; optimizers from changing cpymem_loop_* into this.
> +(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
> +	(mem:BLK (match_operand:P 1 "register_operand" "r,r")))
> +	(use (and (match_dup 1) (const_int 127)))
> +   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
> +   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
> +)
> +
> +(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> +	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
> +   (clobber (match_scratch:P 4 "=&r"))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
> +		   "vle<sew>.v %3,(%1)\;"
> +		   "sub %2,%2,%4", operands);
> +  if (<sew> != 8)
> +    {
> +      rtx xop[2];
> +      xop[0] = operands[4];
> +      xop[1] = GEN_INT (exact_log2 (<sew>/8));
> +      output_asm_insn ("slli %0,%0,%1", xop);
> +    }
> +  output_asm_insn ("add %1,%1,%4\;"
> +		   "vse<sew>.v %3,(%0)\;"
> +		   "add %0,%0,%4\;"
> +		   "bnez %2,0b", operands);
> +  return "";
> +})
> +
> +;; This pattern (at bltu) assumes pointers can be treated as unsigned,
> +;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000 .
> +(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> +	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
> +   (clobber (match_scratch:P 4 "=&r"))
> +   (clobber (match_scratch:P 5 "=&r"))
> +   (clobber (match_scratch:P 6 "=&r"))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{
> +  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
> +		   "beq %4,%2,1f\;"
> +		   "add %5,%0,%2\;"
> +		   "sub %6,%5,%4", operands);
> +  if (<sew> != 8)
> +    {
> +      rtx xop[2];
> +      xop[0] = operands[4];
> +      xop[1] = GEN_INT (exact_log2 (<sew>/8));
> +      output_asm_insn ("slli %0,%0,%1", xop);
> +    }
> +  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
> +		   "add %1,%1,%4\;"
> +		   "vse<sew>.v %3,(%0)\;"
> +		   "add %0,%0,%4\;"
>>> 		   "bltu %0,%6,0b\;"
>>> 		   "sub %5,%5,%0", operands);
>>>   if (<sew> != 8)
>>>     {
>>>       rtx xop[2];
>>>       xop[0] = operands[4];
>>>       xop[1] = GEN_INT (exact_log2 (<sew>/8));
>>>       output_asm_insn ("srli %0,%0,%1", xop);
>>>      }
>>>   output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
>>> 	    "1:\t" "vle<sew>.v %3,(%1)\;"
>>> 		   "vse<sew>.v %3,(%0)", operands);
>>>   return "";
>>> })
> 
> I don't think they are necessary.
What specifically do you think is not necessary?


> 
>>> Just post the update for archival purposes and consider
>>> it pre-approved for the trunk.
> 
> I am so sorry that I disagree approve this patch too fast.
Umm, this patch has been queued up for at least a couple weeks now.

> 
> It should be well tested.
If you refer to Joern's message he indicated how it was tested.  Joern 
is a long time GCC developer and is well aware of how to test code.


It was tested on this set of multilibs without regressions:

>    riscv-sim
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>     riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d




> 
> 
> We should at least these 2 following situations:
> 
> 1. an unknown number bytes to be memcpy, this codegen should be as follows:
> 
> vsetvl a5,a2,e8,m8,ta,ma
> 
> vle
> 
> vse
> 
> bump counter
> 
> branch
> 
> 2. a known number bytes to be memcpy, and the number bytes allow us to 
> fine a VLS modes to hold it.
> 
> For example, memcpy 16 bytes QImode.
> 
> Then, we can use V16QImode directly, the codegen should be:
> 
> vsetvli zero,16,....
> 
> vle
> 
> vse
> 
> Simple 3 instructions are enough.
> 
> 
> This patch should be well tested with these 2 situations before approved 
> since LLVM does the same thing.
> 
> We should be able to have the same behavior as LLVM.
I'm not sure that's strictly necessary and I don't mind iterating a bit 
on performance issues as long as we don't have correctness problems.

But since you've raised concerns -- Joern don't install until we've 
resolved the questions at hand.  Thanks.

jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: Re: cpymem for RISCV with v extension
  2023-08-04 23:17 ` Jeff Law
@ 2023-08-04 23:34   ` 钟居哲
  2023-08-15  8:12     ` Joern Rennecke
  2023-08-04 23:44   ` 钟居哲
  1 sibling, 1 reply; 16+ messages in thread
From: 钟居哲 @ 2023-08-04 23:34 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: kito.cheng, kito.cheng, rdapp.gcc, Joern Rennecke

[-- Attachment #1: Type: text/plain, Size: 8630 bytes --]

>> Testing what specifically?  Are you asking for correctness tests,
>> performance/code quality tests?

Add memcpy test using RVV instructions, just like we are adding testcases for auto-vectorization support.

For example:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

void foo (int32_t * a, int32_t * b, int num)
{
  memcpy (a, b, num);
}


In my downstream LLVM/GCC codegen:
foo:
.L2:
        vsetvli a5,a2,e8,m8,ta,ma
        vle8.v  v24,(a1)
        sub     a2,a2,a5
        vse8.v  v24,(a0)
        add     a1,a1,a5
        add     a0,a0,a5
        bne     a2,zero,.L2
        ret

Another example:
void foo (int32_t * a, int32_t * b, int num)
{
  memcpy (a, b, 4);
}


My downstream LLVM/GCC assembly:

foo:
vsetvli zero,16,e8,m1,ta,ma
vle8.v v24,(a1)
vse8.v v24,(a0)
ret

>> What specifically do you think is not necessary?
> +(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
> +   (clobber (match_scratch:P 4 "=&r"))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
> +    "vle<sew>.v %3,(%1)\;"
> +    "sub %2,%2,%4", operands);
> +  if (<sew> != 8)
> +    {
> +      rtx xop[2];
> +      xop[0] = operands[4];
> +      xop[1] = GEN_INT (exact_log2 (<sew>/8));
> +      output_asm_insn ("slli %0,%0,%1", xop);
> +    }
> +  output_asm_insn ("add %1,%1,%4\;"
> +    "vse<sew>.v %3,(%0)\;"
> +    "add %0,%0,%4\;"
> +    "bnez %2,0b", operands);
> +  return "";
> +})

For example, for this pattern, we could simply emit insns with:

emit_label ...
emit_insn (gen_add...)
emit_insn (gen_pred_store...)
emit_insn (gen_add...)
emit_branch()

I don't see why it is necessary to use such an explicit pattern with explicit multi-instruction assembly output.
For more details, see "rvv-next" (a little bit different from my downstream, but the general idea is the same).



juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-08-05 07:17
To: 钟居哲; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc; Joern Rennecke
Subject: Re: cpymem for RISCV with v extension
 
 
On 8/4/23 17:10, 钟居哲 wrote:
> Could you add testcases for this patch?
Testing what specifically?  Are you asking for correctness tests, 
performance/code quality tests?
 
 
> 
> +;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
> +;; optimizers from changing cpymem_loop_* into this.
> +(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
> + (use (and (match_dup 1) (const_int 127)))
> +   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
> +   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
> +)
> +
> +(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
> +   (clobber (match_scratch:P 4 "=&r"))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
> +    "vle<sew>.v %3,(%1)\;"
> +    "sub %2,%2,%4", operands);
> +  if (<sew> != 8)
> +    {
> +      rtx xop[2];
> +      xop[0] = operands[4];
> +      xop[1] = GEN_INT (exact_log2 (<sew>/8));
> +      output_asm_insn ("slli %0,%0,%1", xop);
> +    }
> +  output_asm_insn ("add %1,%1,%4\;"
> +    "vse<sew>.v %3,(%0)\;"
> +    "add %0,%0,%4\;"
> +    "bnez %2,0b", operands);
> +  return "";
> +})
> +
> +;; This pattern (at bltu) assumes pointers can be treated as unsigned,
> +;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000 .
> +(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
> +   (clobber (match_scratch:P 4 "=&r"))
> +   (clobber (match_scratch:P 5 "=&r"))
> +   (clobber (match_scratch:P 6 "=&r"))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{
> +  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
> +    "beq %4,%2,1f\;"
> +    "add %5,%0,%2\;"
> +    "sub %6,%5,%4", operands);
> +  if (<sew> != 8)
> +    {
> +      rtx xop[2];
> +      xop[0] = operands[4];
> +      xop[1] = GEN_INT (exact_log2 (<sew>/8));
> +      output_asm_insn ("slli %0,%0,%1", xop);
> +    }
> +  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
> +    "add %1,%1,%4\;"
> +    "vse<sew>.v %3,(%0)\;"
> +    "add %0,%0,%4\;"
>>>    "bltu %0,%6,0b\;"
>>>    "sub %5,%5,%0", operands);
>>>   if (<sew> != 8)
>>>     {
>>>       rtx xop[2];
>>>       xop[0] = operands[4];
>>>       xop[1] = GEN_INT (exact_log2 (<sew>/8));
>>>       output_asm_insn ("srli %0,%0,%1", xop);
>>>      }
>>>   output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
>>>     "1:\t" "vle<sew>.v %3,(%1)\;"
>>>    "vse<sew>.v %3,(%0)", operands);
>>>   return "";
>>> })
> 
> I don't think they are necessary.
What specifically do you think is not necessary?
 
 
> 
>>> Just post the update for archival purposes and consider
>>> it pre-approved for the trunk.
> 
> I am so sorry that I disagree approve this patch too fast.
Umm, this patch has been queued up for at least a couple weeks now.
 
> 
> It should be well tested.
If you refer to Joern's message he indicated how it was tested.  Joern 
is a long time GCC developer and is well aware of how to test code.
 
 
It was tested on this set of multilibs without regressions:
 
>    riscv-sim
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>     riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
 
 
 
 
> 
> 
> We should at least these 2 following situations:
> 
> 1. an unknown number bytes to be memcpy, this codegen should be as follows:
> 
> vsetvl a5,a2,e8,m8,ta,ma
> 
> vle
> 
> vse
> 
> bump counter
> 
> branch
> 
> 2. a known number bytes to be memcpy, and the number bytes allow us to 
> fine a VLS modes to hold it.
> 
> For example, memcpy 16 bytes QImode.
> 
> Then, we can use V16QImode directly, the codegen should be:
> 
> vsetvli zero,16,....
> 
> vle
> 
> vse
> 
> Simple 3 instructions are enough.
> 
> 
> This patch should be well tested with these 2 situations before approved 
> since LLVM does the same thing.
> 
> We should be able to have the same behavior as LLVM.
I'm not sure that's strictly necessary and I don't mind iterating a bit 
on performance issues as long as we don't have correctness problems.
 
But since you've raised concerns -- Joern don't install until we've 
resolved the questions at hand.  Thanks.
 
jeff
 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: Re: cpymem for RISCV with v extension
  2023-08-04 23:34   ` 钟居哲
@ 2023-08-15  8:12     ` Joern Rennecke
  2023-08-15  9:16       ` juzhe.zhong
  2023-08-15 14:04       ` Jeff Law
  0 siblings, 2 replies; 16+ messages in thread
From: Joern Rennecke @ 2023-08-15  8:12 UTC (permalink / raw)
  To: 钟居哲
  Cc: Jeff Law, gcc-patches, kito.cheng, kito.cheng, rdapp.gcc

[-- Attachment #1: Type: text/plain, Size: 2018 bytes --]

On Sat, 5 Aug 2023 at 00:35, 钟居哲 <juzhe.zhong@rivai.ai> wrote:
>
> >> Testing what specifically?  Are you asking for correctness tests,
> >> performance/code quality tests?
>
> Add memcpy test using RVV instructions, just like we are adding testcases for auto-vectorization support.

I wanted to get in the test infrastructure first.

> void foo (int32_t * a, int32_t * b, int num)
> {
>   memcpy (a, b, num);
> }
>
>
> In my downstream LLVM/GCC codegen:
> foo:
> .L2:
>         vsetvli a5,a2,e8,m8,ta,ma
>         vle8.v  v24,(a1)
>         sub     a2,a2,a5
>         vse8.v  v24,(a0)
>         add     a1,a1,a5
>         add     a0,a0,a5
>         bne     a2,zero,.L2
>         ret

Yeah, it does that.

>
> Another example:
> void foo (int32_t * a, int32_t * b, int num)
> {
>   memcpy (a, b, 4);
> }
>
>
> My downstream LLVM/GCC assembly:
>
> foo:
> vsetvli zero,16,e8,m1,ta,ma
> vle8.v v24,(a1)
> vse8.v v24,(a0)
> ret

copying 16 bytes when asked to copy 4 is problematic.  Mine copies 4.

Note also for:
typedef struct { int a[31]; } s;

void foo (s *a, s *b)
{
  *a = *b;
}

You get:

        vsetivli        zero,31,e32,m8,ta,ma
        vle32.v v8,0(a1)
        vse32.v v8,0(a0)

Using memcpy, the compiler unfortunately discards the alignment.

> emit_insn (gen_pred_store...)

Thanks for pointing me in the right direction.  From the naming of the
patterns, the dearth of comments, and the default behaviour of the
compiler when optimizing with generic optimization options (i.e. no
vectorization) I had assumed that the infrastructure was still
missing.

I have attached a re-worked patch that uses pred_mov / pred_store and
is adapted to the refactored modes.
It lacks the strength reduction of the opaque pattern version for -O3,
though.  Would people also like to see that expanded into RTL?  Or
should I just drop in the opaque pattern for that?  Or not at all,
because everyone uses Superscalar Out-Of-Order execution?

[-- Attachment #2: cpymem-20230815.txt --]
[-- Type: text/plain, Size: 9251 bytes --]

commit 1f4b7a8e6798acab1f79de38e85d9d080a76eb4a
Author: Joern Rennecke <joern.rennecke@embecosm.com>
Date:   Tue Aug 15 08:18:53 2023 +0100

    cpymem using pred_mov / pred_store and adapted to mode refactoring.
    
    2023-07-12  Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
                Joern Rennecke  <joern.rennecke@embecosm.com>
    
    gcc/
            * config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
            Declare.
            * config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
            New function.
            * config/riscv/riscv.md (cpymemsi): Use riscv_vector::expand_block_move.
            Change to ..
            (cpymem<P:mode>) .. this.

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 2fbed04ff84..70ffdcdf180 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -315,6 +315,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
 			  machine_mode, rtx *);
 rtx gen_avl_for_scalar_move (rtx);
 void expand_tuple_move (rtx *);
+bool expand_block_move (rtx, rtx, rtx);
 machine_mode preferred_simd_mode (scalar_mode);
 machine_mode get_mask_mode (machine_mode);
 void expand_vec_series (rtx, rtx, rtx);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 5f9b296c92e..ea96a0ef84d 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -49,6 +49,7 @@
 #include "tm-constrs.h"
 #include "rtx-vector-builder.h"
 #include "targhooks.h"
+#include "predict.h"
 
 using namespace riscv_vector;
 
@@ -2379,6 +2380,192 @@ expand_tuple_move (rtx *ops)
     }
 }
 
+/* Used by cpymemsi in riscv.md .  */
+
+bool
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+{
+  /*
+    memcpy:
+	mv a3, a0                       # Copy destination
+    loop:
+	vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
+	vle8.v v0, (a1)                 # Load bytes
+	add a1, a1, t0                  # Bump pointer
+	sub a2, a2, t0                  # Decrement count
+	vse8.v v0, (a3)                 # Store bytes
+	add a3, a3, t0                  # Bump pointer
+	bnez a2, loop                   # Any more?
+	ret                             # Return
+  */
+  if (!TARGET_VECTOR)
+    return false;
+  HOST_WIDE_INT potential_ew
+    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
+       / BITS_PER_UNIT);
+  machine_mode vmode = VOIDmode;
+  bool need_loop = true;
+  bool size_p = optimize_function_for_size_p (cfun);
+  rtx src, dst;
+  rtx end = gen_reg_rtx (Pmode);
+  rtx vec;
+  rtx length_rtx = length_in;
+
+  if (CONST_INT_P (length_in))
+    {
+      HOST_WIDE_INT length = INTVAL (length_in);
+
+    /* By using LMUL=8, we can copy as many bytes in one go as there
+       are bits in a vector register.  If the entire block thus fits,
+       we don't need a loop.  */
+    if (length <= TARGET_MIN_VLEN)
+      {
+	need_loop = false;
+
+	/* If a single scalar load / store pair can do the job, leave it
+	   to the scalar code to do that.  */
+
+	if (pow2p_hwi (length) && length <= potential_ew)
+	  return false;
+      }
+
+      /* Find the vector mode to use.  Using the largest possible element
+	 size is likely to give smaller constants, and thus potentially
+	 reducing code size.  However, if we need a loop, we need to update
+	 the pointers, and that is more complicated with a larger element
+	 size, unless we use an immediate, which prevents us from dynamically
+	 using the targets transfer size that the hart supports.  And then,
+	 unless we know the *exact* vector size of the hart, we'd need
+	 multiple vsetvli / branch statements, so it's not even a size win.
+	 If, in the future, we find an RISCV-V implementation that is slower
+	 for small element widths, we might allow larger element widths for
+	 loops too.  */
+      if (need_loop)
+	potential_ew = 1;
+      for (; potential_ew; potential_ew >>= 1)
+	{
+	  scalar_int_mode elem_mode;
+	  unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+	  unsigned HOST_WIDE_INT per_iter;
+	  HOST_WIDE_INT nunits;
+
+	  if (need_loop)
+	    per_iter = TARGET_MIN_VLEN;
+	  else
+	    per_iter = length;
+	  nunits = per_iter / potential_ew;
+
+	  /* Unless we get an implementation that's slow for small element
+	     size / non-word-aligned accesses, we assume that the hardware
+	     handles this well, and we don't want to complicate the code
+	     with shifting word contents around or handling extra bytes at
+	     the start and/or end.  So we want the total transfer size and
+	     alignment to fit with the element size.  */
+	  if (length % potential_ew != 0
+	      || !int_mode_for_size (bits, 0).exists (&elem_mode))
+	    continue;
+	  /* Find the mode to use for the copy inside the loop - or the
+	     sole copy, if there is no loop.  */
+	  if (!need_loop)
+	    {
+	      /* Try if we have an exact mode for the copy.  */
+	      if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+		break;
+	      /* We might have an odd transfer size.  Try to round it up to
+		 a power of two to get a valid vector mode for a clobber.  */
+	      for (nunits = 1ULL << ceil_log2 (nunits);
+		   nunits <= TARGET_MIN_VLEN;
+		   nunits <<= 1)
+		if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+		  break;
+
+	      if (vmode != VOIDmode)
+		break;
+	    }
+
+	  /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
+	     wide.  BYTES_PER_RISCV_VECTOR can't be eavenly divided by
+	     the sizes of larger element types; the LMUL factor of 8 can at
+	     the moment with SEW of up to 8 bytes, but there are reserved
+	     encodings so there might be larger SEW in the future.  */
+	  if (get_vector_mode (elem_mode,
+			       exact_div (BYTES_PER_RISCV_VECTOR * 8,
+					  potential_ew)).exists (&vmode))
+	    break;
+
+	  /* We may get here if we tried an element size that's larger than
+	     the hardware supports, but we should at least find a suitable
+	     byte vector mode.  */
+	  gcc_assert (potential_ew > 1);
+	}
+      if (potential_ew > 1)
+	length_rtx = GEN_INT (length / potential_ew);
+    }
+  else
+    {
+      vmode = E_RVVM8QImode;
+    }
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  When RVV should take 7 instructions and
+     we're optimizing for size a libcall may be preferable.  */
+  if (size_p && need_loop)
+    return false;
+
+  rtx cnt = length_rtx;
+  rtx label = NULL_RTX;
+  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
+
+  if (need_loop)
+    {
+      length_rtx = copy_to_mode_reg (Pmode, length_rtx);
+      label = gen_label_rtx ();
+
+      emit_label (label);
+      emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx));
+    }
+
+  vec = gen_reg_rtx (vmode);
+  src = change_address (src_in, vmode, src_addr);
+  dst = change_address (dst_in, vmode, dst_addr);
+
+  /* If we don't need a loop and have a suitable mode to describe the size,
+     just do a load / store pair and leave it up to the later lazy code
+     motion pass to insert the appropriate vsetvli.  */
+  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_move_insn (vec, src);
+      emit_move_insn (dst, vec);
+    }
+  else
+    {
+      machine_mode mask_mode = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require ();
+      rtx mask =  CONSTM1_RTX (mask_mode);
+      if (!satisfies_constraint_K (cnt))
+	cnt= force_reg (Pmode, cnt);
+      rtx m_ops[] = {vec, mask, RVV_VUNDEF (vmode), src};
+      emit_nonvlmax_masked_insn (code_for_pred_mov (vmode), RVV_UNOP_M,
+				 m_ops, cnt);
+      emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
+				 get_avl_type_rtx (NONVLMAX)));
+    }
+
+  if (need_loop)
+    {
+      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+      emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt)));
+
+      /* Emit the loop condition.  */
+      rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx);
+      emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
+      emit_insn (gen_nop ());
+    }
+
+  return true;
+}
+
 /* Return the vectorization machine mode for RVV according to LMUL.  */
 machine_mode
 preferred_simd_mode (scalar_mode mode)
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index b456fa6abb3..5b63da4d93b 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2149,14 +2149,16 @@
   DONE;
 })
 
-(define_expand "cpymemsi"
+(define_expand "cpymem<mode>"
   [(parallel [(set (match_operand:BLK 0 "general_operand")
 		   (match_operand:BLK 1 "general_operand"))
-	      (use (match_operand:SI 2 ""))
+	      (use (match_operand:P 2 ""))
 	      (use (match_operand:SI 3 "const_int_operand"))])]
   ""
 {
-  if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
+  if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2]))
+    DONE;
+  else if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
     DONE;
   else
     FAIL;

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: Re: cpymem for RISCV with v extension
  2023-08-15  8:12     ` Joern Rennecke
@ 2023-08-15  9:16       ` juzhe.zhong
  2023-08-15 14:06         ` Jeff Law
  2023-08-15 14:04       ` Jeff Law
  1 sibling, 1 reply; 16+ messages in thread
From: juzhe.zhong @ 2023-08-15  9:16 UTC (permalink / raw)
  To: joern.rennecke
  Cc: jeffreyalaw, gcc-patches, kito.cheng, Kito.cheng, Robin Dapp

[-- Attachment #1: Type: text/plain, Size: 2437 bytes --]

The new  patch looks reasonable to me now. Thanks for fixing it.

Could you append testcase after finishing test infrastructure ?
I prefer this patch with testcase after infrastructure. 

Thanks.

juzhe.zhong@rivai.ai

From: Joern Rennecke
Date: 2023-08-15 16:12
To: 钟居哲
CC: Jeff Law; gcc-patches; kito.cheng; kito.cheng; rdapp.gcc
Subject: Re: Re: cpymem for RISCV with v extension
On Sat, 5 Aug 2023 at 00:35, 钟居哲 <juzhe.zhong@rivai.ai> wrote:
>
> >> Testing what specifically?  Are you asking for correctness tests,
> >> performance/code quality tests?
>
> Add memcpy test using RVV instructions, just like we are adding testcases for auto-vectorization support.

I wanted to get in the test infrastructure first.

> void foo (int32_t * a, int32_t * b, int num)
> {
>   memcpy (a, b, num);
> }
>
>
> In my downstream LLVM/GCC codegen:
> foo:
> .L2:
>         vsetvli a5,a2,e8,m8,ta,ma
>         vle8.v  v24,(a1)
>         sub     a2,a2,a5
>         vse8.v  v24,(a0)
>         add     a1,a1,a5
>         add     a0,a0,a5
>         bne     a2,zero,.L2
>         ret

Yeah, it does that.

>
> Another example:
> void foo (int32_t * a, int32_t * b, int num)
> {
>   memcpy (a, b, 4);
> }
>
>
> My downstream LLVM/GCC assembly:
>
> foo:
> vsetvli zero,16,e8,m1,ta,ma
> vle8.v v24,(a1)
> vse8.v v24,(a0)
> ret

copying 16 bytes when asked to copy 4 is problematic.  Mine copies 4.

Note also for:
typedef struct { int a[31]; } s;

void foo (s *a, s *b)
{
  *a = *b;
}

You get:

        vsetivli        zero,31,e32,m8,ta,ma
        vle32.v v8,0(a1)
        vse32.v v8,0(a0)

Using memcpy, the compiler unfortunately discards the alignment.

> emit_insn (gen_pred_store...)

Thanks for pointing me in the right direction.  From the naming of the
patterns, the dearth of comments, and the default behaviour of the
compiler when optimizing with generic optimization options (i.e. no
vectorization) I had assumed that the infrastructure was still
missing.

I have attached a re-worked patch that uses pred_mov / pred_store and
is adapted to the refactored modes.
It lacks the strength reduction of the opaque pattern version for -O3,
though.  Would people also like to see that expanded into RTL?  Or
should I just drop in the opaque pattern for that?  Or not at all,
because everyone uses Superscalar Out-Of-Order execution?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: cpymem for RISCV with v extension
  2023-08-15  9:16       ` juzhe.zhong
@ 2023-08-15 14:06         ` Jeff Law
  2023-10-02  2:43           ` [RISC-V]: " Joern Rennecke
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Law @ 2023-08-15 14:06 UTC (permalink / raw)
  To: juzhe.zhong, joern.rennecke
  Cc: gcc-patches, kito.cheng, Kito.cheng, Robin Dapp



On 8/15/23 03:16, juzhe.zhong@rivai.ai wrote:
> The new  patch looks reasonable to me now. Thanks for fixing it.
> 
> Could you append testcase after finishing test infrastructure ?
> I prefer this patch with testcase after infrastructure.
So let's call this an ACK, but ask that Joern not commit until the 
testsuite bits are in place.


jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [RISC-V]: Re: cpymem for RISCV with v extension
  2023-08-15 14:06         ` Jeff Law
@ 2023-10-02  2:43           ` Joern Rennecke
  2023-10-04 17:38             ` Patrick O'Neill
  0 siblings, 1 reply; 16+ messages in thread
From: Joern Rennecke @ 2023-10-02  2:43 UTC (permalink / raw)
  To: GCC Patches; +Cc: juzhe.zhong, kito.cheng, Kito.cheng, Robin Dapp, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 584 bytes --]

On Tue, 15 Aug 2023 at 15:06, Jeff Law <jeffreyalaw@gmail.com> wrote:
 >
> On 8/15/23 03:16, juzhe.zhong@rivai.ai wrote:
> > The new  patch looks reasonable to me now. Thanks for fixing it.
> >
> > Could you append testcase after finishing test infrastructure ?
> > I prefer this patch with testcase after infrastructure.
> So let's call this an ACK, but ask that Joern not commit until the
> testsuite bits are in place.

Beyond the adding of tests, the patch needed some changes because of the
refactoring of the emit_{vlmax,nonvlmax}_xxx functions.
Attached is the committed version.

[-- Attachment #2: cpymem-20231002.txt --]
[-- Type: text/plain, Size: 13537 bytes --]

commit 9464e72bcc9123b619215af8cfef491772a3ebd9
Author: Joern Rennecke <joern.rennecke@embecosm.com>
Date:   Mon Oct 2 03:16:09 2023 +0100

    cpymem for RISC-V with v extension
    
    gcc/
            * config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
            Declare.
            * config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
            New function.
            * config/riscv/riscv.md (cpymemsi): Use riscv_vector::expand_block_move.
            Change to ..
            (cpymem<P:mode>) .. this.
    
    gcc/testsuite/
            * gcc.target/riscv/rvv/base/cpymem-1.c: New test.
            * gcc.target/riscv/rvv/base/cpymem-2.c: Likewise.
    
    Co-Authored-By: Juzhe-Zhong <juzhe.zhong@rivai.ai>

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index af5baf37e6a..43426a5326b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -492,6 +492,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
 			  machine_mode, rtx *);
 rtx gen_avl_for_scalar_move (rtx);
 void expand_tuple_move (rtx *);
+bool expand_block_move (rtx, rtx, rtx);
 machine_mode preferred_simd_mode (scalar_mode);
 machine_mode get_mask_mode (machine_mode);
 void expand_vec_series (rtx, rtx, rtx);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 097457562bd..29e138e1da2 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -49,6 +49,7 @@
 #include "tm-constrs.h"
 #include "rtx-vector-builder.h"
 #include "targhooks.h"
+#include "predict.h"
 
 using namespace riscv_vector;
 
@@ -1991,6 +1992,206 @@ expand_tuple_move (rtx *ops)
     }
 }
 
+/* Expand a block copy using RVV loads / stores; return false if not handled.  Used by cpymem<mode> in riscv.md .  */
+
+bool
+expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
+{
+  /*
+    memcpy:
+	mv a3, a0                       # Copy destination
+    loop:
+	vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
+	vle8.v v0, (a1)                 # Load bytes
+	add a1, a1, t0                  # Bump pointer
+	sub a2, a2, t0                  # Decrement count
+	vse8.v v0, (a3)                 # Store bytes
+	add a3, a3, t0                  # Bump pointer
+	bnez a2, loop                   # Any more?
+	ret                             # Return
+  */
+  if (!TARGET_VECTOR)
+    return false;
+  HOST_WIDE_INT potential_ew
+    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
+       / BITS_PER_UNIT);
+  machine_mode vmode = VOIDmode;
+  bool need_loop = true;
+  bool size_p = optimize_function_for_size_p (cfun);
+  rtx src, dst;
+  rtx end = gen_reg_rtx (Pmode);
+  rtx vec;
+  rtx length_rtx = length_in;
+
+  if (CONST_INT_P (length_in))
+    {
+      HOST_WIDE_INT length = INTVAL (length_in);
+
+    /* By using LMUL=8, we can copy as many bytes in one go as there
+       are bits in a vector register.  If the entire block thus fits,
+       we don't need a loop.  */
+    if (length <= TARGET_MIN_VLEN)
+      {
+	need_loop = false;
+
+	/* If a single scalar load / store pair can do the job, leave it
+	   to the scalar code to do that.  */
+	/* ??? If fast unaligned access is supported, the scalar code could
+	   use suitably sized scalars irrespective of alignment.  If that
+	   gets fixed, we have to adjust the test here.  */
+
+	if (pow2p_hwi (length) && length <= potential_ew)
+	  return false;
+      }
+
+      /* Find the vector mode to use.  Using the largest possible element
+	 size is likely to give smaller constants, and thus potentially
+	 reducing code size.  However, if we need a loop, we need to update
+	 the pointers, and that is more complicated with a larger element
+	 size, unless we use an immediate, which prevents us from dynamically
+	 using the target's transfer size that the hart supports.  And then,
+	 unless we know the *exact* vector size of the hart, we'd need
+	 multiple vsetvli / branch statements, so it's not even a size win.
+	 If, in the future, we find an RVV implementation that is slower
+	 for small element widths, we might allow larger element widths for
+	 loops too.  */
+      if (need_loop)
+	potential_ew = 1;
+      for (; potential_ew; potential_ew >>= 1)
+	{
+	  scalar_int_mode elem_mode;
+	  unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+	  unsigned HOST_WIDE_INT per_iter;
+	  HOST_WIDE_INT nunits;
+
+	  if (need_loop)
+	    per_iter = TARGET_MIN_VLEN;
+	  else
+	    per_iter = length;
+	  nunits = per_iter / potential_ew;
+
+	  /* Unless we get an implementation that's slow for small element
+	     size / non-word-aligned accesses, we assume that the hardware
+	     handles this well, and we don't want to complicate the code
+	     with shifting word contents around or handling extra bytes at
+	     the start and/or end.  So we want the total transfer size and
+	     alignment to fit with the element size.  */
+	  if (length % potential_ew != 0
+	      || !int_mode_for_size (bits, 0).exists (&elem_mode))
+	    continue;
+	  /* Find the mode to use for the copy inside the loop - or the
+	     sole copy, if there is no loop.  */
+	  if (!need_loop)
+	    {
+	      /* Try if we have an exact mode for the copy.  */
+	      if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+		break;
+	      /* Since we don't have a mode that exactly matches the transfer
+		 size, we'll need to use pred_store, which is not available
+		 for all vector modes, but only iE_RVV_M* modes, hence trying
+		 to find a vector mode for a merely rounded-up size is
+		 pointless.
+		 Still, by choosing a lower LMUL factor that still allows
+		 an entire transfer, we can reduce register pressure.  */
+	      for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
+		if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
+		    /* Avoid losing the option of using vsetivli .  */
+		    && (nunits <= 31 * lmul || nunits > 31 * 8)
+		    && (get_vector_mode
+			 (elem_mode,
+			  exact_div (BYTES_PER_RISCV_VECTOR * lmul,
+				     potential_ew)
+			  ).exists (&vmode)))
+		  break;
+	    }
+
+	  /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
+	     wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by
+	     the sizes of larger element types; the LMUL factor of 8 can at
+	     the moment be divided by the SEW, with SEW of up to 8 bytes,
+	     but there are reserved encodings so there might be larger
+	     SEW in the future.  */
+	  if (get_vector_mode (elem_mode,
+			       exact_div (BYTES_PER_RISCV_VECTOR * 8,
+					  potential_ew)).exists (&vmode))
+	    break;
+
+	  /* We may get here if we tried an element size that's larger than
+	     the hardware supports, but we should at least find a suitable
+	     byte vector mode.  */
+	  gcc_assert (potential_ew > 1);
+	}
+      if (potential_ew > 1)
+	length_rtx = GEN_INT (length / potential_ew);
+    }
+  else
+    {
+      vmode = E_RVVM8QImode;
+    }
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  When RVV should take 7 instructions and
+     we're optimizing for size a libcall may be preferable.  */
+  if (size_p && need_loop)
+    return false;
+
+  /* length_rtx holds the (remaining) length of the required copy.
+     cnt holds the length we copy with the current load/store pair.  */
+  rtx cnt = length_rtx;
+  rtx label = NULL_RTX;
+  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
+
+  if (need_loop)
+    {
+      length_rtx = copy_to_mode_reg (Pmode, length_rtx);
+      cnt = gen_reg_rtx (Pmode);
+      label = gen_label_rtx ();
+
+      emit_label (label);
+      emit_insn (gen_no_side_effects_vsetvl_rtx (vmode, cnt, length_rtx));
+    }
+
+  vec = gen_reg_rtx (vmode);
+  src = change_address (src_in, vmode, src_addr);
+  dst = change_address (dst_in, vmode, dst_addr);
+
+  /* If we don't need a loop and have a suitable mode to describe the size,
+     just do a load / store pair and leave it up to the later lazy code
+     motion pass to insert the appropriate vsetvli.  */
+  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_move_insn (vec, src);
+      emit_move_insn (dst, vec);
+    }
+  else
+    {
+      machine_mode mask_mode = get_vector_mode (BImode, GET_MODE_NUNITS (vmode)).require ();
+      rtx mask =  CONSTM1_RTX (mask_mode);
+      if (!satisfies_constraint_K (cnt))
+	cnt= force_reg (Pmode, cnt);
+      rtx m_ops[] = {vec, mask, src};
+      emit_nonvlmax_insn (code_for_pred_mov (vmode), UNARY_OP_TAMA,
+			  m_ops, cnt);
+      emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
+				 get_avl_type_rtx (NONVLMAX)));
+    }
+
+  if (need_loop)
+    {
+      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
+      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
+      emit_insn (gen_rtx_SET (length_rtx, gen_rtx_MINUS (Pmode, length_rtx, cnt)));
+
+      /* Emit the loop condition.  NOTE(review): END is never set; presumably only TEST's NE code is used -- confirm.  */
+      rtx test = gen_rtx_NE (VOIDmode, end, const0_rtx);
+      emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx, label));
+      emit_insn (gen_nop ());
+    }
+
+  return true;
+}
+
 /* Return the vectorization machine mode for RVV according to LMUL.  */
 machine_mode
 preferred_simd_mode (scalar_mode mode)
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index e00b8ee3579..1ebe8f92284 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2271,14 +2271,16 @@
   DONE;
 })
 
-(define_expand "cpymemsi"
+(define_expand "cpymem<mode>"
   [(parallel [(set (match_operand:BLK 0 "general_operand")
 		   (match_operand:BLK 1 "general_operand"))
-	      (use (match_operand:SI 2 ""))
+	      (use (match_operand:P 2 ""))
 	      (use (match_operand:SI 3 "const_int_operand"))])]
   ""
 {
-  if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
+  if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2]))
+    DONE;
+  else if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
     DONE;
   else
     FAIL;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
new file mode 100644
index 00000000000..9bb4904e8e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-1.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#if 0 /* Using include files when using a multilib-relevant -march option is dicey */
+#include <string.h>
+#else
+extern void *memcpy(void *__restrict dest, const void *__restrict src, __SIZE_TYPE__ n);
+#endif
+
+/* memcpy should be implemented using the cpymem pattern.
+** f1:
+XX	\.L\d+: # local label is ignored
+**	vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**	vle8\.v\s+v\d+,0\(a1\)
+**	vse8\.v\s+v\d+,0\(a0\)
+**	add\s+a1,a1,[ta][0-7]
+**	add\s+a0,a0,[ta][0-7]
+**	sub\s+a2,a2,[ta][0-7]
+**	bne\s+a2,zero,\.L\d+
+**	ret
+*/
+
+void f1 (void *a, void *b, __SIZE_TYPE__ l)
+{
+  memcpy (a, b, l);
+}
+
+/* We should still use cpymem even with slightly different types, as signed
+   overflow is undefined.
+** f2:
+XX	\.L\d+: # local label is ignored
+**	vsetvli\s+[ta][0-7],a2,e8,m8,ta,ma
+**	vle8\.v\s+v\d+,0\(a1\)
+**	vse8\.v\s+v\d+,0\(a0\)
+**	add\s+a1,a1,[ta][0-7]
+**	add\s+a0,a0,[ta][0-7]
+**	sub\s+a2,a2,[ta][0-7]
+**	bne\s+a2,zero,\.L\d+
+**	ret
+*/
+void f2 (__INT32_TYPE__* a, __INT32_TYPE__* b, int l)
+{
+  memcpy (a, b, l);
+}
+
+/* If it is known that the pointer arguments to memcpy point
+   to an aligned object, cpymem can use that alignment.
+   Use extern here so that we get a known alignment, lest
+   DATA_ALIGNMENT force us to make the scan pattern accommodate
+   code for different alignments depending on word size.
+** f3:
+**        lui\s+[ta][0-7],%hi\(a_a\)
+**        lui\s+[ta][0-7],%hi\(a_b\)
+**        addi\s+a4,[ta][0-7],%lo\(a_b\)
+**        vsetivli\s+zero,16,e32,m4,ta,ma
+**        vle32.v\s+v\d+,0\([ta][0-7]\)
+**        addi\s+[ta][0-7],[ta][0-7],%lo\(a_a\)
+**        vse32\.v\s+v\d+,0\([ta][0-7]\)
+**        ret
+*/
+
+extern struct { __INT32_TYPE__ a[16]; } a_a, a_b;
+
+void f3 ()
+{
+  memcpy (&a_a, &a_b, sizeof a_a);
+}
+
+/* { dg-final { scan-assembler-not {\m(tail|call)\s+memcpy\M} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
new file mode 100644
index 00000000000..7b706b6ef52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cpymem-2.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-add-options riscv_v } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef struct { char c[16]; } c16;
+typedef struct { char c[32]; } c32;
+typedef struct { short s; char c[30]; } s16;
+
+/* A short struct copy can use vsetivli.
+** f1:
+**	vsetivli\s+zero,16,e8,m1,ta,ma
+**	vle8.v\s+v1,0\(a1\)
+**	vse8.v\s+v1,0\(a0\)
+**	ret
+*/
+void f1 (c16 *a, c16* b)
+{
+  *a = *b;
+}
+
+/* A longer one needs li.
+** f2:
+**	li\s+[ta][0-7],32
+**	vsetvli\s+zero,[ta][0-7],e8,m2,ta,ma
+**	vle8.v\s+v2,0\(a1\)
+**	vse8.v\s+v2,0\(a0\)
+**	ret
+*/
+void f2 (c32 *a, c32* b)
+{
+  *a = *b;
+}
+
+/* A 32 byte struct is still short enough for vsetivli
+   if we can use an element width larger than 8.
+** f3:
+**	vsetivli\s+zero,16,e16,m2,ta,ma
+**	vle16.v\s+v2,0\(a1\)
+**	vse16.v\s+v2,0\(a0\)
+**	ret
+*/
+void f3 (s16 *a, s16* b)
+{
+  *a = *b;
+}

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RISC-V]: Re: cpymem for RISCV with v extension
  2023-10-02  2:43           ` [RISC-V]: " Joern Rennecke
@ 2023-10-04 17:38             ` Patrick O'Neill
  2023-10-04 19:19               ` Joern Rennecke
  0 siblings, 1 reply; 16+ messages in thread
From: Patrick O'Neill @ 2023-10-04 17:38 UTC (permalink / raw)
  To: Joern Rennecke, GCC Patches
  Cc: juzhe.zhong, kito.cheng, Kito.cheng, Robin Dapp, Jeff Law

Hi Joern,

I'm seeing new failures introduced by this patch 
(9464e72bcc9123b619215af8cfef491772a3ebd9).

On rv64gcv:
FAIL: gcc.dg/pr90263.c scan-assembler memcpy
FAIL: gfortran.fortran-torture/execute/intrinsic_count.f90 execution,  
-O2 -fomit-frame-pointer -finline-functions -funroll-loops

Debug log for intrinsic_count.f90:
spawn riscv64-unknown-linux-gnu-run 
/scratch/tc-testing/tc-410-break/build/build-gcc-linux-stage2/gcc/testsuite/gfortran9/intrinsic_count.x
STOP 2
FAIL: gfortran.fortran-torture/execute/intrinsic_count.f90 execution,  
-O2 -fomit-frame-pointer -finline-functions -funroll-loops

It's worth noting that intrinsic_count.f90 had failures prior to this 
patch for other option combinations:
FAIL: gfortran.fortran-torture/execute/intrinsic_count.f90 execution,  -O2
FAIL: gfortran.fortran-torture/execute/intrinsic_count.f90 execution,  
-O2 -fbounds-check
FAIL: gfortran.fortran-torture/execute/intrinsic_count.f90 execution,  
-O2 -fomit-frame-pointer -finline-functions

Thanks,
Patrick

On 10/1/23 19:43, Joern Rennecke wrote:
> On Tue, 15 Aug 2023 at 15:06, Jeff Law <jeffreyalaw@gmail.com> wrote:
>   >
>> On 8/15/23 03:16, juzhe.zhong@rivai.ai wrote:
>>> The new  patch looks reasonable to me now. Thanks for fixing it.
>>>
>>> Could you append testcase after finishing test infrastructure ?
>>> I prefer this patch with testcase after infrastructure.
>> So let's call this an ACK, but ask that Joern not commit until the
>> testsuite bits are in place.
> Beyond the adding of tests, the patch needed some changes because of the
> Refactoring of emit_{vlmax,nonvlmax}_xxx functions .
> Attached is the committed version.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RISC-V]: Re: cpymem for RISCV with v extension
  2023-10-04 17:38             ` Patrick O'Neill
@ 2023-10-04 19:19               ` Joern Rennecke
  2023-10-04 21:35                 ` Patrick O'Neill
  0 siblings, 1 reply; 16+ messages in thread
From: Joern Rennecke @ 2023-10-04 19:19 UTC (permalink / raw)
  To: Patrick O'Neill
  Cc: GCC Patches, juzhe.zhong, kito.cheng, Kito.cheng, Robin Dapp, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 790 bytes --]

On Wed, 4 Oct 2023 at 18:38, Patrick O'Neill <patrick@rivosinc.com> wrote:
>
> Hi Joern,
>
> I'm seeing new failures introduced by this patch
> (9464e72bcc9123b619215af8cfef491772a3ebd9).
>
> On rv64gcv:
> FAIL: gcc.dg/pr90263.c scan-assembler memcpy

My testing didn't flag this because I used elf targets.  The
expected behaviour now is to use vector instructions for rvv,
so we shouldn't expect memcpy to appear there.  I think the
rvv case is suitably covered by the new tests, so we just
have to avoid the failure here.  Does the attached patch work for you?

> FAIL: gfortran.fortran-torture/execute/intrinsic_count.f90 execution,
> -O2 -fomit-frame-pointer -finline-functions -funroll-loops

There seems to be an issue with my test setup regarding fortran, I'll
have to investigate.

[-- Attachment #2: tmp.txt --]
[-- Type: text/plain, Size: 512 bytes --]

diff --git a/gcc/testsuite/gcc.dg/pr90263.c b/gcc/testsuite/gcc.dg/pr90263.c
index 3222a5331c1..09e0446f45c 100644
--- a/gcc/testsuite/gcc.dg/pr90263.c
+++ b/gcc/testsuite/gcc.dg/pr90263.c
@@ -9,4 +9,4 @@ int *f (int *p, int *q, long n)
 }
 
 /* { dg-final { scan-assembler "mempcpy" { target { i?86-*-* x86_64-*-* } } } } */
-/* { dg-final { scan-assembler "memcpy" { target { ! { i?86-*-* x86_64-*-* } } } } } */
+/* { dg-final { scan-assembler "memcpy" { target { ! { i?86-*-* x86_64-*-* riscv_v } } } } } */

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RISC-V]: Re: cpymem for RISCV with v extension
  2023-10-04 19:19               ` Joern Rennecke
@ 2023-10-04 21:35                 ` Patrick O'Neill
  0 siblings, 0 replies; 16+ messages in thread
From: Patrick O'Neill @ 2023-10-04 21:35 UTC (permalink / raw)
  To: Joern Rennecke
  Cc: GCC Patches, juzhe.zhong, kito.cheng, Kito.cheng, Robin Dapp, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 1698 bytes --]

On 10/4/23 12:19, Joern Rennecke wrote:

> On Wed, 4 Oct 2023 at 18:38, Patrick O'Neill<patrick@rivosinc.com>  wrote:
>> Hi Joern,
>>
>> I'm seeing new failures introduced by this patch
>> (9464e72bcc9123b619215af8cfef491772a3ebd9).
>>
>> On rv64gcv:
>> FAIL: gcc.dg/pr90263.c scan-assembler memcpy
> My testing didn't flag this because I used elf targets.  The
> expected behaviour now is to use vector instructions for rvv.
> so we shouldn't expect memcpy to appear there.  I think the
> rvv case is suitably covered by the new tests, so we just
> have to avoid the failure here.  Does the attached patch work for you?

Thanks for the quick response. I'm glad to hear the behavior is expected :)
The attached patch works, just needed some syntax changes:
ERROR: gcc.dg/pr90263.c: error executing dg-final: syntax error in target selector "target i?86-*-* x86_64-*-* riscv_v"
Diff w/ syntax changes:
diff --git a/gcc/testsuite/gcc.dg/pr90263.c b/gcc/testsuite/gcc.dg/pr90263.c
index 3222a5331c1..4044e6b1544 100644
--- a/gcc/testsuite/gcc.dg/pr90263.c
+++ b/gcc/testsuite/gcc.dg/pr90263.c
@@ -9,4 +9,4 @@ int *f (int *p, int *q, long n)
  }

  /* { dg-final { scan-assembler "mempcpy" { target { i?86-*-* x86_64-*-* } } } } */
-/* { dg-final { scan-assembler "memcpy" { target { ! { i?86-*-* x86_64-*-* } } } } } */
+/* { dg-final { scan-assembler "memcpy" { target { ! { { i?86-*-* x86_64-*-* } || { riscv_v } } } } } } */

I'll send it as a patch shortly.

Patrick

>> FAIL: gfortran.fortran-torture/execute/intrinsic_count.f90 execution,
>> -O2 -fomit-frame-pointer -finline-functions -funroll-loops
> There seems to be an issue with my test setup regarding fortran, I'll
> have to investigate.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: cpymem for RISCV with v extension
  2023-08-15  8:12     ` Joern Rennecke
  2023-08-15  9:16       ` juzhe.zhong
@ 2023-08-15 14:04       ` Jeff Law
  1 sibling, 0 replies; 16+ messages in thread
From: Jeff Law @ 2023-08-15 14:04 UTC (permalink / raw)
  To: Joern Rennecke, 钟居哲
  Cc: gcc-patches, kito.cheng, kito.cheng, rdapp.gcc



On 8/15/23 02:12, Joern Rennecke wrote:

> It lacks the strength reduction of the opaque pattern version for -O3,
> though.  Would people also like to see that expanded into RTL?  Or
> should I just drop in the opaque pattern for that?  Or not at all,
> because everyone uses Superscalar Out-Of-Order execution?
I doubt it's going to matter all that much.  Your decision IMHO.  I'd 
like to think everyone implementing V will be OOO superscalar, but I'm 
not naive enough to believe that will hold in practice (even with the P 
extension on the way).

jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: Re: cpymem for RISCV with v extension
  2023-08-04 23:17 ` Jeff Law
  2023-08-04 23:34   ` 钟居哲
@ 2023-08-04 23:44   ` 钟居哲
  1 sibling, 0 replies; 16+ messages in thread
From: 钟居哲 @ 2023-08-04 23:44 UTC (permalink / raw)
  To: Jeff Law, gcc-patches; +Cc: kito.cheng, kito.cheng, rdapp.gcc, Joern Rennecke

[-- Attachment #1: Type: text/plain, Size: 6641 bytes --]

>> Umm, this patch has been queued up for at least a couple weeks now.

Oh. I am sorry I didn't see this patch since this patch doesn't CC me.
I didn't subscribe GCC-patch, so I may miss some patches that didn't explicitly CC me.

I just happen to see your reply email today then reply.



juzhe.zhong@rivai.ai
 
From: Jeff Law
Date: 2023-08-05 07:17
To: 钟居哲; gcc-patches
CC: kito.cheng; kito.cheng; rdapp.gcc; Joern Rennecke
Subject: Re: cpymem for RISCV with v extension
 
 
On 8/4/23 17:10, 钟居哲 wrote:
> Could you add testcases for this patch?
Testing what specifically?  Are you asking for correctness tests, 
performance/code quality tests?
 
 
> 
> +;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
> +;; optimizers from changing cpymem_loop_* into this.
> +(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "r,r")))
> + (use (and (match_dup 1) (const_int 127)))
> +   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
> +   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
> +)
> +
> +(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
> +   (clobber (match_scratch:P 4 "=&r"))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
> +    "vle<sew>.v %3,(%1)\;"
> +    "sub %2,%2,%4", operands);
> +  if (<sew> != 8)
> +    {
> +      rtx xop[2];
> +      xop[0] = operands[4];
> +      xop[1] = GEN_INT (exact_log2 (<sew>/8));
> +      output_asm_insn ("slli %0,%0,%1", xop);
> +    }
> +  output_asm_insn ("add %1,%1,%4\;"
> +    "vse<sew>.v %3,(%0)\;"
> +    "add %0,%0,%4\;"
> +    "bnez %2,0b", operands);
> +  return "";
> +})
> +
> +;; This pattern (at bltu) assumes pointers can be treated as unsigned,
> +;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000 .
> +(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
> +  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
> + (mem:BLK (match_operand:P 1 "register_operand" "+r")))
> +   (use (match_operand:P 2 "register_operand" "+r"))
> +   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
> +   (clobber (match_scratch:P 4 "=&r"))
> +   (clobber (match_scratch:P 5 "=&r"))
> +   (clobber (match_scratch:P 6 "=&r"))
> +   (clobber (match_dup 0))
> +   (clobber (match_dup 1))
> +   (clobber (match_dup 2))
> +   (clobber (reg:SI VL_REGNUM))
> +   (clobber (reg:SI VTYPE_REGNUM))]
> +  "TARGET_VECTOR"
> +{
> +  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
> +    "beq %4,%2,1f\;"
> +    "add %5,%0,%2\;"
> +    "sub %6,%5,%4", operands);
> +  if (<sew> != 8)
> +    {
> +      rtx xop[2];
> +      xop[0] = operands[4];
> +      xop[1] = GEN_INT (exact_log2 (<sew>/8));
> +      output_asm_insn ("slli %0,%0,%1", xop);
> +    }
> +  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
> +    "add %1,%1,%4\;"
> +    "vse<sew>.v %3,(%0)\;"
> +    "add %0,%0,%4\;"
>>>    "bltu %0,%6,0b\;"
>>>    "sub %5,%5,%0", operands);
>>>   if (<sew> != 8)
>>>     {
>>>       rtx xop[2];
>>>       xop[0] = operands[4];
>>>       xop[1] = GEN_INT (exact_log2 (<sew>/8));
>>>       output_asm_insn ("srli %0,%0,%1", xop);
>>>      }
>>>   output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
>>>     "1:\t" "vle<sew>.v %3,(%1)\;"
>>>    "vse<sew>.v %3,(%0)", operands);
>>>   return "";
>>> })
> 
> I don't think they are necessary.
What specifically do you think is not necessary?
 
 
> 
>>> Just post the update for archival purposes and consider
>>> it pre-approved for the trunk.
> 
> I am so sorry that I disagree approve this patch too fast.
Umm, this patch has been queued up for at least a couple weeks now.
 
> 
> It should be well tested.
If you refer to Joern's message he indicated how it was tested.  Joern 
is a long time GCC developer and is well aware of how to test code.
 
 
It was tested on this set of multilibs without regressions:
 
>    riscv-sim
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>     riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>     riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>     riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
 
 
 
 
> 
> 
> We should at least these 2 following situations:
> 
> 1. an unknown number bytes to be memcpy, this codegen should be as follows:
> 
> vsetvl a5,a2,e8,m8,ta,ma
> 
> vle
> 
> vse
> 
> bump counter
> 
> branch
> 
> 2. a known number bytes to be memcpy, and the number bytes allow us to 
> fine a VLS modes to hold it.
> 
> For example, memcpy 16 bytes QImode.
> 
> Then, we can use V16QImode directly, the codegen should be:
> 
> vsetvli zero,16,....
> 
> vle
> 
> vse
> 
> Simple 3 instructions are enough.
> 
> 
> This patch should be well tested with these 2 situations before approved 
> since LLVM does the same thing.
> 
> We should be able to have the same behavior as LLVM.
I'm not sure that's strictly necessary and I don't mind iterating a bit 
on performance issues as long as we don't have correctness problems.
 
But since you've raised concerns -- Joern don't install until we've 
resolved the questions at hand.  Thanks.
 
jeff
 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* cpymem for RISCV with v extension
@ 2023-07-18  4:47 Joern Rennecke
  2023-08-04 20:52 ` Jeff Law
  0 siblings, 1 reply; 16+ messages in thread
From: Joern Rennecke @ 2023-07-18  4:47 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1425 bytes --]

As discussed on last week's patch call, this patch uses either a
straight copy or an opaque pattern that emits the loop as assembly to
optimize cpymem for the 'v' extension.
I used Ju-Zhe Zhong's patch - starting in git with:

Author: zhongjuzhe <66454988+zhongjuzhe@users.noreply.github.com>
Date:   Mon Mar 21 14:20:42 2022 +0800

      PR for RVV support using splitted small chunks (#334)

as a starting point, even though not all that much of the original code remains.

Regression tested on x86_64-pc-linux-gnu X
    riscv-sim
    riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
    riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
    riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
    riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
    riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
    riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
    riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d

[-- Attachment #2: cpymem-diff-20230718.txt --]
[-- Type: text/plain, Size: 12304 bytes --]

2023-07-12  Ju-Zhe Zhong <juzhe.zhong@rivai.ai>
            Joern Rennecke  <joern.rennecke@embecosm.com>

	* config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
	Declare.
	* config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
	New function.
	* config/riscv/riscv.md (cpymemsi): Use riscv_vector::expand_block_move.
	* config/riscv/vector.md (@cpymem_straight<P:mode><V_WHOLE:mode>):
	New define_insn patterns.
	(@cpymem_loop<P:mode><V_WHOLE:mode>): Likewise.
	(@cpymem_loop_fast<P:mode><V_WHOLE:mode>): Likewise.

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 16fb8dabca0..40965a00681 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -301,6 +301,7 @@ bool slide1_sew64_helper (int, machine_mode, machine_mode,
 			  machine_mode, rtx *);
 rtx gen_avl_for_scalar_move (rtx);
 void expand_tuple_move (rtx *);
+bool expand_block_move (rtx, rtx, rtx);
 machine_mode preferred_simd_mode (scalar_mode);
 opt_machine_mode get_mask_mode (machine_mode);
 void expand_vec_series (rtx, rtx, rtx);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index b4884a30872..e61110fa3ad 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -49,6 +49,7 @@
 #include "tm-constrs.h"
 #include "rtx-vector-builder.h"
 #include "targhooks.h"
+#include "predict.h"
 
 using namespace riscv_vector;
 
@@ -2164,6 +2165,191 @@ expand_tuple_move (rtx *ops)
     }
 }
 
+/* Used by cpymemsi in riscv.md .  */
+
+bool
+expand_block_move (rtx dest_in, rtx src_in, rtx length_in)
+{
+  /*
+    memcpy:
+	mv a3, a0                       # Copy destination
+    loop:
+	vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
+	vle8.v v0, (a1)                 # Load bytes
+	add a1, a1, t0                  # Bump pointer
+	sub a2, a2, t0                  # Decrement count
+	vse8.v v0, (a3)                 # Store bytes
+	add a3, a3, t0                  # Bump pointer
+	bnez a2, loop                   # Any more?
+	ret                             # Return
+  */
+  if (!TARGET_VECTOR)
+    return false;
+  HOST_WIDE_INT potential_ew
+    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dest_in)), BITS_PER_WORD)
+       / BITS_PER_UNIT);
+  machine_mode vmode = VOIDmode;
+  bool need_loop = true;
+  bool size_p = optimize_function_for_size_p (cfun);
+  rtx src, dst;
+  rtx end = gen_reg_rtx (Pmode);
+  rtx vec;
+  rtx length_rtx = length_in;
+
+  if (CONST_INT_P (length_in))
+    {
+      HOST_WIDE_INT length = INTVAL (length_in);
+
+    /* By using LMUL=8, we can copy as many bytes in one go as there
+       are bits in a vector register.  If the entire block thus fits,
+       we don't need a loop.  */
+    if (length <= TARGET_MIN_VLEN)
+      {
+	need_loop = false;
+
+	/* If a single scalar load / store pair can do the job, leave it
+	   to the scalar code to do that.  */
+
+	if (pow2p_hwi (length) && length <= potential_ew)
+	  return false;
+      }
+
+      /* Find the vector mode to use.  Using the largest possible element
+	 size is likely to give smaller constants, and thus potentially
+	 reducing code size.  However, if we need a loop, we need to update
+	 the pointers, and that is more complicated with a larger element
+	 size, unless we use an immediate, which prevents us from dynamically
+	 using the largest transfer size that the hart supports.  And then,
+	 unless we know the *exact* vector size of the hart, we'd need
+	 multiple vsetvli / branch statements, so it's not even a size win.
+	 If, in the future, we find an RISCV-V implementation that is slower
+	 for small element widths, we might allow larger element widths for
+	 loops too.  */
+      if (need_loop)
+	potential_ew = 1;
+      for (; potential_ew; potential_ew >>= 1)
+	{
+	  scalar_int_mode elem_mode;
+	  unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
+	  unsigned HOST_WIDE_INT per_iter;
+	  HOST_WIDE_INT nunits;
+
+	  if (need_loop)
+	    per_iter = TARGET_MIN_VLEN;
+	  else
+	    per_iter = length;
+	  nunits = per_iter / potential_ew;
+
+	  /* Unless we get an implementation that's slow for small element
+	     size / non-word-aligned accesses, we assume that the hardware
+	     handles this well, and we don't want to complicate the code
+	     with shifting word contents around or handling extra bytes at
+	     the start and/or end.  So we want the total transfer size and
+	     alignment to fit with the element size.  */
+	  if (length % potential_ew != 0
+	      || !int_mode_for_size (bits, 0).exists (&elem_mode))
+	    continue;
+	  /* Find the mode to use for the copy inside the loop - or the
+	     sole copy, if there is no loop.  */
+	  if (!need_loop)
+	    {
+	      /* Try if we have an exact mode for the copy.  */
+	      if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+		break;
+	      /* We might have an odd transfer size.  Try to round it up to
+		 a power of two to get a valid vector mode for a clobber.  */
+	      for (nunits = 1ULL << ceil_log2 (nunits);
+		   nunits <= TARGET_MIN_VLEN;
+		   nunits <<= 1)
+		if (get_vector_mode (elem_mode, nunits).exists (&vmode))
+		  break;
+
+	      if (vmode != VOIDmode)
+		break;
+	    }
+
+	  // The VNx*?I modes have a factor of riscv_vector_chunks for nunits.
+	  if (get_vector_mode (elem_mode,
+			       TARGET_MIN_VLEN / potential_ew
+			       * riscv_vector_chunks).exists (&vmode))
+	    break;
+
+	  /* We may get here if we tried an element size that's larger than
+	     the hardware supports, but we should at least find a suitable
+	     byte vector mode.  */
+	  gcc_assert (potential_ew > 1);
+	}
+      if (potential_ew > 1)
+	length_rtx = GEN_INT (length / potential_ew);
+    }
+  else
+    {
+      vmode = (get_vector_mode (QImode, TARGET_MIN_VLEN * riscv_vector_chunks)
+	       .require ());
+    }
+
+  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
+     arguments + 1 for the call.  When RVV should take 7 instructions and
+     we're optimizing for size a libcall may be preferable.  */
+  if (size_p && need_loop)
+    return false;
+
+  /* If we don't need a loop and have a suitable mode to describe the size,
+     just do a load / store pair and leave it up to the later lazy code
+     motion pass to insert the appropriate vsetvli.  */
+  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      vec = gen_reg_rtx (vmode);
+      src = change_address (src_in, vmode, NULL);
+      dst = change_address (dest_in, vmode, NULL);
+      emit_move_insn (vec, src);
+      emit_move_insn (dst, vec);
+      return true;
+    }
+
+  if (CONST_POLY_INT_P (length_rtx))
+    {
+      if (GET_MODE (length_rtx) != Pmode)
+	{
+	  poly_int64 value = rtx_to_poly_int64 (length_rtx);
+	  emit_insn (gen_rtx_SET (end,
+				  gen_int_mode (poly_int64 (value.coeffs[0],
+							    value.coeffs[1]),
+						Pmode)));
+	}
+      else
+	emit_insn (gen_rtx_SET (end, length_rtx));
+    }
+  else
+    {
+      if (GET_MODE (length_rtx) != Pmode)
+	riscv_emit_move (end, gen_lowpart (Pmode, length_rtx));
+      else
+	riscv_emit_move (end, length_rtx);
+    }
+
+  /* Move the address into scratch registers.  */
+  dst = copy_addr_to_reg (XEXP (dest_in, 0));
+  src = copy_addr_to_reg (XEXP (src_in, 0));
+
+  /* Since we haven't implemented VLA handling in general, we emit
+     opaque patterns that output the appropriate instructions.  */
+  if (!need_loop)
+    emit_insn (gen_cpymem_straight (Pmode, vmode, dst, src, end));
+  /* The *_fast pattern needs 13 instructions instead of 7, and
+     considering that this code is usually memory-constrained, limit this
+     to -O3.  ??? It would make sense to differentiate here between in-order
+     and OOO microarchitectures.  */
+  else if (!size_p && optimize >= 3)
+    emit_insn (gen_cpymem_loop_fast (Pmode, vmode, dst, src, end));
+  else
+    emit_insn (gen_cpymem_loop (Pmode, vmode, dst, src, end));
+
+  /* A nop to attach notes to.  */
+  emit_insn (gen_nop ());
+  return true;
+}
+
 /* Return the vectorization machine mode for RVV according to LMUL.  */
 machine_mode
 preferred_simd_mode (scalar_mode mode)
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 7edef1fb546..4e596f42576 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2141,7 +2141,9 @@
 	      (use (match_operand:SI 3 "const_int_operand"))])]
   ""
 {
-  if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
+  if (riscv_vector::expand_block_move (operands[0], operands[1], operands[2]))
+    DONE;
+  else if (riscv_expand_block_move (operands[0], operands[1], operands[2]))
     DONE;
   else
     FAIL;
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 215ecb9cb58..eee58a8ff71 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -837,6 +837,97 @@
   [(set_attr "type" "vmov,vlde,vste")
    (set_attr "mode" "<VT:MODE>")])
 
+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+	(use (and (match_dup 1) (const_int 127)))
+   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
+   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
+)
+
+(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+		   "vle<sew>.v %3,(%1)\;"
+		   "sub %2,%2,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("add %1,%1,%4\;"
+		   "vse<sew>.v %3,(%0)\;"
+		   "add %0,%0,%4\;"
+		   "bnez %2,0b", operands);
+  return "";
+})
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000 .
+(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_scratch:P 5 "=&r"))
+   (clobber (match_scratch:P 6 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{
+  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+		   "beq %4,%2,1f\;"
+		   "add %5,%0,%2\;"
+		   "sub %6,%5,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
+		   "add %1,%1,%4\;"
+		   "vse<sew>.v %3,(%0)\;"
+		   "add %0,%0,%4\;"
+		   "bltu %0,%6,0b\;"
+		   "sub %5,%5,%0", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("srli %0,%0,%1", xop);
+    }
+  output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
+	    "1:\t" "vle<sew>.v %3,(%1)\;"
+		   "vse<sew>.v %3,(%0)", operands);
+  return "";
+})
+
 ;; -----------------------------------------------------------------
 ;; ---- Duplicate Operations
 ;; -----------------------------------------------------------------

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: cpymem for RISCV with v extension
  2023-07-18  4:47 Joern Rennecke
@ 2023-08-04 20:52 ` Jeff Law
  2023-08-15  1:46   ` Joern Rennecke
  0 siblings, 1 reply; 16+ messages in thread
From: Jeff Law @ 2023-08-04 20:52 UTC (permalink / raw)
  To: Joern Rennecke, GCC Patches



On 7/17/23 22:47, Joern Rennecke wrote:
> Subject:
> cpymem for RISCV with v extension
> From:
> Joern Rennecke <joern.rennecke@embecosm.com>
> Date:
> 7/17/23, 22:47
> 
> To:
> GCC Patches <gcc-patches@gcc.gnu.org>
> 
> 
> As discussed on last week's patch call, this patch uses either a
> straight copy or an opaque pattern that emits the loop as assembly to
> optimize cpymem for the 'v' extension.
> I used Ju-Zhe Zhong's patch - starting in git with:
> 
> Author: zhongjuzhe<66454988+zhongjuzhe@users.noreply.github.com>
> Date:   Mon Mar 21 14:20:42 2022 +0800
> 
>        PR for RVV support using splitted small chunks (#334)
> 
> as a starting point, even though not all that much of the original code remains.
> 
> Regression tested on x86_64-pc-linux-gnu X
>      riscv-sim
>      riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>      riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>      riscv-sim/-march=rv32imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32f
>      riscv-sim/-march=rv32imfdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=ilp32
>      riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbc_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>      riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zba_zbb_zbs_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
>      riscv-sim/-march=rv64imafdcv_zicsr_zifencei_zfh_zve32f_zve32x_zve64d_zve64f_zve64x_zvl128b_zvl32b_zvl64b/-mabi=lp64d
> 
> 
> cpymem-diff-20230718.txt
> 
> 2023-07-12  Ju-Zhe Zhong<juzhe.zhong@rivai.ai>
>              Joern Rennecke<joern.rennecke@embecosm.com>
> 
> 	* config/riscv/riscv-protos.h (riscv_vector::expand_block_move):
> 	Declare.
> 	* config/riscv/riscv-v.cc (riscv_vector::expand_block_move):
> 	New function.
> 	* config/riscv/riscv.md (cpymemsi): Use riscv_vector::expand_block_move.
> 	* config/riscv/vector.md (@cpymem_straight<P:mode><V_WHOLE:mode>):
> 	New define_insn patterns.
> 	(@cpymem_loop<P:mode><V_WHOLE:mode>): Likewise.
> 	(@cpymem_loop_fast<P:mode><V_WHOLE:mode>): Likewise.
> 

> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index b4884a30872..e61110fa3ad 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -49,6 +49,7 @@
>   #include "tm-constrs.h"
>   #include "rtx-vector-builder.h"
>   #include "targhooks.h"
> +#include "predict.h"
Not sure this is needed, but I didn't scan for it explicitly.  If it's 
not needed, then remove it.



> +  if (CONST_INT_P (length_in))
> +    {
> +      HOST_WIDE_INT length = INTVAL (length_in);
> +
> +    /* By using LMUL=8, we can copy as many bytes in one go as there
> +       are bits in a vector register.  If the entire block thus fits,
> +       we don't need a loop.  */
> +    if (length <= TARGET_MIN_VLEN)
> +      {
> +	need_loop = false;
> +
> +	/* If a single scalar load / store pair can do the job, leave it
> +	   to the scalar code to do that.  */
> +
> +	if (pow2p_hwi (length) && length <= potential_ew)
> +	  return false;
> +      }
We could probably argue over the threshold for doing the copy on the 
scalar side, but I don't think it's necessary.  Once we start seeing V 
hardware we can revisit.


> +
> +      /* Find the vector mode to use.  Using the largest possible element
> +	 size is likely to give smaller constants, and thus potentially
> +	 reducing code size.  However, if we need a loop, we need to update
> +	 the pointers, and that is more complicated with a larger element
> +	 size, unless we use an immediate, which prevents us from dynamically
> +	 using the largets transfer size that the hart supports.  And then,
> +	 unless we know the*exact*  vector size of the hart, we'd need
> +	 multiple vsetvli / branch statements, so it's not even a size win.
> +	 If, in the future, we find an RISCV-V implementation that is slower
> +	 for small element widths, we might allow larger element widths for
> +	 loops too.  */
s/largets/largest/

And a space is missing in "the*exact*"

Note that I think the proposed glibc copier does allow larger elements 
widths for this case.

> +
> +	  /* Unless we get an implementation that's slow for small element
> +	     size / non-word-aligned accesses, we assume that the hardware
> +	     handles this well, and we don't want to complicate the code
> +	     with shifting word contents around or handling extra bytes at
> +	     the start and/or end.  So we want the total transfer size and
> +	     alignemnt to fit with the element size.  */
s/alignemnt/alignment/

Yes, let's not try to support every uarch we can envision and instead do 
a good job on the uarches we know about.    If a uarch with slow element 
or non-word aligned accesses comes along, they can propose changes at 
that time.



> +
> +	  // The VNx*?I modes have a factor of riscv_vector_chunks for nunits.
Comment might need updating after the recent work to adjust the modes. 
I don't recall if we kept the VNx*?I modes or not.

So the adjustments are all comment related, so this is OK after fixing 
the comments.  Just post the update for archival purposes and consider 
it pre-approved for the trunk.

Thanks and sorry for the wait Joern.

jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: cpymem for RISCV with v extension
  2023-08-04 20:52 ` Jeff Law
@ 2023-08-15  1:46   ` Joern Rennecke
  2023-08-15 13:46     ` Jeff Law
  0 siblings, 1 reply; 16+ messages in thread
From: Joern Rennecke @ 2023-08-15  1:46 UTC (permalink / raw)
  To: Jeff Law; +Cc: GCC Patches

On Fri, 4 Aug 2023 at 21:52, Jeff Law <jeffreyalaw@gmail.com> wrote:

> > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> > index b4884a30872..e61110fa3ad 100644
> > --- a/gcc/config/riscv/riscv-v.cc
> > +++ b/gcc/config/riscv/riscv-v.cc
> > @@ -49,6 +49,7 @@
> >   #include "tm-constrs.h"
> >   #include "rtx-vector-builder.h"
> >   #include "targhooks.h"
> > +#include "predict.h"
> Not sure this is needed, but I didn't scan for it explicitly.  If it's
> not needed, then remove it.

It is needed to declare optimize_function_for_size_p .

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: cpymem for RISCV with v extension
  2023-08-15  1:46   ` Joern Rennecke
@ 2023-08-15 13:46     ` Jeff Law
  0 siblings, 0 replies; 16+ messages in thread
From: Jeff Law @ 2023-08-15 13:46 UTC (permalink / raw)
  To: Joern Rennecke; +Cc: GCC Patches



On 8/14/23 19:46, Joern Rennecke wrote:
> On Fri, 4 Aug 2023 at 21:52, Jeff Law <jeffreyalaw@gmail.com> wrote:
> 
>>> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
>>> index b4884a30872..e61110fa3ad 100644
>>> --- a/gcc/config/riscv/riscv-v.cc
>>> +++ b/gcc/config/riscv/riscv-v.cc
>>> @@ -49,6 +49,7 @@
>>>    #include "tm-constrs.h"
>>>    #include "rtx-vector-builder.h"
>>>    #include "targhooks.h"
>>> +#include "predict.h"
>> Not sure this is needed, but I didn't scan for it explicitly.  If it's
>> not needed, then remove it.
> 
> It is needed to declare optimize_function_for_size_p .
Obviously a trivial nit.  Thanks for tracking it down.

jeff

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2023-10-04 21:35 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-04 23:10 cpymem for RISCV with v extension 钟居哲
2023-08-04 23:17 ` Jeff Law
2023-08-04 23:34   ` 钟居哲
2023-08-15  8:12     ` Joern Rennecke
2023-08-15  9:16       ` juzhe.zhong
2023-08-15 14:06         ` Jeff Law
2023-10-02  2:43           ` [RISC-V]: " Joern Rennecke
2023-10-04 17:38             ` Patrick O'Neill
2023-10-04 19:19               ` Joern Rennecke
2023-10-04 21:35                 ` Patrick O'Neill
2023-08-15 14:04       ` Jeff Law
2023-08-04 23:44   ` 钟居哲
  -- strict thread matches above, loose matches on Subject: below --
2023-07-18  4:47 Joern Rennecke
2023-08-04 20:52 ` Jeff Law
2023-08-15  1:46   ` Joern Rennecke
2023-08-15 13:46     ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).