public inbox for gcc-patches@gcc.gnu.org
* cpymem for RISCV with v extension
@ 2023-08-04 23:10 钟居哲
From: 钟居哲 @ 2023-08-04 23:10 UTC
  To: gcc-patches; +Cc: kito.cheng, kito.cheng, Jeff Law, rdapp.gcc


Could you add testcases for this patch?

+;; The (use (and (match_dup 1) (const_int 127))) is here to prevent the
+;; optimizers from changing cpymem_loop_* into this.
+(define_insn "@cpymem_straight<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "r,r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "r,r")))
+	(use (and (match_dup 1) (const_int 127)))
+   (use (match_operand:P 2 "reg_or_int_operand" "r,K"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr,&vr"))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+  "@vsetvli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)
+   vsetivli zero,%2,e<sew>,m8,ta,ma\;vle<sew>.v %3,(%1)\;vse<sew>.v %3,(%0)"
+)
+
+(define_insn "@cpymem_loop<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{ output_asm_insn ("\n0:\t" "vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+		   "vle<sew>.v %3,(%1)\;"
+		   "sub %2,%2,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("add %1,%1,%4\;"
+		   "vse<sew>.v %3,(%0)\;"
+		   "add %0,%0,%4\;"
+		   "bnez %2,0b", operands);
+  return "";
+})
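
For reference, with <sew> == 8 the output template above amounts to the following loop (no slli is emitted in that case; the registers a0-a4 and v8 are purely illustrative, the compiler allocates the actual operands):

	0:	vsetvli a4,a2,e8,m8,ta,ma	# a4 = vl = min(a2, VLMAX)
		vle8.v  v8,(a1)			# load vl bytes from src
		sub     a2,a2,a4		# count -= vl
		add     a1,a1,a4		# src += vl
		vse8.v  v8,(a0)			# store vl bytes to dst
		add     a0,a0,a4		# dst += vl
		bnez    a2,0b			# loop until count == 0
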
+
+;; This pattern (at bltu) assumes pointers can be treated as unsigned,
+;; i.e.  objects can't straddle 0xffffffffffffffff / 0x0000000000000000.
+(define_insn "@cpymem_loop_fast<P:mode><V_WHOLE:mode>"
+  [(set (mem:BLK (match_operand:P 0 "register_operand" "+r"))
+	(mem:BLK (match_operand:P 1 "register_operand" "+r")))
+   (use (match_operand:P 2 "register_operand" "+r"))
+   (clobber (match_scratch:V_WHOLE 3 "=&vr"))
+   (clobber (match_scratch:P 4 "=&r"))
+   (clobber (match_scratch:P 5 "=&r"))
+   (clobber (match_scratch:P 6 "=&r"))
+   (clobber (match_dup 0))
+   (clobber (match_dup 1))
+   (clobber (match_dup 2))
+   (clobber (reg:SI VL_REGNUM))
+   (clobber (reg:SI VTYPE_REGNUM))]
+  "TARGET_VECTOR"
+{
+  output_asm_insn ("vsetvli %4,%2,e<sew>,m8,ta,ma\;"
+		   "beq %4,%2,1f\;"
+		   "add %5,%0,%2\;"
+		   "sub %6,%5,%4", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("slli %0,%0,%1", xop);
+    }
+  output_asm_insn ("\n0:\t" "vle<sew>.v %3,(%1)\;"
+		   "add %1,%1,%4\;"
+		   "vse<sew>.v %3,(%0)\;"
+		   "add %0,%0,%4\;"
+		   "bltu %0,%6,0b\;"
+		   "sub %5,%5,%0", operands);
+  if (<sew> != 8)
+    {
+      rtx xop[2];
+      xop[0] = operands[4];
+      xop[1] = GEN_INT (exact_log2 (<sew>/8));
+      output_asm_insn ("srli %0,%0,%1", xop);
+    }
+  output_asm_insn ("vsetvli %4,%5,e<sew>,m8,ta,ma\n"
+	    "1:\t" "vle<sew>.v %3,(%1)\;"
+		   "vse<sew>.v %3,(%0)", operands);
+  return "";
+})
I don't think these patterns are necessary.

>>      considering that this code is usually memory-constrained, limit this
>>      to -O3.  ??? It would make sense to differentiate here between in-order
>>      and OOO microarchitectures.  */
>>     else if (!size_p && optimize >= 3)
>>       emit_insn (gen_cpymem_loop_fast (Pmode, vmode, dst, src, end));
>>      else
>>       emit_insn (gen_cpymem_loop (Pmode, vmode, dst, src, end));
Why not just emit the RVV pattern directly?
>> Just post the update for archival purposes and consider
>> it pre-approved for the trunk.

I am sorry, but I think this patch was approved too fast. It should be well tested first.
We should test at least the following 2 situations:

1. An unknown number of bytes to be copied by memcpy. The codegen should be a loop:

   vsetvli a5,a2,e8,m8,ta,ma
   vle
   vse
   bump counter
   branch

2. A known number of bytes to be copied by memcpy, where the byte count lets us find a VLS mode to hold it. For example, for a memcpy of 16 QImode bytes we can use V16QImode directly, and the codegen should be:

   vsetivli zero,16,...
   vle
   vse

   Three simple instructions are enough.
This patch should be well tested for these 2 situations before it is approved, since LLVM does the same thing; we should be able to match LLVM's behavior. A hypothetical test sketch covering both situations follows.
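
For illustration only, a minimal dejagnu-style test covering both situations could look like this (the file name, options and scan patterns are my assumptions, not part of the patch):

/* { dg-do compile } */
/* { dg-options "-O2 -march=rv64gcv -mabi=lp64d" } */

#include <string.h>

/* Situation 1: size unknown at compile time -- expect the vsetvli-based
   loop (vsetvli/vle8.v/vse8.v with pointer bumps and a branch).  */
void
copy_unknown (char *dst, const char *src, size_t n)
{
  memcpy (dst, src, n);
}

/* Situation 2: known 16-byte copy -- expect a straight-line
   vsetivli zero,16 / vle8.v / vse8.v sequence with no loop.  */
void
copy_16 (char *dst, const char *src)
{
  memcpy (dst, src, 16);
}

/* { dg-final { scan-assembler "vle8\\.v" } } */
/* { dg-final { scan-assembler "vse8\\.v" } } */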


juzhe.zhong@rivai.ai


Thread overview: 12+ messages
2023-08-04 23:10 cpymem for RISCV with v extension 钟居哲
2023-08-04 23:17 ` Jeff Law
2023-08-04 23:34   ` 钟居哲
2023-08-15  8:12     ` Joern Rennecke
2023-08-15  9:16       ` juzhe.zhong
2023-08-15 14:06         ` Jeff Law
2023-10-02  2:43           ` [RISC-V]: " Joern Rennecke
2023-10-04 17:38             ` Patrick O'Neill
2023-10-04 19:19               ` Joern Rennecke
2023-10-04 21:35                 ` Patrick O'Neill
2023-08-15 14:04       ` Jeff Law
2023-08-04 23:44   ` 钟居哲
