* [PATCH] Support udot_prodv*qi with emulation sdot_prodv*hi
@ 2023-12-04 7:01 liuhongt
0 siblings, 0 replies; only message in thread
From: liuhongt @ 2023-12-04 7:01 UTC (permalink / raw)
To: gcc-patches; +Cc: crazylht, hjl.tools
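Like r14-5990-gb4a7c1c8c59d19, but this patch optimizes udot_prod instead.
The emulation is valid because (zero_extend) (unsigned char) -> int is equivalent
to (zero_extend) (unsigned char) -> short followed by
(sign_extend) (short) -> int: the zero-extended short is always in [0, 255], so it is never negative and sign extension preserves its value.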
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.
It should be safe to emulate udot_prodv*qi with the following sequence (shown here for the V32QI case):
vec_unpacku_lo_v32qi
vec_unpacku_lo_v32qi
vec_unpacku_hi_v32qi
vec_unpacku_hi_v32qi
sdot_prodv16hi
sdot_prodv16hi
addv8si3
gcc/ChangeLog:
* config/i386/sse.md (udot_prodv64qi): New expander.
(udot_prod<mode>): Emulates with VEC_UNPACKU_EXPR +
DOT_PROD (short, int).
gcc/testsuite/ChangeLog:
* gcc.target/i386/udotprodint8_emulate.c: New test.
---
gcc/config/i386/sse.md | 82 ++++++++++++++++---
.../gcc.target/i386/udotprodint8_emulate.c | 15 ++++
2 files changed, 85 insertions(+), 12 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a1d4fec42a2..3244cef483a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30835,20 +30835,78 @@ (define_expand "sdot_prodv64qi"
(define_expand "udot_prod<mode>"
[(match_operand:<ssedvecmode> 0 "register_operand")
- (match_operand:VI1 1 "register_operand")
- (match_operand:VI1 2 "register_operand")
+ (match_operand:VI1_AVX2 1 "register_operand")
+ (match_operand:VI1_AVX2 2 "register_operand")
(match_operand:<ssedvecmode> 3 "register_operand")]
- "TARGET_AVXVNNIINT8"
+ "TARGET_SSE2"
{
- operands[1] = lowpart_subreg (<ssedvecmode>mode,
- force_reg (<MODE>mode, operands[1]),
- <MODE>mode);
- operands[2] = lowpart_subreg (<ssedvecmode>mode,
- force_reg (<MODE>mode, operands[2]),
- <MODE>mode);
- emit_insn (gen_rtx_SET (operands[0], operands[3]));
- emit_insn (gen_vpdpbuud_<ssedvecmodelower> (operands[0], operands[3],
- operands[1], operands[2]));
+ if (TARGET_AVXVNNIINT8)
+ {
+ operands[1] = lowpart_subreg (<ssedvecmode>mode,
+ force_reg (<MODE>mode, operands[1]),
+ <MODE>mode);
+ operands[2] = lowpart_subreg (<ssedvecmode>mode,
+ force_reg (<MODE>mode, operands[2]),
+ <MODE>mode);
+ emit_insn (gen_rtx_SET (operands[0], operands[3]));
+ emit_insn (gen_vpdpbuud_<ssedvecmodelower> (operands[0], operands[3],
+ operands[1], operands[2]));
+ }
+ else
+ {
+ /* Emulate with vpdpwssd. */
+ rtx op1_lo = gen_reg_rtx (<sseunpackmode>mode);
+ rtx op1_hi = gen_reg_rtx (<sseunpackmode>mode);
+ rtx op2_lo = gen_reg_rtx (<sseunpackmode>mode);
+ rtx op2_hi = gen_reg_rtx (<sseunpackmode>mode);
+
+ emit_insn (gen_vec_unpacku_lo_<mode> (op1_lo, operands[1]));
+ emit_insn (gen_vec_unpacku_lo_<mode> (op2_lo, operands[2]));
+ emit_insn (gen_vec_unpacku_hi_<mode> (op1_hi, operands[1]));
+ emit_insn (gen_vec_unpacku_hi_<mode> (op2_hi, operands[2]));
+
+ rtx res1 = gen_reg_rtx (<ssedvecmode>mode);
+ rtx res2 = gen_reg_rtx (<ssedvecmode>mode);
+ rtx sum = gen_reg_rtx (<ssedvecmode>mode);
+
+ emit_move_insn (sum, CONST0_RTX (<ssedvecmode>mode));
+ emit_insn (gen_sdot_prod<sseunpackmodelower> (res1, op1_lo,
+ op2_lo, sum));
+ emit_insn (gen_sdot_prod<sseunpackmodelower> (res2, op1_hi,
+ op2_hi, operands[3]));
+ emit_insn (gen_add<ssedvecmodelower>3 (operands[0], res1, res2));
+ }
+
+ DONE;
+})
+
+(define_expand "udot_prodv64qi"
+ [(match_operand:V16SI 0 "register_operand")
+ (match_operand:V64QI 1 "register_operand")
+ (match_operand:V64QI 2 "register_operand")
+ (match_operand:V16SI 3 "register_operand")]
+ "(TARGET_AVX512VNNI || TARGET_AVX512BW) && TARGET_EVEX512"
+{
+ /* Emulate with vpdpwssd. */
+ rtx op1_lo = gen_reg_rtx (V32HImode);
+ rtx op1_hi = gen_reg_rtx (V32HImode);
+ rtx op2_lo = gen_reg_rtx (V32HImode);
+ rtx op2_hi = gen_reg_rtx (V32HImode);
+
+ emit_insn (gen_vec_unpacku_lo_v64qi (op1_lo, operands[1]));
+ emit_insn (gen_vec_unpacku_lo_v64qi (op2_lo, operands[2]));
+ emit_insn (gen_vec_unpacku_hi_v64qi (op1_hi, operands[1]));
+ emit_insn (gen_vec_unpacku_hi_v64qi (op2_hi, operands[2]));
+
+ rtx res1 = gen_reg_rtx (V16SImode);
+ rtx res2 = gen_reg_rtx (V16SImode);
+ rtx sum = gen_reg_rtx (V16SImode);
+
+ emit_move_insn (sum, CONST0_RTX (V16SImode));
+ emit_insn (gen_sdot_prodv32hi (res1, op1_lo, op2_lo, sum));
+ emit_insn (gen_sdot_prodv32hi (res2, op1_hi, op2_hi, operands[3]));
+
+ emit_insn (gen_addv16si3 (operands[0], res1, res2));
DONE;
})
diff --git a/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c b/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c
new file mode 100644
index 00000000000..1e8f2cfe521
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/udotprodint8_emulate.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mavxvnni -O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times "DOT_PROD_EXPR" 1 "optimized" } } */
+/* { dg-final { scan-assembler-times "vpdpwssd" 2 } } */
+
+int
+foo (unsigned char* a, unsigned char* b)
+{
+ int sum = 0;
+ for (int i = 0; i != 16; i++)
+ {
+ sum += a[i] * b[i];
+ }
+ return sum;
+}
--
2.31.1
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-12-04 7:03 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-04 7:01 [PATCH] Support udot_prodv*qi with emulation sdot_prodv*hi liuhongt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).