Could you test it with v4096qi? Also, VLS modes need to be tested. juzhe.zhong@rivai.ai From: pan2.li Date: 2023-11-15 11:48 To: gcc-patches CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng Subject: [PATCH v1] RISC-V: Refine the mask generation for vec_init case 2 From: Pan Li We take the vec_init element int mode when generating the mask for case 2. But actually we don't need as many bits as the element. The unnecessarily large mode may introduce some unnecessary insns. For example, consider the code below: typedef int64_t v16di __attribute__ ((vector_size (16 * 8))); void __attribute__ ((noinline, noclone)) foo (int64_t *out, int64_t x, int64_t y) { v16di v = {y, x, y, x, y, x, y, x, y, x, y, x, y, x, y, x}; *(v16di *) out = v; } We will have VDImode when generating the 0b0101010101010101 mask, but actually VHImode is good enough here. This patch would like to refine the mask generation to avoid: 1. Unnecessary scalar insns to generate a big constant mask. 2. An unnecessary vector insn to move the mask into v0. Before this patch: foo: li a5,-1431654400 li a4,-1431654400 <== unnecessary insn addi a5,a5,-1365 <== unnecessary insn addi a4,a4,-1366 slli a5,a5,32 <== unnecessary insn add a5,a5,a4 <== unnecessary insn vsetivli zero,16,e64,m8,ta,ma vmv.v.x v8,a2 vmv.s.x v16,a5 vmv1r.v v0,v16 <== unnecessary insn vmerge.vxm v8,v8,a1,v0 vse64.v v8,0(a0) ret After this patch: foo: li a5,-20480 addiw a5,a5,-1366 vsetivli zero,16,e64,m8,ta,ma vmv.s.x v0,a5 vmv.v.x v8,a2 vmerge.vxm v8,v8,a1,v0 vs8r.v v8,0(a0) ret gcc/ChangeLog: * config/riscv/riscv-v.cc (rvv_builder::get_merge_scalar_mask): Add inner_mode mask arg for mask int mode. (get_repeating_sequence_dup_machine_mode): Add mask_bit_mode arg to get the good enough vector int mode on precision. (expand_vector_init_merge_repeating_sequence): Pass required args to above func. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c: New test. * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c: New test. 
* gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c: New test. * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c: New test. * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c: New test. * gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c: New test. Signed-off-by: Pan Li --- gcc/config/riscv/riscv-v.cc | 54 ++++++++++++++----- .../vls-vlmax/init-repeat-sequence-10.c | 28 ++++++++++ .../vls-vlmax/init-repeat-sequence-11.c | 26 +++++++++ .../vls-vlmax/init-repeat-sequence-6.c | 27 ++++++++++ .../vls-vlmax/init-repeat-sequence-7.c | 25 +++++++++ .../vls-vlmax/init-repeat-sequence-8.c | 27 ++++++++++ .../vls-vlmax/init-repeat-sequence-9.c | 25 +++++++++ 7 files changed, 200 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 265a298f447..ffb645eccf3 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -416,7 +416,7 @@ public: bool repeating_sequence_use_merge_profitable_p (); bool combine_sequence_use_slideup_profitable_p (); bool combine_sequence_use_merge_profitable_p (); - rtx get_merge_scalar_mask (unsigned int) const; + rtx get_merge_scalar_mask (unsigned int, machine_mode) const; bool single_step_npatterns_p () const; bool npatterns_all_equal_p () const; @@ -592,7 +592,8 @@ rvv_builder::get_merged_repeating_sequence () To merge "b", the mask should be 
0101.... */ rtx -rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const +rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern, + machine_mode inner_mode) const { unsigned HOST_WIDE_INT mask = 0; unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern); @@ -611,7 +612,7 @@ rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern) const for (int i = 0; i < limit; i++) mask |= base_mask << (i * npatterns ()); - return gen_int_mode (mask, inner_int_mode ()); + return gen_int_mode (mask, inner_mode); } /* Return true if the variable-length vector is single step. @@ -919,17 +920,45 @@ emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask) /* Emit merge instruction. */ static machine_mode -get_repeating_sequence_dup_machine_mode (const rvv_builder &builder) +get_repeating_sequence_dup_machine_mode (const rvv_builder &builder, + machine_mode mask_bit_mode) { - poly_uint64 dup_nunits = GET_MODE_NUNITS (builder.mode ()); + unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant (); + unsigned mask_scalar_size = mask_precision > builder.inner_bits_size () + ? builder.inner_bits_size () : mask_precision; - if (known_ge (GET_MODE_SIZE (builder.mode ()), BYTES_PER_RISCV_VECTOR)) + scalar_mode inner_mode; + unsigned minimal_bits_size; + + switch (mask_scalar_size) { - dup_nunits = exact_div (BYTES_PER_RISCV_VECTOR, - builder.inner_bytes_size ()); + case 8: + inner_mode = QImode; + minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */ + break; + case 16: + inner_mode = HImode; + minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */ + break; + case 32: + inner_mode = SImode; + minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */ + break; + case 64: + inner_mode = DImode; + minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. 
*/ + break; + default: + gcc_unreachable (); + break; } - return get_vector_mode (builder.inner_int_mode (), dup_nunits).require (); + gcc_assert (mask_precision % mask_scalar_size == 0); + + uint64_t dup_nunit = mask_precision > mask_scalar_size + ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size; + + return get_vector_mode (inner_mode, dup_nunit).require (); } /* Expand series const vector. */ @@ -2130,9 +2159,9 @@ expand_vector_init_merge_repeating_sequence (rtx target, since we don't have such instruction in RVV. Instead, we should use INT mode (QI/HI/SI/DI) with integer move instruction to generate the mask data we want. */ - machine_mode mask_int_mode - = get_repeating_sequence_dup_machine_mode (builder); machine_mode mask_bit_mode = get_mask_mode (builder.mode ()); + machine_mode mask_int_mode + = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode); uint64_t full_nelts = builder.full_nelts ().to_constant (); /* Step 1: Broadcast the first pattern. */ @@ -2143,7 +2172,8 @@ expand_vector_init_merge_repeating_sequence (rtx target, for (unsigned int i = 1; i < builder.npatterns (); i++) { /* Step 2-1: Generate mask register v0 for each merge. 
*/ - rtx merge_mask = builder.get_merge_scalar_mask (i); + rtx merge_mask + = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode)); rtx mask = gen_reg_rtx (mask_bit_mode); rtx dup = gen_reg_rtx (mask_int_mode); diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c new file mode 100644 index 00000000000..ccce5052dc2 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-10.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +typedef int64_t vnx32di __attribute__ ((vector_size (32 * 8))); + +/* +** f_vnx32di: +** vsetvli\s+[axt][0-9]+,\s*zero,\s*e64,\s*m2,\s*ta,\s*ma +** ... +** vmv\.v\.x\s+v[0-9]+,\s*[axt][0-9]+ +** ... +** vmv\.s\.x\s+v0,\s*[axt][0-9]+ +** vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[axt][0-9]+,\s*v0 +** vs2r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\) +** ret +*/ +__attribute__ ((noipa)) void +f_vnx32di (int64_t a, int64_t b, int64_t *out) +{ + vnx32di v = { + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + }; + *(vnx32di *) out = v; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c new file mode 100644 index 00000000000..a62eee8a5ae --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-11.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef double vnx32df __attribute__ ((vector_size (32 * 8))); + +/* +** f_vnx32df: +** 
vsetvli\s+[axt][0-9]+\s*,zero,\s*e64,\s*m2,\s*ta,\s*ma +** ... +** vfmv\.v\.f\s+v[0-9]+,\s*[af]+[0-9]+ +** ... +** vmv\.s\.x\s+v0,\s*[axt][0-9]+ +** vfmerge\.vfm\s+v[0-9]+,\s*v[0-9]+,\s*[af]+[0-9]+,\s*v0 +** vs2r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\) +** ret +*/ +__attribute__ ((noipa)) void +f_vnx32df (double a, double b, double *out) +{ + vnx32df v = { + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + }; + *(vnx32df *) out = v; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c new file mode 100644 index 00000000000..4f8a78b3161 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-6.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +typedef int64_t vnx16di __attribute__ ((vector_size (16 * 8))); + +/* +** f_vnx16di: +** vsetivli\s+zero,\s*16,\s*e64,\s*m8,\s*ta,\s*ma +** ... +** vmv\.v\.x\s+v[0-9]+,\s*[axt][0-9]+ +** ... 
+** vmv\.s\.x\s+v0,\s*[axt][0-9]+ +** vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[axt][0-9]+,\s*v0 +** vs8r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\) +** ret +*/ +__attribute__ ((noipa)) void +f_vnx16di (int64_t a, int64_t b, int64_t *out) +{ + vnx16di v = { + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + }; + *(vnx16di *) out = v; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c new file mode 100644 index 00000000000..f0d14db8fa8 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-7.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef double vnx16df __attribute__ ((vector_size (16 * 8))); + +/* +** f_vnx16df: +** vsetivli\s+zero,\s*16,\s*e64,\s*m8,\s*ta,\s*ma +** ... +** vfmv\.v\.f\s+v[0-9]+,\s*[af]+[0-9]+ +** ... 
+** vmv\.s\.x\s+v0,\s*[axt][0-9]+ +** vfmerge\.vfm\s+v[0-9]+,\s*v[0-9]+,\s*[af]+[0-9]+,\s*v0 +** vs8r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\) +** ret +*/ +__attribute__ ((noipa)) void +f_vnx16df (double a, double b, double *out) +{ + vnx16df v = { + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + }; + *(vnx16df *) out = v; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c new file mode 100644 index 00000000000..fd986e6b649 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-8.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include + +typedef int64_t vnx16di __attribute__ ((vector_size (16 * 8))); + +/* +** f_vnx16di: +** vsetivli\s+zero,\s*16,\s*e64,\s*m1,\s*ta,\s*ma +** ... +** vmv\.v\.x\s+v[0-9]+,\s*[axt][0-9]+ +** ... 
+** vmv\.s\.x\s+v0,\s*[axt][0-9]+ +** vmerge\.vxm\s+v[0-9]+,\s*v[0-9]+,\s*[axt][0-9]+,\s*v0 +** vs1r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\) +** ret +*/ +__attribute__ ((noipa)) void +f_vnx16di (int64_t a, int64_t b, int64_t *out) +{ + vnx16di v = { + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + }; + *(vnx16di *) out = v; +} diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c new file mode 100644 index 00000000000..753221ffdbf --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/init-repeat-sequence-9.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv64gcv_zvl1024b -mabi=lp64d -fno-schedule-insns -fno-schedule-insns2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef double vnx16df __attribute__ ((vector_size (16 * 8))); + +/* +** f_vnx16df: +** vsetivli\s+zero,\s*16,\s*e64,\s*m1,\s*ta,\s*ma +** ... +** vfmv\.v\.f\s+v[0-9]+,\s*[af]+[0-9]+ +** ... +** vmv\.s\.x\s+v0,\s*[axt][0-9]+ +** vfmerge\.vfm\s+v[0-9]+,\s*v[0-9]+,\s*[af]+[0-9]+,\s*v0 +** vs1r\.v\s+v[0-9]+,\s*0\([axt][0-9]+\) +** ret +*/ +__attribute__ ((noipa)) void +f_vnx16df (double a, double b, double *out) +{ + vnx16df v = { + a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, + }; + *(vnx16df *) out = v; +} -- 2.34.1