vec_init is an auto-vectorization pattern. The tests are not loops because using vector types directly makes it easier to test the patterns.
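To make that concrete (an illustrative sketch, not part of the patch below; the type and function names are made up): a splat written as a loop only becomes a vector operation if the loop vectorizer succeeds, while the vector-type form used by these tests expands directly through the vec_init/vector-move patterns:

#include <stdint.h>

typedef int32_t v4si __attribute__ ((vector_size (16)));

/* Loop form: the splat only materializes if the loop vectorizer
   runs and succeeds.  */
void
splat_loop (int32_t *out, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = 14;
}

/* Vector-type form: expands straight through the vector move
   expanders, no loop analysis involved.  */
void
splat_vec (int32_t *out)
{
  v4si v = {14, 14, 14, 14};
  *(v4si *) out = v;
}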
juzhe.zhong@rivai.ai

From: Robin Dapp
Date: 2023-05-12 16:53
To: Li, Pan2; Kito Cheng; juzhe.zhong@rivai.ai
CC: gcc-patches@gcc.gnu.org; palmer@dabbelt.com; jeffreyalaw@gmail.com
Subject: Re: [PATCH] RISC-V: Fix fail of vmv-imm-rv64.c in rv32

>> After updating the local codebase to trunk, I realized there is one
>> more failure in RV32.
>> After this patch, all failures of RVV are cleaned up.
>> Thanks.

But only because we build vmv-imm with autovec-preference=scalable.
With fixed-vlmax it still does not work because I messed up the rebase
against the series patch.  The following patch fixes it and adds
another test, similar to the repeating series ones, with fixed-vlmax.

Btw, why is the vls-vlmax directory under autovec?  It's not really
auto-vectorization (no loops).

Subject: [PATCH] [RISC-V] Allow more loading of const vectors.

This patch fixes the recent vmv patch in order to allow loading of
constants via vmv.v.i.  It also adds another test analogous to the
series tests.

gcc/ChangeLog:

	* config/riscv/riscv.cc (riscv_const_insns): Remove else.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c: New test.
	* gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c: New test.
---
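For example (an illustrative sketch, not verified compiler output; register choices and the vsetvli setup are elided): a duplicated constant within the 5-bit vmv.v.i immediate range [-16, 15] can be loaded with a single instruction, while an out-of-range constant still has to go through a scalar register first. This is also why the new test checks for vmv.v.i and the absence of lui/addi:

#include <stdint.h>

typedef int32_t v4si __attribute__ ((vector_size (16)));

void
splat_14 (int32_t *out)
{
  v4si v = {14, 14, 14, 14};   /* expected: vmv.v.i v1,14 */
  *(v4si *) out = v;
}

void
splat_17 (int32_t *out)
{
  v4si v = {17, 17, 17, 17};   /* expected: li a5,17 then vmv.v.x v1,a5 */
  *(v4si *) out = v;
}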
 gcc/config/riscv/riscv.cc                     |   3 +-
 .../riscv/rvv/autovec/vls-vlmax/repeat-7.c    | 219 ++++++++++++++++++
 .../rvv/autovec/vls-vlmax/repeat_run-7.c      | 145 ++++++++++++
 3 files changed, 366 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..5b109766c35 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1291,11 +1291,12 @@ riscv_const_insns (rtx x)
 	      return 1;
 	    }
 	}
+
       /* Constants from -16 to 15 can be loaded with vmv.v.i.
 	 The Wc0, Wc1 constraints are already covered by the
 	 vi constraint so we do not need to check them here
 	 separately.  */
-      else if (TARGET_VECTOR && satisfies_constraint_vi (x))
+      if (satisfies_constraint_vi (x))
 	return 1;
 
       /* TODO: We may support more const vector in the future.  */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
new file mode 100644
index 00000000000..bc5580ebd1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat-7.c
@@ -0,0 +1,219 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d" } */
+
+#include <stdint.h>
+
+typedef int8_t vnx2qi __attribute__ ((vector_size (2)));
+typedef int8_t vnx4qi __attribute__ ((vector_size (4)));
+typedef int8_t vnx8qi __attribute__ ((vector_size (8)));
+typedef int8_t vnx16qi __attribute__ ((vector_size (16)));
+typedef int8_t vnx32qi __attribute__ ((vector_size (32)));
+typedef int8_t vnx64qi __attribute__ ((vector_size (64)));
+typedef int8_t vnx128qi __attribute__ ((vector_size (128)));
+
+typedef int16_t vnx2hi __attribute__ ((vector_size (4)));
+typedef int16_t vnx4hi __attribute__ ((vector_size (8)));
+typedef int16_t vnx8hi __attribute__ ((vector_size (16)));
+typedef int16_t vnx16hi __attribute__ ((vector_size (32)));
+typedef int16_t vnx32hi __attribute__ ((vector_size (64)));
+typedef int16_t vnx64hi __attribute__ ((vector_size (128)));
+
+typedef int32_t vnx2si __attribute__ ((vector_size (8)));
+typedef int32_t vnx4si __attribute__ ((vector_size (16)));
+typedef int32_t vnx8si __attribute__ ((vector_size (32)));
+typedef int32_t vnx16si __attribute__ ((vector_size (64)));
+typedef int32_t vnx32si __attribute__ ((vector_size (128)));
+
+typedef int64_t vnx2di __attribute__ ((vector_size (16)));
+typedef int64_t vnx4di __attribute__ ((vector_size (32)));
+typedef int64_t vnx8di __attribute__ ((vector_size (64)));
+typedef int64_t vnx16di __attribute__ ((vector_size (128)));
+
+__attribute__ ((noipa)) void
+f_vnx2qi (int8_t *out)
+{
+  vnx2qi v = {-16, -16};
+  *(vnx2qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx4qi (int8_t *out)
+{
+  vnx4qi v = {-15, -15, -15, -15};
+  *(vnx4qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx8qi (int8_t *out)
+{
+  vnx8qi v = {-14, -14, -14, -14, -14, -14, -14, -14};
+  *(vnx8qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx16qi (int8_t *out)
+{
+  vnx16qi v = {-13, -13, -13, -13, -13, -13, -13, -13,
+               -13, -13, -13, -13, -13, -13, -13, -13};
+  *(vnx16qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx32qi (int8_t *out)
+{
+  vnx32qi v = {7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+               7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+  *(vnx32qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx64qi (int8_t *out)
+{
+  vnx64qi v = {-7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+               -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+               -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+               -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7,
+               -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7, -7};
+  *(vnx64qi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx128qi (int8_t *out)
+{
+  vnx128qi v
+    = {-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+       -4, -4};
+  *(vnx128qi *) out = v;
+}
+
+
+__attribute__ ((noipa)) void
+f_vnx2hi (int16_t *out)
+{
+  vnx2hi v = {-12, -12};
+  *(vnx2hi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx4hi (int16_t *out)
+{
+  vnx4hi v = {-11, -11, -11, -11};
+  *(vnx4hi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx8hi (int16_t *out)
+{
+  vnx8hi v = {-10, -10, -10, -10, -10, -10, -10, -10};
+  *(vnx8hi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx16hi (int16_t *out)
+{
+  vnx16hi v = {-9, -9, -9, -9, -9, -9, -9, -9,
+               -9, -9, -9, -9, -9, -9, -9, -9};
+  *(vnx16hi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx32hi (int16_t *out)
+{
+  vnx32hi v = {-8, -8, -8, -8, -8, -8, -8, -8,
+               -8, -8, -8, -8, -8, -8, -8, -8,
+               -8, -8, -8, -8, -8, -8, -8, -8,
+               -8, -8, -8, -8, -8, -8, -8, -8};
+  *(vnx32hi *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx64hi (int16_t *out)
+{
+  vnx64hi v = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+               8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+               8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+               8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+  *(vnx64hi *) out = v;
+}
+
+
+__attribute__ ((noipa)) void
+f_vnx2si (int32_t *out)
+{
+  vnx2si v = {15, 15};
+  *(vnx2si *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx4si (int32_t *out)
+{
+  vnx4si v = {14, 14, 14, 14};
+  *(vnx4si *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx8si (int32_t *out)
+{
+  vnx8si v = {13, 13, 13, 13, 13, 13, 13, 13};
+  *(vnx8si *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx16si (int32_t *out)
+{
+  vnx16si v = {12, 12, 12, 12, 12, 12, 12, 12,
+               12, 12, 12, 12, 12, 12, 12, 12};
+  *(vnx16si *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx32si (int32_t *out)
+{
+  vnx32si v = {11, 11, 11, 11, 11, 11, 11, 11,
+               11, 11, 11, 11, 11, 11, 11, 11,
+               11, 11, 11, 11, 11, 11, 11, 11,
+               11, 11, 11, 11, 11, 11, 11, 11};
+  *(vnx32si *) out = v;
+}
+
+
+__attribute__ ((noipa)) void
+f_vnx2di (int64_t *out)
+{
+  vnx2di v = {10, 10};
+  *(vnx2di *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx4di (int64_t *out)
+{
+  vnx4di v = {0, 0, 0, 0};
+  *(vnx4di *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx8di (int64_t *out)
+{
+  vnx8di v = {1, 1, 1, 1, 1, 1, 1, 1};
+  *(vnx8di *) out = v;
+}
+
+__attribute__ ((noipa)) void
+f_vnx16di (int64_t *out)
+{
+  vnx16di v = {2, 2, 2, 2, 2, 2, 2, 2,
+               2, 2, 2, 2, 2, 2, 2, 2};
+  *(vnx16di *) out = v;
+}
+
+/* { dg-final { scan-assembler-times {vmv.v.i\tv[0-9]+,\s*[-0-9]+} 22 } } */
+/* { dg-final { scan-assembler-not {lui} } } */
+/* { dg-final { scan-assembler-not {addi} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c
new file mode 100644
index 00000000000..e1ffa719a02
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls-vlmax/repeat_run-7.c
@@ -0,0 +1,145 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-options "--param riscv-autovec-preference=fixed-vlmax -O3" } */
+
+#include "repeat-7.c"
+
+int
+main ()
+{
+  int8_t v_vnx2qi[sizeof (vnx2qi) / sizeof (int8_t)];
+  f_vnx2qi (v_vnx2qi);
+  for (int i = 0; i < sizeof (vnx2qi) / sizeof (int8_t); i++)
+    if (v_vnx2qi[i] != -16)
+      __builtin_abort ();
+
+  int8_t v_vnx4qi[sizeof (vnx4qi) / sizeof (int8_t)];
+  f_vnx4qi (v_vnx4qi);
+  for (int i = 0; i < sizeof (vnx4qi) / sizeof (int8_t); i++)
+    if (v_vnx4qi[i] != -15)
+      __builtin_abort ();
+
+  int8_t v_vnx8qi[sizeof (vnx8qi) / sizeof (int8_t)];
+  f_vnx8qi (v_vnx8qi);
+  for (int i = 0; i < sizeof (vnx8qi) / sizeof (int8_t); i++)
+    if (v_vnx8qi[i] != -14)
+      __builtin_abort ();
+
+  int8_t v_vnx16qi[sizeof (vnx16qi) / sizeof (int8_t)];
+  f_vnx16qi (v_vnx16qi);
+  for (int i = 0; i < sizeof (vnx16qi) / sizeof (int8_t); i++)
+    if (v_vnx16qi[i] != -13)
+      __builtin_abort ();
+
+  int8_t v_vnx32qi[sizeof (vnx32qi) / sizeof (int8_t)];
+  f_vnx32qi (v_vnx32qi);
+  for (int i = 0; i < sizeof (vnx32qi) / sizeof (int8_t); i++)
+    if (v_vnx32qi[i] != 7)
+      __builtin_abort ();
+
+  int8_t v_vnx64qi[sizeof (vnx64qi) / sizeof (int8_t)];
+  f_vnx64qi (v_vnx64qi);
+  for (int i = 0; i < sizeof (vnx64qi) / sizeof (int8_t); i++)
+    if (v_vnx64qi[i] != -7)
+      __builtin_abort ();
+
+  int8_t v_vnx128qi[sizeof (vnx128qi) / sizeof (int8_t)];
+  f_vnx128qi (v_vnx128qi);
+  for (int i = 0; i < sizeof (vnx128qi) / sizeof (int8_t); i++)
+    if (v_vnx128qi[i] != -4)
+      __builtin_abort ();
+
+
+  int16_t v_vnx2hi[sizeof (vnx2hi) / sizeof (int16_t)];
+  f_vnx2hi (v_vnx2hi);
+  for (int i = 0; i < sizeof (vnx2hi) / sizeof (int16_t); i++)
+    if (v_vnx2hi[i] != -12)
+      __builtin_abort ();
+
+  int16_t v_vnx4hi[sizeof (vnx4hi) / sizeof (int16_t)];
+  f_vnx4hi (v_vnx4hi);
+  for (int i = 0; i < sizeof (vnx4hi) / sizeof (int16_t); i++)
+    if (v_vnx4hi[i] != -11)
+      __builtin_abort ();
+
+  int16_t v_vnx8hi[sizeof (vnx8hi) / sizeof (int16_t)];
+  f_vnx8hi (v_vnx8hi);
+  for (int i = 0; i < sizeof (vnx8hi) / sizeof (int16_t); i++)
+    if (v_vnx8hi[i] != -10)
+      __builtin_abort ();
+
+  int16_t v_vnx16hi[sizeof (vnx16hi) / sizeof (int16_t)];
+  f_vnx16hi (v_vnx16hi);
+  for (int i = 0; i < sizeof (vnx16hi) / sizeof (int16_t); i++)
+    if (v_vnx16hi[i] != -9)
+      __builtin_abort ();
+
+  int16_t v_vnx32hi[sizeof (vnx32hi) / sizeof (int16_t)];
+  f_vnx32hi (v_vnx32hi);
+  for (int i = 0; i < sizeof (vnx32hi) / sizeof (int16_t); i++)
+    if (v_vnx32hi[i] != -8)
+      __builtin_abort ();
+
+  int16_t v_vnx64hi[sizeof (vnx64hi) / sizeof (int16_t)];
+  f_vnx64hi (v_vnx64hi);
+  for (int i = 0; i < sizeof (vnx64hi) / sizeof (int16_t); i++)
+    if (v_vnx64hi[i] != 8)
+      __builtin_abort ();
+
+
+  int32_t v_vnx2si[sizeof (vnx2si) / sizeof (int32_t)];
+  f_vnx2si (v_vnx2si);
+  for (int i = 0; i < sizeof (vnx2si) / sizeof (int32_t); i++)
+    if (v_vnx2si[i] != 15)
+      __builtin_abort ();
+
+  int32_t v_vnx4si[sizeof (vnx4si) / sizeof (int32_t)];
+  f_vnx4si (v_vnx4si);
+  for (int i = 0; i < sizeof (vnx4si) / sizeof (int32_t); i++)
+    if (v_vnx4si[i] != 14)
+      __builtin_abort ();
+
+  int32_t v_vnx8si[sizeof (vnx8si) / sizeof (int32_t)];
+  f_vnx8si (v_vnx8si);
+  for (int i = 0; i < sizeof (vnx8si) / sizeof (int32_t); i++)
+    if (v_vnx8si[i] != 13)
+      __builtin_abort ();
+
+  int32_t v_vnx16si[sizeof (vnx16si) / sizeof (int32_t)];
+  f_vnx16si (v_vnx16si);
+  for (int i = 0; i < sizeof (vnx16si) / sizeof (int32_t); i++)
+    if (v_vnx16si[i] != 12)
+      __builtin_abort ();
+
+  int32_t v_vnx32si[sizeof (vnx32si) / sizeof (int32_t)];
+  f_vnx32si (v_vnx32si);
+  for (int i = 0; i < sizeof (vnx32si) / sizeof (int32_t); i++)
+    if (v_vnx32si[i] != 11)
+      __builtin_abort ();
+
+
+  int64_t v_vnx2di[sizeof (vnx2di) / sizeof (int64_t)];
+  f_vnx2di (v_vnx2di);
+  for (int i = 0; i < sizeof (vnx2di) / sizeof (int64_t); i++)
+    if (v_vnx2di[i] != 10)
+      __builtin_abort ();
+
+  int64_t v_vnx4di[sizeof (vnx4di) / sizeof (int64_t)];
+  f_vnx4di (v_vnx4di);
+  for (int i = 0; i < sizeof (vnx4di) / sizeof (int64_t); i++)
+    if (v_vnx4di[i] != 0)
+      __builtin_abort ();
+
+  int64_t v_vnx8di[sizeof (vnx8di) / sizeof (int64_t)];
+  f_vnx8di (v_vnx8di);
+  for (int i = 0; i < sizeof (vnx8di) / sizeof (int64_t); i++)
+    if (v_vnx8di[i] != 1)
+      __builtin_abort ();
+  int64_t v_vnx16di[sizeof (vnx16di) / sizeof (int64_t)];
+  f_vnx16di (v_vnx16di);
+  for (int i = 0; i < sizeof (vnx16di) / sizeof (int64_t); i++)
+    if (v_vnx16di[i] != 2)
+      __builtin_abort ();
+
+  return 0;
+}
-- 
2.40.1