* Ping: [PATCH][ARM] MVE: Implementing auto-vectorized array * scalar instructions
@ 2023-04-27 14:25 Victor L. Do Nascimento
0 siblings, 0 replies; only message in thread
From: Victor L. Do Nascimento @ 2023-04-27 14:25 UTC (permalink / raw)
To: gcc-patches; +Cc: richard.sandiford
May I please ping this one?
https://gcc.gnu.org/pipermail/gcc-patches/2023-February/612152.html
Many Thanks!
Victor
On 2/16/23 15:48, Victor L. Do Nascimento wrote:
> Hi all,
>
> The back-end pattern for mapping the auto-vectorized representation of
> vector * scalar to the machine instruction VMUL was missing, and
> multiple instructions were needed to reproduce this behavior as a
> result of a failed RTL pattern match in the combine pass.
>
> RTL patterns were introduced to reproduce the behavior of the
> intrinsics vmulq_n_<mode> and vmulq_n_f<mode>.
>
> In the case of literal constants, an intermediate instruction was
> added to the initial RTL expansion to ensure a general-purpose register
> was allocated to store the constant, which could then be extracted
> from the constant vector.
>
> For the function
>
> void test_vmulimm_s32x4 (int32_t * __restrict__ dest, int32_t *a)
> {
> int i;
> for (i=0; i<4; i++) {
> dest[i] = a[i] * 5;
> }
> }
>
>
> The GIMPLE -> RTL expansion is modified to produce:
> (set (reg:SI 119)
> (const_int 5 [0x5]))
> (set (reg:V4SI 118)
> (mult:V4SI (vec_duplicate:V4SI (reg:SI 119))
> (reg:V4SI 117)))
>
> instead of:
> (set (reg:V4SI 119)
> (const_vector:V4SI [
> (const_int 5 [0x5]) repeated x4
> ]))
> (set (reg:V4SI 118)
> (mult:V4SI (reg:V4SI 117)
> (reg:V4SI 119)))
>
> The end assembly for the above function introduces the emission of the following insn:
> vmul.i32 q3, q3, r3
>
> as opposed to:
> vmul.i32 q3, q3, q2
>
> All tests in gcc.target/arm/simd/mve-vmul-scalar-1.c now pass.
>
> Added new RTL templates, amended unit test and checked for regressions on arm-none-eabi.
>
> Thanks,
> Victor
>
> gcc:
> * gcc/config/arm/arm.cc (neon_vdup_constant): static keyword
> removed.
> * gcc/config/arm/arm-protos.h (neon_vdup_constant): prototype
> added.
> * gcc/config/arm/mve.md (@mve_vmulq_n_<mode>2): New.
> * gcc/config/arm/predicates.md (reg_or_mve_replicated_const_operand):
> New.
> * gcc/config/arm/vec-common.md (mul<mode>3): Modify to use
> `reg_or_mve_replicated_const_operand'.
>
> testsuite:
> * gcc.target/arm/simd/mve-vmul-scalar-1.c: Corrected typo,
> xfails removed.
> ---
> gcc/config/arm/arm-protos.h | 1 +
> gcc/config/arm/arm.cc | 2 +-
> gcc/config/arm/mve.md | 11 +++++++++++
> gcc/config/arm/predicates.md | 8 ++++++++
> gcc/config/arm/vec-common.md | 14 ++++++++++++--
> .../gcc.target/arm/simd/mve-vmul-scalar-1.c | 13 ++++++-------
> 6 files changed, 39 insertions(+), 10 deletions(-)
>
> diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
> index aea472bfbb9..4cf9fb00e01 100644
> --- a/gcc/config/arm/arm-protos.h
> +++ b/gcc/config/arm/arm-protos.h
> @@ -199,6 +199,7 @@ extern rtx arm_load_tp (rtx);
> extern bool arm_coproc_builtin_available (enum unspecv);
> extern bool arm_coproc_ldc_stc_legitimate_address (rtx);
> extern rtx arm_stack_protect_tls_canary_mem (bool);
> +extern rtx neon_vdup_constant (rtx, bool);
>
>
> #if defined TREE_CODE
> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
> index efc48349dd3..7d9d265b0a7 100644
> --- a/gcc/config/arm/arm.cc
> +++ b/gcc/config/arm/arm.cc
> @@ -13301,7 +13301,7 @@ neon_pairwise_reduce (rtx op0, rtx op1, machine_mode mode,
> If this is the case, and GENERATE is set, we also generate
> instructions to do this and return an RTX to assign to the register. */
>
> -static rtx
> +rtx
> neon_vdup_constant (rtx vals, bool generate)
> {
> machine_mode mode = GET_MODE (vals);
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index 555ad1b66c8..806c24e33aa 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -1376,6 +1376,17 @@
> [(set_attr "type" "mve_move")
> ])
>
> +(define_insn "@mve_vmulq_n_<mode>2"
> + [
> + (set (match_operand:MVE_VLD_ST 0 "s_register_operand" "=w")
> + (mult:MVE_VLD_ST (vec_duplicate:MVE_VLD_ST (match_operand:<V_elem> 1 "s_register_operand" "r"))
> + (match_operand:MVE_VLD_ST 2 "s_register_operand" "w")))
> + ]
> + "TARGET_HAVE_MVE"
> + "vmul.%#<V_if_elem>\t%q0, %q2, %r1"
> + [(set_attr "type" "mve_move")
> +])
> +
> ;;
> ;; [vmulq_u, vmulq_s])
> ;;
> diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
> index 3139750c606..31eadfa2d3b 100644
> --- a/gcc/config/arm/predicates.md
> +++ b/gcc/config/arm/predicates.md
> @@ -113,6 +113,14 @@
> && neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL));
> })
>
> +(define_predicate "reg_or_mve_replicated_const_operand"
> + (if_then_else (and (match_test "TARGET_HAVE_MVE")
> + (match_code "const_vector")
> + (match_test "const_vec_duplicate_p (op)"))
> + (match_operand 0 "immediate_operand")
> + (match_operand 0 "s_register_operand"))
> +)
> +
> (define_predicate "neon_inv_logic_op2"
> (ior (match_operand 0 "imm_for_neon_inv_logic_operand")
> (match_operand 0 "s_register_operand")))
> diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
> index f06df4db636..17b67c214b4 100644
> --- a/gcc/config/arm/vec-common.md
> +++ b/gcc/config/arm/vec-common.md
> @@ -102,12 +102,22 @@
> (define_expand "mul<mode>3"
> [(set (match_operand:VDQWH 0 "s_register_operand")
> (mult:VDQWH (match_operand:VDQWH 1 "s_register_operand")
> - (match_operand:VDQWH 2 "s_register_operand")))]
> + (match_operand:VDQWH 2 "reg_or_mve_replicated_const_operand")))]
> "ARM_HAVE_<MODE>_ARITH
> && (!TARGET_REALLY_IWMMXT
> || <MODE>mode == V4HImode
> || <MODE>mode == V2SImode)"
> -)
> +{
> + if ((GET_CODE (operands[2]) == CONST_VECTOR) && can_create_pseudo_p ()
> + && (VALID_MVE_SI_MODE (<MODE>mode) || VALID_MVE_SF_MODE (<MODE>mode)))
> + {
> + rtx tmp = gen_reg_rtx (<V_elem>mode);
> + emit_move_insn (tmp, neon_vdup_constant (operands[2], 0));
> + emit_insn (maybe_gen_mve_vmulq_n_2 (<MODE>mode, operands[0], tmp,
> + operands[1]));
> + DONE;
> + }
> +})
>
> (define_expand "smin<mode>3"
> [(set (match_operand:VALLW 0 "s_register_operand")
> diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c b/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c
> index 22be452e8d9..0736847a96d 100644
> --- a/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c
> +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vmul-scalar-1.c
> @@ -24,9 +24,9 @@ FUNC_IMM(u, uint, 8, 16, *, vmulimm)
>
> /* For the moment we do not select the T2 vmul variant operating on a scalar
> final argument. */
> -/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
> +/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
>
> void test_vmul_f32 (float * dest, float * a, float * b) {
> int i;
> @@ -40,16 +40,15 @@ void test_vmulimm_f32 (float * dest, float * a) {
> dest[i] = a[i] * 5.0;
> }
> }
> -/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
>
> void test_vmul_f16 (__fp16 * dest, __fp16 * a, __fp16 * b) {
> int i;
> for (i=0; i<8; i++) {
> - dest[i] = a[i] * b[i];
> + dest[i] = a[i] * b[1];
> }
> }
>
> -/* Note that dest[i] = a[i] * 5.0f16 is not vectorized. */
> void test_vmulimm_f16 (__fp16 * dest, __fp16 * a) {
> int i;
> __fp16 b = 5.0f16;
> @@ -57,4 +56,4 @@ void test_vmulimm_f16 (__fp16 * dest, __fp16 * a) {
> dest[i] = a[i] * b;
> }
> }
> -/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, r[0-9]+} 2 } } */
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-04-27 14:25 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-27 14:25 Ping: [PATCH][ARM] MVE: Implementing auto-vectorized array * scalar instructions Victor L. Do Nascimento
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).