public inbox for gcc-cvs@sourceware.org help / color / mirror / Atom feed
From: Jonathan Wright <jonwri01@gcc.gnu.org> To: gcc-cvs@gcc.gnu.org Subject: [gcc r12-2741] aarch64: Don't include vec_select high-half in SIMD multiply cost Date: Wed, 4 Aug 2021 15:59:15 +0000 (GMT) [thread overview] Message-ID: <20210804155915.50FED394AC1B@sourceware.org> (raw) https://gcc.gnu.org/g:63834c84d43fc2eeeaa054c5e24d1e468e9eddab commit r12-2741-g63834c84d43fc2eeeaa054c5e24d1e468e9eddab Author: Jonathan Wright <jonathan.wright@arm.com> Date: Mon Jul 19 10:19:30 2021 +0100 aarch64: Don't include vec_select high-half in SIMD multiply cost The Neon multiply/multiply-accumulate/multiply-subtract instructions can select the top or bottom half of the operand registers. This selection does not change the cost of the underlying instruction and this should be reflected by the RTL cost function. This patch adds RTL tree traversal in the Neon multiply cost function to match vec_select high-half of its operands. This traversal prevents the cost of the vec_select from being added into the cost of the multiply - meaning that these instructions can now be emitted in the combine pass as they are no longer deemed prohibitively expensive. gcc/ChangeLog: 2021-07-19 Jonathan Wright <jonathan.wright@arm.com> * config/aarch64/aarch64.c (aarch64_strip_extend_vec_half): Define. (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of vec_select high-half from being added into Neon multiply cost. * rtlanal.c (vec_series_highpart_p): Define. * rtlanal.h (vec_series_highpart_p): Declare. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vmul_high_cost.c: New test. 
Diff: --- gcc/config/aarch64/aarch64.c | 22 ++++++ gcc/rtlanal.c | 19 +++++ gcc/rtlanal.h | 4 ++ gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c | 85 +++++++++++++++++++++++ 4 files changed, 130 insertions(+) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 23829bb691c..e02cbcbcb38 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -78,6 +78,7 @@ #include "gimple-pretty-print.h" #include "tree-ssa-loop-niter.h" #include "fractional-cost.h" +#include "rtlanal.h" /* This file should be included last. */ #include "target-def.h" @@ -12046,6 +12047,22 @@ aarch64_strip_extend (rtx x, bool strip_shift) return x; } +/* Helper function for rtx cost calculation. Strip extension as well as any + inner VEC_SELECT high-half from X. Returns the inner vector operand if + successful, or the original expression on failure. */ +static rtx +aarch64_strip_extend_vec_half (rtx x) +{ + if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND) + { + x = XEXP (x, 0); + if (GET_CODE (x) == VEC_SELECT + && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)), + XEXP (x, 1))) + x = XEXP (x, 0); + } + return x; +} /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as any subsequent extend and VEC_SELECT from X. Returns the inner scalar @@ -12133,6 +12150,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) unsigned int vec_flags = aarch64_classify_vector_mode (mode); if (vec_flags & VEC_ADVSIMD) { + /* The select-operand-high-half versions of the instruction have the + same cost as the three vector version - don't add the costs of the + extension or selection into the costs of the multiply. */ + op0 = aarch64_strip_extend_vec_half (op0); + op1 = aarch64_strip_extend_vec_half (op1); /* The by-element versions of the instruction have the same costs as the normal 3-vector version. We make an assumption that the input to the VEC_DUPLICATE is already on the FP & SIMD side. 
This means diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index f7f3acb75db..d37f7789b20 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -6953,6 +6953,25 @@ register_asm_p (const_rtx x) && DECL_REGISTER (REG_EXPR (x))); } +/* Return true if, for all OP of mode OP_MODE: + + (vec_select:RESULT_MODE OP SEL) + + is equivalent to the highpart RESULT_MODE of OP. */ + +bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel) +{ + int nunits; + if (GET_MODE_NUNITS (op_mode).is_constant (&nunits) + && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS)) + { + int offset = BYTES_BIG_ENDIAN ? 0 : nunits - XVECLEN (sel, 0); + return rtvec_series_p (XVEC (sel, 0), offset); + } + return false; +} + /* Return true if, for all OP of mode OP_MODE: (vec_select:RESULT_MODE OP SEL) diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h index e1642424db8..542dc7898be 100644 --- a/gcc/rtlanal.h +++ b/gcc/rtlanal.h @@ -331,6 +331,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base () collecting the references a second time. 
*/ using vec_rtx_properties = growing_rtx_properties<vec_rtx_properties_base>; +bool +vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, + rtx sel); + bool vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel); diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c new file mode 100644 index 00000000000..ecc02e652a4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c @@ -0,0 +1,85 @@ +/* { dg-do compile } */ +/* { dg-options "-O3" } */ + +#include <arm_neon.h> + +#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \ + vget_high_ ## ts (c)); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16) +TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16) +TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32) +TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64) + +#define TEST_MULL_N(name, rettype, intype, ts, rs) \ + rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \ + { \ + rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \ + rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \ + return vqaddq ## _ ## rs (t0, t1); \ + } + +TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32) +TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64) +TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64) + +TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32) +TEST_MULL_N (vqdmull_n, 
int64x2_t, int32x4_t, s32, s64) + +#define TEST_MLXL_VEC(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \ + intype c) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (b)); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), \ + vget_high_ ## ts (c)); \ + } + +TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16) + +TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8) +TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8) +TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16) +TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16) + +#define TEST_MLXL_N(name, rettype, intype, ts) \ + rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \ + { \ + acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \ + } + +TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16) +TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32) +TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32) + +TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32) + +TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16) +TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32) + +/* { dg-final { scan-assembler-not "dup\\t" } } */
reply other threads:[~2021-08-04 15:59 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20210804155915.50FED394AC1B@sourceware.org \ --to=jonwri01@gcc.gnu.org \ --cc=gcc-cvs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).