[gcc r12-2741] aarch64: Don't include vec_select high-half in SIMD multiply cost

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

From: Jonathan Wright <jonwri01@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r12-2741] aarch64: Don't include vec_select high-half in SIMD multiply cost
Date: Wed,  4 Aug 2021 15:59:15 +0000 (GMT)	[thread overview]
Message-ID: <20210804155915.50FED394AC1B@sourceware.org> (raw)

https://gcc.gnu.org/g:63834c84d43fc2eeeaa054c5e24d1e468e9eddab

commit r12-2741-g63834c84d43fc2eeeaa054c5e24d1e468e9eddab
Author: Jonathan Wright <jonathan.wright@arm.com>
Date:   Mon Jul 19 10:19:30 2021 +0100

    aarch64: Don't include vec_select high-half in SIMD multiply cost
    
    The Neon multiply/multiply-accumulate/multiply-subtract instructions
    can select the top or bottom half of the operand registers. This
    selection does not change the cost of the underlying instruction and
    this should be reflected by the RTL cost function.
    
    This patch adds RTL tree traversal in the Neon multiply cost function
    to match vec_select high-half of its operands. This traversal
    prevents the cost of the vec_select from being added into the cost of
    the multiply - meaning that these instructions can now be emitted in
    the combine pass as they are no longer deemed prohibitively
    expensive.
    
    gcc/ChangeLog:
    
    2021-07-19  Jonathan Wright  <jonathan.wright@arm.com>
    
            * config/aarch64/aarch64.c (aarch64_strip_extend_vec_half):
            Define.
            (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
            vec_select high-half from being added into Neon multiply
            cost.
            * rtlanal.c (vec_series_highpart_p): Define.
            * rtlanal.h (vec_series_highpart_p): Declare.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/vmul_high_cost.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.c                      | 22 ++++++
 gcc/rtlanal.c                                     | 19 +++++
 gcc/rtlanal.h                                     |  4 ++
 gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c | 85 +++++++++++++++++++++++
 4 files changed, 130 insertions(+)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 23829bb691c..e02cbcbcb38 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -78,6 +78,7 @@
 #include "gimple-pretty-print.h"
 #include "tree-ssa-loop-niter.h"
 #include "fractional-cost.h"
+#include "rtlanal.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -12046,6 +12047,22 @@ aarch64_strip_extend (rtx x, bool strip_shift)
   return x;
 }
 
+/* Helper function for rtx cost calculation. Strip extension as well as any
+   inner VEC_SELECT high-half from X. Returns the inner vector operand if
+   successful, or the original expression on failure.  */
+static rtx
+aarch64_strip_extend_vec_half (rtx x)
+{
+  if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
+    {
+      x = XEXP (x, 0);
+      if (GET_CODE (x) == VEC_SELECT
+	  && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
+				    XEXP (x, 1)))
+	x = XEXP (x, 0);
+    }
+  return x;
+}
 
 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
    any subsequent extend and VEC_SELECT from X. Returns the inner scalar
@@ -12133,6 +12150,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
       if (vec_flags & VEC_ADVSIMD)
 	{
+	  /* The select-operand-high-half versions of the instruction have the
+	     same cost as the three vector version - don't add the costs of the
+	     extension or selection into the costs of the multiply.  */
+	  op0 = aarch64_strip_extend_vec_half (op0);
+	  op1 = aarch64_strip_extend_vec_half (op1);
 	  /* The by-element versions of the instruction have the same costs as
 	     the normal 3-vector version.  We make an assumption that the input
 	     to the VEC_DUPLICATE is already on the FP & SIMD side.  This means
diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c
index f7f3acb75db..d37f7789b20 100644
--- a/gcc/rtlanal.c
+++ b/gcc/rtlanal.c
@@ -6953,6 +6953,25 @@ register_asm_p (const_rtx x)
 	  && DECL_REGISTER (REG_EXPR (x)));
 }
 
+/* Return true if, for all OP of mode OP_MODE:
+
+     (vec_select:RESULT_MODE OP SEL)
+
+   is equivalent to the highpart RESULT_MODE of OP.  */
+
+bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel)
+{
+  int nunits;
+  if (GET_MODE_NUNITS (op_mode).is_constant (&nunits)
+      && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS))
+    {
+      int offset = BYTES_BIG_ENDIAN ? 0 : nunits - XVECLEN (sel, 0);
+      return rtvec_series_p (XVEC (sel, 0), offset);
+    }
+  return false;
+}
+
 /* Return true if, for all OP of mode OP_MODE:
 
      (vec_select:RESULT_MODE OP SEL)
diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h
index e1642424db8..542dc7898be 100644
--- a/gcc/rtlanal.h
+++ b/gcc/rtlanal.h
@@ -331,6 +331,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base ()
    collecting the references a second time.  */
 using vec_rtx_properties = growing_rtx_properties<vec_rtx_properties_base>;
 
+bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode,
+		       rtx sel);
+
 bool
 vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel);
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c
new file mode 100644
index 00000000000..ecc02e652a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+	{ \
+		rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \
+					      vget_high_ ## ts (c)); \
+		rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \
+					      vget_high_ ## ts (c)); \
+		return vqaddq ## _ ## rs (t0, t1); \
+	}
+
+TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16)
+TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16)
+TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MULL_N(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+	{ \
+		rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \
+		rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \
+		return vqaddq ## _ ## rs (t0, t1); \
+	}
+
+TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MLXL_VEC(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \
+				    intype c) \
+	{ \
+		acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \
+					    vget_high_ ## ts (b)); \
+		return name ## _ ## ts (acc, vget_high_ ## ts (a), \
+					     vget_high_ ## ts (c)); \
+	}
+
+TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16)
+
+TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16)
+
+#define TEST_MLXL_N(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \
+	{ \
+		acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+		return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+	}
+
+TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */

                 reply	other threads:[~2021-08-04 15:59 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210804155915.50FED394AC1B@sourceware.org \
    --to=jonwri01@gcc.gnu.org \
    --cc=gcc-cvs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).