[gcc r12-2740] aarch64: Don't include vec_select element in SIMD multiply cost

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

From: Jonathan Wright <jonwri01@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r12-2740] aarch64: Don't include vec_select element in SIMD multiply cost
Date: Wed,  4 Aug 2021 15:59:10 +0000 (GMT)	[thread overview]
Message-ID: <20210804155910.328DD3857408@sourceware.org> (raw)

https://gcc.gnu.org/g:1d65c9d25199264bc8909018df1b0dca71c0b32d

commit r12-2740-g1d65c9d25199264bc8909018df1b0dca71c0b32d
Author: Jonathan Wright <jonathan.wright@arm.com>
Date:   Mon Jul 19 14:01:52 2021 +0100

    aarch64: Don't include vec_select element in SIMD multiply cost
    
    The Neon multiply/multiply-accumulate/multiply-subtract instructions
    can take various forms - multiplying full vector registers of values
    or multiplying one vector by a single element of another. Regardless
    of the form used, these instructions have the same cost, and this
    should be reflected by the RTL cost function.
    
    This patch adds RTL tree traversal in the Neon multiply cost function
    to match the vec_select used by the lane-referencing forms of the
    instructions already mentioned. This traversal prevents the cost of
    the vec_select from being added into the cost of the multiply -
    meaning that these instructions can now be emitted in the combine
    pass as they are no longer deemed prohibitively expensive.
    
    gcc/ChangeLog:
    
    2021-07-19  Jonathan Wright  <jonathan.wright@arm.com>
    
            * config/aarch64/aarch64.c (aarch64_strip_duplicate_vec_elt):
            Define.
            (aarch64_rtx_mult_cost): Traverse RTL tree to prevent
            vec_select cost from being added into Neon multiply cost.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/vmul_element_cost.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.c                       | 34 +++++---
 .../gcc.target/aarch64/vmul_element_cost.c         | 94 ++++++++++++++++++++++
 2 files changed, 119 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 81c002ba0b0..23829bb691c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12046,6 +12046,26 @@ aarch64_strip_extend (rtx x, bool strip_shift)
   return x;
 }
 
+
+/* Helper function for rtx cost calculation.  If X is a VEC_DUPLICATE of a
+   scalar, strip it and any VEC_SELECT (or extend of a VEC_SELECT) directly
+   beneath it, returning the operand left over.  Otherwise return X.  */
+static rtx
+aarch64_strip_duplicate_vec_elt (rtx x)
+{
+  if (GET_CODE (x) == VEC_DUPLICATE
+      && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
+    {
+      x = XEXP (x, 0);
+      if (GET_CODE (x) == VEC_SELECT)
+	x = XEXP (x, 0);
+      else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
+	       && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
+	x = XEXP (XEXP (x, 0), 0);
+    }
+  return x;
+}
+
 /* Return true iff CODE is a shift supported in combination
    with arithmetic instructions.  */
 
@@ -12114,15 +12134,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
       if (vec_flags & VEC_ADVSIMD)
 	{
 	  /* The by-element versions of the instruction have the same costs as
-	     the normal 3-vector version.  So don't add the costs of the
-	     duplicate into the costs of the multiply.  We make an assumption
-	     that the input to the VEC_DUPLICATE is already on the FP & SIMD
-	     side.  This means costing of a MUL by element pre RA is a bit
-	     optimistic.  */
-	  if (GET_CODE (op0) == VEC_DUPLICATE)
-	    op0 = XEXP (op0, 0);
-	  else if (GET_CODE (op1) == VEC_DUPLICATE)
-	    op1 = XEXP (op1, 0);
+	     the normal 3-vector version.  We make an assumption that the input
+	     to the VEC_DUPLICATE is already on the FP & SIMD side.  This means
+	     costing of a MUL by element pre RA is a bit optimistic.  */
+	  op0 = aarch64_strip_duplicate_vec_elt (op0);
+	  op1 = aarch64_strip_duplicate_vec_elt (op1);
 	}
       cost += rtx_cost (op0, mode, MULT, 0, speed);
       cost += rtx_cost (op1, mode, MULT, 1, speed);
diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c
new file mode 100644
index 00000000000..c153775f091
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmul_element_cost.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+/* Two NAME _n_ calls sharing element c[1], then a vmul of the two results.  */
+#define TEST_MUL_UNIFORM(name, q, vectype, ts) \
+  vectype test_ ## name ## q ## _ ## ts (vectype a, vectype b, vectype c) \
+	{ \
+		vectype t0 = name ## q ## _n_ ## ts (a, c[1]); \
+		vectype t1 = name ## q ## _n_ ## ts (b, c[1]); \
+		return vmul ## q ## _ ## ts (t0, t1); \
+	}
+
+TEST_MUL_UNIFORM (vmul, , int16x4_t, s16)
+TEST_MUL_UNIFORM (vmul, , uint16x4_t, u16)
+TEST_MUL_UNIFORM (vmul, , int32x2_t, s32)
+TEST_MUL_UNIFORM (vmul, , uint32x2_t, u32)
+TEST_MUL_UNIFORM (vmul, , float32x2_t, f32)
+TEST_MUL_UNIFORM (vmul, q, int16x8_t, s16)
+TEST_MUL_UNIFORM (vmul, q, uint16x8_t, u16)
+TEST_MUL_UNIFORM (vmul, q, int32x4_t, s32)
+TEST_MUL_UNIFORM (vmul, q, uint32x4_t, u32)
+TEST_MUL_UNIFORM (vmul, q, float32x4_t, f32)
+TEST_MUL_UNIFORM (vmul, q, float64x2_t, f64)
+/* Chain two NAME _n_ calls through ACC, both using element b[1].  */
+#define TEST_MLX_UNIFORM(name, q, vectype, ts) \
+  vectype test_ ## name ## q ## _ ## ts (vectype acc, vectype a, vectype b) \
+	{ \
+		acc = name ## q ## _n_ ## ts (acc, a, b[1]); \
+		return name ## q ## _n_ ## ts (acc, a, b[1]); \
+	}
+
+TEST_MLX_UNIFORM (vmla, , int16x4_t, s16)
+TEST_MLX_UNIFORM (vmla, , uint16x4_t, u16)
+TEST_MLX_UNIFORM (vmla, , int32x2_t, s32)
+TEST_MLX_UNIFORM (vmla, , uint32x2_t, u32)
+TEST_MLX_UNIFORM (vmla, , float32x2_t, f32)
+TEST_MLX_UNIFORM (vmla, q, int16x8_t, s16)
+TEST_MLX_UNIFORM (vmla, q, uint16x8_t, u16)
+TEST_MLX_UNIFORM (vmla, q, int32x4_t, s32)
+TEST_MLX_UNIFORM (vmla, q, uint32x4_t, u32)
+TEST_MLX_UNIFORM (vmla, q, float32x4_t, f32)
+
+TEST_MLX_UNIFORM (vmls, , int16x4_t, s16)
+TEST_MLX_UNIFORM (vmls, , uint16x4_t, u16)
+TEST_MLX_UNIFORM (vmls, , int32x2_t, s32)
+TEST_MLX_UNIFORM (vmls, , uint32x2_t, u32)
+TEST_MLX_UNIFORM (vmls, , float32x2_t, f32)
+TEST_MLX_UNIFORM (vmls, q, int16x8_t, s16)
+TEST_MLX_UNIFORM (vmls, q, uint16x8_t, u16)
+TEST_MLX_UNIFORM (vmls, q, int32x4_t, s32)
+TEST_MLX_UNIFORM (vmls, q, uint32x4_t, u32)
+TEST_MLX_UNIFORM (vmls, q, float32x4_t, f32)
+/* Two NAME calls sharing element c[1]; results are combined with vqaddq.  */
+#define TEST_MUL_LONG(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## ts (intype a, intype b, intype c) \
+	{ \
+		rettype t0 = name ## ts (a, c[1]); \
+		rettype t1 = name ## ts (b, c[1]); \
+		return vqaddq ## _ ## rs (t0, t1); \
+	}
+
+TEST_MUL_LONG (vmull_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MUL_LONG (vmull_n_, uint32x4_t, uint16x4_t, u16, u32)
+TEST_MUL_LONG (vmull_n_, int64x2_t, int32x2_t, s32, s64)
+TEST_MUL_LONG (vmull_n_, uint64x2_t, uint32x2_t, u32, u64)
+
+TEST_MUL_LONG (vqdmull_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MUL_LONG (vqdmull_n_, int64x2_t, int32x2_t, s32, s64)
+/* Chain two NAME calls through ACC, both using element b[1].  */
+#define TEST_MLX_LONG(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \
+	{ \
+		acc = name ## ts (acc, a, b[1]); \
+		return name ## ts (acc, a, b[1]); \
+	}
+
+TEST_MLX_LONG (vmlal_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vmlal_n_, uint32x4_t, uint16x4_t, u16, u32)
+TEST_MLX_LONG (vmlal_n_, int64x2_t, int32x2_t, s32, s64)
+TEST_MLX_LONG (vmlal_n_, uint64x2_t, uint32x2_t, u32, u64)
+
+TEST_MLX_LONG (vmlsl_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vmlsl_n_, uint32x4_t, uint16x4_t, u16, u32)
+TEST_MLX_LONG (vmlsl_n_, int64x2_t, int32x2_t, s32, s64)
+TEST_MLX_LONG (vmlsl_n_, uint64x2_t, uint32x2_t, u32, u64)
+
+TEST_MLX_LONG (vqdmlal_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vqdmlal_n_, int64x2_t, int32x2_t, s32, s64)
+
+TEST_MLX_LONG (vqdmlsl_n_, int32x4_t, int16x4_t, s16, s32)
+TEST_MLX_LONG (vqdmlsl_n_, int64x2_t, int32x2_t, s32, s64)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */

                 reply	other threads:[~2021-08-04 15:59 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210804155910.328DD3857408@sourceware.org \
    --to=jonwri01@gcc.gnu.org \
    --cc=gcc-cvs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).