public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-4828] AArch64: Add better costing for vector constants and operations
@ 2021-11-01 13:50 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2021-11-01 13:50 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:1d5c43db79b7ea14f0dc071beb40cf80de90eb86

commit r12-4828-g1d5c43db79b7ea14f0dc071beb40cf80de90eb86
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Mon Nov 1 13:49:46 2021 +0000

    AArch64: Add better costing for vector constants and operations
    
    This patch adds extended costing to cost the creation of constants and the
    manipulation of constants.  The default values provided are based on
    architectural expectations and each cost models can be individually tweaked as
    needed.
    
    The changes in this patch covers:
    
    * Construction of PARALLEL or CONST_VECTOR:
      Adds better costing for vector of constants which is based on the constant
      being created and the instruction that can be used to create it.  i.e. a movi
      is cheaper than a literal load etc.
    * Construction of a vector through a vec_dup.
    
    gcc/ChangeLog:
    
            * config/arm/aarch-common-protos.h (struct vector_cost_table): Add
            movi, dup and extract costing fields.
            * config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
            thunderx_extra_costs, thunderx2t99_extra_costs,
            thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
            them.
            * config/arm/aarch-cost-tables.h (generic_extra_costs,
            cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
            exynosm1_extra_costs, xgene1_extra_costs): Likewise
            * config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
            * config/aarch64/aarch64.c (aarch64_rtx_costs): Add extra costs.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/vect-cse-codegen.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-cost-tables.h           | 30 +++++--
 gcc/config/aarch64/aarch64-simd.md                 | 10 ++-
 gcc/config/aarch64/aarch64.c                       | 38 ++++++++-
 gcc/config/arm/aarch-common-protos.h               |  3 +
 gcc/config/arm/aarch-cost-tables.h                 | 30 +++++--
 .../gcc.target/aarch64/vect-cse-codegen.c          | 97 ++++++++++++++++++++++
 6 files changed, 188 insertions(+), 20 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb1..bb499a1eae6 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* mult.  */
+    COSTS_N_INSNS (4),	/* mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* Alu.  */
-    COSTS_N_INSNS (4)	/* Mult.  */
+    COSTS_N_INSNS (4),	/* Mult.  */
+    COSTS_N_INSNS (1),	/* movi.  */
+    COSTS_N_INSNS (2),	/* dup.  */
+    COSTS_N_INSNS (2)	/* extract.  */
   }
 };
 
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 29f381728a3..61c3d7e195c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@
 )
 
 (define_insn "aarch64_simd_dup<mode>"
-  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
 	(vec_duplicate:VDQF_F16
-	  (match_operand:<VEL> 1 "register_operand" "w")))]
+	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
   "TARGET_SIMD"
-  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
-  [(set_attr "type" "neon_dup<q>")]
+  "@
+   dup\\t%0.<Vtype>, %1.<Vetype>[0]
+   dup\\t%0.<Vtype>, %<vw>1"
+  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
 )
 
 (define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 8448e56443c..fd9249c62b3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12701,7 +12701,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
   rtx op0, op1, op2;
   const struct cpu_cost_table *extra_cost
     = aarch64_tune_params.insn_extra_cost;
-  int code = GET_CODE (x);
+  rtx_code code = GET_CODE (x);
   scalar_int_mode int_mode;
 
   /* By default, assume that everything has equivalent cost to the
@@ -13462,8 +13462,7 @@ cost_plus:
 
 	 we must cost the explicit register move.  */
       if (mode == DImode
-	  && GET_MODE (op0) == SImode
-	  && outer == SET)
+	  && GET_MODE (op0) == SImode)
 	{
 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
 
@@ -14002,8 +14001,39 @@ cost_plus:
 			     mode, MULT, 1, speed);
           return true;
         }
+	break;
+    case CONST_VECTOR:
+	{
+	  /* Load using MOVI/MVNI.  */
+	  if (aarch64_simd_valid_immediate (x, NULL))
+	    *cost = extra_cost->vect.movi;
+	  else /* Load using constant pool.  */
+	    *cost = extra_cost->ldst.load;
+	  break;
+	}
+    case VEC_CONCAT:
+	/* depending on the operation, either DUP or INS.
+	   For now, keep default costing.  */
+	break;
+    case VEC_DUPLICATE:
+	/* Load using a DUP.  */
+	*cost = extra_cost->vect.dup;
+	return false;
+    case VEC_SELECT:
+	{
+	  rtx op0 = XEXP (x, 0);
+	  *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
 
-      /* Fall through.  */
+	  /* cost subreg of 0 as free, otherwise as DUP */
+	  rtx op1 = XEXP (x, 1);
+	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+	    ;
+	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+	    *cost = extra_cost->vect.dup;
+	  else
+	    *cost = extra_cost->vect.extract;
+	  return true;
+	}
     default:
       break;
     }
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083..55a470d8e14 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
 {
   const int alu;
   const int mult;
+  const int movi;
+  const int dup;
+  const int extract;
 };
 
 struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01f..0e6a62665c7 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),	/* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (1),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (0),  /* alu.  */
-    COSTS_N_INSNS (4)   /* mult.  */
+    COSTS_N_INSNS (4),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
   /* Vector */
   {
     COSTS_N_INSNS (2),  /* alu.  */
-    COSTS_N_INSNS (8)   /* mult.  */
+    COSTS_N_INSNS (8),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (2),  /* dup.  */
+    COSTS_N_INSNS (2)   /* extract.  */
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 00000000000..d025e989a1e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,97 @@
+/* { dg-do compile  { target { lp64 } } } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+  uint64_t res = a | arr[0];
+  uint64x2_t val = vld1q_u64 (arr);
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test2:
+**	adrp	x[0-9]+, .LC[0-1]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	str	q[0-9]+, \[x[0-9]+\]
+**	fmov	x[0-9]+, d[0-9]+
+**	orr	x[0-9]+, x[0-9]+, x[0-9]+
+**	ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+  uint64_t arr = vgetq_lane_u64 (val, 0);
+  uint64_t res = a | arr;
+  *rt = vaddq_u64 (val, b);
+  return res;
+}
+
+/*
+**test3:
+**	adrp	x[0-9]+, .LC[0-9]+
+**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	str	q[0-9]+, \[x1\]
+**	fmov	w[0-9]+, s[0-9]+
+**	orr	w[0-9]+, w[0-9]+, w[0-9]+
+**	ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+  uint32_t res = a | arr[0];
+  uint32x4_t val = vld1q_u32 (arr);
+  *rt = vaddq_u32 (val, b);
+  return res;
+}
+
+/*
+**test4:
+**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
+**	mov	x[0-9]+, 16512
+**	movk	x[0-9]+, 0x1020, lsl 16
+**	movk	x[0-9]+, 0x408, lsl 32
+**	movk	x[0-9]+, 0x102, lsl 48
+**	fmov	d[0-9]+, x[0-9]+
+**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
+**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+**	umov	w[0-9]+, v[0-9]+.h\[3\]
+**	ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+    uint8x16_t bool_input = vshrq_n_u8(input, 7);
+    poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+    poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+                               vgetq_lane_p64(mask, 0));
+    poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+    uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+    return vget_lane_u16((uint16x4_t)res, 3);
+}
+


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-11-01 13:50 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-01 13:50 [gcc r12-4828] AArch64: Add better costing for vector constants and operations Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).