Hi, Following the discussion below here's a revised patch. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/arm/aarch-common-protos.h (struct vector_cost_table): Add movi, dup and extract costing fields. * config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs, thunderx_extra_costs, thunderx2t99_extra_costs, thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use them. * config/arm/aarch-cost-tables.h (generic_extra_costs, cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs, exynosm1_extra_costs, xgene1_extra_costs): Likewise * config/aarch64/aarch64-simd.md (aarch64_simd_dup): Add r->w dup. * config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose. (aarch64_rtx_costs): Add extra costs. (aarch64_simd_dup_constant): Support check only mode. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vect-cse-codegen.c: New test. --- inline copy of patch --- diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* Alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* Alu. */ - COSTS_N_INSNS (4) /* Mult. */ + COSTS_N_INSNS (4), /* Mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* Alu. */ - COSTS_N_INSNS (4) /* Mult. */ + COSTS_N_INSNS (4), /* Mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. 
*/ } }; diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup" ) (define_insn "aarch64_simd_dup" - [(set (match_operand:VDQF_F16 0 "register_operand" "=w") + [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w") (vec_duplicate:VDQF_F16 - (match_operand: 1 "register_operand" "w")))] + (match_operand: 1 "register_operand" "w,r")))] "TARGET_SIMD" - "dup\\t%0., %1.[0]" - [(set_attr "type" "neon_dup")] + "@ + dup\\t%0., %1.[0] + dup\\t%0., %1" + [(set_attr "type" "neon_dup, neon_from_gp")] ) (define_insn "aarch64_dup_lane" diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 699c105a42a613c06c462e2de686795279d85bc9..1fb4350916572c915e5af339102444daf324efc7 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -303,6 +303,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, aarch64_addr_query_type); static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); +static rtx aarch64_simd_make_constant (rtx, bool); /* Major revision number of the ARM Architecture implemented by the target. */ unsigned aarch64_architecture_version; @@ -12705,7 +12706,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, rtx op0, op1, op2; const struct cpu_cost_table *extra_cost = aarch64_tune_params.insn_extra_cost; - int code = GET_CODE (x); + rtx_code code = GET_CODE (x); scalar_int_mode int_mode; /* By default, assume that everything has equivalent cost to the @@ -14006,8 +14007,52 @@ cost_plus: mode, MULT, 1, speed); return true; } + break; + case CONST_VECTOR: + { + rtx gen_insn = aarch64_simd_make_constant (x, true); + /* Not a valid const vector. */ + if (!gen_insn) + break; - /* Fall through. */ + switch (GET_CODE (gen_insn)) + { + case CONST_VECTOR: + /* Load using MOVI/MVNI. */ + if (aarch64_simd_valid_immediate (x, NULL)) + *cost += extra_cost->vect.movi; + else /* Load using constant pool. */ + *cost += extra_cost->ldst.load; + break; + /* Load using a DUP. */ + case VEC_DUPLICATE: + gcc_unreachable (); + break; + default: + *cost += extra_cost->ldst.load; + break; + } + return true; + } + case VEC_CONCAT: + /* depending on the operation, either DUP or INS. + For now, keep default costing. */ + break; + case VEC_DUPLICATE: + *cost += extra_cost->vect.dup; + return true; + case VEC_SELECT: + { + /* cost subreg of 0 as free, otherwise as DUP */ + rtx op1 = XEXP (x, 1); + if (vec_series_lowpart_p (mode, GET_MODE (op1), op1)) + ; + else if (vec_series_highpart_p (mode, GET_MODE (op1), op1)) + *cost += extra_cost->vect.dup; + else + *cost += extra_cost->vect.extract; + return true; + } default: break; } @@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode, /* If VALS is a vector constant that can be loaded into a register using DUP, generate instructions to do so and return an RTX to - assign to the register. Otherwise return NULL_RTX. */ + assign to the register. Otherwise return NULL_RTX. + + If CHECK then the resulting instruction may not be used in + codegen but can be used for costing. 
*/ static rtx -aarch64_simd_dup_constant (rtx vals) +aarch64_simd_dup_constant (rtx vals, bool check = false) { machine_mode mode = GET_MODE (vals); machine_mode inner_mode = GET_MODE_INNER (mode); @@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals) /* We can load this constant by using DUP and a constant in a single ARM register. This will be cheaper than a vector load. */ - x = copy_to_mode_reg (inner_mode, x); + if (!check) + x = copy_to_mode_reg (inner_mode, x); return gen_vec_duplicate (mode, x); } @@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals) /* Generate code to load VALS, which is a PARALLEL containing only constants (for vec_init) or CONST_VECTOR, efficiently into a register. Returns an RTX to copy into the register, or NULL_RTX - for a PARALLEL that cannot be converted into a CONST_VECTOR. */ + for a PARALLEL that cannot be converted into a CONST_VECTOR. + + If CHECK then the resulting instruction may not be used in + codegen but can be used for costing. */ static rtx -aarch64_simd_make_constant (rtx vals) +aarch64_simd_make_constant (rtx vals, bool check = false) { machine_mode mode = GET_MODE (vals); rtx const_dup; @@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals) && aarch64_simd_valid_immediate (const_vec, NULL)) /* Load using MOVI/MVNI. */ return const_vec; - else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX) + else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX) /* Loaded using DUP. */ return const_dup; else if (const_vec != NULL_RTX) diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644 --- a/gcc/config/arm/aarch-common-protos.h +++ b/gcc/config/arm/aarch-common-protos.h @@ -133,6 +133,9 @@ struct vector_cost_table { const int alu; const int mult; + const int movi; + const int dup; + const int extract; }; struct cpu_cost_table diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644 --- a/gcc/config/arm/aarch-cost-tables.h +++ b/gcc/config/arm/aarch-cost-tables.h @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs = /* Vector */ { COSTS_N_INSNS (0), /* alu. */ - COSTS_N_INSNS (4) /* mult. 
*/ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs = /* Vector */ { COSTS_N_INSNS (2), /* alu. */ - COSTS_N_INSNS (8) /* mult. */ + COSTS_N_INSNS (8), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c new file mode 100644 index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c @@ -0,0 +1,97 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#include + +/* +**test1: +** adrp x[0-9]+, .LC[0-9]+ +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] +** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** str q[0-9]+, \[x[0-9]+\] +** fmov x[0-9]+, d[0-9]+ +** orr x[0-9]+, x[0-9]+, x[0-9]+ +** ret +*/ + +uint64_t +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt) +{ + uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL}; + uint64_t res = a | arr[0]; + uint64x2_t val = vld1q_u64 (arr); + *rt = vaddq_u64 (val, b); + return res; +} + +/* +**test2: +** adrp x[0-9]+, .LC[0-1]+ +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] +** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** str q[0-9]+, \[x[0-9]+\] +** fmov x[0-9]+, d[0-9]+ +** orr x[0-9]+, x[0-9]+, x[0-9]+ +** ret +*/ + +uint64_t +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt) +{ + uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL); + uint64_t arr = vgetq_lane_u64 (val, 0); + uint64_t res = a | arr; + *rt = vaddq_u64 (val, b); + return res; +} + +/* +**test3: +** adrp x[0-9]+, .LC[0-9]+ +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** str q[0-9]+, \[x1\] +** fmov w[0-9]+, s[0-9]+ +** orr w[0-9]+, w[0-9]+, w[0-9]+ +** ret +*/ + +uint32_t +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt) +{ + uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 }; + uint32_t res = a | arr[0]; + uint32x4_t val = vld1q_u32 (arr); + *rt = vaddq_u32 (val, b); + return res; +} + +/* +**test4: +** ushr v[0-9]+.16b, v[0-9]+.16b, 7 +** mov x[0-9]+, 16512 +** movk x[0-9]+, 0x1020, lsl 16 +** movk x[0-9]+, 0x408, lsl 32 +** movk x[0-9]+, 0x102, lsl 48 +** fmov d[0-9]+, x[0-9]+ +** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d +** dup v[0-9]+.2d, v[0-9]+.d\[0\] +** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d +** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b +** umov w[0-9]+, v[0-9]+.h\[3\] +** ret +*/ + +uint64_t +test4 (uint8x16_t input) +{ + uint8x16_t bool_input = vshrq_n_u8(input, 7); + poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL); + poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0), + vgetq_lane_p64(mask, 0)); + poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask); + uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH); + return vget_lane_u16((uint16x4_t)res, 3); +} + > -----Original Message----- > From: Richard Sandiford > Sent: Monday, October 25, 2021 3:32 PM > To: Tamar Christina > Cc: Tamar Christina via Gcc-patches ; Richard > Earnshaw ; nd ; Marcus > Shawcroft > Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants > and operations > > Tamar Christina 
writes: > >> -----Original Message----- > >> From: Richard Sandiford > >> Sent: Monday, October 25, 2021 10:54 AM > >> To: Tamar Christina > >> Cc: Tamar Christina via Gcc-patches ; > >> Richard Earnshaw ; nd ; > Marcus > >> Shawcroft > >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector > >> constants and operations > >> > >> Tamar Christina writes: > >> >> -----Original Message----- > >> >> From: Richard Sandiford > >> >> Sent: Saturday, October 23, 2021 11:40 AM > >> >> To: Tamar Christina via Gcc-patches > >> >> Cc: Tamar Christina ; Richard Earnshaw > >> >> ; nd ; Marcus > Shawcroft > >> >> > >> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector > >> >> constants and operations > >> >> > >> >> Tamar Christina via Gcc-patches writes: > >> >> >> I'm still a bit sceptical about treating the high-part cost as lower. > >> >> >> ISTM that the subreg cases are the ones that are truly “free” > >> >> >> and any others should have a normal cost. So if CSE handled > >> >> >> the subreg case itself (to model how the rtx would actually be > >> >> >> generated) then > >> >> >> aarch64 code would have to do less work. I imagine that will > >> >> >> be true for > >> >> other targets as well. > >> >> > > >> >> > I guess the main problem is that CSE lacks context because it's > >> >> > not until after combine that the high part becomes truly "free" > >> >> > when pushed > >> >> into a high operation. > >> >> > >> >> Yeah. And the aarch64 code is just being asked to cost the > >> >> operation it's given, which could for example come from an > >> >> existing aarch64_simd_mov_from_high. I think we should try > >> >> to ensure that a aarch64_simd_mov_from_high followed by > some > >> arithmetic > >> >> on the result is more expensive than the fused operation (when > >> >> fusing is possible). > >> >> > >> >> An analogy might be: if the cost code is given: > >> >> > >> >> (add (reg X) (reg Y)) > >> >> > >> >> then, at some later point, the (reg X) might be replaced with a > >> >> multiplication, in which case we'd have a MADD operation and the > >> >> addition is effectively free. Something similar would happen if > >> >> (reg > >> >> X) became a shift by a small amount on newer cores, although I > >> >> guess then you could argue either that the cost of the add > >> >> disappears or that > >> the cost of the shift disappears. > >> >> > >> >> But we shouldn't count ADD as free on the basis that it could be > >> >> combined with a multiplication or shift in future. We have to > >> >> cost what we're given. I think the same thing applies to the high part. > >> >> > >> >> Here we're trying to prevent cse1 from replacing a DUP (lane) with > >> >> a MOVI by saying that the DUP is strictly cheaper than the MOVI. > >> >> I don't think that's really true though, and the cost tables in > >> >> the patch say that DUP is more expensive (rather than less > >> >> expensive) than > >> MOVI. > >> > > >> > No we're not. The front end has already pushed the constant into > >> > each operation that needs it which is the entire problem. > >> > >> I think we're talking about different things here. I'll come to the > >> gimple stuff below, but I was talking purely about the effect on the > >> RTL optimisers. 
What I meant above is that, in the cse1 dumps, the patch > leads to changes like: > >> > >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > >> - (const_vector:V8QI [ > >> + (vec_select:V8QI (reg:V16QI 116) > >> + (parallel:V16QI [ > >> + (const_int 8 [0x8]) > >> + (const_int 9 [0x9]) > >> + (const_int 10 [0xa]) > >> + (const_int 11 [0xb]) > >> + (const_int 12 [0xc]) > >> + (const_int 13 [0xd]) > >> + (const_int 14 [0xe]) > >> + (const_int 15 [0xf]) > >> + ]))) "include/arm_neon.h":6477:22 1394 > >> {aarch64_simd_mov_from_v16qihigh} > >> + (expr_list:REG_EQUAL (const_vector:V8QI [ > >> (const_int 3 [0x3]) repeated x8 > >> - ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi} > >> - (expr_list:REG_DEAD (reg:V16QI 117) > >> - (nil))) > >> + ]) > >> + (expr_list:REG_DEAD (reg:V16QI 117) > >> + (nil)))) > >> > >> The pre-cse1 code is: > >> > >> (insn 19 18 20 2 (set (reg:V16QI 117) > >> (const_vector:V16QI [ > >> (const_int 3 [0x3]) repeated x16 > >> ])) "include/arm_neon.h":6477:22 1166 {*aarch64_simd_movv16qi} > >> (nil)) > >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > >> (vec_select:V8QI (reg:V16QI 117) > >> (parallel:V16QI [ > >> (const_int 8 [0x8]) > >> (const_int 9 [0x9]) > >> (const_int 10 [0xa]) > >> (const_int 11 [0xb]) > >> (const_int 12 [0xc]) > >> (const_int 13 [0xd]) > >> (const_int 14 [0xe]) > >> (const_int 15 [0xf]) > >> ]))) "include/arm_neon.h":6477:22 1394 > >> {aarch64_simd_mov_from_v16qihigh} > >> (nil)) > >> > >> That is, before the patch, we folded insn 19 into insn 20 to get: > >> > >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > >> (const_vector:V8QI [ > >> (const_int 3 [0x3]) repeated x8 > >> ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi} > >> (expr_list:REG_DEAD (reg:V16QI 117) > >> (nil))) > >> > >> After the patch we reject that because: > >> > >> (set (reg:V8QI X) (const_vector:V8QI [3])) > >> > >> is costed as a MOVI (cost 4) and the original > >> aarch64_simd_mov_from_v16qihigh is costed as zero. In other words, > >> the patch makes the DUP (lane) in the “mov high” strictly cheaper > >> than a constant move (MOVI). > > > > Yes, this was done intentionally because as we talked about a month > > ago there's no real way to cost this correctly. The use of `X` there > > determines whether it's cheaper to use the movi over the dup. The > > MOVI not only prevent re-use of the value, it also prevents combining > > into high operations. All of which is impossible to tell currently in how CSE > and costing are done. > > > > This is an unmodified compiler created from last night's trunk > > https://godbolt.org/z/1saTP4xWs > > > > While yes, it did fold movi into the set, reg 19 wasn't dead, so you > > now materialized the constant 3 times > > > > test0: > > ldr q0, [x0] > > movi v3.8b, 0x3 <<<< first > > ldr q2, [x1] > > movi v5.16b, 0x3 <<< second > > uxtl v1.8h, v0.8b > > dup d4, v2.d[1] <<< third > > uxtl2 v0.8h, v0.16b > > umlal v1.8h, v2.8b, v5.8b > > umlal v0.8h, v4.8b, v3.8b > > addhn v0.8b, v1.8h, v0.8h > > str d0, [x2] > > ret > > > > whilst my patch, generates > > > > test0: > > movi v2.16b, 0x3 <<< once > > ldr q0, \[x0\] > > uxtl v1.8h, v0.8b > > uxtl2 v0.8h, v0.16b > > ldr q3, \[x1\] > > umlal v1.8h, v3.8b, v2.8b > > umlal2 v0.8h, v3.16b, v2.16b > > addhn v0.8b, v1.8h, v0.8h > > str d0, \[x2\] > > ret > > > > Yes it's not perfect, yes you can end up with a dup instead of two > > movi's but my argument is it's still a step forward as the perfect solution > doesn't seem to be possible at all with the way things are currently set up. 
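(For reference, since test0 is only shown above through its assembly and the Compiler Explorer link, a kernel along the following lines reproduces the quoted before/after sequences. It is an illustrative reconstruction, mirroring the test0_mod variant that appears further down, and not necessarily the exact source behind the link.)

#include <arm_neon.h>

/* Illustrative reconstruction of test0: the vector constant 3 feeds both
   halves of the widening multiply-accumulate, so ideally it is materialised
   once and its high half is consumed by umlal2 directly.  */
void
test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
{
  uint8x16_t x = vld1q_u8 (inptr0);
  uint8x16_t y = vld1q_u8 (inptr1);
  uint8x16_t three = vdupq_n_u8 (3);
  uint16x8_t z_l = vmlal_u8 (vmovl_u8 (vget_low_u8 (x)),
                             vget_low_u8 (y), vget_low_u8 (three));
  uint16x8_t z_h = vmlal_u8 (vmovl_u8 (vget_high_u8 (x)),
                             vget_high_u8 (y), vget_high_u8 (three));
  vst1_u8 (outptr0, vaddhn_u16 (z_l, z_h));
}

With the costing change, the vget_high_u8 (three) stays a vec_select of the 16-byte constant long enough for combine to form umlal2, which is what the single-movi sequence quoted above relies on.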
> > I agree there's no out-of-the-box way of getting what we want for the > original testcases. It would require changes outside the target or (if the > worst comes to the worst) a target-specific pass. > > >> Preventing this fold seems like a key part of being able to match the > >> *l2 forms in the testcase, since otherwise the “mov high” disappears > >> and isn't available for combining later. > > > > Yes, and by preventing the folding combine should in principle be able > > to fold it back if it wasn't pushed into another Instruction, but combine > does not attempt to touch constants and selects on their own. If it did this > "regression" would be fixed. > > The problem is that combine is limited to individual EBBs and only combines > def-use chains when there is a single use. It's not a general folding engine. > > > I'm not really quite sure what we're arguing about.. I did think about all > three possible cases when making this: > > > > https://godbolt.org/z/hjWhWq1v1 > > > > Of the three cases the compiler currently only generates something good > for test2. Both test1 and test0 are deficient. > > The patch doesn't change test2, significantly improves test0 and whether > test1 is a regression is likely uArch specific. > > > > On Arm Cortex CPUs it is not a regression as a DUP on a SIMD scalar > > has the same throughput and latencies as a MOVI according to the Arm > Performance Software Optimization guides. > > Costing them as equal would be OK when they are equal. It's the “DUP > (lane)/ mov high is strictly cheaper bit” I'm concerned about. > > > So to me this looks like an improvement overall. And this is where we likely > disagree? > > Well, the disagreement isn't about whether the new compiler output for > these testcases is better than the old compiler output. It's more a question > of how we're getting there. > > >> > MOVI as I mentioned before is the one case where this is a toss up. > >> > But there are far more constants that cannot be created with a movi. > >> > A simple example is > >> > > >> > #include > >> > > >> > int8x16_t square(int8x16_t full, int8x8_t small) { > >> > int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15}; > >> > int8x8_t low = vget_high_s8 (cst); > >> > int8x8_t res1 = vmul_s8 (small, low); > >> > return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, > >> > res1)); } > >> > > >> > Where in Gimple we get > >> > > >> > [local count: 1073741824]: > >> > _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, > >> > 8, 9, 10, 11, 12, > >> 13, 15, 0 }); > >> > _4 = _2 * small_3(D); > >> > _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }; > >> > _7 = __builtin_aarch64_combinev8qi (_4, _4); > >> > _8 = _6 + _7; > >> > return _8; > >> > > >> > Regardless of what happens to __builtin_aarch64_get_highv16qi > >> > nothing will recreate the relationship with cst, whether > >> __builtin_aarch64_get_highv16qi is lowered or not, constant prop will > >> still push in constants. > >> > >> Yeah, constants are (by design) free in gimple. But that's OK in > >> itself, because RTL optimisers have the job of removing any > >> duplicates that end up requiring separate moves. I think we both agree > on that. > >> > >> E.g. 
for: > >> > >> #include > >> > >> void foo(int8x16_t *x) { > >> x[0] = vaddq_s8 (x[0], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}); > >> x[1] = vaddq_s8 (x[1], (int8x16_t) > >> {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}); > >> } > >> > >> the final gimple is: > >> > >> [local count: 1073741824]: > >> _1 = *x_4(D); > >> _5 = _1 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; > >> *x_4(D) = _5; > >> _2 = MEM[(int8x16_t *)x_4(D) + 16B]; > >> _7 = _2 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; > >> MEM[(int8x16_t *)x_4(D) + 16B] = _7; > >> return; > >> > >> but cse1 removes the duplicated constant even before the patch. > > > > It doesn't for me, again an unmodified compiler: > > > > https://godbolt.org/z/qnvf7496h > > FWIW, the link for my example is: > > https://godbolt.org/z/G6vaE3nab > > but it sounds like the disagreement wasn't where I thought it was. > > > and CSE1 has as the final codegen: > > > > (insn 7 4 8 2 (set (reg:V16QI 99) > > (const_vector:V16QI [ > > (const_int 0 [0]) > > (const_int 1 [0x1]) > > (const_int 2 [0x2]) > > (const_int 3 [0x3]) > > (const_int 4 [0x4]) > > (const_int 5 [0x5]) > > (const_int 6 [0x6]) > > (const_int 7 [0x7]) > > (const_int 8 [0x8]) > > (const_int 9 [0x9]) > > (const_int 10 [0xa]) > > (const_int 11 [0xb]) > > (const_int 12 [0xc]) > > (const_int 13 [0xd]) > > (const_int 15 [0xf]) > > (const_int 0 [0]) > > ])) > > > > (insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ]) > > (const_vector:V8QI [ > > (const_int 8 [0x8]) > > (const_int 9 [0x9]) > > (const_int 10 [0xa]) > > (const_int 11 [0xb]) > > (const_int 12 [0xc]) > > (const_int 13 [0xd]) > > (const_int 15 [0xf]) > > (const_int 0 [0]) > > ])) > > > > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ]) > > (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ]) > > (parallel:V16QI [ > > (const_int 0 [0]) > > (const_int 1 [0x1]) > > (const_int 2 [0x2]) > > (const_int 3 [0x3]) > > (const_int 4 [0x4]) > > (const_int 5 [0x5]) > > (const_int 6 [0x6]) > > (const_int 7 [0x7]) > > ])) > > (reg:V8QI 93 [ _4 ]))) > > Here, insn 8 is the folded version of the vget_high_s8 and insn 11 is part of > the vcombine_s8. With that caveat… > > > So again same constant represented twice, which is reflected in the > codegen. > > …right, the above is also what I was saying that we generate before the patch > for your square example. > > But as you say later this testcase is demonstrating the point that constants > loaded from memory should be more expensive than DUP (lane). > I agree with that. The bit I don't agree with is costing the DUP (lane) as zero, > so that it's also strictly cheaper than MOVI. 
> > So I think the disagreement is more about things like the first example in the > testcase: > > https://godbolt.org/z/xrMnezrse > > Specifically: is it legitimate to fold: > > (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > (vec_select:V8QI (reg:V16QI 117) > (parallel:V16QI [ > (const_int 8 [0x8]) > (const_int 9 [0x9]) > (const_int 10 [0xa]) > (const_int 11 [0xb]) > (const_int 12 [0xc]) > (const_int 13 [0xd]) > (const_int 14 [0xe]) > (const_int 15 [0xf]) > ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64- > unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":6477:22 1394 > {aarch64_simd_mov_from_v16qihigh} > (nil)) > > to: > > (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > (const_vector:V8QI [ > (const_int 3 [0x3]) repeated x8 > ])) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64- > unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi} > (expr_list:REG_DEAD (reg:V16QI 117) > (nil))) > > without first trying to get rid of the instruction some other way (through > combine)? > > I think it is legitimate, since the new MOVI instruction is at least as cheap as > the original DUP. Even if CSE didn't do the fold itself, and just CSEd the two > uses of the V16QI constant, I think it would be legitimate for a later patch to > fold the instruction to a constant independently of CSE. > > IMO: > > vget_high_s8(vdupq_n_u8(3)) > > is just a roundabout way of writing: > > vdup_n_u8(3) > > We've described what vget_high_s8 does in target-independent rtl (i.e. > without unspecs) so it's natural that operations with constant operands will > themselves get folded to a constant. > > I think we should accept that and try to generate the output we want in an > environment where such folds do happen, rather than trying to prevent the > folds from happening until during or after combine. > > That approach could also work for autovec output, and cases where the user > wrote the 8-byte constants directly. E.g. I think we should aim to optimise: > > void test0_mod (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0) { > uint8x8_t three_u8 = vdup_n_u8(3); > uint8x16_t x = vld1q_u8(inptr0); > uint8x16_t y = vld1q_u8(inptr1); > uint16x8_t x_l = vmovl_u8(vget_low_u8(x)); > uint16x8_t x_h = vmovl_u8(vget_high_u8(x)); > uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), three_u8); > uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), three_u8); > vst1_u8(outptr0, vaddhn_u16(z_l, z_h)); } > > in the same way as the original test0. Similarly we should aim to optimise: > > int8x16_t square_mode(int8x16_t full, int8x8_t small) { > int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15}; > int8x8_t low = {8,9,10,11,12,13,15}; > int8x8_t res1 = vmul_s8 (small, low); > return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1)); } > > in the same way as square. > > >> so that there are no longer any duplicate constants (as far as the > >> RTL code is concerned). Instead we have one 16-byte constant and one 8- > byte constant. > >> > >> The patch prevents the fold on insn 8 by making the “mov high” > >> strictly cheaper than the constant move, so we keep the “mov high” > >> and its 16-byte input. Keeping the “mov high” means that we do have > >> a duplicate constant for CSE to remove. > >> > >> What I meant… > >> > >> >> Also, if I've understood correctly, it looks like we'd be relying > >> >> on the vget_high of a constant remaining unfolded until RTL cse1. 
> >> >> I think it's likely in future that we'd try to fold vget_high at > >> >> the gimple level instead, since that could expose more > >> >> optimisations of a different kind. The gimple optimisers would > >> >> then fold > >> >> vget_high(constant) in a similar way to > >> >> cse1 does now. > >> >> > >> >> So perhaps we should continue to allow the vget_high(constant) to > >> >> be foloded in cse1 and come up with some way of coping with the > >> >> folded > >> form. > >> > >> …here was that, in future, the gimple optimisers might be able to > >> fold the vget_high themselves. For your example, we'd then have: > >> > >> _4 = { 8, 9, 10, 11, 12, 13, 15, 0 } * small_3(D); > >> _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }; > >> _7 = __builtin_aarch64_combinev8qi (_4, _4); > >> _8 = _6 + _7; > >> return _8; > >> > >> In this situation, we'd need to recreate the relationship between { > >> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } and { 8, 9, 10, > >> 11, 12, 13, 15, 0 }. We can't ensure that the relationship is never lost. > >> > >> The same thing would be true for vget_low. So a constant like: > >> > >> cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } > >> … vget_low* (cst) ..; > >> … vget_high* (cst) …; > >> > >> could be folded to two smaller constants: > >> > >> … { 0, 1, 2, 3, 4, 5, 6, 7 } …; > >> … { 8, 9, 10, 11, 12, 13, 15, 0 } …; > >> > >> We might then need to recreate the combined form, rather than relying > >> on the combined form already existing. > > > > Yes but this is what confuses me. My patch changes it so that CSE1 > > which is ran relatively early is able to find the relationship between the two > constants. > > Yeah, it does that for the case where the vector constant is a duplicate of a > single element. My example above doesn't fall into that category though. > > What I was saying was: let's suppose that a vget_low/vget_high pair for a > general V16QI vector constant is folded at the gimple level (by later patches). > Then the RTL optimisers just see two V8QI constants rather than a single > V16QI constant. The optimisers would need to generate the V16QI “from > scratch” if they wanted to, as for test0_mod above. > > > CSE1 shouldn't do any folding, it doesn't have enough information to do so. > > By CSE doing folding it makes it so combine is less efficient. > > I don't agree with that as a general statement. I agree that stopping pre- > combine passes from folding helps examples like test0, but I don't think that > means that pre-combine passes are doing the wrong thing by folding. IMO > the problem is more that we are very opportunistic in looking for high-part > operations (and by-lane operations). Legitimate optimisations can easily > defeat this opportunistic matching. > > >> > CSE1 doesn't fold it, because for CSE the cost is too high to do > >> > so. Which is > >> what this costing was attempting to fix. > >> > CSE simply does not touch it. 
It leaves it as > >> > > >> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ]) > >> > (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ]) > >> > (parallel:V16QI [ > >> > (const_int 0 [0]) > >> > (const_int 1 [0x1]) > >> > (const_int 2 [0x2]) > >> > (const_int 3 [0x3]) > >> > (const_int 4 [0x4]) > >> > (const_int 5 [0x5]) > >> > (const_int 6 [0x6]) > >> > (const_int 7 [0x7]) > >> > ])) > >> > (reg:V8QI 93 [ _4 ]))) "":6506:10 1908 > >> {aarch64_simd_move_hi_quad_v16qi} > >> > (nil)) > >> > (insn 12 11 13 2 (set (reg:V16QI 102) > >> > (const_vector:V16QI [ > >> > (const_int 0 [0]) > >> > (const_int 1 [0x1]) > >> > (const_int 2 [0x2]) > >> > (const_int 3 [0x3]) > >> > (const_int 4 [0x4]) > >> > (const_int 5 [0x5]) > >> > (const_int 6 [0x6]) > >> > (const_int 7 [0x7]) > >> > (const_int 8 [0x8]) > >> > (const_int 9 [0x9]) > >> > (const_int 10 [0xa]) > >> > (const_int 11 [0xb]) > >> > (const_int 12 [0xc]) > >> > (const_int 13 [0xd]) > >> > (const_int 15 [0xf]) > >> > (const_int 0 [0]) > >> > ])) "":1466:14 1166 {*aarch64_simd_movv16qi} > >> > (nil)) > >> > >> I don't think that's true for the unpatched compiler. Are you sure > >> this isn't the “pre-CSE” part of the dump? CSE is confusing (to me) > >> in that it prints each function twice, once in unoptimised form and later in > optimised form. > >> > > > > Yes I'm sure, see all the compiler explorer links above. > > Ah, yeah, I misunderstood which insn you were quoting. But insn 11 in: > > https://godbolt.org/z/rrbP14var > > is part of the vcombine_s8. The preceding instructions are: > > (insn 9 8 10 2 (set (reg:V8QI 93 [ _4 ]) > (mult:V8QI (reg:V8QI 92 [ _2 ]) > (reg/v:V8QI 98 [ small ]))) "/opt/compiler-explorer/arm64/gcc-trunk- > 20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":1402:14 1428 {mulv8qi3} > (expr_list:REG_DEAD (reg/v:V8QI 98 [ small ]) > (expr_list:REG_DEAD (reg:V8QI 92 [ _2 ]) > (nil)))) > (insn 10 9 11 2 (set (reg:V16QI 95 [ _7 ]) > (vec_concat:V16QI (reg:V8QI 93 [ _4 ]) > (const_vector:V8QI [ > (const_int 0 [0]) repeated x8 > ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64- > unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":6506:10 1892 > {move_lo_quad_internal_v16qi} > (nil)) > > and since the multiplication result is variable, we can't fold this. > > The vget_high is insn 8, which does get folded (but it sounds like we agree on > that). > > > > > And I don't see any way to fix this without having Gimple not push > > > constants in, which would lead to worse regressions. > > > > I can change the patch to cost the high as a dup which fixes this > > > > codegen at > > > least and has you rematerialize movi. If that's > > > > not acceptable I can drop costing for High entirely then, it's not > > > > the main > > > thing I am fixing. > > > > > > Costing the high as a dup leaves us in the same situation as before > > > the > > > patch: the folded V8QI constant is cheaper than the unfolded mov high. > > > > Yes and the dup will reflect that. The argument that it's not the > > right cost no longer hold any water in that case. > > Yeah, my concerns disappear in that case. > > > In particular as I still maintain that is too early to do any constant > > folding in CSE1 for AArch64. > > > > Whether it's folded or not doesn't make any difference to combine > > which will Fold when combinations are possible with the folder version. > > > > So I have yet to see any actual regression. 
> 
> Well, this isn't going to win any awards for realism :-), but:
> 
> #include <arm_neon.h>
> 
> int8x16_t foo() {
>   int8x16_t a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
>   int8x8_t b = vget_high_s8 (a);
>   int8x8_t c = { 4, 5, 6, 7, 8, 9, 10, 11 };
>   int8x8_t d = vadd_s8 (b, c);
>   int8x16_t e = vcombine_s8 (d, b);
>   return vaddq_s8 (e, a);
> }
> 
> is folded to a constant before the patch and isn't after the patch.
> 
> Your examples are more realistic than that one, but I think this does show
> why preventing folding can be counter-productive in some cases.
> 
> My hope is that one day gimple would fold that example to a constant.
> But if it does, it will also fold the vget_highs and vget_lows in the original
> testcase to constants, meaning that we can't rely on the original V16QI
> constant existing as well.
> 
> Thanks,
> Richard
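As an aside on the aarch64_simd_dup change in the patch above: the new w<-r alternative covers broadcasts where the scalar value lives in a general-purpose register. A hypothetical example of the kind of code that could use it follows; it is not taken from the patch or its testsuite, and whether the register allocator actually picks the single-instruction dup from the general register depends on the surrounding code and tuning.

#include <arm_neon.h>
#include <string.h>

/* Hypothetical: BITS arrives in a general register, so with the w<-r
   alternative the broadcast can in principle be a single
   "dup v0.4s, w0" rather than an fmov followed by a lane dup.  */
float32x4_t
dup_bits (uint32_t bits)
{
  float f;
  memcpy (&f, &bits, sizeof f);   /* bit-cast; typically optimised away.  */
  return vdupq_n_f32 (f);
}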