From: Tamar Christina <Tamar.Christina@arm.com>
To: Richard Sandiford <Richard.Sandiford@arm.com>
Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>,
Richard Earnshaw <Richard.Earnshaw@arm.com>, nd <nd@arm.com>,
Marcus Shawcroft <Marcus.Shawcroft@arm.com>
Subject: RE: [PATCH 2/2]AArch64: Add better costing for vector constants and operations
Date: Tue, 26 Oct 2021 13:01:21 +0000 [thread overview]
Message-ID: <VI1PR08MB532599487C521DE56A1F3F3EFF849@VI1PR08MB5325.eurprd08.prod.outlook.com> (raw)
In-Reply-To: <mptpmrtmayv.fsf@arm.com>
[-- Attachment #1: Type: text/plain, Size: 42232 bytes --]
Hi,
Following the discussion below here's a revised patch.
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/aarch-common-protos.h (struct vector_cost_table): Add
movi, dup and extract costing fields.
* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
thunderx_extra_costs, thunderx2t99_extra_costs,
thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use
them.
* config/arm/aarch-cost-tables.h (generic_extra_costs,
cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
exynosm1_extra_costs, xgene1_extra_costs): Likewise.
* config/aarch64/aarch64-simd.md (aarch64_simd_dup<mode>): Add r->w dup.
* config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose.
(aarch64_rtx_costs): Add extra costs.
(aarch64_simd_dup_constant): Support check only mode.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vect-cse-codegen.c: New test.
--- inline copy of patch ---
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
)
(define_insn "aarch64_simd_dup<mode>"
- [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+ [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
(vec_duplicate:VDQF_F16
- (match_operand:<VEL> 1 "register_operand" "w")))]
+ (match_operand:<VEL> 1 "register_operand" "w,r")))]
"TARGET_SIMD"
- "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
- [(set_attr "type" "neon_dup<q>")]
+ "@
+ dup\\t%0.<Vtype>, %1.<Vetype>[0]
+ dup\\t%0.<Vtype>, %<vw>1"
+ [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
)
(define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 699c105a42a613c06c462e2de686795279d85bc9..1fb4350916572c915e5af339102444daf324efc7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -303,6 +303,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
/* Major revision number of the ARM Architecture implemented by the target. */
unsigned aarch64_architecture_version;
@@ -12705,7 +12706,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
rtx op0, op1, op2;
const struct cpu_cost_table *extra_cost
= aarch64_tune_params.insn_extra_cost;
- int code = GET_CODE (x);
+ rtx_code code = GET_CODE (x);
scalar_int_mode int_mode;
/* By default, assume that everything has equivalent cost to the
@@ -14006,8 +14007,52 @@ cost_plus:
mode, MULT, 1, speed);
return true;
}
+ break;
+ case CONST_VECTOR:
+ {
+ rtx gen_insn = aarch64_simd_make_constant (x, true);
+ /* Not a valid const vector. */
+ if (!gen_insn)
+ break;
- /* Fall through. */
+ switch (GET_CODE (gen_insn))
+ {
+ case CONST_VECTOR:
+ /* Load using MOVI/MVNI. */
+ if (aarch64_simd_valid_immediate (x, NULL))
+ *cost += extra_cost->vect.movi;
+ else /* Load using constant pool. */
+ *cost += extra_cost->ldst.load;
+ break;
+ /* Load using a DUP. */
+ case VEC_DUPLICATE:
+ gcc_unreachable ();
+ break;
+ default:
+ *cost += extra_cost->ldst.load;
+ break;
+ }
+ return true;
+ }
+ case VEC_CONCAT:
+ /* Depending on the operation, either DUP or INS.
+ For now, keep default costing.  */
+ break;
+ case VEC_DUPLICATE:
+ *cost += extra_cost->vect.dup;
+ return true;
+ case VEC_SELECT:
+ {
+ /* Cost subreg of 0 as free, otherwise as DUP.  */
+ rtx op1 = XEXP (x, 1);
+ if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+ ;
+ else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+ *cost += extra_cost->vect.dup;
+ else
+ *cost += extra_cost->vect.extract;
+ return true;
+ }
default:
break;
}
@@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
/* If VALS is a vector constant that can be loaded into a register
using DUP, generate instructions to do so and return an RTX to
- assign to the register. Otherwise return NULL_RTX. */
+ assign to the register. Otherwise return NULL_RTX.
+
+ If CHECK then the resulting instruction may not be used in
+ codegen but can be used for costing. */
static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
{
machine_mode mode = GET_MODE (vals);
machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals)
/* We can load this constant by using DUP and a constant in a
single ARM register. This will be cheaper than a vector
load. */
- x = copy_to_mode_reg (inner_mode, x);
+ if (!check)
+ x = copy_to_mode_reg (inner_mode, x);
return gen_vec_duplicate (mode, x);
}
@@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals)
/* Generate code to load VALS, which is a PARALLEL containing only
constants (for vec_init) or CONST_VECTOR, efficiently into a
register. Returns an RTX to copy into the register, or NULL_RTX
- for a PARALLEL that cannot be converted into a CONST_VECTOR. */
+ for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+ If CHECK then the resulting instruction may not be used in
+ codegen but can be used for costing. */
static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
{
machine_mode mode = GET_MODE (vals);
rtx const_dup;
@@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals)
&& aarch64_simd_valid_immediate (const_vec, NULL))
/* Load using MOVI/MVNI. */
return const_vec;
- else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+ else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
/* Loaded using DUP. */
return const_dup;
else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
{
const int alu;
const int mult;
+ const int movi;
+ const int dup;
+ const int extract;
};
struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
/* Vector */
{
COSTS_N_INSNS (0), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
/* Vector */
{
COSTS_N_INSNS (2), /* alu. */
- COSTS_N_INSNS (8) /* mult. */
+ COSTS_N_INSNS (8), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,97 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+ uint64_t res = a | arr[0];
+ uint64x2_t val = vld1q_u64 (arr);
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test2:
+** adrp x[0-9]+, .LC[0-1]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+ uint64_t arr = vgetq_lane_u64 (val, 0);
+ uint64_t res = a | arr;
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test3:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** str q[0-9]+, \[x1\]
+** fmov w[0-9]+, s[0-9]+
+** orr w[0-9]+, w[0-9]+, w[0-9]+
+** ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+ uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+ uint32_t res = a | arr[0];
+ uint32x4_t val = vld1q_u32 (arr);
+ *rt = vaddq_u32 (val, b);
+ return res;
+}
+
+/*
+**test4:
+** ushr v[0-9]+.16b, v[0-9]+.16b, 7
+** mov x[0-9]+, 16512
+** movk x[0-9]+, 0x1020, lsl 16
+** movk x[0-9]+, 0x408, lsl 32
+** movk x[0-9]+, 0x102, lsl 48
+** fmov d[0-9]+, x[0-9]+
+** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+** dup v[0-9]+.2d, v[0-9]+.d\[0\]
+** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** umov w[0-9]+, v[0-9]+.h\[3\]
+** ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+ uint8x16_t bool_input = vshrq_n_u8(input, 7);
+ poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+ poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+ vgetq_lane_p64(mask, 0));
+ poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+ uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+ return vget_lane_u16((uint16x4_t)res, 3);
+}
+
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, October 25, 2021 3:32 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>; Richard
> Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus
> Shawcroft <Marcus.Shawcroft@arm.com>
> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants
> and operations
>
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Monday, October 25, 2021 10:54 AM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>;
> >> Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Marcus
> >> Shawcroft <Marcus.Shawcroft@arm.com>
> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
> >> constants and operations
> >>
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> Sent: Saturday, October 23, 2021 11:40 AM
> >> >> To: Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>
> >> >> Cc: Tamar Christina <Tamar.Christina@arm.com>; Richard Earnshaw
> >> >> <Richard.Earnshaw@arm.com>; nd <nd@arm.com>; Marcus
> Shawcroft
> >> >> <Marcus.Shawcroft@arm.com>
> >> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector
> >> >> constants and operations
> >> >>
> >> >> Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >> >> >> I'm still a bit sceptical about treating the high-part cost as lower.
> >> >> >> ISTM that the subreg cases are the ones that are truly “free”
> >> >> >> and any others should have a normal cost. So if CSE handled
> >> >> >> the subreg case itself (to model how the rtx would actually be
> >> >> >> generated) then
> >> >> >> aarch64 code would have to do less work. I imagine that will
> >> >> >> be true for
> >> >> other targets as well.
> >> >> >
> >> >> > I guess the main problem is that CSE lacks context because it's
> >> >> > not until after combine that the high part becomes truly "free"
> >> >> > when pushed
> >> >> into a high operation.
> >> >>
> >> >> Yeah. And the aarch64 code is just being asked to cost the
> >> >> operation it's given, which could for example come from an
> >> >> existing aarch64_simd_mov_from_<mode>high. I think we should try
> >> >> to ensure that a aarch64_simd_mov_from_<mode>high followed by
> some
> >> arithmetic
> >> >> on the result is more expensive than the fused operation (when
> >> >> fusing is possible).
> >> >>
> >> >> An analogy might be: if the cost code is given:
> >> >>
> >> >> (add (reg X) (reg Y))
> >> >>
> >> >> then, at some later point, the (reg X) might be replaced with a
> >> >> multiplication, in which case we'd have a MADD operation and the
> >> >> addition is effectively free. Something similar would happen if
> >> >> (reg
> >> >> X) became a shift by a small amount on newer cores, although I
> >> >> guess then you could argue either that the cost of the add
> >> >> disappears or that
> >> the cost of the shift disappears.
> >> >>
> >> >> But we shouldn't count ADD as free on the basis that it could be
> >> >> combined with a multiplication or shift in future. We have to
> >> >> cost what we're given. I think the same thing applies to the high part.
> >> >>
> >> >> Here we're trying to prevent cse1 from replacing a DUP (lane) with
> >> >> a MOVI by saying that the DUP is strictly cheaper than the MOVI.
> >> >> I don't think that's really true though, and the cost tables in
> >> >> the patch say that DUP is more expensive (rather than less
> >> >> expensive) than
> >> MOVI.
> >> >
> >> > No we're not. The front end has already pushed the constant into
> >> > each operation that needs it which is the entire problem.
> >>
> >> I think we're talking about different things here. I'll come to the
> >> gimple stuff below, but I was talking purely about the effect on the
> >> RTL optimisers. What I meant above is that, in the cse1 dumps, the patch
> leads to changes like:
> >>
> >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> >> - (const_vector:V8QI [
> >> + (vec_select:V8QI (reg:V16QI 116)
> >> + (parallel:V16QI [
> >> + (const_int 8 [0x8])
> >> + (const_int 9 [0x9])
> >> + (const_int 10 [0xa])
> >> + (const_int 11 [0xb])
> >> + (const_int 12 [0xc])
> >> + (const_int 13 [0xd])
> >> + (const_int 14 [0xe])
> >> + (const_int 15 [0xf])
> >> + ]))) "include/arm_neon.h":6477:22 1394
> >> {aarch64_simd_mov_from_v16qihigh}
> >> + (expr_list:REG_EQUAL (const_vector:V8QI [
> >> (const_int 3 [0x3]) repeated x8
> >> - ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
> >> - (expr_list:REG_DEAD (reg:V16QI 117)
> >> - (nil)))
> >> + ])
> >> + (expr_list:REG_DEAD (reg:V16QI 117)
> >> + (nil))))
> >>
> >> The pre-cse1 code is:
> >>
> >> (insn 19 18 20 2 (set (reg:V16QI 117)
> >> (const_vector:V16QI [
> >> (const_int 3 [0x3]) repeated x16
> >> ])) "include/arm_neon.h":6477:22 1166 {*aarch64_simd_movv16qi}
> >> (nil))
> >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> >> (vec_select:V8QI (reg:V16QI 117)
> >> (parallel:V16QI [
> >> (const_int 8 [0x8])
> >> (const_int 9 [0x9])
> >> (const_int 10 [0xa])
> >> (const_int 11 [0xb])
> >> (const_int 12 [0xc])
> >> (const_int 13 [0xd])
> >> (const_int 14 [0xe])
> >> (const_int 15 [0xf])
> >> ]))) "include/arm_neon.h":6477:22 1394
> >> {aarch64_simd_mov_from_v16qihigh}
> >> (nil))
> >>
> >> That is, before the patch, we folded insn 19 into insn 20 to get:
> >>
> >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> >> (const_vector:V8QI [
> >> (const_int 3 [0x3]) repeated x8
> >> ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
> >> (expr_list:REG_DEAD (reg:V16QI 117)
> >> (nil)))
> >>
> >> After the patch we reject that because:
> >>
> >> (set (reg:V8QI X) (const_vector:V8QI [3]))
> >>
> >> is costed as a MOVI (cost 4) and the original
> >> aarch64_simd_mov_from_v16qihigh is costed as zero. In other words,
> >> the patch makes the DUP (lane) in the “mov high” strictly cheaper
> >> than a constant move (MOVI).
> >
> > Yes, this was done intentionally because as we talked about a month
> > ago there's no real way to cost this correctly. The use of `X` there
> > determines whether it's cheaper to use the movi over the dup. The
> > MOVI not only prevent re-use of the value, it also prevents combining
> > into high operations. All of which is impossible to tell currently in how CSE
> and costing are done.
> >
> > This is an unmodified compiler created from last night's trunk
> > https://godbolt.org/z/1saTP4xWs
> >
> > While yes, it did fold movi into the set, reg 19 wasn't dead, so you
> > now materialized the constant 3 times
> >
> > test0:
> > ldr q0, [x0]
> > movi v3.8b, 0x3 <<<< first
> > ldr q2, [x1]
> > movi v5.16b, 0x3 <<< second
> > uxtl v1.8h, v0.8b
> > dup d4, v2.d[1] <<< third
> > uxtl2 v0.8h, v0.16b
> > umlal v1.8h, v2.8b, v5.8b
> > umlal v0.8h, v4.8b, v3.8b
> > addhn v0.8b, v1.8h, v0.8h
> > str d0, [x2]
> > ret
> >
> > whilst my patch, generates
> >
> > test0:
> > movi v2.16b, 0x3 <<< once
> > ldr q0, \[x0\]
> > uxtl v1.8h, v0.8b
> > uxtl2 v0.8h, v0.16b
> > ldr q3, \[x1\]
> > umlal v1.8h, v3.8b, v2.8b
> > umlal2 v0.8h, v3.16b, v2.16b
> > addhn v0.8b, v1.8h, v0.8h
> > str d0, \[x2\]
> > ret
> >
> > Yes it's not perfect, yes you can end up with a dup instead of two
> > movi's but my argument is it's still a step forward as the perfect solution
> doesn't seem to be possible at all with the way things are currently set up.
>
> I agree there's no out-of-the-box way of getting what we want for the
> original testcases. It would require changes outside the target or (if the
> worst comes to the worst) a target-specific pass.
>
> >> Preventing this fold seems like a key part of being able to match the
> >> *l2 forms in the testcase, since otherwise the “mov high” disappears
> >> and isn't available for combining later.
> >
> > Yes, and by preventing the folding combine should in principle be able
> > to fold it back if it wasn't pushed into another Instruction, but combine
> does not attempt to touch constants and selects on their own. If it did this
> "regression" would be fixed.
>
> The problem is that combine is limited to individual EBBs and only combines
> def-use chains when there is a single use. It's not a general folding engine.
>
> > I'm not really quite sure what we're arguing about.. I did think about all
> three possible cases when making this:
> >
> > https://godbolt.org/z/hjWhWq1v1
> >
> > Of the three cases the compiler currently only generates something good
> for test2. Both test1 and test0 are deficient.
> > The patch doesn't change test2, significantly improves test0 and whether
> test1 is a regression is likely uArch specific.
> >
> > On Arm Cortex CPUs it is not a regression as a DUP on a SIMD scalar
> > has the same throughput and latencies as a MOVI according to the Arm
> Performance Software Optimization guides.
>
> Costing them as equal would be OK when they are equal. It's the “DUP
> (lane)/ mov high is strictly cheaper bit” I'm concerned about.
>
> > So to me this looks like an improvement overall. And this is where we likely
> disagree?
>
> Well, the disagreement isn't about whether the new compiler output for
> these testcases is better than the old compiler output. It's more a question
> of how we're getting there.
>
> >> > MOVI as I mentioned before is the one case where this is a toss up.
> >> > But there are far more constants that cannot be created with a movi.
> >> > A simple example is
> >> >
> >> > #include <arm_neon.h>
> >> >
> >> > int8x16_t square(int8x16_t full, int8x8_t small) {
> >> > int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
> >> > int8x8_t low = vget_high_s8 (cst);
> >> > int8x8_t res1 = vmul_s8 (small, low);
> >> > return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1,
> >> > res1)); }
> >> >
> >> > Where in Gimple we get
> >> >
> >> > <bb 2> [local count: 1073741824]:
> >> > _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7,
> >> > 8, 9, 10, 11, 12,
> >> 13, 15, 0 });
> >> > _4 = _2 * small_3(D);
> >> > _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
> >> > _7 = __builtin_aarch64_combinev8qi (_4, _4);
> >> > _8 = _6 + _7;
> >> > return _8;
> >> >
> >> > Regardless of what happens to __builtin_aarch64_get_highv16qi
> >> > nothing will recreate the relationship with cst, whether
> >> __builtin_aarch64_get_highv16qi is lowered or not, constant prop will
> >> still push in constants.
> >>
> >> Yeah, constants are (by design) free in gimple. But that's OK in
> >> itself, because RTL optimisers have the job of removing any
> >> duplicates that end up requiring separate moves. I think we both agree
> on that.
> >>
> >> E.g. for:
> >>
> >> #include <arm_neon.h>
> >>
> >> void foo(int8x16_t *x) {
> >> x[0] = vaddq_s8 (x[0], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
> >> x[1] = vaddq_s8 (x[1], (int8x16_t)
> >> {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15});
> >> }
> >>
> >> the final gimple is:
> >>
> >> <bb 2> [local count: 1073741824]:
> >> _1 = *x_4(D);
> >> _5 = _1 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
> >> *x_4(D) = _5;
> >> _2 = MEM[(int8x16_t *)x_4(D) + 16B];
> >> _7 = _2 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
> >> MEM[(int8x16_t *)x_4(D) + 16B] = _7;
> >> return;
> >>
> >> but cse1 removes the duplicated constant even before the patch.
> >
> > It doesn't for me, again an unmodified compiler:
> >
> > https://godbolt.org/z/qnvf7496h
>
> FWIW, the link for my example is:
>
> https://godbolt.org/z/G6vaE3nab
>
> but it sounds like the disagreement wasn't where I thought it was.
>
> > and CSE1 has as the final codegen:
> >
> > (insn 7 4 8 2 (set (reg:V16QI 99)
> > (const_vector:V16QI [
> > (const_int 0 [0])
> > (const_int 1 [0x1])
> > (const_int 2 [0x2])
> > (const_int 3 [0x3])
> > (const_int 4 [0x4])
> > (const_int 5 [0x5])
> > (const_int 6 [0x6])
> > (const_int 7 [0x7])
> > (const_int 8 [0x8])
> > (const_int 9 [0x9])
> > (const_int 10 [0xa])
> > (const_int 11 [0xb])
> > (const_int 12 [0xc])
> > (const_int 13 [0xd])
> > (const_int 15 [0xf])
> > (const_int 0 [0])
> > ]))
> >
> > (insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ])
> > (const_vector:V8QI [
> > (const_int 8 [0x8])
> > (const_int 9 [0x9])
> > (const_int 10 [0xa])
> > (const_int 11 [0xb])
> > (const_int 12 [0xc])
> > (const_int 13 [0xd])
> > (const_int 15 [0xf])
> > (const_int 0 [0])
> > ]))
> >
> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
> > (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
> > (parallel:V16QI [
> > (const_int 0 [0])
> > (const_int 1 [0x1])
> > (const_int 2 [0x2])
> > (const_int 3 [0x3])
> > (const_int 4 [0x4])
> > (const_int 5 [0x5])
> > (const_int 6 [0x6])
> > (const_int 7 [0x7])
> > ]))
> > (reg:V8QI 93 [ _4 ])))
>
> Here, insn 8 is the folded version of the vget_high_s8 and insn 11 is part of
> the vcombine_s8. With that caveat…
>
> > So again same constant represented twice, which is reflected in the
> codegen.
>
> …right, the above is also what I was saying that we generate before the patch
> for your square example.
>
> But as you say later this testcase is demonstrating the point that constants
> loaded from memory should be more expensive than DUP (lane).
> I agree with that. The bit I don't agree with is costing the DUP (lane) as zero,
> so that it's also strictly cheaper than MOVI.
>
> So I think the disagreement is more about things like the first example in the
> testcase:
>
> https://godbolt.org/z/xrMnezrse
>
> Specifically: is it legitimate to fold:
>
> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> (vec_select:V8QI (reg:V16QI 117)
> (parallel:V16QI [
> (const_int 8 [0x8])
> (const_int 9 [0x9])
> (const_int 10 [0xa])
> (const_int 11 [0xb])
> (const_int 12 [0xc])
> (const_int 13 [0xd])
> (const_int 14 [0xe])
> (const_int 15 [0xf])
> ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-
> unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":6477:22 1394
> {aarch64_simd_mov_from_v16qihigh}
> (nil))
>
> to:
>
> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ])
> (const_vector:V8QI [
> (const_int 3 [0x3]) repeated x8
> ])) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-
> unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi}
> (expr_list:REG_DEAD (reg:V16QI 117)
> (nil)))
>
> without first trying to get rid of the instruction some other way (through
> combine)?
>
> I think it is legitimate, since the new MOVI instruction is at least as cheap as
> the original DUP. Even if CSE didn't do the fold itself, and just CSEd the two
> uses of the V16QI constant, I think it would be legitimate for a later patch to
> fold the instruction to a constant independently of CSE.
>
> IMO:
>
> vget_high_s8(vdupq_n_u8(3))
>
> is just a roundabout way of writing:
>
> vdup_n_u8(3)
>
> We've described what vget_high_s8 does in target-independent rtl (i.e.
> without unspecs) so it's natural that operations with constant operands will
> themselves get folded to a constant.
>
> I think we should accept that and try to generate the output we want in an
> environment where such folds do happen, rather than trying to prevent the
> folds from happening until during or after combine.
>
> That approach could also work for autovec output, and cases where the user
> wrote the 8-byte constants directly. E.g. I think we should aim to optimise:
>
> void test0_mod (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0) {
> uint8x8_t three_u8 = vdup_n_u8(3);
> uint8x16_t x = vld1q_u8(inptr0);
> uint8x16_t y = vld1q_u8(inptr1);
> uint16x8_t x_l = vmovl_u8(vget_low_u8(x));
> uint16x8_t x_h = vmovl_u8(vget_high_u8(x));
> uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), three_u8);
> uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), three_u8);
> vst1_u8(outptr0, vaddhn_u16(z_l, z_h)); }
>
> in the same way as the original test0. Similarly we should aim to optimise:
>
> int8x16_t square_mode(int8x16_t full, int8x8_t small) {
> int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15};
> int8x8_t low = {8,9,10,11,12,13,15};
> int8x8_t res1 = vmul_s8 (small, low);
> return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1)); }
>
> in the same way as square.
>
> >> so that there are no longer any duplicate constants (as far as the
> >> RTL code is concerned). Instead we have one 16-byte constant and one 8-
> byte constant.
> >>
> >> The patch prevents the fold on insn 8 by making the “mov high”
> >> strictly cheaper than the constant move, so we keep the “mov high”
> >> and its 16-byte input. Keeping the “mov high” means that we do have
> >> a duplicate constant for CSE to remove.
> >>
> >> What I meant…
> >>
> >> >> Also, if I've understood correctly, it looks like we'd be relying
> >> >> on the vget_high of a constant remaining unfolded until RTL cse1.
> >> >> I think it's likely in future that we'd try to fold vget_high at
> >> >> the gimple level instead, since that could expose more
> >> >> optimisations of a different kind. The gimple optimisers would
> >> >> then fold
> >> >> vget_high(constant) in a similar way to
> >> >> cse1 does now.
> >> >>
> >> >> So perhaps we should continue to allow the vget_high(constant) to
> >> >> be foloded in cse1 and come up with some way of coping with the
> >> >> folded
> >> form.
> >>
> >> …here was that, in future, the gimple optimisers might be able to
> >> fold the vget_high themselves. For your example, we'd then have:
> >>
> >> _4 = { 8, 9, 10, 11, 12, 13, 15, 0 } * small_3(D);
> >> _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 };
> >> _7 = __builtin_aarch64_combinev8qi (_4, _4);
> >> _8 = _6 + _7;
> >> return _8;
> >>
> >> In this situation, we'd need to recreate the relationship between {
> >> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } and { 8, 9, 10,
> >> 11, 12, 13, 15, 0 }. We can't ensure that the relationship is never lost.
> >>
> >> The same thing would be true for vget_low. So a constant like:
> >>
> >> cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }
> >> … vget_low* (cst) ..;
> >> … vget_high* (cst) …;
> >>
> >> could be folded to two smaller constants:
> >>
> >> … { 0, 1, 2, 3, 4, 5, 6, 7 } …;
> >> … { 8, 9, 10, 11, 12, 13, 15, 0 } …;
> >>
> >> We might then need to recreate the combined form, rather than relying
> >> on the combined form already existing.
> >
> > Yes but this is what confuses me. My patch changes it so that CSE1
> > which is run relatively early is able to find the relationship between the two
> constants.
>
> Yeah, it does that for the case where the vector constant is a duplicate of a
> single element. My example above doesn't fall into that category though.
>
> What I was saying was: let's suppose that a vget_low/vget_high pair for a
> general V16QI vector constant is folded at the gimple level (by later patches).
> Then the RTL optimisers just see two V8QI constants rather than a single
> V16QI constant. The optimisers would need to generate the V16QI “from
> scratch” if they wanted to, as for test0_mod above.
>
> > CSE1 shouldn't do any folding, it doesn't have enough information to do so.
> > By CSE doing folding it makes it so combine is less efficient.
>
> I don't agree with that as a general statement. I agree that stopping pre-
> combine passes from folding helps examples like test0, but I don't think that
> means that pre-combine passes are doing the wrong thing by folding. IMO
> the problem is more that we are very opportunistic in looking for high-part
> operations (and by-lane operations). Legitimate optimisations can easily
> defeat this opportunistic matching.
>
> >> > CSE1 doesn't fold it, because for CSE the cost is too high to do
> >> > so. Which is
> >> what this costing was attempting to fix.
> >> > CSE simply does not touch it. It leaves it as
> >> >
> >> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ])
> >> > (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ])
> >> > (parallel:V16QI [
> >> > (const_int 0 [0])
> >> > (const_int 1 [0x1])
> >> > (const_int 2 [0x2])
> >> > (const_int 3 [0x3])
> >> > (const_int 4 [0x4])
> >> > (const_int 5 [0x5])
> >> > (const_int 6 [0x6])
> >> > (const_int 7 [0x7])
> >> > ]))
> >> > (reg:V8QI 93 [ _4 ]))) "":6506:10 1908
> >> {aarch64_simd_move_hi_quad_v16qi}
> >> > (nil))
> >> > (insn 12 11 13 2 (set (reg:V16QI 102)
> >> > (const_vector:V16QI [
> >> > (const_int 0 [0])
> >> > (const_int 1 [0x1])
> >> > (const_int 2 [0x2])
> >> > (const_int 3 [0x3])
> >> > (const_int 4 [0x4])
> >> > (const_int 5 [0x5])
> >> > (const_int 6 [0x6])
> >> > (const_int 7 [0x7])
> >> > (const_int 8 [0x8])
> >> > (const_int 9 [0x9])
> >> > (const_int 10 [0xa])
> >> > (const_int 11 [0xb])
> >> > (const_int 12 [0xc])
> >> > (const_int 13 [0xd])
> >> > (const_int 15 [0xf])
> >> > (const_int 0 [0])
> >> > ])) "":1466:14 1166 {*aarch64_simd_movv16qi}
> >> > (nil))
> >>
> >> I don't think that's true for the unpatched compiler. Are you sure
> >> this isn't the “pre-CSE” part of the dump? CSE is confusing (to me)
> >> in that it prints each function twice, once in unoptimised form and later in
> optimised form.
> >>
> >
> > Yes I'm sure, see all the compiler explorer links above.
>
> Ah, yeah, I misunderstood which insn you were quoting. But insn 11 in:
>
> https://godbolt.org/z/rrbP14var
>
> is part of the vcombine_s8. The preceding instructions are:
>
> (insn 9 8 10 2 (set (reg:V8QI 93 [ _4 ])
> (mult:V8QI (reg:V8QI 92 [ _2 ])
> (reg/v:V8QI 98 [ small ]))) "/opt/compiler-explorer/arm64/gcc-trunk-
> 20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":1402:14 1428 {mulv8qi3}
> (expr_list:REG_DEAD (reg/v:V8QI 98 [ small ])
> (expr_list:REG_DEAD (reg:V8QI 92 [ _2 ])
> (nil))))
> (insn 10 9 11 2 (set (reg:V16QI 95 [ _7 ])
> (vec_concat:V16QI (reg:V8QI 93 [ _4 ])
> (const_vector:V8QI [
> (const_int 0 [0]) repeated x8
> ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64-
> unknown-linux-gnu/lib/gcc/aarch64-unknown-linux-
> gnu/12.0.0/include/arm_neon.h":6506:10 1892
> {move_lo_quad_internal_v16qi}
> (nil))
>
> and since the multiplication result is variable, we can't fold this.
>
> The vget_high is insn 8, which does get folded (but it sounds like we agree on
> that).
>
> > > > And I don't see any way to fix this without having Gimple not push
> > > constants in, which would lead to worse regressions.
> > > > I can change the patch to cost the high as a dup which fixes this
> > > > codegen at
> > > least and has you rematerialize movi. If that's
> > > > not acceptable I can drop costing for High entirely then, it's not
> > > > the main
> > > thing I am fixing.
> > >
> > > Costing the high as a dup leaves us in the same situation as before
> > > the
> > > patch: the folded V8QI constant is cheaper than the unfolded mov high.
> >
> > Yes and the dup will reflect that. The argument that it's not the
> > right cost no longer holds any water in that case.
>
> Yeah, my concerns disappear in that case.
>
> > In particular as I still maintain that is too early to do any constant
> > folding in CSE1 for AArch64.
> >
> > Whether it's folded or not doesn't make any difference to combine
> > which will Fold when combinations are possible with the folder version.
> >
> > So I have yet to see any actual regression.
>
> Well, this isn't going to win any awards for realism :-), but:
>
> #include <arm_neon.h>
>
> int8x16_t foo() {
> int8x16_t a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
> int8x8_t b = vget_high_s8 (a);
> int8x8_t c = { 4, 5, 6, 7, 8, 9, 10, 11 };
> int8x8_t d = vadd_s8 (b, c);
> int8x16_t e = vcombine_s8 (d, b);
> return vaddq_s8 (e, a);
> }
>
> is folded to a constant before the patch and isn't after the patch.
>
> Your examples are more realistic than that one, but I think this does show
> why preventing folding can be counter-productive in some cases.
>
> My hope is that one day gimple would fold that example to a constant.
> But if it does, it will also fold the vget_highs and vget_lows in the original
> testcase to constants, meaning that we can't rely on the original V16QI
> constant existing as well.
>
> Thanks,
> Richard
[-- Attachment #2: rb14774.patch --]
[-- Type: application/octet-stream, Size: 13229 bytes --]
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* Alu. */
- COSTS_N_INSNS (4) /* Mult. */
+ COSTS_N_INSNS (4), /* Mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
)
(define_insn "aarch64_simd_dup<mode>"
- [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
+ [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
(vec_duplicate:VDQF_F16
- (match_operand:<VEL> 1 "register_operand" "w")))]
+ (match_operand:<VEL> 1 "register_operand" "w,r")))]
"TARGET_SIMD"
- "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
- [(set_attr "type" "neon_dup<q>")]
+ "@
+ dup\\t%0.<Vtype>, %1.<Vetype>[0]
+ dup\\t%0.<Vtype>, %<vw>1"
+ [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
)
(define_insn "aarch64_dup_lane<mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 699c105a42a613c06c462e2de686795279d85bc9..1fb4350916572c915e5af339102444daf324efc7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -303,6 +303,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
aarch64_addr_query_type);
static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
+static rtx aarch64_simd_make_constant (rtx, bool);
/* Major revision number of the ARM Architecture implemented by the target. */
unsigned aarch64_architecture_version;
@@ -12705,7 +12706,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
rtx op0, op1, op2;
const struct cpu_cost_table *extra_cost
= aarch64_tune_params.insn_extra_cost;
- int code = GET_CODE (x);
+ rtx_code code = GET_CODE (x);
scalar_int_mode int_mode;
/* By default, assume that everything has equivalent cost to the
@@ -14006,8 +14007,52 @@ cost_plus:
mode, MULT, 1, speed);
return true;
}
+ break;
+ case CONST_VECTOR:
+ {
+ rtx gen_insn = aarch64_simd_make_constant (x, true);
+ /* Not a valid const vector. */
+ if (!gen_insn)
+ break;
- /* Fall through. */
+ switch (GET_CODE (gen_insn))
+ {
+ case CONST_VECTOR:
+ /* Load using MOVI/MVNI. */
+ if (aarch64_simd_valid_immediate (x, NULL))
+ *cost += extra_cost->vect.movi;
+ else /* Load using constant pool. */
+ *cost += extra_cost->ldst.load;
+ break;
+ /* Load using a DUP. */
+ case VEC_DUPLICATE:
+ gcc_unreachable ();
+ break;
+ default:
+ *cost += extra_cost->ldst.load;
+ break;
+ }
+ return true;
+ }
+ case VEC_CONCAT:
+ /* depending on the operation, either DUP or INS.
+ For now, keep default costing. */
+ break;
+ case VEC_DUPLICATE:
+ *cost += extra_cost->vect.dup;
+ return true;
+ case VEC_SELECT:
+ {
+ /* cost subreg of 0 as free, otherwise as DUP */
+ rtx op1 = XEXP (x, 1);
+ if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+ ;
+ else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+ *cost += extra_cost->vect.dup;
+ else
+ *cost += extra_cost->vect.extract;
+ return true;
+ }
default:
break;
}
@@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
/* If VALS is a vector constant that can be loaded into a register
using DUP, generate instructions to do so and return an RTX to
- assign to the register. Otherwise return NULL_RTX. */
+ assign to the register. Otherwise return NULL_RTX.
+
+ If CHECK then the resulting instruction may not be used in
+ codegen but can be used for costing. */
static rtx
-aarch64_simd_dup_constant (rtx vals)
+aarch64_simd_dup_constant (rtx vals, bool check = false)
{
machine_mode mode = GET_MODE (vals);
machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals)
/* We can load this constant by using DUP and a constant in a
single ARM register. This will be cheaper than a vector
load. */
- x = copy_to_mode_reg (inner_mode, x);
+ if (!check)
+ x = copy_to_mode_reg (inner_mode, x);
return gen_vec_duplicate (mode, x);
}
@@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals)
/* Generate code to load VALS, which is a PARALLEL containing only
constants (for vec_init) or CONST_VECTOR, efficiently into a
register. Returns an RTX to copy into the register, or NULL_RTX
- for a PARALLEL that cannot be converted into a CONST_VECTOR. */
+ for a PARALLEL that cannot be converted into a CONST_VECTOR.
+
+ If CHECK then the resulting instruction may not be used in
+ codegen but can be used for costing. */
static rtx
-aarch64_simd_make_constant (rtx vals)
+aarch64_simd_make_constant (rtx vals, bool check = false)
{
machine_mode mode = GET_MODE (vals);
rtx const_dup;
@@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals)
&& aarch64_simd_valid_immediate (const_vec, NULL))
/* Load using MOVI/MVNI. */
return const_vec;
- else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
+ else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
/* Loaded using DUP. */
return const_dup;
else if (const_vec != NULL_RTX)
diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
--- a/gcc/config/arm/aarch-common-protos.h
+++ b/gcc/config/arm/aarch-common-protos.h
@@ -133,6 +133,9 @@ struct vector_cost_table
{
const int alu;
const int mult;
+ const int movi;
+ const int dup;
+ const int extract;
};
struct cpu_cost_table
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
/* Vector */
{
COSTS_N_INSNS (1), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
/* Vector */
{
COSTS_N_INSNS (0), /* alu. */
- COSTS_N_INSNS (4) /* mult. */
+ COSTS_N_INSNS (4), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
@@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
/* Vector */
{
COSTS_N_INSNS (2), /* alu. */
- COSTS_N_INSNS (8) /* mult. */
+ COSTS_N_INSNS (8), /* mult. */
+ COSTS_N_INSNS (1), /* movi. */
+ COSTS_N_INSNS (2), /* dup. */
+ COSTS_N_INSNS (2) /* extract. */
}
};
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
@@ -0,0 +1,97 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <arm_neon.h>
+
+/*
+**test1:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
+ uint64_t res = a | arr[0];
+ uint64x2_t val = vld1q_u64 (arr);
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test2:
+** adrp x[0-9]+, .LC[0-1]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** str q[0-9]+, \[x[0-9]+\]
+** fmov x[0-9]+, d[0-9]+
+** orr x[0-9]+, x[0-9]+, x[0-9]+
+** ret
+*/
+
+uint64_t
+test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
+{
+ uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
+ uint64_t arr = vgetq_lane_u64 (val, 0);
+ uint64_t res = a | arr;
+ *rt = vaddq_u64 (val, b);
+ return res;
+}
+
+/*
+**test3:
+** adrp x[0-9]+, .LC[0-9]+
+** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
+** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** str q[0-9]+, \[x1\]
+** fmov w[0-9]+, s[0-9]+
+** orr w[0-9]+, w[0-9]+, w[0-9]+
+** ret
+*/
+
+uint32_t
+test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
+{
+ uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
+ uint32_t res = a | arr[0];
+ uint32x4_t val = vld1q_u32 (arr);
+ *rt = vaddq_u32 (val, b);
+ return res;
+}
+
+/*
+**test4:
+** ushr v[0-9]+.16b, v[0-9]+.16b, 7
+** mov x[0-9]+, 16512
+** movk x[0-9]+, 0x1020, lsl 16
+** movk x[0-9]+, 0x408, lsl 32
+** movk x[0-9]+, 0x102, lsl 48
+** fmov d[0-9]+, x[0-9]+
+** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
+** dup v[0-9]+.2d, v[0-9]+.d\[0\]
+** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
+** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** umov w[0-9]+, v[0-9]+.h\[3\]
+** ret
+*/
+
+uint64_t
+test4 (uint8x16_t input)
+{
+ uint8x16_t bool_input = vshrq_n_u8(input, 7);
+ poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
+ poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
+ vgetq_lane_p64(mask, 0));
+ poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
+ uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
+ return vget_lane_u16((uint16x4_t)res, 3);
+}
+
next prev parent reply other threads:[~2021-10-26 13:01 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-08-31 13:30 Tamar Christina
2021-08-31 15:13 ` Richard Sandiford
2021-08-31 15:47 ` Tamar Christina
2021-08-31 16:07 ` Richard Sandiford
2021-08-31 16:45 ` Tamar Christina
2021-08-31 18:37 ` Richard Sandiford
2021-09-08 12:58 ` Tamar Christina
2021-10-23 10:39 ` Richard Sandiford
2021-10-23 14:34 ` Tamar Christina
2021-10-25 9:54 ` Richard Sandiford
2021-10-25 11:49 ` Tamar Christina
2021-10-25 14:32 ` Richard Sandiford
2021-10-26 13:01 ` Tamar Christina [this message]
2021-10-26 14:46 ` Richard Sandiford
2021-10-27 15:44 ` Tamar Christina
2021-10-29 15:03 ` Tamar Christina
2021-10-29 15:23 ` Richard Sandiford
2021-11-02 10:39 ` Christophe Lyon
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=VI1PR08MB532599487C521DE56A1F3F3EFF849@VI1PR08MB5325.eurprd08.prod.outlook.com \
--to=tamar.christina@arm.com \
--cc=Marcus.Shawcroft@arm.com \
--cc=Richard.Earnshaw@arm.com \
--cc=Richard.Sandiford@arm.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=nd@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).