Hi, Following the discussion below here's a revised patch. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/arm/aarch-common-protos.h (struct vector_cost_table): Add movi, dup and extract costing fields. * config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs, thunderx_extra_costs, thunderx2t99_extra_costs, thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs): Use them. * config/arm/aarch-cost-tables.h (generic_extra_costs, cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs, exynosm1_extra_costs, xgene1_extra_costs): Likewise * config/aarch64/aarch64-simd.md (aarch64_simd_dup): Add r->w dup. * config/aarch64/aarch64.c (aarch64_simd_make_constant): Expose. (aarch64_rtx_costs): Add extra costs. (aarch64_simd_dup_constant): Support check only mode. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vect-cse-codegen.c: New test. --- inline copy of patch --- diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* Alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* Alu. */ - COSTS_N_INSNS (4) /* Mult. */ + COSTS_N_INSNS (4), /* Mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* Alu. */ - COSTS_N_INSNS (4) /* Mult. */ + COSTS_N_INSNS (4), /* Mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. 
*/ } }; diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup" ) (define_insn "aarch64_simd_dup" - [(set (match_operand:VDQF_F16 0 "register_operand" "=w") + [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w") (vec_duplicate:VDQF_F16 - (match_operand: 1 "register_operand" "w")))] + (match_operand: 1 "register_operand" "w,r")))] "TARGET_SIMD" - "dup\\t%0., %1.[0]" - [(set_attr "type" "neon_dup")] + "@ + dup\\t%0., %1.[0] + dup\\t%0., %1" + [(set_attr "type" "neon_dup, neon_from_gp")] ) (define_insn "aarch64_dup_lane" diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 699c105a42a613c06c462e2de686795279d85bc9..1fb4350916572c915e5af339102444daf324efc7 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -303,6 +303,7 @@ static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, aarch64_addr_query_type); static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); +static rtx aarch64_simd_make_constant (rtx, bool); /* Major revision number of the ARM Architecture implemented by the target. */ unsigned aarch64_architecture_version; @@ -12705,7 +12706,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, rtx op0, op1, op2; const struct cpu_cost_table *extra_cost = aarch64_tune_params.insn_extra_cost; - int code = GET_CODE (x); + rtx_code code = GET_CODE (x); scalar_int_mode int_mode; /* By default, assume that everything has equivalent cost to the @@ -14006,8 +14007,52 @@ cost_plus: mode, MULT, 1, speed); return true; } + break; + case CONST_VECTOR: + { + rtx gen_insn = aarch64_simd_make_constant (x, true); + /* Not a valid const vector. */ + if (!gen_insn) + break; - /* Fall through. */ + switch (GET_CODE (gen_insn)) + { + case CONST_VECTOR: + /* Load using MOVI/MVNI. */ + if (aarch64_simd_valid_immediate (x, NULL)) + *cost += extra_cost->vect.movi; + else /* Load using constant pool. */ + *cost += extra_cost->ldst.load; + break; + /* Load using a DUP. */ + case VEC_DUPLICATE: + gcc_unreachable (); + break; + default: + *cost += extra_cost->ldst.load; + break; + } + return true; + } + case VEC_CONCAT: + /* depending on the operation, either DUP or INS. + For now, keep default costing. */ + break; + case VEC_DUPLICATE: + *cost += extra_cost->vect.dup; + return true; + case VEC_SELECT: + { + /* cost subreg of 0 as free, otherwise as DUP */ + rtx op1 = XEXP (x, 1); + if (vec_series_lowpart_p (mode, GET_MODE (op1), op1)) + ; + else if (vec_series_highpart_p (mode, GET_MODE (op1), op1)) + *cost += extra_cost->vect.dup; + else + *cost += extra_cost->vect.extract; + return true; + } default: break; } @@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode, /* If VALS is a vector constant that can be loaded into a register using DUP, generate instructions to do so and return an RTX to - assign to the register. Otherwise return NULL_RTX. */ + assign to the register. Otherwise return NULL_RTX. + + If CHECK then the resulting instruction may not be used in + codegen but can be used for costing. 
*/ static rtx -aarch64_simd_dup_constant (rtx vals) +aarch64_simd_dup_constant (rtx vals, bool check = false) { machine_mode mode = GET_MODE (vals); machine_mode inner_mode = GET_MODE_INNER (mode); @@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals) /* We can load this constant by using DUP and a constant in a single ARM register. This will be cheaper than a vector load. */ - x = copy_to_mode_reg (inner_mode, x); + if (!check) + x = copy_to_mode_reg (inner_mode, x); return gen_vec_duplicate (mode, x); } @@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals) /* Generate code to load VALS, which is a PARALLEL containing only constants (for vec_init) or CONST_VECTOR, efficiently into a register. Returns an RTX to copy into the register, or NULL_RTX - for a PARALLEL that cannot be converted into a CONST_VECTOR. */ + for a PARALLEL that cannot be converted into a CONST_VECTOR. + + If CHECK then the resulting instruction may not be used in + codegen but can be used for costing. */ static rtx -aarch64_simd_make_constant (rtx vals) +aarch64_simd_make_constant (rtx vals, bool check = false) { machine_mode mode = GET_MODE (vals); rtx const_dup; @@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals) && aarch64_simd_valid_immediate (const_vec, NULL)) /* Load using MOVI/MVNI. */ return const_vec; - else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX) + else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX) /* Loaded using DUP. */ return const_dup; else if (const_vec != NULL_RTX) diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644 --- a/gcc/config/arm/aarch-common-protos.h +++ b/gcc/config/arm/aarch-common-protos.h @@ -133,6 +133,9 @@ struct vector_cost_table { const int alu; const int mult; + const int movi; + const int dup; + const int extract; }; struct cpu_cost_table diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644 --- a/gcc/config/arm/aarch-cost-tables.h +++ b/gcc/config/arm/aarch-cost-tables.h @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs = /* Vector */ { COSTS_N_INSNS (1), /* alu. */ - COSTS_N_INSNS (4) /* mult. */ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs = /* Vector */ { COSTS_N_INSNS (0), /* alu. */ - COSTS_N_INSNS (4) /* mult. 
*/ + COSTS_N_INSNS (4), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs = /* Vector */ { COSTS_N_INSNS (2), /* alu. */ - COSTS_N_INSNS (8) /* mult. */ + COSTS_N_INSNS (8), /* mult. */ + COSTS_N_INSNS (1), /* movi. */ + COSTS_N_INSNS (2), /* dup. */ + COSTS_N_INSNS (2) /* extract. */ } }; diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c new file mode 100644 index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c @@ -0,0 +1,97 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#include + +/* +**test1: +** adrp x[0-9]+, .LC[0-9]+ +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] +** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** str q[0-9]+, \[x[0-9]+\] +** fmov x[0-9]+, d[0-9]+ +** orr x[0-9]+, x[0-9]+, x[0-9]+ +** ret +*/ + +uint64_t +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt) +{ + uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL}; + uint64_t res = a | arr[0]; + uint64x2_t val = vld1q_u64 (arr); + *rt = vaddq_u64 (val, b); + return res; +} + +/* +**test2: +** adrp x[0-9]+, .LC[0-1]+ +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] +** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** str q[0-9]+, \[x[0-9]+\] +** fmov x[0-9]+, d[0-9]+ +** orr x[0-9]+, x[0-9]+, x[0-9]+ +** ret +*/ + +uint64_t +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt) +{ + uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL); + uint64_t arr = vgetq_lane_u64 (val, 0); + uint64_t res = a | arr; + *rt = vaddq_u64 (val, b); + return res; +} + +/* +**test3: +** adrp x[0-9]+, .LC[0-9]+ +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** str q[0-9]+, \[x1\] +** fmov w[0-9]+, s[0-9]+ +** orr w[0-9]+, w[0-9]+, w[0-9]+ +** ret +*/ + +uint32_t +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt) +{ + uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 }; + uint32_t res = a | arr[0]; + uint32x4_t val = vld1q_u32 (arr); + *rt = vaddq_u32 (val, b); + return res; +} + +/* +**test4: +** ushr v[0-9]+.16b, v[0-9]+.16b, 7 +** mov x[0-9]+, 16512 +** movk x[0-9]+, 0x1020, lsl 16 +** movk x[0-9]+, 0x408, lsl 32 +** movk x[0-9]+, 0x102, lsl 48 +** fmov d[0-9]+, x[0-9]+ +** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d +** dup v[0-9]+.2d, v[0-9]+.d\[0\] +** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d +** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b +** umov w[0-9]+, v[0-9]+.h\[3\] +** ret +*/ + +uint64_t +test4 (uint8x16_t input) +{ + uint8x16_t bool_input = vshrq_n_u8(input, 7); + poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL); + poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0), + vgetq_lane_p64(mask, 0)); + poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask); + uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH); + return vget_lane_u16((uint16x4_t)res, 3); +} + > -----Original Message----- > From: Richard Sandiford > Sent: Monday, October 25, 2021 3:32 PM > To: Tamar Christina > Cc: Tamar Christina via Gcc-patches ; Richard > Earnshaw ; nd ; Marcus > Shawcroft > Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants > and operations > > Tamar Christina 
writes: > >> -----Original Message----- > >> From: Richard Sandiford > >> Sent: Monday, October 25, 2021 10:54 AM > >> To: Tamar Christina > >> Cc: Tamar Christina via Gcc-patches ; > >> Richard Earnshaw ; nd ; > Marcus > >> Shawcroft > >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector > >> constants and operations > >> > >> Tamar Christina writes: > >> >> -----Original Message----- > >> >> From: Richard Sandiford > >> >> Sent: Saturday, October 23, 2021 11:40 AM > >> >> To: Tamar Christina via Gcc-patches > >> >> Cc: Tamar Christina ; Richard Earnshaw > >> >> ; nd ; Marcus > Shawcroft > >> >> > >> >> Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector > >> >> constants and operations > >> >> > >> >> Tamar Christina via Gcc-patches writes: > >> >> >> I'm still a bit sceptical about treating the high-part cost as lower. > >> >> >> ISTM that the subreg cases are the ones that are truly “free” > >> >> >> and any others should have a normal cost. So if CSE handled > >> >> >> the subreg case itself (to model how the rtx would actually be > >> >> >> generated) then > >> >> >> aarch64 code would have to do less work. I imagine that will > >> >> >> be true for > >> >> other targets as well. > >> >> > > >> >> > I guess the main problem is that CSE lacks context because it's > >> >> > not until after combine that the high part becomes truly "free" > >> >> > when pushed > >> >> into a high operation. > >> >> > >> >> Yeah. And the aarch64 code is just being asked to cost the > >> >> operation it's given, which could for example come from an > >> >> existing aarch64_simd_mov_from_high. I think we should try > >> >> to ensure that a aarch64_simd_mov_from_high followed by > some > >> arithmetic > >> >> on the result is more expensive than the fused operation (when > >> >> fusing is possible). > >> >> > >> >> An analogy might be: if the cost code is given: > >> >> > >> >> (add (reg X) (reg Y)) > >> >> > >> >> then, at some later point, the (reg X) might be replaced with a > >> >> multiplication, in which case we'd have a MADD operation and the > >> >> addition is effectively free. Something similar would happen if > >> >> (reg > >> >> X) became a shift by a small amount on newer cores, although I > >> >> guess then you could argue either that the cost of the add > >> >> disappears or that > >> the cost of the shift disappears. > >> >> > >> >> But we shouldn't count ADD as free on the basis that it could be > >> >> combined with a multiplication or shift in future. We have to > >> >> cost what we're given. I think the same thing applies to the high part. > >> >> > >> >> Here we're trying to prevent cse1 from replacing a DUP (lane) with > >> >> a MOVI by saying that the DUP is strictly cheaper than the MOVI. > >> >> I don't think that's really true though, and the cost tables in > >> >> the patch say that DUP is more expensive (rather than less > >> >> expensive) than > >> MOVI. > >> > > >> > No we're not. The front end has already pushed the constant into > >> > each operation that needs it which is the entire problem. > >> > >> I think we're talking about different things here. I'll come to the > >> gimple stuff below, but I was talking purely about the effect on the > >> RTL optimisers. 
What I meant above is that, in the cse1 dumps, the patch > leads to changes like: > >> > >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > >> - (const_vector:V8QI [ > >> + (vec_select:V8QI (reg:V16QI 116) > >> + (parallel:V16QI [ > >> + (const_int 8 [0x8]) > >> + (const_int 9 [0x9]) > >> + (const_int 10 [0xa]) > >> + (const_int 11 [0xb]) > >> + (const_int 12 [0xc]) > >> + (const_int 13 [0xd]) > >> + (const_int 14 [0xe]) > >> + (const_int 15 [0xf]) > >> + ]))) "include/arm_neon.h":6477:22 1394 > >> {aarch64_simd_mov_from_v16qihigh} > >> + (expr_list:REG_EQUAL (const_vector:V8QI [ > >> (const_int 3 [0x3]) repeated x8 > >> - ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi} > >> - (expr_list:REG_DEAD (reg:V16QI 117) > >> - (nil))) > >> + ]) > >> + (expr_list:REG_DEAD (reg:V16QI 117) > >> + (nil)))) > >> > >> The pre-cse1 code is: > >> > >> (insn 19 18 20 2 (set (reg:V16QI 117) > >> (const_vector:V16QI [ > >> (const_int 3 [0x3]) repeated x16 > >> ])) "include/arm_neon.h":6477:22 1166 {*aarch64_simd_movv16qi} > >> (nil)) > >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > >> (vec_select:V8QI (reg:V16QI 117) > >> (parallel:V16QI [ > >> (const_int 8 [0x8]) > >> (const_int 9 [0x9]) > >> (const_int 10 [0xa]) > >> (const_int 11 [0xb]) > >> (const_int 12 [0xc]) > >> (const_int 13 [0xd]) > >> (const_int 14 [0xe]) > >> (const_int 15 [0xf]) > >> ]))) "include/arm_neon.h":6477:22 1394 > >> {aarch64_simd_mov_from_v16qihigh} > >> (nil)) > >> > >> That is, before the patch, we folded insn 19 into insn 20 to get: > >> > >> (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > >> (const_vector:V8QI [ > >> (const_int 3 [0x3]) repeated x8 > >> ])) "include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi} > >> (expr_list:REG_DEAD (reg:V16QI 117) > >> (nil))) > >> > >> After the patch we reject that because: > >> > >> (set (reg:V8QI X) (const_vector:V8QI [3])) > >> > >> is costed as a MOVI (cost 4) and the original > >> aarch64_simd_mov_from_v16qihigh is costed as zero. In other words, > >> the patch makes the DUP (lane) in the “mov high” strictly cheaper > >> than a constant move (MOVI). > > > > Yes, this was done intentionally because as we talked about a month > > ago there's no real way to cost this correctly. The use of `X` there > > determines whether it's cheaper to use the movi over the dup. The > > MOVI not only prevent re-use of the value, it also prevents combining > > into high operations. All of which is impossible to tell currently in how CSE > and costing are done. > > > > This is an unmodified compiler created from last night's trunk > > https://godbolt.org/z/1saTP4xWs > > > > While yes, it did fold movi into the set, reg 19 wasn't dead, so you > > now materialized the constant 3 times > > > > test0: > > ldr q0, [x0] > > movi v3.8b, 0x3 <<<< first > > ldr q2, [x1] > > movi v5.16b, 0x3 <<< second > > uxtl v1.8h, v0.8b > > dup d4, v2.d[1] <<< third > > uxtl2 v0.8h, v0.16b > > umlal v1.8h, v2.8b, v5.8b > > umlal v0.8h, v4.8b, v3.8b > > addhn v0.8b, v1.8h, v0.8h > > str d0, [x2] > > ret > > > > whilst my patch, generates > > > > test0: > > movi v2.16b, 0x3 <<< once > > ldr q0, \[x0\] > > uxtl v1.8h, v0.8b > > uxtl2 v0.8h, v0.16b > > ldr q3, \[x1\] > > umlal v1.8h, v3.8b, v2.8b > > umlal2 v0.8h, v3.16b, v2.16b > > addhn v0.8b, v1.8h, v0.8h > > str d0, \[x2\] > > ret > > > > Yes it's not perfect, yes you can end up with a dup instead of two > > movi's but my argument is it's still a step forward as the perfect solution > doesn't seem to be possible at all with the way things are currently set up. 
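(For reference, since test0 is only shown above through its assembly and the Compiler Explorer link, a kernel along the following lines reproduces the quoted before/after sequences. It is an illustrative reconstruction, mirroring the test0_mod variant that appears further down, and not necessarily the exact source behind the link.)

#include <arm_neon.h>

/* Illustrative reconstruction of test0: the vector constant 3 feeds both
   halves of the widening multiply-accumulate, so ideally it is materialised
   once and its high half is consumed by umlal2 directly.  */
void
test0 (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0)
{
  uint8x16_t x = vld1q_u8 (inptr0);
  uint8x16_t y = vld1q_u8 (inptr1);
  uint8x16_t three = vdupq_n_u8 (3);
  uint16x8_t z_l = vmlal_u8 (vmovl_u8 (vget_low_u8 (x)),
                             vget_low_u8 (y), vget_low_u8 (three));
  uint16x8_t z_h = vmlal_u8 (vmovl_u8 (vget_high_u8 (x)),
                             vget_high_u8 (y), vget_high_u8 (three));
  vst1_u8 (outptr0, vaddhn_u16 (z_l, z_h));
}

With the costing change, the vget_high_u8 (three) stays a vec_select of the 16-byte constant long enough for combine to form umlal2, which is what the single-movi sequence quoted above relies on.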
> > I agree there's no out-of-the-box way of getting what we want for the > original testcases. It would require changes outside the target or (if the > worst comes to the worst) a target-specific pass. > > >> Preventing this fold seems like a key part of being able to match the > >> *l2 forms in the testcase, since otherwise the “mov high” disappears > >> and isn't available for combining later. > > > > Yes, and by preventing the folding combine should in principle be able > > to fold it back if it wasn't pushed into another Instruction, but combine > does not attempt to touch constants and selects on their own. If it did this > "regression" would be fixed. > > The problem is that combine is limited to individual EBBs and only combines > def-use chains when there is a single use. It's not a general folding engine. > > > I'm not really quite sure what we're arguing about.. I did think about all > three possible cases when making this: > > > > https://godbolt.org/z/hjWhWq1v1 > > > > Of the three cases the compiler currently only generates something good > for test2. Both test1 and test0 are deficient. > > The patch doesn't change test2, significantly improves test0 and whether > test1 is a regression is likely uArch specific. > > > > On Arm Cortex CPUs it is not a regression as a DUP on a SIMD scalar > > has the same throughput and latencies as a MOVI according to the Arm > Performance Software Optimization guides. > > Costing them as equal would be OK when they are equal. It's the “DUP > (lane)/ mov high is strictly cheaper bit” I'm concerned about. > > > So to me this looks like an improvement overall. And this is where we likely > disagree? > > Well, the disagreement isn't about whether the new compiler output for > these testcases is better than the old compiler output. It's more a question > of how we're getting there. > > >> > MOVI as I mentioned before is the one case where this is a toss up. > >> > But there are far more constants that cannot be created with a movi. > >> > A simple example is > >> > > >> > #include > >> > > >> > int8x16_t square(int8x16_t full, int8x8_t small) { > >> > int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15}; > >> > int8x8_t low = vget_high_s8 (cst); > >> > int8x8_t res1 = vmul_s8 (small, low); > >> > return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, > >> > res1)); } > >> > > >> > Where in Gimple we get > >> > > >> > [local count: 1073741824]: > >> > _2 = __builtin_aarch64_get_highv16qi ({ 0, 1, 2, 3, 4, 5, 6, 7, > >> > 8, 9, 10, 11, 12, > >> 13, 15, 0 }); > >> > _4 = _2 * small_3(D); > >> > _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }; > >> > _7 = __builtin_aarch64_combinev8qi (_4, _4); > >> > _8 = _6 + _7; > >> > return _8; > >> > > >> > Regardless of what happens to __builtin_aarch64_get_highv16qi > >> > nothing will recreate the relationship with cst, whether > >> __builtin_aarch64_get_highv16qi is lowered or not, constant prop will > >> still push in constants. > >> > >> Yeah, constants are (by design) free in gimple. But that's OK in > >> itself, because RTL optimisers have the job of removing any > >> duplicates that end up requiring separate moves. I think we both agree > on that. > >> > >> E.g. 
for: > >> > >> #include > >> > >> void foo(int8x16_t *x) { > >> x[0] = vaddq_s8 (x[0], (int8x16_t) {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}); > >> x[1] = vaddq_s8 (x[1], (int8x16_t) > >> {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}); > >> } > >> > >> the final gimple is: > >> > >> [local count: 1073741824]: > >> _1 = *x_4(D); > >> _5 = _1 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; > >> *x_4(D) = _5; > >> _2 = MEM[(int8x16_t *)x_4(D) + 16B]; > >> _7 = _2 + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; > >> MEM[(int8x16_t *)x_4(D) + 16B] = _7; > >> return; > >> > >> but cse1 removes the duplicated constant even before the patch. > > > > It doesn't for me, again an unmodified compiler: > > > > https://godbolt.org/z/qnvf7496h > > FWIW, the link for my example is: > > https://godbolt.org/z/G6vaE3nab > > but it sounds like the disagreement wasn't where I thought it was. > > > and CSE1 has as the final codegen: > > > > (insn 7 4 8 2 (set (reg:V16QI 99) > > (const_vector:V16QI [ > > (const_int 0 [0]) > > (const_int 1 [0x1]) > > (const_int 2 [0x2]) > > (const_int 3 [0x3]) > > (const_int 4 [0x4]) > > (const_int 5 [0x5]) > > (const_int 6 [0x6]) > > (const_int 7 [0x7]) > > (const_int 8 [0x8]) > > (const_int 9 [0x9]) > > (const_int 10 [0xa]) > > (const_int 11 [0xb]) > > (const_int 12 [0xc]) > > (const_int 13 [0xd]) > > (const_int 15 [0xf]) > > (const_int 0 [0]) > > ])) > > > > (insn 8 7 9 2 (set (reg:V8QI 92 [ _2 ]) > > (const_vector:V8QI [ > > (const_int 8 [0x8]) > > (const_int 9 [0x9]) > > (const_int 10 [0xa]) > > (const_int 11 [0xb]) > > (const_int 12 [0xc]) > > (const_int 13 [0xd]) > > (const_int 15 [0xf]) > > (const_int 0 [0]) > > ])) > > > > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ]) > > (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ]) > > (parallel:V16QI [ > > (const_int 0 [0]) > > (const_int 1 [0x1]) > > (const_int 2 [0x2]) > > (const_int 3 [0x3]) > > (const_int 4 [0x4]) > > (const_int 5 [0x5]) > > (const_int 6 [0x6]) > > (const_int 7 [0x7]) > > ])) > > (reg:V8QI 93 [ _4 ]))) > > Here, insn 8 is the folded version of the vget_high_s8 and insn 11 is part of > the vcombine_s8. With that caveat… > > > So again same constant represented twice, which is reflected in the > codegen. > > …right, the above is also what I was saying that we generate before the patch > for your square example. > > But as you say later this testcase is demonstrating the point that constants > loaded from memory should be more expensive than DUP (lane). > I agree with that. The bit I don't agree with is costing the DUP (lane) as zero, > so that it's also strictly cheaper than MOVI. 
> > So I think the disagreement is more about things like the first example in the > testcase: > > https://godbolt.org/z/xrMnezrse > > Specifically: is it legitimate to fold: > > (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > (vec_select:V8QI (reg:V16QI 117) > (parallel:V16QI [ > (const_int 8 [0x8]) > (const_int 9 [0x9]) > (const_int 10 [0xa]) > (const_int 11 [0xb]) > (const_int 12 [0xc]) > (const_int 13 [0xd]) > (const_int 14 [0xe]) > (const_int 15 [0xf]) > ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64- > unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":6477:22 1394 > {aarch64_simd_mov_from_v16qihigh} > (nil)) > > to: > > (insn 20 19 21 2 (set (reg:V8QI 96 [ _8 ]) > (const_vector:V8QI [ > (const_int 3 [0x3]) repeated x8 > ])) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64- > unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":6477:22 1160 {*aarch64_simd_movv8qi} > (expr_list:REG_DEAD (reg:V16QI 117) > (nil))) > > without first trying to get rid of the instruction some other way (through > combine)? > > I think it is legitimate, since the new MOVI instruction is at least as cheap as > the original DUP. Even if CSE didn't do the fold itself, and just CSEd the two > uses of the V16QI constant, I think it would be legitimate for a later patch to > fold the instruction to a constant independently of CSE. > > IMO: > > vget_high_s8(vdupq_n_u8(3)) > > is just a roundabout way of writing: > > vdup_n_u8(3) > > We've described what vget_high_s8 does in target-independent rtl (i.e. > without unspecs) so it's natural that operations with constant operands will > themselves get folded to a constant. > > I think we should accept that and try to generate the output we want in an > environment where such folds do happen, rather than trying to prevent the > folds from happening until during or after combine. > > That approach could also work for autovec output, and cases where the user > wrote the 8-byte constants directly. E.g. I think we should aim to optimise: > > void test0_mod (uint8_t *inptr0, uint8_t *inptr1, uint8_t *outptr0) { > uint8x8_t three_u8 = vdup_n_u8(3); > uint8x16_t x = vld1q_u8(inptr0); > uint8x16_t y = vld1q_u8(inptr1); > uint16x8_t x_l = vmovl_u8(vget_low_u8(x)); > uint16x8_t x_h = vmovl_u8(vget_high_u8(x)); > uint16x8_t z_l = vmlal_u8(x_l, vget_low_u8(y), three_u8); > uint16x8_t z_h = vmlal_u8(x_h, vget_high_u8(y), three_u8); > vst1_u8(outptr0, vaddhn_u16(z_l, z_h)); } > > in the same way as the original test0. Similarly we should aim to optimise: > > int8x16_t square_mode(int8x16_t full, int8x8_t small) { > int8x16_t cst = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,15}; > int8x8_t low = {8,9,10,11,12,13,15}; > int8x8_t res1 = vmul_s8 (small, low); > return vaddq_s8 (vmulq_s8 (full, cst), vcombine_s8 (res1, res1)); } > > in the same way as square. > > >> so that there are no longer any duplicate constants (as far as the > >> RTL code is concerned). Instead we have one 16-byte constant and one 8- > byte constant. > >> > >> The patch prevents the fold on insn 8 by making the “mov high” > >> strictly cheaper than the constant move, so we keep the “mov high” > >> and its 16-byte input. Keeping the “mov high” means that we do have > >> a duplicate constant for CSE to remove. > >> > >> What I meant… > >> > >> >> Also, if I've understood correctly, it looks like we'd be relying > >> >> on the vget_high of a constant remaining unfolded until RTL cse1. 
> >> >> I think it's likely in future that we'd try to fold vget_high at > >> >> the gimple level instead, since that could expose more > >> >> optimisations of a different kind. The gimple optimisers would > >> >> then fold > >> >> vget_high(constant) in a similar way to > >> >> cse1 does now. > >> >> > >> >> So perhaps we should continue to allow the vget_high(constant) to > >> >> be foloded in cse1 and come up with some way of coping with the > >> >> folded > >> form. > >> > >> …here was that, in future, the gimple optimisers might be able to > >> fold the vget_high themselves. For your example, we'd then have: > >> > >> _4 = { 8, 9, 10, 11, 12, 13, 15, 0 } * small_3(D); > >> _6 = full_5(D) * { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 }; > >> _7 = __builtin_aarch64_combinev8qi (_4, _4); > >> _8 = _6 + _7; > >> return _8; > >> > >> In this situation, we'd need to recreate the relationship between { > >> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } and { 8, 9, 10, > >> 11, 12, 13, 15, 0 }. We can't ensure that the relationship is never lost. > >> > >> The same thing would be true for vget_low. So a constant like: > >> > >> cst = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 0 } > >> … vget_low* (cst) ..; > >> … vget_high* (cst) …; > >> > >> could be folded to two smaller constants: > >> > >> … { 0, 1, 2, 3, 4, 5, 6, 7 } …; > >> … { 8, 9, 10, 11, 12, 13, 15, 0 } …; > >> > >> We might then need to recreate the combined form, rather than relying > >> on the combined form already existing. > > > > Yes but this is what confuses me. My patch changes it so that CSE1 > > which is ran relatively early is able to find the relationship between the two > constants. > > Yeah, it does that for the case where the vector constant is a duplicate of a > single element. My example above doesn't fall into that category though. > > What I was saying was: let's suppose that a vget_low/vget_high pair for a > general V16QI vector constant is folded at the gimple level (by later patches). > Then the RTL optimisers just see two V8QI constants rather than a single > V16QI constant. The optimisers would need to generate the V16QI “from > scratch” if they wanted to, as for test0_mod above. > > > CSE1 shouldn't do any folding, it doesn't have enough information to do so. > > By CSE doing folding it makes it so combine is less efficient. > > I don't agree with that as a general statement. I agree that stopping pre- > combine passes from folding helps examples like test0, but I don't think that > means that pre-combine passes are doing the wrong thing by folding. IMO > the problem is more that we are very opportunistic in looking for high-part > operations (and by-lane operations). Legitimate optimisations can easily > defeat this opportunistic matching. > > >> > CSE1 doesn't fold it, because for CSE the cost is too high to do > >> > so. Which is > >> what this costing was attempting to fix. > >> > CSE simply does not touch it. 
It leaves it as > >> > > >> > (insn 11 10 12 2 (set (reg:V16QI 95 [ _7 ]) > >> > (vec_concat:V16QI (vec_select:V8QI (reg:V16QI 95 [ _7 ]) > >> > (parallel:V16QI [ > >> > (const_int 0 [0]) > >> > (const_int 1 [0x1]) > >> > (const_int 2 [0x2]) > >> > (const_int 3 [0x3]) > >> > (const_int 4 [0x4]) > >> > (const_int 5 [0x5]) > >> > (const_int 6 [0x6]) > >> > (const_int 7 [0x7]) > >> > ])) > >> > (reg:V8QI 93 [ _4 ]))) "":6506:10 1908 > >> {aarch64_simd_move_hi_quad_v16qi} > >> > (nil)) > >> > (insn 12 11 13 2 (set (reg:V16QI 102) > >> > (const_vector:V16QI [ > >> > (const_int 0 [0]) > >> > (const_int 1 [0x1]) > >> > (const_int 2 [0x2]) > >> > (const_int 3 [0x3]) > >> > (const_int 4 [0x4]) > >> > (const_int 5 [0x5]) > >> > (const_int 6 [0x6]) > >> > (const_int 7 [0x7]) > >> > (const_int 8 [0x8]) > >> > (const_int 9 [0x9]) > >> > (const_int 10 [0xa]) > >> > (const_int 11 [0xb]) > >> > (const_int 12 [0xc]) > >> > (const_int 13 [0xd]) > >> > (const_int 15 [0xf]) > >> > (const_int 0 [0]) > >> > ])) "":1466:14 1166 {*aarch64_simd_movv16qi} > >> > (nil)) > >> > >> I don't think that's true for the unpatched compiler. Are you sure > >> this isn't the “pre-CSE” part of the dump? CSE is confusing (to me) > >> in that it prints each function twice, once in unoptimised form and later in > optimised form. > >> > > > > Yes I'm sure, see all the compiler explorer links above. > > Ah, yeah, I misunderstood which insn you were quoting. But insn 11 in: > > https://godbolt.org/z/rrbP14var > > is part of the vcombine_s8. The preceding instructions are: > > (insn 9 8 10 2 (set (reg:V8QI 93 [ _4 ]) > (mult:V8QI (reg:V8QI 92 [ _2 ]) > (reg/v:V8QI 98 [ small ]))) "/opt/compiler-explorer/arm64/gcc-trunk- > 20211025/aarch64-unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":1402:14 1428 {mulv8qi3} > (expr_list:REG_DEAD (reg/v:V8QI 98 [ small ]) > (expr_list:REG_DEAD (reg:V8QI 92 [ _2 ]) > (nil)))) > (insn 10 9 11 2 (set (reg:V16QI 95 [ _7 ]) > (vec_concat:V16QI (reg:V8QI 93 [ _4 ]) > (const_vector:V8QI [ > (const_int 0 [0]) repeated x8 > ]))) "/opt/compiler-explorer/arm64/gcc-trunk-20211025/aarch64- > unknown-linux-gnu/lib/gcc/aarch64-unknown-linux- > gnu/12.0.0/include/arm_neon.h":6506:10 1892 > {move_lo_quad_internal_v16qi} > (nil)) > > and since the multiplication result is variable, we can't fold this. > > The vget_high is insn 8, which does get folded (but it sounds like we agree on > that). > > > > > And I don't see any way to fix this without having Gimple not push > > > constants in, which would lead to worse regressions. > > > > I can change the patch to cost the high as a dup which fixes this > > > > codegen at > > > least and has you rematerialize movi. If that's > > > > not acceptable I can drop costing for High entirely then, it's not > > > > the main > > > thing I am fixing. > > > > > > Costing the high as a dup leaves us in the same situation as before > > > the > > > patch: the folded V8QI constant is cheaper than the unfolded mov high. > > > > Yes and the dup will reflect that. The argument that it's not the > > right cost no longer hold any water in that case. > > Yeah, my concerns disappear in that case. > > > In particular as I still maintain that is too early to do any constant > > folding in CSE1 for AArch64. > > > > Whether it's folded or not doesn't make any difference to combine > > which will Fold when combinations are possible with the folder version. > > > > So I have yet to see any actual regression. 
> 
> Well, this isn't going to win any awards for realism :-), but:
> 
> #include <arm_neon.h>
> 
> int8x16_t foo() {
>   int8x16_t a = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
>   int8x8_t b = vget_high_s8 (a);
>   int8x8_t c = { 4, 5, 6, 7, 8, 9, 10, 11 };
>   int8x8_t d = vadd_s8 (b, c);
>   int8x16_t e = vcombine_s8 (d, b);
>   return vaddq_s8 (e, a);
> }
> 
> is folded to a constant before the patch and isn't after the patch.
> 
> Your examples are more realistic than that one, but I think this does show
> why preventing folding can be counter-productive in some cases.
> 
> My hope is that one day gimple would fold that example to a constant.
> But if it does, it will also fold the vget_highs and vget_lows in the original
> testcase to constants, meaning that we can't rely on the original V16QI
> constant existing as well.
> 
> Thanks,
> Richard
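As an aside on the aarch64_simd_dup change in the patch above: the new w<-r alternative covers broadcasts where the scalar value lives in a general-purpose register. A hypothetical example of the kind of code that could use it follows; it is not taken from the patch or its testsuite, and whether the register allocator actually picks the single-instruction dup from the general register depends on the surrounding code and tuning.

#include <arm_neon.h>
#include <string.h>

/* Hypothetical: BITS arrives in a general register, so with the w<-r
   alternative the broadcast can in principle be a single
   "dup v0.4s, w0" rather than an fmov followed by a lane dup.  */
float32x4_t
dup_bits (uint32_t bits)
{
  float f;
  memcpy (&f, &bits, sizeof f);   /* bit-cast; typically optimised away.  */
  return vdupq_n_f32 (f);
}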