From: Richard Sandiford
To: Tamar Christina
Cc: Tamar Christina via Gcc-patches, Richard Earnshaw, nd, Marcus Shawcroft
Subject: Re: [PATCH 2/2] AArch64: Add better costing for vector constants and operations
Date: Tue, 26 Oct 2021 15:46:05 +0100

Tamar Christina writes:
> Hi,
>
> Following the discussion below here's a revised patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?

Looks good functionally, just got some comments about the implementation.

> @@ -14006,8 +14007,52 @@ cost_plus:
>                               mode, MULT, 1, speed);
>            return true;
>          }
> +      break;
> +    case CONST_VECTOR:
> +      {
> +        rtx gen_insn = aarch64_simd_make_constant (x, true);
> +        /* Not a valid const vector.  */
> +        if (!gen_insn)
> +          break;
>
> -      /* Fall through.  */
> +        switch (GET_CODE (gen_insn))
> +        {
> +        case CONST_VECTOR:
> +          /* Load using MOVI/MVNI.  */
> +          if (aarch64_simd_valid_immediate (x, NULL))
> +            *cost += extra_cost->vect.movi;
> +          else /* Load using constant pool.  */
> +            *cost += extra_cost->ldst.load;
> +          break;
> +        /* Load using a DUP.  */
> +        case VEC_DUPLICATE:
> +          gcc_unreachable ();
> +          break;
> +        default:
> +          *cost += extra_cost->ldst.load;
> +          break;
> +        }
> +        return true;
> +      }

This might be a problem (if it is a problem) with some of the existing
cases too, but: is using += rather than = the right behaviour here?
It means that we add our cost on top of whatever the target-independent
rtx_costs thought was a good default choice, whereas it looks like these
table entries specify the correct full cost.  If it's not clear-cut,
then I think using = would be better.
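As a toy illustration of the difference (not patch code; the variable
names and the particular cost values here are made up):

    int cost = COSTS_N_INSNS (1);  /* default seeded by generic rtx_costs */
    int movi = COSTS_N_INSNS (1);  /* table entry: full cost of the insn  */

    cost += movi;  /* 2 insns' worth: stacks the entry on the default  */
    cost = movi;   /* 1 insn's worth: the entry stands as the full cost */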
Also, going back to an earlier part of the thread, I think the “inner”
CONST_VECTOR case is now a correct replacement for the “outer”
CONST_VECTOR case, meaning we don't need the
aarch64_simd_make_constant bits.  I.e. I think we can make the
top-level case:

    case CONST_VECTOR:
      /* Load using MOVI/MVNI.  */
      if (aarch64_simd_valid_immediate (x, NULL))
        *cost = extra_cost->vect.movi;
      else /* Load using constant pool.  */
        *cost = extra_cost->ldst.load;
      break;

> +    case VEC_CONCAT:
> +      /* depending on the operation, either DUP or INS.
> +         For now, keep default costing.  */
> +      break;
> +    case VEC_DUPLICATE:
> +      *cost += extra_cost->vect.dup;
> +      return true;

For this I think we should do:

    *cost = extra_cost->vect.dup;
    return false;

so that we cost the operand of the vec_duplicate as well.  This will
have no effect if the operand is a REG, but would affect more complex
expressions.

> +    case VEC_SELECT:
> +      {

Here I think we should recurse on operand 0:

    rtx op0 = XEXP (x, 0);
    *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);

> +        /* cost subreg of 0 as free, otherwise as DUP */
> +        rtx op1 = XEXP (x, 1);
> +        if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> +          ;
> +        else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> +          *cost += extra_cost->vect.dup;
> +        else
> +          *cost += extra_cost->vect.extract;
> +        return true;
> +      }
>      default:
>        break;
>      }
> @@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
>
>  /* If VALS is a vector constant that can be loaded into a register
>     using DUP, generate instructions to do so and return an RTX to
> -   assign to the register.  Otherwise return NULL_RTX.  */
> +   assign to the register.  Otherwise return NULL_RTX.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_dup_constant (rtx vals)
> +aarch64_simd_dup_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    machine_mode inner_mode = GET_MODE_INNER (mode);
> @@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals)
>    /* We can load this constant by using DUP and a constant in a
>       single ARM register.  This will be cheaper than a vector
>       load.  */
> -  x = copy_to_mode_reg (inner_mode, x);
> +  if (!check)
> +    x = copy_to_mode_reg (inner_mode, x);
>    return gen_vec_duplicate (mode, x);
>  }
>
> @@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals)
>  /* Generate code to load VALS, which is a PARALLEL containing only
>     constants (for vec_init) or CONST_VECTOR, efficiently into a
>     register.  Returns an RTX to copy into the register, or NULL_RTX
> -   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
> +   for a PARALLEL that cannot be converted into a CONST_VECTOR.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_make_constant (rtx vals)
> +aarch64_simd_make_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    rtx const_dup;
> @@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals)
>        && aarch64_simd_valid_immediate (const_vec, NULL))
>      /* Load using MOVI/MVNI.  */
>      return const_vec;
> -  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
> +  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
>      /* Loaded using DUP.  */
>      return const_dup;
>    else if (const_vec != NULL_RTX)

With the inner CONST_VECTOR case replacing the outer one, I think we
can drop the aarch64_simd_dup_constant and aarch64_simd_make_constant
bits.
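For reference, folding the two suggestions above into the patch's cases
would give something roughly like this (a sketch of the reviewed
direction, not final code; it assumes the surrounding aarch64_rtx_costs
context for x, mode, speed and extra_cost, and vec_series_lowpart_p /
vec_series_highpart_p as posted in this series):

    case VEC_DUPLICATE:
      *cost = extra_cost->vect.dup;
      /* Return false so the duplicated operand is costed too; a plain
         REG adds nothing, but more complex operands will.  */
      return false;

    case VEC_SELECT:
      {
        /* Cost the vector being selected from.  */
        rtx op0 = XEXP (x, 0);
        *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);

        /* Cost a select of the low part as free, the high part as a
           DUP, and anything else as an element extract.  */
        rtx op1 = XEXP (x, 1);
        if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
          ;
        else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
          *cost += extra_cost->vect.dup;
        else
          *cost += extra_cost->vect.extract;
        return true;
      }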
> diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
> index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -133,6 +133,9 @@ struct vector_cost_table
>  {
>    const int alu;
>    const int mult;
> +  const int movi;
> +  const int dup;
> +  const int extract;
>  };
>
>  struct cpu_cost_table
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
> index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (0),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (2),  /* alu.  */
> -    COSTS_N_INSNS (8)   /* mult.  */
> +    COSTS_N_INSNS (8),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

Could you try this with -mabi=ilp32?  It looks like it might fail.
Skipping it is OK if so.
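If it does fail there, an untested sketch of the usual way to skip it,
following the pattern used elsewhere in the aarch64 testsuite, would be:

    /* { dg-skip-if "LP64-only code generation" { *-*-* } { "-mabi=ilp32" } { "" } } */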
OK with those changes, if they work.

Thanks,
Richard

> +
> +#include <arm_neon.h>
> +
> +/*
> +**test1:
> +**    adrp    x[0-9]+, .LC[0-9]+
> +**    ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**    add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**    str     q[0-9]+, \[x[0-9]+\]
> +**    fmov    x[0-9]+, d[0-9]+
> +**    orr     x[0-9]+, x[0-9]+, x[0-9]+
> +**    ret
> +*/
> +
> +uint64_t
> +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
> +  uint64_t res = a | arr[0];
> +  uint64x2_t val = vld1q_u64 (arr);
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test2:
> +**    adrp    x[0-9]+, .LC[0-1]+
> +**    ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**    add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**    str     q[0-9]+, \[x[0-9]+\]
> +**    fmov    x[0-9]+, d[0-9]+
> +**    orr     x[0-9]+, x[0-9]+, x[0-9]+
> +**    ret
> +*/
> +
> +uint64_t
> +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
> +  uint64_t arr = vgetq_lane_u64 (val, 0);
> +  uint64_t res = a | arr;
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test3:
> +**    adrp    x[0-9]+, .LC[0-9]+
> +**    ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**    add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**    str     q[0-9]+, \[x1\]
> +**    fmov    w[0-9]+, s[0-9]+
> +**    orr     w[0-9]+, w[0-9]+, w[0-9]+
> +**    ret
> +*/
> +
> +uint32_t
> +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> +{
> +  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
> +  uint32_t res = a | arr[0];
> +  uint32x4_t val = vld1q_u32 (arr);
> +  *rt = vaddq_u32 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test4:
> +**    ushr    v[0-9]+.16b, v[0-9]+.16b, 7
> +**    mov     x[0-9]+, 16512
> +**    movk    x[0-9]+, 0x1020, lsl 16
> +**    movk    x[0-9]+, 0x408, lsl 32
> +**    movk    x[0-9]+, 0x102, lsl 48
> +**    fmov    d[0-9]+, x[0-9]+
> +**    pmull   v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> +**    dup     v[0-9]+.2d, v[0-9]+.d\[0\]
> +**    pmull2  v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
> +**    trn2    v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**    umov    w[0-9]+, v[0-9]+.h\[3\]
> +**    ret
> +*/
> +
> +uint64_t
> +test4 (uint8x16_t input)
> +{
> +  uint8x16_t bool_input = vshrq_n_u8(input, 7);
> +  poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> +  poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> +                             vgetq_lane_p64(mask, 0));
> +  poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> +  uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> +  return vget_lane_u16((uint16x4_t)res, 3);
> +}