From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-il1-x134.google.com (mail-il1-x134.google.com [IPv6:2607:f8b0:4864:20::134]) by sourceware.org (Postfix) with ESMTPS id D13743858000 for ; Tue, 2 Nov 2021 10:39:41 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org D13743858000 Received: by mail-il1-x134.google.com with SMTP id w15so15314674ill.2 for ; Tue, 02 Nov 2021 03:39:41 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to; bh=9/Qe/JTnm+9+D+3uP47Ks7BIJefQdU/NPFIbBE4PsN4=; b=PiIn1a1+XMoHFgWf6lunkZYKUFv4jbP+xb0xgMwx0PgDi7xC+LEs4Ht2PFj1RyRq+Q lZu4E8V3PdhD8ZDKgrQuRGrBZlPBm2xB5GTDaQN+DDV4vJkDuHm7onsOsTw/XhEkQ/7M kmNN4DpRQQOwV8XV86fwosKWAMZjlIzWE0FBccj4hjbON6MOtM6fs24PPDt2viSLb3IY ktPUSgsWOcEKJfg0j3rb8ebBqAbU2KSFCZoD0i2kb+HN04V9I5pT+iogFZQmqE0JtzHn CZOZh9fa3reRMRNp+y05kCaIFbSJ+KJ2ToHRhczhokYuYOzmsD8AN8Z7I2Tqj1SNIoBp lHrg== X-Gm-Message-State: AOAM5307JFEBJNkJntHj+h8IQJ6j+BEfJAvFAEjogxye3w8M7g3e9wuA rO5FLlHu9PEtEKOEnz1hs6kzkh1YXrVomjleMyI= X-Google-Smtp-Source: ABdhPJznGUY1CwGmcAGOEjWtd7fa8knTM0jvelkgOa3kyN02vD4ScCmFIsBaOz+gc5Nbzv5Lu7Juoaz4TMGWQQH/ifs= X-Received: by 2002:a92:cb12:: with SMTP id s18mr16659043ilo.321.1635849581122; Tue, 02 Nov 2021 03:39:41 -0700 (PDT) MIME-Version: 1.0 References: In-Reply-To: From: Christophe Lyon Date: Tue, 2 Nov 2021 11:39:30 +0100 Message-ID: Subject: Re: [PATCH 2/2]AArch64: Add better costing for vector constants and operations To: Richard Sandiford , Tamar Christina , Tamar Christina via Gcc-patches , Richard Earnshaw , nd , Marcus Shawcroft X-Spam-Status: No, score=-8.2 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, HTML_MESSAGE, KAM_LOTSOFHASH, KAM_MANYTO, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org Content-Type: text/plain; charset="UTF-8" X-Content-Filtered-By: Mailman/MimeDel 2.1.29 X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 02 Nov 2021 10:39:45 -0000 Hi Tamar, On Fri, Oct 29, 2021 at 5:23 PM Richard Sandiford via Gcc-patches < gcc-patches@gcc.gnu.org> wrote: > Tamar Christina writes: > > Hi All, > > > > Attached is a new version that fixes the previous SVE fallouts in a new > way. > > > > Ok for master? > Looks like you forgot to try to build for arm* targets, you patch breaks the build: gcc/config/arm/arm.c:1194:1: error: uninitialized const member 'vector_cost_table::movi' [....] You probably need to initialize the new field for arm targets too. Can you check? Thanks, Christophe > > > Thanks, > > Tamar > > > > --- inline copy of patch --- > > > > > > diff --git a/gcc/config/aarch64/aarch64-cost-tables.h > b/gcc/config/aarch64/aarch64-cost-tables.h > > index > dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 > 100644 > > --- a/gcc/config/aarch64/aarch64-cost-tables.h > > +++ b/gcc/config/aarch64/aarch64-cost-tables.h > > @@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* Alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -333,7 +339,10 @@ const struct cpu_cost_table > thunderx2t99_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* Alu. */ > > - COSTS_N_INSNS (4) /* Mult. */ > > + COSTS_N_INSNS (4), /* Mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -437,7 +446,10 @@ const struct cpu_cost_table > thunderx3t110_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* Alu. */ > > - COSTS_N_INSNS (4) /* Mult. */ > > + COSTS_N_INSNS (4), /* Mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > > index > 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 > 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup" > > ) > > > > (define_insn "aarch64_simd_dup" > > - [(set (match_operand:VDQF_F16 0 "register_operand" "=w") > > + [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w") > > (vec_duplicate:VDQF_F16 > > - (match_operand: 1 "register_operand" "w")))] > > + (match_operand: 1 "register_operand" "w,r")))] > > "TARGET_SIMD" > > - "dup\\t%0., %1.[0]" > > - [(set_attr "type" "neon_dup")] > > + "@ > > + dup\\t%0., %1.[0] > > + dup\\t%0., %1" > > + [(set_attr "type" "neon_dup, neon_from_gp")] > > ) > > > > (define_insn "aarch64_dup_lane" > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > > index > 699c105a42a613c06c462e2de686795279d85bc9..542fc874a4e224fb2cbe94e64eab590458fe935b > 100644 > > --- a/gcc/config/aarch64/aarch64.c > > +++ b/gcc/config/aarch64/aarch64.c > > @@ -12705,7 +12705,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int > outer ATTRIBUTE_UNUSED, > > rtx op0, op1, op2; > > const struct cpu_cost_table *extra_cost > > = aarch64_tune_params.insn_extra_cost; > > - int code = GET_CODE (x); > > + rtx_code code = GET_CODE (x); > > scalar_int_mode int_mode; > > > > /* By default, assume that everything has equivalent cost to the > > @@ -13466,8 +13466,7 @@ cost_plus: > > > > we must cost the explicit register move. */ > > if (mode == DImode > > - && GET_MODE (op0) == SImode > > - && outer == SET) > > + && GET_MODE (op0) == SImode) > > { > > int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed); > > > > @@ -14006,8 +14005,39 @@ cost_plus: > > mode, MULT, 1, speed); > > return true; > > } > > + break; > > + case CONST_VECTOR: > > + { > > + /* Load using MOVI/MVNI. */ > > + if (aarch64_simd_valid_immediate (x, NULL)) > > + *cost = extra_cost->vect.movi; > > + else /* Load using constant pool. */ > > + *cost = extra_cost->ldst.load; > > + break; > > + } > > + case VEC_CONCAT: > > + /* depending on the operation, either DUP or INS. > > + For now, keep default costing. */ > > + break; > > + /* Load using a DUP. */ > > + case VEC_DUPLICATE: > > Ultra minor nit, but: putting the comment after the case would be > more consistent with surrounding code. > > OK with that change, and thanks for you patience. > > Richard > > > + *cost = extra_cost->vect.dup; > > + return false; > > + case VEC_SELECT: > > + { > > + rtx op0 = XEXP (x, 0); > > + *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed); > > > > - /* Fall through. */ > > + /* cost subreg of 0 as free, otherwise as DUP */ > > + rtx op1 = XEXP (x, 1); > > + if (vec_series_lowpart_p (mode, GET_MODE (op1), op1)) > > + ; > > + else if (vec_series_highpart_p (mode, GET_MODE (op1), op1)) > > + *cost = extra_cost->vect.dup; > > + else > > + *cost = extra_cost->vect.extract; > > + return true; > > + } > > default: > > break; > > } > > diff --git a/gcc/config/arm/aarch-common-protos.h > b/gcc/config/arm/aarch-common-protos.h > > index > 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 > 100644 > > --- a/gcc/config/arm/aarch-common-protos.h > > +++ b/gcc/config/arm/aarch-common-protos.h > > @@ -133,6 +133,9 @@ struct vector_cost_table > > { > > const int alu; > > const int mult; > > + const int movi; > > + const int dup; > > + const int extract; > > }; > > > > struct cpu_cost_table > > diff --git a/gcc/config/arm/aarch-cost-tables.h > b/gcc/config/arm/aarch-cost-tables.h > > index > 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef > 100644 > > --- a/gcc/config/arm/aarch-cost-tables.h > > +++ b/gcc/config/arm/aarch-cost-tables.h > > @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (1), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (0), /* alu. */ > > - COSTS_N_INSNS (4) /* mult. */ > > + COSTS_N_INSNS (4), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs = > > /* Vector */ > > { > > COSTS_N_INSNS (2), /* alu. */ > > - COSTS_N_INSNS (8) /* mult. */ > > + COSTS_N_INSNS (8), /* mult. */ > > + COSTS_N_INSNS (1), /* movi. */ > > + COSTS_N_INSNS (2), /* dup. */ > > + COSTS_N_INSNS (2) /* extract. */ > > } > > }; > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c > b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c > > new file mode 100644 > > index > 0000000000000000000000000000000000000000..d025e989a1e67f00f4f4ce94897a961d38abfab7 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c > > @@ -0,0 +1,97 @@ > > +/* { dg-do compile { target { lp64 } } } */ > > +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto > -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } > */ > > + > > +#include > > + > > +/* > > +**test1: > > +** adrp x[0-9]+, .LC[0-9]+ > > +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] > > +** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d > > +** str q[0-9]+, \[x[0-9]+\] > > +** fmov x[0-9]+, d[0-9]+ > > +** orr x[0-9]+, x[0-9]+, x[0-9]+ > > +** ret > > +*/ > > + > > +uint64_t > > +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt) > > +{ > > + uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL}; > > + uint64_t res = a | arr[0]; > > + uint64x2_t val = vld1q_u64 (arr); > > + *rt = vaddq_u64 (val, b); > > + return res; > > +} > > + > > +/* > > +**test2: > > +** adrp x[0-9]+, .LC[0-1]+ > > +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] > > +** add v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d > > +** str q[0-9]+, \[x[0-9]+\] > > +** fmov x[0-9]+, d[0-9]+ > > +** orr x[0-9]+, x[0-9]+, x[0-9]+ > > +** ret > > +*/ > > + > > +uint64_t > > +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt) > > +{ > > + uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL); > > + uint64_t arr = vgetq_lane_u64 (val, 0); > > + uint64_t res = a | arr; > > + *rt = vaddq_u64 (val, b); > > + return res; > > +} > > + > > +/* > > +**test3: > > +** adrp x[0-9]+, .LC[0-9]+ > > +** ldr q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\] > > +** add v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > > +** str q[0-9]+, \[x1\] > > +** fmov w[0-9]+, s[0-9]+ > > +** orr w[0-9]+, w[0-9]+, w[0-9]+ > > +** ret > > +*/ > > + > > +uint32_t > > +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt) > > +{ > > + uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 }; > > + uint32_t res = a | arr[0]; > > + uint32x4_t val = vld1q_u32 (arr); > > + *rt = vaddq_u32 (val, b); > > + return res; > > +} > > + > > +/* > > +**test4: > > +** ushr v[0-9]+.16b, v[0-9]+.16b, 7 > > +** mov x[0-9]+, 16512 > > +** movk x[0-9]+, 0x1020, lsl 16 > > +** movk x[0-9]+, 0x408, lsl 32 > > +** movk x[0-9]+, 0x102, lsl 48 > > +** fmov d[0-9]+, x[0-9]+ > > +** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d > > +** dup v[0-9]+.2d, v[0-9]+.d\[0\] > > +** pmull2 v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d > > +** trn2 v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b > > +** umov w[0-9]+, v[0-9]+.h\[3\] > > +** ret > > +*/ > > + > > +uint64_t > > +test4 (uint8x16_t input) > > +{ > > + uint8x16_t bool_input = vshrq_n_u8(input, 7); > > + poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL); > > + poly64_t prodL = > vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0), > > + vgetq_lane_p64(mask, 0)); > > + poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask); > > + uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH); > > + return vget_lane_u16((uint16x4_t)res, 3); > > +} > > + >