From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1895) id 281203858C1F; Mon, 18 Sep 2023 12:28:12 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 281203858C1F DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1695040092; bh=FMUngxv5GyStGi3AMVxREVodgUOTi3I1ehXMrs0JTCE=; h=From:To:Subject:Date:From; b=dylgcYRKTq+Zf6XQH+46wASRS0bpKuQEwkrNRTYPIcQybyG/T+FVYSL1NdZmU8oHf BBO9l5LI+LaIwX/ub6n6QfUmlzgL/A9wyZ0Zhtu9YOza/lWc3/CqNEVhbgTSi94veH f9IZvQgWszszQW/+G/m6uJXL2uh6aNB6yEaxtS0U= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Wilco Dijkstra To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-4096] AArch64: Improve immediate expansion [PR105928] X-Act-Checkin: gcc X-Git-Author: Wilco Dijkstra X-Git-Refname: refs/heads/master X-Git-Oldrev: 64d5bc35c8c2a66ac133a3e6ace820b0ad8a63fb X-Git-Newrev: fc7070025d1a6668ff6cb4391f84771a7662def7 Message-Id: <20230918122812.281203858C1F@sourceware.org> Date: Mon, 18 Sep 2023 12:28:12 +0000 (GMT) List-Id: https://gcc.gnu.org/g:fc7070025d1a6668ff6cb4391f84771a7662def7 commit r14-4096-gfc7070025d1a6668ff6cb4391f84771a7662def7 Author: Wilco Dijkstra Date: Wed Sep 13 13:21:50 2023 +0100 AArch64: Improve immediate expansion [PR105928] Support immediate expansion of immediates which can be created from 2 MOVKs and a shifted ORR or BIC instruction. Change aarch64_split_dimode_const_store to apply if we save one instruction. This reduces the number of 4-instruction immediates in SPECINT/FP by 5%. gcc/ChangeLog: PR target/105928 * config/aarch64/aarch64.cc (aarch64_internal_mov_immediate) Add support for immediates using shifted ORR/BIC. (aarch64_split_dimode_const_store): Apply if we save one instruction. * config/aarch64/aarch64.md (<LOGICAL:optab>_<SHIFT:optab><mode>3): Make pattern global. gcc/testsuite: PR target/105928 * gcc.target/aarch64/pr105928.c: Add new test. * gcc.target/aarch64/vect-cse-codegen.c: Fix test. 
Diff: --- gcc/config/aarch64/aarch64.cc | 43 ++++++++++++++++------ gcc/config/aarch64/aarch64.md | 2 +- gcc/testsuite/gcc.target/aarch64/pr105928.c | 43 ++++++++++++++++++++++ .../gcc.target/aarch64/vect-cse-codegen.c | 3 +- 4 files changed, 77 insertions(+), 14 deletions(-) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 7bb1161f943..219c4ee6d4c 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -5640,7 +5640,7 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate, machine_mode mode) { int i; - unsigned HOST_WIDE_INT val, val2, mask; + unsigned HOST_WIDE_INT val, val2, val3, mask; int one_match, zero_match; int num_insns; @@ -5722,6 +5722,35 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate, } return 3; } + + /* Try shifting and inserting the bottom 32-bits into the top bits. */ + val2 = val & 0xffffffff; + val3 = 0xffffffff; + val3 = val2 | (val3 << 32); + for (i = 17; i < 48; i++) + if ((val2 | (val2 << i)) == val) + { + if (generate) + { + emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff))); + emit_insn (gen_insv_immdi (dest, GEN_INT (16), + GEN_INT (val2 >> 16))); + emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest)); + } + return 3; + } + else if ((val3 & ~(val3 << i)) == val) + { + if (generate) + { + emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000))); + emit_insn (gen_insv_immdi (dest, GEN_INT (16), + GEN_INT (val2 >> 16))); + emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i), + dest)); + } + return 3; + } } /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which @@ -25540,8 +25569,6 @@ aarch64_split_dimode_const_store (rtx dst, rtx src) rtx lo = gen_lowpart (SImode, src); rtx hi = gen_highpart_mode (SImode, DImode, src); - bool size_p = optimize_function_for_size_p (cfun); - if (!rtx_equal_p (lo, hi)) return false; @@ -25560,14 +25587,8 @@ aarch64_split_dimode_const_store (rtx dst, rtx src) MOV w1, 49370 MOVK 
w1, 0x140, lsl 16 STP w1, w1, [x0] - So we want to perform this only when we save two instructions - or more. When optimizing for size, however, accept any code size - savings we can. */ - if (size_p && orig_cost <= lo_cost) - return false; - - if (!size_p - && (orig_cost <= lo_cost + 1)) + So we want to perform this when we save at least one instruction. */ + if (orig_cost <= lo_cost) return false; rtx mem_lo = adjust_address (dst, SImode, 0); diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 6f7827bd8c9..60133b541e9 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4637,7 +4637,7 @@ [(set_attr "type" "logics_shift_imm")] ) -(define_insn "*<LOGICAL:optab>_<SHIFT:optab><mode>3" +(define_insn "<LOGICAL:optab>_<SHIFT:optab><mode>3" [(set (match_operand:GPI 0 "register_operand" "=r") (LOGICAL:GPI (SHIFT:GPI (match_operand:GPI 1 "register_operand" "r") diff --git a/gcc/testsuite/gcc.target/aarch64/pr105928.c b/gcc/testsuite/gcc.target/aarch64/pr105928.c new file mode 100644 index 00000000000..9aff892670e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr105928.c @@ -0,0 +1,43 @@ +/* { dg-do assemble } */ +/* { dg-options "-O2 --save-temps" } */ + +long long f1 (void) +{ + return 0x80402010080400; +} + +long long f2 (void) +{ + return 0x1234567812345678; +} + +long long f3 (void) +{ + return 0x4567800012345678; +} + +long long f4 (void) +{ + return 0x3ecccccd3ecccccd; +} + +long long f5 (void) +{ + return 0x38e38e38e38e38e; +} + +long long f6 (void) +{ + return 0x1745d1745d1745d; +} + +void f7 (long long *p) +{ + *p = 0x1234567812345678; +} + +/* { dg-final { scan-assembler-times {\tmovk\t} 7 } } */ +/* { dg-final { scan-assembler-times {\tmov\t} 7 } } */ +/* { dg-final { scan-assembler-times {\tbic\t} 2 } } */ +/* { dg-final { scan-assembler-times {\torr\t} 4 } } */ +/* { dg-final { scan-assembler-times {\tstp\t} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c index 
d025e989a1e..2b8e64313bb 100644 --- a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c @@ -72,8 +72,7 @@ test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt) ** ushr v[0-9]+.16b, v[0-9]+.16b, 7 ** mov x[0-9]+, 16512 ** movk x[0-9]+, 0x1020, lsl 16 -** movk x[0-9]+, 0x408, lsl 32 -** movk x[0-9]+, 0x102, lsl 48 +** orr x[0-9]+, x[0-9]+, x[0-9]+, lsl 28 ** fmov d[0-9]+, x[0-9]+ ** pmull v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d ** dup v[0-9]+.2d, v[0-9]+.d\[0\]