From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7810) id EA5CA3857011; Wed, 14 Feb 2024 11:55:10 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org EA5CA3857011 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1707911710; bh=7d9UAB5C4DFHP5iPDnEZ3kxQxc7+Nbb0mB/wb0rVjvU=; h=From:To:Subject:Date:From; b=Tuj6GZVlRDKAB5aGYTxr7zOwUZOvSa7DXf44cpakR8MoVbfJPFMaYykx06RqVvjxj VquzPiMZTZWHDSZEyQGtW+74mvLi3hRDj1gJGctWqDQZBE0P42r5rCe5Mkt8C5Qahx jhDCOnAur78uxbp2OFRkbi2olAVbLz+7dbAISFaw= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Alex Coplan To: gcc-cvs@gcc.gnu.org Subject: [gcc r12-10156] aarch64: Avoid out-of-range shrink-wrapped saves [PR111677] X-Act-Checkin: gcc X-Git-Author: Alex Coplan X-Git-Refname: refs/heads/releases/gcc-12 X-Git-Oldrev: 2f16c53558d01135f0f78cf78a2f722b774684d7 X-Git-Newrev: fddce05d67f34174be0f306e1015d3868bbe7c31 Message-Id: <20240214115510.EA5CA3857011@sourceware.org> Date: Wed, 14 Feb 2024 11:55:10 +0000 (GMT) List-Id: https://gcc.gnu.org/g:fddce05d67f34174be0f306e1015d3868bbe7c31 commit r12-10156-gfddce05d67f34174be0f306e1015d3868bbe7c31 Author: Alex Coplan Date: Tue Jan 30 09:39:59 2024 +0000 aarch64: Avoid out-of-range shrink-wrapped saves [PR111677] The PR shows us ICEing due to an unrecognizable TFmode save emitted by aarch64_process_components. The problem is that for T{I,F,D}mode we conservatively require mems to be in range for x-register ldp/stp. That is because (at least for TImode) it can be allocated to both GPRs and FPRs, and in the GPR case that is an x-reg ldp/stp, and the FPR case is a q-register load/store. As Richard pointed out in the PR, aarch64_get_separate_components already checks that the offsets are suitable for a single load, so we just need to choose a mode in aarch64_reg_save_mode that gives the full q-register range. In this patch, we choose V16QImode as an alternative 16-byte "bag-of-bits" mode that doesn't have the artificial range restrictions imposed on T{I,F,D}mode. Unlike for GCC 14 we need additional handling in the load/store pair code as various cases are not expecting to see V16QImode (particularly the writeback patterns, but also aarch64_gen_load_pair). gcc/ChangeLog: PR target/111677 * config/aarch64/aarch64.cc (aarch64_reg_save_mode): Use V16QImode for the full 16-byte FPR saves in the vector PCS case. (aarch64_gen_storewb_pair): Handle V16QImode. (aarch64_gen_loadwb_pair): Likewise. (aarch64_gen_load_pair): Likewise. * config/aarch64/aarch64.md (loadwb_pair_): Rename to ... (loadwb_pair_): ... this, extending to V16QImode. (storewb_pair_): Rename to ... (storewb_pair_): ... this, extending to V16QImode. * config/aarch64/iterators.md (TX_V16QI): New. gcc/testsuite/ChangeLog: PR target/111677 * gcc.target/aarch64/torture/pr111677.c: New test. (cherry picked from commit 2bd8264a131ee1215d3bc6181722f9d30f5569c3) Diff: --- gcc/config/aarch64/aarch64.cc | 13 +++++++- gcc/config/aarch64/aarch64.md | 35 +++++++++++----------- gcc/config/aarch64/iterators.md | 3 ++ .../gcc.target/aarch64/torture/pr111677.c | 28 +++++++++++++++++ 4 files changed, 61 insertions(+), 18 deletions(-) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 3bccd96a23d3..2bbba323770f 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -4135,7 +4135,7 @@ aarch64_reg_save_mode (unsigned int regno) case ARM_PCS_SIMD: /* The vector PCS saves the low 128 bits (which is the full register on non-SVE targets). */ - return TFmode; + return V16QImode; case ARM_PCS_SVE: /* Use vectors of DImode for registers that need frame @@ -8602,6 +8602,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, return gen_storewb_pairtf_di (base, base, reg, reg2, GEN_INT (-adjustment), GEN_INT (UNITS_PER_VREG - adjustment)); + case E_V16QImode: + return gen_storewb_pairv16qi_di (base, base, reg, reg2, + GEN_INT (-adjustment), + GEN_INT (UNITS_PER_VREG - adjustment)); default: gcc_unreachable (); } @@ -8647,6 +8651,10 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, case E_TFmode: return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment), GEN_INT (UNITS_PER_VREG)); + case E_V16QImode: + return gen_loadwb_pairv16qi_di (base, base, reg, reg2, + GEN_INT (adjustment), + GEN_INT (UNITS_PER_VREG)); default: gcc_unreachable (); } @@ -8730,6 +8738,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2, case E_V4SImode: return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2); + case E_V16QImode: + return gen_load_pairv16qiv16qi (reg1, mem1, reg2, mem2); + default: gcc_unreachable (); } diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index fb100bdf6b33..99f185718c93 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1874,17 +1874,18 @@ [(set_attr "type" "neon_load1_2reg")] ) -(define_insn "loadwb_pair_" +(define_insn "loadwb_pair_" [(parallel [(set (match_operand:P 0 "register_operand" "=k") - (plus:P (match_operand:P 1 "register_operand" "0") - (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) - (set (match_operand:TX 2 "register_operand" "=w") - (mem:TX (match_dup 1))) - (set (match_operand:TX 3 "register_operand" "=w") - (mem:TX (plus:P (match_dup 1) + (plus:P (match_operand:P 1 "register_operand" "0") + (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) + (set (match_operand:TX_V16QI 2 "register_operand" "=w") + (mem:TX_V16QI (match_dup 1))) + (set (match_operand:TX_V16QI 3 "register_operand" "=w") + (mem:TX_V16QI (plus:P (match_dup 1) (match_operand:P 5 "const_int_operand" "n"))))])] - "TARGET_SIMD && INTVAL (operands[5]) == GET_MODE_SIZE (mode)" + "TARGET_SIMD + && known_eq (INTVAL (operands[5]), GET_MODE_SIZE (mode))" "ldp\\t%q2, %q3, [%1], %4" [(set_attr "type" "neon_ldp_q")] ) @@ -1923,20 +1924,20 @@ [(set_attr "type" "neon_store1_2reg")] ) -(define_insn "storewb_pair_" +(define_insn "storewb_pair_" [(parallel [(set (match_operand:P 0 "register_operand" "=&k") - (plus:P (match_operand:P 1 "register_operand" "0") - (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) - (set (mem:TX (plus:P (match_dup 0) + (plus:P (match_operand:P 1 "register_operand" "0") + (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) + (set (mem:TX_V16QI (plus:P (match_dup 0) (match_dup 4))) - (match_operand:TX 2 "register_operand" "w")) - (set (mem:TX (plus:P (match_dup 0) + (match_operand:TX_V16QI 2 "register_operand" "w")) + (set (mem:TX_V16QI (plus:P (match_dup 0) (match_operand:P 5 "const_int_operand" "n"))) - (match_operand:TX 3 "register_operand" "w"))])] + (match_operand:TX_V16QI 3 "register_operand" "w"))])] "TARGET_SIMD - && INTVAL (operands[5]) - == INTVAL (operands[4]) + GET_MODE_SIZE (mode)" + && known_eq (INTVAL (operands[5]), + INTVAL (operands[4]) + GET_MODE_SIZE (mode))" "stp\\t%q2, %q3, [%0, %4]!" [(set_attr "type" "neon_stp_q")] ) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 26a840d7fe9e..d49e37893df9 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -303,6 +303,9 @@ (define_mode_iterator TX [TI TF]) +;; TX plus V16QImode. +(define_mode_iterator TX_V16QI [TI TF V16QI]) + ;; Advanced SIMD opaque structure modes. (define_mode_iterator VSTRUCT [OI CI XI]) diff --git a/gcc/testsuite/gcc.target/aarch64/torture/pr111677.c b/gcc/testsuite/gcc.target/aarch64/torture/pr111677.c new file mode 100644 index 000000000000..6bb640c42c03 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/torture/pr111677.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target fopenmp } */ +/* { dg-options "-ffast-math -fstack-protector-strong -fopenmp" } */ +typedef struct { + long size_z; + int width; +} dt_bilateral_t; +typedef float dt_aligned_pixel_t[4]; +#pragma omp declare simd +void dt_bilateral_splat(dt_bilateral_t *b) { + float *buf; + long offsets[8]; + for (; b;) { + int firstrow; + for (int j = firstrow; j; j++) + for (int i; i < b->width; i++) { + dt_aligned_pixel_t contrib; + for (int k = 0; k < 4; k++) + buf[offsets[k]] += contrib[k]; + } + float *dest; + for (int j = (long)b; j; j++) { + float *src = (float *)b->size_z; + for (int i = 0; i < (long)b; i++) + dest[i] += src[i]; + } + } +}