From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <acoplan@sourceware.org>
Received: by sourceware.org (Postfix, from userid 7810)
	id EA5CA3857011; Wed, 14 Feb 2024 11:55:10 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org EA5CA3857011
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1707911710;
	bh=7d9UAB5C4DFHP5iPDnEZ3kxQxc7+Nbb0mB/wb0rVjvU=;
	h=From:To:Subject:Date:From;
	b=Tuj6GZVlRDKAB5aGYTxr7zOwUZOvSa7DXf44cpakR8MoVbfJPFMaYykx06RqVvjxj
	 VquzPiMZTZWHDSZEyQGtW+74mvLi3hRDj1gJGctWqDQZBE0P42r5rCe5Mkt8C5Qahx
	 jhDCOnAur78uxbp2OFRkbi2olAVbLz+7dbAISFaw=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Alex Coplan <acoplan@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r12-10156] aarch64: Avoid out-of-range shrink-wrapped saves
 [PR111677]
X-Act-Checkin: gcc
X-Git-Author: Alex Coplan <alex.coplan@arm.com>
X-Git-Refname: refs/heads/releases/gcc-12
X-Git-Oldrev: 2f16c53558d01135f0f78cf78a2f722b774684d7
X-Git-Newrev: fddce05d67f34174be0f306e1015d3868bbe7c31
Message-Id: <20240214115510.EA5CA3857011@sourceware.org>
Date: Wed, 14 Feb 2024 11:55:10 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:fddce05d67f34174be0f306e1015d3868bbe7c31

commit r12-10156-gfddce05d67f34174be0f306e1015d3868bbe7c31
Author: Alex Coplan <alex.coplan@arm.com>
Date:   Tue Jan 30 09:39:59 2024 +0000

    aarch64: Avoid out-of-range shrink-wrapped saves [PR111677]
    
    The PR shows us ICEing due to an unrecognizable TFmode save emitted by
    aarch64_process_components.  The problem is that for T{I,F,D}mode we
    conservatively require mems to be in range for x-register ldp/stp.  That
    is because (at least for TImode) a value can be allocated to either GPRs
    or FPRs: in the GPR case the access is an x-register ldp/stp, and in the
    FPR case it is a q-register load/store.
    
    As Richard pointed out in the PR, aarch64_get_separate_components
    already checks that the offsets are suitable for a single load, so we
    just need to choose a mode in aarch64_reg_save_mode that gives the full
    q-register range.  In this patch, we choose V16QImode as an alternative
    16-byte "bag-of-bits" mode that doesn't have the artificial range
    restrictions imposed on T{I,F,D}mode.
    
    Unlike for GCC 14, we need additional handling in the load/store pair
    code, as various cases do not expect to see V16QImode (particularly
    the writeback patterns, but also aarch64_gen_load_pair).
    
    gcc/ChangeLog:
    
            PR target/111677
            * config/aarch64/aarch64.cc (aarch64_reg_save_mode): Use
            V16QImode for the full 16-byte FPR saves in the vector PCS case.
            (aarch64_gen_storewb_pair): Handle V16QImode.
            (aarch64_gen_loadwb_pair): Likewise.
            (aarch64_gen_load_pair): Likewise.
            * config/aarch64/aarch64.md (loadwb_pair<TX:mode>_<P:mode>):
            Rename to ...
            (loadwb_pair<TX_V16QI:mode>_<P:mode>): ... this, extending to
            V16QImode.
            (storewb_pair<TX:mode>_<P:mode>): Rename to ...
            (storewb_pair<TX_V16QI:mode>_<P:mode>): ... this, extending to
            V16QImode.
            * config/aarch64/iterators.md (TX_V16QI): New.
    
    gcc/testsuite/ChangeLog:
    
            PR target/111677
            * gcc.target/aarch64/torture/pr111677.c: New test.
    
    (cherry picked from commit 2bd8264a131ee1215d3bc6181722f9d30f5569c3)

Diff:
---
 gcc/config/aarch64/aarch64.cc                      | 13 +++++++-
 gcc/config/aarch64/aarch64.md                      | 35 +++++++++++-----------
 gcc/config/aarch64/iterators.md                    |  3 ++
 .../gcc.target/aarch64/torture/pr111677.c          | 28 +++++++++++++++++
 4 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 3bccd96a23d3..2bbba323770f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4135,7 +4135,7 @@ aarch64_reg_save_mode (unsigned int regno)
       case ARM_PCS_SIMD:
 	/* The vector PCS saves the low 128 bits (which is the full
 	   register on non-SVE targets).  */
-	return TFmode;
+	return V16QImode;
 
       case ARM_PCS_SVE:
 	/* Use vectors of DImode for registers that need frame
@@ -8602,6 +8602,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
       return gen_storewb_pairtf_di (base, base, reg, reg2,
 				    GEN_INT (-adjustment),
 				    GEN_INT (UNITS_PER_VREG - adjustment));
+    case E_V16QImode:
+      return gen_storewb_pairv16qi_di (base, base, reg, reg2,
+				       GEN_INT (-adjustment),
+				       GEN_INT (UNITS_PER_VREG - adjustment));
     default:
       gcc_unreachable ();
     }
@@ -8647,6 +8651,10 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
     case E_TFmode:
       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
 				   GEN_INT (UNITS_PER_VREG));
+    case E_V16QImode:
+      return gen_loadwb_pairv16qi_di (base, base, reg, reg2,
+				      GEN_INT (adjustment),
+				      GEN_INT (UNITS_PER_VREG));
     default:
       gcc_unreachable ();
     }
@@ -8730,6 +8738,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
     case E_V4SImode:
       return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
 
+    case E_V16QImode:
+      return gen_load_pairv16qiv16qi (reg1, mem1, reg2, mem2);
+
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index fb100bdf6b33..99f185718c93 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1874,17 +1874,18 @@
   [(set_attr "type" "neon_load1_2reg")]
 )
 
-(define_insn "loadwb_pair<TX:mode>_<P:mode>"
+(define_insn "loadwb_pair<TX_V16QI:mode>_<P:mode>"
   [(parallel
     [(set (match_operand:P 0 "register_operand" "=k")
-          (plus:P (match_operand:P 1 "register_operand" "0")
-                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
-     (set (match_operand:TX 2 "register_operand" "=w")
-          (mem:TX (match_dup 1)))
-     (set (match_operand:TX 3 "register_operand" "=w")
-          (mem:TX (plus:P (match_dup 1)
+	  (plus:P (match_operand:P 1 "register_operand" "0")
+		  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (match_operand:TX_V16QI 2 "register_operand" "=w")
+	  (mem:TX_V16QI (match_dup 1)))
+     (set (match_operand:TX_V16QI 3 "register_operand" "=w")
+	  (mem:TX_V16QI (plus:P (match_dup 1)
 			  (match_operand:P 5 "const_int_operand" "n"))))])]
-  "TARGET_SIMD && INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)"
+  "TARGET_SIMD
+   && known_eq (INTVAL (operands[5]), GET_MODE_SIZE (<TX_V16QI:MODE>mode))"
   "ldp\\t%q2, %q3, [%1], %4"
   [(set_attr "type" "neon_ldp_q")]
 )
@@ -1923,20 +1924,20 @@
   [(set_attr "type" "neon_store1_2reg<q>")]
 )
 
-(define_insn "storewb_pair<TX:mode>_<P:mode>"
+(define_insn "storewb_pair<TX_V16QI:mode>_<P:mode>"
   [(parallel
     [(set (match_operand:P 0 "register_operand" "=&k")
-          (plus:P (match_operand:P 1 "register_operand" "0")
-                  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
-     (set (mem:TX (plus:P (match_dup 0)
+	  (plus:P (match_operand:P 1 "register_operand" "0")
+		  (match_operand:P 4 "aarch64_mem_pair_offset" "n")))
+     (set (mem:TX_V16QI (plus:P (match_dup 0)
 			  (match_dup 4)))
-          (match_operand:TX 2 "register_operand" "w"))
-     (set (mem:TX (plus:P (match_dup 0)
+	  (match_operand:TX_V16QI 2 "register_operand" "w"))
+     (set (mem:TX_V16QI (plus:P (match_dup 0)
 			  (match_operand:P 5 "const_int_operand" "n")))
-          (match_operand:TX 3 "register_operand" "w"))])]
+	  (match_operand:TX_V16QI 3 "register_operand" "w"))])]
   "TARGET_SIMD
-   && INTVAL (operands[5])
-      == INTVAL (operands[4]) + GET_MODE_SIZE (<TX:MODE>mode)"
+   && known_eq (INTVAL (operands[5]),
+		INTVAL (operands[4]) + GET_MODE_SIZE (<TX_V16QI:MODE>mode))"
   "stp\\t%q2, %q3, [%0, %4]!"
   [(set_attr "type" "neon_stp_q")]
 )
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 26a840d7fe9e..d49e37893df9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -303,6 +303,9 @@
 
 (define_mode_iterator TX [TI TF])
 
+;; TX plus V16QImode.
+(define_mode_iterator TX_V16QI [TI TF V16QI])
+
 ;; Advanced SIMD opaque structure modes.
 (define_mode_iterator VSTRUCT [OI CI XI])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/torture/pr111677.c b/gcc/testsuite/gcc.target/aarch64/torture/pr111677.c
new file mode 100644
index 000000000000..6bb640c42c03
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/torture/pr111677.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenmp } */
+/* { dg-options "-ffast-math -fstack-protector-strong -fopenmp" } */
+typedef struct {
+  long size_z;
+  int width;
+} dt_bilateral_t;
+typedef float dt_aligned_pixel_t[4];
+#pragma omp declare simd
+void dt_bilateral_splat(dt_bilateral_t *b) {
+  float *buf;
+  long offsets[8];
+  for (; b;) {
+    int firstrow;
+    for (int j = firstrow; j; j++)
+      for (int i; i < b->width; i++) {
+        dt_aligned_pixel_t contrib;
+        for (int k = 0; k < 4; k++)
+          buf[offsets[k]] += contrib[k];
+      }
+    float *dest;
+    for (int j = (long)b; j; j++) {
+      float *src = (float *)b->size_z;
+      for (int i = 0; i < (long)b; i++)
+        dest[i] += src[i];
+    }
+  }
+}