From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: by sourceware.org (Postfix, from userid 1130)
	id 9D6253858C66; Thu, 25 Jan 2024 12:03:32 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 9D6253858C66
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1706184212;
	bh=pSDDn4FBdbPD7LujeXgmmh9MMOtj03Lp2nVSxRzCYX8=;
	h=From:To:Subject:Date:From;
	b=eSzCaXy5WgOk0ZHbUA/+fvuEqdvarsDk2qzFceqwm3/Yi77rwkxKZckGAMi+5mdYr
	 I0sJDfAYp4W8fthHgEhHbA0kUm49Njo8zQbyqQKP1Fea+ncD9vQyEmfTG9qBuDWdiV
	 SKaGnaj9T74H3QzrujJExQJqhFGJ3Ju65ZNTwrCM=
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Content-Type: text/plain; charset="utf-8"
From: Richard Sandiford
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-8420] aarch64: Avoid paradoxical subregs in UXTL split [PR113485]
X-Act-Checkin: gcc
X-Git-Author: Richard Sandiford
X-Git-Refname: refs/heads/trunk
X-Git-Oldrev: c6c2a1d79eb333a00124bf67820a7f405d0d8641
X-Git-Newrev: f251bbfec9174169510b2dec14b9bf763e7b77af
Message-Id: <20240125120332.9D6253858C66@sourceware.org>
Date: Thu, 25 Jan 2024 12:03:32 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:f251bbfec9174169510b2dec14b9bf763e7b77af

commit r14-8420-gf251bbfec9174169510b2dec14b9bf763e7b77af
Author: Richard Sandiford
Date:   Thu Jan 25 12:03:17 2024 +0000

    aarch64: Avoid paradoxical subregs in UXTL split [PR113485]

    g:74e3e839ab2d36841320 handled the UXTL{,2}-ZIP[12] optimisation in
    split1.  The UXTL input is a 64-bit vector of N-bit elements and the
    result is a 128-bit vector of 2N-bit elements.  The corresponding
    ZIP1 operates on 128-bit vectors of N-bit elements.

    This meant that the ZIP1 input had to be a 128-bit paradoxical subreg
    of the 64-bit UXTL input.  In the PRs, it wasn't possible to generate
    this subreg because the inputs were already subregs of an x[234]
    structure of 64-bit vectors.

    I don't think the same thing can happen for UXTL2->ZIP2 because the
    UXTL2 input is a 128-bit vector rather than a 64-bit vector.

    It isn't really necessary for ZIP1 to take 128-bit inputs, since the
    upper 64 bits are ignored.  This patch therefore adds a pattern for
    64-bit → 128-bit ZIP1s.

    In principle, we should probably use this form for all ZIP1s.  But in
    practice, that creates an awkward special case, and would be quite
    invasive for stage 4.

    gcc/
	PR target/113485
	* config/aarch64/aarch64-simd.md (aarch64_zip1_low): New pattern.
	(2): Use it instead of generating a paradoxical subreg for the
	input.

    gcc/testsuite/
	PR target/113485
	* gcc.target/aarch64/pr113485.c: New test.
	* gcc.target/aarch64/pr113573.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md          | 17 +++++++++---
 gcc/testsuite/gcc.target/aarch64/pr113485.c | 25 ++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/pr113573.c | 40 +++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 556d0cf359f..48f0741e7d0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -8505,6 +8505,18 @@
   [(set_attr "type" "neon_permute")]
 )
 
+;; ZIP1 ignores the contents of the upper halves of the registers,
+;; so we can describe 128-bit operations in terms of 64-bit inputs.
+(define_insn "aarch64_zip1_low"
+  [(set (match_operand:VQ 0 "register_operand" "=w")
+	(unspec:VQ [(match_operand: 1 "register_operand" "w")
+		    (match_operand: 2 "register_operand" "w")]
+		   UNSPEC_ZIP1))]
+  "TARGET_SIMD"
+  "zip1\t%0., %1., %2."
+ [(set_attr "type" "neon_permute_q")] +) + ;; This instruction's pattern is generated directly by ;; aarch64_expand_vec_perm_const, so any changes to the pattern would ;; need corresponding changes there. Note that the immediate (third) @@ -9685,9 +9697,8 @@ not sufficient uses of the zero to make the split worthwhile. */ rtx res = simplify_gen_subreg (mode, operands[0], mode, 0); - rtx zero = aarch64_gen_shareable_zero (mode); - rtx op = lowpart_subreg (mode, operands[1], mode); - emit_insn (gen_aarch64_zip1 (res, op, zero)); + rtx zero = aarch64_gen_shareable_zero (mode); + emit_insn (gen_aarch64_zip1_low (res, operands[1], zero)); DONE; } [(set_attr "type" "neon_shift_imm_long")] diff --git a/gcc/testsuite/gcc.target/aarch64/pr113485.c b/gcc/testsuite/gcc.target/aarch64/pr113485.c new file mode 100644 index 00000000000..c7028245b61 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113485.c @@ -0,0 +1,25 @@ +/* { dg-options "-O" } */ + +#include + +void test() +{ + while (1) + { + static const uint16_t jsimd_rgb_ycc_neon_consts[] = {19595, 0, 0, 0, 0, 0, 0, 0}; + uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts); + + uint8_t tmp_buf[0]; + uint8x8x3_t input_pixels = vld3_u8(tmp_buf); + uint16x8_t r = vmovl_u8(input_pixels.val[1]); + uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0); + + uint32x4_t s = vdupq_n_u32(1); + uint16x4_t a = vrshrn_n_u32(s, 16); + uint16x4_t y = vrshrn_n_u32(y_l, 16); + uint16x8_t ay = vcombine_u16(a, y); + + unsigned char ***out_buf; + vst1_u8(out_buf[1][0], vmovn_u16(ay)); + } +} diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c b/gcc/testsuite/gcc.target/aarch64/pr113573.c new file mode 100644 index 00000000000..a8e445c6e19 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c @@ -0,0 +1,40 @@ +/* { dg-options "-O2" } */ + +#pragma GCC aarch64 "arm_neon.h" +typedef __Uint8x8_t uint8x8_t; +typedef __Uint16x4_t uint16x4_t; +typedef __Int16x8_t int16x8_t; +typedef __Uint16x8_t uint16x8_t; +int jsimd_extbgrx_ycc_convert_neon_image_width, + jsimd_extbgrx_ycc_convert_neon___trans_tmp_1; +uint16x4_t jsimd_extbgrx_ycc_convert_neon___trans_tmp_2; +uint16x8_t vcombine_u16(); +uint16x8_t vmovl_u8(uint8x8_t __a) { + return __builtin_aarch64_uxtlv8hi_uu(__a); +} +__inline int __attribute__((__gnu_inline__)) vmull_laneq_u16(); +uint8x8x4_t vld4_u8(); +void jsimd_extbgrx_ycc_convert_neon() { + int scaled_128_5 = jsimd_extbgrx_ycc_convert_neon___trans_tmp_1, + cols_remaining = jsimd_extbgrx_ycc_convert_neon_image_width; + for (;;) + if (cols_remaining) { + uint8x8x4_t input_pixels = vld4_u8(); + uint16x8_t r = vmovl_u8(input_pixels.val[2]); + uint16x8_t g = vmovl_u8(input_pixels.val[1]); + uint16x8_t b = vmovl_u8(input_pixels.val[0]); + int y_l = vmull_laneq_u16(r); + uint16x8_t __a = g; + jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 = + (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a); + __a = b; + int cb_l = scaled_128_5; + int cb_h = scaled_128_5; + int cr_l = scaled_128_5; + int cr_h = scaled_128_5; + uint16x8_t y_u16 = vcombine_u16(y_l); + uint16x8_t cb_u16 = vcombine_u16(cb_l, cb_h); + uint16x8_t cr_u16 = vcombine_u16(cr_l, cr_h); + __a = y_u16 = cb_u16 = cr_u16; + } +}