From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: by sourceware.org (Postfix, from userid 1984)
    id ED1B4385840A; Sun, 12 Mar 2023 18:44:05 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org ED1B4385840A
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
    s=default; t=1678646645;
    bh=6xwz/8uSUhaChe+Y+ADrsGdioN0fC4nBsBcD5Ac/+14=;
    h=From:To:Subject:Date:From;
    b=qW65sOgFKhQrc68U8/D2tcjsIYUcKd9cWC1fPFQV2+qwTE+WqCHTSU1ip1yY9rMm2
     lqrpyofpZQYGCAFE0LrNsQyCLjipjN6RAEzPGXRj6exa7Q/EsfOXJOPS+SJO1mosGk
     jBRW2vaVWBpdSb1boQTv+eWyiuk0LQ8Nc5rwpCPY=
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Content-Type: text/plain; charset="utf-8"
From: Tamar Christina
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r13-6620] AArch64: Update div-bitmask to implement new optab instead of target hook [PR108583]
X-Act-Checkin: gcc
X-Git-Author: Tamar Christina
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 81fd62d1378b7ddc1fa0967cbddcdcdcdd2d8d8c
X-Git-Newrev: f23dc726875c26f2c38dfded453aa9beba0b9be9
Message-Id: <20230312184405.ED1B4385840A@sourceware.org>
Date: Sun, 12 Mar 2023 18:44:05 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:f23dc726875c26f2c38dfded453aa9beba0b9be9

commit r13-6620-gf23dc726875c26f2c38dfded453aa9beba0b9be9
Author: Tamar Christina
Date:   Sun Mar 12 18:42:59 2023 +0000

    AArch64: Update div-bitmask to implement new optab instead of target hook [PR108583]

    This replaces the custom division hook with an implementation through
    add_highpart.  For NEON we implement the add highpart (addition +
    extraction of the upper half of the register in the same precision) as
    ADD + LSR.

    This representation allows us to easily optimize the sequence using
    existing optimizations and gets us a pretty decent sequence using SRA:

            umull   v1.8h, v0.8b, v3.8b
            umull2  v0.8h, v0.16b, v3.16b
            add     v5.8h, v1.8h, v2.8h
            add     v4.8h, v0.8h, v2.8h
            usra    v1.8h, v5.8h, 8
            usra    v0.8h, v4.8h, 8
            uzp2    v1.16b, v1.16b, v0.16b

    To get the optimal sequence, however, we match (a + ((b + c) >> n)),
    where n is half the precision of the mode of the operation, into
    addhn + uaddw, which is a generally good optimization on its own and
    gets us back to:

    .L4:
            ldr     q0, [x3]
            umull   v1.8h, v0.8b, v5.8b
            umull2  v0.8h, v0.16b, v5.16b
            addhn   v3.8b, v1.8h, v4.8h
            addhn   v2.8b, v0.8h, v4.8h
            uaddw   v1.8h, v1.8h, v3.8b
            uaddw   v0.8h, v0.8h, v2.8b
            uzp2    v1.16b, v1.16b, v0.16b
            str     q1, [x3], 16
            cmp     x3, x4
            bne     .L4

    For SVE2 we optimize the initial sequence to the same ADD + LSR, which
    gets us:

    .L3:
            ld1b    z0.h, p0/z, [x0, x3]
            mul     z0.h, p1/m, z0.h, z2.h
            add     z1.h, z0.h, z3.h
            usra    z0.h, z1.h, #8
            lsr     z0.h, z0.h, #8
            st1b    z0.h, p0, [x0, x3]
            inch    x3
            whilelo p0.h, w3, w2
            b.any   .L3
    .L1:
            ret

    and to get the optimal sequence I match ((a + b) >> n) (with the same
    constraint on n) to addhnb, which gets us to:

    .L3:
            ld1b    z0.h, p0/z, [x0, x3]
            mul     z0.h, p1/m, z0.h, z2.h
            addhnb  z1.b, z0.h, z3.h
            addhnb  z0.b, z0.h, z1.h
            st1b    z0.h, p0, [x0, x3]
            inch    x3
            whilelo p0.h, w3, w2
            b.any   .L3

    There are multiple possible RTL representations for these
    optimizations; I did not represent them using a zero_extend because we
    seem very inconsistent about this in the backend, and since they are
    unspecs we won't match them from vector ops anyway.  I figured
    maintainers would prefer this, but my maintainer ouija board is still
    out for repairs :)

    There are no new tests, as correctness tests were added to the mid-end
    and codegen tests for this already exist.

    gcc/ChangeLog:

            PR target/108583
            * config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv3): Remove.
            (*bitmask_shift_plus): New.
* config/aarch64/aarch64-sve2.md (*bitmask_shift_plus): New. (@aarch64_bitmask_udiv3): Remove. * config/aarch64/aarch64.cc (aarch64_vectorize_can_special_div_by_constant, TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Removed. (TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT, aarch64_vectorize_preferred_div_as_shifts_over_mult): New. Diff: --- gcc/config/aarch64/aarch64-simd.md | 71 ++++++++++---------------------------- gcc/config/aarch64/aarch64-sve2.md | 57 +++++++++--------------------- gcc/config/aarch64/aarch64.cc | 61 +++++++++----------------------- 3 files changed, 52 insertions(+), 137 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 772dd7693b3..b63c1fe1543 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4867,60 +4867,27 @@ } ) -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; If we imagine a short as being composed of two blocks of bytes then -;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to -;; adding 1 to each sub component: -;; -;; short value of 16-bits -;; ┌──────────────┬────────────────┐ -;; │ │ │ -;; └──────────────┴────────────────┘ -;; 8-bit part1 ▲ 8-bit part2 ▲ -;; │ │ -;; │ │ -;; +1 +1 -;; -;; after the first addition, we have to shift right by 8, and narrow the -;; results back to a byte. Remember that the addition must be done in -;; double the precision of the input. Since 8 is half the size of a short -;; we can use a narrowing halfing instruction in AArch64, addhn which also -;; does the addition in a wider precision and narrows back to a byte. The -;; shift itself is implicit in the operation as it writes back only the top -;; half of the result. i.e. bits 2*esize-1:esize. -;; -;; Since we have narrowed the result of the first part back to a byte, for -;; the second addition we can use a widening addition, uaddw. -;; -;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8. -;; -;; The shift is later optimized by combine to a uzp2 with movi #0. 
-(define_expand "@aarch64_bitmask_udiv3" - [(match_operand:VQN 0 "register_operand") - (match_operand:VQN 1 "register_operand") - (match_operand:VQN 2 "immediate_operand")] +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector +(define_insn_and_split "*bitmask_shift_plus" + [(set (match_operand:VQN 0 "register_operand" "=&w") + (plus:VQN + (lshiftrt:VQN + (plus:VQN (match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "register_operand" "w")) + (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" "")) + (match_operand:VQN 4 "register_operand" "w")))] "TARGET_SIMD" + "#" + "&& true" + [(const_int 0)] { - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (mode); - rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); - emit_move_insn (addend, lowpart_subreg (mode, val, mode)); - rtx tmp1 = gen_reg_rtx (mode); - rtx tmp2 = gen_reg_rtx (mode); - emit_insn (gen_aarch64_addhn (tmp1, operands[1], addend)); - unsigned bitsize = GET_MODE_UNIT_BITSIZE (mode); - rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, bitsize); - emit_insn (gen_aarch64_uaddw (tmp2, operands[1], tmp1)); - emit_insn (gen_aarch64_simd_lshr (operands[0], tmp2, shift_vector)); + rtx tmp; + if (can_create_pseudo_p ()) + tmp = gen_reg_rtx (mode); + else + tmp = gen_rtx_REG (mode, REGNO (operands[0])); + emit_insn (gen_aarch64_addhn (tmp, operands[1], operands[2])); + emit_insn (gen_aarch64_uaddw (operands[0], operands[4], tmp)); DONE; }) diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 40c0728a7e6..2346f9f835d 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -71,7 +71,6 @@ ;; ---- [INT] Reciprocal approximation ;; ---- [INT<-FP] Base-2 logarithm ;; ---- [INT] Polynomial multiplication -;; ---- [INT] Misc optab implementations ;; ;; == Permutation ;; ---- [INT,FP] General permutes @@ -1600,6 +1599,22 @@ "\t%0., %2., %3." ) +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector +(define_insn "*bitmask_shift_plus" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") + (unspec:SVE_FULL_HSDI + [(match_operand: 1) + (lshiftrt:SVE_FULL_HSDI + (plus:SVE_FULL_HSDI + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w") + (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")) + (match_operand:SVE_FULL_HSDI 4 + "aarch64_simd_shift_imm_vec_exact_top" ""))] + UNSPEC_PRED_X))] + "TARGET_SVE2" + "addhnb\t%0., %2., %3." +) + ;; ------------------------------------------------------------------------- ;; ---- [INT] Narrowing right shifts ;; ------------------------------------------------------------------------- @@ -2313,46 +2328,6 @@ "\t%0., %1., %2." ) -;; ------------------------------------------------------------------------- -;; ---- [INT] Misc optab implementations -;; ------------------------------------------------------------------------- -;; Includes: -;; - aarch64_bitmask_udiv -;; ------------------------------------------------------------------------- - -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; See aarch64-simd.md for bigger explanation. 
-(define_expand "@aarch64_bitmask_udiv3" - [(match_operand:SVE_FULL_HSDI 0 "register_operand") - (match_operand:SVE_FULL_HSDI 1 "register_operand") - (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] - "TARGET_SVE2" -{ - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (mode); - rtx tmp1 = gen_reg_rtx (mode); - rtx tmp2 = gen_reg_rtx (mode); - rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); - emit_move_insn (addend, lowpart_subreg (mode, val, mode)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp1, operands[1], - addend)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp2, operands[1], - lowpart_subreg (mode, tmp1, - mode))); - emit_move_insn (operands[0], - lowpart_subreg (mode, tmp2, mode)); - DONE; -}) - ;; ========================================================================= ;; == Permutation ;; ========================================================================= diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 5c40b6ed22a..8a06879e94f 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3847,6 +3847,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode, return default_vectorize_related_mode (vector_mode, element_mode, nunits); } +/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */ + +static bool +aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type) +{ + machine_mode mode = TYPE_MODE (type); + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + bool sve_p = (vec_flags & VEC_ANY_SVE); + bool simd_p = (vec_flags & VEC_ADVSIMD); + + return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD); +} + /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, prefer to use the first arithmetic operand as the else value if the else value doesn't matter, since that exactly matches the SVE @@ -24361,46 +24374,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ret; } - -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */ - -bool -aarch64_vectorize_can_special_div_by_constant (enum tree_code code, - tree vectype, wide_int cst, - rtx *output, rtx in0, rtx in1) -{ - if (code != TRUNC_DIV_EXPR - || !TYPE_UNSIGNED (vectype)) - return false; - - machine_mode mode = TYPE_MODE (vectype); - unsigned int flags = aarch64_classify_vector_mode (mode); - if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) - return false; - - int pow = wi::exact_log2 (cst + 1); - auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype)); - /* SVE actually has a div operator, we may have gotten here through - that route. */ - if (pow != (int) (element_precision (vectype) / 2) - || insn_code == CODE_FOR_nothing) - return false; - - /* We can use the optimized pattern. */ - if (in0 == NULL_RTX && in1 == NULL_RTX) - return true; - - gcc_assert (output); - - expand_operand ops[3]; - create_output_operand (&ops[0], *output, mode); - create_input_operand (&ops[1], in0, mode); - create_fixed_operand (&ops[2], in1); - expand_insn (insn_code, 3, ops); - *output = ops[0].value; - return true; -} - /* Generate a byte permute mask for a register of mode MODE, which has NUNITS units. 
*/ @@ -27902,13 +27875,13 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_MAX_ANCHOR_OFFSET #define TARGET_MAX_ANCHOR_OFFSET 4095 +#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT +#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \ + aarch64_vectorize_preferred_div_as_shifts_over_mult + #undef TARGET_VECTOR_ALIGNMENT #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment -#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST -#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \ - aarch64_vectorize_can_special_div_by_constant - #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ aarch64_vectorize_preferred_vector_alignment
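For reference, the identity behind these patterns (described in the comments this patch removes from aarch64-simd.md) can be checked with a small scalar C sketch; this is an illustration only and not part of the patch:

/* Illustration only, not part of the patch: the div-bitmask identity the
   removed comments describe.  For a 16-bit x,
     x / 255 == (x + ((x + 257) >> 8)) >> 8
   provided the arithmetic is done in double the precision of x.  The
   AArch64 sequences above implement the same steps with ADDHN (add, keep
   the high half) and UADDW (widening add).  */
#include <assert.h>
#include <stdint.h>

static uint16_t
udiv255 (uint16_t x)
{
  uint32_t wide = x;                       /* double the precision of x */
  uint32_t top = (wide + 257) >> 8;        /* add 257, keep the high part */
  return (uint16_t) ((wide + top) >> 8);   /* widening add, final shift */
}

int
main (void)
{
  /* Exhaustive check over all 16-bit inputs.  */
  for (uint32_t x = 0; x <= UINT16_MAX; x++)
    assert (udiv255 ((uint16_t) x) == x / 255);
  return 0;
}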