diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7f212bf37cd2c120dceb7efa733c9fa76226f029..26871a56d1fdb134f0ad9d828ce68a8df0272c53 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4867,62 +4867,48 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
 }
 )
 
-;; div optimizations using narrowings
-;; we can do the division e.g. shorts by 255 faster by calculating it as
-;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
-;; double the precision of x.
-;;
-;; If we imagine a short as being composed of two blocks of bytes then
-;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
-;; adding 1 to each sub component:
-;;
-;;      short value of 16-bits
-;; ┌──────────────┬────────────────┐
-;; │              │                │
-;; └──────────────┴────────────────┘
-;;   8-bit part1 ▲  8-bit part2   ▲
-;;               │                │
-;;               │                │
-;;              +1               +1
-;;
-;; after the first addition, we have to shift right by 8, and narrow the
-;; results back to a byte.  Remember that the addition must be done in
-;; double the precision of the input.  Since 8 is half the size of a short
-;; we can use a narrowing halfing instruction in AArch64, addhn which also
-;; does the addition in a wider precision and narrows back to a byte.  The
-;; shift itself is implicit in the operation as it writes back only the top
-;; half of the result. i.e. bits 2*esize-1:esize.
-;;
-;; Since we have narrowed the result of the first part back to a byte, for
-;; the second addition we can use a widening addition, uaddw.
-;;
-;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
-;;
-;; The shift is later optimized by combine to a uzp2 with movi #0.
-(define_expand "@aarch64_bitmask_udiv<mode>3"
-  [(match_operand:VQN 0 "register_operand")
-   (match_operand:VQN 1 "register_operand")
-   (match_operand:VQN 2 "immediate_operand")]
+;; Implement add_highpart as ADD + RSHIFT.  We have various optimizations for
+;; narrowing operations represented as shifts, so this representation allows
+;; us to further optimize the sequence should the result require narrowing.
+;; The alternative representation of ADDHN + UXTL is less efficient and
+;; harder to optimize further.
+(define_expand "add3_highpart" + [(set (match_operand:VQN 0 "register_operand") + (unspec:VQN [(match_operand:VQN 1 "register_operand") + (match_operand:VQN 2 "register_operand")] + ADD_HIGHPART))] + "TARGET_SIMD" +{ + rtx result = gen_reg_rtx (mode); + int shift_amount = GET_MODE_UNIT_BITSIZE (mode) / 2; + rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, + shift_amount); + emit_insn (gen_add3 (result, operands[1], operands[2])); + emit_insn (gen_aarch64_simd_lshr (operands[0], result, shift_vector)); + DONE; +}) + +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector +(define_insn_and_split "*bitmask_shift_plus" + [(set (match_operand:VQN 0 "register_operand" "=w") + (plus:VQN + (lshiftrt:VQN + (plus:VQN (match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "register_operand" "w")) + (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" "Dr")) + (match_operand:VQN 4 "register_operand" "w")))] "TARGET_SIMD" + "#" + "&& !reload_completed" + [(const_int 0)] { - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (mode); - rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); - emit_move_insn (addend, lowpart_subreg (mode, val, mode)); - rtx tmp1 = gen_reg_rtx (mode); - rtx tmp2 = gen_reg_rtx (mode); - emit_insn (gen_aarch64_addhn (tmp1, operands[1], addend)); - unsigned bitsize = GET_MODE_UNIT_BITSIZE (mode); - rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, bitsize); - emit_insn (gen_aarch64_uaddw (tmp2, operands[1], tmp1)); - emit_insn (gen_aarch64_simd_lshr (operands[0], tmp2, shift_vector)); + rtx tmp = gen_reg_rtx (mode); + emit_insn (gen_aarch64_addhn (tmp, operands[1], operands[2])); + emit_insn (gen_aarch64_uaddw (operands[0], operands[4], tmp)); DONE; -}) +} + [(set_attr "type" "neon_add_halve")] +) ;; pmul. diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 40c0728a7e6f00c395c360ce7625bc2e4a018809..ad01c1ddf9257cec951ed0c16558a3c4d856813b 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2317,39 +2317,51 @@ (define_insn "@aarch64_sve_" ;; ---- [INT] Misc optab implementations ;; ------------------------------------------------------------------------- ;; Includes: -;; - aarch64_bitmask_udiv +;; - add_highpart ;; ------------------------------------------------------------------------- -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; See aarch64-simd.md for bigger explanation. -(define_expand "@aarch64_bitmask_udiv3" - [(match_operand:SVE_FULL_HSDI 0 "register_operand") - (match_operand:SVE_FULL_HSDI 1 "register_operand") - (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] +;; Implement add_highpart as ADD + RSHIFT, we have various optimization for +;; narrowing represented as shifts and so this representation will allow us to +;; further optimize this should the result require narrowing. The alternative +;; representation of ADDHN + UXTL is less efficient and harder to futher +;; optimize. 
+(define_expand "add3_highpart" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand") + (unspec:SVE_FULL_HSDI + [(match_operand:SVE_FULL_HSDI 1 "register_operand") + (match_operand:SVE_FULL_HSDI 2 "register_operand")] + ADD_HIGHPART))] "TARGET_SVE2" { - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; + rtx result = gen_reg_rtx (mode); + int shift_amount = GET_MODE_UNIT_BITSIZE (mode) / 2; + rtx shift_vector = aarch64_simd_gen_const_vector_dup (mode, + shift_amount); + emit_insn (gen_add3 (result, operands[1], operands[2])); + emit_insn (gen_vlshr3 (operands[0], result, shift_vector)); + DONE; +}) - rtx addend = gen_reg_rtx (mode); +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector +(define_insn_and_split "*bitmask_shift_plus" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") + (unspec:SVE_FULL_HSDI [ + (match_operand: 1 "register_operand" "Upl") + (lshiftrt:SVE_FULL_HSDI + (plus:SVE_FULL_HSDI + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w") + (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")) + (match_operand:SVE_FULL_HSDI 4 "aarch64_simd_shift_imm_vec_exact_top" "Dr")) + ] UNSPEC_PRED_X))] + "TARGET_SVE2" + "#" + "&& !reload_completed" + [(const_int 0)] +{ rtx tmp1 = gen_reg_rtx (mode); - rtx tmp2 = gen_reg_rtx (mode); - rtx val = aarch64_simd_gen_const_vector_dup (mode, 1); - emit_move_insn (addend, lowpart_subreg (mode, val, mode)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp1, operands[1], - addend)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp2, operands[1], - lowpart_subreg (mode, tmp1, - mode))); + emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, mode, tmp1, operands[2], operands[3])); emit_move_insn (operands[0], - lowpart_subreg (mode, tmp2, mode)); + lowpart_subreg (mode, tmp1, mode)); DONE; }) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..8a04feb29e6bfb423a09dde2cd64853e69d0e1ba 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -24363,46 +24363,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ret; } - -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */ - -bool -aarch64_vectorize_can_special_div_by_constant (enum tree_code code, - tree vectype, wide_int cst, - rtx *output, rtx in0, rtx in1) -{ - if (code != TRUNC_DIV_EXPR - || !TYPE_UNSIGNED (vectype)) - return false; - - machine_mode mode = TYPE_MODE (vectype); - unsigned int flags = aarch64_classify_vector_mode (mode); - if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) - return false; - - int pow = wi::exact_log2 (cst + 1); - auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype)); - /* SVE actually has a div operator, we may have gotten here through - that route. */ - if (pow != (int) (element_precision (vectype) / 2) - || insn_code == CODE_FOR_nothing) - return false; - - /* We can use the optimized pattern. */ - if (in0 == NULL_RTX && in1 == NULL_RTX) - return true; - - gcc_assert (output); - - expand_operand ops[3]; - create_output_operand (&ops[0], *output, mode); - create_input_operand (&ops[1], in0, mode); - create_fixed_operand (&ops[2], in1); - expand_insn (insn_code, 3, ops); - *output = ops[0].value; - return true; -} - /* Generate a byte permute mask for a register of mode MODE, which has NUNITS units. 
    which has NUNITS units.  */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 6cbc97cc82c06a68259bdf4dec8a0eab230081e5..ae627ae56cbd1e8b882e596dba974e74ef396e0e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -750,6 +750,8 @@ (define_c_enum "unspec"
     UNSPEC_REVH          ; Used in aarch64-sve.md.
     UNSPEC_REVW          ; Used in aarch64-sve.md.
     UNSPEC_REVBHW        ; Used in aarch64-sve.md.
+    UNSPEC_SADD_HIGHPART ; Used in aarch64-sve.md.
+    UNSPEC_UADD_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
     UNSPEC_FMLA          ; Used in aarch64-sve.md.
@@ -2704,6 +2706,7 @@ (define_int_iterator UNPACK [UNSPEC_UNPACKSHI UNSPEC_UNPACKUHI
 
 (define_int_iterator UNPACK_UNSIGNED [UNSPEC_UNPACKULO UNSPEC_UNPACKUHI])
 
+(define_int_iterator ADD_HIGHPART [UNSPEC_SADD_HIGHPART UNSPEC_UADD_HIGHPART])
 (define_int_iterator MUL_HIGHPART [UNSPEC_SMUL_HIGHPART UNSPEC_UMUL_HIGHPART])
 
 (define_int_iterator CLAST [UNSPEC_CLASTA UNSPEC_CLASTB])
@@ -3342,6 +3345,8 @@ (define_int_attr su [(UNSPEC_SADDV "s")
                     (UNSPEC_UNPACKUHI "u")
                     (UNSPEC_UNPACKSLO "s")
                     (UNSPEC_UNPACKULO "u")
+                    (UNSPEC_SADD_HIGHPART "s")
+                    (UNSPEC_UADD_HIGHPART "u")
                     (UNSPEC_SMUL_HIGHPART "s")
                     (UNSPEC_UMUL_HIGHPART "u")
                     (UNSPEC_COND_FCVTZS "s")
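
For reference, the scalar identity behind these patterns (described in the removed comments above) is that dividing a 16-bit value x by 255 can be computed as (x + ((x + 257) >> 8)) >> 8 when evaluated in wider precision.  The vector lowering maps the inner add-and-shift to ADDHN, the outer add to UADDW and the final shift to USHR; with the new representation the same step becomes a plain ADD followed by a right shift by half the element size.  The following standalone C program is an illustrative sketch only, not part of the patch, and simply checks the identity over every 16-bit value:

/* Check: x / 255 == (x + ((x + 257) >> 8)) >> 8 for all 16-bit x,
   evaluated in 32-bit arithmetic so nothing overflows.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  for (uint32_t x = 0; x <= UINT16_MAX; x++)
    {
      /* ADDHN step: add 0x0101 and keep the high 8 bits of the 16-bit lane,
	 i.e. an add done in wider precision followed by a narrowing shift.  */
      uint32_t high = (x + 257) >> 8;
      /* UADDW + USHR steps: widen-add the narrowed result back onto x,
	 then shift right by the narrow element size.  */
      uint32_t q = (x + high) >> 8;
      assert (q == x / 255);
    }
  printf ("x / 255 == (x + ((x + 257) >> 8)) >> 8 holds for all 16-bit x\n");
  return 0;
}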