diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}