Ping, and I've updated the hook to allow differentiating between ISAs.

As Andy said before, initializing a ranger instance is cheap but not free,
and if the intention is to call it often during a pass then it should be
instantiated at pass startup and passed along to the places that need it.
That is a big refactoring and doesn't seem right to do in this PR, but we
should do it in GCC 14.  Currently we only instantiate the ranger after a
long series of much cheaper checks.
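For reference (illustrative only, not part of the patch): the pattern relies
on the identity that, for a 16-bit unsigned x, x / 0xff equals
(x + ((x + 257) >> 8)) >> 8 when the additions are computed in double the
precision of x, or are otherwise known not to overflow.  A minimal scalar
sketch in C, with uint32_t standing in for the doubled precision:

  #include <stdint.h>
  #include <assert.h>

  /* Shift-based form of x / 0xff for 16-bit x.  The intermediate
     arithmetic is done in uint32_t so that x + 257 cannot wrap.  */
  static uint16_t
  div255 (uint16_t x)
  {
    uint32_t t = ((uint32_t) x + 257) >> 8;
    return ((uint32_t) x + t) >> 8;
  }

  int
  main (void)
  {
    /* Exhaustively check the identity over the whole 16-bit range.  */
    for (uint32_t x = 0; x <= 0xffff; x++)
      assert (div255 (x) == x / 255);
    return 0;
  }

The new vect_recog_divmod_pattern code below emits the vector form of these
two add/shift pairs, using ranger to prove that the first addition cannot
overflow in the original precision, in which case no widening is needed.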
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/108583
	* target.def (preferred_div_as_shifts_over_mult): New.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
	* targhooks.h (default_preferred_div_as_shifts_over_mult): New.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.

gcc/testsuite/ChangeLog:

	PR target/108583
	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
	* gcc.dg/vect/vect-div-bitmask-5.c: New test.

--- inline copy of patch ---

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 50a8872a6695b18b9bed0d393bacf733833633db..f69f7f036272e867ea1c3fee851b117f057f68c5 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6137,6 +6137,10 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+If possible, when decomposing a division operation of vectors of
+type @var{type} during vectorization, prefer to use shifts rather than
+multiplication by magic constants.
+@end deftypefn
 
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4173,6 +4173,7 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
diff --git a/gcc/target.def b/gcc/target.def
index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..bdee9b7f9c941508738fac49593b5baa525e2915 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,16 @@ correct for most targets.",
 poly_uint64, (const_tree type),
 default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "If possible, when decomposing a division operation of vectors of\n\
+type @var{type} during vectorization, prefer to use shifts rather than\n\
+multiplication by magic constants.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N iterations)
    for the given scalar type.  */
 DEFHOOK
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index a6a4809ca91baa5d7fad2244549317a31390f0c2..a207963b9e6eb9300df0043e1b79aa6c941d0f7f 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+  (const_tree);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 
 extern tree default_stack_protect_guard (void);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 211525720a620d6f533e2da91e03877337a931e7..becea6ef4b6329cfa0b676f8d844630fbdc97f20 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree /* type */)
+{
+  return false;
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 5ab5b944a573ad24ce8427aff24fc5215bf05dac..26ed91d58fa4709a67c903ad446d267a3113c172 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3346,6 +3346,20 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 				     param_avoid_fma_max_bits));
   bool defer = check_defer;
   bool seen_negate_p = false;
+
+  /* There is no numerical difference between fused and unfused integer FMAs,
+     and the assumption below that FMA is as cheap as addition is unlikely
+     to be true, especially if the multiplication occurs multiple times on
+     the same chain.  E.g., for something like:
+
+	 (((a * b) + c) >> 1) + (a * b)
+
+     we do not want to duplicate the a * b into two additions, not least
+     because the result is not a natural FMA chain.  */
+  if (ANY_INTEGRAL_TYPE_P (type)
+      && !has_single_use (mul_result))
+    return false;
+
   /* Make sure that the multiplication statement becomes dead after
      the transformation, thus that all uses are transformed to FMAs.
      This means we assume that an FMA operation has the same cost
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..27fb4c6a59f0182c4a836f96ab6b5e2e405a18a0 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3913,6 +3913,84 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((cst = uniform_integer_cst_p (oprnd1))
+	   && TYPE_UNSIGNED (itype)
+	   && rhs_code == TRUNC_DIV_EXPR
+	   && vectype
+	   && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+    {
+      /* Div optimizations using narrowings: we can do the division of,
+	 e.g., shorts by 255 faster by calculating it as
+	 (x + ((x + 257) >> 8)) >> 8, assuming the operation is done in
+	 double the precision of x.
+
+	 If we imagine a short as being composed of two blocks of bytes, then
+	 adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+	 adding 1 to each sub component:
+
+	       short value of 16-bits
+	 ┌──────────────┬────────────────┐
+	 │              │                │
+	 └──────────────┴────────────────┘
+	   8-bit part1 ▲  8-bit part2   ▲
+		       │                │
+		       │                │
+		      +1               +1
+
+	 After the first addition we have to shift right by 8 and narrow the
+	 result back to a byte.  Remember that the addition must be done in
+	 double the precision of the input.  However, if we know that the
+	 addition `x + 257` does not overflow then we can do the operation in
+	 the current precision, in which case we don't need the pack and
+	 unpacks.  */
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == (int) (element_precision (vectype) / 2))
+	{
+	  gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+	  gimple_ranger ranger;
+	  int_range_max r;
+
+	  /* Check that no overflow will occur.  If we don't have range
+	     information we can't perform the optimization.  */
+
+	  if (ranger.range_of_expr (r, oprnd0, stmt))
+	    {
+	      wide_int max = r.upper_bound ();
+	      wide_int one = wi::to_wide (build_one_cst (itype));
+	      wide_int adder = wi::add (one, wi::lshift (one, pow));
+	      wi::overflow_type ovf;
+	      wi::add (max, adder, UNSIGNED, &ovf);
+	      if (ovf == wi::OVF_NONE)
+		{
+		  *type_out = vectype;
+		  tree tadder = wide_int_to_tree (itype, adder);
+		  tree rshift = wide_int_to_tree (itype, pow);
+
+		  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+		  gassign *patt1
+		    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+					       rshift);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+					       oprnd0);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+		  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+						      new_lhs3, rshift);
+
+		  return pattern_stmt;
+		}
+	    }
+	}
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))