Hi,

Here's the respun patch.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/108583
	* target.def (preferred_div_as_shifts_over_mult): New.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
	* targhooks.h (default_preferred_div_as_shifts_over_mult): New.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.

gcc/testsuite/ChangeLog:

	PR target/108583
	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
	* gcc.dg/vect/vect-div-bitmask-5.c: New test.

--- inline copy of patch ---

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 50a8872a6695b18b9bed0d393bacf733833633db..bf7269e323de1a065d4d04376e5a2703cbb0f9fa 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6137,6 +6137,12 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+Sometimes it is possible to implement a vector division using a sequence
+of two addition-shift pairs, giving four instructions in total.
+Return true if taking this approach for @var{vectype} is likely
+to be better than using a sequence involving highpart multiplication.
+Default is false if @code{can_mult_highpart_p}, otherwise true.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})

diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4173,6 +4173,7 @@ address; but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION

diff --git a/gcc/target.def b/gcc/target.def
index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..e4474a3ed6bd2f5f5c010bf0d40c2a371370490c 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,18 @@ correct for most targets.",
 poly_uint64, (const_tree type),
 default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "Sometimes it is possible to implement a vector division using a sequence\n\
+of two addition-shift pairs, giving four instructions in total.\n\
+Return true if taking this approach for @var{vectype} is likely\n\
+to be better than using a sequence involving highpart multiplication.\n\
+Default is false if @code{can_mult_highpart_p}, otherwise true.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N
    iterations) for the given scalar type.  */
 DEFHOOK

diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index a6a4809ca91baa5d7fad2244549317a31390f0c2..a207963b9e6eb9300df0043e1b79aa6c941d0f7f 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+  (const_tree);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 
 extern tree default_stack_protect_guard (void);

diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 211525720a620d6f533e2da91e03877337a931e7..7f39ff9b7ec2bf66625d48a47bb76e96c05a3233 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree type)
+{
+  return !can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
 bool

diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */

diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..183f1a623fbde34f505259cf8f4fb4d34e069614 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3914,6 +3914,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
       return pattern_stmt;
     }
 
+  if ((cst = uniform_integer_cst_p (oprnd1))
+      && TYPE_UNSIGNED (itype)
+      && rhs_code == TRUNC_DIV_EXPR
+      && vectype
+      && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+    {
+      /* We can use the relationship:
+
+           x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)
+
+         to optimize cases where N+1 is a power of 2, and where // (N+1)
+         is therefore a shift right.  When operating in modes that are
+         multiples of a byte in size, there are two cases:
+
+         (1) N(N+3) is not representable, in which case the question
+             becomes whether the replacement expression overflows.
+             It is enough to test that x+N+2 does not overflow,
+             i.e. that x < MAX-(N+1).
+
+         (2) N(N+3) is representable, in which case it is the (only)
+             bound that we need to check.
+
+         ??? For now we just handle the case where // (N+1) is a shift
+         right by half the precision, since some architectures can
+         optimize the associated addition and shift combinations
+         into single instructions.  */
+
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == prec / 2)
+        {
+          gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+          gimple_ranger ranger;
+          int_range_max r;
+
+          /* Check that no overflow will occur.  If we don't have range
+             information we can't perform the optimization.  */
+          if (ranger.range_of_expr (r, oprnd0, stmt))
+            {
+              wide_int max = r.upper_bound ();
+              wide_int one = wi::shwi (1, prec);
+              wide_int adder = wi::add (one, wi::lshift (one, pow));
+              wi::overflow_type ovf;
+              wi::add (max, adder, UNSIGNED, &ovf);
+              if (ovf == wi::OVF_NONE)
+                {
+                  *type_out = vectype;
+                  tree tadder = wide_int_to_tree (itype, adder);
+                  tree rshift = wide_int_to_tree (itype, pow);
+
+                  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+                  gassign *patt1
+                    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+                  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+                  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+                                               rshift);
+                  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+                  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+                                               oprnd0);
+                  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+                  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+                  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+                                                      new_lhs3, rshift);
+
+                  return pattern_stmt;
+                }
+            }
+        }
+    }
+
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
     return NULL;
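
As a footnote for reviewers: the identity the new pattern relies on is easy to
sanity-check exhaustively for the one case the patch currently handles, a shift
right by half the precision.  The standalone program below (an illustrative
sketch only, not part of the patch) verifies

  x / 0xff == (((x + 0x101) >> 8) + x) >> 8

for every 16-bit input.  Here N = 0xff, N+1 = 256 = 1 << 8, the adder
N+2 = 0x101 matches what the pattern computes from wi::add, and the bound
N(N+3) = 65790 exceeds UINT16_MAX, so the identity itself covers all 16-bit
values.  The ranger-based overflow check in the patch is still needed because
the vectorized code performs x+N+2 in the narrow type, where it can wrap;
this program sidesteps that by computing in 32 bits.

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  for (uint32_t x = 0; x <= UINT16_MAX; x++)
    {
      /* Mirrors the four-statement add/shift/add/shift sequence that
         vect_recog_divmod_pattern emits (new_lhs1 .. new_lhs4).  */
      uint32_t t = (x + 0x101) >> 8;   /* (x + N + 2) / (N + 1)  */
      uint32_t q = (t + x) >> 8;       /* (t + x) / (N + 1)  */
      if (q != x / 0xff)
        {
          printf ("mismatch at 0x%x: got %u, want %u\n", x, q, x / 0xff);
          return 1;
        }
    }
  printf ("identity holds for all 16-bit inputs\n");
  return 0;
}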