Hi All, In plenty of image and video processing code it's common to modify pixel values by a widening operation and then scale them back into range by dividing by 255. This patch adds an optab to allow us to emit an optimized sequence when doing an unsigned division that is equivalent to: x = y / (2 ^ (bitsize (y)/2)-1 Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * internal-fn.def (DIV_POW2_BITMASK): New. * optabs.def (udiv_pow2_bitmask_optab): New. * doc/md.texi: Document it. * tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize pattern. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-div-bitmask-1.c: New test. * gcc.dg/vect/vect-div-bitmask-2.c: New test. * gcc.dg/vect/vect-div-bitmask-3.c: New test. * gcc.dg/vect/vect-div-bitmask.h: New file. --- inline copy of patch -- diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -5588,6 +5588,18 @@ signed op0, op1; op0 = op1 / (1 << imm); @end smallexample +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern +@item @samp{udiv_pow2_bitmask@var{m2}} +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern +@itemx @samp{udiv_pow2_bitmask@var{m2}} +Unsigned vector division by an immediate that is equivalent to +@samp{2^(bitsize(m) / 2) - 1}. +@smallexample +unsigned short op0; op1; +@dots{} +op0 = op1 / 0xffU; +@end smallexample + @cindex @code{vec_shl_insert_@var{m}} instruction pattern @item @samp{vec_shl_insert_@var{m}} Shift the elements in vector input operand 1 left one element (i.e.@: diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW, vec_shl_insert, binary) DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary) +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW, + udiv_pow2_bitmask, unary) DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary) DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) diff --git a/gcc/optabs.def b/gcc/optabs.def index 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3") OPTAB_D (umulhs_optab, "umulhs$a3") OPTAB_D (umulhrs_optab, "umulhrs$a3") OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3") +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2") OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a") OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a") OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a") diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c new file mode 100644 index 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c @@ -0,0 +1,25 @@ +/* { dg-require-effective-target vect_int } */ + +#include +#include "tree-vect.h" + +#define N 50 +#define TYPE uint8_t + +__attribute__((noipa, noinline, optimize("O1"))) +void fun1(TYPE* restrict pixel, TYPE level, int n) +{ + for (int i = 0; i < n; i+=1) + pixel[i] = (pixel[i] * level) / 0xff; +} + +__attribute__((noipa, noinline, optimize("O3"))) +void fun2(TYPE* restrict pixel, TYPE level, int n) +{ + for (int i = 0; i < n; i+=1) + pixel[i] = (pixel[i] * level) / 0xff; +} + +#include "vect-div-bitmask.h" + +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c new file mode 100644 index 0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c @@ -0,0 +1,25 @@ +/* { dg-require-effective-target vect_int } */ + +#include +#include "tree-vect.h" + +#define N 50 +#define TYPE uint16_t + +__attribute__((noipa, noinline, optimize("O1"))) +void fun1(TYPE* restrict pixel, TYPE level, int n) +{ + for (int i = 0; i < n; i+=1) + pixel[i] = (pixel[i] * level) / 0xffffU; +} + +__attribute__((noipa, noinline, optimize("O3"))) +void fun2(TYPE* restrict pixel, TYPE level, int n) +{ + for (int i = 0; i < n; i+=1) + pixel[i] = (pixel[i] * level) / 0xffffU; +} + +#include "vect-div-bitmask.h" + +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c new file mode 100644 index 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c @@ -0,0 +1,26 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */ + +#include +#include "tree-vect.h" + +#define N 50 +#define TYPE uint32_t + +__attribute__((noipa, noinline, optimize("O1"))) +void fun1(TYPE* restrict pixel, TYPE level, int n) +{ + for (int i = 0; i < n; i+=1) + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; +} + +__attribute__((noipa, noinline, optimize("O3"))) +void fun2(TYPE* restrict pixel, TYPE level, int n) +{ + for (int i = 0; i < n; i+=1) + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; +} + +#include "vect-div-bitmask.h" + +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h new file mode 100644 index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h @@ -0,0 +1,43 @@ +#include + +#ifndef N +#define N 65 +#endif + +#ifndef TYPE +#define TYPE uint32_t +#endif + +#ifndef DEBUG +#define DEBUG 0 +#endif + +#define BASE ((TYPE) -1 < 0 ? -126 : 4) + +int main () +{ + TYPE a[N]; + TYPE b[N]; + + for (int i = 0; i < N; ++i) + { + a[i] = BASE + i * 13; + b[i] = BASE + i * 13; + if (DEBUG) + printf ("%d: 0x%x\n", i, a[i]); + } + + fun1 (a, N / 2, N); + fun2 (b, N / 2, N); + + for (int i = 0; i < N; ++i) + { + if (DEBUG) + printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]); + + if (a[i] != b[i]) + __builtin_abort (); + } + return 0; +} + diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo, return pattern_stmt; } + else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1) + && rhs_code != TRUNC_MOD_EXPR) + { + wide_int icst = wi::to_wide (oprnd1); + wide_int val = wi::add (icst, 1); + int pow = wi::exact_log2 (val); + if (pow == (prec / 2)) + { + /* Pattern detected. */ + vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt); + + *type_out = vectype; + + /* Check if the target supports this internal function. */ + internal_fn ifn = IFN_DIV_POW2_BITMASK; + if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED)) + { + tree var_div = vect_recog_temp_ssa_var (itype, NULL); + gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0); + gimple_call_set_lhs (div_stmt, var_div); + + gimple_set_location (div_stmt, gimple_location (last_stmt)); + + return div_stmt; + } + } + } if (prec > HOST_BITS_PER_WIDE_INT || integer_zerop (oprnd1)) --