public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
@ 2022-06-09  4:39 Tamar Christina
  2022-06-09  4:40 ` [PATCH 2/2]AArch64 aarch64: Add implementation for pow2 bitmask division Tamar Christina
                   ` (7 more replies)
  0 siblings, 8 replies; 35+ messages in thread
From: Tamar Christina @ 2022-06-09  4:39 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 8070 bytes --]

Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

This patch adds an optab to allow us to emit an optimized sequence when doing
an unsigned division that is equivalent to:

   x = y / (2 ^ (bitsize (y)/2) - 1)

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* internal-fn.def (DIV_POW2_BITMASK): New.
	* optabs.def (udiv_pow2_bitmask_optab): New.
	* doc/md.texi: Document it.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize pattern.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
	* gcc.dg/vect/vect-div-bitmask.h: New file.

--- inline copy of patch -- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5588,6 +5588,18 @@ signed op0, op1;
 op0 = op1 / (1 << imm);
 @end smallexample
 
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@item @samp{udiv_pow2_bitmask@var{m2}}
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@itemx @samp{udiv_pow2_bitmask@var{m2}}
+Unsigned vector division by an immediate that is equivalent to
+@samp{2^(bitsize(m) / 2) - 1}.
+@smallexample
+unsigned short op0; op1;
+@dots{}
+op0 = op1 / 0xffU;
+@end smallexample
+
 @cindex @code{vec_shl_insert_@var{m}} instruction pattern
 @item @samp{vec_shl_insert_@var{m}}
 Shift the elements in vector input operand 1 left one element (i.e.@:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
 		       vec_shl_insert, binary)
 
 DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary)
+DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
+		       udiv_pow2_bitmask, unary)
 
 DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
 DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
 OPTAB_D (umulhs_optab, "umulhs$a3")
 OPTAB_D (umulhrs_optab, "umulhrs$a3")
 OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
+OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
 OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
 OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
 OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
+	   && rhs_code != TRUNC_MOD_EXPR)
+    {
+      wide_int icst = wi::to_wide (oprnd1);
+      wide_int val = wi::add (icst, 1);
+      int pow = wi::exact_log2 (val);
+      if (pow == (prec / 2))
+	{
+	  /* Pattern detected.  */
+	  vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
+
+	  *type_out = vectype;
+
+	  /* Check if the target supports this internal function.  */
+	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
+	  if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
+	    {
+	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
+	      gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
+	      gimple_call_set_lhs (div_stmt, var_div);
+
+	      gimple_set_location (div_stmt, gimple_location (last_stmt));
+
+	      return div_stmt;
+	    }
+	}
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))




-- 

[-- Attachment #2: rb15779.patch --]
[-- Type: text/plain, Size: 7169 bytes --]

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5588,6 +5588,18 @@ signed op0, op1;
 op0 = op1 / (1 << imm);
 @end smallexample
 
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@item @samp{udiv_pow2_bitmask@var{m2}}
+@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
+@itemx @samp{udiv_pow2_bitmask@var{m2}}
+Unsigned vector division by an immediate that is equivalent to
+@samp{2^(bitsize(m) / 2) - 1}.
+@smallexample
+unsigned short op0; op1;
+@dots{}
+op0 = op1 / 0xffU;
+@end smallexample
+
 @cindex @code{vec_shl_insert_@var{m}} instruction pattern
 @item @samp{vec_shl_insert_@var{m}}
 Shift the elements in vector input operand 1 left one element (i.e.@:
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
 		       vec_shl_insert, binary)
 
 DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary)
+DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
+		       udiv_pow2_bitmask, unary)
 
 DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
 DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
 OPTAB_D (umulhs_optab, "umulhs$a3")
 OPTAB_D (umulhrs_optab, "umulhrs$a3")
 OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
+OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
 OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
 OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
 OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
+	   && rhs_code != TRUNC_MOD_EXPR)
+    {
+      wide_int icst = wi::to_wide (oprnd1);
+      wide_int val = wi::add (icst, 1);
+      int pow = wi::exact_log2 (val);
+      if (pow == (prec / 2))
+	{
+	  /* Pattern detected.  */
+	  vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
+
+	  *type_out = vectype;
+
+	  /* Check if the target supports this internal function.  */
+	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
+	  if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
+	    {
+	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
+	      gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
+	      gimple_call_set_lhs (div_stmt, var_div);
+
+	      gimple_set_location (div_stmt, gimple_location (last_stmt));
+
+	      return div_stmt;
+	    }
+	}
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))




^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 2/2]AArch64 aarch64: Add implementation for pow2 bitmask division.
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
@ 2022-06-09  4:40 ` Tamar Christina
  2022-06-13  9:24 ` [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Richard Biener
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Tamar Christina @ 2022-06-09  4:40 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 7291 bytes --]

Hi All,

This adds an implementation for the new optab for unsigned pow2 bitmask for
AArch64.

The implementation rewrites:

   x = y / (2 ^ (sizeof (y)/2) - 1)

into e.g. (for bytes)

   (x + ((x + 257) >> 8)) >> 8

where it's required that the additions be done in double the precision of x
such that we don't lose any bits during an overflow.

Essentially the sequence decomposes the division into doing two smaller
divisions, one each for the top and bottom parts of the number, and adding the
results back together.

To account for the fact that a shift by 8 would be division by 256 we add 1 to
both parts of x such that when the input is 255 we still get 1 as the answer.

Because the amounts we shift by are half the width of the original datatype we
can use the halving instructions the ISA provides to do the operation instead
of using actual shifts.

For AArch64 this means we generate for:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
    pixel[i] = (pixel[i] * level) / 0xff;
}

the following:

	movi    v3.16b, 0x1
	umull2  v1.8h, v0.16b, v2.16b
	umull   v0.8h, v0.8b, v2.8b
	addhn   v5.8b, v1.8h, v3.8h
	addhn   v4.8b, v0.8h, v3.8h
	uaddw   v1.8h, v1.8h, v5.8b
	uaddw   v0.8h, v0.8h, v4.8b
	uzp2    v0.16b, v0.16b, v1.16b

instead of:

	umull   v2.8h, v1.8b, v5.8b
	umull2  v1.8h, v1.16b, v5.16b
	umull   v0.4s, v2.4h, v3.4h
	umull2  v2.4s, v2.8h, v3.8h
	umull   v4.4s, v1.4h, v3.4h
	umull2  v1.4s, v1.8h, v3.8h
	uzp2    v0.8h, v0.8h, v2.8h
	uzp2    v1.8h, v4.8h, v1.8h
	shrn    v0.8b, v0.8h, 7
	shrn2   v0.16b, v1.8h, 7

Which results in significantly faster code.

Thanks to Wilco for the concept.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (udiv_pow2_bitmask<mode>2): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/div-by-bitmask.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 18733428f3fb91d937346aa360f6d1fe13ca1eae..6b0405924a03a243949a6741f4c0e989d9ca2869 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4845,6 +4845,57 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halfing instruction in AArch64, addhn which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result. i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the finaly shift, since it's unsigned arithmatic we emit an ushr by 8
+;; to shift and the vectorizer.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "udiv_pow2_bitmask<mode>2"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..c03aee695ef834fbe3533a21d54a218160b0007d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99 -fdump-tree-vect -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** 	umull2	v[0-9]+.8h, v[0-9]+.16b, v[0-9]+.16b
+** 	umull	v[0-9]+.8h, v[0-9]+.8b, v[0-9]+.8b
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	umull2	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+** 	umull	v[0-9]+.4s, v[0-9]+.4h, v[0-9]+.4h
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	umull2	v[0-9]+.2d, v[0-9]+.4s, v[0-9]+.4s
+** 	umull	v[0-9]+.2d, v[0-9]+.2s, v[0-9]+.2s
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+/* Costing for long vectorization seems off, so disable
+   the cost model to test the codegen.  */
+__attribute__ ((optimize("-fno-vect-cost-model")))
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+/* { dg-final { scan-tree-dump-times "\.DIV_POW2_BITMASK" 6 "vect" } } */




-- 

[-- Attachment #2: rb15780.patch --]
[-- Type: text/plain, Size: 5362 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 18733428f3fb91d937346aa360f6d1fe13ca1eae..6b0405924a03a243949a6741f4c0e989d9ca2869 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4845,6 +4845,57 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halfing instruction in AArch64, addhn which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result. i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the finaly shift, since it's unsigned arithmatic we emit an ushr by 8
+;; to shift and the vectorizer.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "udiv_pow2_bitmask<mode>2"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")]
+  "TARGET_SIMD"
+{
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..c03aee695ef834fbe3533a21d54a218160b0007d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,70 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99 -fdump-tree-vect -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** 	umull2	v[0-9]+.8h, v[0-9]+.16b, v[0-9]+.16b
+** 	umull	v[0-9]+.8h, v[0-9]+.8b, v[0-9]+.8b
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	umull2	v[0-9]+.4s, v[0-9]+.8h, v[0-9]+.8h
+** 	umull	v[0-9]+.4s, v[0-9]+.4h, v[0-9]+.4h
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	umull2	v[0-9]+.2d, v[0-9]+.4s, v[0-9]+.4s
+** 	umull	v[0-9]+.2d, v[0-9]+.2s, v[0-9]+.2s
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+/* Costing for long vectorization seems off, so disable
+   the cost model to test the codegen.  */
+__attribute__ ((optimize("-fno-vect-cost-model")))
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+/* { dg-final { scan-tree-dump-times "\.DIV_POW2_BITMASK" 6 "vect" } } */




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
  2022-06-09  4:40 ` [PATCH 2/2]AArch64 aarch64: Add implementation for pow2 bitmask division Tamar Christina
@ 2022-06-13  9:24 ` Richard Biener
  2022-06-13  9:39   ` Richard Biener
  2022-09-23  9:33 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Tamar Christina
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 35+ messages in thread
From: Richard Biener @ 2022-06-13  9:24 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, richard.sandiford

On Thu, 9 Jun 2022, Tamar Christina wrote:

> Hi All,
> 
> In plenty of image and video processing code it's common to modify pixel values
> by a widening operation and then scale them back into range by dividing by 255.
> 
> This patch adds an optab to allow us to emit an optimized sequence when doing
> an unsigned division that is equivalent to:
> 
> >    x = y / (2 ^ (bitsize (y)/2) - 1)
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?

Looking at 2/2 it seems that this is the wrong way to attack the
problem.  The ISA doesn't have such instruction so adding an optab
looks premature.  I suppose that there's no unsigned vector integer
division and thus we open-code that in a different way?  Isn't the
correct thing then to fixup that open-coding if it is more efficient?

Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* internal-fn.def (DIV_POW2_BITMASK): New.
> 	* optabs.def (udiv_pow2_bitmask_optab): New.
> 	* doc/md.texi: Document it.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize pattern.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5588,6 +5588,18 @@ signed op0, op1;
>  op0 = op1 / (1 << imm);
>  @end smallexample
>  
> +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> +@item @samp{udiv_pow2_bitmask@var{m2}}
> +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> +@itemx @samp{udiv_pow2_bitmask@var{m2}}
> +Unsigned vector division by an immediate that is equivalent to
> +@samp{2^(bitsize(m) / 2) - 1}.
> +@smallexample
> > +unsigned short op0, op1;
> +@dots{}
> +op0 = op1 / 0xffU;
> +@end smallexample
> +
>  @cindex @code{vec_shl_insert_@var{m}} instruction pattern
>  @item @samp{vec_shl_insert_@var{m}}
>  Shift the elements in vector input operand 1 left one element (i.e.@:
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
>  		       vec_shl_insert, binary)
>  
>  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary)
> +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
> +		       udiv_pow2_bitmask, unary)
>  
>  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
>  DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
>  OPTAB_D (umulhs_optab, "umulhs$a3")
>  OPTAB_D (umulhrs_optab, "umulhrs$a3")
>  OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
>  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
>  OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
>  OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>  
>        return pattern_stmt;
>      }
> +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
> +	   && rhs_code != TRUNC_MOD_EXPR)
> +    {
> +      wide_int icst = wi::to_wide (oprnd1);
> +      wide_int val = wi::add (icst, 1);
> +      int pow = wi::exact_log2 (val);
> +      if (pow == (prec / 2))
> +	{
> +	  /* Pattern detected.  */
> +	  vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
> +
> +	  *type_out = vectype;
> +
> +	  /* Check if the target supports this internal function.  */
> +	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
> +	  if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
> +	    {
> +	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> +	      gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
> +	      gimple_call_set_lhs (div_stmt, var_div);
> +
> +	      gimple_set_location (div_stmt, gimple_location (last_stmt));
> +
> +	      return div_stmt;
> +	    }
> +	}
> +    }
>  
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> 
> 
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-13  9:24 ` [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Richard Biener
@ 2022-06-13  9:39   ` Richard Biener
  2022-06-13 10:09     ` Tamar Christina
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Biener @ 2022-06-13  9:39 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, richard.sandiford

On Mon, 13 Jun 2022, Richard Biener wrote:

> On Thu, 9 Jun 2022, Tamar Christina wrote:
> 
> > Hi All,
> > 
> > In plenty of image and video processing code it's common to modify pixel values
> > by a widening operation and then scale them back into range by dividing by 255.
> > 
> > This patch adds an optab to allow us to emit an optimized sequence when doing
> > an unsigned division that is equivalent to:
> > 
> > >    x = y / (2 ^ (bitsize (y)/2) - 1)
> > 
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> > 
> > Ok for master?
> 
> Looking at 2/2 it seems that this is the wrong way to attack the
> problem.  The ISA doesn't have such instruction so adding an optab
> looks premature.  I suppose that there's no unsigned vector integer
> division and thus we open-code that in a different way?  Isn't the
> correct thing then to fixup that open-coding if it is more efficient?

Btw, on x86 we use

t.c:3:21: note:   replacing earlier pattern patt_25 = patt_28 / 255;
t.c:3:21: note:   with patt_25 = patt_19 >> 7;
t.c:3:21: note:   extra pattern stmt: patt_19 = patt_28 h* 32897;

which translates to

        vpmulhuw        %ymm4, %ymm0, %ymm0
        vpmulhuw        %ymm4, %ymm1, %ymm1
        vpsrlw  $7, %ymm0, %ymm0
        vpsrlw  $7, %ymm1, %ymm1

there's odd

        vpand   %ymm0, %ymm3, %ymm0
        vpand   %ymm1, %ymm3, %ymm1

before (%ymm3 is all 0x00ff)

        vpackuswb       %ymm1, %ymm0, %ymm0

that's not visible in GIMPLE.  I guess aarch64 lacks a highpart
multiply here?  In any case, it seems that generic division expansion
could be improved here? (choose_multiplier?)

Richard.

> Richard.
> 
> > Thanks,
> > Tamar
> > 
> > gcc/ChangeLog:
> > 
> > 	* internal-fn.def (DIV_POW2_BITMASK): New.
> > 	* optabs.def (udiv_pow2_bitmask_optab): New.
> > 	* doc/md.texi: Document it.
> > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize pattern.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > 
> > --- inline copy of patch -- 
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > index f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7b4f6e945866c38 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -5588,6 +5588,18 @@ signed op0, op1;
> >  op0 = op1 / (1 << imm);
> >  @end smallexample
> >  
> > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> > +@item @samp{udiv_pow2_bitmask@var{m2}}
> > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> > +@itemx @samp{udiv_pow2_bitmask@var{m2}}
> > +Unsigned vector division by an immediate that is equivalent to
> > +@samp{2^(bitsize(m) / 2) - 1}.
> > +@smallexample
> > > +unsigned short op0, op1;
> > +@dots{}
> > +op0 = op1 / 0xffU;
> > +@end smallexample
> > +
> >  @cindex @code{vec_shl_insert_@var{m}} instruction pattern
> >  @item @samp{vec_shl_insert_@var{m}}
> >  Shift the elements in vector input operand 1 left one element (i.e.@:
> > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > index d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898f916ed45de475f 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
> >  		       vec_shl_insert, binary)
> >  
> >  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW, sdiv_pow2, binary)
> > +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST | ECF_NOTHROW,
> > +		       udiv_pow2_bitmask, unary)
> >  
> >  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
> >  DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
> > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > index 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f4eed71b0494e17f 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
> >  OPTAB_D (umulhs_optab, "umulhs$a3")
> >  OPTAB_D (umulhrs_optab, "umulhrs$a3")
> >  OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> > +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
> >  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
> >  OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")
> >  OPTAB_D (vec_pack_trunc_optab, "vec_pack_trunc_$a")
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0bead1f6a452de3f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint8_t 
> > +
> > +__attribute__((noipa, noinline, optimize("O1")))
> > +void fun1(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff;
> > +}
> > +
> > +__attribute__((noipa, noinline, optimize("O3")))
> > +void fun2(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff;
> > +}
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..009e16e1b36497e5724410d9843f1ce122b26dda
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint16_t 
> > +
> > +__attribute__((noipa, noinline, optimize("O1")))
> > +void fun1(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU;
> > +}
> > +
> > +__attribute__((noipa, noinline, optimize("O3")))
> > +void fun2(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU;
> > +}
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d94220df849cc47930b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t 
> > +
> > +__attribute__((noipa, noinline, optimize("O1")))
> > +void fun1(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> > +}
> > +
> > +__attribute__((noipa, noinline, optimize("O3")))
> > +void fun2(TYPE* restrict pixel, TYPE level, int n)
> > +{
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> > +}
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern: detected" "vect" } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 65
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N / 2, N);
> > +  fun2 (b, N / 2, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > index 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d716310ca8d82957b5 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >  
> >        return pattern_stmt;
> >      }
> > +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
> > +	   && rhs_code != TRUNC_MOD_EXPR)
> > +    {
> > +      wide_int icst = wi::to_wide (oprnd1);
> > +      wide_int val = wi::add (icst, 1);
> > +      int pow = wi::exact_log2 (val);
> > +      if (pow == (prec / 2))
> > +	{
> > +	  /* Pattern detected.  */
> > +	  vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
> > +
> > +	  *type_out = vectype;
> > +
> > +	  /* Check if the target supports this internal function.  */
> > +	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
> > +	  if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
> > +	    {
> > +	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> > +	      gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
> > +	      gimple_call_set_lhs (div_stmt, var_div);
> > +
> > +	      gimple_set_location (div_stmt, gimple_location (last_stmt));
> > +
> > +	      return div_stmt;
> > +	    }
> > +	}
> > +    }
> >  
> >    if (prec > HOST_BITS_PER_WIDE_INT
> >        || integer_zerop (oprnd1))
> > 
> > 
> > 
> > 
> > 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-13  9:39   ` Richard Biener
@ 2022-06-13 10:09     ` Tamar Christina
  2022-06-13 11:47       ` Richard Biener
  0 siblings, 1 reply; 35+ messages in thread
From: Tamar Christina @ 2022-06-13 10:09 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, Richard Sandiford

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Monday, June 13, 2022 10:39 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: Re: [PATCH 1/2]middle-end Support optimized division by pow2
> bitmask
> 
> On Mon, 13 Jun 2022, Richard Biener wrote:
> 
> > On Thu, 9 Jun 2022, Tamar Christina wrote:
> >
> > > Hi All,
> > >
> > > In plenty of image and video processing code it's common to modify
> > > pixel values by a widening operation and then scale them back into range
> by dividing by 255.
> > >
> > > This patch adds an optab to allow us to emit an optimized sequence
> > > when doing an unsigned division that is equivalent to:
> > >
> > >    x = y / (2 ^ (bitsize (y)/2) - 1)
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > x86_64-pc-linux-gnu and no issues.
> > >
> > > Ok for master?
> >
> > Looking at 2/2 it seems that this is the wrong way to attack the
> > problem.  The ISA doesn't have such instruction so adding an optab
> > looks premature.  I suppose that there's no unsigned vector integer
> > division and thus we open-code that in a different way?  Isn't the
> > correct thing then to fixup that open-coding if it is more efficient?
> 

The problem is that even if you fixup the open-coding it would need to
be something target specific? The sequence of instructions we generate
don't have a GIMPLE representation.  So whatever is generated I'd have to fixup
in RTL then.

The problem with this is that it seemed fragile. We generate from the
Vectorizer:

  vect__3.8_35 = MEM <vector(16) unsigned char> [(uint8_t *)_21];
  vect_patt_28.9_37 = WIDEN_MULT_LO_EXPR <vect__3.8_35, vect_cst__36>;
  vect_patt_28.9_38 = WIDEN_MULT_HI_EXPR <vect__3.8_35, vect_cst__36>;
  vect_patt_19.10_40 = vect_patt_28.9_37 h* { 32897, 32897, 32897, 32897, 32897, 32897, 32897, 32897 };
  vect_patt_19.10_41 = vect_patt_28.9_38 h* { 32897, 32897, 32897, 32897, 32897, 32897, 32897, 32897 };
  vect_patt_25.11_42 = vect_patt_19.10_40 >> 7;
  vect_patt_25.11_43 = vect_patt_19.10_41 >> 7;
  vect_patt_11.12_44 = VEC_PACK_TRUNC_EXPR <vect_patt_25.11_42, vect_patt_25.11_43>;

and if the magic constants change then we miss the optimization. I could rewrite the open coding to use
shifts alone, but that might be a regression for some uarches I would imagine.

> Btw, on x86 we use
> 
> t.c:3:21: note:   replacing earlier pattern patt_25 = patt_28 / 255;
> t.c:3:21: note:   with patt_25 = patt_19 >> 7;
> t.c:3:21: note:   extra pattern stmt: patt_19 = patt_28 h* 32897;
> 
> which translates to
> 
>         vpmulhuw        %ymm4, %ymm0, %ymm0
>         vpmulhuw        %ymm4, %ymm1, %ymm1
>         vpsrlw  $7, %ymm0, %ymm0
>         vpsrlw  $7, %ymm1, %ymm1
> 
> there's odd
> 
>         vpand   %ymm0, %ymm3, %ymm0
>         vpand   %ymm1, %ymm3, %ymm1
> 
> before (%ymm3 is all 0x00ff)
> 
>         vpackuswb       %ymm1, %ymm0, %ymm0
> 
> that's not visible in GIMPLE.  I guess aarch64 lacks a highpart multiply here?
> In any case, it seems that generic division expansion could be improved
> here? (choose_multiplier?)

We do generate multiply highpart here, but the patch completely avoids multiplies
and shifts entirely by creative use of the ISA. Another reason I went for an optab is costing.
The chosen operations are significantly cheaper on all Arm uarches than Shifts and multiply.

This means we get vectorization in some cases where the cost model would correctly say
It's too expensive to vectorize. Particularly around double precision.

Thanks,
Tamar

> 
> Richard.
> 
> > Richard.
> >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* internal-fn.def (DIV_POW2_BITMASK): New.
> > > 	* optabs.def (udiv_pow2_bitmask_optab): New.
> > > 	* doc/md.texi: Document it.
> > > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize
> pattern.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > >
> f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7
> > > b4f6e945866c38 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -5588,6 +5588,18 @@ signed op0, op1;
> > >  op0 = op1 / (1 << imm);
> > >  @end smallexample
> > >
> > > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> @item
> > > +@samp{udiv_pow2_bitmask@var{m2}} @cindex
> > > +@code{udiv_pow2_bitmask@var{m2}} instruction pattern @itemx
> > > +@samp{udiv_pow2_bitmask@var{m2}} Unsigned vector division by an
> > > +immediate that is equivalent to
> > > +@samp{2^(bitsize(m) / 2) - 1}.
> > > +@smallexample
> > > +unsigned short op0, op1;
> > > +@dots{}
> > > +op0 = op1 / 0xffU;
> > > +@end smallexample
> > > +
> > >  @cindex @code{vec_shl_insert_@var{m}} instruction pattern  @item
> > > @samp{vec_shl_insert_@var{m}}  Shift the elements in vector input
> > > operand 1 left one element (i.e.@:
> > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index
> > >
> d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898
> > > f916ed45de475f 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT,
> ECF_CONST | ECF_NOTHROW,
> > >  		       vec_shl_insert, binary)
> > >
> > >  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW,
> > > sdiv_pow2, binary)
> > > +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST |
> ECF_NOTHROW,
> > > +		       udiv_pow2_bitmask, unary)
> > >
> > >  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
> > > DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) diff --git
> > > a/gcc/optabs.def b/gcc/optabs.def index
> > >
> 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f
> 4
> > > eed71b0494e17f 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
> OPTAB_D
> > > (umulhs_optab, "umulhs$a3")  OPTAB_D (umulhrs_optab, "umulhrs$a3")
> > > OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> > > +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
> > >  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
> > > OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")  OPTAB_D
> > > (vec_pack_trunc_optab, "vec_pack_trunc_$a") diff --git
> > > a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > new file mode 100644
> > > index
> > >
> 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0b
> > > ead1f6a452de3f
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-require-effective-target vect_int } */
> > > +
> > > +#include <stdint.h>
> > > +#include "tree-vect.h"
> > > +
> > > +#define N 50
> > > +#define TYPE uint8_t
> > > +
> > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > > +restrict pixel, TYPE level, int n) {
> > > +  for (int i = 0; i < n; i+=1)
> > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > +
> > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > > +restrict pixel, TYPE level, int n) {
> > > +  for (int i = 0; i < n; i+=1)
> > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > +
> > > +#include "vect-div-bitmask.h"
> > > +
> > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > +detected" "vect" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > new file mode 100644
> > > index
> > >
> 0000000000000000000000000000000000000000..009e16e1b36497e5724410d98
> 4
> > > 3f1ce122b26dda
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-require-effective-target vect_int } */
> > > +
> > > +#include <stdint.h>
> > > +#include "tree-vect.h"
> > > +
> > > +#define N 50
> > > +#define TYPE uint16_t
> > > +
> > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > > +restrict pixel, TYPE level, int n) {
> > > +  for (int i = 0; i < n; i+=1)
> > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > +
> > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > > +restrict pixel, TYPE level, int n) {
> > > +  for (int i = 0; i < n; i+=1)
> > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > +
> > > +#include "vect-div-bitmask.h"
> > > +
> > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > +detected" "vect" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > new file mode 100644
> > > index
> > >
> 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d942
> 2
> > > 0df849cc47930b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > @@ -0,0 +1,26 @@
> > > +/* { dg-require-effective-target vect_int } */
> > > +/* { dg-additional-options "-fno-vect-cost-model" { target
> > > +aarch64*-*-* } } */
> > > +
> > > +#include <stdint.h>
> > > +#include "tree-vect.h"
> > > +
> > > +#define N 50
> > > +#define TYPE uint32_t
> > > +
> > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > > +restrict pixel, TYPE level, int n) {
> > > +  for (int i = 0; i < n; i+=1)
> > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > +
> > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > > +restrict pixel, TYPE level, int n) {
> > > +  for (int i = 0; i < n; i+=1)
> > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > +
> > > +#include "vect-div-bitmask.h"
> > > +
> > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > +detected" "vect" } } */
> > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > new file mode 100644
> > > index
> > >
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> 8
> > > 32f28ebd07993e
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > @@ -0,0 +1,43 @@
> > > +#include <stdio.h>
> > > +
> > > +#ifndef N
> > > +#define N 65
> > > +#endif
> > > +
> > > +#ifndef TYPE
> > > +#define TYPE uint32_t
> > > +#endif
> > > +
> > > +#ifndef DEBUG
> > > +#define DEBUG 0
> > > +#endif
> > > +
> > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > > +
> > > +int main ()
> > > +{
> > > +  TYPE a[N];
> > > +  TYPE b[N];
> > > +
> > > +  for (int i = 0; i < N; ++i)
> > > +    {
> > > +      a[i] = BASE + i * 13;
> > > +      b[i] = BASE + i * 13;
> > > +      if (DEBUG)
> > > +        printf ("%d: 0x%x\n", i, a[i]);
> > > +    }
> > > +
> > > +  fun1 (a, N / 2, N);
> > > +  fun2 (b, N / 2, N);
> > > +
> > > +  for (int i = 0; i < N; ++i)
> > > +    {
> > > +      if (DEBUG)
> > > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > > +
> > > +      if (a[i] != b[i])
> > > +        __builtin_abort ();
> > > +    }
> > > +  return 0;
> > > +}
> > > +
> > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > index
> > >
> 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d71
> 6
> > > 310ca8d82957b5 100644
> > > --- a/gcc/tree-vect-patterns.cc
> > > +++ b/gcc/tree-vect-patterns.cc
> > > @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> > >
> > >        return pattern_stmt;
> > >      }
> > > +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
> > > +	   && rhs_code != TRUNC_MOD_EXPR)
> > > +    {
> > > +      wide_int icst = wi::to_wide (oprnd1);
> > > +      wide_int val = wi::add (icst, 1);
> > > +      int pow = wi::exact_log2 (val);
> > > +      if (pow == (prec / 2))
> > > +	{
> > > +	  /* Pattern detected.  */
> > > +	  vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
> > > +
> > > +	  *type_out = vectype;
> > > +
> > > +	  /* Check if the target supports this internal function.  */
> > > +	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
> > > +	  if (direct_internal_fn_supported_p (ifn, vectype,
> OPTIMIZE_FOR_SPEED))
> > > +	    {
> > > +	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> > > +	      gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
> > > +	      gimple_call_set_lhs (div_stmt, var_div);
> > > +
> > > +	      gimple_set_location (div_stmt, gimple_location (last_stmt));
> > > +
> > > +	      return div_stmt;
> > > +	    }
> > > +	}
> > > +    }
> > >
> > >    if (prec > HOST_BITS_PER_WIDE_INT
> > >        || integer_zerop (oprnd1))
> > >
> > >
> > >
> > >
> > >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> Boudien Moerman; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-13 10:09     ` Tamar Christina
@ 2022-06-13 11:47       ` Richard Biener
  2022-06-13 14:37         ` Tamar Christina
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Biener @ 2022-06-13 11:47 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, Richard Sandiford

On Mon, 13 Jun 2022, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Monday, June 13, 2022 10:39 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> > <Richard.Sandiford@arm.com>
> > Subject: Re: [PATCH 1/2]middle-end Support optimized division by pow2
> > bitmask
> > 
> > On Mon, 13 Jun 2022, Richard Biener wrote:
> > 
> > > On Thu, 9 Jun 2022, Tamar Christina wrote:
> > >
> > > > Hi All,
> > > >
> > > > In plenty of image and video processing code it's common to modify
> > > > pixel values by a widening operation and then scale them back into range
> > by dividing by 255.
> > > >
> > > > This patch adds an optab to allow us to emit an optimized sequence
> > > > when doing an unsigned division that is equivalent to:
> > > >
> > > > >    x = y / (2 ^ (bitsize (y)/2) - 1)
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > x86_64-pc-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > >
> > > Looking at 2/2 it seems that this is the wrong way to attack the
> > > problem.  The ISA doesn't have such instruction so adding an optab
> > > looks premature.  I suppose that there's no unsigned vector integer
> > > division and thus we open-code that in a different way?  Isn't the
> > > correct thing then to fixup that open-coding if it is more efficient?
> > 
> 
> The problem is that even if you fixup the open-coding it would need to
> be something target specific? The sequence of instructions we generate
> don't have a GIMPLE representation.  So whatever is generated I'd have to fixup
> in RTL then.

What's the operation that doesn't have a GIMPLE representation?

I think for costing you could resort to the *_cost functions as used
by synth_mult and friends.

> The problem with this is that it seemed fragile. We generate from the
> Vectorizer:
> 
>   vect__3.8_35 = MEM <vector(16) unsigned char> [(uint8_t *)_21];
>   vect_patt_28.9_37 = WIDEN_MULT_LO_EXPR <vect__3.8_35, vect_cst__36>;
>   vect_patt_28.9_38 = WIDEN_MULT_HI_EXPR <vect__3.8_35, vect_cst__36>;
>   vect_patt_19.10_40 = vect_patt_28.9_37 h* { 32897, 32897, 32897, 32897, 32897, 32897, 32897, 32897 };
>   vect_patt_19.10_41 = vect_patt_28.9_38 h* { 32897, 32897, 32897, 32897, 32897, 32897, 32897, 32897 };
>   vect_patt_25.11_42 = vect_patt_19.10_40 >> 7;
>   vect_patt_25.11_43 = vect_patt_19.10_41 >> 7;
>   vect_patt_11.12_44 = VEC_PACK_TRUNC_EXPR <vect_patt_25.11_42, vect_patt_25.11_43>;
> 
> and if the magic constants change then we miss the optimization. I could rewrite the open coding to use
> shifts alone, but that might be a regression for some uarches I would imagine.

OK, so you do have a highpart multiply.  I suppose the pattern is too deep
to be recognized by combine?  What's the RTL good vs. bad before combine
of one of the expressions?

> > Btw, on x86 we use
> > 
> > t.c:3:21: note:   replacing earlier pattern patt_25 = patt_28 / 255;
> > t.c:3:21: note:   with patt_25 = patt_19 >> 7;
> > t.c:3:21: note:   extra pattern stmt: patt_19 = patt_28 h* 32897;
> > 
> > which translates to
> > 
> >         vpmulhuw        %ymm4, %ymm0, %ymm0
> >         vpmulhuw        %ymm4, %ymm1, %ymm1
> >         vpsrlw  $7, %ymm0, %ymm0
> >         vpsrlw  $7, %ymm1, %ymm1
> > 
> > there's odd
> > 
> >         vpand   %ymm0, %ymm3, %ymm0
> >         vpand   %ymm1, %ymm3, %ymm1
> > 
> > before (%ymm3 is all 0x00ff)
> > 
> >         vpackuswb       %ymm1, %ymm0, %ymm0
> > 
> > that's not visible in GIMPLE.  I guess aarch64 lacks a highpart multiply here?
> > In any case, it seems that generic division expansion could be improved
> > here? (choose_multiplier?)
> 
> We do generate multiply highpart here, but the patch completely avoids multiplies
> and shifts entirely by creative use of the ISA. Another reason I went for an optab is costing.
> The chosen operations are significantly cheaper on all Arm uarches than Shifts and multiply.
> 
> This means we get vectorization in some cases where the cost model would correctly say
> It's too expensive to vectorize. Particularly around double precision.
> 
> Thanks,
> Tamar
> 
> > 
> > Richard.
> > 
> > > Richard.
> > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	* internal-fn.def (DIV_POW2_BITMASK): New.
> > > > 	* optabs.def (udiv_pow2_bitmask_optab): New.
> > > > 	* doc/md.texi: Document it.
> > > > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize
> > pattern.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > > > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > > > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > > > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > > >
> > > > --- inline copy of patch --
> > > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > > >
> > f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7
> > > > b4f6e945866c38 100644
> > > > --- a/gcc/doc/md.texi
> > > > +++ b/gcc/doc/md.texi
> > > > @@ -5588,6 +5588,18 @@ signed op0, op1;
> > > >  op0 = op1 / (1 << imm);
> > > >  @end smallexample
> > > >
> > > > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> > @item
> > > > +@samp{udiv_pow2_bitmask@var{m2}} @cindex
> > > > +@code{udiv_pow2_bitmask@var{m2}} instruction pattern @itemx
> > > > +@samp{udiv_pow2_bitmask@var{m2}} Unsigned vector division by an
> > > > +immediate that is equivalent to
> > > > +@samp{2^(bitsize(m) / 2) - 1}.
> > > > +@smallexample
> > > > +unsigned short op0; op1;
> > > > +@dots{}
> > > > +op0 = op1 / 0xffU;
> > > > +@end smallexample
> > > > +
> > > >  @cindex @code{vec_shl_insert_@var{m}} instruction pattern  @item
> > > > @samp{vec_shl_insert_@var{m}}  Shift the elements in vector input
> > > > operand 1 left one element (i.e.@:
> > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index
> > > >
> > d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898
> > > > f916ed45de475f 100644
> > > > --- a/gcc/internal-fn.def
> > > > +++ b/gcc/internal-fn.def
> > > > @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT,
> > ECF_CONST | ECF_NOTHROW,
> > > >  		       vec_shl_insert, binary)
> > > >
> > > >  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST | ECF_NOTHROW,
> > > > sdiv_pow2, binary)
> > > > +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST |
> > ECF_NOTHROW,
> > > > +		       udiv_pow2_bitmask, unary)
> > > >
> > > >  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
> > > > DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) diff --git
> > > > a/gcc/optabs.def b/gcc/optabs.def index
> > > >
> > 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f
> > 4
> > > > eed71b0494e17f 100644
> > > > --- a/gcc/optabs.def
> > > > +++ b/gcc/optabs.def
> > > > @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
> > OPTAB_D
> > > > (umulhs_optab, "umulhs$a3")  OPTAB_D (umulhrs_optab, "umulhrs$a3")
> > > > OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> > > > +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
> > > >  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
> > > > OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")  OPTAB_D
> > > > (vec_pack_trunc_optab, "vec_pack_trunc_$a") diff --git
> > > > a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > new file mode 100644
> > > > index
> > > >
> > 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0b
> > > > ead1f6a452de3f
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > @@ -0,0 +1,25 @@
> > > > +/* { dg-require-effective-target vect_int } */
> > > > +
> > > > +#include <stdint.h>
> > > > +#include "tree-vect.h"
> > > > +
> > > > +#define N 50
> > > > +#define TYPE uint8_t
> > > > +
> > > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > > > +restrict pixel, TYPE level, int n) {
> > > > +  for (int i = 0; i < n; i+=1)
> > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > +
> > > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > > > +restrict pixel, TYPE level, int n) {
> > > > +  for (int i = 0; i < n; i+=1)
> > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > +
> > > > +#include "vect-div-bitmask.h"
> > > > +
> > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > +detected" "vect" } } */
> > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > new file mode 100644
> > > > index
> > > >
> > 0000000000000000000000000000000000000000..009e16e1b36497e5724410d98
> > 4
> > > > 3f1ce122b26dda
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > @@ -0,0 +1,25 @@
> > > > +/* { dg-require-effective-target vect_int } */
> > > > +
> > > > +#include <stdint.h>
> > > > +#include "tree-vect.h"
> > > > +
> > > > +#define N 50
> > > > +#define TYPE uint16_t
> > > > +
> > > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > > > +restrict pixel, TYPE level, int n) {
> > > > +  for (int i = 0; i < n; i+=1)
> > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > +
> > > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > > > +restrict pixel, TYPE level, int n) {
> > > > +  for (int i = 0; i < n; i+=1)
> > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > +
> > > > +#include "vect-div-bitmask.h"
> > > > +
> > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > +detected" "vect" } } */
> > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > new file mode 100644
> > > > index
> > > >
> > 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d942
> > 2
> > > > 0df849cc47930b
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > @@ -0,0 +1,26 @@
> > > > +/* { dg-require-effective-target vect_int } */
> > > > +/* { dg-additional-options "-fno-vect-cost-model" { target
> > > > +aarch64*-*-* } } */
> > > > +
> > > > +#include <stdint.h>
> > > > +#include "tree-vect.h"
> > > > +
> > > > +#define N 50
> > > > +#define TYPE uint32_t
> > > > +
> > > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > > > +restrict pixel, TYPE level, int n) {
> > > > +  for (int i = 0; i < n; i+=1)
> > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > > +
> > > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > > > +restrict pixel, TYPE level, int n) {
> > > > +  for (int i = 0; i < n; i+=1)
> > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > > +
> > > > +#include "vect-div-bitmask.h"
> > > > +
> > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > +detected" "vect" } } */
> > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > new file mode 100644
> > > > index
> > > >
> > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > 8
> > > > 32f28ebd07993e
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > @@ -0,0 +1,43 @@
> > > > +#include <stdio.h>
> > > > +
> > > > +#ifndef N
> > > > +#define N 65
> > > > +#endif
> > > > +
> > > > +#ifndef TYPE
> > > > +#define TYPE uint32_t
> > > > +#endif
> > > > +
> > > > +#ifndef DEBUG
> > > > +#define DEBUG 0
> > > > +#endif
> > > > +
> > > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > > > +
> > > > +int main ()
> > > > +{
> > > > +  TYPE a[N];
> > > > +  TYPE b[N];
> > > > +
> > > > +  for (int i = 0; i < N; ++i)
> > > > +    {
> > > > +      a[i] = BASE + i * 13;
> > > > +      b[i] = BASE + i * 13;
> > > > +      if (DEBUG)
> > > > +        printf ("%d: 0x%x\n", i, a[i]);
> > > > +    }
> > > > +
> > > > +  fun1 (a, N / 2, N);
> > > > +  fun2 (b, N / 2, N);
> > > > +
> > > > +  for (int i = 0; i < N; ++i)
> > > > +    {
> > > > +      if (DEBUG)
> > > > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > > > +
> > > > +      if (a[i] != b[i])
> > > > +        __builtin_abort ();
> > > > +    }
> > > > +  return 0;
> > > > +}
> > > > +
> > > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> > > > index
> > > >
> > 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d71
> > 6
> > > > 310ca8d82957b5 100644
> > > > --- a/gcc/tree-vect-patterns.cc
> > > > +++ b/gcc/tree-vect-patterns.cc
> > > > @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> > > >
> > > >        return pattern_stmt;
> > > >      }
> > > > +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
> > > > +	   && rhs_code != TRUNC_MOD_EXPR)
> > > > +    {
> > > > +      wide_int icst = wi::to_wide (oprnd1);
> > > > +      wide_int val = wi::add (icst, 1);
> > > > +      int pow = wi::exact_log2 (val);
> > > > +      if (pow == (prec / 2))
> > > > +	{
> > > > +	  /* Pattern detected.  */
> > > > +	  vect_pattern_detected ("vect_recog_divmod_pattern", last_stmt);
> > > > +
> > > > +	  *type_out = vectype;
> > > > +
> > > > +	  /* Check if the target supports this internal function.  */
> > > > +	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
> > > > +	  if (direct_internal_fn_supported_p (ifn, vectype,
> > OPTIMIZE_FOR_SPEED))
> > > > +	    {
> > > > +	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> > > > +	      gimple *div_stmt = gimple_build_call_internal (ifn, 1, oprnd0);
> > > > +	      gimple_call_set_lhs (div_stmt, var_div);
> > > > +
> > > > +	      gimple_set_location (div_stmt, gimple_location (last_stmt));
> > > > +
> > > > +	      return div_stmt;
> > > > +	    }
> > > > +	}
> > > > +    }
> > > >
> > > >    if (prec > HOST_BITS_PER_WIDE_INT
> > > >        || integer_zerop (oprnd1))
> > > >
> > > >
> > > >
> > > >
> > > >
> > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> > Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> > Boudien Moerman; HRB 36809 (AG Nuernberg)
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-13 11:47       ` Richard Biener
@ 2022-06-13 14:37         ` Tamar Christina
  2022-06-14 13:18           ` Richard Biener
  0 siblings, 1 reply; 35+ messages in thread
From: Tamar Christina @ 2022-06-13 14:37 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, Richard Sandiford

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Monday, June 13, 2022 12:48 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: RE: [PATCH 1/2]middle-end Support optimized division by pow2
> bitmask
> 
> On Mon, 13 Jun 2022, Tamar Christina wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Monday, June 13, 2022 10:39 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> > > <Richard.Sandiford@arm.com>
> > > Subject: Re: [PATCH 1/2]middle-end Support optimized division by
> > > pow2 bitmask
> > >
> > > On Mon, 13 Jun 2022, Richard Biener wrote:
> > >
> > > > On Thu, 9 Jun 2022, Tamar Christina wrote:
> > > >
> > > > > Hi All,
> > > > >
> > > > > In plenty of image and video processing code it's common to
> > > > > modify pixel values by a widening operation and then scale them
> > > > > back into range
> > > by dividing by 255.
> > > > >
> > > > > This patch adds an optab to allow us to emit an optimized
> > > > > sequence when doing an unsigned division that is equivalent to:
> > > > >
> > > > >    x = y / (2 ^ (bitsize (y)/2) - 1)
> > > > >
> > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > > x86_64-pc-linux-gnu and no issues.
> > > > >
> > > > > Ok for master?
> > > >
> > > > Looking at 2/2 it seems that this is the wrong way to attack the
> > > > problem.  The ISA doesn't have such instruction so adding an optab
> > > > looks premature.  I suppose that there's no unsigned vector
> > > > integer division and thus we open-code that in a different way?
> > > > Isn't the correct thing then to fixup that open-coding if it is more
> efficient?
> > >
> >
> > The problem is that even if you fixup the open-coding it would need to
> > be something target specific? The sequence of instructions we generate
> > don't have a GIMPLE representation.  So whatever is generated I'd have
> > to fixup in RTL then.
> 
> What's the operation that doesn't have a GIMPLE representation?

For NEON we use two operations:
1. Add High narrowing lowpart, essentially doing (a +w b) >>.n bitsize(a)/2
    Where the + widens and the >> narrows.  So you give it two shorts, get a byte
2. Add widening add of lowpart so basically lowpart (a +w b)

For SVE2 we use a different sequence: two back-to-back sequences of:
1. Add narrow high part (bottom).  In SVE the Top and Bottom instructions select
   Even and odd elements of the vector rather than "top half" and "bottom half".

   So this instruction does: add each vector element of the first source vector to the
   corresponding vector element of the second source vector, and place the most
    significant half of the result in the even-numbered half-width destination elements,
    while setting the odd-numbered elements to zero.

So there's an explicit permute in there. The instructions are sufficiently different that there
wouldn't be a single GIMPLE representation.

> 
> I think for costing you could resort to the *_cost functions as used by
> synth_mult and friends.
> 
> > The problem with this is that it seemed fragile. We generate from the
> > Vectorizer:
> >
> >   vect__3.8_35 = MEM <vector(16) unsigned char> [(uint8_t *)_21];
> >   vect_patt_28.9_37 = WIDEN_MULT_LO_EXPR <vect__3.8_35,
> vect_cst__36>;
> >   vect_patt_28.9_38 = WIDEN_MULT_HI_EXPR <vect__3.8_35,
> vect_cst__36>;
> >   vect_patt_19.10_40 = vect_patt_28.9_37 h* { 32897, 32897, 32897, 32897,
> 32897, 32897, 32897, 32897 };
> >   vect_patt_19.10_41 = vect_patt_28.9_38 h* { 32897, 32897, 32897, 32897,
> 32897, 32897, 32897, 32897 };
> >   vect_patt_25.11_42 = vect_patt_19.10_40 >> 7;
> >   vect_patt_25.11_43 = vect_patt_19.10_41 >> 7;
> >   vect_patt_11.12_44 = VEC_PACK_TRUNC_EXPR <vect_patt_25.11_42,
> > vect_patt_25.11_43>;
> >
> > and if the magic constants change then we miss the optimization. I
> > could rewrite the open coding to use shifts alone, but that might be a
> regression for some uarches I would imagine.
> 
> OK, so you do have a highpart multiply.  I suppose the pattern is too deep to
> be recognized by combine?  What's the RTL good vs. bad before combine of
> one of the expressions?

Yeah combine only tried 2-3 instructions, but to use these sequences we have to
match the entire chain as the instructions do the narrowing themselves.  So the RTL
for the bad case before combine is

(insn 39 37 42 4 (set (reg:V4SI 119)
        (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
                    (parallel:V8HI [
                            (const_int 4 [0x4])
                            (const_int 5 [0x5])
                            (const_int 6 [0x6])
                            (const_int 7 [0x7])
                        ])))
            (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 118)
                    (parallel:V8HI [
                            (const_int 4 [0x4])
                            (const_int 5 [0x5])
                            (const_int 6 [0x6])
                            (const_int 7 [0x7])
                        ]))))) "/app/example.c":6:14 2114 {aarch64_simd_vec_umult_hi_v8hi}
     (expr_list:REG_DEAD (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
        (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
                        (parallel:V8HI [
                                (const_int 4 [0x4])
                                (const_int 5 [0x5])
                                (const_int 6 [0x6])
                                (const_int 7 [0x7])
                            ])))
                (const_vector:V4SI [
                        (const_int 32897 [0x8081]) repeated x4
                    ]))
            (nil))))
(insn 42 39 43 4 (set (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
        (unspec:V8HI [
                (subreg:V8HI (reg:V4SI 117) 0)
                (subreg:V8HI (reg:V4SI 119) 0)
            ] UNSPEC_UZP2)) "/app/example.c":6:14 4096 {aarch64_uzp2v8hi}
     (expr_list:REG_DEAD (reg:V4SI 119)
        (expr_list:REG_DEAD (reg:V4SI 117)
            (nil))))
(insn 43 42 44 4 (set (reg:V8HI 124 [ vect_patt_25.11D.3756 ])
        (lshiftrt:V8HI (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
            (const_vector:V8HI [
                    (const_int 7 [0x7]) repeated x8
                ]))) "/app/example.c":6:14 1803 {aarch64_simd_lshrv8hi}
     (expr_list:REG_DEAD (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
        (nil)))
(insn 44 43 46 4 (set (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
        (mult:V8HI (zero_extend:V8HI (vec_select:V8QI (reg:V16QI 115 [ MEM <vector(16) unsigned charD.21> [(uint8_tD.3704 *)_21 clique 1 base 1] ])
                    (parallel:V16QI [
                            (const_int 8 [0x8])
                            (const_int 9 [0x9])
                            (const_int 10 [0xa])
                            (const_int 11 [0xb])
                            (const_int 12 [0xc])
                            (const_int 13 [0xd])
                            (const_int 14 [0xe])
                            (const_int 15 [0xf])
                        ])))
            (zero_extend:V8HI (vec_select:V8QI (reg:V16QI 100 [ vect_cst__36 ])
                    (parallel:V16QI [
                            (const_int 8 [0x8])
                            (const_int 9 [0x9])
                            (const_int 10 [0xa])
                            (const_int 11 [0xb])
                            (const_int 12 [0xc])
                            (const_int 13 [0xd])
                            (const_int 14 [0xe])
                            (const_int 15 [0xf])
                        ]))))) "/app/example.c":6:14 2112 {aarch64_simd_vec_umult_hi_v16qi}
     (expr_list:REG_DEAD (reg:V16QI 115 [ MEM <vector(16) unsigned charD.21> [(uint8_tD.3704 *)_21 clique 1 base 1] ])
        (nil)))
(insn 46 44 48 4 (set (reg:V4SI 126)
        (mult:V4SI (zero_extend:V4SI (subreg:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ]) 0))
            (zero_extend:V4SI (subreg:V4HI (reg:V8HI 118) 0)))) "/app/example.c":6:14 2108 {aarch64_intrinsic_vec_umult_lo_v4hi}
     (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (subreg:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ]) 0))
            (const_vector:V4SI [
                    (const_int 32897 [0x8081]) repeated x4
                ]))
        (nil)))
(insn 48 46 51 4 (set (reg:V4SI 128)
        (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
                    (parallel:V8HI [
                            (const_int 4 [0x4])
                            (const_int 5 [0x5])
                            (const_int 6 [0x6])
                            (const_int 7 [0x7])
                        ])))
            (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 118)
                    (parallel:V8HI [
                            (const_int 4 [0x4])
                            (const_int 5 [0x5])
                            (const_int 6 [0x6])
                            (const_int 7 [0x7])
                        ]))))) "/app/example.c":6:14 2114 {aarch64_simd_vec_umult_hi_v8hi}
     (expr_list:REG_DEAD (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
        (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
                        (parallel:V8HI [
                                (const_int 4 [0x4])
                                (const_int 5 [0x5])
                                (const_int 6 [0x6])
                                (const_int 7 [0x7])
                            ])))
                (const_vector:V4SI [
                        (const_int 32897 [0x8081]) repeated x4
                    ]))
            (nil))))
(insn 51 48 52 4 (set (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
        (unspec:V8HI [
                (subreg:V8HI (reg:V4SI 126) 0)
                (subreg:V8HI (reg:V4SI 128) 0)
            ] UNSPEC_UZP2)) "/app/example.c":6:14 4096 {aarch64_uzp2v8hi}
     (expr_list:REG_DEAD (reg:V4SI 128)
        (expr_list:REG_DEAD (reg:V4SI 126)
            (nil))))
(insn 52 51 53 4 (set (reg:V8HI 133 [ vect_patt_25.11D.3756 ])
        (lshiftrt:V8HI (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
            (const_vector:V8HI [
                    (const_int 7 [0x7]) repeated x8
                ]))) "/app/example.c":6:14 1803 {aarch64_simd_lshrv8hi}
     (expr_list:REG_DEAD (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
        (nil)))

And for good:

(insn 32 30 34 4 (set (reg:V16QI 118)
        (vec_concat:V16QI (unspec:V8QI [
                    (reg:V8HI 114 [ vect_patt_28.9 ])
                    (reg:V8HI 115)
                ] UNSPEC_ADDHN)
            (const_vector:V8QI [
                    (const_int 0 [0]) repeated x8
                ]))) "draw.c":6:35 2688 {aarch64_addhnv8hi_insn_le}
     (expr_list:REG_EQUAL (vec_concat:V16QI (unspec:V8QI [
                    (reg:V8HI 114 [ vect_patt_28.9 ])
                    (const_vector:V8HI [
                            (const_int 257 [0x101]) repeated x8
                        ])
                ] UNSPEC_ADDHN)
            (const_vector:V8QI [
                    (const_int 0 [0]) repeated x8
                ]))
        (nil)))
(insn 34 32 35 4 (set (reg:V8HI 117)
        (plus:V8HI (zero_extend:V8HI (subreg:V8QI (reg:V16QI 118) 0))
            (reg:V8HI 114 [ vect_patt_28.9 ]))) "draw.c":6:35 2635 {aarch64_uaddwv8qi}
     (expr_list:REG_DEAD (reg:V16QI 118)
        (expr_list:REG_DEAD (reg:V8HI 114 [ vect_patt_28.9 ])
            (nil))))
(insn 35 34 37 4 (set (reg:V8HI 103 [ vect_patt_25.10 ])
        (lshiftrt:V8HI (reg:V8HI 117)
            (const_vector:V8HI [
                    (const_int 8 [0x8]) repeated x8
                ]))) "draw.c":6:35 1741 {aarch64_simd_lshrv8hi}
     (expr_list:REG_DEAD (reg:V8HI 117)
        (nil)))
(insn 37 35 39 4 (set (reg:V16QI 122)
        (vec_concat:V16QI (unspec:V8QI [
                    (reg:V8HI 102 [ vect_patt_28.9 ])
                    (reg:V8HI 115)
                ] UNSPEC_ADDHN)
            (const_vector:V8QI [
                    (const_int 0 [0]) repeated x8
                ]))) "draw.c":6:35 2688 {aarch64_addhnv8hi_insn_le}
     (expr_list:REG_EQUAL (vec_concat:V16QI (unspec:V8QI [
                    (reg:V8HI 102 [ vect_patt_28.9 ])
                    (const_vector:V8HI [
                            (const_int 257 [0x101]) repeated x8
                        ])
                ] UNSPEC_ADDHN)
            (const_vector:V8QI [
                    (const_int 0 [0]) repeated x8
                ]))
        (nil)))
(insn 39 37 40 4 (set (reg:V8HI 121)
        (plus:V8HI (zero_extend:V8HI (subreg:V8QI (reg:V16QI 122) 0))
            (reg:V8HI 102 [ vect_patt_28.9 ]))) "draw.c":6:35 2635 {aarch64_uaddwv8qi}
     (expr_list:REG_DEAD (reg:V16QI 122)
        (expr_list:REG_DEAD (reg:V8HI 102 [ vect_patt_28.9 ])
            (nil))))
(insn 40 39 41 4 (set (reg:V8HI 104 [ vect_patt_25.10 ])
        (lshiftrt:V8HI (reg:V8HI 121)
            (const_vector:V8HI [
                    (const_int 8 [0x8]) repeated x8
                ]))) "draw.c":6:35 1741 {aarch64_simd_lshrv8hi}

Cheers,
Tamar

> 
> > > Btw, on x86 we use
> > >
> > > t.c:3:21: note:   replacing earlier pattern patt_25 = patt_28 / 255;
> > > t.c:3:21: note:   with patt_25 = patt_19 >> 7;
> > > t.c:3:21: note:   extra pattern stmt: patt_19 = patt_28 h* 32897;
> > >
> > > which translates to
> > >
> > >         vpmulhuw        %ymm4, %ymm0, %ymm0
> > >         vpmulhuw        %ymm4, %ymm1, %ymm1
> > >         vpsrlw  $7, %ymm0, %ymm0
> > >         vpsrlw  $7, %ymm1, %ymm1
> > >
> > > there's odd
> > >
> > >         vpand   %ymm0, %ymm3, %ymm0
> > >         vpand   %ymm1, %ymm3, %ymm1
> > >
> > > before (%ymm3 is all 0x00ff)
> > >
> > >         vpackuswb       %ymm1, %ymm0, %ymm0
> > >
> > > that's not visible in GIMPLE.  I guess aarch64 lacks a highpart multiply
> here?
> > > In any case, it seems that generic division expansion could be
> > > improved here? (choose_multiplier?)
> >
> > We do generate multiply highpart here, but the patch completely avoids
> > multiplies and shifts entirely by creative use of the ISA. Another reason I
> went for an optab is costing.
> > The chosen operations are significantly cheaper on all Arm uarches than
> Shifts and multiply.
> >
> > This means we get vectorization in some cases where the cost model
> > would correctly say It's too expensive to vectorize. Particularly around
> double precision.
> >
> > Thanks,
> > Tamar
> >
> > >
> > > Richard.
> > >
> > > > Richard.
> > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > 	* internal-fn.def (DIV_POW2_BITMASK): New.
> > > > > 	* optabs.def (udiv_pow2_bitmask_optab): New.
> > > > > 	* doc/md.texi: Document it.
> > > > > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize
> > > pattern.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > > > > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > > > > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > > > > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > > > >
> > > > > --- inline copy of patch --
> > > > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > > > >
> > >
> f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7
> > > > > b4f6e945866c38 100644
> > > > > --- a/gcc/doc/md.texi
> > > > > +++ b/gcc/doc/md.texi
> > > > > @@ -5588,6 +5588,18 @@ signed op0, op1;
> > > > >  op0 = op1 / (1 << imm);
> > > > >  @end smallexample
> > > > >
> > > > > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> > > @item
> > > > > +@samp{udiv_pow2_bitmask@var{m2}} @cindex
> > > > > +@code{udiv_pow2_bitmask@var{m2}} instruction pattern @itemx
> > > > > +@samp{udiv_pow2_bitmask@var{m2}} Unsigned vector division by
> an
> > > > > +immediate that is equivalent to
> > > > > +@samp{2^(bitsize(m) / 2) - 1}.
> > > > > +@smallexample
> > > > > +unsigned short op0; op1;
> > > > > +@dots{}
> > > > > +op0 = op1 / 0xffU;
> > > > > +@end smallexample
> > > > > +
> > > > >  @cindex @code{vec_shl_insert_@var{m}} instruction pattern
> > > > > @item @samp{vec_shl_insert_@var{m}}  Shift the elements in
> > > > > vector input operand 1 left one element (i.e.@:
> > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index
> > > > >
> > >
> d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898
> > > > > f916ed45de475f 100644
> > > > > --- a/gcc/internal-fn.def
> > > > > +++ b/gcc/internal-fn.def
> > > > > @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT,
> > > ECF_CONST | ECF_NOTHROW,
> > > > >  		       vec_shl_insert, binary)
> > > > >
> > > > >  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST |
> ECF_NOTHROW,
> > > > > sdiv_pow2, binary)
> > > > > +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST |
> > > ECF_NOTHROW,
> > > > > +		       udiv_pow2_bitmask, unary)
> > > > >
> > > > >  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
> > > > > DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) diff
> > > > > --git a/gcc/optabs.def b/gcc/optabs.def index
> > > > >
> > >
> 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f
> > > 4
> > > > > eed71b0494e17f 100644
> > > > > --- a/gcc/optabs.def
> > > > > +++ b/gcc/optabs.def
> > > > > @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
> > > OPTAB_D
> > > > > (umulhs_optab, "umulhs$a3")  OPTAB_D (umulhrs_optab,
> > > > > "umulhrs$a3") OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> > > > > +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
> > > > >  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
> > > > > OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")  OPTAB_D
> > > > > (vec_pack_trunc_optab, "vec_pack_trunc_$a") diff --git
> > > > > a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > >
> 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0b
> > > > > ead1f6a452de3f
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > @@ -0,0 +1,25 @@
> > > > > +/* { dg-require-effective-target vect_int } */
> > > > > +
> > > > > +#include <stdint.h>
> > > > > +#include "tree-vect.h"
> > > > > +
> > > > > +#define N 50
> > > > > +#define TYPE uint8_t
> > > > > +
> > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > +  for (int i = 0; i < n; i+=1)
> > > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > > +
> > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > +  for (int i = 0; i < n; i+=1)
> > > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > > +
> > > > > +#include "vect-div-bitmask.h"
> > > > > +
> > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > +detected" "vect" } } */
> > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > >
> 0000000000000000000000000000000000000000..009e16e1b36497e5724410d98
> > > 4
> > > > > 3f1ce122b26dda
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > @@ -0,0 +1,25 @@
> > > > > +/* { dg-require-effective-target vect_int } */
> > > > > +
> > > > > +#include <stdint.h>
> > > > > +#include "tree-vect.h"
> > > > > +
> > > > > +#define N 50
> > > > > +#define TYPE uint16_t
> > > > > +
> > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > +  for (int i = 0; i < n; i+=1)
> > > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > > +
> > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > +  for (int i = 0; i < n; i+=1)
> > > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > > +
> > > > > +#include "vect-div-bitmask.h"
> > > > > +
> > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > +detected" "vect" } } */
> > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > >
> 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d942
> > > 2
> > > > > 0df849cc47930b
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > @@ -0,0 +1,26 @@
> > > > > +/* { dg-require-effective-target vect_int } */
> > > > > +/* { dg-additional-options "-fno-vect-cost-model" { target
> > > > > +aarch64*-*-* } } */
> > > > > +
> > > > > +#include <stdint.h>
> > > > > +#include "tree-vect.h"
> > > > > +
> > > > > +#define N 50
> > > > > +#define TYPE uint32_t
> > > > > +
> > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > +  for (int i = 0; i < n; i+=1)
> > > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > > > +
> > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > +  for (int i = 0; i < n; i+=1)
> > > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > > > +
> > > > > +#include "vect-div-bitmask.h"
> > > > > +
> > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > +detected" "vect" } } */
> > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > new file mode 100644
> > > > > index
> > > > >
> > >
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > > 8
> > > > > 32f28ebd07993e
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > @@ -0,0 +1,43 @@
> > > > > +#include <stdio.h>
> > > > > +
> > > > > +#ifndef N
> > > > > +#define N 65
> > > > > +#endif
> > > > > +
> > > > > +#ifndef TYPE
> > > > > +#define TYPE uint32_t
> > > > > +#endif
> > > > > +
> > > > > +#ifndef DEBUG
> > > > > +#define DEBUG 0
> > > > > +#endif
> > > > > +
> > > > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > > > > +
> > > > > +int main ()
> > > > > +{
> > > > > +  TYPE a[N];
> > > > > +  TYPE b[N];
> > > > > +
> > > > > +  for (int i = 0; i < N; ++i)
> > > > > +    {
> > > > > +      a[i] = BASE + i * 13;
> > > > > +      b[i] = BASE + i * 13;
> > > > > +      if (DEBUG)
> > > > > +        printf ("%d: 0x%x\n", i, a[i]);
> > > > > +    }
> > > > > +
> > > > > +  fun1 (a, N / 2, N);
> > > > > +  fun2 (b, N / 2, N);
> > > > > +
> > > > > +  for (int i = 0; i < N; ++i)
> > > > > +    {
> > > > > +      if (DEBUG)
> > > > > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > > > > +
> > > > > +      if (a[i] != b[i])
> > > > > +        __builtin_abort ();
> > > > > +    }
> > > > > +  return 0;
> > > > > +}
> > > > > +
> > > > > diff --git a/gcc/tree-vect-patterns.cc
> > > > > b/gcc/tree-vect-patterns.cc index
> > > > >
> > >
> 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d71
> > > 6
> > > > > 310ca8d82957b5 100644
> > > > > --- a/gcc/tree-vect-patterns.cc
> > > > > +++ b/gcc/tree-vect-patterns.cc
> > > > > @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info
> > > > > *vinfo,
> > > > >
> > > > >        return pattern_stmt;
> > > > >      }
> > > > > +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
> > > > > +	   && rhs_code != TRUNC_MOD_EXPR)
> > > > > +    {
> > > > > +      wide_int icst = wi::to_wide (oprnd1);
> > > > > +      wide_int val = wi::add (icst, 1);
> > > > > +      int pow = wi::exact_log2 (val);
> > > > > +      if (pow == (prec / 2))
> > > > > +	{
> > > > > +	  /* Pattern detected.  */
> > > > > +	  vect_pattern_detected ("vect_recog_divmod_pattern",
> > > > > +last_stmt);
> > > > > +
> > > > > +	  *type_out = vectype;
> > > > > +
> > > > > +	  /* Check if the target supports this internal function.  */
> > > > > +	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
> > > > > +	  if (direct_internal_fn_supported_p (ifn, vectype,
> > > OPTIMIZE_FOR_SPEED))
> > > > > +	    {
> > > > > +	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> > > > > +	      gimple *div_stmt = gimple_build_call_internal (ifn, 1,
> oprnd0);
> > > > > +	      gimple_call_set_lhs (div_stmt, var_div);
> > > > > +
> > > > > +	      gimple_set_location (div_stmt, gimple_location
> > > > > +(last_stmt));
> > > > > +
> > > > > +	      return div_stmt;
> > > > > +	    }
> > > > > +	}
> > > > > +    }
> > > > >
> > > > >    if (prec > HOST_BITS_PER_WIDE_INT
> > > > >        || integer_zerop (oprnd1))
> > > > >
> > > > >
> > > > >
> > > > >
> > > > >
> > > >
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> > > Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> > > Boudien Moerman; HRB 36809 (AG Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> Boudien Moerman; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-13 14:37         ` Tamar Christina
@ 2022-06-14 13:18           ` Richard Biener
  2022-06-14 13:38             ` Tamar Christina
  2022-06-14 13:42             ` Richard Sandiford
  0 siblings, 2 replies; 35+ messages in thread
From: Richard Biener @ 2022-06-14 13:18 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, Richard Sandiford

On Mon, 13 Jun 2022, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Monday, June 13, 2022 12:48 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> > <Richard.Sandiford@arm.com>
> > Subject: RE: [PATCH 1/2]middle-end Support optimized division by pow2
> > bitmask
> > 
> > On Mon, 13 Jun 2022, Tamar Christina wrote:
> > 
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Monday, June 13, 2022 10:39 AM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> > > > <Richard.Sandiford@arm.com>
> > > > Subject: Re: [PATCH 1/2]middle-end Support optimized division by
> > > > pow2 bitmask
> > > >
> > > > On Mon, 13 Jun 2022, Richard Biener wrote:
> > > >
> > > > > On Thu, 9 Jun 2022, Tamar Christina wrote:
> > > > >
> > > > > > Hi All,
> > > > > >
> > > > > > In plenty of image and video processing code it's common to
> > > > > > modify pixel values by a widening operation and then scale them
> > > > > > back into range
> > > > by dividing by 255.
> > > > > >
> > > > > > This patch adds an optab to allow us to emit an optimized
> > > > > > sequence when doing an unsigned division that is equivalent to:
> > > > > >
> > > > > >    x = y / (2 ^ (bitsize (y)/2)-1
> > > > > >
> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > > > x86_64-pc-linux-gnu and no issues.
> > > > > >
> > > > > > Ok for master?
> > > > >
> > > > > Looking at 2/2 it seems that this is the wrong way to attack the
> > > > > problem.  The ISA doesn't have such instruction so adding an optab
> > > > > looks premature.  I suppose that there's no unsigned vector
> > > > > integer division and thus we open-code that in a different way?
> > > > > Isn't the correct thing then to fixup that open-coding if it is more
> > efficient?
> > > >
> > >
> > > The problem is that even if you fixup the open-coding it would need to
> > > be something target specific? The sequence of instructions we generate
> > > don't have a GIMPLE representation.  So whatever is generated I'd have
> > > to fixup in RTL then.
> > 
> > What's the operation that doesn't have a GIMPLE representation?
> 
> For NEON use two operations:
> 1. Add High narrowing lowpart, essentially doing (a +w b) >>.n bitsize(a)/2
>     Where the + widens and the >> narrows.  So you give it two shorts, get a byte
> 2. Add widening add of lowpart so basically lowpart (a +w b)
> 
> For SVE2 we use a different sequence, we use two back-to-back sequences of:
> 1. Add narrow high part (bottom).  In SVE the Top and Bottom instructions select
>    Even and odd elements of the vector rather than "top half" and "bottom half".
> 
>    So this instruction does : Add each vector element of the first source vector to the
>    corresponding vector element of the second source vector, and place the most
>     significant half of the result in the even-numbered half-width destination elements,
>     while setting the odd-numbered elements to zero.
> 
> So there's an explicit permute in there. The instructions are sufficiently different that there
> wouldn't be a single GIMPLE representation.

I see.  Are these also useful to express scalar integer division?

I'll defer to others to ack the special udiv_pow2_bitmask optab
or suggest some piecemeal things other targets might be able to do as 
well.  It does look very special.  I'd also bikeshed it to
udiv_pow2m1 since 'bitmask' is less obvious than 2^n-1 (assuming
I interpreted 'bitmask' correctly ;)).  It seems to be even less
general since it is a unary op and the actual divisor is constrained
by the mode itself?

Richard.

> > 
> > I think for costing you could resort to the *_cost functions as used by
> > synth_mult and friends.
> > 
> > > The problem with this is that it seemed fragile. We generate from the
> > > Vectorizer:
> > >
> > >   vect__3.8_35 = MEM <vector(16) unsigned char> [(uint8_t *)_21];
> > >   vect_patt_28.9_37 = WIDEN_MULT_LO_EXPR <vect__3.8_35,
> > vect_cst__36>;
> > >   vect_patt_28.9_38 = WIDEN_MULT_HI_EXPR <vect__3.8_35,
> > vect_cst__36>;
> > >   vect_patt_19.10_40 = vect_patt_28.9_37 h* { 32897, 32897, 32897, 32897,
> > 32897, 32897, 32897, 32897 };
> > >   vect_patt_19.10_41 = vect_patt_28.9_38 h* { 32897, 32897, 32897, 32897,
> > 32897, 32897, 32897, 32897 };
> > >   vect_patt_25.11_42 = vect_patt_19.10_40 >> 7;
> > >   vect_patt_25.11_43 = vect_patt_19.10_41 >> 7;
> > >   vect_patt_11.12_44 = VEC_PACK_TRUNC_EXPR <vect_patt_25.11_42,
> > > vect_patt_25.11_43>;
> > >
> > > and if the magic constants change then we miss the optimization. I
> > > could rewrite the open coding to use shifts alone, but that might be a
> > regression for some uarches I would imagine.
> > 
> > OK, so you do have a highpart multiply.  I suppose the pattern is too deep to
> > be recognized by combine?  What's the RTL good vs. bad before combine of
> > one of the expressions?
> 
> Yeah combine only tried 2-3 instructions, but to use these sequences we have to
> match the entire chain as the instructions do the narrowing themselves.  So the RTL
> for the bad case before combine is
> 
> (insn 39 37 42 4 (set (reg:V4SI 119)
>         (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
>                     (parallel:V8HI [
>                             (const_int 4 [0x4])
>                             (const_int 5 [0x5])
>                             (const_int 6 [0x6])
>                             (const_int 7 [0x7])
>                         ])))
>             (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 118)
>                     (parallel:V8HI [
>                             (const_int 4 [0x4])
>                             (const_int 5 [0x5])
>                             (const_int 6 [0x6])
>                             (const_int 7 [0x7])
>                         ]))))) "/app/example.c":6:14 2114 {aarch64_simd_vec_umult_hi_v8hi}
>      (expr_list:REG_DEAD (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
>         (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
>                         (parallel:V8HI [
>                                 (const_int 4 [0x4])
>                                 (const_int 5 [0x5])
>                                 (const_int 6 [0x6])
>                                 (const_int 7 [0x7])
>                             ])))
>                 (const_vector:V4SI [
>                         (const_int 32897 [0x8081]) repeated x4
>                     ]))
>             (nil))))
> (insn 42 39 43 4 (set (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
>         (unspec:V8HI [
>                 (subreg:V8HI (reg:V4SI 117) 0)
>                 (subreg:V8HI (reg:V4SI 119) 0)
>             ] UNSPEC_UZP2)) "/app/example.c":6:14 4096 {aarch64_uzp2v8hi}
>      (expr_list:REG_DEAD (reg:V4SI 119)
>         (expr_list:REG_DEAD (reg:V4SI 117)
>             (nil))))
> (insn 43 42 44 4 (set (reg:V8HI 124 [ vect_patt_25.11D.3756 ])
>         (lshiftrt:V8HI (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
>             (const_vector:V8HI [
>                     (const_int 7 [0x7]) repeated x8
>                 ]))) "/app/example.c":6:14 1803 {aarch64_simd_lshrv8hi}
>      (expr_list:REG_DEAD (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
>         (nil)))
> (insn 44 43 46 4 (set (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
>         (mult:V8HI (zero_extend:V8HI (vec_select:V8QI (reg:V16QI 115 [ MEM <vector(16) unsigned charD.21> [(uint8_tD.3704 *)_21 clique 1 base 1] ])
>                     (parallel:V16QI [
>                             (const_int 8 [0x8])
>                             (const_int 9 [0x9])
>                             (const_int 10 [0xa])
>                             (const_int 11 [0xb])
>                             (const_int 12 [0xc])
>                             (const_int 13 [0xd])
>                             (const_int 14 [0xe])
>                             (const_int 15 [0xf])
>                         ])))
>             (zero_extend:V8HI (vec_select:V8QI (reg:V16QI 100 [ vect_cst__36 ])
>                     (parallel:V16QI [
>                             (const_int 8 [0x8])
>                             (const_int 9 [0x9])
>                             (const_int 10 [0xa])
>                             (const_int 11 [0xb])
>                             (const_int 12 [0xc])
>                             (const_int 13 [0xd])
>                             (const_int 14 [0xe])
>                             (const_int 15 [0xf])
>                         ]))))) "/app/example.c":6:14 2112 {aarch64_simd_vec_umult_hi_v16qi}
>      (expr_list:REG_DEAD (reg:V16QI 115 [ MEM <vector(16) unsigned charD.21> [(uint8_tD.3704 *)_21 clique 1 base 1] ])
>         (nil)))
> (insn 46 44 48 4 (set (reg:V4SI 126)
>         (mult:V4SI (zero_extend:V4SI (subreg:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ]) 0))
>             (zero_extend:V4SI (subreg:V4HI (reg:V8HI 118) 0)))) "/app/example.c":6:14 2108 {aarch64_intrinsic_vec_umult_lo_v4hi}
>      (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (subreg:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ]) 0))
>             (const_vector:V4SI [
>                     (const_int 32897 [0x8081]) repeated x4
>                 ]))
>         (nil)))
> (insn 48 46 51 4 (set (reg:V4SI 128)
>         (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
>                     (parallel:V8HI [
>                             (const_int 4 [0x4])
>                             (const_int 5 [0x5])
>                             (const_int 6 [0x6])
>                             (const_int 7 [0x7])
>                         ])))
>             (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 118)
>                     (parallel:V8HI [
>                             (const_int 4 [0x4])
>                             (const_int 5 [0x5])
>                             (const_int 6 [0x6])
>                             (const_int 7 [0x7])
>                         ]))))) "/app/example.c":6:14 2114 {aarch64_simd_vec_umult_hi_v8hi}
>      (expr_list:REG_DEAD (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
>         (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
>                         (parallel:V8HI [
>                                 (const_int 4 [0x4])
>                                 (const_int 5 [0x5])
>                                 (const_int 6 [0x6])
>                                 (const_int 7 [0x7])
>                             ])))
>                 (const_vector:V4SI [
>                         (const_int 32897 [0x8081]) repeated x4
>                     ]))
>             (nil))))
> (insn 51 48 52 4 (set (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
>         (unspec:V8HI [
>                 (subreg:V8HI (reg:V4SI 126) 0)
>                 (subreg:V8HI (reg:V4SI 128) 0)
>             ] UNSPEC_UZP2)) "/app/example.c":6:14 4096 {aarch64_uzp2v8hi}
>      (expr_list:REG_DEAD (reg:V4SI 128)
>         (expr_list:REG_DEAD (reg:V4SI 126)
>             (nil))))
> (insn 52 51 53 4 (set (reg:V8HI 133 [ vect_patt_25.11D.3756 ])
>         (lshiftrt:V8HI (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
>             (const_vector:V8HI [
>                     (const_int 7 [0x7]) repeated x8
>                 ]))) "/app/example.c":6:14 1803 {aarch64_simd_lshrv8hi}
>      (expr_list:REG_DEAD (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
>         (nil)))
> 
> And for good:
> 
> (insn 32 30 34 4 (set (reg:V16QI 118)
>         (vec_concat:V16QI (unspec:V8QI [
>                     (reg:V8HI 114 [ vect_patt_28.9 ])
>                     (reg:V8HI 115)
>                 ] UNSPEC_ADDHN)
>             (const_vector:V8QI [
>                     (const_int 0 [0]) repeated x8
>                 ]))) "draw.c":6:35 2688 {aarch64_addhnv8hi_insn_le}
>      (expr_list:REG_EQUAL (vec_concat:V16QI (unspec:V8QI [
>                     (reg:V8HI 114 [ vect_patt_28.9 ])
>                     (const_vector:V8HI [
>                             (const_int 257 [0x101]) repeated x8
>                         ])
>                 ] UNSPEC_ADDHN)
>             (const_vector:V8QI [
>                     (const_int 0 [0]) repeated x8
>                 ]))
>         (nil)))
> (insn 34 32 35 4 (set (reg:V8HI 117)
>         (plus:V8HI (zero_extend:V8HI (subreg:V8QI (reg:V16QI 118) 0))
>             (reg:V8HI 114 [ vect_patt_28.9 ]))) "draw.c":6:35 2635 {aarch64_uaddwv8qi}
>      (expr_list:REG_DEAD (reg:V16QI 118)
>         (expr_list:REG_DEAD (reg:V8HI 114 [ vect_patt_28.9 ])
>             (nil))))
> (insn 35 34 37 4 (set (reg:V8HI 103 [ vect_patt_25.10 ])
>         (lshiftrt:V8HI (reg:V8HI 117)
>             (const_vector:V8HI [
>                     (const_int 8 [0x8]) repeated x8
>                 ]))) "draw.c":6:35 1741 {aarch64_simd_lshrv8hi}
>      (expr_list:REG_DEAD (reg:V8HI 117)
>         (nil)))
> (insn 37 35 39 4 (set (reg:V16QI 122)
>         (vec_concat:V16QI (unspec:V8QI [
>                     (reg:V8HI 102 [ vect_patt_28.9 ])
>                     (reg:V8HI 115)
>                 ] UNSPEC_ADDHN)
>             (const_vector:V8QI [
>                     (const_int 0 [0]) repeated x8
>                 ]))) "draw.c":6:35 2688 {aarch64_addhnv8hi_insn_le}
>      (expr_list:REG_EQUAL (vec_concat:V16QI (unspec:V8QI [
>                     (reg:V8HI 102 [ vect_patt_28.9 ])
>                     (const_vector:V8HI [
>                             (const_int 257 [0x101]) repeated x8
>                         ])
>                 ] UNSPEC_ADDHN)
>             (const_vector:V8QI [
>                     (const_int 0 [0]) repeated x8
>                 ]))
>         (nil)))
> (insn 39 37 40 4 (set (reg:V8HI 121)
>         (plus:V8HI (zero_extend:V8HI (subreg:V8QI (reg:V16QI 122) 0))
>             (reg:V8HI 102 [ vect_patt_28.9 ]))) "draw.c":6:35 2635 {aarch64_uaddwv8qi}
>      (expr_list:REG_DEAD (reg:V16QI 122)
>         (expr_list:REG_DEAD (reg:V8HI 102 [ vect_patt_28.9 ])
>             (nil))))
> (insn 40 39 41 4 (set (reg:V8HI 104 [ vect_patt_25.10 ])
>         (lshiftrt:V8HI (reg:V8HI 121)
>             (const_vector:V8HI [
>                     (const_int 8 [0x8]) repeated x8
>                 ]))) "draw.c":6:35 1741 {aarch64_simd_lshrv8hi}
> 
> Cheers,
> Tamar
> 
> > 
> > > > Btw, on x86 we use
> > > >
> > > > t.c:3:21: note:   replacing earlier pattern patt_25 = patt_28 / 255;
> > > > t.c:3:21: note:   with patt_25 = patt_19 >> 7;
> > > > t.c:3:21: note:   extra pattern stmt: patt_19 = patt_28 h* 32897;
> > > >
> > > > which translates to
> > > >
> > > >         vpmulhuw        %ymm4, %ymm0, %ymm0
> > > >         vpmulhuw        %ymm4, %ymm1, %ymm1
> > > >         vpsrlw  $7, %ymm0, %ymm0
> > > >         vpsrlw  $7, %ymm1, %ymm1
> > > >
> > > > there's odd
> > > >
> > > >         vpand   %ymm0, %ymm3, %ymm0
> > > >         vpand   %ymm1, %ymm3, %ymm1
> > > >
> > > > before (%ymm3 is all 0x00ff)
> > > >
> > > >         vpackuswb       %ymm1, %ymm0, %ymm0
> > > >
> > > > that's not visible in GIMPLE.  I guess aarch64 lacks a highpart multiply
> > here?
> > > > In any case, it seems that generic division expansion could be
> > > > improved here? (choose_multiplier?)
> > >
> > > We do generate multiply highpart here, but the patch completely avoids
> > > multiplies and shifts entirely by creative use of the ISA. Another reason I
> > went for an optab is costing.
> > > The chosen operations are significantly cheaper on all Arm uarches than
> > Shifts and multiply.
> > >
> > > This means we get vectorization in some cases where the cost model
> > > would correctly say It's too expensive to vectorize. Particularly around
> > double precision.
> > >
> > > Thanks,
> > > Tamar
> > >
> > > >
> > > > Richard.
> > > >
> > > > > Richard.
> > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > 	* internal-fn.def (DIV_POW2_BITMASK): New.
> > > > > > 	* optabs.def (udiv_pow2_bitmask_optab): New.
> > > > > > 	* doc/md.texi: Document it.
> > > > > > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Recognize
> > > > pattern.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > > > > > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > > > > > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > > > > > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > > > > >
> > > > > > --- inline copy of patch --
> > > > > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > > > > >
> > > >
> > f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7
> > > > > > b4f6e945866c38 100644
> > > > > > --- a/gcc/doc/md.texi
> > > > > > +++ b/gcc/doc/md.texi
> > > > > > @@ -5588,6 +5588,18 @@ signed op0, op1;
> > > > > >  op0 = op1 / (1 << imm);
> > > > > >  @end smallexample
> > > > > >
> > > > > > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction pattern
> > > > @item
> > > > > > +@samp{udiv_pow2_bitmask@var{m2}} @cindex
> > > > > > +@code{udiv_pow2_bitmask@var{m2}} instruction pattern @itemx
> > > > > > +@samp{udiv_pow2_bitmask@var{m2}} Unsigned vector division by
> > an
> > > > > > +immediate that is equivalent to
> > > > > > +@samp{2^(bitsize(m) / 2) - 1}.
> > > > > > +@smallexample
> > > > > > +unsigned short op0; op1;
> > > > > > +@dots{}
> > > > > > +op0 = op1 / 0xffU;
> > > > > > +@end smallexample
> > > > > > +
> > > > > >  @cindex @code{vec_shl_insert_@var{m}} instruction pattern
> > > > > > @item @samp{vec_shl_insert_@var{m}}  Shift the elements in
> > > > > > vector input operand 1 left one element (i.e.@:
> > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index
> > > > > >
> > > >
> > d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898
> > > > > > f916ed45de475f 100644
> > > > > > --- a/gcc/internal-fn.def
> > > > > > +++ b/gcc/internal-fn.def
> > > > > > @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT,
> > > > ECF_CONST | ECF_NOTHROW,
> > > > > >  		       vec_shl_insert, binary)
> > > > > >
> > > > > >  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST |
> > ECF_NOTHROW,
> > > > > > sdiv_pow2, binary)
> > > > > > +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST |
> > > > ECF_NOTHROW,
> > > > > > +		       udiv_pow2_bitmask, unary)
> > > > > >
> > > > > >  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
> > > > > > DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary) diff
> > > > > > --git a/gcc/optabs.def b/gcc/optabs.def index
> > > > > >
> > > >
> > 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f
> > > > 4
> > > > > > eed71b0494e17f 100644
> > > > > > --- a/gcc/optabs.def
> > > > > > +++ b/gcc/optabs.def
> > > > > > @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
> > > > OPTAB_D
> > > > > > (umulhs_optab, "umulhs$a3")  OPTAB_D (umulhrs_optab,
> > > > > > "umulhrs$a3") OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> > > > > > +OPTAB_D (udiv_pow2_bitmask_optab, "udiv_pow2_bitmask$a2")
> > > > > >  OPTAB_D (vec_pack_sfix_trunc_optab, "vec_pack_sfix_trunc_$a")
> > > > > > OPTAB_D (vec_pack_ssat_optab, "vec_pack_ssat_$a")  OPTAB_D
> > > > > > (vec_pack_trunc_optab, "vec_pack_trunc_$a") diff --git
> > > > > > a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > >
> > 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0b
> > > > > > ead1f6a452de3f
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > > @@ -0,0 +1,25 @@
> > > > > > +/* { dg-require-effective-target vect_int } */
> > > > > > +
> > > > > > +#include <stdint.h>
> > > > > > +#include "tree-vect.h"
> > > > > > +
> > > > > > +#define N 50
> > > > > > +#define TYPE uint8_t
> > > > > > +
> > > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > > > +
> > > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > > > +
> > > > > > +#include "vect-div-bitmask.h"
> > > > > > +
> > > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > > +detected" "vect" } } */
> > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > >
> > 0000000000000000000000000000000000000000..009e16e1b36497e5724410d98
> > > > 4
> > > > > > 3f1ce122b26dda
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > > @@ -0,0 +1,25 @@
> > > > > > +/* { dg-require-effective-target vect_int } */
> > > > > > +
> > > > > > +#include <stdint.h>
> > > > > > +#include "tree-vect.h"
> > > > > > +
> > > > > > +#define N 50
> > > > > > +#define TYPE uint16_t
> > > > > > +
> > > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > > > +
> > > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > > > +
> > > > > > +#include "vect-div-bitmask.h"
> > > > > > +
> > > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > > +detected" "vect" } } */
> > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > >
> > 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d942
> > > > 2
> > > > > > 0df849cc47930b
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > > @@ -0,0 +1,26 @@
> > > > > > +/* { dg-require-effective-target vect_int } */
> > > > > > +/* { dg-additional-options "-fno-vect-cost-model" { target
> > > > > > +aarch64*-*-* } } */
> > > > > > +
> > > > > > +#include <stdint.h>
> > > > > > +#include "tree-vect.h"
> > > > > > +
> > > > > > +#define N 50
> > > > > > +#define TYPE uint32_t
> > > > > > +
> > > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > > > > +
> > > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > > > > > +
> > > > > > +#include "vect-div-bitmask.h"
> > > > > > +
> > > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > > +detected" "vect" } } */
> > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > >
> > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > > > 8
> > > > > > 32f28ebd07993e
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > > @@ -0,0 +1,43 @@
> > > > > > +#include <stdio.h>
> > > > > > +
> > > > > > +#ifndef N
> > > > > > +#define N 65
> > > > > > +#endif
> > > > > > +
> > > > > > +#ifndef TYPE
> > > > > > +#define TYPE uint32_t
> > > > > > +#endif
> > > > > > +
> > > > > > +#ifndef DEBUG
> > > > > > +#define DEBUG 0
> > > > > > +#endif
> > > > > > +
> > > > > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > > > > > +
> > > > > > +int main ()
> > > > > > +{
> > > > > > +  TYPE a[N];
> > > > > > +  TYPE b[N];
> > > > > > +
> > > > > > +  for (int i = 0; i < N; ++i)
> > > > > > +    {
> > > > > > +      a[i] = BASE + i * 13;
> > > > > > +      b[i] = BASE + i * 13;
> > > > > > +      if (DEBUG)
> > > > > > +        printf ("%d: 0x%x\n", i, a[i]);
> > > > > > +    }
> > > > > > +
> > > > > > +  fun1 (a, N / 2, N);
> > > > > > +  fun2 (b, N / 2, N);
> > > > > > +
> > > > > > +  for (int i = 0; i < N; ++i)
> > > > > > +    {
> > > > > > +      if (DEBUG)
> > > > > > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > > > > > +
> > > > > > +      if (a[i] != b[i])
> > > > > > +        __builtin_abort ();
> > > > > > +    }
> > > > > > +  return 0;
> > > > > > +}
> > > > > > +
> > > > > > diff --git a/gcc/tree-vect-patterns.cc
> > > > > > b/gcc/tree-vect-patterns.cc index
> > > > > >
> > > >
> > 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d71
> > > > 6
> > > > > > 310ca8d82957b5 100644
> > > > > > --- a/gcc/tree-vect-patterns.cc
> > > > > > +++ b/gcc/tree-vect-patterns.cc
> > > > > > @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info
> > > > > > *vinfo,
> > > > > >
> > > > > >        return pattern_stmt;
> > > > > >      }
> > > > > > +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1) != 1)
> > > > > > +	   && rhs_code != TRUNC_MOD_EXPR)
> > > > > > +    {
> > > > > > +      wide_int icst = wi::to_wide (oprnd1);
> > > > > > +      wide_int val = wi::add (icst, 1);
> > > > > > +      int pow = wi::exact_log2 (val);
> > > > > > +      if (pow == (prec / 2))
> > > > > > +	{
> > > > > > +	  /* Pattern detected.  */
> > > > > > +	  vect_pattern_detected ("vect_recog_divmod_pattern",
> > > > > > +last_stmt);
> > > > > > +
> > > > > > +	  *type_out = vectype;
> > > > > > +
> > > > > > +	  /* Check if the target supports this internal function.  */
> > > > > > +	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
> > > > > > +	  if (direct_internal_fn_supported_p (ifn, vectype,
> > > > OPTIMIZE_FOR_SPEED))
> > > > > > +	    {
> > > > > > +	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> > > > > > +	      gimple *div_stmt = gimple_build_call_internal (ifn, 1,
> > oprnd0);
> > > > > > +	      gimple_call_set_lhs (div_stmt, var_div);
> > > > > > +
> > > > > > +	      gimple_set_location (div_stmt, gimple_location
> > > > > > +(last_stmt));
> > > > > > +
> > > > > > +	      return div_stmt;
> > > > > > +	    }
> > > > > > +	}
> > > > > > +    }
> > > > > >
> > > > > >    if (prec > HOST_BITS_PER_WIDE_INT
> > > > > >        || integer_zerop (oprnd1))
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > >
> > > > >
> > > >
> > > > --
> > > > Richard Biener <rguenther@suse.de>
> > > > SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> > > > Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> > > > Boudien Moerman; HRB 36809 (AG Nuernberg)
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> > Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> > Boudien Moerman; HRB 36809 (AG Nuernberg)
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-14 13:18           ` Richard Biener
@ 2022-06-14 13:38             ` Tamar Christina
  2022-06-14 13:42             ` Richard Sandiford
  1 sibling, 0 replies; 35+ messages in thread
From: Tamar Christina @ 2022-06-14 13:38 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, Richard Sandiford

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Tuesday, June 14, 2022 2:19 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: RE: [PATCH 1/2]middle-end Support optimized division by pow2
> bitmask
> 
> On Mon, 13 Jun 2022, Tamar Christina wrote:
> 
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Monday, June 13, 2022 12:48 PM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> > > <Richard.Sandiford@arm.com>
> > > Subject: RE: [PATCH 1/2]middle-end Support optimized division by
> > > pow2 bitmask
> > >
> > > On Mon, 13 Jun 2022, Tamar Christina wrote:
> > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <rguenther@suse.de>
> > > > > Sent: Monday, June 13, 2022 10:39 AM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> > > > > <Richard.Sandiford@arm.com>
> > > > > Subject: Re: [PATCH 1/2]middle-end Support optimized division by
> > > > > pow2 bitmask
> > > > >
> > > > > On Mon, 13 Jun 2022, Richard Biener wrote:
> > > > >
> > > > > > On Thu, 9 Jun 2022, Tamar Christina wrote:
> > > > > >
> > > > > > > Hi All,
> > > > > > >
> > > > > > > In plenty of image and video processing code it's common to
> > > > > > > modify pixel values by a widening operation and then scale
> > > > > > > them back into range
> > > > > by dividing by 255.
> > > > > > >
> > > > > > > This patch adds an optab to allow us to emit an optimized
> > > > > > > sequence when doing an unsigned division that is equivalent to:
> > > > > > >
> > > > > > >    x = y / (2 ^ (bitsize (y)/2)-1
> > > > > > >
> > > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > > > > x86_64-pc-linux-gnu and no issues.
> > > > > > >
> > > > > > > Ok for master?
> > > > > >
> > > > > > Looking at 2/2 it seems that this is the wrong way to attack
> > > > > > the problem.  The ISA doesn't have such instruction so adding
> > > > > > an optab looks premature.  I suppose that there's no unsigned
> > > > > > vector integer division and thus we open-code that in a different
> way?
> > > > > > Isn't the correct thing then to fixup that open-coding if it
> > > > > > is more
> > > efficient?
> > > > >
> > > >
> > > > The problem is that even if you fixup the open-coding it would
> > > > need to be something target specific? The sequence of instructions
> > > > we generate don't have a GIMPLE representation.  So whatever is
> > > > generated I'd have to fixup in RTL then.
> > >
> > > What's the operation that doesn't have a GIMPLE representation?
> >
> > For NEON we use two operations:
> > 1. Add high narrowing lowpart, essentially doing (a +w b) >>.n bitsize(a)/2,
> >    where the + widens and the >> narrows.  So you give it two shorts and
> >    get a byte.
> > 2. Add widening add of lowpart, so basically lowpart (a +w b)
> >
> > For SVE2 we use a different sequence, we use two back-to-back
> > sequences of:
> > 1. Add narrow high part (bottom).  In SVE the Top and Bottom
> >    instructions select even and odd elements of the vector rather than
> >    "top half" and "bottom half".
> >
> >    So this instruction does: add each vector element of the first
> >    source vector to the corresponding vector element of the second
> >    source vector, and place the most significant half of the result in
> >    the even-numbered half-width destination elements, while setting
> >    the odd-numbered elements to zero.
> >
> > So there's an explicit permute in there.  The instructions are
> > sufficiently different that there wouldn't be a single GIMPLE
> > representation.
> 
> I see.  Are these also useful to express scalar integer division?

Hmm, not these exact instructions, as they only exist on vector.  Scalar may
potentially benefit from rewriting this to (x + ((x + 257) >> 8)) >> 8,
which avoids the multiply with the magic constant.  But the problem here is
that unless it is undone for vector it would likely generate worse code if
vectorized exactly like this on most ISAs compared to what we have now.

> 
> I'll defer to others to ack the special udiv_pow2_bitmask optab or suggest
> some piecemail things other targets might be able to do as well.  It does look
> very special.  I'd also bikeshed it to
> udiv_pow2m1 since 'bitmask' is less obvious than 2^n-1 (assuming I
> interpreted 'bitmask' correctly ;)).  It seems to be even less general since it is
> an unary op and the actual divisor is constrained by the mode itself?

I am happy to change the name, and quite happy to add the constant as an
argument.  I had only made it this specific because this was the only fairly
common operation I had found.  Though perhaps it's indeed better to keep
the optab a bit more general?

Thanks,
Tamar

> 
> Richard.
> 
> > >
> > > I think for costing you could resort to the *_cost functions as used
> > > by synth_mult and friends.
> > >
> > > > The problem with this is that it seemed fragile. We generate from
> > > > the
> > > > Vectorizer:
> > > >
> > > >   vect__3.8_35 = MEM <vector(16) unsigned char> [(uint8_t *)_21];
> > > >   vect_patt_28.9_37 = WIDEN_MULT_LO_EXPR <vect__3.8_35,
> > > vect_cst__36>;
> > > >   vect_patt_28.9_38 = WIDEN_MULT_HI_EXPR <vect__3.8_35,
> > > vect_cst__36>;
> > > >   vect_patt_19.10_40 = vect_patt_28.9_37 h* { 32897, 32897, 32897,
> > > > 32897,
> > > 32897, 32897, 32897, 32897 };
> > > >   vect_patt_19.10_41 = vect_patt_28.9_38 h* { 32897, 32897, 32897,
> > > > 32897,
> > > 32897, 32897, 32897, 32897 };
> > > >   vect_patt_25.11_42 = vect_patt_19.10_40 >> 7;
> > > >   vect_patt_25.11_43 = vect_patt_19.10_41 >> 7;
> > > >   vect_patt_11.12_44 = VEC_PACK_TRUNC_EXPR <vect_patt_25.11_42,
> > > > vect_patt_25.11_43>;
> > > >
> > > > and if the magic constants change then we miss the optimization. I
> > > > could rewrite the open coding to use shifts alone, but that might
> > > > be a
> > > regression for some uarches I would imagine.
> > >
> > > OK, so you do have a highpart multiply.  I suppose the pattern is
> > > too deep to be recognized by combine?  What's the RTL good vs. bad
> > > before combine of one of the expressions?
> >
> > Yeah combine only tried 2-3 instructions, but to use these sequences
> > we have to match the entire chain as the instructions do the narrowing
> > themselves.  So the RTL for the bad case before combine is
> >
> > (insn 39 37 42 4 (set (reg:V4SI 119)
> >         (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 116 [
> vect_patt_28.9D.3754 ])
> >                     (parallel:V8HI [
> >                             (const_int 4 [0x4])
> >                             (const_int 5 [0x5])
> >                             (const_int 6 [0x6])
> >                             (const_int 7 [0x7])
> >                         ])))
> >             (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 118)
> >                     (parallel:V8HI [
> >                             (const_int 4 [0x4])
> >                             (const_int 5 [0x5])
> >                             (const_int 6 [0x6])
> >                             (const_int 7 [0x7])
> >                         ]))))) "/app/example.c":6:14 2114
> {aarch64_simd_vec_umult_hi_v8hi}
> >      (expr_list:REG_DEAD (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
> >         (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (vec_select:V4HI
> (reg:V8HI 116 [ vect_patt_28.9D.3754 ])
> >                         (parallel:V8HI [
> >                                 (const_int 4 [0x4])
> >                                 (const_int 5 [0x5])
> >                                 (const_int 6 [0x6])
> >                                 (const_int 7 [0x7])
> >                             ])))
> >                 (const_vector:V4SI [
> >                         (const_int 32897 [0x8081]) repeated x4
> >                     ]))
> >             (nil))))
> > (insn 42 39 43 4 (set (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
> >         (unspec:V8HI [
> >                 (subreg:V8HI (reg:V4SI 117) 0)
> >                 (subreg:V8HI (reg:V4SI 119) 0)
> >             ] UNSPEC_UZP2)) "/app/example.c":6:14 4096 {aarch64_uzp2v8hi}
> >      (expr_list:REG_DEAD (reg:V4SI 119)
> >         (expr_list:REG_DEAD (reg:V4SI 117)
> >             (nil))))
> > (insn 43 42 44 4 (set (reg:V8HI 124 [ vect_patt_25.11D.3756 ])
> >         (lshiftrt:V8HI (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
> >             (const_vector:V8HI [
> >                     (const_int 7 [0x7]) repeated x8
> >                 ]))) "/app/example.c":6:14 1803 {aarch64_simd_lshrv8hi}
> >      (expr_list:REG_DEAD (reg:V8HI 121 [ vect_patt_19.10D.3755 ])
> >         (nil)))
> > (insn 44 43 46 4 (set (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
> >         (mult:V8HI (zero_extend:V8HI (vec_select:V8QI (reg:V16QI 115 [ MEM
> <vector(16) unsigned charD.21> [(uint8_tD.3704 *)_21 clique 1 base 1] ])
> >                     (parallel:V16QI [
> >                             (const_int 8 [0x8])
> >                             (const_int 9 [0x9])
> >                             (const_int 10 [0xa])
> >                             (const_int 11 [0xb])
> >                             (const_int 12 [0xc])
> >                             (const_int 13 [0xd])
> >                             (const_int 14 [0xe])
> >                             (const_int 15 [0xf])
> >                         ])))
> >             (zero_extend:V8HI (vec_select:V8QI (reg:V16QI 100 [ vect_cst__36 ])
> >                     (parallel:V16QI [
> >                             (const_int 8 [0x8])
> >                             (const_int 9 [0x9])
> >                             (const_int 10 [0xa])
> >                             (const_int 11 [0xb])
> >                             (const_int 12 [0xc])
> >                             (const_int 13 [0xd])
> >                             (const_int 14 [0xe])
> >                             (const_int 15 [0xf])
> >                         ]))))) "/app/example.c":6:14 2112
> {aarch64_simd_vec_umult_hi_v16qi}
> >      (expr_list:REG_DEAD (reg:V16QI 115 [ MEM <vector(16) unsigned
> charD.21> [(uint8_tD.3704 *)_21 clique 1 base 1] ])
> >         (nil)))
> > (insn 46 44 48 4 (set (reg:V4SI 126)
> >         (mult:V4SI (zero_extend:V4SI (subreg:V4HI (reg:V8HI 125 [
> vect_patt_28.9D.3754 ]) 0))
> >             (zero_extend:V4SI (subreg:V4HI (reg:V8HI 118) 0))))
> "/app/example.c":6:14 2108 {aarch64_intrinsic_vec_umult_lo_v4hi}
> >      (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (subreg:V4HI
> (reg:V8HI 125 [ vect_patt_28.9D.3754 ]) 0))
> >             (const_vector:V4SI [
> >                     (const_int 32897 [0x8081]) repeated x4
> >                 ]))
> >         (nil)))
> > (insn 48 46 51 4 (set (reg:V4SI 128)
> >         (mult:V4SI (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 125 [
> vect_patt_28.9D.3754 ])
> >                     (parallel:V8HI [
> >                             (const_int 4 [0x4])
> >                             (const_int 5 [0x5])
> >                             (const_int 6 [0x6])
> >                             (const_int 7 [0x7])
> >                         ])))
> >             (zero_extend:V4SI (vec_select:V4HI (reg:V8HI 118)
> >                     (parallel:V8HI [
> >                             (const_int 4 [0x4])
> >                             (const_int 5 [0x5])
> >                             (const_int 6 [0x6])
> >                             (const_int 7 [0x7])
> >                         ]))))) "/app/example.c":6:14 2114
> {aarch64_simd_vec_umult_hi_v8hi}
> >      (expr_list:REG_DEAD (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
> >         (expr_list:REG_EQUAL (mult:V4SI (zero_extend:V4SI (vec_select:V4HI
> (reg:V8HI 125 [ vect_patt_28.9D.3754 ])
> >                         (parallel:V8HI [
> >                                 (const_int 4 [0x4])
> >                                 (const_int 5 [0x5])
> >                                 (const_int 6 [0x6])
> >                                 (const_int 7 [0x7])
> >                             ])))
> >                 (const_vector:V4SI [
> >                         (const_int 32897 [0x8081]) repeated x4
> >                     ]))
> >             (nil))))
> > (insn 51 48 52 4 (set (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
> >         (unspec:V8HI [
> >                 (subreg:V8HI (reg:V4SI 126) 0)
> >                 (subreg:V8HI (reg:V4SI 128) 0)
> >             ] UNSPEC_UZP2)) "/app/example.c":6:14 4096 {aarch64_uzp2v8hi}
> >      (expr_list:REG_DEAD (reg:V4SI 128)
> >         (expr_list:REG_DEAD (reg:V4SI 126)
> >             (nil))))
> > (insn 52 51 53 4 (set (reg:V8HI 133 [ vect_patt_25.11D.3756 ])
> >         (lshiftrt:V8HI (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
> >             (const_vector:V8HI [
> >                     (const_int 7 [0x7]) repeated x8
> >                 ]))) "/app/example.c":6:14 1803 {aarch64_simd_lshrv8hi}
> >      (expr_list:REG_DEAD (reg:V8HI 130 [ vect_patt_19.10D.3755 ])
> >         (nil)))
> >
> > And for good:
> >
> > (insn 32 30 34 4 (set (reg:V16QI 118)
> >         (vec_concat:V16QI (unspec:V8QI [
> >                     (reg:V8HI 114 [ vect_patt_28.9 ])
> >                     (reg:V8HI 115)
> >                 ] UNSPEC_ADDHN)
> >             (const_vector:V8QI [
> >                     (const_int 0 [0]) repeated x8
> >                 ]))) "draw.c":6:35 2688 {aarch64_addhnv8hi_insn_le}
> >      (expr_list:REG_EQUAL (vec_concat:V16QI (unspec:V8QI [
> >                     (reg:V8HI 114 [ vect_patt_28.9 ])
> >                     (const_vector:V8HI [
> >                             (const_int 257 [0x101]) repeated x8
> >                         ])
> >                 ] UNSPEC_ADDHN)
> >             (const_vector:V8QI [
> >                     (const_int 0 [0]) repeated x8
> >                 ]))
> >         (nil)))
> > (insn 34 32 35 4 (set (reg:V8HI 117)
> >         (plus:V8HI (zero_extend:V8HI (subreg:V8QI (reg:V16QI 118) 0))
> >             (reg:V8HI 114 [ vect_patt_28.9 ]))) "draw.c":6:35 2635
> {aarch64_uaddwv8qi}
> >      (expr_list:REG_DEAD (reg:V16QI 118)
> >         (expr_list:REG_DEAD (reg:V8HI 114 [ vect_patt_28.9 ])
> >             (nil))))
> > (insn 35 34 37 4 (set (reg:V8HI 103 [ vect_patt_25.10 ])
> >         (lshiftrt:V8HI (reg:V8HI 117)
> >             (const_vector:V8HI [
> >                     (const_int 8 [0x8]) repeated x8
> >                 ]))) "draw.c":6:35 1741 {aarch64_simd_lshrv8hi}
> >      (expr_list:REG_DEAD (reg:V8HI 117)
> >         (nil)))
> > (insn 37 35 39 4 (set (reg:V16QI 122)
> >         (vec_concat:V16QI (unspec:V8QI [
> >                     (reg:V8HI 102 [ vect_patt_28.9 ])
> >                     (reg:V8HI 115)
> >                 ] UNSPEC_ADDHN)
> >             (const_vector:V8QI [
> >                     (const_int 0 [0]) repeated x8
> >                 ]))) "draw.c":6:35 2688 {aarch64_addhnv8hi_insn_le}
> >      (expr_list:REG_EQUAL (vec_concat:V16QI (unspec:V8QI [
> >                     (reg:V8HI 102 [ vect_patt_28.9 ])
> >                     (const_vector:V8HI [
> >                             (const_int 257 [0x101]) repeated x8
> >                         ])
> >                 ] UNSPEC_ADDHN)
> >             (const_vector:V8QI [
> >                     (const_int 0 [0]) repeated x8
> >                 ]))
> >         (nil)))
> > (insn 39 37 40 4 (set (reg:V8HI 121)
> >         (plus:V8HI (zero_extend:V8HI (subreg:V8QI (reg:V16QI 122) 0))
> >             (reg:V8HI 102 [ vect_patt_28.9 ]))) "draw.c":6:35 2635
> {aarch64_uaddwv8qi}
> >      (expr_list:REG_DEAD (reg:V16QI 122)
> >         (expr_list:REG_DEAD (reg:V8HI 102 [ vect_patt_28.9 ])
> >             (nil))))
> > (insn 40 39 41 4 (set (reg:V8HI 104 [ vect_patt_25.10 ])
> >         (lshiftrt:V8HI (reg:V8HI 121)
> >             (const_vector:V8HI [
> >                     (const_int 8 [0x8]) repeated x8
> >                 ]))) "draw.c":6:35 1741 {aarch64_simd_lshrv8hi}
> >
> > Cheers,
> > Tamar
> >
> > >
> > > > > Btw, on x86 we use
> > > > >
> > > > > t.c:3:21: note:   replacing earlier pattern patt_25 = patt_28 / 255;
> > > > > t.c:3:21: note:   with patt_25 = patt_19 >> 7;
> > > > > t.c:3:21: note:   extra pattern stmt: patt_19 = patt_28 h* 32897;
> > > > >
> > > > > which translates to
> > > > >
> > > > >         vpmulhuw        %ymm4, %ymm0, %ymm0
> > > > >         vpmulhuw        %ymm4, %ymm1, %ymm1
> > > > >         vpsrlw  $7, %ymm0, %ymm0
> > > > >         vpsrlw  $7, %ymm1, %ymm1
> > > > >
> > > > > there's odd
> > > > >
> > > > >         vpand   %ymm0, %ymm3, %ymm0
> > > > >         vpand   %ymm1, %ymm3, %ymm1
> > > > >
> > > > > before (%ymm3 is all 0x00ff)
> > > > >
> > > > >         vpackuswb       %ymm1, %ymm0, %ymm0
> > > > >
> > > > > that's not visible in GIMPLE.  I guess aarch64 lacks a highpart
> > > > > multiply
> > > here?
> > > > > In any case, it seems that generic division expansion could be
> > > > > improved here? (choose_multiplier?)
> > > >
> > > > We do generate multiply highpart here, but the patch completely
> > > > avoids multiplies and shifts entirely by creative use of the ISA.
> > > > Another reason I
> > > went for an optab is costing.
> > > > The chosen operations are significantly cheaper on all Arm uarches
> > > > than
> > > Shifts and multiply.
> > > >
> > > > This means we get vectorization in some cases where the cost model
> > > > would correctly say It's too expensive to vectorize. Particularly
> > > > around
> > > double precision.
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > >
> > > > > Richard.
> > > > >
> > > > > > Richard.
> > > > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > > 	* internal-fn.def (DIV_POW2_BITMASK): New.
> > > > > > > 	* optabs.def (udiv_pow2_bitmask_optab): New.
> > > > > > > 	* doc/md.texi: Document it.
> > > > > > > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern):
> > > > > > > Recognize
> > > > > pattern.
> > > > > > >
> > > > > > > gcc/testsuite/ChangeLog:
> > > > > > >
> > > > > > > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > > > > > > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > > > > > > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > > > > > > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > > > > > >
> > > > > > > --- inline copy of patch --
> > > > > > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > > > > > >
> > > > >
> > >
> f3619c505c025f158c2bc64756531877378b22e1..784c49d7d24cef7619e4d613f7
> > > > > > > b4f6e945866c38 100644
> > > > > > > --- a/gcc/doc/md.texi
> > > > > > > +++ b/gcc/doc/md.texi
> > > > > > > @@ -5588,6 +5588,18 @@ signed op0, op1;
> > > > > > >  op0 = op1 / (1 << imm);
> > > > > > >  @end smallexample
> > > > > > >
> > > > > > > +@cindex @code{udiv_pow2_bitmask@var{m2}} instruction
> > > > > > > +pattern
> > > > > @item
> > > > > > > +@samp{udiv_pow2_bitmask@var{m2}} @cindex
> > > > > > > +@code{udiv_pow2_bitmask@var{m2}} instruction pattern
> @itemx
> > > > > > > +@samp{udiv_pow2_bitmask@var{m2}} Unsigned vector division
> > > > > > > +by
> > > an
> > > > > > > +immediate that is equivalent to
> > > > > > > +@samp{2^(bitsize(m) / 2) - 1}.
> > > > > > > +@smallexample
> > > > > > > +unsigned short op0; op1;
> > > > > > > +@dots{}
> > > > > > > +op0 = op1 / 0xffU;
> > > > > > > +@end smallexample
> > > > > > > +
> > > > > > >  @cindex @code{vec_shl_insert_@var{m}} instruction pattern
> > > > > > > @item @samp{vec_shl_insert_@var{m}}  Shift the elements in
> > > > > > > vector input operand 1 left one element (i.e.@:
> > > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index
> > > > > > >
> > > > >
> > >
> d2d550d358606022b1cb44fa842f06e0be507bc3..a3e3cc1520f77683ebf6256898
> > > > > > > f916ed45de475f 100644
> > > > > > > --- a/gcc/internal-fn.def
> > > > > > > +++ b/gcc/internal-fn.def
> > > > > > > @@ -159,6 +159,8 @@ DEF_INTERNAL_OPTAB_FN
> (VEC_SHL_INSERT,
> > > > > ECF_CONST | ECF_NOTHROW,
> > > > > > >  		       vec_shl_insert, binary)
> > > > > > >
> > > > > > >  DEF_INTERNAL_OPTAB_FN (DIV_POW2, ECF_CONST |
> > > ECF_NOTHROW,
> > > > > > > sdiv_pow2, binary)
> > > > > > > +DEF_INTERNAL_OPTAB_FN (DIV_POW2_BITMASK, ECF_CONST |
> > > > > ECF_NOTHROW,
> > > > > > > +		       udiv_pow2_bitmask, unary)
> > > > > > >
> > > > > > >  DEF_INTERNAL_OPTAB_FN (FMS, ECF_CONST, fms, ternary)
> > > > > > > DEF_INTERNAL_OPTAB_FN (FNMA, ECF_CONST, fnma, ternary)
> diff
> > > > > > > --git a/gcc/optabs.def b/gcc/optabs.def index
> > > > > > >
> > > > >
> > >
> 801310ebaa7d469520809bb7efed6820f8eb866b..3f0ac05ef5ad5aed8d6ca391f
> > > > > 4
> > > > > > > eed71b0494e17f 100644
> > > > > > > --- a/gcc/optabs.def
> > > > > > > +++ b/gcc/optabs.def
> > > > > > > @@ -372,6 +372,7 @@ OPTAB_D (smulhrs_optab, "smulhrs$a3")
> > > > > OPTAB_D
> > > > > > > (umulhs_optab, "umulhs$a3")  OPTAB_D (umulhrs_optab,
> > > > > > > "umulhrs$a3") OPTAB_D (sdiv_pow2_optab, "sdiv_pow2$a3")
> > > > > > > +OPTAB_D (udiv_pow2_bitmask_optab,
> "udiv_pow2_bitmask$a2")
> > > > > > >  OPTAB_D (vec_pack_sfix_trunc_optab,
> > > > > > > "vec_pack_sfix_trunc_$a") OPTAB_D (vec_pack_ssat_optab,
> > > > > > > "vec_pack_ssat_$a")  OPTAB_D (vec_pack_trunc_optab,
> > > > > > > "vec_pack_trunc_$a") diff --git
> > > > > > > a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > > > new file mode 100644
> > > > > > > index
> > > > > > >
> > > > >
> > >
> 0000000000000000000000000000000000000000..a7ea3cce4764239c5d281a8f0b
> > > > > > > ead1f6a452de3f
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > > > > > > @@ -0,0 +1,25 @@
> > > > > > > +/* { dg-require-effective-target vect_int } */
> > > > > > > +
> > > > > > > +#include <stdint.h>
> > > > > > > +#include "tree-vect.h"
> > > > > > > +
> > > > > > > +#define N 50
> > > > > > > +#define TYPE uint8_t
> > > > > > > +
> > > > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > > > > +
> > > > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > > > > > > +
> > > > > > > +#include "vect-div-bitmask.h"
> > > > > > > +
> > > > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > > > +detected" "vect" } } */
> > > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > > > new file mode 100644
> > > > > > > index
> > > > > > >
> > > > >
> > >
> 0000000000000000000000000000000000000000..009e16e1b36497e5724410d98
> > > > > 4
> > > > > > > 3f1ce122b26dda
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > > > > > > @@ -0,0 +1,25 @@
> > > > > > > +/* { dg-require-effective-target vect_int } */
> > > > > > > +
> > > > > > > +#include <stdint.h>
> > > > > > > +#include "tree-vect.h"
> > > > > > > +
> > > > > > > +#define N 50
> > > > > > > +#define TYPE uint16_t
> > > > > > > +
> > > > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > > > > +
> > > > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > > > > > > +
> > > > > > > +#include "vect-div-bitmask.h"
> > > > > > > +
> > > > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > > > +detected" "vect" } } */
> > > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > > > new file mode 100644
> > > > > > > index
> > > > > > >
> > > > >
> > >
> 0000000000000000000000000000000000000000..bf35a0bda8333c418e692d942
> > > > > 2
> > > > > > > 0df849cc47930b
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > > > > > > @@ -0,0 +1,26 @@
> > > > > > > +/* { dg-require-effective-target vect_int } */
> > > > > > > +/* { dg-additional-options "-fno-vect-cost-model" { target
> > > > > > > +aarch64*-*-* } } */
> > > > > > > +
> > > > > > > +#include <stdint.h>
> > > > > > > +#include "tree-vect.h"
> > > > > > > +
> > > > > > > +#define N 50
> > > > > > > +#define TYPE uint32_t
> > > > > > > +
> > > > > > > +__attribute__((noipa, noinline, optimize("O1"))) void
> > > > > > > +fun1(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> > > > > > > +}
> > > > > > > +
> > > > > > > +__attribute__((noipa, noinline, optimize("O3"))) void
> > > > > > > +fun2(TYPE* restrict pixel, TYPE level, int n) {
> > > > > > > +  for (int i = 0; i < n; i+=1)
> > > > > > > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> > > > > > > +}
> > > > > > > +
> > > > > > > +#include "vect-div-bitmask.h"
> > > > > > > +
> > > > > > > +/* { dg-final { scan-tree-dump "vect_recog_divmod_pattern:
> > > > > > > +detected" "vect" } } */
> > > > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > > > new file mode 100644
> > > > > > > index
> > > > > > >
> > > > >
> > >
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > > > > 8
> > > > > > > 32f28ebd07993e
> > > > > > > --- /dev/null
> > > > > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > > > > > > @@ -0,0 +1,43 @@
> > > > > > > +#include <stdio.h>
> > > > > > > +
> > > > > > > +#ifndef N
> > > > > > > +#define N 65
> > > > > > > +#endif
> > > > > > > +
> > > > > > > +#ifndef TYPE
> > > > > > > +#define TYPE uint32_t
> > > > > > > +#endif
> > > > > > > +
> > > > > > > +#ifndef DEBUG
> > > > > > > +#define DEBUG 0
> > > > > > > +#endif
> > > > > > > +
> > > > > > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > > > > > > +
> > > > > > > +int main ()
> > > > > > > +{
> > > > > > > +  TYPE a[N];
> > > > > > > +  TYPE b[N];
> > > > > > > +
> > > > > > > +  for (int i = 0; i < N; ++i)
> > > > > > > +    {
> > > > > > > +      a[i] = BASE + i * 13;
> > > > > > > +      b[i] = BASE + i * 13;
> > > > > > > +      if (DEBUG)
> > > > > > > +        printf ("%d: 0x%x\n", i, a[i]);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  fun1 (a, N / 2, N);
> > > > > > > +  fun2 (b, N / 2, N);
> > > > > > > +
> > > > > > > +  for (int i = 0; i < N; ++i)
> > > > > > > +    {
> > > > > > > +      if (DEBUG)
> > > > > > > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > > > > > > +
> > > > > > > +      if (a[i] != b[i])
> > > > > > > +        __builtin_abort ();
> > > > > > > +    }
> > > > > > > +  return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > diff --git a/gcc/tree-vect-patterns.cc
> > > > > > > b/gcc/tree-vect-patterns.cc index
> > > > > > >
> > > > >
> > >
> 217bdfd7045a22578a35bb891a4318d741071872..a738558cb8d12296bff462d71
> > > > > 6
> > > > > > > 310ca8d82957b5 100644
> > > > > > > --- a/gcc/tree-vect-patterns.cc
> > > > > > > +++ b/gcc/tree-vect-patterns.cc
> > > > > > > @@ -3558,6 +3558,33 @@ vect_recog_divmod_pattern (vec_info
> > > > > > > *vinfo,
> > > > > > >
> > > > > > >        return pattern_stmt;
> > > > > > >      }
> > > > > > > +  else if ((TYPE_UNSIGNED (itype) || tree_int_cst_sgn (oprnd1)
> != 1)
> > > > > > > +	   && rhs_code != TRUNC_MOD_EXPR)
> > > > > > > +    {
> > > > > > > +      wide_int icst = wi::to_wide (oprnd1);
> > > > > > > +      wide_int val = wi::add (icst, 1);
> > > > > > > +      int pow = wi::exact_log2 (val);
> > > > > > > +      if (pow == (prec / 2))
> > > > > > > +	{
> > > > > > > +	  /* Pattern detected.  */
> > > > > > > +	  vect_pattern_detected ("vect_recog_divmod_pattern",
> > > > > > > +last_stmt);
> > > > > > > +
> > > > > > > +	  *type_out = vectype;
> > > > > > > +
> > > > > > > +	  /* Check if the target supports this internal function.  */
> > > > > > > +	  internal_fn ifn = IFN_DIV_POW2_BITMASK;
> > > > > > > +	  if (direct_internal_fn_supported_p (ifn, vectype,
> > > > > OPTIMIZE_FOR_SPEED))
> > > > > > > +	    {
> > > > > > > +	      tree var_div = vect_recog_temp_ssa_var (itype, NULL);
> > > > > > > +	      gimple *div_stmt = gimple_build_call_internal (ifn,
> > > > > > > +1,
> > > oprnd0);
> > > > > > > +	      gimple_call_set_lhs (div_stmt, var_div);
> > > > > > > +
> > > > > > > +	      gimple_set_location (div_stmt, gimple_location
> > > > > > > +(last_stmt));
> > > > > > > +
> > > > > > > +	      return div_stmt;
> > > > > > > +	    }
> > > > > > > +	}
> > > > > > > +    }
> > > > > > >
> > > > > > >    if (prec > HOST_BITS_PER_WIDE_INT
> > > > > > >        || integer_zerop (oprnd1))
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > > >
> > > > > >
> > > > > >
> > > > >
> > > > > --
> > > > > Richard Biener <rguenther@suse.de> SUSE Software Solutions
> > > > > Germany GmbH, Frankenstraße 146, 90461 Nuernberg, Germany; GF:
> > > > > Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
> HRB
> > > > > 36809 (AG Nuernberg)
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> > > Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> > > Boudien Moerman; HRB 36809 (AG Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Frankenstraße 146, 90461
> Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald,
> Boudien Moerman; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-14 13:18           ` Richard Biener
  2022-06-14 13:38             ` Tamar Christina
@ 2022-06-14 13:42             ` Richard Sandiford
  2022-06-14 15:57               ` Tamar Christina
  1 sibling, 1 reply; 35+ messages in thread
From: Richard Sandiford @ 2022-06-14 13:42 UTC (permalink / raw)
  To: Richard Biener; +Cc: Tamar Christina, gcc-patches, nd

Richard Biener <rguenther@suse.de> writes:
> On Mon, 13 Jun 2022, Tamar Christina wrote:
>
>> > -----Original Message-----
>> > From: Richard Biener <rguenther@suse.de>
>> > Sent: Monday, June 13, 2022 12:48 PM
>> > To: Tamar Christina <Tamar.Christina@arm.com>
>> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
>> > <Richard.Sandiford@arm.com>
>> > Subject: RE: [PATCH 1/2]middle-end Support optimized division by pow2
>> > bitmask
>> > 
>> > On Mon, 13 Jun 2022, Tamar Christina wrote:
>> > 
>> > > > -----Original Message-----
>> > > > From: Richard Biener <rguenther@suse.de>
>> > > > Sent: Monday, June 13, 2022 10:39 AM
>> > > > To: Tamar Christina <Tamar.Christina@arm.com>
>> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
>> > > > <Richard.Sandiford@arm.com>
>> > > > Subject: Re: [PATCH 1/2]middle-end Support optimized division by
>> > > > pow2 bitmask
>> > > >
>> > > > On Mon, 13 Jun 2022, Richard Biener wrote:
>> > > >
>> > > > > On Thu, 9 Jun 2022, Tamar Christina wrote:
>> > > > >
>> > > > > > Hi All,
>> > > > > >
>> > > > > > In plenty of image and video processing code it's common to
>> > > > > > modify pixel values by a widening operation and then scale them
>> > > > > > back into range
>> > > > by dividing by 255.
>> > > > > >
>> > > > > > This patch adds an optab to allow us to emit an optimized
>> > > > > > sequence when doing an unsigned division that is equivalent to:
>> > > > > >
>> > > > > >    x = y / (2 ^ (bitsize (y)/2)-1
>> > > > > >
>> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
>> > > > > > x86_64-pc-linux-gnu and no issues.
>> > > > > >
>> > > > > > Ok for master?
>> > > > >
>> > > > > Looking at 2/2 it seems that this is the wrong way to attack the
>> > > > > problem.  The ISA doesn't have such instruction so adding an optab
>> > > > > looks premature.  I suppose that there's no unsigned vector
>> > > > > integer division and thus we open-code that in a different way?
>> > > > > Isn't the correct thing then to fixup that open-coding if it is more
>> > efficient?
>> > > >
>> > >
>> > > The problem is that even if you fixup the open-coding it would need to
>> > > be something target specific? The sequence of instructions we generate
>> > > don't have a GIMPLE representation.  So whatever is generated I'd have
>> > > to fixup in RTL then.
>> > 
>> > What's the operation that doesn't have a GIMPLE representation?
>> 
>> For NEON use two operations:
>> 1. Add High narrowing lowpart, essentially doing (a +w b) >>.n bitsize(a)/2
>>     Where the + widens and the >> narrows.  So you give it two shorts, get a byte
>> 2. Add widening add of lowpart so basically lowpart (a +w b)
>> 
>> For SVE2 we use a different sequence, we use two back-to-back sequences of:
>> 1. Add narrow high part (bottom).  In SVE the Top and Bottom instructions select
>>    Even and odd elements of the vector rather than "top half" and "bottom half".
>> 
>>    So this instruction does : Add each vector element of the first source vector to the
>>    corresponding vector element of the second source vector, and place the most
>>     significant half of the result in the even-numbered half-width destination elements,
>>     while setting the odd-numbered elements to zero.
>> 
>> So there's an explicit permute in there. The instructions are sufficiently different that there
>> wouldn't be a single GIMPLE representation.
>
> I see.  Are these also useful to express scalar integer division?
>
> I'll defer to others to ack the special udiv_pow2_bitmask optab
> or suggest some piecemail things other targets might be able to do as 
> well.  It does look very special.  I'd also bikeshed it to
> udiv_pow2m1 since 'bitmask' is less obvious than 2^n-1 (assuming
> I interpreted 'bitmask' correctly ;)).  It seems to be even less
> general since it is an unary op and the actual divisor is constrained
> by the mode itself?

Yeah, those were my concerns as well.  For n-bit numbers, the same kind
of arithmetic transformation can be used for any 2^m-1 for m in [n/2, n),
so from a target-independent point of view, m==n/2 isn't particularly
special.  Hard-coding one value of m would make sense if there was an
underlying instruction that did exactly this, but like you say, there
isn't.

Would a compromise be to define an optab for ADDHN and then add a vector
pattern for this division that (at least initially) prefers ADDHN over
the current approach whenever ADDHN is available?  We could then adapt
the conditions on the pattern if other targets also provide ADDHN but
don't want this transform.  (I think the other instructions in the
pattern already have optabs.)

That still leaves open the question about what to do about SVE2,
but the underlying problem there is that the vectoriser doesn't
know about the B/T layout.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-14 13:42             ` Richard Sandiford
@ 2022-06-14 15:57               ` Tamar Christina
  2022-06-14 16:09                 ` Richard Biener
  2022-06-22  0:34                 ` Tamar Christina
  0 siblings, 2 replies; 35+ messages in thread
From: Tamar Christina @ 2022-06-14 15:57 UTC (permalink / raw)
  To: Richard Sandiford, Richard Biener; +Cc: gcc-patches, nd



> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Tuesday, June 14, 2022 2:43 PM
> To: Richard Biener <rguenther@suse.de>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org;
> nd <nd@arm.com>
> Subject: Re: [PATCH 1/2]middle-end Support optimized division by pow2
> bitmask
> 
> Richard Biener <rguenther@suse.de> writes:
> > On Mon, 13 Jun 2022, Tamar Christina wrote:
> >
> >> > -----Original Message-----
> >> > From: Richard Biener <rguenther@suse.de>
> >> > Sent: Monday, June 13, 2022 12:48 PM
> >> > To: Tamar Christina <Tamar.Christina@arm.com>
> >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> >> > <Richard.Sandiford@arm.com>
> >> > Subject: RE: [PATCH 1/2]middle-end Support optimized division by
> >> > pow2 bitmask
> >> >
> >> > On Mon, 13 Jun 2022, Tamar Christina wrote:
> >> >
> >> > > > -----Original Message-----
> >> > > > From: Richard Biener <rguenther@suse.de>
> >> > > > Sent: Monday, June 13, 2022 10:39 AM
> >> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> >> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> >> > > > <Richard.Sandiford@arm.com>
> >> > > > Subject: Re: [PATCH 1/2]middle-end Support optimized division
> >> > > > by
> >> > > > pow2 bitmask
> >> > > >
> >> > > > On Mon, 13 Jun 2022, Richard Biener wrote:
> >> > > >
> >> > > > > On Thu, 9 Jun 2022, Tamar Christina wrote:
> >> > > > >
> >> > > > > > Hi All,
> >> > > > > >
> >> > > > > > In plenty of image and video processing code it's common to
> >> > > > > > modify pixel values by a widening operation and then scale
> >> > > > > > them back into range
> >> > > > by dividing by 255.
> >> > > > > >
> >> > > > > > This patch adds an optab to allow us to emit an optimized
> >> > > > > > sequence when doing an unsigned division that is equivalent to:
> >> > > > > >
> >> > > > > >    x = y / (2 ^ (bitsize (y)/2)-1
> >> > > > > >
> >> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> >> > > > > > x86_64-pc-linux-gnu and no issues.
> >> > > > > >
> >> > > > > > Ok for master?
> >> > > > >
> >> > > > > Looking at 2/2 it seems that this is the wrong way to attack
> >> > > > > the problem.  The ISA doesn't have such instruction so adding
> >> > > > > an optab looks premature.  I suppose that there's no unsigned
> >> > > > > vector integer division and thus we open-code that in a different
> way?
> >> > > > > Isn't the correct thing then to fixup that open-coding if it
> >> > > > > is more
> >> > efficient?
> >> > > >
> >> > >
> >> > > The problem is that even if you fixup the open-coding it would
> >> > > need to be something target specific? The sequence of
> >> > > instructions we generate don't have a GIMPLE representation.  So
> >> > > whatever is generated I'd have to fixup in RTL then.
> >> >
> >> > What's the operation that doesn't have a GIMPLE representation?
> >>
> >> For NEON use two operations:
> >> 1. Add High narrowing lowpart, essentially doing (a +w b) >>.n bitsize(a)/2
> >>     Where the + widens and the >> narrows.  So you give it two
> >> shorts, get a byte 2. Add widening add of lowpart so basically
> >> lowpart (a +w b)
> >>
> >> For SVE2 we use a different sequence, we use two back-to-back
> sequences of:
> >> 1. Add narrow high part (bottom).  In SVE the Top and Bottom instructions
> select
> >>    Even and odd elements of the vector rather than "top half" and "bottom
> half".
> >>
> >>    So this instruction does : Add each vector element of the first source
> vector to the
> >>    corresponding vector element of the second source vector, and place
> the most
> >>     significant half of the result in the even-numbered half-width
> destination elements,
> >>     while setting the odd-numbered elements to zero.
> >>
> >> So there's an explicit permute in there. The instructions are
> >> sufficiently different that there wouldn't be a single GIMPLE
> representation.
> >
> > I see.  Are these also useful to express scalar integer division?
> >
> > I'll defer to others to ack the special udiv_pow2_bitmask optab or
> > suggest some piecemail things other targets might be able to do as
> > well.  It does look very special.  I'd also bikeshed it to
> > udiv_pow2m1 since 'bitmask' is less obvious than 2^n-1 (assuming I
> > interpreted 'bitmask' correctly ;)).  It seems to be even less general
> > since it is an unary op and the actual divisor is constrained by the
> > mode itself?
> 
> Yeah, those were my concerns as well.  For n-bit numbers, the same kind of
> arithmetic transformation can be used for any 2^m-1 for m in [n/2, n), so
> from a target-independent point of view, m==n/2 isn't particularly special.
> Hard-coding one value of m would make sense if there was an underlying
> instruction that did exactly this, but like you say, there isn't.
> 
> Would a compromise be to define an optab for ADDHN and then add a vector
> pattern for this division that (at least initially) prefers ADDHN over the
> current approach whenever ADDHN is available?  We could then adapt the
> conditions on the pattern if other targets also provide ADDHN but don't want
> this transform.  (I think the other instructions in the pattern already have
> optabs.)
> 
> That still leaves open the question about what to do about SVE2, but the
> underlying problem there is that the vectoriser doesn't know about the B/T
> layout.

Wouldn't it be better to just generalize the optab and to pass on the mask?
I'd prefer to do that than teach the vectorizer about ADDHN (which can't be
easily done now) let alone teaching it about B/T.   It also seems somewhat
unnecessary to diverge the implementation here in the mid-end. After all,
you can generate better SSE code here as well, so focusing on generating ISA
specific code from here for each ISA seems like the wrong approach to me.

Thanks,
Tamar

> 
> Thanks,
> Richard

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-14 15:57               ` Tamar Christina
@ 2022-06-14 16:09                 ` Richard Biener
  2022-06-22  0:34                 ` Tamar Christina
  1 sibling, 0 replies; 35+ messages in thread
From: Richard Biener @ 2022-06-14 16:09 UTC (permalink / raw)
  To: Tamar Christina via Gcc-patches; +Cc: Richard Sandiford, nd



> Am 14.06.2022 um 17:58 schrieb Tamar Christina via Gcc-patches <gcc-patches@gcc.gnu.org>:
> 
> 
> 
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Tuesday, June 14, 2022 2:43 PM
>> To: Richard Biener <rguenther@suse.de>
>> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org;
>> nd <nd@arm.com>
>> Subject: Re: [PATCH 1/2]middle-end Support optimized division by pow2
>> bitmask
>> 
>> Richard Biener <rguenther@suse.de> writes:
>>>> On Mon, 13 Jun 2022, Tamar Christina wrote:
>>> 
>>>>> -----Original Message-----
>>>>> From: Richard Biener <rguenther@suse.de>
>>>>> Sent: Monday, June 13, 2022 12:48 PM
>>>>> To: Tamar Christina <Tamar.Christina@arm.com>
>>>>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
>>>>> <Richard.Sandiford@arm.com>
>>>>> Subject: RE: [PATCH 1/2]middle-end Support optimized division by
>>>>> pow2 bitmask
>>>>> 
>>>>> On Mon, 13 Jun 2022, Tamar Christina wrote:
>>>>> 
>>>>>>> -----Original Message-----
>>>>>>> From: Richard Biener <rguenther@suse.de>
>>>>>>> Sent: Monday, June 13, 2022 10:39 AM
>>>>>>> To: Tamar Christina <Tamar.Christina@arm.com>
>>>>>>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
>>>>>>> <Richard.Sandiford@arm.com>
>>>>>>> Subject: Re: [PATCH 1/2]middle-end Support optimized division
>>>>>>> by
>>>>>>> pow2 bitmask
>>>>>>> 
>>>>>>> On Mon, 13 Jun 2022, Richard Biener wrote:
>>>>>>> 
>>>>>>>> On Thu, 9 Jun 2022, Tamar Christina wrote:
>>>>>>>> 
>>>>>>>>> Hi All,
>>>>>>>>> 
>>>>>>>>> In plenty of image and video processing code it's common to
>>>>>>>>> modify pixel values by a widening operation and then scale
>>>>>>>>> them back into range
>>>>>>> by dividing by 255.
>>>>>>>>> 
>>>>>>>>> This patch adds an optab to allow us to emit an optimized
>>>>>>>>> sequence when doing an unsigned division that is equivalent to:
>>>>>>>>> 
>>>>>>>>>   x = y / (2 ^ (bitsize (y)/2)-1
>>>>>>>>> 
>>>>>>>>> Bootstrapped Regtested on aarch64-none-linux-gnu,
>>>>>>>>> x86_64-pc-linux-gnu and no issues.
>>>>>>>>> 
>>>>>>>>> Ok for master?
>>>>>>>> 
>>>>>>>> Looking at 2/2 it seems that this is the wrong way to attack
>>>>>>>> the problem.  The ISA doesn't have such instruction so adding
>>>>>>>> an optab looks premature.  I suppose that there's no unsigned
>>>>>>>> vector integer division and thus we open-code that in a different
>> way?
>>>>>>>> Isn't the correct thing then to fixup that open-coding if it
>>>>>>>> is more
>>>>> efficient?
>>>>>>> 
>>>>>> 
>>>>>> The problem is that even if you fixup the open-coding it would
>>>>>> need to be something target specific? The sequence of
>>>>>> instructions we generate don't have a GIMPLE representation.  So
>>>>>> whatever is generated I'd have to fixup in RTL then.
>>>>> 
>>>>> What's the operation that doesn't have a GIMPLE representation?
>>>> 
>>>> For NEON use two operations:
>>>> 1. Add High narrowing lowpart, essentially doing (a +w b) >>.n bitsize(a)/2
>>>>    Where the + widens and the >> narrows.  So you give it two
>>>> shorts, get a byte 2. Add widening add of lowpart so basically
>>>> lowpart (a +w b)
>>>> 
>>>> For SVE2 we use a different sequence, we use two back-to-back
>> sequences of:
>>>> 1. Add narrow high part (bottom).  In SVE the Top and Bottom instructions
>> select
>>>>   Even and odd elements of the vector rather than "top half" and "bottom
>> half".
>>>> 
>>>>   So this instruction does : Add each vector element of the first source
>> vector to the
>>>>   corresponding vector element of the second source vector, and place
>> the most
>>>>    significant half of the result in the even-numbered half-width
>> destination elements,
>>>>    while setting the odd-numbered elements to zero.
>>>> 
>>>> So there's an explicit permute in there. The instructions are
>>>> sufficiently different that there wouldn't be a single GIMPLE
>> representation.
>>> 
>>> I see.  Are these also useful to express scalar integer division?
>>> 
>>> I'll defer to others to ack the special udiv_pow2_bitmask optab or
>>> suggest some piecemail things other targets might be able to do as
>>> well.  It does look very special.  I'd also bikeshed it to
>>> udiv_pow2m1 since 'bitmask' is less obvious than 2^n-1 (assuming I
>>> interpreted 'bitmask' correctly ;)).  It seems to be even less general
>>> since it is an unary op and the actual divisor is constrained by the
>>> mode itself?
>> 
>> Yeah, those were my concerns as well.  For n-bit numbers, the same kind of
>> arithmetic transformation can be used for any 2^m-1 for m in [n/2, n), so
>> from a target-independent point of view, m==n/2 isn't particularly special.
>> Hard-coding one value of m would make sense if there was an underlying
>> instruction that did exactly this, but like you say, there isn't.
>> 
>> Would a compromise be to define an optab for ADDHN and then add a vector
>> pattern for this division that (at least initially) prefers ADDHN over the
>> current approach whenever ADDHN is available?  We could then adapt the
>> conditions on the pattern if other targets also provide ADDHN but don't want
>> this transform.  (I think the other instructions in the pattern already have
>> optabs.)
>> 
>> That still leaves open the question about what to do about SVE2, but the
>> underlying problem there is that the vectoriser doesn't know about the B/T
>> layout.
> 
> Wouldn't it be better to just generalize the optab and to pass on the mask?

You could implement udivvhiN3 as well, but we’d need to make sure to test predicates which should make sure only supported constants are let through.

> I'd prefer to do that than teach the vectorizer about ADDHN (which can't be
> easily done now) let alone teaching it about B/T.   It also seems somewhat
> unnecessary to diverge the implementation here in the mid-end. After all,
> you can generate better SSE code here as well, so focusing on generating ISA
> specific code from here for each ISA seems like the wrong approach to me.
> 
> Thanks,
> Tamar
> 
>> 
>> Thanks,
>> Richard

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-14 15:57               ` Tamar Christina
  2022-06-14 16:09                 ` Richard Biener
@ 2022-06-22  0:34                 ` Tamar Christina
  2022-06-26 19:55                   ` Jeff Law
  1 sibling, 1 reply; 35+ messages in thread
From: Tamar Christina @ 2022-06-22  0:34 UTC (permalink / raw)
  To: Richard Sandiford, Richard Biener; +Cc: gcc-patches, nd

> -----Original Message-----
> From: Tamar Christina
> Sent: Tuesday, June 14, 2022 4:58 PM
> To: Richard Sandiford <richard.sandiford@arm.com>; Richard Biener
> <rguenther@suse.de>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: RE: [PATCH 1/2]middle-end Support optimized division by pow2
> bitmask
> 
> 
> 
> > -----Original Message-----
> > From: Richard Sandiford <richard.sandiford@arm.com>
> > Sent: Tuesday, June 14, 2022 2:43 PM
> > To: Richard Biener <rguenther@suse.de>
> > Cc: Tamar Christina <Tamar.Christina@arm.com>;
> > gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > Subject: Re: [PATCH 1/2]middle-end Support optimized division by pow2
> > bitmask
> >
> > Richard Biener <rguenther@suse.de> writes:
> > > On Mon, 13 Jun 2022, Tamar Christina wrote:
> > >
> > >> > -----Original Message-----
> > >> > From: Richard Biener <rguenther@suse.de>
> > >> > Sent: Monday, June 13, 2022 12:48 PM
> > >> > To: Tamar Christina <Tamar.Christina@arm.com>
> > >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
> > >> > <Richard.Sandiford@arm.com>
> > >> > Subject: RE: [PATCH 1/2]middle-end Support optimized division by
> > >> > pow2 bitmask
> > >> >
> > >> > On Mon, 13 Jun 2022, Tamar Christina wrote:
> > >> >
> > >> > > > -----Original Message-----
> > >> > > > From: Richard Biener <rguenther@suse.de>
> > >> > > > Sent: Monday, June 13, 2022 10:39 AM
> > >> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > >> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard
> > >> > > > Sandiford <Richard.Sandiford@arm.com>
> > >> > > > Subject: Re: [PATCH 1/2]middle-end Support optimized division
> > >> > > > by
> > >> > > > pow2 bitmask
> > >> > > >
> > >> > > > On Mon, 13 Jun 2022, Richard Biener wrote:
> > >> > > >
> > >> > > > > On Thu, 9 Jun 2022, Tamar Christina wrote:
> > >> > > > >
> > >> > > > > > Hi All,
> > >> > > > > >
> > >> > > > > > In plenty of image and video processing code it's common
> > >> > > > > > to modify pixel values by a widening operation and then
> > >> > > > > > scale them back into range
> > >> > > > by dividing by 255.
> > >> > > > > >
> > >> > > > > > This patch adds an optab to allow us to emit an optimized
> > >> > > > > > sequence when doing an unsigned division that is equivalent
> to:
> > >> > > > > >
> > >> > > > > >    x = y / (2 ^ (bitsize (y)/2)-1
> > >> > > > > >
> > >> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > >> > > > > > x86_64-pc-linux-gnu and no issues.
> > >> > > > > >
> > >> > > > > > Ok for master?
> > >> > > > >
> > >> > > > > Looking at 2/2 it seems that this is the wrong way to
> > >> > > > > attack the problem.  The ISA doesn't have such instruction
> > >> > > > > so adding an optab looks premature.  I suppose that there's
> > >> > > > > no unsigned vector integer division and thus we open-code
> > >> > > > > that in a different
> > way?
> > >> > > > > Isn't the correct thing then to fixup that open-coding if
> > >> > > > > it is more
> > >> > efficient?
> > >> > > >
> > >> > >
> > >> > > The problem is that even if you fixup the open-coding it would
> > >> > > need to be something target specific? The sequence of
> > >> > > instructions we generate don't have a GIMPLE representation.
> > >> > > So whatever is generated I'd have to fixup in RTL then.
> > >> >
> > >> > What's the operation that doesn't have a GIMPLE representation?
> > >>
> > >> For NEON use two operations:
> > >> 1. Add High narrowing lowpart, essentially doing (a +w b) >>.n
> bitsize(a)/2
> > >>     Where the + widens and the >> narrows.  So you give it two
> > >> shorts, get a byte 2. Add widening add of lowpart so basically
> > >> lowpart (a +w b)
> > >>
> > >> For SVE2 we use a different sequence, we use two back-to-back
> > sequences of:
> > >> 1. Add narrow high part (bottom).  In SVE the Top and Bottom
> > >> instructions
> > select
> > >>    Even and odd elements of the vector rather than "top half" and
> > >> "bottom
> > half".
> > >>
> > >>    So this instruction does : Add each vector element of the first
> > >> source
> > vector to the
> > >>    corresponding vector element of the second source vector, and
> > >> place
> > the most
> > >>     significant half of the result in the even-numbered half-width
> > destination elements,
> > >>     while setting the odd-numbered elements to zero.
> > >>
> > >> So there's an explicit permute in there. The instructions are
> > >> sufficiently different that there wouldn't be a single GIMPLE
> > representation.
> > >
> > > I see.  Are these also useful to express scalar integer division?
> > >
> > > I'll defer to others to ack the special udiv_pow2_bitmask optab or
> > > suggest some piecemail things other targets might be able to do as
> > > well.  It does look very special.  I'd also bikeshed it to
> > > udiv_pow2m1 since 'bitmask' is less obvious than 2^n-1 (assuming I
> > > interpreted 'bitmask' correctly ;)).  It seems to be even less
> > > general since it is an unary op and the actual divisor is
> > > constrained by the mode itself?
> >
> > Yeah, those were my concerns as well.  For n-bit numbers, the same
> > kind of arithmetic transformation can be used for any 2^m-1 for m in
> > [n/2, n), so from a target-independent point of view, m==n/2 isn't
> particularly special.
> > Hard-coding one value of m would make sense if there was an underlying
> > instruction that did exactly this, but like you say, there isn't.
> >
> > Would a compromise be to define an optab for ADDHN and then add a
> > vector pattern for this division that (at least initially) prefers
> > ADDHN over the current approach whenever ADDHN is available?  We
> could
> > then adapt the conditions on the pattern if other targets also provide
> > ADDHN but don't want this transform.  (I think the other instructions
> > in the pattern already have
> > optabs.)
> >
> > That still leaves open the question about what to do about SVE2, but
> > the underlying problem there is that the vectoriser doesn't know about
> > the B/T layout.
> 
> Wouldn't it be better to just generalize the optab and to pass on the mask?
> I'd prefer to do that than teach the vectorizer about ADDHN (which can't be
> easily done now) let alone teaching it about B/T.   It also seems somewhat
> unnecessary to diverge the implementation here in the mid-end. After all,
> you can generate better SSE code here as well, so focusing on generating ISA
> specific code from here for each ISA seems like the wrong approach to me.

Ping, is there any consensus here? 

Thanks,
Tamar

> 
> Thanks,
> Tamar
> 
> >
> > Thanks,
> > Richard

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2]middle-end Support optimized division by pow2 bitmask
  2022-06-22  0:34                 ` Tamar Christina
@ 2022-06-26 19:55                   ` Jeff Law
  0 siblings, 0 replies; 35+ messages in thread
From: Jeff Law @ 2022-06-26 19:55 UTC (permalink / raw)
  To: gcc-patches



On 6/21/2022 6:34 PM, Tamar Christina via Gcc-patches wrote:
>> -----Original Message-----
>> From: Tamar Christina
>> Sent: Tuesday, June 14, 2022 4:58 PM
>> To: Richard Sandiford <richard.sandiford@arm.com>; Richard Biener
>> <rguenther@suse.de>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
>> Subject: RE: [PATCH 1/2]middle-end Support optimized division by pow2
>> bitmask
>>
>>
>>
>>> -----Original Message-----
>>> From: Richard Sandiford <richard.sandiford@arm.com>
>>> Sent: Tuesday, June 14, 2022 2:43 PM
>>> To: Richard Biener <rguenther@suse.de>
>>> Cc: Tamar Christina <Tamar.Christina@arm.com>;
>>> gcc-patches@gcc.gnu.org; nd <nd@arm.com>
>>> Subject: Re: [PATCH 1/2]middle-end Support optimized division by pow2
>>> bitmask
>>>
>>> Richard Biener <rguenther@suse.de> writes:
>>>> On Mon, 13 Jun 2022, Tamar Christina wrote:
>>>>
>>>>>> -----Original Message-----
>>>>>> From: Richard Biener <rguenther@suse.de>
>>>>>> Sent: Monday, June 13, 2022 12:48 PM
>>>>>> To: Tamar Christina <Tamar.Christina@arm.com>
>>>>>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Sandiford
>>>>>> <Richard.Sandiford@arm.com>
>>>>>> Subject: RE: [PATCH 1/2]middle-end Support optimized division by
>>>>>> pow2 bitmask
>>>>>>
>>>>>> On Mon, 13 Jun 2022, Tamar Christina wrote:
>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Richard Biener <rguenther@suse.de>
>>>>>>>> Sent: Monday, June 13, 2022 10:39 AM
>>>>>>>> To: Tamar Christina <Tamar.Christina@arm.com>
>>>>>>>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard
>>>>>>>> Sandiford <Richard.Sandiford@arm.com>
>>>>>>>> Subject: Re: [PATCH 1/2]middle-end Support optimized division
>>>>>>>> by
>>>>>>>> pow2 bitmask
>>>>>>>>
>>>>>>>> On Mon, 13 Jun 2022, Richard Biener wrote:
>>>>>>>>
>>>>>>>>> On Thu, 9 Jun 2022, Tamar Christina wrote:
>>>>>>>>>
>>>>>>>>>> Hi All,
>>>>>>>>>>
>>>>>>>>>> In plenty of image and video processing code it's common
>>>>>>>>>> to modify pixel values by a widening operation and then
>>>>>>>>>> scale them back into range
>>>>>>>> by dividing by 255.
>>>>>>>>>> This patch adds an optab to allow us to emit an optimized
>>>>>>>>>> sequence when doing an unsigned division that is equivalent
>> to:
>>>>>>>>>>     x = y / (2 ^ (bitsize (y)/2)-1
>>>>>>>>>>
>>>>>>>>>> Bootstrapped Regtested on aarch64-none-linux-gnu,
>>>>>>>>>> x86_64-pc-linux-gnu and no issues.
>>>>>>>>>>
>>>>>>>>>> Ok for master?
>>>>>>>>> Looking at 2/2 it seems that this is the wrong way to
>>>>>>>>> attack the problem.  The ISA doesn't have such instruction
>>>>>>>>> so adding an optab looks premature.  I suppose that there's
>>>>>>>>> no unsigned vector integer division and thus we open-code
>>>>>>>>> that in a different
>>> way?
>>>>>>>>> Isn't the correct thing then to fixup that open-coding if
>>>>>>>>> it is more
>>>>>> efficient?
>>>>>>> The problem is that even if you fixup the open-coding it would
>>>>>>> need to be something target specific? The sequence of
>>>>>>> instructions we generate don't have a GIMPLE representation.
>>>>>>> So whatever is generated I'd have to fixup in RTL then.
>>>>>> What's the operation that doesn't have a GIMPLE representation?
>>>>> For NEON use two operations:
>>>>> 1. Add High narrowing lowpart, essentially doing (a +w b) >>.n
>> bitsize(a)/2
>>>>>      Where the + widens and the >> narrows.  So you give it two
>>>>> shorts, get a byte 2. Add widening add of lowpart so basically
>>>>> lowpart (a +w b)
>>>>>
>>>>> For SVE2 we use a different sequence, we use two back-to-back
>>> sequences of:
>>>>> 1. Add narrow high part (bottom).  In SVE the Top and Bottom
>>>>> instructions
>>> select
>>>>>     Even and odd elements of the vector rather than "top half" and
>>>>> "bottom
>>> half".
>>>>>     So this instruction does : Add each vector element of the first
>>>>> source
>>> vector to the
>>>>>     corresponding vector element of the second source vector, and
>>>>> place
>>> the most
>>>>>      significant half of the result in the even-numbered half-width
>>> destination elements,
>>>>>      while setting the odd-numbered elements to zero.
>>>>>
>>>>> So there's an explicit permute in there. The instructions are
>>>>> sufficiently different that there wouldn't be a single GIMPLE
>>> representation.
>>>> I see.  Are these also useful to express scalar integer division?
>>>>
>>>> I'll defer to others to ack the special udiv_pow2_bitmask optab or
>>>> suggest some piecemail things other targets might be able to do as
>>>> well.  It does look very special.  I'd also bikeshed it to
>>>> udiv_pow2m1 since 'bitmask' is less obvious than 2^n-1 (assuming I
>>>> interpreted 'bitmask' correctly ;)).  It seems to be even less
>>>> general since it is an unary op and the actual divisor is
>>>> constrained by the mode itself?
>>> Yeah, those were my concerns as well.  For n-bit numbers, the same
>>> kind of arithmetic transformation can be used for any 2^m-1 for m in
>>> [n/2, n), so from a target-independent point of view, m==n/2 isn't
>> particularly special.
>>> Hard-coding one value of m would make sense if there was an underlying
>>> instruction that did exactly this, but like you say, there isn't.
>>>
>>> Would a compromise be to define an optab for ADDHN and then add a
>>> vector pattern for this division that (at least initially) prefers
>>> ADDHN over the current approach whenever ADDHN is available?  We
>> could
>>> then adapt the conditions on the pattern if other targets also provide
>>> ADDHN but don't want this transform.  (I think the other instructions
>>> in the pattern already have
>>> optabs.)
>>>
>>> That still leaves open the question about what to do about SVE2, but
>>> the underlying problem there is that the vectoriser doesn't know about
>>> the B/T layout.
>> Wouldn't it be better to just generalize the optab and to pass on the mask?
>> I'd prefer to do that than teach the vectorizer about ADDHN (which can't be
>> easily done now) let alone teaching it about B/T.   It also seems somewhat
>> unnecessary to diverge the implementation here in the mid-end. After all,
>> you can generate better SSE code here as well, so focusing on generating ISA
>> specific code from here for each ISA seems like the wrong approach to me.
> Ping, is there any consensus here?
Not that I've seen.  The ongoing discussion has clarified a few things 
in my mind, but I'm still wrapping my brain around what you're doing here.

jeff


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
  2022-06-09  4:40 ` [PATCH 2/2]AArch64 aarch64: Add implementation for pow2 bitmask division Tamar Christina
  2022-06-13  9:24 ` [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Richard Biener
@ 2022-09-23  9:33 ` Tamar Christina
  2022-09-23  9:33 ` [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division Tamar Christina
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 35+ messages in thread
From: Tamar Christina @ 2022-09-23  9:33 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 20331 bytes --]

Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

e.g.:

   x = y / (2 ^ (bitsize (y)/2) - 1)

This patch adds a new target hook can_special_div_by_const, similar to
can_vec_perm which can be called to check if a target will handle a particular
division in a special way in the back-end.

The vectorizer will then vectorize the division using the standard tree code
and at expansion time the hook is called again to generate the code for the
division.

A lot of the changes in the patch are to pass down the tree operands in all paths
that can lead to the divmod expansion so that the target hook always has the
type of the expression you're expanding since the types can change the
expansion.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
	* expmed.cc (expand_divmod): Likewise.
	* explow.cc (round_push, align_dynamic_address): Likewise.
	* expr.cc (force_operand, expand_expr_divmod): Likewise.
	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
	Likewise.
	* target.h: Include tree-core.
	* target.def (can_special_div_by_const): New.
	* targhooks.cc (default_can_special_div_by_const): New.
	* targhooks.h (default_can_special_div_by_const): New.
	* tree-vect-generic.cc (expand_vector_operation): Use it.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
	* tree-vect-stmts.cc (vectorizable_operation): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
	* gcc.dg/vect/vect-div-bitmask.h: New file.

--- inline copy of patch -- 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the two operands @code{treeop0},
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the vectorizer and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  if (treeop0
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     treeop0, treeop1,
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the two operands @code{treeop0},\n\
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the vectorizer and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
+							  rhs2, NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+						       oprnd0, oprnd1, NULL,
+						       NULL_RTX, NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      if (!target_support_p)
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							op0, op1, NULL,
+							NULL_RTX, NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);




-- 

[-- Attachment #2: rb15779.patch --]
[-- Type: text/plain, Size: 18391 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the two operands @code{treeop0},
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the vectorizer and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  if (treeop0
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     treeop0, treeop1,
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the two operands @code{treeop0},\n\
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the vectorizer and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
+							  rhs2, NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+						       oprnd0, oprnd1, NULL,
+						       NULL_RTX, NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      if (!target_support_p)
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							op0, op1, NULL,
+							NULL_RTX, NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);




^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division.
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
                   ` (2 preceding siblings ...)
  2022-09-23  9:33 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Tamar Christina
@ 2022-09-23  9:33 ` Tamar Christina
  2022-10-31 11:34   ` Tamar Christina
  2022-09-23  9:33 ` [PATCH 3/4]AArch64 Add SVE2 " Tamar Christina
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 35+ messages in thread
From: Tamar Christina @ 2022-09-23  9:33 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 27259 bytes --]

Hi All,

This adds an implementation for the new optab for unsigned pow2 bitmask for
AArch64.

The implementation rewrites:

   x = y / (2 ^ (bitsize (y) / 2) - 1)

into e.g. (for bytes)

   (x + ((x + 257) >> 8)) >> 8

where it's required that the additions be done in double the precision of x
such that we don't lose any bits during an overflow.

Essentially the sequence decomposes the division into doing two smaller
divisions, one for the top and bottom parts of the number and adding the results
back together.

To account for the fact that a shift by 8 would be a division by 256, we add 1
to both parts of x such that when the value is 255 we still get 1 as the answer.

Because the amounts we shift by are half the width of the original datatype we
can use the halving instructions the ISA provides to do the operation instead
of using actual shifts.

For AArch64 this means we generate for:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
    pixel[i] = (pixel[i] * level) / 0xff;
}

the following:

	movi    v3.16b, 0x1
	umull2  v1.8h, v0.16b, v2.16b
	umull   v0.8h, v0.8b, v2.8b
	addhn   v5.8b, v1.8h, v3.8h
	addhn   v4.8b, v0.8h, v3.8h
	uaddw   v1.8h, v1.8h, v5.8b
	uaddw   v0.8h, v0.8h, v4.8b
	uzp2    v0.16b, v0.16b, v1.16b

instead of:

	umull   v2.8h, v1.8b, v5.8b
	umull2  v1.8h, v1.16b, v5.16b
	umull   v0.4s, v2.4h, v3.4h
	umull2  v2.4s, v2.8h, v3.8h
	umull   v4.4s, v1.4h, v3.4h
	umull2  v1.4s, v1.8h, v3.8h
	uzp2    v0.8h, v0.8h, v2.8h
	uzp2    v1.8h, v4.8h, v1.8h
	shrn    v0.8b, v0.8h, 7
	shrn2   v0.16b, v1.8h, 7

Which results in significantly faster code.

Thanks to Wilco for the concept.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): New.
	* config/aarch64/aarch64.cc (aarch64_vectorize_can_special_div_by_constant): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/div-by-bitmask.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0ba6386c1ab50f77e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4831,6 +4831,65 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn, which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result. i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8
+;; to do the shift.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc37484b6fc6901bfb4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+					       tree vectype,
+					       tree treeop0, tree treeop1,
+					       rtx *output, rtx in0, rtx in1)
+{
+
+  if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX))
+    return false;
+
+  tree cst = uniform_integer_cst_p (treeop1);
+  tree type;
+  if (code != TRUNC_DIV_EXPR
+      || !cst
+      || !TYPE_UNSIGNED ((type = TREE_TYPE (cst)))
+      || tree_int_cst_sgn (cst) != 1)
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      gcc_assert (treeop0 && treeop1);
+      wide_int icst = wi::to_wide (cst);
+      wide_int val = wi::add (icst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (TYPE_PRECISION (type) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+   return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the two operands @code{treeop0},
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the middle-end and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  if (treeop0
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     treeop0, treeop1,
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the two operands @code{treeop0},\n\
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the middle-end and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44ab211cd246d82d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
+							  rhs2, NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+						       oprnd0, oprnd1, NULL,
+						       NULL_RTX, NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      if (!target_support_p)
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							op0, op1, NULL,
+							NULL_RTX, NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);




-- 

[-- Attachment #2: rb15780.patch --]
[-- Type: text/plain, Size: 25243 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0ba6386c1ab50f77e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4831,6 +4831,65 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result. i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8
+;; to perform the shift.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc37484b6fc6901bfb4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+					       tree vectype,
+					       tree treeop0, tree treeop1,
+					       rtx *output, rtx in0, rtx in1)
+{
+
+  if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX))
+    return false;
+
+  tree cst = uniform_integer_cst_p (treeop1);
+  tree type;
+  if (code != TRUNC_DIV_EXPR
+      || !cst
+      || !TYPE_UNSIGNED ((type = TREE_TYPE (cst)))
+      || tree_int_cst_sgn (cst) != 1)
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      gcc_assert (treeop0 && treeop1);
+      wide_int icst = wi::to_wide (cst);
+      wide_int val = wi::add (icst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (TYPE_PRECISION (type) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+   return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the two operands @code{treeop0},
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the middle-end and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  if (treeop0
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     treeop0, treeop1,
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the two operands @code{treeop0},\n\
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the middle-end and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44ab211cd246d82d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
+							  rhs2, NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+						       oprnd0, oprnd1, NULL,
+						       NULL_RTX, NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      if (!target_support_p)
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							op0, op1, NULL,
+							NULL_RTX, NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);




^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask division
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
                   ` (3 preceding siblings ...)
  2022-09-23  9:33 ` [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division Tamar Christina
@ 2022-09-23  9:33 ` Tamar Christina
  2022-10-31 11:34   ` Tamar Christina
  2022-11-12 12:17   ` Richard Sandiford
  2022-09-23  9:34 ` [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT Tamar Christina
                   ` (2 subsequent siblings)
  7 siblings, 2 replies; 35+ messages in thread
From: Tamar Christina @ 2022-09-23  9:33 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 5630 bytes --]

Hi All,

In plenty of image and video processing code it's common to modify pixel values
by a widening operation and then scale them back into range by dividing by 255.

This patch adds a named function to allow us to emit an optimized sequence
when doing an unsigned division that is equivalent to:

   x = y / (2 ^ (bitsize (y)/2)-1)

For SVE2 this means we generate for:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
    pixel[i] = (pixel[i] * level) / 0xff;
}

the following:

        mov     z3.b, #1
.L3:
        ld1b    z0.h, p0/z, [x0, x3]
        mul     z0.h, p1/m, z0.h, z2.h
        addhnb  z1.b, z0.h, z3.h
        addhnb  z0.b, z0.h, z1.h
        st1b    z0.h, p0, [x0, x3]
        inch    x3
        whilelo p0.h, w3, w2
        b.any   .L3

instead of:

.L3:
        ld1b    z0.h, p1/z, [x0, x3]
        mul     z0.h, p0/m, z0.h, z1.h
        umulh   z0.h, p0/m, z0.h, z2.h
        lsr     z0.h, z0.h, #7
        st1b    z0.h, p1, [x0, x3]
        inch    x3
        whilelo p1.h, w3, w2
        b.any   .L3

Which results in significantly faster code.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv<mode>3): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}




-- 

[-- Attachment #2: rb15813.patch --]
[-- Type: text/plain, Size: 4196 bytes --]

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -71,6 +71,7 @@
 ;; ---- [INT] Reciprocal approximation
 ;; ---- [INT<-FP] Base-2 logarithm
 ;; ---- [INT] Polynomial multiplication
+;; ---- [INT] Misc optab implementations
 ;;
 ;; == Permutation
 ;; ---- [INT,FP] General permutes
@@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
   "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
 )
 
+;; -------------------------------------------------------------------------
+;; ---- [INT] Misc optab implementations
+;; -------------------------------------------------------------------------
+;; Includes:
+;; - aarch64_bitmask_udiv
+;; -------------------------------------------------------------------------
+
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; See aarch64-simd.md for bigger explanation.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
+   (match_operand:SVE_FULL_HSDI 1 "register_operand")
+   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
+  "TARGET_SVE2"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
+  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
+			      addend));
+  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
+			      lowpart_subreg (<MODE>mode, tmp1,
+					      <VNARROW>mode)));
+  emit_move_insn (operands[0],
+		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
+  DONE;
+})
+
 ;; =========================================================================
 ;; == Permutation
 ;; =========================================================================
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}




^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
                   ` (4 preceding siblings ...)
  2022-09-23  9:33 ` [PATCH 3/4]AArch64 Add SVE2 " Tamar Christina
@ 2022-09-23  9:34 ` Tamar Christina
  2022-10-31 11:34   ` Tamar Christina
  2022-11-12 12:25   ` Richard Sandiford
  2022-09-26 10:39 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Richard Biener
  2022-11-09 10:37 ` Kyrylo Tkachov
  7 siblings, 2 replies; 35+ messages in thread
From: Tamar Christina @ 2022-09-23  9:34 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 5862 bytes --]

Hi All,

This adds an RTL pattern for when two NARROWB instructions are being combined
with a PACK.  The second NARROWB is then transformed into a NARROWT.

For the example:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
    pixel[i] += (pixel[i] * level) / 0xff;
}

we generate:

        addhnb  z6.b, z0.h, z4.h
        addhnb  z5.b, z1.h, z4.h
        addhnb  z0.b, z0.h, z6.h
        addhnt  z0.b, z1.h, z5.h
        add     z0.b, z0.b, z2.b

instead of:

        addhnb  z6.b, z1.h, z4.h
        addhnb  z5.b, z0.h, z4.h
        addhnb  z1.b, z1.h, z6.h
        addhnb  z0.b, z0.h, z5.h
        uzp1    z0.b, z0.b, z1.b
        add     z0.b, z0.b, z2.b

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve2.md (*aarch64_sve_pack_<sve_int_op><mode>):
	New.
	* config/aarch64/iterators.md (binary_top): New.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
	* gcc.target/aarch64/sve2/div-by-bitmask_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98ad3c41e5d05d8cf38 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1600,6 +1600,25 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
   "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
 )
 
+(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
+  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
+	(unspec:<VNARROW>
+	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
+	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
+	     SVE2_INT_BINARY_NARROWB) 0)]
+	  UNSPEC_PACK))]
+  "TARGET_SVE2"
+  "#"
+  "&& true"
+  [(const_int 0)]
+{
+  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1], <MODE>mode);
+  emit_insn (gen_aarch64_sve (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
+			      operands[0], tmp, operands[2], operands[3]));
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Narrowing right shifts
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82afa67613a087389d4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
 
 (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")])
 
+(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
+			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
+			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
+			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
+
 (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
 			     (UNSPEC_ADCLT "adclt")
 			     (UNSPEC_ADDHNB "addhnb")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15c82dd9726897cfd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdbced36c8038db1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhnb	z6.b, z0.h, z4.h
+** 	addhnb	z5.b, z1.h, z4.h
+** 	addhnb	z0.b, z0.h, z6.h
+** 	addhnt	z0.b, z1.h, z5.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhnb	z6.h, z0.s, z4.s
+** 	addhnb	z5.h, z1.s, z4.s
+** 	addhnb	z0.h, z0.s, z6.s
+** 	addhnt	z0.h, z1.s, z5.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhnb	z6.s, z0.d, z4.d
+** 	addhnb	z5.s, z1.d, z4.d
+** 	addhnb	z0.s, z0.d, z6.d
+** 	addhnt	z0.s, z1.d, z5.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}




-- 

[-- Attachment #2: rb15820.patch --]
[-- Type: text/plain, Size: 4735 bytes --]

diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98ad3c41e5d05d8cf38 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1600,6 +1600,25 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
   "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
 )
 
+(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
+  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
+	(unspec:<VNARROW>
+	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
+	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
+	     SVE2_INT_BINARY_NARROWB) 0)]
+	  UNSPEC_PACK))]
+  "TARGET_SVE2"
+  "#"
+  "&& true"
+  [(const_int 0)]
+{
+  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1], <MODE>mode);
+  emit_insn (gen_aarch64_sve (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
+			      operands[0], tmp, operands[2], operands[3]));
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Narrowing right shifts
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82afa67613a087389d4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
 
 (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")])
 
+(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
+			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
+			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
+			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
+
 (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
 			     (UNSPEC_ADCLT "adclt")
 			     (UNSPEC_ADDHNB "addhnb")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15c82dd9726897cfd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdbced36c8038db1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhnb	z6.b, z0.h, z4.h
+** 	addhnb	z5.b, z1.h, z4.h
+** 	addhnb	z0.b, z0.h, z6.h
+** 	addhnt	z0.b, z1.h, z5.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhnb	z6.h, z0.s, z4.s
+** 	addhnb	z5.h, z1.s, z4.s
+** 	addhnb	z0.h, z0.s, z6.s
+** 	addhnt	z0.h, z1.s, z5.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhnb	z6.s, z0.d, z4.d
+** 	addhnb	z5.s, z1.d, z4.d
+** 	addhnb	z0.s, z0.d, z6.d
+** 	addhnt	z0.s, z1.d, z5.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
                   ` (5 preceding siblings ...)
  2022-09-23  9:34 ` [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT Tamar Christina
@ 2022-09-26 10:39 ` Richard Biener
  2022-10-31 11:34   ` Tamar Christina
  2022-11-09 10:37 ` Kyrylo Tkachov
  7 siblings, 1 reply; 35+ messages in thread
From: Richard Biener @ 2022-09-26 10:39 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, jeffreyalaw

On Fri, 23 Sep 2022, Tamar Christina wrote:

> Hi All,
> 
> In plenty of image and video processing code it's common to modify pixel values
> by a widening operation and then scale them back into range by dividing by 255.
> 
> e.g.:
> 
>    x = y / ((2 ^ (bitsize (y)/2)) - 1)
> 
> This patch adds a new target hook can_special_div_by_const, similar to
> can_vec_perm which can be called to check if a target will handle a particular
> division in a special way in the back-end.
> 
> The vectorizer will then vectorize the division using the standard tree code
> and at expansion time the hook is called again to generate the code for the
> division.
> 
> A lot of the changes in the patch are to pass down the tree operands in all paths
> that can lead to the divmod expansion so that the target hook always has the
> type of the expression you're expanding since the types can change the
> expansion.

The type of the expression should be available via the mode and the
signedness, no?  So maybe to avoid having both RTX and TREE on the
target hook pass it a wide_int instead for the divisor?

> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
>  implementation approaches itself.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
> +This hook is used to test whether the target has a special method of
> +division of vectors of type @var{vectype} using the two operands @code{treeop0},
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The division
> +will then not be decomposed by the vectorizer and kept as a div.
> +
> +When the hook is being used to test whether the target supports a special
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
> +is being used to emit a division, @var{in0} and @var{in1} are the source
> +vectors of type @var{vectype} and @var{output} is the destination vector of
> +type @var{vectype}.
> +
> +Return true if the operation is possible, emitting instructions for it
> +if rtxes are provided and updating @var{output}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
>  This hook should return the decl of a function that implements the
>  vectorized variant of the function with the @code{combined_fn} code
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
>  
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
>  
> +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
>  
>  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> diff --git a/gcc/explow.cc b/gcc/explow.cc
> index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
> --- a/gcc/explow.cc
> +++ b/gcc/explow.cc
> @@ -1037,7 +1037,7 @@ round_push (rtx size)
>       TRUNC_DIV_EXPR.  */
>    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
>  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
>  			NULL_RTX, 1);
>    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
>  
> @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
>  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
>  				       Pmode),
>  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
>  			  gen_int_mode (required_align / BITS_PER_UNIT,
>  					Pmode),
>  			  NULL_RTX, 1);
> diff --git a/gcc/expmed.h b/gcc/expmed.h
> index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
> --- a/gcc/expmed.h
> +++ b/gcc/expmed.h
> @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
>  extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
>  			 int);
>  #ifdef GCC_OPTABS_H
> -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> -			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
> +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
> +			  rtx, rtx, rtx, int,
> +			  enum optab_methods = OPTAB_LIB_WIDEN);
>  #endif
>  #endif
>  
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
>  
>  rtx
>  expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
> -	       rtx op0, rtx op1, rtx target, int unsignedp,
> -	       enum optab_methods methods)
> +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> +	       int unsignedp, enum optab_methods methods)
>  {
>    machine_mode compute_mode;
>    rtx tquotient;
> @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
>  
>    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
>  
> +  /* Check if the target has specific expansions for the division.  */
> +  if (treeop0
> +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
> +						     treeop0, treeop1,
> +						     &target, op0, op1))
> +    return target;
> +
> +
>    /* Now convert to the best mode to use.  */
>    if (compute_mode != mode)
>      {
> @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
>  			    || (optab_handler (sdivmod_optab, int_mode)
>  				!= CODE_FOR_nothing)))
>  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> -						int_mode, op0,
> -						gen_int_mode (abs_d,
> +						int_mode, treeop0, treeop1,
> +						op0, gen_int_mode (abs_d,
>  							      int_mode),
>  						NULL_RTX, 0);
>  		    else
> @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
>  				      size - 1, NULL_RTX, 0);
>  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
>  				    NULL_RTX);
> -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
> -				    NULL_RTX, 0);
> +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
> +				    treeop1, t3, op1, NULL_RTX, 0);
>  		if (t4)
>  		  {
>  		    rtx t5;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
>  	    return expand_divmod (0,
>  				  FLOAT_MODE_P (GET_MODE (value))
>  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> -				  GET_MODE (value), op1, op2, target, 0);
> +				  GET_MODE (value), NULL, NULL, op1, op2,
> +				  target, 0);
>  	case MOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
> -				target, 0);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
> +				op1, op2, target, 0);
>  	case UDIV:
> -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
> -				target, 1);
> +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
> +				op1, op2, target, 1);
>  	case UMOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
> -				target, 1);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
> +				op1, op2, target, 1);
>  	case ASHIFTRT:
>  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
>  				      target, 0, OPTAB_LIB_WIDEN);
> @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
>        bool speed_p = optimize_insn_for_speed_p ();
>        do_pending_stack_adjust ();
>        start_sequence ();
> -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 1);
>        rtx_insn *uns_insns = get_insns ();
>        end_sequence ();
>        start_sequence ();
> -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 0);
>        rtx_insn *sgn_insns = get_insns ();
>        end_sequence ();
>        unsigned uns_cost = seq_cost (uns_insns, speed_p);
> @@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
>        emit_insn (sgn_insns);
>        return sgn_ret;
>      }
> -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +			op0, op1, target, unsignedp);
>  }
>  
>  rtx
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
>  		return NULL_RTX;
>  	    }
>  	}
> -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
> -				     gen_int_mode (INTVAL (op1), word_mode),
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
> +				     sum, gen_int_mode (INTVAL (op1),
> +							word_mode),
>  				     NULL_RTX, 1, OPTAB_DIRECT);
>        if (remainder == NULL_RTX)
>  	return NULL_RTX;
> @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
>  
>    if (op11 != const1_rtx)
>      {
> -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
> +				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
>  
> @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
>  
> -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
> +				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
>        if (quot2 == NULL_RTX)
>  	return NULL_RTX;
>  
> diff --git a/gcc/target.def b/gcc/target.def
> index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1902,6 +1902,25 @@ implementation approaches itself.",
>  	const vec_perm_indices &sel),
>   NULL)
>  
> +DEFHOOK
> +(can_special_div_by_const,
> + "This hook is used to test whether the target has a special method of\n\
> +division of vectors of type @var{vectype} using the two operands @code{treeop0},\n\
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
> +will then not be decomposed by the vectorizer and kept as a div.\n\
> +\n\
> +When the hook is being used to test whether the target supports a special\n\
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
> +is being used to emit a division, @var{in0} and @var{in1} are the source\n\
> +vectors of type @var{vectype} and @var{output} is the destination vector of\n\
> +type @var{vectype}.\n\
> +\n\
> +Return true if the operation is possible, emitting instructions for it\n\
> +if rtxes are provided and updating @var{output}.",
> + bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
> +	rtx in0, rtx in1),
> + default_can_special_div_by_const)
> +
>  /* Return true if the target supports misaligned store/load of a
>     specific factor denoted in the third parameter.  The last parameter
>     is true if the access is defined in a packed struct.  */
> diff --git a/gcc/target.h b/gcc/target.h
> index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -51,6 +51,7 @@
>  #include "insn-codes.h"
>  #include "tm.h"
>  #include "hard-reg-set.h"
> +#include "tree-core.h"
>  
>  #if CHECKING_P
>  
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
>  extern rtx default_addr_space_convert (rtx, tree, tree);
>  extern unsigned int default_case_values_threshold (void);
>  extern bool default_have_conditional_execution (void);
> +extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
> +					      rtx *, rtx, rtx);
>  
>  extern bool default_libc_has_function (enum function_class, tree);
>  extern bool default_libc_has_fast_function (int fcode);
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
>    return HAVE_conditional_execution;
>  }
>  
> +/* Default that no division by constant operations are special.  */
> +bool
> +default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
> +				  rtx)
> +{
> +  return false;
> +}
> +
>  /* By default we assume that c99 functions are present at the runtime,
>     but sincos is not.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t 
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
>  	  tree rhs2 = gimple_assign_rhs2 (assign);
>  	  tree ret;
>  
> +	  /* Check if the target was going to handle it through the special
> +	     division callback hook.  */
> +	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
> +							  rhs2, NULL,
> +							  NULL_RTX, NULL_RTX))
> +	    return NULL_TREE;
> +
> +
>  	  if (!optimize
>  	      || !VECTOR_INTEGER_TYPE_P (type)
>  	      || TREE_CODE (rhs2) != VECTOR_CST
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>  
>        return pattern_stmt;
>      }
> +  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
> +						       oprnd0, oprnd1, NULL,
> +						       NULL_RTX, NULL_RTX))
> +    {
> +      return NULL;
> +    }
>  
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        target_support_p = (optab_handler (optab, vec_mode)
>  			  != CODE_FOR_nothing);
> +      if (!target_support_p)
> +	target_support_p
> +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> +							op0, op1, NULL,
> +							NULL_RTX, NULL_RTX);
>      }
>  
>    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> 
> 
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-09-26 10:39 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Richard Biener
@ 2022-10-31 11:34   ` Tamar Christina
  2022-10-31 17:12     ` Jeff Law
  2022-11-08 17:36     ` Tamar Christina
  0 siblings, 2 replies; 35+ messages in thread
From: Tamar Christina @ 2022-10-31 11:34 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 20724 bytes --]

> 
> The type of the expression should be available via the mode and the
> signedness, no?  So maybe to avoid having both RTX and TREE on the target
> hook pass it a wide_int instead for the divisor?
> 

Done.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
	* expmed.cc (expand_divmod): Likewise.
	* explow.cc (round_push, align_dynamic_address): Likewise.
	* expr.cc (force_operand, expand_expr_divmod): Likewise.
	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
	Likewise.
	* target.h: Include tree-core.
	* target.def (can_special_div_by_const): New.
	* targhooks.cc (default_can_special_div_by_const): New.
	* targhooks.h (default_can_special_div_by_const): New.
	* tree-vect-generic.cc (expand_vector_operation): Use it.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
	* tree-vect-stmts.cc (vectorizable_operation): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
	* gcc.dg/vect/vect-div-bitmask.h: New file.

--- inline copy of patch ---

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094c7a712c0604fb77 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the value @var{constant},
+and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the vectorizer and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7333f271b1965b78 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  tree cst;
+  if (treeop0
+      && treeop1
+      && (cst = uniform_integer_cst_p (treeop1))
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     wi::to_wide (cst),
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dacf18d0e0b133e4c 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the value @var{constant},\n\
+and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the vectorizer and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, wide_int constant, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887dd980b1a7c5cf1c 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b69fe04593ae3419e 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, wide_int, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e79d66aa24e2f3b 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  tree cst = uniform_integer_cst_p (rhs2);
+	  if (cst &&
+	      targetm.vectorize.can_special_div_by_const (code, type,
+							  wi::to_wide (cst),
+							  NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0affd56e7314370 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
   gimple *pattern_stmt, *def_stmt;
   enum tree_code rhs_code;
   optab optab;
-  tree q;
+  tree q, cst;
   int dummy_int, prec;
 
   if (!is_gimple_assign (last_stmt))
@@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((cst = uniform_integer_cst_p (oprnd1))
+	   && targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+							  wi::to_wide (cst),
+							  NULL, NULL_RTX,
+							  NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e83b67e3ea53d25f4 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      tree cst;
+      if (!target_support_p
+	  && (cst = uniform_integer_cst_p (op1)))
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							wi::to_wide (cst),
+							NULL, NULL_RTX,
+							NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);

[-- Attachment #2: rb15779.patch --]
[-- Type: application/octet-stream, Size: 18833 bytes --]

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094c7a712c0604fb77 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1})
+This hook is used to test whether the target has a special method of
+division of vectors of type @var{vectype} using the value @var{constant},
+and producing a vector of type @var{vectype}.  The division
+will then not be decomposed by the middle-end and kept as a div.
+
+When the hook is being used to test whether the target supports a special
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
+is being used to emit a division, @var{in0} and @var{in1} are the source
+vectors of type @var{vectype} and @var{output} is the destination vector of
+type @var{vectype}.
+
+Return true if the operation is possible, emitting instructions for it
+if rtxes are provided and updating @var{output}.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
 This hook should return the decl of a function that implements the
 vectorized variant of the function with the @code{combined_fn} code
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
 @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
diff --git a/gcc/explow.cc b/gcc/explow.cc
index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644
--- a/gcc/explow.cc
+++ b/gcc/explow.cc
@@ -1037,7 +1037,7 @@ round_push (rtx size)
      TRUNC_DIV_EXPR.  */
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
 		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
 			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
 				       Pmode),
 			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
 			  gen_int_mode (required_align / BITS_PER_UNIT,
 					Pmode),
 			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
 			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
 
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7333f271b1965b78 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  tree cst;
+  if (treeop0
+      && treeop1
+      && (cst = uniform_integer_cst_p (treeop1))
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     wi::to_wide (cst),
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 			    || (optab_handler (sdivmod_optab, int_mode)
 				!= CODE_FOR_nothing)))
 		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-						int_mode, op0,
-						gen_int_mode (abs_d,
+						int_mode, treeop0, treeop1,
+						op0, gen_int_mode (abs_d,
 							      int_mode),
 						NULL_RTX, 0);
 		    else
@@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 				      size - 1, NULL_RTX, 0);
 		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
 				    NULL_RTX);
-		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-				    NULL_RTX, 0);
+		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+				    treeop1, t3, op1, NULL_RTX, 0);
 		if (t4)
 		  {
 		    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
 	    return expand_divmod (0,
 				  FLOAT_MODE_P (GET_MODE (value))
 				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
-				  GET_MODE (value), op1, op2, target, 0);
+				  GET_MODE (value), NULL, NULL, op1, op2,
+				  target, 0);
 	case MOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 0);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 0);
 	case UDIV:
-	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case UMOD:
-	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-				target, 1);
+	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+				op1, op2, target, 1);
 	case ASHIFTRT:
 	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
 				      target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       emit_insn (sgn_insns);
       return sgn_ret;
     }
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
 		return NULL_RTX;
 	    }
 	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
+							word_mode),
 				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
 	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
 	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
 	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dacf18d0e0b133e4c 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
 	const vec_perm_indices &sel),
  NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the value @var{constant},\n\
+and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed by the middle-end and kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, wide_int constant, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887dd980b1a7c5cf1c 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
+					      rtx *, rtx, rtx);
 
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b69fe04593ae3419e 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, wide_int, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the runtime,
    but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t 
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e79d66aa24e2f3b 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
 	  tree rhs2 = gimple_assign_rhs2 (assign);
 	  tree ret;
 
+	  /* Check if the target was going to handle it through the special
+	     division callback hook.  */
+	  tree cst = uniform_integer_cst_p (rhs2);
+	  if (cst &&
+	      targetm.vectorize.can_special_div_by_const (code, type,
+							  wi::to_wide (cst),
+							  NULL,
+							  NULL_RTX, NULL_RTX))
+	    return NULL_TREE;
+
+
 	  if (!optimize
 	      || !VECTOR_INTEGER_TYPE_P (type)
 	      || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0affd56e7314370 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
   gimple *pattern_stmt, *def_stmt;
   enum tree_code rhs_code;
   optab optab;
-  tree q;
+  tree q, cst;
   int dummy_int, prec;
 
   if (!is_gimple_assign (last_stmt))
@@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((cst = uniform_integer_cst_p (oprnd1))
+	   && targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+							  wi::to_wide (cst),
+							  NULL, NULL_RTX,
+							  NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e83b67e3ea53d25f4 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
 	}
       target_support_p = (optab_handler (optab, vec_mode)
 			  != CODE_FOR_nothing);
+      tree cst;
+      if (!target_support_p
+	  && (cst = uniform_integer_cst_p (op1)))
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							wi::to_wide (cst),
+							NULL, NULL_RTX,
+							NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division.
  2022-09-23  9:33 ` [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division Tamar Christina
@ 2022-10-31 11:34   ` Tamar Christina
  2022-11-09  8:33     ` Tamar Christina
  2022-11-09 16:02     ` Kyrylo Tkachov
  0 siblings, 2 replies; 35+ messages in thread
From: Tamar Christina @ 2022-10-31 11:34 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov,
	Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 37841 bytes --]

Hi All,

Ping, and updated patch based on mid-end changes.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): New.
	* config/aarch64/aarch64.cc (aarch64_vectorize_can_special_div_by_constant): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/div-by-bitmask.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0ba6386c1ab50f77e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4831,6 +4831,65 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result. i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8
+;; to do the shift, as expected by the vectorizer.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..d3c3650d7d728f56adb65154127dc7b72386c5a7 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24146,6 +24146,40 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+					       tree vectype, wide_int cst,
+					       rtx *output, rtx in0, rtx in1)
+{
+  if (code != TRUNC_DIV_EXPR
+      || !TYPE_UNSIGNED (vectype))
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      wide_int val = wi::add (cst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (int)(element_precision (vectype) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+   return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
@@ -27606,6 +27640,10 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
 
+#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
+  aarch64_vectorize_can_special_div_by_constant
+
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
   aarch64_vectorize_preferred_vector_alignment
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44ab211cd246d82d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}

> -----Original Message-----
> From: Tamar Christina <tamar.christina@arm.com>
> Sent: Friday, September 23, 2022 10:34 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division.
> 
> Hi All,
> 
> This adds an implementation for the new optab for unsigned pow2 bitmask
> for AArch64.
> 
> The implementation rewrites:
> 
>    x = y / (2 ^ (sizeof (y)/2)-1
> 
> into e.g. (for bytes)
> 
>    (x + ((x + 257) >> 8)) >> 8
> 
> where it's required that the additions be done in double the precision of x
> such that we don't lose any bits during an overflow.
> 
> Essentially the sequence decomposes the division into doing two smaller
> divisions, one for the top and bottom parts of the number and adding the
> results back together.
> 
> To account for the fact that shift by 8 would be division by 256 we add 1 to
> both parts of x such that when 255 we still get 1 as the answer.
> 
> Because the amount we shift are half the original datatype we can use the
> halfing instructions the ISA provides to do the operation instead of using
> actual shifts.
> 
> For AArch64 this means we generate for:
> 
> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
>   for (int i = 0; i < (n & -16); i+=1)
>     pixel[i] = (pixel[i] * level) / 0xff; }
> 
> the following:
> 
> 	movi    v3.16b, 0x1
> 	umull2  v1.8h, v0.16b, v2.16b
> 	umull   v0.8h, v0.8b, v2.8b
> 	addhn   v5.8b, v1.8h, v3.8h
> 	addhn   v4.8b, v0.8h, v3.8h
> 	uaddw   v1.8h, v1.8h, v5.8b
> 	uaddw   v0.8h, v0.8h, v4.8b
> 	uzp2    v0.16b, v0.16b, v1.16b
> 
> instead of:
> 
> 	umull   v2.8h, v1.8b, v5.8b
> 	umull2  v1.8h, v1.16b, v5.16b
> 	umull   v0.4s, v2.4h, v3.4h
> 	umull2  v2.4s, v2.8h, v3.8h
> 	umull   v4.4s, v1.4h, v3.4h
> 	umull2  v1.4s, v1.8h, v3.8h
> 	uzp2    v0.8h, v0.8h, v2.8h
> 	uzp2    v1.8h, v4.8h, v1.8h
> 	shrn    v0.8b, v0.8h, 7
> 	shrn2   v0.16b, v1.8h, 7
> 
> Which results in significantly faster code.
> 
> Thanks for Wilco for the concept.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-simd.md
> (@aarch64_bitmask_udiv<mode>3): New.
> 	* config/aarch64/aarch64.cc
> (aarch64_vectorize_can_special_div_by_constant): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/div-by-bitmask.c: New test.
> 
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0b
> a6386c1ab50f77e 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4831,6 +4831,65 @@ (define_expand
> "aarch64_<sur><addsub>hn2<mode>"
>    }
>  )
> 
> +;; div optimizations using narrowings
> +;; we can do the division e.g. shorts by 255 faster by calculating it
> +as ;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in ;;
> +double the precision of x.
> +;;
> +;; If we imagine a short as being composed of two blocks of bytes then
> +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to ;;
> +adding 1 to each sub component:
> +;;
> +;;      short value of 16-bits
> +;; ┌──────────────┬────────────────┐
> +;; │              │                │
> +;; └──────────────┴────────────────┘
> +;;   8-bit part1 ▲  8-bit part2   ▲
> +;;               │                │
> +;;               │                │
> +;;              +1               +1
> +;;
> +;; after the first addition, we have to shift right by 8, and narrow
> +the ;; results back to a byte.  Remember that the addition must be done
> +in ;; double the precision of the input.  Since 8 is half the size of a
> +short ;; we can use a narrowing halfing instruction in AArch64, addhn
> +which also ;; does the addition in a wider precision and narrows back
> +to a byte.  The ;; shift itself is implicit in the operation as it
> +writes back only the top ;; half of the result. i.e. bits 2*esize-1:esize.
> +;;
> +;; Since we have narrowed the result of the first part back to a byte,
> +for ;; the second addition we can use a widening addition, uaddw.
> +;;
> +;; For the finaly shift, since it's unsigned arithmatic we emit an ushr
> +by 8 ;; to shift and the vectorizer.
> +;;
> +;; The shift is later optimized by combine to a uzp2 with movi #0.
> +(define_expand "@aarch64_bitmask_udiv<mode>3"
> +  [(match_operand:VQN 0 "register_operand")
> +   (match_operand:VQN 1 "register_operand")
> +   (match_operand:VQN 2 "immediate_operand")]
> +  "TARGET_SIMD"
> +{
> +  unsigned HOST_WIDE_INT size
> +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> +  if (!CONST_VECTOR_P (operands[2])
> +      || const_vector_encoded_nelts (operands[2]) != 1
> +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> +    FAIL;
> +
> +  rtx addend = gen_reg_rtx (<MODE>mode);
> +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
> +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val,
> +<VNARROWQ2>mode));
> +  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> +  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> +  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> +  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> +bitsize);
> +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
> +  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2,
> +shift_vector));
> +  DONE;
> +})
> +
>  ;; pmul.
> 
>  (define_insn "aarch64_pmul<mode>"
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc3
> 7484b6fc6901bfb4 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const
> (machine_mode vmode, machine_mode op_mode,
>    return ret;
>  }
> 
> +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> +
> +bool
> +aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> +					       tree vectype,
> +					       tree treeop0, tree treeop1,
> +					       rtx *output, rtx in0, rtx in1) {
> +
> +  if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX))
> +    return false;
> +
> +  tree cst = uniform_integer_cst_p (treeop1);  tree type;  if (code !=
> + TRUNC_DIV_EXPR
> +      || !cst
> +      || !TYPE_UNSIGNED ((type = TREE_TYPE (cst)))
> +      || tree_int_cst_sgn (cst) != 1)
> +    return false;
> +
> +  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE
> + (vectype));  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> +    return false;
> +
> +  if (in0 == NULL_RTX && in1 == NULL_RTX)
> +    {
> +      gcc_assert (treeop0 && treeop1);
> +      wide_int icst = wi::to_wide (cst);
> +      wide_int val = wi::add (icst, 1);
> +      int pow = wi::exact_log2 (val);
> +      return pow == (TYPE_PRECISION (type) / 2);
> +    }
> +
> +  if (!VECTOR_TYPE_P (vectype))
> +   return false;
> +
> +  gcc_assert (output);
> +
> +  if (!*output)
> +    *output = gen_reg_rtx (TYPE_MODE (vectype));
> +
> +  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output,
> +in0, in1));
> +  return true;
> +}
> +
>  /* Generate a byte permute mask for a register of mode MODE,
>     which has NUNITS units.  */
> 
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d2
> 44a2a23e76cac097 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook
> to handle these two  implementation approaches itself.
>  @end deftypefn
> 
> +@deftypefn {Target Hook} bool
> TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +(enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree
> +@var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1}) This
> +hook is used to test whether the target has a special method of
> +division of vectors of type @var{vectype} using the two operands
> @code{treeop0}, and @code{treeop1} and producing a vector of type
> @var{vectype}.  The division will then not be decomposed by the and kept as
> a div.
> +
> +When the hook is being used to test whether the target supports a
> +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> +When the hook is being used to emit a division, @var{in0} and @var{in1}
> +are the source vectors of type @var{vecttype} and @var{output} is the
> +destination vector of type @var{vectype}.
> +
> +Return true if the operation is possible, emitting instructions for it
> +if rtxes are provided and updating @var{output}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} tree
> TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})  This hook
> should return the decl of a function that implements the  vectorized variant
> of the function with the @code{combined_fn} code diff --git
> a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04
> 076d058c24ce093 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> can generate better code.
> 
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> 
> +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> 
>  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> diff --git a/gcc/explow.cc b/gcc/explow.cc index
> ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f
> 5e346bf34ba0036 100644
> --- a/gcc/explow.cc
> +++ b/gcc/explow.cc
> @@ -1037,7 +1037,7 @@ round_push (rtx size)
>       TRUNC_DIV_EXPR.  */
>    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
>  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> + align_rtx,
>  			NULL_RTX, 1);
>    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> 
> @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> required_align)
>  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
>  				       Pmode),
>  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> target,
>  			  gen_int_mode (required_align / BITS_PER_UNIT,
>  					Pmode),
>  			  NULL_RTX, 1);
> diff --git a/gcc/expmed.h b/gcc/expmed.h index
> 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6
> f33cb3595659b5 100644
> --- a/gcc/expmed.h
> +++ b/gcc/expmed.h
> @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> machine_mode,  extern rtx expand_shift (enum tree_code, machine_mode,
> rtx, poly_int64, rtx,
>  			 int);
>  #ifdef GCC_OPTABS_H
> -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> -			  rtx, int, enum optab_methods =
> OPTAB_LIB_WIDEN);
> +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> tree,
> +			  rtx, rtx, rtx, int,
> +			  enum optab_methods = OPTAB_LIB_WIDEN);
>  #endif
>  #endif
> 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb09
> 90db8b97d3af414 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> op0, HOST_WIDE_INT d)
> 
>  rtx
>  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> mode,
> -	       rtx op0, rtx op1, rtx target, int unsignedp,
> -	       enum optab_methods methods)
> +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> +	       int unsignedp, enum optab_methods methods)
>  {
>    machine_mode compute_mode;
>    rtx tquotient;
> @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
> 
>    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> 
> +  /* Check if the target has specific expansions for the division.  */
> +  if (treeop0
> +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> (treeop0),
> +						     treeop0, treeop1,
> +						     &target, op0, op1))
> +    return target;
> +
> +
>    /* Now convert to the best mode to use.  */
>    if (compute_mode != mode)
>      {
> @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  			    || (optab_handler (sdivmod_optab, int_mode)
>  				!= CODE_FOR_nothing)))
>  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> -						int_mode, op0,
> -						gen_int_mode (abs_d,
> +						int_mode, treeop0, treeop1,
> +						op0, gen_int_mode (abs_d,
>  							      int_mode),
>  						NULL_RTX, 0);
>  		    else
> @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  				      size - 1, NULL_RTX, 0);
>  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
>  				    NULL_RTX);
> -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> op1,
> -				    NULL_RTX, 0);
> +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> treeop0,
> +				    treeop1, t3, op1, NULL_RTX, 0);
>  		if (t4)
>  		  {
>  		    rtx t5;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index
> 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96
> a8abc055fa34d9 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
>  	    return expand_divmod (0,
>  				  FLOAT_MODE_P (GET_MODE (value))
>  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> -				  GET_MODE (value), op1, op2, target, 0);
> +				  GET_MODE (value), NULL, NULL, op1, op2,
> +				  target, 0);
>  	case MOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 0);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 0);
>  	case UDIV:
> -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case UMOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case ASHIFTRT:
>  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
>  				      target, 0, OPTAB_LIB_WIDEN);
> @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> machine_mode mode, tree treeop0,
>        bool speed_p = optimize_insn_for_speed_p ();
>        do_pending_stack_adjust ();
>        start_sequence ();
> -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 1);
>        rtx_insn *uns_insns = get_insns ();
>        end_sequence ();
>        start_sequence ();
> -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 0);
>        rtx_insn *sgn_insns = get_insns ();
>        end_sequence ();
>        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 +9019,8
> @@ expand_expr_divmod (tree_code code, machine_mode mode, tree
> treeop0,
>        emit_insn (sgn_insns);
>        return sgn_ret;
>      }
> -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +			op0, op1, target, unsignedp);
>  }
> 
>  rtx
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd
> 872f340855dc96 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> rtx op0, rtx op1, bool unsignedp)
>  		return NULL_RTX;
>  	    }
>  	}
> -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> sum,
> -				     gen_int_mode (INTVAL (op1),
> word_mode),
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> NULL, NULL,
> +				     sum, gen_int_mode (INTVAL (op1),
> +							word_mode),
>  				     NULL_RTX, 1, OPTAB_DIRECT);
>        if (remainder == NULL_RTX)
>  	return NULL_RTX;
> @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
> 
>    if (op11 != const1_rtx)
>      {
> -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> quot1,
> +				op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> quot1,
> +				 op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (quot2 == NULL_RTX)
>  	return NULL_RTX;
> 
> diff --git a/gcc/target.def b/gcc/target.def index
> 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b070
> 81cdd70113db9b1 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1902,6 +1902,25 @@ implementation approaches itself.",
>  	const vec_perm_indices &sel),
>   NULL)
> 
> +DEFHOOK
> +(can_special_div_by_const,
> + "This hook is used to test whether the target has a special method
> +of\n\ division of vectors of type @var{vectype} using the two operands
> +@code{treeop0},\n\ and @code{treeop1} and producing a vector of type
> +@var{vectype}.  The division\n\ will then not be decomposed by the and
> +kept as a div.\n\ \n\ When the hook is being used to test whether the
> +target supports a special\n\ divide, @var{in0}, @var{in1}, and
> +@var{output} are all null.  When the hook\n\ is being used to emit a
> +division, @var{in0} and @var{in1} are the source\n\ vectors of type
> +@var{vecttype} and @var{output} is the destination vector of\n\ type
> +@var{vectype}.\n\ \n\ Return true if the operation is possible,
> +emitting instructions for it\n\ if rtxes are provided and updating
> +@var{output}.",  bool, (enum tree_code, tree vectype, tree treeop0,
> +tree treeop1, rtx *output,
> +	rtx in0, rtx in1),
> + default_can_special_div_by_const)
> +
>  /* Return true if the target supports misaligned store/load of a
>     specific factor denoted in the third parameter.  The last parameter
>     is true if the access is defined in a packed struct.  */ diff --git a/gcc/target.h
> b/gcc/target.h index
> d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56
> f39c061f68b665 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -51,6 +51,7 @@
>  #include "insn-codes.h"
>  #include "tm.h"
>  #include "hard-reg-set.h"
> +#include "tree-core.h"
> 
>  #if CHECKING_P
> 
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e
> 2640d63f936b336d 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> (addr_space_t, location_t);  extern rtx default_addr_space_convert (rtx,
> tree, tree);  extern unsigned int default_case_values_threshold (void);
> extern bool default_have_conditional_execution (void);
> +extern bool default_can_special_div_by_const (enum tree_code, tree,
> tree, tree,
> +					      rtx *, rtx, rtx);
> 
>  extern bool default_libc_has_function (enum function_class, tree);  extern
> bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc
> b/gcc/targhooks.cc index
> b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241
> 279936ced41ee95 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
>    return HAVE_conditional_execution;
>  }
> 
> +/* Default that no division by constant operations are special.  */
> +bool default_can_special_div_by_const (enum tree_code, tree, tree,
> +tree, rtx *, rtx,
> +				  rtx)
> +{
> +  return false;
> +}
> +
>  /* By default we assume that c99 functions are present at the runtime,
>     but sincos is not.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3
> d7b4d5b64a19b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3
> db75b3e4112e2cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720
> 157701d9d1cf852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> +} } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> 832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44a
> b211cd246d82d5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } }
> +*/
> +
> +#include <stdint.h>
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> +** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xfe; }
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> +** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c
> 9a12046b6ec94f3 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator
> *gsi, tree type, tree compute_type
>  	  tree rhs2 = gimple_assign_rhs2 (assign);
>  	  tree ret;
> 
> +	  /* Check if the target was going to handle it through the special
> +	     division callback hook.  */
> +	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
> +							  rhs2, NULL,
> +							  NULL_RTX,
> NULL_RTX))
> +	    return NULL_TREE;
> +
> +
>  	  if (!optimize
>  	      || !VECTOR_INTEGER_TYPE_P (type)
>  	      || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> patterns.cc b/gcc/tree-vect-patterns.cc index
> 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af
> 0b1bfea10fe443 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> 
>        return pattern_stmt;
>      }
> +  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
> +						       oprnd0, oprnd1, NULL,
> +						       NULL_RTX, NULL_RTX))
> +    {
> +      return NULL;
> +    }
> 
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd6
> 8e0e1c1e93faafe 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        target_support_p = (optab_handler (optab, vec_mode)
>  			  != CODE_FOR_nothing);
> +      if (!target_support_p)
> +	target_support_p
> +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> +							op0, op1, NULL,
> +							NULL_RTX,
> NULL_RTX);
>      }
> 
>    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> 
> 
> 
> 
> --

[-- Attachment #2: rb15780.patch --]
[-- Type: application/octet-stream, Size: 7000 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0ba6386c1ab50f77e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4831,6 +4831,65 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
   }
 )
 
+;; div optimizations using narrowings
+;; we can do the division e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;      short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲  8-bit part2   ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result. i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8
+;; to perform the shift.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
 
 (define_insn "aarch64_pmul<mode>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..d3c3650d7d728f56adb65154127dc7b72386c5a7 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24146,6 +24146,40 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   return ret;
 }
 
+/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
+
+bool
+aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
+					       tree vectype, wide_int cst,
+					       rtx *output, rtx in0, rtx in1)
+{
+  if (code != TRUNC_DIV_EXPR
+      || !TYPE_UNSIGNED (vectype))
+    return false;
+
+  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
+  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
+    return false;
+
+  if (in0 == NULL_RTX && in1 == NULL_RTX)
+    {
+      wide_int val = wi::add (cst, 1);
+      int pow = wi::exact_log2 (val);
+      return pow == (int)(element_precision (vectype) / 2);
+    }
+
+  if (!VECTOR_TYPE_P (vectype))
+   return false;
+
+  gcc_assert (output);
+
+  if (!*output)
+    *output = gen_reg_rtx (TYPE_MODE (vectype));
+
+  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
+  return true;
+}
+
 /* Generate a byte permute mask for a register of mode MODE,
    which has NUNITS units.  */
 
@@ -27606,6 +27640,10 @@ aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_VECTOR_ALIGNMENT
 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
 
+#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
+#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
+  aarch64_vectorize_can_special_div_by_constant
+
 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
   aarch64_vectorize_preferred_vector_alignment
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44ab211cd246d82d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask division
  2022-09-23  9:33 ` [PATCH 3/4]AArch64 Add SVE2 " Tamar Christina
@ 2022-10-31 11:34   ` Tamar Christina
  2022-11-09  8:33     ` Tamar Christina
  2022-11-12 12:17   ` Richard Sandiford
  1 sibling, 1 reply; 35+ messages in thread
From: Tamar Christina @ 2022-10-31 11:34 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov,
	Richard Sandiford

Ping

> -----Original Message-----
> From: Tamar Christina <tamar.christina@arm.com>
> Sent: Friday, September 23, 2022 10:34 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: [PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask
> division
> 
> Hi All,
> 
> In plenty of image and video processing code it's common to modify pixel
> values by a widening operation and then scale them back into range by
> dividing by 255.
> 
> This patch adds a named function to allow us to emit an optimized
> sequence when doing an unsigned division that is equivalent to:
> 
>    x = y / (2 ^ (bitsize (y)/2)-1)
> 
> For SVE2 this means we generate for:
> 
> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
>   for (int i = 0; i < (n & -16); i+=1)
>     pixel[i] = (pixel[i] * level) / 0xff; }
> 
> the following:
> 
>         mov     z3.b, #1
> .L3:
>         ld1b    z0.h, p0/z, [x0, x3]
>         mul     z0.h, p1/m, z0.h, z2.h
>         addhnb  z1.b, z0.h, z3.h
>         addhnb  z0.b, z0.h, z1.h
>         st1b    z0.h, p0, [x0, x3]
>         inch    x3
>         whilelo p0.h, w3, w2
>         b.any   .L3
> 
> instead of:
> 
> .L3:
>         ld1b    z0.h, p1/z, [x0, x3]
>         mul     z0.h, p0/m, z0.h, z1.h
>         umulh   z0.h, p0/m, z0.h, z2.h
>         lsr     z0.h, z0.h, #7
>         st1b    z0.h, p1, [x0, x3]
>         inch    x3
>         whilelo p1.h, w3, w2
>         b.any   .L3
> 
> Which results in significantly faster code.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-sve2.md
> (@aarch64_bitmask_udiv<mode>3): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.
> 
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-sve2.md
> b/gcc/config/aarch64/aarch64-sve2.md
> index
> f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234
> a1023a6eba0d1 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -71,6 +71,7 @@
>  ;; ---- [INT] Reciprocal approximation
>  ;; ---- [INT<-FP] Base-2 logarithm
>  ;; ---- [INT] Polynomial multiplication
> +;; ---- [INT] Misc optab implementations
>  ;;
>  ;; == Permutation
>  ;; ---- [INT,FP] General permutes
> @@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
>    "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
>  )
> 
> +;;
> +-----------------------------------------------------------------------
> +-- ;; ---- [INT] Misc optab implementations ;;
> +-----------------------------------------------------------------------
> +--
> +;; Includes:
> +;; - aarch64_bitmask_udiv
> +;;
> +-----------------------------------------------------------------------
> +--
> +
> +;; div optimizations using narrowings
> +;; we can do the division e.g. shorts by 255 faster by calculating it
> +as ;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in ;;
> +double the precision of x.
> +;;
> +;; See aarch64-simd.md for bigger explanation.
> +(define_expand "@aarch64_bitmask_udiv<mode>3"
> +  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
> +   (match_operand:SVE_FULL_HSDI 1 "register_operand")
> +   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
> +  "TARGET_SVE2"
> +{
> +  unsigned HOST_WIDE_INT size
> +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
> +  if (!CONST_VECTOR_P (operands[2])
> +      || const_vector_encoded_nelts (operands[2]) != 1
> +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> +    FAIL;
> +
> +  rtx addend = gen_reg_rtx (<MODE>mode);
> +  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
> +  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
> +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
> +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val,
> +<VNARROW>mode));
> +  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1,
> operands[1],
> +			      addend));
> +  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2,
> operands[1],
> +			      lowpart_subreg (<MODE>mode, tmp1,
> +					      <VNARROW>mode)));
> +  emit_move_insn (operands[0],
> +		  lowpart_subreg (<MODE>mode, tmp2,
> <VNARROW>mode));
> +  DONE;
> +})
> +
>  ;;
> ==========================================================
> ===============
>  ;; == Permutation
>  ;;
> ==========================================================
> ===============
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c
> 0bb0d204cda6d9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> @@ -0,0 +1,53 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } }
> +*/
> +
> +#include <stdint.h>
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
> +**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
> +**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xfe; }
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
> +**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
> +**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> 
> 
> 
> 
> --

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT
  2022-09-23  9:34 ` [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT Tamar Christina
@ 2022-10-31 11:34   ` Tamar Christina
  2022-11-09  8:33     ` Tamar Christina
  2022-11-12 12:25   ` Richard Sandiford
  1 sibling, 1 reply; 35+ messages in thread
From: Tamar Christina @ 2022-10-31 11:34 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: Richard Earnshaw, nd, Richard Sandiford, Marcus Shawcroft

Ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Tamar
> Christina via Gcc-patches
> Sent: Friday, September 23, 2022 10:34 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Richard Sandiford <Richard.Sandiford@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to
> NARROWB + NARROWT
> 
> Hi All,
> 
> This adds an RTL pattern for when two NARROWB instructions are being
> combined with a PACK.  The second NARROWB is then transformed into a
> NARROWT.
> 
> For the example:
> 
> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
>   for (int i = 0; i < (n & -16); i+=1)
>     pixel[i] += (pixel[i] * level) / 0xff; }
> 
> we generate:
> 
>         addhnb  z6.b, z0.h, z4.h
>         addhnb  z5.b, z1.h, z4.h
>         addhnb  z0.b, z0.h, z6.h
>         addhnt  z0.b, z1.h, z5.h
>         add     z0.b, z0.b, z2.b
> 
> instead of:
> 
>         addhnb  z6.b, z1.h, z4.h
>         addhnb  z5.b, z0.h, z4.h
>         addhnb  z1.b, z1.h, z6.h
>         addhnb  z0.b, z0.h, z5.h
>         uzp1    z0.b, z0.b, z1.b
>         add     z0.b, z0.b, z2.b
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-sve2.md
> (*aarch64_sve_pack_<sve_int_op><mode>):
> 	New.
> 	* config/aarch64/iterators.md (binary_top): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
> 	* gcc.target/aarch64/sve2/div-by-bitmask_2.c: New test.
> 
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-sve2.md
> b/gcc/config/aarch64/aarch64-sve2.md
> index
> ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98a
> d3c41e5d05d8cf38 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1600,6 +1600,25 @@ (define_insn
> "@aarch64_sve_<sve_int_op><mode>"
>    "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
>  )
> 
> +(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
> +  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
> +	(unspec:<VNARROW>
> +	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
> +	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
> +	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> +	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
> +	     SVE2_INT_BINARY_NARROWB) 0)]
> +	  UNSPEC_PACK))]
> +  "TARGET_SVE2"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +{
> +  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1],
> <MODE>mode);
> +  emit_insn (gen_aarch64_sve
> (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
> +			      operands[0], tmp, operands[2], operands[3]));
> +})
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT] Narrowing right shifts
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md
> b/gcc/config/aarch64/iterators.md index
> 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82af
> a67613a087389d4 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a")
> (UNSPEC_BRKB "b")
> 
>  (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT
> "pnext")])
> 
> +(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
> +			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
> +			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
> +			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
> +
>  (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
>  			     (UNSPEC_ADCLT "adclt")
>  			     (UNSPEC_ADDHNB "addhnb")
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15
> c82dd9726897cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> +} } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdb
> ced36c8038db1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } }
> +*/
> +
> +#include <stdint.h>
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +** 	addhnb	z6.b, z0.h, z4.h
> +** 	addhnb	z5.b, z1.h, z4.h
> +** 	addhnb	z0.b, z0.h, z6.h
> +** 	addhnt	z0.b, z1.h, z5.h
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xff; }
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xfe; }
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +** 	addhnb	z6.h, z0.s, z4.s
> +** 	addhnb	z5.h, z1.s, z4.s
> +** 	addhnb	z0.h, z0.s, z6.s
> +** 	addhnt	z0.h, z1.s, z5.s
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xffffU; }
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +** 	addhnb	z6.s, z0.d, z4.d
> +** 	addhnb	z5.s, z1.d, z4.d
> +** 	addhnb	z0.s, z0.d, z6.d
> +** 	addhnt	z0.s, z1.d, z5.d
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> 
> 
> 
> 
> --

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-10-31 11:34   ` Tamar Christina
@ 2022-10-31 17:12     ` Jeff Law
  2022-11-08 17:36     ` Tamar Christina
  1 sibling, 0 replies; 35+ messages in thread
From: Jeff Law @ 2022-10-31 17:12 UTC (permalink / raw)
  To: Tamar Christina, Richard Biener; +Cc: gcc-patches, nd


On 10/31/22 05:34, Tamar Christina wrote:
>> The type of the expression should be available via the mode and the
>> signedness, no?  So maybe to avoid having both RTX and TREE on the target
>> hook pass it a wide_int instead for the divisor?
>>
> Done.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* expmed.h (expand_divmod): Pass tree operands down in addition to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod, expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
>
> --- inline copy of patch ---
>
OK for the trunk.


Jeff


^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-10-31 11:34   ` Tamar Christina
  2022-10-31 17:12     ` Jeff Law
@ 2022-11-08 17:36     ` Tamar Christina
  2022-11-09  8:01       ` Richard Biener
  1 sibling, 1 reply; 35+ messages in thread
From: Tamar Christina @ 2022-11-08 17:36 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, jeffreyalaw

Ping.

> -----Original Message-----
> From: Tamar Christina
> Sent: Monday, October 31, 2022 11:35 AM
> To: Richard Biener <rguenther@suse.de>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jeffreyalaw@gmail.com
> Subject: RE: [PATCH 1/4]middle-end Support not decomposing specific
> divisions during vectorization.
> 
> >
> > The type of the expression should be available via the mode and the
> > signedness, no?  So maybe to avoid having both RTX and TREE on the
> > target hook pass it a wide_int instead for the divisor?
> >
> 
> Done.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* expmed.h (expand_divmod): Pass tree operands down in addition
> to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod,
> expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for
> support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094
> c7a712c0604fb77 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook
> to handle these two  implementation approaches itself.
>  @end deftypefn
> 
> +@deftypefn {Target Hook} bool
> TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +(enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx
> +*@var{output}, rtx @var{in0}, rtx @var{in1}) This hook is used to test
> +whether the target has a special method of division of vectors of type
> +@var{vectype} using the value @var{constant}, and producing a vector of
> type @var{vectype}.  The division will then not be decomposed by the
> vectorizer and kept as a div.
> +
> +When the hook is being used to test whether the target supports a
> +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> +When the hook is being used to emit a division, @var{in0} and @var{in1}
> +are the source vectors of type @var{vectype} and @var{output} is the
> +destination vector of type @var{vectype}.
> +
> +Return true if the operation is possible, emitting instructions for it
> +if rtxes are provided and updating @var{output}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} tree
> TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})  This hook
> should return the decl of a function that implements the  vectorized variant
> of the function with the @code{combined_fn} code diff --git
> a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04
> 076d058c24ce093 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> can generate better code.
> 
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> 
> +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> 
>  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> diff --git a/gcc/explow.cc b/gcc/explow.cc index
> ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f
> 5e346bf34ba0036 100644
> --- a/gcc/explow.cc
> +++ b/gcc/explow.cc
> @@ -1037,7 +1037,7 @@ round_push (rtx size)
>       TRUNC_DIV_EXPR.  */
>    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
>  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> + align_rtx,
>  			NULL_RTX, 1);
>    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> 
> @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> required_align)
>  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
>  				       Pmode),
>  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> target,
>  			  gen_int_mode (required_align / BITS_PER_UNIT,
>  					Pmode),
>  			  NULL_RTX, 1);
> diff --git a/gcc/expmed.h b/gcc/expmed.h index
> 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6
> f33cb3595659b5 100644
> --- a/gcc/expmed.h
> +++ b/gcc/expmed.h
> @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> machine_mode,  extern rtx expand_shift (enum tree_code, machine_mode,
> rtx, poly_int64, rtx,
>  			 int);
>  #ifdef GCC_OPTABS_H
> -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> -			  rtx, int, enum optab_methods =
> OPTAB_LIB_WIDEN);
> +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> tree,
> +			  rtx, rtx, rtx, int,
> +			  enum optab_methods = OPTAB_LIB_WIDEN);
>  #endif
>  #endif
> 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7
> 333f271b1965b78 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> op0, HOST_WIDE_INT d)
> 
>  rtx
>  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> mode,
> -	       rtx op0, rtx op1, rtx target, int unsignedp,
> -	       enum optab_methods methods)
> +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> +	       int unsignedp, enum optab_methods methods)
>  {
>    machine_mode compute_mode;
>    rtx tquotient;
> @@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
> 
>    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> 
> +  /* Check if the target has specific expansions for the division.  */
> +  tree cst;
> +  if (treeop0
> +      && treeop1
> +      && (cst = uniform_integer_cst_p (treeop1))
> +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> (treeop0),
> +						     wi::to_wide (cst),
> +						     &target, op0, op1))
> +    return target;
> +
> +
>    /* Now convert to the best mode to use.  */
>    if (compute_mode != mode)
>      {
> @@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  			    || (optab_handler (sdivmod_optab, int_mode)
>  				!= CODE_FOR_nothing)))
>  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> -						int_mode, op0,
> -						gen_int_mode (abs_d,
> +						int_mode, treeop0, treeop1,
> +						op0, gen_int_mode (abs_d,
>  							      int_mode),
>  						NULL_RTX, 0);
>  		    else
> @@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  				      size - 1, NULL_RTX, 0);
>  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
>  				    NULL_RTX);
> -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> op1,
> -				    NULL_RTX, 0);
> +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> treeop0,
> +				    treeop1, t3, op1, NULL_RTX, 0);
>  		if (t4)
>  		  {
>  		    rtx t5;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index
> 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96
> a8abc055fa34d9 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
>  	    return expand_divmod (0,
>  				  FLOAT_MODE_P (GET_MODE (value))
>  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> -				  GET_MODE (value), op1, op2, target, 0);
> +				  GET_MODE (value), NULL, NULL, op1, op2,
> +				  target, 0);
>  	case MOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 0);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 0);
>  	case UDIV:
> -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case UMOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case ASHIFTRT:
>  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
>  				      target, 0, OPTAB_LIB_WIDEN);
> @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> machine_mode mode, tree treeop0,
>        bool speed_p = optimize_insn_for_speed_p ();
>        do_pending_stack_adjust ();
>        start_sequence ();
> -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 1);
>        rtx_insn *uns_insns = get_insns ();
>        end_sequence ();
>        start_sequence ();
> -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 0);
>        rtx_insn *sgn_insns = get_insns ();
>        end_sequence ();
>        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 +9019,8
> @@ expand_expr_divmod (tree_code code, machine_mode mode, tree
> treeop0,
>        emit_insn (sgn_insns);
>        return sgn_ret;
>      }
> -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +			op0, op1, target, unsignedp);
>  }
> 
>  rtx
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd
> 872f340855dc96 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> rtx op0, rtx op1, bool unsignedp)
>  		return NULL_RTX;
>  	    }
>  	}
> -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> sum,
> -				     gen_int_mode (INTVAL (op1),
> word_mode),
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> NULL, NULL,
> +				     sum, gen_int_mode (INTVAL (op1),
> +							word_mode),
>  				     NULL_RTX, 1, OPTAB_DIRECT);
>        if (remainder == NULL_RTX)
>  	return NULL_RTX;
> @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
> 
>    if (op11 != const1_rtx)
>      {
> -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> quot1,
> +				op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> quot1,
> +				 op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (quot2 == NULL_RTX)
>  	return NULL_RTX;
> 
> diff --git a/gcc/target.def b/gcc/target.def index
> 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dac
> f18d0e0b133e4c 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1902,6 +1902,25 @@ implementation approaches itself.",
>  	const vec_perm_indices &sel),
>   NULL)
> 
> +DEFHOOK
> +(can_special_div_by_const,
> + "This hook is used to test whether the target has a special method
> +of\n\ division of vectors of type @var{vectype} using the value
> +@var{constant},\n\ and producing a vector of type @var{vectype}.  The
> +division\n\ will then not be decomposed by the vectorizer and kept as a div.\n\
> +\n\ When the hook is being used to test whether the target supports a
> +special\n\ divide, @var{in0}, @var{in1}, and @var{output} are all null.
> +When the hook\n\ is being used to emit a division, @var{in0} and
> +@var{in1} are the source\n\ vectors of type @var{vectype} and
> +@var{output} is the destination vector of\n\ type @var{vectype}.\n\ \n\
> +Return true if the operation is possible, emitting instructions for
> +it\n\ if rtxes are provided and updating @var{output}.",  bool, (enum
> +tree_code, tree vectype, wide_int constant, rtx *output,
> +	rtx in0, rtx in1),
> + default_can_special_div_by_const)
> +
>  /* Return true if the target supports misaligned store/load of a
>     specific factor denoted in the third parameter.  The last parameter
>     is true if the access is defined in a packed struct.  */ diff --git a/gcc/target.h
> b/gcc/target.h index
> d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56
> f39c061f68b665 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -51,6 +51,7 @@
>  #include "insn-codes.h"
>  #include "tm.h"
>  #include "hard-reg-set.h"
> +#include "tree-core.h"
> 
>  #if CHECKING_P
> 
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887
> dd980b1a7c5cf1c 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> (addr_space_t, location_t);  extern rtx default_addr_space_convert (rtx,
> tree, tree);  extern unsigned int default_case_values_threshold (void);
> extern bool default_have_conditional_execution (void);
> +extern bool default_can_special_div_by_const (enum tree_code, tree,
> wide_int,
> +					      rtx *, rtx, rtx);
> 
>  extern bool default_libc_has_function (enum function_class, tree);  extern
> bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc
> b/gcc/targhooks.cc index
> b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b6
> 9fe04593ae3419e 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
>    return HAVE_conditional_execution;
>  }
> 
> +/* Default that no division by constant operations are special.  */
> +bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
> +rtx *, rtx,
> +				  rtx)
> +{
> +  return false;
> +}
> +
>  /* By default we assume that c99 functions are present at the runtime,
>     but sincos is not.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3
> d7b4d5b64a19b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3
> db75b3e4112e2cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720
> 157701d9d1cf852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> +} } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> +restrict pixel, TYPE level, int n) {
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> +detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> 832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e7
> 9d66aa24e2f3b 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator
> *gsi, tree type, tree compute_type
>  	  tree rhs2 = gimple_assign_rhs2 (assign);
>  	  tree ret;
> 
> +	  /* Check if the target was going to handle it through the special
> +	     division callback hook.  */
> +	  tree cst = uniform_integer_cst_p (rhs2);
> +	  if (cst &&
> +	      targetm.vectorize.can_special_div_by_const (code, type,
> +							  wi::to_wide (cst),
> +							  NULL,
> +							  NULL_RTX,
> NULL_RTX))
> +	    return NULL_TREE;
> +
> +
>  	  if (!optimize
>  	      || !VECTOR_INTEGER_TYPE_P (type)
>  	      || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> patterns.cc b/gcc/tree-vect-patterns.cc index
> 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0
> affd56e7314370 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>    gimple *pattern_stmt, *def_stmt;
>    enum tree_code rhs_code;
>    optab optab;
> -  tree q;
> +  tree q, cst;
>    int dummy_int, prec;
> 
>    if (!is_gimple_assign (last_stmt))
> @@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> 
>        return pattern_stmt;
>      }
> +  else if ((cst = uniform_integer_cst_p (oprnd1))
> +	   && targetm.vectorize.can_special_div_by_const (rhs_code,
> vectype,
> +							  wi::to_wide (cst),
> +							  NULL, NULL_RTX,
> +							  NULL_RTX))
> +    {
> +      return NULL;
> +    }
> 
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e8
> 3b67e3ea53d25f4 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        target_support_p = (optab_handler (optab, vec_mode)
>  			  != CODE_FOR_nothing);
> +      tree cst;
> +      if (!target_support_p
> +	  && (cst = uniform_integer_cst_p (op1)))
> +	target_support_p
> +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> +							wi::to_wide (cst),
> +							NULL, NULL_RTX,
> +							NULL_RTX);
>      }
> 
>    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-11-08 17:36     ` Tamar Christina
@ 2022-11-09  8:01       ` Richard Biener
  2022-11-09  8:26         ` Tamar Christina
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Biener @ 2022-11-09  8:01 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, jeffreyalaw

On Tue, 8 Nov 2022, Tamar Christina wrote:

> Ping.

Jeff approved this already.  I think it's OK if the rest of the series
is approved.

Richard.

> > -----Original Message-----
> > From: Tamar Christina
> > Sent: Monday, October 31, 2022 11:35 AM
> > To: Richard Biener <rguenther@suse.de>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH 1/4]middle-end Support not decomposing specific
> > divisions during vectorization.
> > 
> > >
> > > The type of the expression should be available via the mode and the
> > > signedness, no?  So maybe to avoid having both RTX and TREE on the
> > > target hook pass it a wide_int instead for the divisor?
> > >
> > 
> > Done.
> > 
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> > 
> > Ok for master?
> > 
> > Thanks,
> > Tamar
> > 
> > gcc/ChangeLog:
> > 
> > 	* expmed.h (expand_divmod): Pass tree operands down in addition
> > to RTX.
> > 	* expmed.cc (expand_divmod): Likewise.
> > 	* explow.cc (round_push, align_dynamic_address): Likewise.
> > 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> > 	* optabs.cc (expand_doubleword_mod,
> > expand_doubleword_divmod):
> > 	Likewise.
> > 	* target.h: Include tree-core.
> > 	* target.def (can_special_div_by_const): New.
> > 	* targhooks.cc (default_can_special_div_by_const): New.
> > 	* targhooks.h (default_can_special_div_by_const): New.
> > 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> > 	* doc/tm.texi.in: Document it.
> > 	* doc/tm.texi: Regenerate.
> > 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for
> > support.
> > 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> > 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> > 
> > --- inline copy of patch ---
> > 
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> > 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094
> > c7a712c0604fb77 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook
> > to handle these two  implementation approaches itself.
> >  @end deftypefn
> > 
> > +@deftypefn {Target Hook} bool
> > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +(enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx
> > +*@var{output}, rtx @var{in0}, rtx @var{in1}) This hook is used to test
> > +whether the target has a special method of division of vectors of type
> > +@var{vectype} using the value @var{constant}, and producing a vector of
> type @var{vectype}.  The division will then not be decomposed by the
> vectorizer and kept as a div.
> > +
> > +When the hook is being used to test whether the target supports a
> > +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook is being used to emit a division, @var{in0} and @var{in1}
> > +are the source vectors of type @var{vectype} and @var{output} is the
> > +destination vector of type @var{vectype}.
> > +
> > +Return true if the operation is possible, emitting instructions for it
> > +if rtxes are provided and updating @var{output}.
> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} tree
> > TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> > @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})  This hook
> > should return the decl of a function that implements the  vectorized variant
> > of the function with the @code{combined_fn} code diff --git
> > a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> > 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04
> > 076d058c24ce093 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> > can generate better code.
> > 
> >  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> > 
> > +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +
> >  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> > 
> >  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> > diff --git a/gcc/explow.cc b/gcc/explow.cc index
> > ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f
> > 5e346bf34ba0036 100644
> > --- a/gcc/explow.cc
> > +++ b/gcc/explow.cc
> > @@ -1037,7 +1037,7 @@ round_push (rtx size)
> >       TRUNC_DIV_EXPR.  */
> >    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
> >  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> > +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> > + align_rtx,
> >  			NULL_RTX, 1);
> >    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> > 
> > @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> > required_align)
> >  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
> >  				       Pmode),
> >  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> > +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> > target,
> >  			  gen_int_mode (required_align / BITS_PER_UNIT,
> >  					Pmode),
> >  			  NULL_RTX, 1);
> > diff --git a/gcc/expmed.h b/gcc/expmed.h index
> > 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6
> > f33cb3595659b5 100644
> > --- a/gcc/expmed.h
> > +++ b/gcc/expmed.h
> > @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> > machine_mode,  extern rtx expand_shift (enum tree_code, machine_mode,
> > rtx, poly_int64, rtx,
> >  			 int);
> >  #ifdef GCC_OPTABS_H
> > -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> > -			  rtx, int, enum optab_methods =
> > OPTAB_LIB_WIDEN);
> > +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> > tree,
> > +			  rtx, rtx, rtx, int,
> > +			  enum optab_methods = OPTAB_LIB_WIDEN);
> >  #endif
> >  #endif
> > 
> > diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> > 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7
> > 333f271b1965b78 100644
> > --- a/gcc/expmed.cc
> > +++ b/gcc/expmed.cc
> > @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> > op0, HOST_WIDE_INT d)
> > 
> >  rtx
> >  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> > mode,
> > -	       rtx op0, rtx op1, rtx target, int unsignedp,
> > -	       enum optab_methods methods)
> > +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> > +	       int unsignedp, enum optab_methods methods)
> >  {
> >    machine_mode compute_mode;
> >    rtx tquotient;
> > @@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> > 
> >    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> > 
> > +  /* Check if the target has specific expansions for the division.  */
> > +  tree cst;
> > +  if (treeop0
> > +      && treeop1
> > +      && (cst = uniform_integer_cst_p (treeop1))
> > +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> > (treeop0),
> > +						     wi::to_wide (cst),
> > +						     &target, op0, op1))
> > +    return target;
> > +
> > +
> >    /* Now convert to the best mode to use.  */
> >    if (compute_mode != mode)
> >      {
> > @@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  			    || (optab_handler (sdivmod_optab, int_mode)
> >  				!= CODE_FOR_nothing)))
> >  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> > -						int_mode, op0,
> > -						gen_int_mode (abs_d,
> > +						int_mode, treeop0, treeop1,
> > +						op0, gen_int_mode (abs_d,
> >  							      int_mode),
> >  						NULL_RTX, 0);
> >  		    else
> > @@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  				      size - 1, NULL_RTX, 0);
> >  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
> >  				    NULL_RTX);
> > -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> > op1,
> > -				    NULL_RTX, 0);
> > +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> > treeop0,
> > +				    treeop1, t3, op1, NULL_RTX, 0);
> >  		if (t4)
> >  		  {
> >  		    rtx t5;
> > diff --git a/gcc/expr.cc b/gcc/expr.cc
> > index
> > 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96
> > a8abc055fa34d9 100644
> > --- a/gcc/expr.cc
> > +++ b/gcc/expr.cc
> > @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
> >  	    return expand_divmod (0,
> >  				  FLOAT_MODE_P (GET_MODE (value))
> >  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> > -				  GET_MODE (value), op1, op2, target, 0);
> > +				  GET_MODE (value), NULL, NULL, op1, op2,
> > +				  target, 0);
> >  	case MOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 0);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 0);
> >  	case UDIV:
> > -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case UMOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case ASHIFTRT:
> >  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
> >  				      target, 0, OPTAB_LIB_WIDEN);
> > @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> > machine_mode mode, tree treeop0,
> >        bool speed_p = optimize_insn_for_speed_p ();
> >        do_pending_stack_adjust ();
> >        start_sequence ();
> > -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> > +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 1);
> >        rtx_insn *uns_insns = get_insns ();
> >        end_sequence ();
> >        start_sequence ();
> > -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> > +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 0);
> >        rtx_insn *sgn_insns = get_insns ();
> >        end_sequence ();
> >        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 +9019,8
> > @@ expand_expr_divmod (tree_code code, machine_mode mode, tree
> > treeop0,
> >        emit_insn (sgn_insns);
> >        return sgn_ret;
> >      }
> > -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> > +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +			op0, op1, target, unsignedp);
> >  }
> > 
> >  rtx
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> > 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd
> > 872f340855dc96 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> > rtx op0, rtx op1, bool unsignedp)
> >  		return NULL_RTX;
> >  	    }
> >  	}
> > -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > sum,
> > -				     gen_int_mode (INTVAL (op1),
> > word_mode),
> > +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > NULL, NULL,
> > +				     sum, gen_int_mode (INTVAL (op1),
> > +							word_mode),
> >  				     NULL_RTX, 1, OPTAB_DIRECT);
> >        if (remainder == NULL_RTX)
> >  	return NULL_RTX;
> > @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> > 
> >    if (op11 != const1_rtx)
> >      {
> > -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> > -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> > quot1,
> > +				op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> > 
> > @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> > 
> > -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> > -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> > quot1,
> > +				 op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (quot2 == NULL_RTX)
> >  	return NULL_RTX;
> > 
> > diff --git a/gcc/target.def b/gcc/target.def index
> > 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dac
> > f18d0e0b133e4c 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -1902,6 +1902,25 @@ implementation approaches itself.",
> >  	const vec_perm_indices &sel),
> >   NULL)
> > 
> > +DEFHOOK
> > +(can_special_div_by_const,
> > + "This hook is used to test whether the target has a special method
> > +of\n\ division of vectors of type @var{vectype} using the value
> > +@var{constant},\n\ and producing a vector of type @var{vectype}.  The
> > +division\n\ will then not be decomposed by the vectorizer and kept as a div.\n\
> > +\n\ When the hook is being used to test whether the target supports a
> > +special\n\ divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook\n\ is being used to emit a division, @var{in0} and
> > +@var{in1} are the source\n\ vectors of type @var{vectype} and
> > +@var{output} is the destination vector of\n\ type @var{vectype}.\n\ \n\
> > +Return true if the operation is possible, emitting instructions for
> > +it\n\ if rtxes are provided and updating @var{output}.",  bool, (enum
> > +tree_code, tree vectype, wide_int constant, rtx *output,
> > +	rtx in0, rtx in1),
> > + default_can_special_div_by_const)
> > +
> >  /* Return true if the target supports misaligned store/load of a
> >     specific factor denoted in the third parameter.  The last parameter
> >     is true if the access is defined in a packed struct.  */ diff --git a/gcc/target.h
> > b/gcc/target.h index
> > d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56
> > f39c061f68b665 100644
> > --- a/gcc/target.h
> > +++ b/gcc/target.h
> > @@ -51,6 +51,7 @@
> >  #include "insn-codes.h"
> >  #include "tm.h"
> >  #include "hard-reg-set.h"
> > +#include "tree-core.h"
> > 
> >  #if CHECKING_P
> > 
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> > ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887
> > dd980b1a7c5cf1c 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> > (addr_space_t, location_t);  extern rtx default_addr_space_convert (rtx,
> > tree, tree);  extern unsigned int default_case_values_threshold (void);
> > extern bool default_have_conditional_execution (void);
> > +extern bool default_can_special_div_by_const (enum tree_code, tree,
> > wide_int,
> > +					      rtx *, rtx, rtx);
> > 
> >  extern bool default_libc_has_function (enum function_class, tree);  extern
> > bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc
> > b/gcc/targhooks.cc index
> > b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b6
> > 9fe04593ae3419e 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
> >    return HAVE_conditional_execution;
> >  }
> > 
> > +/* Default that no division by constant operations are special.  */
> > +bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
> > +rtx *, rtx,
> > +				  rtx)
> > +{
> > +  return false;
> > +}
> > +
> >  /* By default we assume that c99 functions are present at the runtime,
> >     but sincos is not.  */
> >  bool
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3
> > d7b4d5b64a19b9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint8_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3
> > db75b3e4112e2cc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint16_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720
> > 157701d9d1cf852
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> > +} } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > 832f28ebd07993e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 65
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N / 2, N);
> > +  fun2 (b, N / 2, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> > 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e7
> > 9d66aa24e2f3b 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator
> > *gsi, tree type, tree compute_type
> >  	  tree rhs2 = gimple_assign_rhs2 (assign);
> >  	  tree ret;
> > 
> > +	  /* Check if the target was going to handle it through the special
> > +	     division callback hook.  */
> > +	  tree cst = uniform_integer_cst_p (rhs2);
> > +	  if (cst &&
> > +	      targetm.vectorize.can_special_div_by_const (code, type,
> > +							  wi::to_wide (cst),
> > +							  NULL,
> > +							  NULL_RTX,
> > NULL_RTX))
> > +	    return NULL_TREE;
> > +
> > +
> >  	  if (!optimize
> >  	      || !VECTOR_INTEGER_TYPE_P (type)
> >  	      || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> > patterns.cc b/gcc/tree-vect-patterns.cc index
> > 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0
> > affd56e7314370 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >    gimple *pattern_stmt, *def_stmt;
> >    enum tree_code rhs_code;
> >    optab optab;
> > -  tree q;
> > +  tree q, cst;
> >    int dummy_int, prec;
> > 
> >    if (!is_gimple_assign (last_stmt))
> > @@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> > 
> >        return pattern_stmt;
> >      }
> > +  else if ((cst = uniform_integer_cst_p (oprnd1))
> > +	   && targetm.vectorize.can_special_div_by_const (rhs_code,
> > vectype,
> > +							  wi::to_wide (cst),
> > +							  NULL, NULL_RTX,
> > +							  NULL_RTX))
> > +    {
> > +      return NULL;
> > +    }
> > 
> >    if (prec > HOST_BITS_PER_WIDE_INT
> >        || integer_zerop (oprnd1))
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> > c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e8
> > 3b67e3ea53d25f4 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
> >  	}
> >        target_support_p = (optab_handler (optab, vec_mode)
> >  			  != CODE_FOR_nothing);
> > +      tree cst;
> > +      if (!target_support_p
> > +	  && (cst = uniform_integer_cst_p (op1)))
> > +	target_support_p
> > +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> > +							wi::to_wide (cst),
> > +							NULL, NULL_RTX,
> > +							NULL_RTX);
> >      }
> > 
> >    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-11-09  8:01       ` Richard Biener
@ 2022-11-09  8:26         ` Tamar Christina
  0 siblings, 0 replies; 35+ messages in thread
From: Tamar Christina @ 2022-11-09  8:26 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, jeffreyalaw

[-- Attachment #1: Type: text/plain, Size: 26281 bytes --]

Ah sorry, I missed that one.

Thanks,
Tamar

________________________________
From: Richard Biener <rguenther@suse.de>
Sent: Wednesday, November 9, 2022 8:01 AM
To: Tamar Christina <Tamar.Christina@arm.com>
Cc: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>; jeffreyalaw@gmail.com <jeffreyalaw@gmail.com>
Subject: RE: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.

On Tue, 8 Nov 2022, Tamar Christina wrote:

> Ping.

Jeff approved this already.  I think it's OK if the rest of the series
is approved.

Richard.

> > -----Original Message-----
> > From: Tamar Christina
> > Sent: Monday, October 31, 2022 11:35 AM
> > To: Richard Biener <rguenther@suse.de>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; jeffreyalaw@gmail.com
> > Subject: RE: [PATCH 1/4]middle-end Support not decomposing specific
> > divisions during vectorization.
> >
> > >
> > > The type of the expression should be available via the mode and the
> > > signedness, no?  So maybe to avoid having both RTX and TREE on the
> > > target hook pass it a wide_int instead for the divisor?
> > >
> >
> > Done.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >      * expmed.h (expand_divmod): Pass tree operands down in addition
> > to RTX.
> >      * expmed.cc (expand_divmod): Likewise.
> >      * explow.cc (round_push, align_dynamic_address): Likewise.
> >      * expr.cc (force_operand, expand_expr_divmod): Likewise.
> >      * optabs.cc (expand_doubleword_mod,
> > expand_doubleword_divmod):
> >      Likewise.
> >      * target.h: Include tree-core.
> >      * target.def (can_special_div_by_const): New.
> >      * targhooks.cc (default_can_special_div_by_const): New.
> >      * targhooks.h (default_can_special_div_by_const): New.
> >      * tree-vect-generic.cc (expand_vector_operation): Use it.
> >      * doc/tm.texi.in: Document it.
> >      * doc/tm.texi: Regenerate.
> >      * tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for
> > support.
> >      * tree-vect-stmts.cc (vectorizable_operation): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >      * gcc.dg/vect/vect-div-bitmask-1.c: New test.
> >      * gcc.dg/vect/vect-div-bitmask-2.c: New test.
> >      * gcc.dg/vect/vect-div-bitmask-3.c: New test.
> >      * gcc.dg/vect/vect-div-bitmask.h: New file.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> > 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..a29f5c39be3f0927f8ef6e094
> > c7a712c0604fb77 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the hook
> > to handle these two  implementation approaches itself.
> >  @end deftypefn
> >
> > +@deftypefn {Target Hook} bool
> > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +(enum @var{tree_code}, tree @var{vectype}, wide_int @var{constant}, rtx
> > +*@var{output}, rtx @var{in0}, rtx @var{in1}) This hook is used to test
> > +whether the target has a special method of division of vectors of type
> > +@var{vectype} using the value @var{constant}, and producing a vector of
> > type @var{vectype}.  The division will then not be decomposed by the and
> > kept as a div.
> > +
> > +When the hook is being used to test whether the target supports a
> > +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook is being used to emit a division, @var{in0} and @var{in1}
> > +are the source vectors of type @var{vectype} and @var{output} is the
> > +destination vector of type @var{vectype}.
> > +
> > +Return true if the operation is possible, emitting instructions for it
> > +if rtxes are provided and updating @var{output}.
> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} tree
> > TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> > @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})  This hook
> > should return the decl of a function that implements the  vectorized variant
> > of the function with the @code{combined_fn} code diff --git
> > a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> > 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04
> > 076d058c24ce093 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> > can generate better code.
> >
> >  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> >
> > +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +
> >  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> >
> >  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> > diff --git a/gcc/explow.cc b/gcc/explow.cc index
> > ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f
> > 5e346bf34ba0036 100644
> > --- a/gcc/explow.cc
> > +++ b/gcc/explow.cc
> > @@ -1037,7 +1037,7 @@ round_push (rtx size)
> >       TRUNC_DIV_EXPR.  */
> >    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
> >                     NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> > +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> > + align_rtx,
> >                      NULL_RTX, 1);
> >    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> >
> > @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> > required_align)
> >                       gen_int_mode (required_align / BITS_PER_UNIT - 1,
> >                                     Pmode),
> >                       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> > +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> > target,
> >                        gen_int_mode (required_align / BITS_PER_UNIT,
> >                                      Pmode),
> >                        NULL_RTX, 1);
> > diff --git a/gcc/expmed.h b/gcc/expmed.h index
> > 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6
> > f33cb3595659b5 100644
> > --- a/gcc/expmed.h
> > +++ b/gcc/expmed.h
> > @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> > machine_mode,  extern rtx expand_shift (enum tree_code, machine_mode,
> > rtx, poly_int64, rtx,
> >                       int);
> >  #ifdef GCC_OPTABS_H
> > -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> > -                     rtx, int, enum optab_methods =
> > OPTAB_LIB_WIDEN);
> > +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> > tree,
> > +                     rtx, rtx, rtx, int,
> > +                     enum optab_methods = OPTAB_LIB_WIDEN);
> >  #endif
> >  #endif
> >
> > diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> > 8d7418be418406e72a895ecddf2dc7fdb950c76c..bab020c07222afa38305ef8d7
> > 333f271b1965b78 100644
> > --- a/gcc/expmed.cc
> > +++ b/gcc/expmed.cc
> > @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> > op0, HOST_WIDE_INT d)
> >
> >  rtx
> >  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> > mode,
> > -          rtx op0, rtx op1, rtx target, int unsignedp,
> > -          enum optab_methods methods)
> > +          tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> > +          int unsignedp, enum optab_methods methods)
> >  {
> >    machine_mode compute_mode;
> >    rtx tquotient;
> > @@ -4375,6 +4375,17 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >
> >    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> >
> > +  /* Check if the target has specific expansions for the division.  */
> > +  tree cst;
> > +  if (treeop0
> > +      && treeop1
> > +      && (cst = uniform_integer_cst_p (treeop1))
> > +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> > (treeop0),
> > +                                                wi::to_wide (cst),
> > +                                                &target, op0, op1))
> > +    return target;
> > +
> > +
> >    /* Now convert to the best mode to use.  */
> >    if (compute_mode != mode)
> >      {
> > @@ -4618,8 +4629,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >                          || (optab_handler (sdivmod_optab, int_mode)
> >                              != CODE_FOR_nothing)))
> >                    quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> > -                                           int_mode, op0,
> > -                                           gen_int_mode (abs_d,
> > +                                           int_mode, treeop0, treeop1,
> > +                                           op0, gen_int_mode (abs_d,
> >                                                            int_mode),
> >                                              NULL_RTX, 0);
> >                  else
> > @@ -4808,8 +4819,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >                                    size - 1, NULL_RTX, 0);
> >              t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
> >                                  NULL_RTX);
> > -           t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> > op1,
> > -                               NULL_RTX, 0);
> > +           t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> > treeop0,
> > +                               treeop1, t3, op1, NULL_RTX, 0);
> >              if (t4)
> >                {
> >                  rtx t5;
> > diff --git a/gcc/expr.cc b/gcc/expr.cc
> > index
> > 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96
> > a8abc055fa34d9 100644
> > --- a/gcc/expr.cc
> > +++ b/gcc/expr.cc
> > @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
> >          return expand_divmod (0,
> >                                FLOAT_MODE_P (GET_MODE (value))
> >                                ? RDIV_EXPR : TRUNC_DIV_EXPR,
> > -                             GET_MODE (value), op1, op2, target, 0);
> > +                             GET_MODE (value), NULL, NULL, op1, op2,
> > +                             target, 0);
> >      case MOD:
> > -     return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -                           target, 0);
> > +     return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +                           op1, op2, target, 0);
> >      case UDIV:
> > -     return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > op1, op2,
> > -                           target, 1);
> > +     return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +                           op1, op2, target, 1);
> >      case UMOD:
> > -     return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -                           target, 1);
> > +     return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +                           op1, op2, target, 1);
> >      case ASHIFTRT:
> >        return expand_simple_binop (GET_MODE (value), code, op1, op2,
> >                                    target, 0, OPTAB_LIB_WIDEN);
> > @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> > machine_mode mode, tree treeop0,
> >        bool speed_p = optimize_insn_for_speed_p ();
> >        do_pending_stack_adjust ();
> >        start_sequence ();
> > -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> > +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +                              op0, op1, target, 1);
> >        rtx_insn *uns_insns = get_insns ();
> >        end_sequence ();
> >        start_sequence ();
> > -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> > +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +                              op0, op1, target, 0);
> >        rtx_insn *sgn_insns = get_insns ();
> >        end_sequence ();
> >        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 +9019,8
> > @@ expand_expr_divmod (tree_code code, machine_mode mode, tree
> > treeop0,
> >        emit_insn (sgn_insns);
> >        return sgn_ret;
> >      }
> > -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> > +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +                   op0, op1, target, unsignedp);
> >  }
> >
> >  rtx
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> > 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd
> > 872f340855dc96 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> > rtx op0, rtx op1, bool unsignedp)
> >              return NULL_RTX;
> >          }
> >      }
> > -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > sum,
> > -                                gen_int_mode (INTVAL (op1),
> > word_mode),
> > +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > NULL, NULL,
> > +                                sum, gen_int_mode (INTVAL (op1),
> > +                                                   word_mode),
> >                                   NULL_RTX, 1, OPTAB_DIRECT);
> >        if (remainder == NULL_RTX)
> >      return NULL_RTX;
> > @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> >
> >    if (op11 != const1_rtx)
> >      {
> > -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> > -                           NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> > quot1,
> > +                           op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (rem2 == NULL_RTX)
> >      return NULL_RTX;
> >
> > @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> >        if (rem2 == NULL_RTX)
> >      return NULL_RTX;
> >
> > -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> > -                            NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> > quot1,
> > +                            op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (quot2 == NULL_RTX)
> >      return NULL_RTX;
> >
> > diff --git a/gcc/target.def b/gcc/target.def index
> > 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..f491e2233cf18760631f148dac
> > f18d0e0b133e4c 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -1902,6 +1902,25 @@ implementation approaches itself.",
> >      const vec_perm_indices &sel),
> >   NULL)
> >
> > +DEFHOOK
> > +(can_special_div_by_const,
> > + "This hook is used to test whether the target has a special method
> > +of\n\ division of vectors of type @var{vectype} using the value
> > +@var{constant},\n\ and producing a vector of type @var{vectype}.  The
> > +division\n\ will then not be decomposed by the vectorizer and kept as a div.\n\
> > +\n\ When the hook is being used to test whether the target supports a
> > +special\n\ divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook\n\ is being used to emit a division, @var{in0} and
> > +@var{in1} are the source\n\ vectors of type @var{vectype} and
> > +@var{output} is the destination vector of\n\ type @var{vectype}.\n\ \n\
> > +Return true if the operation is possible, emitting instructions for
> > +it\n\ if rtxes are provided and updating @var{output}.",  bool, (enum
> > +tree_code, tree vectype, wide_int constant, rtx *output,
> > +   rtx in0, rtx in1),
> > + default_can_special_div_by_const)
> > +
> >  /* Return true if the target supports misaligned store/load of a
> >     specific factor denoted in the third parameter.  The last parameter
> >     is true if the access is defined in a packed struct.  */ diff --git a/gcc/target.h
> > b/gcc/target.h index
> > d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56
> > f39c061f68b665 100644
> > --- a/gcc/target.h
> > +++ b/gcc/target.h
> > @@ -51,6 +51,7 @@
> >  #include "insn-codes.h"
> >  #include "tm.h"
> >  #include "hard-reg-set.h"
> > +#include "tree-core.h"
> >
> >  #if CHECKING_P
> >
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> > ecce55ebe797cedc940620e8d89816973a045d49..c8df2af02b9d8c41d953b7887
> > dd980b1a7c5cf1c 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> > (addr_space_t, location_t);  extern rtx default_addr_space_convert (rtx,
> > tree, tree);  extern unsigned int default_case_values_threshold (void);
> > extern bool default_have_conditional_execution (void);
> > +extern bool default_can_special_div_by_const (enum tree_code, tree,
> > wide_int,
> > +                                         rtx *, rtx, rtx);
> >
> >  extern bool default_libc_has_function (enum function_class, tree);  extern
> > bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc
> > b/gcc/targhooks.cc index
> > b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..f941b1c218d3c4de8b7f780b6
> > 9fe04593ae3419e 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
> >    return HAVE_conditional_execution;
> >  }
> >
> > +/* Default that no division by constant operations are special.  */
> > +bool default_can_special_div_by_const (enum tree_code, tree, wide_int,
> > +rtx *, rtx,
> > +                             rtx)
> > +{
> > +  return false;
> > +}
> > +
> >  /* By default we assume that c99 functions are present at the runtime,
> >     but sincos is not.  */
> >  bool
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3
> > d7b4d5b64a19b9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint8_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3
> > db75b3e4112e2cc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint16_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720
> > 157701d9d1cf852
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> > +} } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > new file mode 100644
> > index
> > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > 832f28ebd07993e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 65
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N / 2, N);
> > +  fun2 (b, N / 2, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> > 350129555a0c71c0896c4f1003163f3b3557c11b..6ad6372c55eef94a742a8fa35e7
> > 9d66aa24e2f3b 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -1237,6 +1237,17 @@ expand_vector_operation (gimple_stmt_iterator
> > *gsi, tree type, tree compute_type
> >        tree rhs2 = gimple_assign_rhs2 (assign);
> >        tree ret;
> >
> > +     /* Check if the target was going to handle it through the special
> > +        division callback hook.  */
> > +     tree cst = uniform_integer_cst_p (rhs2);
> > +     if (cst &&
> > +         targetm.vectorize.can_special_div_by_const (code, type,
> > +                                                     wi::to_wide (cst),
> > +                                                     NULL,
> > +                                                     NULL_RTX,
> > NULL_RTX))
> > +       return NULL_TREE;
> > +
> > +
> >        if (!optimize
> >            || !VECTOR_INTEGER_TYPE_P (type)
> >            || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> > patterns.cc b/gcc/tree-vect-patterns.cc index
> > 09574bb1a2696b3438a4ce9f09f74b42e784aca0..e91bcef56fff931a7a7ba534a0
> > affd56e7314370 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -3432,7 +3432,7 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >    gimple *pattern_stmt, *def_stmt;
> >    enum tree_code rhs_code;
> >    optab optab;
> > -  tree q;
> > +  tree q, cst;
> >    int dummy_int, prec;
> >
> >    if (!is_gimple_assign (last_stmt))
> > @@ -3596,6 +3596,14 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >
> >        return pattern_stmt;
> >      }
> > +  else if ((cst = uniform_integer_cst_p (oprnd1))
> > +      && targetm.vectorize.can_special_div_by_const (rhs_code,
> > vectype,
> > +                                                     wi::to_wide (cst),
> > +                                                     NULL, NULL_RTX,
> > +                                                     NULL_RTX))
> > +    {
> > +      return NULL;
> > +    }
> >
> >    if (prec > HOST_BITS_PER_WIDE_INT
> >        || integer_zerop (oprnd1))
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> > c9dab217f059f17e91e9a7582523e627d7a45b66..1399c22ba0df75f582887d7e8
> > 3b67e3ea53d25f4 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -6260,6 +6260,14 @@ vectorizable_operation (vec_info *vinfo,
> >      }
> >        target_support_p = (optab_handler (optab, vec_mode)
> >                        != CODE_FOR_nothing);
> > +      tree cst;
> > +      if (!target_support_p
> > +     && (cst = uniform_integer_cst_p (op1)))
> > +   target_support_p
> > +     = targetm.vectorize.can_special_div_by_const (code, vectype,
> > +                                                   wi::to_wide (cst),
> > +                                                   NULL, NULL_RTX,
> > +                                                   NULL_RTX);
> >      }
> >
> >    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
>

--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division.
  2022-10-31 11:34   ` Tamar Christina
@ 2022-11-09  8:33     ` Tamar Christina
  2022-11-09 16:02     ` Kyrylo Tkachov
  1 sibling, 0 replies; 35+ messages in thread
From: Tamar Christina @ 2022-11-09  8:33 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov,
	Richard Sandiford

Ping

> -----Original Message-----
> From: Tamar Christina
> Sent: Monday, October 31, 2022 11:35 AM
> To: 'Tamar Christina' <tamar.christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: RE: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask
> division.
> 
> Hi All,
> 
> Ping, and updated patch based on mid-end changes.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-simd.md
> (@aarch64_bitmask_udiv<mode>3): New.
> 	* config/aarch64/aarch64.cc
> (aarch64_vectorize_can_special_div_by_constant): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/div-by-bitmask.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0b
> a6386c1ab50f77e 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4831,6 +4831,65 @@ (define_expand
> "aarch64_<sur><addsub>hn2<mode>"
>    }
>  )
> 
> +;; div optimizations using narrowings
> +;; we can do the division e.g. shorts by 255 faster by calculating it
> +as ;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in ;;
> +double the precision of x.
> +;;
> +;; If we imagine a short as being composed of two blocks of bytes then
> +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to ;;
> +adding 1 to each sub component:
> +;;
> +;;      short value of 16-bits
> +;; ┌──────────────┬────────────────┐
> +;; │              │                │
> +;; └──────────────┴────────────────┘
> +;;   8-bit part1 ▲  8-bit part2   ▲
> +;;               │                │
> +;;               │                │
> +;;              +1               +1
> +;;
> +;; after the first addition, we have to shift right by 8, and narrow
> +the ;; results back to a byte.  Remember that the addition must be done
> +in ;; double the precision of the input.  Since 8 is half the size of a
> +short ;; we can use a narrowing halving instruction in AArch64, addhn
> +which also ;; does the addition in a wider precision and narrows back
> +to a byte.  The ;; shift itself is implicit in the operation as it
> +writes back only the top ;; half of the result. i.e. bits 2*esize-1:esize.
> +;;
> +;; Since we have narrowed the result of the first part back to a byte,
> +for ;; the second addition we can use a widening addition, uaddw.
> +;;
> +;; For the final shift, since it's unsigned arithmetic we emit an ushr
> +by 8 ;; to shift and the vectorizer.
> +;;
> +;; The shift is later optimized by combine to a uzp2 with movi #0.
> +(define_expand "@aarch64_bitmask_udiv<mode>3"
> +  [(match_operand:VQN 0 "register_operand")
> +   (match_operand:VQN 1 "register_operand")
> +   (match_operand:VQN 2 "immediate_operand")]
> +  "TARGET_SIMD"
> +{
> +  unsigned HOST_WIDE_INT size
> +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> +  if (!CONST_VECTOR_P (operands[2])
> +      || const_vector_encoded_nelts (operands[2]) != 1
> +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> +    FAIL;
> +
> +  rtx addend = gen_reg_rtx (<MODE>mode);
> +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
> +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val,
> +<VNARROWQ2>mode));
> +  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> +  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> +  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> +  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> +bitsize);
> +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
> +  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2,
> +shift_vector));
> +  DONE;
> +})
> +
>  ;; pmul.
> 
>  (define_insn "aarch64_pmul<mode>"
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..d3c3650d7d728f56adb65154
> 127dc7b72386c5a7 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24146,6 +24146,40 @@ aarch64_vectorize_vec_perm_const
> (machine_mode vmode, machine_mode op_mode,
>    return ret;
>  }
> 
> +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> +
> +bool
> +aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> +					       tree vectype, wide_int cst,
> +					       rtx *output, rtx in0, rtx in1) {
> +  if (code != TRUNC_DIV_EXPR
> +      || !TYPE_UNSIGNED (vectype))
> +    return false;
> +
> +  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE
> + (vectype));  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> +    return false;
> +
> +  if (in0 == NULL_RTX && in1 == NULL_RTX)
> +    {
> +      wide_int val = wi::add (cst, 1);
> +      int pow = wi::exact_log2 (val);
> +      return pow == (int)(element_precision (vectype) / 2);
> +    }
> +
> +  if (!VECTOR_TYPE_P (vectype))
> +   return false;
> +
> +  gcc_assert (output);
> +
> +  if (!*output)
> +    *output = gen_reg_rtx (TYPE_MODE (vectype));
> +
> +  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output,
> +in0, in1));
> +  return true;
> +}
> +
>  /* Generate a byte permute mask for a register of mode MODE,
>     which has NUNITS units.  */
> 
> @@ -27606,6 +27640,10 @@ aarch64_libgcc_floating_mode_supported_p
>  #undef TARGET_VECTOR_ALIGNMENT
>  #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
> 
> +#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
> +  aarch64_vectorize_can_special_div_by_constant
> +
>  #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
>  #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
>    aarch64_vectorize_preferred_vector_alignment
> diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44a
> b211cd246d82d5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } }
> +*/
> +
> +#include <stdint.h>
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> +** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff; }
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xfe; }
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> +** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> 
> > -----Original Message-----
> > From: Tamar Christina <tamar.christina@arm.com>
> > Sent: Friday, September 23, 2022 10:34 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> > <Richard.Sandiford@arm.com>
> > Subject: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask
> division.
> >
> > Hi All,
> >
> > This adds an implementation for the new optab for unsigned pow2
> > bitmask for AArch64.
> >
> > The implementation rewrites:
> >
> >    x = y / (2 ^ (sizeof (y)/2)-1
> >
> > into e.g. (for bytes)
> >
> >    (x + ((x + 257) >> 8)) >> 8
> >
> > where it's required that the additions be done in double the precision
> > of x such that we don't lose any bits during an overflow.
> >
> > Essentially the sequence decomposes the division into doing two
> > smaller divisions, one for the top and bottom parts of the number and
> > adding the results back together.
> >
> > To account for the fact that shift by 8 would be division by 256 we
> > add 1 to both parts of x such that when 255 we still get 1 as the answer.
> >
> > Because the amount we shift are half the original datatype we can use
> > the halfing instructions the ISA provides to do the operation instead
> > of using actual shifts.
> >
> > For AArch64 this means we generate for:
> >
> > void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> >   for (int i = 0; i < (n & -16); i+=1)
> >     pixel[i] = (pixel[i] * level) / 0xff; }
> >
> > the following:
> >
> > 	movi    v3.16b, 0x1
> > 	umull2  v1.8h, v0.16b, v2.16b
> > 	umull   v0.8h, v0.8b, v2.8b
> > 	addhn   v5.8b, v1.8h, v3.8h
> > 	addhn   v4.8b, v0.8h, v3.8h
> > 	uaddw   v1.8h, v1.8h, v5.8b
> > 	uaddw   v0.8h, v0.8h, v4.8b
> > 	uzp2    v0.16b, v0.16b, v1.16b
> >
> > instead of:
> >
> > 	umull   v2.8h, v1.8b, v5.8b
> > 	umull2  v1.8h, v1.16b, v5.16b
> > 	umull   v0.4s, v2.4h, v3.4h
> > 	umull2  v2.4s, v2.8h, v3.8h
> > 	umull   v4.4s, v1.4h, v3.4h
> > 	umull2  v1.4s, v1.8h, v3.8h
> > 	uzp2    v0.8h, v0.8h, v2.8h
> > 	uzp2    v1.8h, v4.8h, v1.8h
> > 	shrn    v0.8b, v0.8h, 7
> > 	shrn2   v0.16b, v1.8h, 7
> >
> > Which results in significantly faster code.
> >
> > Thanks to Wilco for the concept.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-simd.md
> > (@aarch64_bitmask_udiv<mode>3): New.
> > 	* config/aarch64/aarch64.cc
> > (aarch64_vectorize_can_special_div_by_constant): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/div-by-bitmask.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index
> >
> 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0b
> > a6386c1ab50f77e 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -4831,6 +4831,65 @@ (define_expand
> > "aarch64_<sur><addsub>hn2<mode>"
> >    }
> >  )
> >
> > +;; div optimizations using narrowings ;; we can do the division e.g.
> > +shorts by 255 faster by calculating it as ;; (x + ((x + 257) >> 8))
> > +>> 8 assuming the operation is done in ;; double the precision of x.
> > +;;
> > +;; If we imagine a short as being composed of two blocks of bytes
> > +then ;; adding 257 or 0b0000_0001_0000_0001 to the number is
> > +equivalent to ;; adding 1 to each sub component:
> > +;;
> > +;;      short value of 16-bits
> > +;; ┌──────────────┬────────────────┐
> > +;; │              │                │
> > +;; └──────────────┴────────────────┘
> > +;;   8-bit part1 ▲  8-bit part2   ▲
> > +;;               │                │
> > +;;               │                │
> > +;;              +1               +1
> > +;;
> > +;; after the first addition, we have to shift right by 8, and narrow
> > +the ;; results back to a byte.  Remember that the addition must be
> > +done in ;; double the precision of the input.  Since 8 is half the
> > +size of a short ;; we can use a narrowing halving instruction in
> > +AArch64, addhn which also ;; does the addition in a wider precision
> > +and narrows back to a byte.  The ;; shift itself is implicit in the
> > +operation as it writes back only the top ;; half of the result. i.e. bits
> 2*esize-1:esize.
> > +;;
> > +;; Since we have narrowed the result of the first part back to a
> > +byte, for ;; the second addition we can use a widening addition, uaddw.
> > +;;
> > +;; For the final shift, since it's unsigned arithmetic we emit an
> > +ushr by 8 ;; to shift and the vectorizer.
> > +;;
> > +;; The shift is later optimized by combine to a uzp2 with movi #0.
> > +(define_expand "@aarch64_bitmask_udiv<mode>3"
> > +  [(match_operand:VQN 0 "register_operand")
> > +   (match_operand:VQN 1 "register_operand")
> > +   (match_operand:VQN 2 "immediate_operand")]
> > +  "TARGET_SIMD"
> > +{
> > +  unsigned HOST_WIDE_INT size
> > +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> > +  if (!CONST_VECTOR_P (operands[2])
> > +      || const_vector_encoded_nelts (operands[2]) != 1
> > +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> > +    FAIL;
> > +
> > +  rtx addend = gen_reg_rtx (<MODE>mode);
> > +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode,
> 1);
> > +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val,
> > +<VNARROWQ2>mode));
> > +  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> > +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> > +  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> > +  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> > +  rtx shift_vector = aarch64_simd_gen_const_vector_dup
> (<MODE>mode,
> > +bitsize);
> > +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1],
> tmp1));
> > +  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2,
> > +shift_vector));
> > +  DONE;
> > +})
> > +
> >  ;; pmul.
> >
> >  (define_insn "aarch64_pmul<mode>"
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index
> >
> 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc3
> > 7484b6fc6901bfb4 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const
> > (machine_mode vmode, machine_mode op_mode,
> >    return ret;
> >  }
> >
> > +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> > +
> > +bool
> > +aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> > +					       tree vectype,
> > +					       tree treeop0, tree treeop1,
> > +					       rtx *output, rtx in0, rtx in1) {
> > +
> > +  if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX))
> > +    return false;
> > +
> > +  tree cst = uniform_integer_cst_p (treeop1);  tree type;  if (code
> > + != TRUNC_DIV_EXPR
> > +      || !cst
> > +      || !TYPE_UNSIGNED ((type = TREE_TYPE (cst)))
> > +      || tree_int_cst_sgn (cst) != 1)
> > +    return false;
> > +
> > +  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE
> > + (vectype));  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> > +    return false;
> > +
> > +  if (in0 == NULL_RTX && in1 == NULL_RTX)
> > +    {
> > +      gcc_assert (treeop0 && treeop1);
> > +      wide_int icst = wi::to_wide (cst);
> > +      wide_int val = wi::add (icst, 1);
> > +      int pow = wi::exact_log2 (val);
> > +      return pow == (TYPE_PRECISION (type) / 2);
> > +    }
> > +
> > +  if (!VECTOR_TYPE_P (vectype))
> > +   return false;
> > +
> > +  gcc_assert (output);
> > +
> > +  if (!*output)
> > +    *output = gen_reg_rtx (TYPE_MODE (vectype));
> > +
> > +  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype),
> *output,
> > +in0, in1));
> > +  return true;
> > +}
> > +
> >  /* Generate a byte permute mask for a register of mode MODE,
> >     which has NUNITS units.  */
> >
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> >
> 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d2
> > 44a2a23e76cac097 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the
> > hook to handle these two  implementation approaches itself.
> >  @end deftypefn
> >
> > +@deftypefn {Target Hook} bool
> > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +(enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree
> > +@var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1}) This
> > +hook is used to test whether the target has a special method of
> > +division of vectors of type @var{vectype} using the two operands
> > @code{treeop0}, and @code{treeop1} and producing a vector of type
> > @var{vectype}.  The division will then not be decomposed by the
> > middle-end and kept as a div.
> > +
> > +When the hook is being used to test whether the target supports a
> > +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook is being used to emit a division, @var{in0} and
> > +@var{in1} are the source vectors of type @var{vectype} and
> > +@var{output} is the destination vector of type @var{vectype}.
> > +
> > +Return true if the operation is possible, emitting instructions for
> > +it if rtxes are provided and updating @var{output}.
> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} tree
> > TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> @var{code},
> > tree @var{vec_type_out}, tree @var{vec_type_in})  This hook should
> > return the decl of a function that implements the  vectorized variant
> > of the function with the @code{combined_fn} code diff --git
> > a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> >
> 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04
> > 076d058c24ce093 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent
> strategy
> > can generate better code.
> >
> >  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> >
> > +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +
> >  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> >
> >  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> > diff --git a/gcc/explow.cc b/gcc/explow.cc index
> >
> ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f
> > 5e346bf34ba0036 100644
> > --- a/gcc/explow.cc
> > +++ b/gcc/explow.cc
> > @@ -1037,7 +1037,7 @@ round_push (rtx size)
> >       TRUNC_DIV_EXPR.  */
> >    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
> >  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> > +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> > + align_rtx,
> >  			NULL_RTX, 1);
> >    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> >
> > @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> > required_align)
> >  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
> >  				       Pmode),
> >  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> > +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> > target,
> >  			  gen_int_mode (required_align / BITS_PER_UNIT,
> >  					Pmode),
> >  			  NULL_RTX, 1);
> > diff --git a/gcc/expmed.h b/gcc/expmed.h index
> >
> 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6
> > f33cb3595659b5 100644
> > --- a/gcc/expmed.h
> > +++ b/gcc/expmed.h
> > @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> > machine_mode,  extern rtx expand_shift (enum tree_code,
> machine_mode,
> > rtx, poly_int64, rtx,
> >  			 int);
> >  #ifdef GCC_OPTABS_H
> > -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> > -			  rtx, int, enum optab_methods =
> > OPTAB_LIB_WIDEN);
> > +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> > tree,
> > +			  rtx, rtx, rtx, int,
> > +			  enum optab_methods = OPTAB_LIB_WIDEN);
> >  #endif
> >  #endif
> >
> > diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> >
> 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb09
> > 90db8b97d3af414 100644
> > --- a/gcc/expmed.cc
> > +++ b/gcc/expmed.cc
> > @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> op0,
> > HOST_WIDE_INT d)
> >
> >  rtx
> >  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> mode,
> > -	       rtx op0, rtx op1, rtx target, int unsignedp,
> > -	       enum optab_methods methods)
> > +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> > +	       int unsignedp, enum optab_methods methods)
> >  {
> >    machine_mode compute_mode;
> >    rtx tquotient;
> > @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >
> >    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> >
> > +  /* Check if the target has specific expansions for the division.
> > + */  if (treeop0
> > +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> > (treeop0),
> > +						     treeop0, treeop1,
> > +						     &target, op0, op1))
> > +    return target;
> > +
> > +
> >    /* Now convert to the best mode to use.  */
> >    if (compute_mode != mode)
> >      {
> > @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  			    || (optab_handler (sdivmod_optab, int_mode)
> >  				!= CODE_FOR_nothing)))
> >  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> > -						int_mode, op0,
> > -						gen_int_mode (abs_d,
> > +						int_mode, treeop0, treeop1,
> > +						op0, gen_int_mode (abs_d,
> >  							      int_mode),
> >  						NULL_RTX, 0);
> >  		    else
> > @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  				      size - 1, NULL_RTX, 0);
> >  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
> >  				    NULL_RTX);
> > -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> > op1,
> > -				    NULL_RTX, 0);
> > +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> > treeop0,
> > +				    treeop1, t3, op1, NULL_RTX, 0);
> >  		if (t4)
> >  		  {
> >  		    rtx t5;
> > diff --git a/gcc/expr.cc b/gcc/expr.cc index
> >
> 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96
> > a8abc055fa34d9 100644
> > --- a/gcc/expr.cc
> > +++ b/gcc/expr.cc
> > @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
> >  	    return expand_divmod (0,
> >  				  FLOAT_MODE_P (GET_MODE (value))
> >  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> > -				  GET_MODE (value), op1, op2, target, 0);
> > +				  GET_MODE (value), NULL, NULL, op1, op2,
> > +				  target, 0);
> >  	case MOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 0);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 0);
> >  	case UDIV:
> > -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case UMOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case ASHIFTRT:
> >  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
> >  				      target, 0, OPTAB_LIB_WIDEN); @@ -
> 8990,11 +8991,13 @@
> > expand_expr_divmod (tree_code code, machine_mode mode, tree
> treeop0,
> >        bool speed_p = optimize_insn_for_speed_p ();
> >        do_pending_stack_adjust ();
> >        start_sequence ();
> > -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target,
> 1);
> > +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 1);
> >        rtx_insn *uns_insns = get_insns ();
> >        end_sequence ();
> >        start_sequence ();
> > -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target,
> 0);
> > +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 0);
> >        rtx_insn *sgn_insns = get_insns ();
> >        end_sequence ();
> >        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7
> > +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode
> mode, tree
> > treeop0,
> >        emit_insn (sgn_insns);
> >        return sgn_ret;
> >      }
> > -  return expand_divmod (mod_p, code, mode, op0, op1, target,
> > unsignedp);
> > +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +			op0, op1, target, unsignedp);
> >  }
> >
> >  rtx
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> >
> 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd
> > 872f340855dc96 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode
> mode, rtx
> > op0, rtx op1, bool unsignedp)
> >  		return NULL_RTX;
> >  	    }
> >  	}
> > -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > sum,
> > -				     gen_int_mode (INTVAL (op1),
> > word_mode),
> > +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > NULL, NULL,
> > +				     sum, gen_int_mode (INTVAL (op1),
> > +							word_mode),
> >  				     NULL_RTX, 1, OPTAB_DIRECT);
> >        if (remainder == NULL_RTX)
> >  	return NULL_RTX;
> > @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx
> > op0, rtx op1, rtx *rem,
> >
> >    if (op11 != const1_rtx)
> >      {
> > -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1,
> op11,
> > -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL,
> NULL,
> > quot1,
> > +				op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> >
> > @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx
> > op0, rtx op1, rtx *rem,
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> >
> > -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> > -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> > quot1,
> > +				 op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (quot2 == NULL_RTX)
> >  	return NULL_RTX;
> >
> > diff --git a/gcc/target.def b/gcc/target.def index
> >
> 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b070
> > 81cdd70113db9b1 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -1902,6 +1902,25 @@ implementation approaches itself.",
> >  	const vec_perm_indices &sel),
> >   NULL)
> >
> > +DEFHOOK
> > +(can_special_div_by_const,
> > + "This hook is used to test whether the target has a special method
> > +of\n\ division of vectors of type @var{vectype} using the two
> > +operands @code{treeop0},\n\ and @code{treeop1} and producing a
> vector
> > +of type @var{vectype}.  The division\n\ will then not be decomposed
> > +by the middle-end and kept as a div.\n\ \n\ When the hook is being used to test
> > +whether the target supports a special\n\ divide, @var{in0},
> > +@var{in1}, and @var{output} are all null.  When the hook\n\ is being
> > +used to emit a division, @var{in0} and @var{in1} are the source\n\
> > +vectors of type @var{vectype} and @var{output} is the destination
> > +vector of\n\ type @var{vectype}.\n\ \n\ Return true if the operation
> > +is possible, emitting instructions for it\n\ if rtxes are provided
> > +and updating @var{output}.",  bool, (enum tree_code, tree vectype,
> > +tree treeop0, tree treeop1, rtx *output,
> > +	rtx in0, rtx in1),
> > + default_can_special_div_by_const)
> > +
> >  /* Return true if the target supports misaligned store/load of a
> >     specific factor denoted in the third parameter.  The last parameter
> >     is true if the access is defined in a packed struct.  */ diff
> > --git a/gcc/target.h b/gcc/target.h index
> >
> d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56
> > f39c061f68b665 100644
> > --- a/gcc/target.h
> > +++ b/gcc/target.h
> > @@ -51,6 +51,7 @@
> >  #include "insn-codes.h"
> >  #include "tm.h"
> >  #include "hard-reg-set.h"
> > +#include "tree-core.h"
> >
> >  #if CHECKING_P
> >
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> >
> ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e
> > 2640d63f936b336d 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> > (addr_space_t, location_t);  extern rtx default_addr_space_convert
> > (rtx, tree, tree);  extern unsigned int default_case_values_threshold
> > (void); extern bool default_have_conditional_execution (void);
> > +extern bool default_can_special_div_by_const (enum tree_code, tree,
> > tree, tree,
> > +					      rtx *, rtx, rtx);
> >
> >  extern bool default_libc_has_function (enum function_class, tree);
> > extern bool default_libc_has_fast_function (int fcode); diff --git
> > a/gcc/targhooks.cc b/gcc/targhooks.cc index
> >
> b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241
> > 279936ced41ee95 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
> >    return HAVE_conditional_execution;
> >  }
> >
> > +/* Default that no division by constant operations are special.  */
> > +bool default_can_special_div_by_const (enum tree_code, tree, tree,
> > +tree, rtx *, rtx,
> > +				  rtx)
> > +{
> > +  return false;
> > +}
> > +
> >  /* By default we assume that c99 functions are present at the runtime,
> >     but sincos is not.  */
> >  bool
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3
> > d7b4d5b64a19b9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint8_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3
> > db75b3e4112e2cc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint16_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720
> > 157701d9d1cf852
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target
> > +aarch64*-*-* } } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1
> > 832f28ebd07993e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 65
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N / 2, N);
> > +  fun2 (b, N / 2, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> > b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44a
> > b211cd246d82d5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> > @@ -0,0 +1,61 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +#include <stdint.h>
> > +
> > +#pragma GCC target "+nosve"
> > +
> > +/*
> > +** draw_bitmap1:
> > +** ...
> > +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> > +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> > +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> > +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> > +** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > +** ...
> > +*/
> > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xfe; }
> > +
> > +/*
> > +** draw_bitmap3:
> > +** ...
> > +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> > +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> > +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> > +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> > +** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> > +** ...
> > +*/
> > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +/*
> > +** draw_bitmap4:
> > +** ...
> > +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> > +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> > +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> > +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> > +** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > +** ...
> > +*/
> > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> >
> 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c
> > 9a12046b6ec94f3 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -1237,6 +1237,14 @@ expand_vector_operation
> (gimple_stmt_iterator
> > *gsi, tree type, tree compute_type
> >  	  tree rhs2 = gimple_assign_rhs2 (assign);
> >  	  tree ret;
> >
> > +	  /* Check if the target was going to handle it through the special
> > +	     division callback hook.  */
> > +	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
> > +							  rhs2, NULL,
> > +							  NULL_RTX,
> > NULL_RTX))
> > +	    return NULL_TREE;
> > +
> > +
> >  	  if (!optimize
> >  	      || !VECTOR_INTEGER_TYPE_P (type)
> >  	      || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> > patterns.cc b/gcc/tree-vect-patterns.cc index
> >
> 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af
> > 0b1bfea10fe443 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >
> >        return pattern_stmt;
> >      }
> > +  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
> > +						       oprnd0, oprnd1, NULL,
> > +						       NULL_RTX, NULL_RTX))
> > +    {
> > +      return NULL;
> > +    }
> >
> >    if (prec > HOST_BITS_PER_WIDE_INT
> >        || integer_zerop (oprnd1))
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> >
> c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd6
> > 8e0e1c1e93faafe 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
> >  	}
> >        target_support_p = (optab_handler (optab, vec_mode)
> >  			  != CODE_FOR_nothing);
> > +      if (!target_support_p)
> > +	target_support_p
> > +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> > +							op0, op1, NULL,
> > +							NULL_RTX,
> > NULL_RTX);
> >      }
> >
> >    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> >
> >
> >
> >
> > --

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask division
  2022-10-31 11:34   ` Tamar Christina
@ 2022-11-09  8:33     ` Tamar Christina
  0 siblings, 0 replies; 35+ messages in thread
From: Tamar Christina @ 2022-11-09  8:33 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov,
	Richard Sandiford

ping

> -----Original Message-----
> From: Tamar Christina
> Sent: Monday, October 31, 2022 11:35 AM
> To: Tamar Christina <tamar.christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: RE: [PATCH 3/4]AArch64 Add SVE2 implementation for pow2
> bitmask division
> 
> Ping
> 
> > -----Original Message-----
> > From: Tamar Christina <tamar.christina@arm.com>
> > Sent: Friday, September 23, 2022 10:34 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> > <Richard.Sandiford@arm.com>
> > Subject: [PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask
> > division
> >
> > Hi All,
> >
> > In plenty of image and video processing code it's common to modify
> > pixel values by a widening operation and then scale them back into
> > range by dividing by 255.
> >
> > This patch adds an named function to allow us to emit an optimized
> > sequence when doing an unsigned division that is equivalent to:
> >
> >    x = y / (2 ^ (bitsize (y)/2)-1)
> >
> > For SVE2 this means we generate for:
> >
> > void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> >   for (int i = 0; i < (n & -16); i+=1)
> >     pixel[i] = (pixel[i] * level) / 0xff; }
> >
> > the following:
> >
> >         mov     z3.b, #1
> > .L3:
> >         ld1b    z0.h, p0/z, [x0, x3]
> >         mul     z0.h, p1/m, z0.h, z2.h
> >         addhnb  z1.b, z0.h, z3.h
> >         addhnb  z0.b, z0.h, z1.h
> >         st1b    z0.h, p0, [x0, x3]
> >         inch    x3
> >         whilelo p0.h, w3, w2
> >         b.any   .L3
> >
> > instead of:
> >
> > .L3:
> >         ld1b    z0.h, p1/z, [x0, x3]
> >         mul     z0.h, p0/m, z0.h, z1.h
> >         umulh   z0.h, p0/m, z0.h, z2.h
> >         lsr     z0.h, z0.h, #7
> >         st1b    z0.h, p1, [x0, x3]
> >         inch    x3
> >         whilelo p1.h, w3, w2
> >         b.any   .L3
> >
> > Which results in significantly faster code.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-sve2.md
> > (@aarch64_bitmask_udiv<mode>3): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-sve2.md
> > b/gcc/config/aarch64/aarch64-sve2.md
> > index
> >
> f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234
> > a1023a6eba0d1 100644
> > --- a/gcc/config/aarch64/aarch64-sve2.md
> > +++ b/gcc/config/aarch64/aarch64-sve2.md
> > @@ -71,6 +71,7 @@
> >  ;; ---- [INT] Reciprocal approximation  ;; ---- [INT<-FP] Base-2
> > logarithm  ;; ---- [INT] Polynomial multiplication
> > +;; ---- [INT] Misc optab implementations
> >  ;;
> >  ;; == Permutation
> >  ;; ---- [INT,FP] General permutes
> > @@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
> >    "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
> >  )
> >
> > +;;
> > +---------------------------------------------------------------------
> > +--
> > +-- ;; ---- [INT] Misc optab implementations ;;
> > +---------------------------------------------------------------------
> > +--
> > +--
> > +;; Includes:
> > +;; - aarch64_bitmask_udiv
> > +;;
> > +---------------------------------------------------------------------
> > +--
> > +--
> > +
> > +;; div optimizations using narrowings ;; we can do the division e.g.
> > +shorts by 255 faster by calculating it as ;; (x + ((x + 257) >> 8))
> > +>> 8 assuming the operation is done in ;; double the precision of x.
> > +;;
> > +;; See aarch64-simd.md for bigger explanation.
> > +(define_expand "@aarch64_bitmask_udiv<mode>3"
> > +  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
> > +   (match_operand:SVE_FULL_HSDI 1 "register_operand")
> > +   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
> > +  "TARGET_SVE2"
> > +{
> > +  unsigned HOST_WIDE_INT size
> > +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
> > +  if (!CONST_VECTOR_P (operands[2])
> > +      || const_vector_encoded_nelts (operands[2]) != 1
> > +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> > +    FAIL;
> > +
> > +  rtx addend = gen_reg_rtx (<MODE>mode);
> > +  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
> > +  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
> > +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
> > +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val,
> > +<VNARROW>mode));
> > +  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1,
> > operands[1],
> > +			      addend));
> > +  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2,
> > operands[1],
> > +			      lowpart_subreg (<MODE>mode, tmp1,
> > +					      <VNARROW>mode)));
> > +  emit_move_insn (operands[0],
> > +		  lowpart_subreg (<MODE>mode, tmp2,
> > <VNARROW>mode));
> > +  DONE;
> > +})
> > +
> >  ;;
> >
> ==========================================================
> > ===============
> >  ;; == Permutation
> >  ;;
> >
> ==========================================================
> > ===============
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c
> > 0bb0d204cda6d9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> > @@ -0,0 +1,53 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O2 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +#include <stdint.h>
> > +
> > +/*
> > +** draw_bitmap1:
> > +** ...
> > +**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
> > +**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
> > +**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
> > +** ...
> > +*/
> > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xfe; }
> > +
> > +/*
> > +** draw_bitmap3:
> > +** ...
> > +**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> > +**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
> > +**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
> > +** ...
> > +*/
> > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +/*
> > +** draw_bitmap4:
> > +** ...
> > +**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > +**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
> > +**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
> > +** ...
> > +*/
> > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> >
> >
> >
> >
> > --

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT
  2022-10-31 11:34   ` Tamar Christina
@ 2022-11-09  8:33     ` Tamar Christina
  0 siblings, 0 replies; 35+ messages in thread
From: Tamar Christina @ 2022-11-09  8:33 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Earnshaw, nd, Richard Sandiford, Marcus Shawcroft

ping

> -----Original Message-----
> From: Tamar Christina
> Sent: Monday, October 31, 2022 11:35 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Richard Sandiford <Richard.Sandiford@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: RE: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB +
> NARROWB to NARROWB + NARROWT
> 
> Ping
> 
> > -----Original Message-----
> > From: Gcc-patches <gcc-patches-
> > bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Tamar
> > Christina via Gcc-patches
> > Sent: Friday, September 23, 2022 10:34 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> > Richard Sandiford <Richard.Sandiford@arm.com>; Marcus Shawcroft
> > <Marcus.Shawcroft@arm.com>
> > Subject: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB
> to
> > NARROWB + NARROWT
> >
> > Hi All,
> >
> > This adds an RTL pattern for when two NARROWB instructions are being
> > combined with a PACK.  The second NARROWB is then transformed into a
> > NARROWT.
> >
> > For the example:
> >
> > void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> >   for (int i = 0; i < (n & -16); i+=1)
> >     pixel[i] += (pixel[i] * level) / 0xff; }
> >
> > we generate:
> >
> >         addhnb  z6.b, z0.h, z4.h
> >         addhnb  z5.b, z1.h, z4.h
> >         addhnb  z0.b, z0.h, z6.h
> >         addhnt  z0.b, z1.h, z5.h
> >         add     z0.b, z0.b, z2.b
> >
> > instead of:
> >
> >         addhnb  z6.b, z1.h, z4.h
> >         addhnb  z5.b, z0.h, z4.h
> >         addhnb  z1.b, z1.h, z6.h
> >         addhnb  z0.b, z0.h, z5.h
> >         uzp1    z0.b, z0.b, z1.b
> >         add     z0.b, z0.b, z2.b
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-sve2.md
> > (*aarch64_sve_pack_<sve_int_op><mode>):
> > 	New.
> > 	* config/aarch64/iterators.md (binary_top): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
> > 	* gcc.target/aarch64/sve2/div-by-bitmask_2.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-sve2.md
> > b/gcc/config/aarch64/aarch64-sve2.md
> > index
> >
> ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98a
> > d3c41e5d05d8cf38 100644
> > --- a/gcc/config/aarch64/aarch64-sve2.md
> > +++ b/gcc/config/aarch64/aarch64-sve2.md
> > @@ -1600,6 +1600,25 @@ (define_insn
> > "@aarch64_sve_<sve_int_op><mode>"
> >    "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
> >  )
> >
> > +(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
> > +  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
> > +	(unspec:<VNARROW>
> > +	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
> > +	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
> > +	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> > +	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
> > +	     SVE2_INT_BINARY_NARROWB) 0)]
> > +	  UNSPEC_PACK))]
> > +  "TARGET_SVE2"
> > +  "#"
> > +  "&& true"
> > +  [(const_int 0)]
> > +{
> > +  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1],
> > <MODE>mode);
> > +  emit_insn (gen_aarch64_sve
> > (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
> > +			      operands[0], tmp, operands[2], operands[3]));
> > +})
> > +
> >  ;;
> > ----------------------------------------------------------------------
> > ---
> >  ;; ---- [INT] Narrowing right shifts
> >  ;;
> > ----------------------------------------------------------------------
> > --- diff --git a/gcc/config/aarch64/iterators.md
> > b/gcc/config/aarch64/iterators.md index
> >
> 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82af
> > a67613a087389d4 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a")
> > (UNSPEC_BRKB "b")
> >
> >  (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT
> > "pnext")])
> >
> > +(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
> > +			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
> > +			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
> > +			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
> > +
> >  (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
> >  			     (UNSPEC_ADCLT "adclt")
> >  			     (UNSPEC_ADDHNB "addhnb")
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15
> > c82dd9726897cfd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target
> > +aarch64*-*-* } } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> > b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdb
> > ced36c8038db1b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> > @@ -0,0 +1,56 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O2 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +#include <stdint.h>
> > +
> > +/*
> > +** draw_bitmap1:
> > +** ...
> > +** 	addhnb	z6.b, z0.h, z4.h
> > +** 	addhnb	z5.b, z1.h, z4.h
> > +** 	addhnb	z0.b, z0.h, z6.h
> > +** 	addhnt	z0.b, z1.h, z5.h
> > +** ...
> > +*/
> > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] += (pixel[i] * level) / 0xff; }
> > +
> > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] += (pixel[i] * level) / 0xfe; }
> > +
> > +/*
> > +** draw_bitmap3:
> > +** ...
> > +** 	addhnb	z6.h, z0.s, z4.s
> > +** 	addhnb	z5.h, z1.s, z4.s
> > +** 	addhnb	z0.h, z0.s, z6.s
> > +** 	addhnt	z0.h, z1.s, z5.s
> > +** ...
> > +*/
> > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] += (pixel[i] * level) / 0xffffU; }
> > +
> > +/*
> > +** draw_bitmap4:
> > +** ...
> > +** 	addhnb	z6.s, z0.d, z4.d
> > +** 	addhnb	z5.s, z1.d, z4.d
> > +** 	addhnb	z0.s, z0.d, z6.d
> > +** 	addhnt	z0.s, z1.d, z5.d
> > +** ...
> > +*/
> > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> >
> >
> >
> >
> > --

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization.
  2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
                   ` (6 preceding siblings ...)
  2022-09-26 10:39 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Richard Biener
@ 2022-11-09 10:37 ` Kyrylo Tkachov
  7 siblings, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2022-11-09 10:37 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches; +Cc: nd, rguenther

Hi Tamar,

> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+kyrylo.tkachov=arm.com@gcc.gnu.org> On Behalf Of Tamar
> Christina via Gcc-patches
> Sent: Friday, September 23, 2022 10:33 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; rguenther@suse.de
> Subject: [PATCH 1/4]middle-end Support not decomposing specific divisions
> during vectorization.
> 
> Hi All,
> 
> In plenty of image and video processing code it's common to modify pixel
> values
> by a widening operation and then scale them back into range by dividing by
> 255.
> 
> e.g.:
> 
> >    x = y / (2 ^ (bitsize (y)/2)-1)
> 
> This patch adds a new target hook can_special_div_by_const, similar to
> can_vec_perm which can be called to check if a target will handle a particular
> division in a special way in the back-end.
> 
> The vectorizer will then vectorize the division using the standard tree code
> and at expansion time the hook is called again to generate the code for the
> division.
> 
> A lot of the changes in the patch are to pass down the tree operands in all
> paths
> that can lead to the divmod expansion so that the target hook always has the
> type of the expression you're expanding since the types can change the
> expansion.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* expmed.h (expand_divmod): Pass tree operands down in addition
> to RTX.
> 	* expmed.cc (expand_divmod): Likewise.
> 	* explow.cc (round_push, align_dynamic_address): Likewise.
> 	* expr.cc (force_operand, expand_expr_divmod): Likewise.
> 	* optabs.cc (expand_doubleword_mod,
> expand_doubleword_divmod):
> 	Likewise.
> 	* target.h: Include tree-core.
> 	* target.def (can_special_div_by_const): New.
> 	* targhooks.cc (default_can_special_div_by_const): New.
> 	* targhooks.h (default_can_special_div_by_const): New.
> 	* tree-vect-generic.cc (expand_vector_operation): Use it.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Check for
> support.
> 	* tree-vect-stmts.cc (vectorizable_operation): Likewise.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/vect/vect-div-bitmask-1.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-2.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-3.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask.h: New file.
> 
> --- inline copy of patch --
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index
> 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d
> 244a2a23e76cac097 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the
> hook to handle these two
>  implementation approaches itself.
>  @end deftypefn
> 
> +@deftypefn {Target Hook} bool
> TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code},
> tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx
> *@var{output}, rtx @var{in0}, rtx @var{in1})
> +This hook is used to test whether the target has a special method of
> +division of vectors of type @var{vectype} using the two operands
> @code{treeop0},
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The
> division
> +will then not be decomposed by the vectorizer but will be kept as a division.

I think the grammar here is wonky, can you reword this sentence please?
(I was just reading this patch to understand the optab semantics futher in the series)
Thanks,
Kyrill

> +
> +When the hook is being used to test whether the target supports a special
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook
> +is being used to emit a division, @var{in0} and @var{in1} are the source
> +vectors of type @var{vectype} and @var{output} is the destination vector
> of
> +type @var{vectype}.
> +
> +Return true if the operation is possible, emitting instructions for it
> +if rtxes are provided and updating @var{output}.
> +@end deftypefn
> +
>  @deftypefn {Target Hook} tree
> TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
>  This hook should return the decl of a function that implements the
>  vectorized variant of the function with the @code{combined_fn} code
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index
> 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b
> 04076d058c24ce093 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent strategy
> can generate better code.
> 
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> 
> +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> 
>  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> diff --git a/gcc/explow.cc b/gcc/explow.cc
> index
> ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae67
> 8f5e346bf34ba0036 100644
> --- a/gcc/explow.cc
> +++ b/gcc/explow.cc
> @@ -1037,7 +1037,7 @@ round_push (rtx size)
>       TRUNC_DIV_EXPR.  */
>    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
>  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> align_rtx,
>  			NULL_RTX, 1);
>    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> 
> @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> required_align)
>  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
>  				       Pmode),
>  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
>  			  gen_int_mode (required_align / BITS_PER_UNIT,
>  					Pmode),
>  			  NULL_RTX, 1);
> diff --git a/gcc/expmed.h b/gcc/expmed.h
> index
> 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501
> c6f33cb3595659b5 100644
> --- a/gcc/expmed.h
> +++ b/gcc/expmed.h
> @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> machine_mode,
>  extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64,
> rtx,
>  			 int);
>  #ifdef GCC_OPTABS_H
> -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> -			  rtx, int, enum optab_methods =
> OPTAB_LIB_WIDEN);
> +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
> +			  rtx, rtx, rtx, int,
> +			  enum optab_methods = OPTAB_LIB_WIDEN);
>  #endif
>  #endif
> 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index
> 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb
> 0990db8b97d3af414 100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> op0, HOST_WIDE_INT d)
> 
>  rtx
>  expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
> -	       rtx op0, rtx op1, rtx target, int unsignedp,
> -	       enum optab_methods methods)
> +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> +	       int unsignedp, enum optab_methods methods)
>  {
>    machine_mode compute_mode;
>    rtx tquotient;
> @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
> 
>    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> 
> +  /* Check if the target has specific expansions for the division.  */
> +  if (treeop0
> +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> (treeop0),
> +						     treeop0, treeop1,
> +						     &target, op0, op1))
> +    return target;
> +
> +
>    /* Now convert to the best mode to use.  */
>    if (compute_mode != mode)
>      {
> @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  			    || (optab_handler (sdivmod_optab, int_mode)
>  				!= CODE_FOR_nothing)))
>  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> -						int_mode, op0,
> -						gen_int_mode (abs_d,
> +						int_mode, treeop0, treeop1,
> +						op0, gen_int_mode (abs_d,
>  							      int_mode),
>  						NULL_RTX, 0);
>  		    else
> @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code
> code, machine_mode mode,
>  				      size - 1, NULL_RTX, 0);
>  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
>  				    NULL_RTX);
> -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> op1,
> -				    NULL_RTX, 0);
> +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> treeop0,
> +				    treeop1, t3, op1, NULL_RTX, 0);
>  		if (t4)
>  		  {
>  		    rtx t5;
> diff --git a/gcc/expr.cc b/gcc/expr.cc
> index
> 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd
> 96a8abc055fa34d9 100644
> --- a/gcc/expr.cc
> +++ b/gcc/expr.cc
> @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
>  	    return expand_divmod (0,
>  				  FLOAT_MODE_P (GET_MODE (value))
>  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> -				  GET_MODE (value), op1, op2, target, 0);
> +				  GET_MODE (value), NULL, NULL, op1, op2,
> +				  target, 0);
>  	case MOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 0);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 0);
>  	case UDIV:
> -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case UMOD:
> -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> op1, op2,
> -				target, 1);
> +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> NULL, NULL,
> +				op1, op2, target, 1);
>  	case ASHIFTRT:
>  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
>  				      target, 0, OPTAB_LIB_WIDEN);
> @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> machine_mode mode, tree treeop0,
>        bool speed_p = optimize_insn_for_speed_p ();
>        do_pending_stack_adjust ();
>        start_sequence ();
> -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
> +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 1);
>        rtx_insn *uns_insns = get_insns ();
>        end_sequence ();
>        start_sequence ();
> -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
> +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +				   op0, op1, target, 0);
>        rtx_insn *sgn_insns = get_insns ();
>        end_sequence ();
>        unsigned uns_cost = seq_cost (uns_insns, speed_p);
> @@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code,
> machine_mode mode, tree treeop0,
>        emit_insn (sgn_insns);
>        return sgn_ret;
>      }
> -  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
> +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> +			op0, op1, target, unsignedp);
>  }
> 
>  rtx
> diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> index
> 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abf
> d872f340855dc96 100644
> --- a/gcc/optabs.cc
> +++ b/gcc/optabs.cc
> @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode,
> rtx op0, rtx op1, bool unsignedp)
>  		return NULL_RTX;
>  	    }
>  	}
> -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> sum,
> -				     gen_int_mode (INTVAL (op1),
> word_mode),
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> NULL, NULL,
> +				     sum, gen_int_mode (INTVAL (op1),
> +							word_mode),
>  				     NULL_RTX, 1, OPTAB_DIRECT);
>        if (remainder == NULL_RTX)
>  	return NULL_RTX;
> @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
> 
>    if (op11 != const1_rtx)
>      {
> -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> quot1,
> +				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> mode, rtx op0, rtx op1, rtx *rem,
>        if (rem2 == NULL_RTX)
>  	return NULL_RTX;
> 
> -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> quot1,
> +				 op11, NULL_RTX, unsignedp,
> OPTAB_DIRECT);
>        if (quot2 == NULL_RTX)
>  	return NULL_RTX;
> 
> diff --git a/gcc/target.def b/gcc/target.def
> index
> 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b0
> 7081cdd70113db9b1 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1902,6 +1902,25 @@ implementation approaches itself.",
>  	const vec_perm_indices &sel),
>   NULL)
> 
> +DEFHOOK
> +(can_special_div_by_const,
> + "This hook is used to test whether the target has a special method of\n\
> +division of vectors of type @var{vectype} using the two operands
> @code{treeop0},\n\
> +and @code{treeop1} and producing a vector of type @var{vectype}.  The
> division\n\
> +will then not be decomposed by the and kept as a div.\n\
> +\n\
> +When the hook is being used to test whether the target supports a
> special\n\
> +divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the
> hook\n\
> +is being used to emit a division, @var{in0} and @var{in1} are the source\n\
> +vectors of type @var{vecttype} and @var{output} is the destination vector
> of\n\
> +type @var{vectype}.\n\
> +\n\
> +Return true if the operation is possible, emitting instructions for it\n\
> +if rtxes are provided and updating @var{output}.",
> + bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx
> *output,
> +	rtx in0, rtx in1),
> + default_can_special_div_by_const)
> +
>  /* Return true if the target supports misaligned store/load of a
>     specific factor denoted in the third parameter.  The last parameter
>     is true if the access is defined in a packed struct.  */
> diff --git a/gcc/target.h b/gcc/target.h
> index
> d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da
> 56f39c061f68b665 100644
> --- a/gcc/target.h
> +++ b/gcc/target.h
> @@ -51,6 +51,7 @@
>  #include "insn-codes.h"
>  #include "tm.h"
>  #include "hard-reg-set.h"
> +#include "tree-core.h"
> 
>  #if CHECKING_P
> 
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index
> ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e
> 2640d63f936b336d 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> (addr_space_t, location_t);
>  extern rtx default_addr_space_convert (rtx, tree, tree);
>  extern unsigned int default_case_values_threshold (void);
>  extern bool default_have_conditional_execution (void);
> +extern bool default_can_special_div_by_const (enum tree_code, tree, tree,
> tree,
> +					      rtx *, rtx, rtx);
> 
>  extern bool default_libc_has_function (enum function_class, tree);
>  extern bool default_libc_has_fast_function (int fcode);
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index
> b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba2
> 41279936ced41ee95 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
>    return HAVE_conditional_execution;
>  }
> 
> +/* Default that no division by constant operations are special.  */
> +bool
> +default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *,
> rtx,
> +				  rtx)
> +{
> +  return false;
> +}
> +
>  /* By default we assume that c99 functions are present at the runtime,
>     but sincos is not.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b491
> 6f3d7b4d5b64a19b9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected"
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..e904a71885b2e8487593a2c
> d3db75b3e4112e2cc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint16_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected"
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e7
> 20157701d9d1cf852
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected"
> "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bf
> d1832f28ebd07993e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> @@ -0,0 +1,43 @@
> +#include <stdio.h>
> +
> +#ifndef N
> +#define N 65
> +#endif
> +
> +#ifndef TYPE
> +#define TYPE uint32_t
> +#endif
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
> index
> 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817
> c9a12046b6ec94f3 100644
> --- a/gcc/tree-vect-generic.cc
> +++ b/gcc/tree-vect-generic.cc
> @@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator
> *gsi, tree type, tree compute_type
>  	  tree rhs2 = gimple_assign_rhs2 (assign);
>  	  tree ret;
> 
> +	  /* Check if the target was going to handle it through the special
> +	     division callback hook.  */
> +	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
> +							  rhs2, NULL,
> +							  NULL_RTX,
> NULL_RTX))
> +	    return NULL_TREE;
> +
> +
>  	  if (!optimize
>  	      || !VECTOR_INTEGER_TYPE_P (type)
>  	      || TREE_CODE (rhs2) != VECTOR_CST
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index
> 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85
> af0b1bfea10fe443 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> 
>        return pattern_stmt;
>      }
> +  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
> +						       oprnd0, oprnd1, NULL,
> +						       NULL_RTX, NULL_RTX))
> +    {
> +      return NULL;
> +    }
> 
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index
> c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288b
> d68e0e1c1e93faafe 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        target_support_p = (optab_handler (optab, vec_mode)
>  			  != CODE_FOR_nothing);
> +      if (!target_support_p)
> +	target_support_p
> +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> +							op0, op1, NULL,
> +							NULL_RTX,
> NULL_RTX);
>      }
> 
>    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> 
> 
> 
> 
> --

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division.
  2022-10-31 11:34   ` Tamar Christina
  2022-11-09  8:33     ` Tamar Christina
@ 2022-11-09 16:02     ` Kyrylo Tkachov
  1 sibling, 0 replies; 35+ messages in thread
From: Kyrylo Tkachov @ 2022-11-09 16:02 UTC (permalink / raw)
  To: Tamar Christina, gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Richard Sandiford

Hi Tamar,

> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Monday, October 31, 2022 11:35 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: RE: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask
> division.
> 
> Hi All,
> 
> Ping, and updated patch based on mid-end changes.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* config/aarch64/aarch64-simd.md
> (@aarch64_bitmask_udiv<mode>3): New.
> 	* config/aarch64/aarch64.cc
> (aarch64_vectorize_can_special_div_by_constant): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/div-by-bitmask.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f
> 0ba6386c1ab50f77e 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4831,6 +4831,65 @@ (define_expand
> "aarch64_<sur><addsub>hn2<mode>"
>    }
>  )

Some editorial comments.

> 
> +;; div optimizations using narrowings
> +;; we can do the division e.g. shorts by 255 faster by calculating it as
> +;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> +;; double the precision of x.
> +;;
> +;; If we imagine a short as being composed of two blocks of bytes then
> +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to

Typo "equivalent"

> +;; adding 1 to each sub component:
> +;;
> +;;      short value of 16-bits
> +;; ┌──────────────┬────────────────┐
> +;; │              │                │
> +;; └──────────────┴────────────────┘
> +;;   8-bit part1 ▲  8-bit part2   ▲
> +;;               │                │
> +;;               │                │
> +;;              +1               +1
> +;;
> +;; after the first addition, we have to shift right by 8, and narrow the
> +;; results back to a byte.  Remember that the addition must be done in
> +;; double the precision of the input.  Since 8 is half the size of a short
> +;; we can use a narrowing halfing instruction in AArch64, addhn which also
> +;; does the addition in a wider precision and narrows back to a byte.  The
> +;; shift itself is implicit in the operation as it writes back only the top
> +;; half of the result. i.e. bits 2*esize-1:esize.
> +;;
> +;; Since we have narrowed the result of the first part back to a byte, for
> +;; the second addition we can use a widening addition, uaddw.
> +;;
> +;; For the finaly shift, since it's unsigned arithmatic we emit an ushr by 8

"final shift", "unsigned arithmetic"

> +;; to shift and the vectorizer.

Incomplete sentence?

> +;;
> +;; The shift is later optimized by combine to a uzp2 with movi #0.
> +(define_expand "@aarch64_bitmask_udiv<mode>3"
> +  [(match_operand:VQN 0 "register_operand")
> +   (match_operand:VQN 1 "register_operand")
> +   (match_operand:VQN 2 "immediate_operand")]
> +  "TARGET_SIMD"
> +{
> +  unsigned HOST_WIDE_INT size
> +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> +  if (!CONST_VECTOR_P (operands[2])
> +      || const_vector_encoded_nelts (operands[2]) != 1
> +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> +    FAIL;
> +
> +  rtx addend = gen_reg_rtx (<MODE>mode);
> +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
> +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val,
> <VNARROWQ2>mode));
> +  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> +  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> +  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> +  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> bitsize);
> +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
> +  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2,
> shift_vector));
> +  DONE;
> +})

Does all this work for big-endian too? I think it does, but wonder whether you've tested.

Ok if so, with the comments addressed.
Thanks,
Kyrill

> +
>  ;; pmul.
> 
>  (define_insn "aarch64_pmul<mode>"
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index
> 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..d3c3650d7d728f56adb651
> 54127dc7b72386c5a7 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24146,6 +24146,40 @@ aarch64_vectorize_vec_perm_const
> (machine_mode vmode, machine_mode op_mode,
>    return ret;
>  }
> 
> +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> +
> +bool
> +aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> +					       tree vectype, wide_int cst,
> +					       rtx *output, rtx in0, rtx in1)
> +{
> +  if (code != TRUNC_DIV_EXPR
> +      || !TYPE_UNSIGNED (vectype))
> +    return false;
> +
> +  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE
> (vectype));
> +  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> +    return false;
> +
> +  if (in0 == NULL_RTX && in1 == NULL_RTX)
> +    {
> +      wide_int val = wi::add (cst, 1);
> +      int pow = wi::exact_log2 (val);
> +      return pow == (int)(element_precision (vectype) / 2);
> +    }
> +
> +  if (!VECTOR_TYPE_P (vectype))
> +   return false;
> +
> +  gcc_assert (output);
> +
> +  if (!*output)
> +    *output = gen_reg_rtx (TYPE_MODE (vectype));
> +
> +  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output,
> in0, in1));
> +  return true;
> +}
> +
>  /* Generate a byte permute mask for a register of mode MODE,
>     which has NUNITS units.  */
> 
> @@ -27606,6 +27640,10 @@ aarch64_libgcc_floating_mode_supported_p
>  #undef TARGET_VECTOR_ALIGNMENT
>  #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
> 
> +#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> +#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
> +  aarch64_vectorize_can_special_div_by_constant
> +
>  #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
>  #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
>    aarch64_vectorize_preferred_vector_alignment
> diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf
> 44ab211cd246d82d5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <stdint.h>
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> +** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xfe;
> +}
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> +** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> +** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> 
> > -----Original Message-----
> > From: Tamar Christina <tamar.christina@arm.com>
> > Sent: Friday, September 23, 2022 10:34 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> > <Richard.Sandiford@arm.com>
> > Subject: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask
> division.
> >
> > Hi All,
> >
> > This adds an implementation for the new optab for unsigned pow2 bitmask
> > for AArch64.
> >
> > The implementation rewrites:
> >
> >    x = y / (2 ^ (sizeof (y)/2)-1
> >
> > into e.g. (for bytes)
> >
> >    (x + ((x + 257) >> 8)) >> 8
> >
> > where it's required that the additions be done in double the precision of x
> > such that we don't lose any bits during an overflow.
> >
> > Essentially the sequence decomposes the division into doing two smaller
> > divisions, one for the top and bottom parts of the number and adding the
> > results back together.
> >
> > To account for the fact that shift by 8 would be division by 256 we add 1 to
> > both parts of x such that when 255 we still get 1 as the answer.
> >
> > Because the amounts we shift by are half the original datatype we can use
> > the halving instructions the ISA provides to do the operation instead of
> > using actual shifts.
> >
> > For AArch64 this means we generate for:
> >
> > void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> >   for (int i = 0; i < (n & -16); i+=1)
> >     pixel[i] = (pixel[i] * level) / 0xff; }
> >
> > the following:
> >
> > 	movi    v3.16b, 0x1
> > 	umull2  v1.8h, v0.16b, v2.16b
> > 	umull   v0.8h, v0.8b, v2.8b
> > 	addhn   v5.8b, v1.8h, v3.8h
> > 	addhn   v4.8b, v0.8h, v3.8h
> > 	uaddw   v1.8h, v1.8h, v5.8b
> > 	uaddw   v0.8h, v0.8h, v4.8b
> > 	uzp2    v0.16b, v0.16b, v1.16b
> >
> > instead of:
> >
> > 	umull   v2.8h, v1.8b, v5.8b
> > 	umull2  v1.8h, v1.16b, v5.16b
> > 	umull   v0.4s, v2.4h, v3.4h
> > 	umull2  v2.4s, v2.8h, v3.8h
> > 	umull   v4.4s, v1.4h, v3.4h
> > 	umull2  v1.4s, v1.8h, v3.8h
> > 	uzp2    v0.8h, v0.8h, v2.8h
> > 	uzp2    v1.8h, v4.8h, v1.8h
> > 	shrn    v0.8b, v0.8h, 7
> > 	shrn2   v0.16b, v1.8h, 7
> >
> > Which results in significantly faster code.
> >
> > Thanks to Wilco for the concept.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* config/aarch64/aarch64-simd.md
> > (@aarch64_bitmask_udiv<mode>3): New.
> > 	* config/aarch64/aarch64.cc
> > (aarch64_vectorize_can_special_div_by_constant): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/div-by-bitmask.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index
> >
> 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f
> 0b
> > a6386c1ab50f77e 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -4831,6 +4831,65 @@ (define_expand
> > "aarch64_<sur><addsub>hn2<mode>"
> >    }
> >  )
> >
> > +;; div optimizations using narrowings
> > +;; we can do the division e.g. shorts by 255 faster by calculating it
> > +as ;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in ;;
> > +double the precision of x.
> > +;;
> > +;; If we imagine a short as being composed of two blocks of bytes then
> > +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to ;;
> > +adding 1 to each sub component:
> > +;;
> > +;;      short value of 16-bits
> > +;; ┌──────────────┬────────────────┐
> > +;; │              │                │
> > +;; └──────────────┴────────────────┘
> > +;;   8-bit part1 ▲  8-bit part2   ▲
> > +;;               │                │
> > +;;               │                │
> > +;;              +1               +1
> > +;;
> > +;; after the first addition, we have to shift right by 8, and narrow
> > +the ;; results back to a byte.  Remember that the addition must be done
> > +in ;; double the precision of the input.  Since 8 is half the size of a
> > +short ;; we can use a narrowing halfing instruction in AArch64, addhn
> > +which also ;; does the addition in a wider precision and narrows back
> > +to a byte.  The ;; shift itself is implicit in the operation as it
> > +writes back only the top ;; half of the result. i.e. bits 2*esize-1:esize.
> > +;;
> > +;; Since we have narrowed the result of the first part back to a byte,
> > +for ;; the second addition we can use a widening addition, uaddw.
> > +;;
> > +;; For the finaly shift, since it's unsigned arithmatic we emit an ushr
> > +by 8 ;; to shift and the vectorizer.
> > +;;
> > +;; The shift is later optimized by combine to a uzp2 with movi #0.
> > +(define_expand "@aarch64_bitmask_udiv<mode>3"
> > +  [(match_operand:VQN 0 "register_operand")
> > +   (match_operand:VQN 1 "register_operand")
> > +   (match_operand:VQN 2 "immediate_operand")]
> > +  "TARGET_SIMD"
> > +{
> > +  unsigned HOST_WIDE_INT size
> > +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> > +  if (!CONST_VECTOR_P (operands[2])
> > +      || const_vector_encoded_nelts (operands[2]) != 1
> > +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> > +    FAIL;
> > +
> > +  rtx addend = gen_reg_rtx (<MODE>mode);
> > +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode,
> 1);
> > +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val,
> > +<VNARROWQ2>mode));
> > +  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> > +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> > +  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> > +  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> > +  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> > +bitsize);
> > +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1],
> tmp1));
> > +  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2,
> > +shift_vector));
> > +  DONE;
> > +})
> > +
> >  ;; pmul.
> >
> >  (define_insn "aarch64_pmul<mode>"
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index
> >
> 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc
> 3
> > 7484b6fc6901bfb4 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const
> > (machine_mode vmode, machine_mode op_mode,
> >    return ret;
> >  }
> >
> > +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> > +
> > +bool
> > +aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> > +					       tree vectype,
> > +					       tree treeop0, tree treeop1,
> > +					       rtx *output, rtx in0, rtx in1) {
> > +
> > +  if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX))
> > +    return false;
> > +
> > +  tree cst = uniform_integer_cst_p (treeop1);  tree type;  if (code !=
> > + TRUNC_DIV_EXPR
> > +      || !cst
> > +      || !TYPE_UNSIGNED ((type = TREE_TYPE (cst)))
> > +      || tree_int_cst_sgn (cst) != 1)
> > +    return false;
> > +
> > +  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE
> > + (vectype));  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> > +    return false;
> > +
> > +  if (in0 == NULL_RTX && in1 == NULL_RTX)
> > +    {
> > +      gcc_assert (treeop0 && treeop1);
> > +      wide_int icst = wi::to_wide (cst);
> > +      wide_int val = wi::add (icst, 1);
> > +      int pow = wi::exact_log2 (val);
> > +      return pow == (TYPE_PRECISION (type) / 2);
> > +    }
> > +
> > +  if (!VECTOR_TYPE_P (vectype))
> > +   return false;
> > +
> > +  gcc_assert (output);
> > +
> > +  if (!*output)
> > +    *output = gen_reg_rtx (TYPE_MODE (vectype));
> > +
> > +  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output,
> > +in0, in1));
> > +  return true;
> > +}
> > +
> >  /* Generate a byte permute mask for a register of mode MODE,
> >     which has NUNITS units.  */
> >
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index
> >
> 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d
> 2
> > 44a2a23e76cac097 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -6112,6 +6112,22 @@ instruction pattern.  There is no need for the
> hook
> > to handle these two  implementation approaches itself.
> >  @end deftypefn
> >
> > +@deftypefn {Target Hook} bool
> > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +(enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree
> > +@var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1}) This
> > +hook is used to test whether the target has a special method of
> > +division of vectors of type @var{vectype} using the two operands
> > @code{treeop0}, and @code{treeop1} and producing a vector of type
> > @var{vectype}.  The division will then not be decomposed by the and kept
> as
> > a div.
> > +
> > +When the hook is being used to test whether the target supports a
> > +special divide, @var{in0}, @var{in1}, and @var{output} are all null.
> > +When the hook is being used to emit a division, @var{in0} and @var{in1}
> > +are the source vectors of type @var{vectype} and @var{output} is the
> > +destination vector of type @var{vectype}.
> > +
> > +Return true if the operation is possible, emitting instructions for it
> > +if rtxes are provided and updating @var{output}.
> > +@end deftypefn
> > +
> >  @deftypefn {Target Hook} tree
> > TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned
> > @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})  This hook
> > should return the decl of a function that implements the  vectorized variant
> > of the function with the @code{combined_fn} code diff --git
> > a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index
> >
> 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b
> 04
> > 076d058c24ce093 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -4164,6 +4164,8 @@ address;  but often a machine-dependent
> strategy
> > can generate better code.
> >
> >  @hook TARGET_VECTORIZE_VEC_PERM_CONST
> >
> > +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
> > +
> >  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
> >
> >  @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION
> > diff --git a/gcc/explow.cc b/gcc/explow.cc index
> >
> ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae67
> 8f
> > 5e346bf34ba0036 100644
> > --- a/gcc/explow.cc
> > +++ b/gcc/explow.cc
> > @@ -1037,7 +1037,7 @@ round_push (rtx size)
> >       TRUNC_DIV_EXPR.  */
> >    size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
> >  		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
> > +  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size,
> > + align_rtx,
> >  			NULL_RTX, 1);
> >    size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
> >
> > @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned
> > required_align)
> >  			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
> >  				       Pmode),
> >  			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
> > -  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
> > +  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL,
> > target,
> >  			  gen_int_mode (required_align / BITS_PER_UNIT,
> >  					Pmode),
> >  			  NULL_RTX, 1);
> > diff --git a/gcc/expmed.h b/gcc/expmed.h index
> >
> 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501
> c6
> > f33cb3595659b5 100644
> > --- a/gcc/expmed.h
> > +++ b/gcc/expmed.h
> > @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code,
> > machine_mode,  extern rtx expand_shift (enum tree_code,
> machine_mode,
> > rtx, poly_int64, rtx,
> >  			 int);
> >  #ifdef GCC_OPTABS_H
> > -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
> > -			  rtx, int, enum optab_methods =
> > OPTAB_LIB_WIDEN);
> > +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree,
> > tree,
> > +			  rtx, rtx, rtx, int,
> > +			  enum optab_methods = OPTAB_LIB_WIDEN);
> >  #endif
> >  #endif
> >
> > diff --git a/gcc/expmed.cc b/gcc/expmed.cc index
> >
> 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb
> 09
> > 90db8b97d3af414 100644
> > --- a/gcc/expmed.cc
> > +++ b/gcc/expmed.cc
> > @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx
> > op0, HOST_WIDE_INT d)
> >
> >  rtx
> >  expand_divmod (int rem_flag, enum tree_code code, machine_mode
> > mode,
> > -	       rtx op0, rtx op1, rtx target, int unsignedp,
> > -	       enum optab_methods methods)
> > +	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
> > +	       int unsignedp, enum optab_methods methods)
> >  {
> >    machine_mode compute_mode;
> >    rtx tquotient;
> > @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >
> >    last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
> >
> > +  /* Check if the target has specific expansions for the division.  */
> > +  if (treeop0
> > +      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE
> > (treeop0),
> > +						     treeop0, treeop1,
> > +						     &target, op0, op1))
> > +    return target;
> > +
> > +
> >    /* Now convert to the best mode to use.  */
> >    if (compute_mode != mode)
> >      {
> > @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  			    || (optab_handler (sdivmod_optab, int_mode)
> >  				!= CODE_FOR_nothing)))
> >  		      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
> > -						int_mode, op0,
> > -						gen_int_mode (abs_d,
> > +						int_mode, treeop0, treeop1,
> > +						op0, gen_int_mode (abs_d,
> >  							      int_mode),
> >  						NULL_RTX, 0);
> >  		    else
> > @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code
> > code, machine_mode mode,
> >  				      size - 1, NULL_RTX, 0);
> >  		t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
> >  				    NULL_RTX);
> > -		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3,
> > op1,
> > -				    NULL_RTX, 0);
> > +		t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode,
> > treeop0,
> > +				    treeop1, t3, op1, NULL_RTX, 0);
> >  		if (t4)
> >  		  {
> >  		    rtx t5;
> > diff --git a/gcc/expr.cc b/gcc/expr.cc
> > index
> >
> 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd
> 96
> > a8abc055fa34d9 100644
> > --- a/gcc/expr.cc
> > +++ b/gcc/expr.cc
> > @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
> >  	    return expand_divmod (0,
> >  				  FLOAT_MODE_P (GET_MODE (value))
> >  				  ? RDIV_EXPR : TRUNC_DIV_EXPR,
> > -				  GET_MODE (value), op1, op2, target, 0);
> > +				  GET_MODE (value), NULL, NULL, op1, op2,
> > +				  target, 0);
> >  	case MOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 0);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 0);
> >  	case UDIV:
> > -	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case UMOD:
> > -	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > op1, op2,
> > -				target, 1);
> > +	  return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value),
> > NULL, NULL,
> > +				op1, op2, target, 1);
> >  	case ASHIFTRT:
> >  	  return expand_simple_binop (GET_MODE (value), code, op1, op2,
> >  				      target, 0, OPTAB_LIB_WIDEN);
> > @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code,
> > machine_mode mode, tree treeop0,
> >        bool speed_p = optimize_insn_for_speed_p ();
> >        do_pending_stack_adjust ();
> >        start_sequence ();
> > -      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target,
> 1);
> > +      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 1);
> >        rtx_insn *uns_insns = get_insns ();
> >        end_sequence ();
> >        start_sequence ();
> > -      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target,
> 0);
> > +      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +				   op0, op1, target, 0);
> >        rtx_insn *sgn_insns = get_insns ();
> >        end_sequence ();
> >        unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7
> +9019,8
> > @@ expand_expr_divmod (tree_code code, machine_mode mode, tree
> > treeop0,
> >        emit_insn (sgn_insns);
> >        return sgn_ret;
> >      }
> > -  return expand_divmod (mod_p, code, mode, op0, op1, target,
> unsignedp);
> > +  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
> > +			op0, op1, target, unsignedp);
> >  }
> >
> >  rtx
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index
> >
> 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abf
> d
> > 872f340855dc96 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode
> mode,
> > rtx op0, rtx op1, bool unsignedp)
> >  		return NULL_RTX;
> >  	    }
> >  	}
> > -      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > sum,
> > -				     gen_int_mode (INTVAL (op1),
> > word_mode),
> > +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode,
> > NULL, NULL,
> > +				     sum, gen_int_mode (INTVAL (op1),
> > +							word_mode),
> >  				     NULL_RTX, 1, OPTAB_DIRECT);
> >        if (remainder == NULL_RTX)
> >  	return NULL_RTX;
> > @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> >
> >    if (op11 != const1_rtx)
> >      {
> > -      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
> > -				NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL,
> > quot1,
> > +				op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> >
> > @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode
> > mode, rtx op0, rtx op1, rtx *rem,
> >        if (rem2 == NULL_RTX)
> >  	return NULL_RTX;
> >
> > -      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
> > -				 NULL_RTX, unsignedp, OPTAB_DIRECT);
> > +      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL,
> > quot1,
> > +				 op11, NULL_RTX, unsignedp,
> > OPTAB_DIRECT);
> >        if (quot2 == NULL_RTX)
> >  	return NULL_RTX;
> >
> > diff --git a/gcc/target.def b/gcc/target.def index
> >
> 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b0
> 70
> > 81cdd70113db9b1 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -1902,6 +1902,25 @@ implementation approaches itself.",
> >  	const vec_perm_indices &sel),
> >   NULL)
> >
> > +DEFHOOK
> > +(can_special_div_by_const,
> > + "This hook is used to test whether the target has a special method
> > +of\n\ division of vectors of type @var{vectype} using the two operands
> > +@code{treeop0},\n\ and @code{treeop1} and producing a vector of type
> > +@var{vectype}.  The division\n\ will then not be decomposed by the
> > +vectorizer and kept as a div.\n\ \n\ When the hook is being used to test whether the
> > +target supports a special\n\ divide, @var{in0}, @var{in1}, and
> > +@var{output} are all null.  When the hook\n\ is being used to emit a
> > +division, @var{in0} and @var{in1} are the source\n\ vectors of type
> > +@var{vecttype} and @var{output} is the destination vector of\n\ type
> > +@var{vectype}.\n\ \n\ Return true if the operation is possible,
> > +emitting instructions for it\n\ if rtxes are provided and updating
> > +@var{output}.",  bool, (enum tree_code, tree vectype, tree treeop0,
> > +tree treeop1, rtx *output,
> > +	rtx in0, rtx in1),
> > + default_can_special_div_by_const)
> > +
> >  /* Return true if the target supports misaligned store/load of a
> >     specific factor denoted in the third parameter.  The last parameter
> >     is true if the access is defined in a packed struct.  */ diff --git
> a/gcc/target.h
> > b/gcc/target.h index
> >
> d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da
> 56
> > f39c061f68b665 100644
> > --- a/gcc/target.h
> > +++ b/gcc/target.h
> > @@ -51,6 +51,7 @@
> >  #include "insn-codes.h"
> >  #include "tm.h"
> >  #include "hard-reg-set.h"
> > +#include "tree-core.h"
> >
> >  #if CHECKING_P
> >
> > diff --git a/gcc/targhooks.h b/gcc/targhooks.h index
> >
> ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e
> > 2640d63f936b336d 100644
> > --- a/gcc/targhooks.h
> > +++ b/gcc/targhooks.h
> > @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage
> > (addr_space_t, location_t);  extern rtx default_addr_space_convert (rtx,
> > tree, tree);  extern unsigned int default_case_values_threshold (void);
> > extern bool default_have_conditional_execution (void);
> > +extern bool default_can_special_div_by_const (enum tree_code, tree,
> > tree, tree,
> > +					      rtx *, rtx, rtx);
> >
> >  extern bool default_libc_has_function (enum function_class, tree);  extern
> > bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc
> > b/gcc/targhooks.cc index
> >
> b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba2
> 41
> > 279936ced41ee95 100644
> > --- a/gcc/targhooks.cc
> > +++ b/gcc/targhooks.cc
> > @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
> >    return HAVE_conditional_execution;
> >  }
> >
> > +/* Default that no division by constant operations are special.  */
> > +bool default_can_special_div_by_const (enum tree_code, tree, tree,
> > +tree, rtx *, rtx,
> > +				  rtx)
> > +{
> > +  return false;
> > +}
> > +
> >  /* By default we assume that c99 functions are present at the runtime,
> >     but sincos is not.  */
> >  bool
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b491
> 6f3
> > d7b4d5b64a19b9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint8_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..e904a71885b2e8487593a2c
> d3
> > db75b3e4112e2cc
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
> > @@ -0,0 +1,25 @@
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint16_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e7
> 20
> > 157701d9d1cf852
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
> > @@ -0,0 +1,26 @@
> > +/* { dg-require-effective-target vect_int } */
> > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-*
> > +} } */
> > +
> > +#include <stdint.h>
> > +#include "tree-vect.h"
> > +
> > +#define N 50
> > +#define TYPE uint32_t
> > +
> > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE*
> > +restrict pixel, TYPE level, int n) {
> > +  for (int i = 0; i < n; i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > +
> > +#include "vect-div-bitmask.h"
> > +
> > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern:
> > +detected" "vect" { target aarch64*-*-* } } } */
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..29a16739aa4b706616367bf
> d1
> > 832f28ebd07993e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
> > @@ -0,0 +1,43 @@
> > +#include <stdio.h>
> > +
> > +#ifndef N
> > +#define N 65
> > +#endif
> > +
> > +#ifndef TYPE
> > +#define TYPE uint32_t
> > +#endif
> > +
> > +#ifndef DEBUG
> > +#define DEBUG 0
> > +#endif
> > +
> > +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> > +
> > +int main ()
> > +{
> > +  TYPE a[N];
> > +  TYPE b[N];
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      a[i] = BASE + i * 13;
> > +      b[i] = BASE + i * 13;
> > +      if (DEBUG)
> > +        printf ("%d: 0x%x\n", i, a[i]);
> > +    }
> > +
> > +  fun1 (a, N / 2, N);
> > +  fun2 (b, N / 2, N);
> > +
> > +  for (int i = 0; i < N; ++i)
> > +    {
> > +      if (DEBUG)
> > +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> > +
> > +      if (a[i] != b[i])
> > +        __builtin_abort ();
> > +    }
> > +  return 0;
> > +}
> > +
> > diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> > b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf
> 44a
> > b211cd246d82d5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
> > @@ -0,0 +1,61 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } }
> > +*/
> > +
> > +#include <stdint.h>
> > +
> > +#pragma GCC target "+nosve"
> > +
> > +/*
> > +** draw_bitmap1:
> > +** ...
> > +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> > +** 	addhn	v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
> > +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> > +** 	uaddw	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
> > +** 	uzp2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > +** ...
> > +*/
> > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xff; }
> > +
> > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xfe; }
> > +
> > +/*
> > +** draw_bitmap3:
> > +** ...
> > +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> > +** 	addhn	v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
> > +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> > +** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
> > +** 	uzp2	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
> > +** ...
> > +*/
> > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * level) / 0xffffU; }
> > +
> > +/*
> > +** draw_bitmap4:
> > +** ...
> > +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> > +** 	addhn	v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
> > +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> > +** 	uaddw	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
> > +** 	uzp2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > +** ...
> > +*/
> > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) {
> > +  for (int i = 0; i < (n & -16); i+=1)
> > +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; }
> > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index
> >
> 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817
> c
> > 9a12046b6ec94f3 100644
> > --- a/gcc/tree-vect-generic.cc
> > +++ b/gcc/tree-vect-generic.cc
> > @@ -1237,6 +1237,14 @@ expand_vector_operation
> (gimple_stmt_iterator
> > *gsi, tree type, tree compute_type
> >  	  tree rhs2 = gimple_assign_rhs2 (assign);
> >  	  tree ret;
> >
> > +	  /* Check if the target was going to handle it through the special
> > +	     division callback hook.  */
> > +	  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
> > +							  rhs2, NULL,
> > +							  NULL_RTX,
> > NULL_RTX))
> > +	    return NULL_TREE;
> > +
> > +
> >  	  if (!optimize
> >  	      || !VECTOR_INTEGER_TYPE_P (type)
> >  	      || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect-
> > patterns.cc b/gcc/tree-vect-patterns.cc index
> >
> 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85
> af
> > 0b1bfea10fe443 100644
> > --- a/gcc/tree-vect-patterns.cc
> > +++ b/gcc/tree-vect-patterns.cc
> > @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
> >
> >        return pattern_stmt;
> >      }
> > +  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
> > +						       oprnd0, oprnd1, NULL,
> > +						       NULL_RTX, NULL_RTX))
> > +    {
> > +      return NULL;
> > +    }
> >
> >    if (prec > HOST_BITS_PER_WIDE_INT
> >        || integer_zerop (oprnd1))
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index
> >
> c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288b
> d6
> > 8e0e1c1e93faafe 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
> >  	}
> >        target_support_p = (optab_handler (optab, vec_mode)
> >  			  != CODE_FOR_nothing);
> > +      if (!target_support_p)
> > +	target_support_p
> > +	  = targetm.vectorize.can_special_div_by_const (code, vectype,
> > +							op0, op1, NULL,
> > +							NULL_RTX,
> > NULL_RTX);
> >      }
> >
> >    bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
> >
> >
> >
> >
> > --


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/4]AArch64 Add SVE2 implementation for pow2 bitmask division
  2022-09-23  9:33 ` [PATCH 3/4]AArch64 Add SVE2 " Tamar Christina
  2022-10-31 11:34   ` Tamar Christina
@ 2022-11-12 12:17   ` Richard Sandiford
  1 sibling, 0 replies; 35+ messages in thread
From: Richard Sandiford @ 2022-11-12 12:17 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Sorry for the slow review, been snowed under with stage1 stuff.

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> In plenty of image and video processing code it's common to modify pixel values
> by a widening operation and then scale them back into range by dividing by 255.
>
> This patch adds an named function to allow us to emit an optimized sequence
> when doing an unsigned division that is equivalent to:
>
>    x = y / (2 ^ (bitsize (y)/2)-1)
>
> For SVE2 this means we generate for:
>
> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> {
>   for (int i = 0; i < (n & -16); i+=1)
>     pixel[i] = (pixel[i] * level) / 0xff;
> }
>
> the following:
>
>         mov     z3.b, #1
> .L3:
>         ld1b    z0.h, p0/z, [x0, x3]
>         mul     z0.h, p1/m, z0.h, z2.h
>         addhnb  z1.b, z0.h, z3.h
>         addhnb  z0.b, z0.h, z1.h
>         st1b    z0.h, p0, [x0, x3]
>         inch    x3
>         whilelo p0.h, w3, w2
>         b.any   .L3
>
> instead of:
>
> .L3:
>         ld1b    z0.h, p1/z, [x0, x3]
>         mul     z0.h, p0/m, z0.h, z1.h
>         umulh   z0.h, p0/m, z0.h, z2.h
>         lsr     z0.h, z0.h, #7
>         st1b    z0.h, p1, [x0, x3]
>         inch    x3
>         whilelo p1.h, w3, w2
>         b.any   .L3
>
> Which results in significantly faster code.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-sve2.md (@aarch64_bitmask_udiv<mode>3): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve2/div-by-bitmask_1.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index f138f4be4bcf74c1a4a6d5847ed831435246737f..4d097f7c405cc68a1d6cda5c234a1023a6eba0d1 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -71,6 +71,7 @@
>  ;; ---- [INT] Reciprocal approximation
>  ;; ---- [INT<-FP] Base-2 logarithm
>  ;; ---- [INT] Polynomial multiplication
> +;; ---- [INT] Misc optab implementations
>  ;;
>  ;; == Permutation
>  ;; ---- [INT,FP] General permutes
> @@ -2312,6 +2313,47 @@ (define_insn "@aarch64_sve_<optab><mode>"
>    "<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
>  )
>  
> +;; -------------------------------------------------------------------------
> +;; ---- [INT] Misc optab implementations
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - aarch64_bitmask_udiv
> +;; -------------------------------------------------------------------------
> +
> +;; div optimizations using narrowings
> +;; we can do the division e.g. shorts by 255 faster by calculating it as
> +;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> +;; double the precision of x.
> +;;
> +;; See aarch64-simd.md for bigger explanation.
> +(define_expand "@aarch64_bitmask_udiv<mode>3"
> +  [(match_operand:SVE_FULL_HSDI 0 "register_operand")
> +   (match_operand:SVE_FULL_HSDI 1 "register_operand")
> +   (match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
> +  "TARGET_SVE2"
> +{
> +  unsigned HOST_WIDE_INT size
> +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
> +  if (!CONST_VECTOR_P (operands[2])
> +      || const_vector_encoded_nelts (operands[2]) != 1
> +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> +    FAIL;

A slightly simpler way to write this, without the direct use of the
encoding, is:

  rtx elt = unwrap_const_vec_duplicate (operands[2]);
  if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
    FAIL;

OK with that change, thanks.

Richard

> +
> +  rtx addend = gen_reg_rtx (<MODE>mode);
> +  rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
> +  rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
> +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
> +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
> +  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
> +			      addend));
> +  emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
> +			      lowpart_subreg (<MODE>mode, tmp1,
> +					      <VNARROW>mode)));
> +  emit_move_insn (operands[0],
> +		  lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
> +  DONE;
> +})
> +
>  ;; =========================================================================
>  ;; == Permutation
>  ;; =========================================================================
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e6f5098c30f4e2eb8ed1af153c0bb0d204cda6d9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_1.c
> @@ -0,0 +1,53 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +**	mul	z[0-9]+.h, p[0-9]+/m, z[0-9]+.h, z[0-9]+.h
> +**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
> +**	addhnb	z[0-9]+.b, z[0-9]+.h, z[0-9]+.h
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xff;
> +}
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xfe;
> +}
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +**	mul	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
> +**	addhnb	z[0-9]+.h, z[0-9]+.s, z[0-9]+.s
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * level) / 0xffffU;
> +}
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +**	mul	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
> +**	addhnb	z[0-9]+.s, z[0-9]+.d, z[0-9]+.d
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT
  2022-09-23  9:34 ` [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT Tamar Christina
  2022-10-31 11:34   ` Tamar Christina
@ 2022-11-12 12:25   ` Richard Sandiford
  2022-11-12 12:33     ` Richard Sandiford
  1 sibling, 1 reply; 35+ messages in thread
From: Richard Sandiford @ 2022-11-12 12:25 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> This adds an RTL pattern for when two NARROWB instructions are being combined
> with a PACK.  The second NARROWB is then transformed into a NARROWT.
>
> For the example:
>
> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> {
>   for (int i = 0; i < (n & -16); i+=1)
>     pixel[i] += (pixel[i] * level) / 0xff;
> }
>
> we generate:
>
>         addhnb  z6.b, z0.h, z4.h
>         addhnb  z5.b, z1.h, z4.h
>         addhnb  z0.b, z0.h, z6.h
>         addhnt  z0.b, z1.h, z5.h
>         add     z0.b, z0.b, z2.b
>
> instead of:
>
>         addhnb  z6.b, z1.h, z4.h
>         addhnb  z5.b, z0.h, z4.h
>         addhnb  z1.b, z1.h, z6.h
>         addhnb  z0.b, z0.h, z5.h
>         uzp1    z0.b, z0.b, z1.b
>         add     z0.b, z0.b, z2.b
>
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-sve2.md (*aarch64_sve_pack_<sve_int_op><mode>):
> 	New.
> 	* config/aarch64/iterators.md (binary_top): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
> 	* gcc.target/aarch64/sve2/div-by-bitmask_2.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
> index ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98ad3c41e5d05d8cf38 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1600,6 +1600,25 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
>    "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
>  )
>  
> +(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
> +  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
> +	(unspec:<VNARROW>
> +	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")

"0" would be safer, in case the instruction is only split after RA.

> +	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
> +	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> +	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
> +	     SVE2_INT_BINARY_NARROWB) 0)]
> +	  UNSPEC_PACK))]

I think ideally this would be the canonical pattern, so that we can
drop the separate top unspecs.  That's more work though, and would
probably make sense to do once we have a generic way of representing
the pack.

So OK with the "0" change above.

Thanks,
Richard

> +  "TARGET_SVE2"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +{
> +  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1], <MODE>mode);
> +  emit_insn (gen_aarch64_sve (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
> +			      operands[0], tmp, operands[2], operands[3]));
> +})
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT] Narrowing right shifts
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82afa67613a087389d4 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
>  
>  (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")])
>  
> +(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
> +			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
> +			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
> +			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
> +
>  (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
>  			     (UNSPEC_ADCLT "adclt")
>  			     (UNSPEC_ADDHNB "addhnb")
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15c82dd9726897cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdbced36c8038db1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +** 	addhnb	z6.b, z0.h, z4.h
> +** 	addhnb	z5.b, z1.h, z4.h
> +** 	addhnb	z0.b, z0.h, z6.h
> +** 	addhnt	z0.b, z1.h, z5.h
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xff;
> +}
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xfe;
> +}
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +** 	addhnb	z6.h, z0.s, z4.s
> +** 	addhnb	z5.h, z1.s, z4.s
> +** 	addhnb	z0.h, z0.s, z6.s
> +** 	addhnt	z0.h, z1.s, z5.s
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xffffU;
> +}
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +** 	addhnb	z6.s, z0.d, z4.d
> +** 	addhnb	z5.s, z1.d, z4.d
> +** 	addhnb	z0.s, z0.d, z6.d
> +** 	addhnt	z0.s, z1.d, z5.d
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT
  2022-11-12 12:25   ` Richard Sandiford
@ 2022-11-12 12:33     ` Richard Sandiford
  0 siblings, 0 replies; 35+ messages in thread
From: Richard Sandiford @ 2022-11-12 12:33 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Richard Sandiford <richard.sandiford@arm.com> writes:
> Tamar Christina <tamar.christina@arm.com> writes:
>> Hi All,
>>
>> This adds an RTL pattern for when two NARROWB instructions are being combined
>> with a PACK.  The second NARROWB is then transformed into a NARROWT.
>>
>> For the example:
>>
>> void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
>> {
>>   for (int i = 0; i < (n & -16); i+=1)
>>     pixel[i] += (pixel[i] * level) / 0xff;
>> }
>>
>> we generate:
>>
>>         addhnb  z6.b, z0.h, z4.h
>>         addhnb  z5.b, z1.h, z4.h
>>         addhnb  z0.b, z0.h, z6.h
>>         addhnt  z0.b, z1.h, z5.h
>>         add     z0.b, z0.b, z2.b
>>
>> instead of:
>>
>>         addhnb  z6.b, z1.h, z4.h
>>         addhnb  z5.b, z0.h, z4.h
>>         addhnb  z1.b, z1.h, z6.h
>>         addhnb  z0.b, z0.h, z5.h
>>         uzp1    z0.b, z0.b, z1.b
>>         add     z0.b, z0.b, z2.b
>>
>> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64-sve2.md (*aarch64_sve_pack_<sve_int_op><mode>):
>> 	New.
>> 	* config/aarch64/iterators.md (binary_top): New.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
>> 	* gcc.target/aarch64/sve2/div-by-bitmask_2.c: New test.
>>
>> --- inline copy of patch -- 
>> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
>> index ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98ad3c41e5d05d8cf38 100644
>> --- a/gcc/config/aarch64/aarch64-sve2.md
>> +++ b/gcc/config/aarch64/aarch64-sve2.md
>> @@ -1600,6 +1600,25 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
>>    "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
>>  )
>>  
>> +(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
>> +  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
>> +	(unspec:<VNARROW>
>> +	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
>
> "0" would be safer, in case the instruction is only split after RA.
>
>> +	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
>> +	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
>> +	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
>> +	     SVE2_INT_BINARY_NARROWB) 0)]
>> +	  UNSPEC_PACK))]
>
> I think ideally this would be the canonical pattern, so that we can
> drop the separate top unspecs.  That's more work though, and would
> probably make sense to do once we have a generic way of representing
> the pack.
>
> So OK with the "0" change above.

Hmm, actually, I take that back.  Is this transform really correct?
I think the blend corresponds to a TRN1 rather than a UZP1.
The bottom operations populate the lower half of each wider element
and the top operations populate the upper half.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2022-11-12 12:33 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-09  4:39 [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Tamar Christina
2022-06-09  4:40 ` [PATCH 2/2]AArch64 aarch64: Add implementation for pow2 bitmask division Tamar Christina
2022-06-13  9:24 ` [PATCH 1/2]middle-end Support optimized division by pow2 bitmask Richard Biener
2022-06-13  9:39   ` Richard Biener
2022-06-13 10:09     ` Tamar Christina
2022-06-13 11:47       ` Richard Biener
2022-06-13 14:37         ` Tamar Christina
2022-06-14 13:18           ` Richard Biener
2022-06-14 13:38             ` Tamar Christina
2022-06-14 13:42             ` Richard Sandiford
2022-06-14 15:57               ` Tamar Christina
2022-06-14 16:09                 ` Richard Biener
2022-06-22  0:34                 ` Tamar Christina
2022-06-26 19:55                   ` Jeff Law
2022-09-23  9:33 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Tamar Christina
2022-09-23  9:33 ` [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division Tamar Christina
2022-10-31 11:34   ` Tamar Christina
2022-11-09  8:33     ` Tamar Christina
2022-11-09 16:02     ` Kyrylo Tkachov
2022-09-23  9:33 ` [PATCH 3/4]AArch64 Add SVE2 " Tamar Christina
2022-10-31 11:34   ` Tamar Christina
2022-11-09  8:33     ` Tamar Christina
2022-11-12 12:17   ` Richard Sandiford
2022-09-23  9:34 ` [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT Tamar Christina
2022-10-31 11:34   ` Tamar Christina
2022-11-09  8:33     ` Tamar Christina
2022-11-12 12:25   ` Richard Sandiford
2022-11-12 12:33     ` Richard Sandiford
2022-09-26 10:39 ` [PATCH 1/4]middle-end Support not decomposing specific divisions during vectorization Richard Biener
2022-10-31 11:34   ` Tamar Christina
2022-10-31 17:12     ` Jeff Law
2022-11-08 17:36     ` Tamar Christina
2022-11-09  8:01       ` Richard Biener
2022-11-09  8:26         ` Tamar Christina
2022-11-09 10:37 ` Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).