public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements
@ 2022-11-11 13:29 Andrew Carlotti
  2022-11-11 13:39 ` [PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit Andrew Carlotti
                   ` (8 more replies)
  0 siblings, 9 replies; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 13:29 UTC (permalink / raw)
  To: gcc-patches

This is a series of patches to improve recognition of popcount and
clz/ctz idioms, along with some related fixes.

- Patches 1 and 8 are independent fixes or improvements.
- Patch 4 is a dependency of patch 5, as it improves the robustness of a
  test that would otherwise begin failing.
- Patches 2, 3, 5 and 7 form the main dependent sequence.
- Patch 6 is a documentation update, covering attributes in patch 5 and
  existing code.
- Patch 7 may require other work before it can be merged, as it seems to
  expose a latent issue in the vectoriser.

Each patch has been bootstrapped and regression tested on
aarch64-none-linux-gnu.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
@ 2022-11-11 13:39 ` Andrew Carlotti
  2022-11-14 14:23   ` Jeff Law
  2022-11-11 13:46 ` [PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount Andrew Carlotti
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 13:39 UTC (permalink / raw)
  To: gcc-patches

This prevents a null dereference error when outputting debug information
following an early exit from number_of_iterations_exit_assumptions.

gcc/ChangeLog:

	* tree-ssa-loop-niter.cc (number_of_iterations_exit_assumptions):
	Move at_stmt assignment.


--


diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 4ffcef4f4ff2fe182fbe711553c8e4575560ab07..cdbb924216243ebcabe6c695698a4aee71882c49 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -2537,6 +2537,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
   if (!stmt)
     return false;
 
+  if (at_stmt)
+    *at_stmt = stmt;
+
   /* We want the condition for staying inside loop.  */
   code = gimple_cond_code (stmt);
   if (exit->flags & EDGE_TRUE_VALUE)
@@ -2642,9 +2645,6 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
   if (TREE_CODE (niter->niter) == INTEGER_CST)
     niter->max = wi::to_widest (niter->niter);
 
-  if (at_stmt)
-    *at_stmt = stmt;
-
   return (!integer_zerop (niter->assumptions));
 }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
  2022-11-11 13:39 ` [PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit Andrew Carlotti
@ 2022-11-11 13:46 ` Andrew Carlotti
  2022-11-14 14:24   ` Jeff Law
  2022-11-11 13:52 ` [PATCH 3/8] middle-end: Refactor number_of_iterations_popcount Andrew Carlotti
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 13:46 UTC (permalink / raw)
  To: gcc-patches

gcc/ChangeLog:

	* tree-ssa-loop-niter.cc (ssa_defined_by_minus_one_stmt_p): Move.
	(number_of_iterations_popcount): Move, and remove separate prototype.


--


diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index cdbb924216243ebcabe6c695698a4aee71882c49..c23643fd9dd8b27ff11549e1f28f585534e84cd3 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -63,11 +63,6 @@ struct bounds
   mpz_t below, up;
 };
 
-static bool number_of_iterations_popcount (loop_p loop, edge exit,
-					   enum tree_code code,
-					   class tree_niter_desc *niter);
-
-
 /* Splits expression EXPR to a variable part VAR and constant OFFSET.  */
 
 static void
@@ -2031,6 +2026,200 @@ number_of_iterations_cond (class loop *loop,
   return ret;
 }
 
+/* Utility function to check if OP is defined by a stmt
+   that is a val - 1.  */
+
+static bool
+ssa_defined_by_minus_one_stmt_p (tree op, tree val)
+{
+  gimple *stmt;
+  return (TREE_CODE (op) == SSA_NAME
+	  && (stmt = SSA_NAME_DEF_STMT (op))
+	  && is_gimple_assign (stmt)
+	  && (gimple_assign_rhs_code (stmt) == PLUS_EXPR)
+	  && val == gimple_assign_rhs1 (stmt)
+	  && integer_minus_onep (gimple_assign_rhs2 (stmt)));
+}
+
+/* See if LOOP is a popcout implementation, determine NITER for the loop
+
+   We match:
+   <bb 2>
+   goto <bb 4>
+
+   <bb 3>
+   _1 = b_11 + -1
+   b_6 = _1 & b_11
+
+   <bb 4>
+   b_11 = PHI <b_5(D)(2), b_6(3)>
+
+   exit block
+   if (b_11 != 0)
+	goto <bb 3>
+   else
+	goto <bb 5>
+
+   OR we match copy-header version:
+   if (b_5 != 0)
+	goto <bb 3>
+   else
+	goto <bb 4>
+
+   <bb 3>
+   b_11 = PHI <b_5(2), b_6(3)>
+   _1 = b_11 + -1
+   b_6 = _1 & b_11
+
+   exit block
+   if (b_6 != 0)
+	goto <bb 3>
+   else
+	goto <bb 4>
+
+   If popcount pattern, update NITER accordingly.
+   i.e., set NITER to  __builtin_popcount (b)
+   return true if we did, false otherwise.
+
+ */
+
+static bool
+number_of_iterations_popcount (loop_p loop, edge exit,
+			       enum tree_code code,
+			       class tree_niter_desc *niter)
+{
+  bool adjust = true;
+  tree iter;
+  HOST_WIDE_INT max;
+  adjust = true;
+  tree fn = NULL_TREE;
+
+  /* Check loop terminating branch is like
+     if (b != 0).  */
+  gimple *stmt = last_stmt (exit->src);
+  if (!stmt
+      || gimple_code (stmt) != GIMPLE_COND
+      || code != NE_EXPR
+      || !integer_zerop (gimple_cond_rhs (stmt))
+      || TREE_CODE (gimple_cond_lhs (stmt)) != SSA_NAME)
+    return false;
+
+  gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (stmt));
+
+  /* Depending on copy-header is performed, feeding PHI stmts might be in
+     the loop header or loop latch, handle this.  */
+  if (gimple_code (and_stmt) == GIMPLE_PHI
+      && gimple_bb (and_stmt) == loop->header
+      && gimple_phi_num_args (and_stmt) == 2
+      && (TREE_CODE (gimple_phi_arg_def (and_stmt,
+					 loop_latch_edge (loop)->dest_idx))
+	  == SSA_NAME))
+    {
+      /* SSA used in exit condition is defined by PHI stmt
+	b_11 = PHI <b_5(D)(2), b_6(3)>
+	from the PHI stmt, get the and_stmt
+	b_6 = _1 & b_11.  */
+      tree t = gimple_phi_arg_def (and_stmt, loop_latch_edge (loop)->dest_idx);
+      and_stmt = SSA_NAME_DEF_STMT (t);
+      adjust = false;
+    }
+
+  /* Make sure it is indeed an and stmt (b_6 = _1 & b_11).  */
+  if (!is_gimple_assign (and_stmt)
+      || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR)
+    return false;
+
+  tree b_11 = gimple_assign_rhs1 (and_stmt);
+  tree _1 = gimple_assign_rhs2 (and_stmt);
+
+  /* Check that _1 is defined by _b11 + -1 (_1 = b_11 + -1).
+     Also make sure that b_11 is the same in and_stmt and _1 defining stmt.
+     Also canonicalize if _1 and _b11 are revrsed.  */
+  if (ssa_defined_by_minus_one_stmt_p (b_11, _1))
+    std::swap (b_11, _1);
+  else if (ssa_defined_by_minus_one_stmt_p (_1, b_11))
+    ;
+  else
+    return false;
+  /* Check the recurrence:
+   ... = PHI <b_5(2), b_6(3)>.  */
+  gimple *phi = SSA_NAME_DEF_STMT (b_11);
+  if (gimple_code (phi) != GIMPLE_PHI
+      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
+      || (gimple_assign_lhs (and_stmt)
+	  != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
+    return false;
+
+  /* We found a match. Get the corresponding popcount builtin.  */
+  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
+  if (TYPE_PRECISION (TREE_TYPE (src)) <= TYPE_PRECISION (integer_type_node))
+    fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
+  else if (TYPE_PRECISION (TREE_TYPE (src))
+	   == TYPE_PRECISION (long_integer_type_node))
+    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
+  else if (TYPE_PRECISION (TREE_TYPE (src))
+	   == TYPE_PRECISION (long_long_integer_type_node)
+	   || (TYPE_PRECISION (TREE_TYPE (src))
+	       == 2 * TYPE_PRECISION (long_long_integer_type_node)))
+    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL);
+
+  if (!fn)
+    return false;
+
+  /* Update NITER params accordingly  */
+  tree utype = unsigned_type_for (TREE_TYPE (src));
+  src = fold_convert (utype, src);
+  if (TYPE_PRECISION (TREE_TYPE (src)) < TYPE_PRECISION (integer_type_node))
+    src = fold_convert (unsigned_type_node, src);
+  tree call;
+  if (TYPE_PRECISION (TREE_TYPE (src))
+      == 2 * TYPE_PRECISION (long_long_integer_type_node))
+    {
+      int prec = TYPE_PRECISION (long_long_integer_type_node);
+      tree src1 = fold_convert (long_long_unsigned_type_node,
+				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
+					     unshare_expr (src),
+					     build_int_cst (integer_type_node,
+							    prec)));
+      tree src2 = fold_convert (long_long_unsigned_type_node, src);
+      call = build_call_expr (fn, 1, src1);
+      call = fold_build2 (PLUS_EXPR, TREE_TYPE (call), call,
+			  build_call_expr (fn, 1, src2));
+      call = fold_convert (utype, call);
+    }
+  else
+    call = fold_convert (utype, build_call_expr (fn, 1, src));
+  if (adjust)
+    iter = fold_build2 (MINUS_EXPR, utype, call, build_int_cst (utype, 1));
+  else
+    iter = call;
+
+  if (TREE_CODE (call) == INTEGER_CST)
+    max = tree_to_uhwi (call);
+  else
+    max = TYPE_PRECISION (TREE_TYPE (src));
+  if (adjust)
+    max = max - 1;
+
+  niter->niter = iter;
+  niter->assumptions = boolean_true_node;
+
+  if (adjust)
+    {
+      tree may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
+				      build_zero_cst (TREE_TYPE (src)));
+      niter->may_be_zero
+	= simplify_using_initial_conditions (loop, may_be_zero);
+    }
+  else
+    niter->may_be_zero = boolean_false_node;
+
+  niter->max = max;
+  niter->bound = NULL_TREE;
+  niter->cmp = ERROR_MARK;
+  return true;
+}
+
 /* Substitute NEW_TREE for OLD in EXPR and fold the result.
    If VALUEIZE is non-NULL then OLD and NEW_TREE are ignored and instead
    all SSA names are replaced with the result of calling the VALUEIZE
@@ -2648,203 +2837,6 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
   return (!integer_zerop (niter->assumptions));
 }
 
-
-/* Utility function to check if OP is defined by a stmt
-   that is a val - 1.  */
-
-static bool
-ssa_defined_by_minus_one_stmt_p (tree op, tree val)
-{
-  gimple *stmt;
-  return (TREE_CODE (op) == SSA_NAME
-	  && (stmt = SSA_NAME_DEF_STMT (op))
-	  && is_gimple_assign (stmt)
-	  && (gimple_assign_rhs_code (stmt) == PLUS_EXPR)
-	  && val == gimple_assign_rhs1 (stmt)
-	  && integer_minus_onep (gimple_assign_rhs2 (stmt)));
-}
-
-
-/* See if LOOP is a popcout implementation, determine NITER for the loop
-
-   We match:
-   <bb 2>
-   goto <bb 4>
-
-   <bb 3>
-   _1 = b_11 + -1
-   b_6 = _1 & b_11
-
-   <bb 4>
-   b_11 = PHI <b_5(D)(2), b_6(3)>
-
-   exit block
-   if (b_11 != 0)
-	goto <bb 3>
-   else
-	goto <bb 5>
-
-   OR we match copy-header version:
-   if (b_5 != 0)
-	goto <bb 3>
-   else
-	goto <bb 4>
-
-   <bb 3>
-   b_11 = PHI <b_5(2), b_6(3)>
-   _1 = b_11 + -1
-   b_6 = _1 & b_11
-
-   exit block
-   if (b_6 != 0)
-	goto <bb 3>
-   else
-	goto <bb 4>
-
-   If popcount pattern, update NITER accordingly.
-   i.e., set NITER to  __builtin_popcount (b)
-   return true if we did, false otherwise.
-
- */
-
-static bool
-number_of_iterations_popcount (loop_p loop, edge exit,
-			       enum tree_code code,
-			       class tree_niter_desc *niter)
-{
-  bool adjust = true;
-  tree iter;
-  HOST_WIDE_INT max;
-  adjust = true;
-  tree fn = NULL_TREE;
-
-  /* Check loop terminating branch is like
-     if (b != 0).  */
-  gimple *stmt = last_stmt (exit->src);
-  if (!stmt
-      || gimple_code (stmt) != GIMPLE_COND
-      || code != NE_EXPR
-      || !integer_zerop (gimple_cond_rhs (stmt))
-      || TREE_CODE (gimple_cond_lhs (stmt)) != SSA_NAME)
-    return false;
-
-  gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (stmt));
-
-  /* Depending on copy-header is performed, feeding PHI stmts might be in
-     the loop header or loop latch, handle this.  */
-  if (gimple_code (and_stmt) == GIMPLE_PHI
-      && gimple_bb (and_stmt) == loop->header
-      && gimple_phi_num_args (and_stmt) == 2
-      && (TREE_CODE (gimple_phi_arg_def (and_stmt,
-					 loop_latch_edge (loop)->dest_idx))
-	  == SSA_NAME))
-    {
-      /* SSA used in exit condition is defined by PHI stmt
-	b_11 = PHI <b_5(D)(2), b_6(3)>
-	from the PHI stmt, get the and_stmt
-	b_6 = _1 & b_11.  */
-      tree t = gimple_phi_arg_def (and_stmt, loop_latch_edge (loop)->dest_idx);
-      and_stmt = SSA_NAME_DEF_STMT (t);
-      adjust = false;
-    }
-
-  /* Make sure it is indeed an and stmt (b_6 = _1 & b_11).  */
-  if (!is_gimple_assign (and_stmt)
-      || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR)
-    return false;
-
-  tree b_11 = gimple_assign_rhs1 (and_stmt);
-  tree _1 = gimple_assign_rhs2 (and_stmt);
-
-  /* Check that _1 is defined by _b11 + -1 (_1 = b_11 + -1).
-     Also make sure that b_11 is the same in and_stmt and _1 defining stmt.
-     Also canonicalize if _1 and _b11 are revrsed.  */
-  if (ssa_defined_by_minus_one_stmt_p (b_11, _1))
-    std::swap (b_11, _1);
-  else if (ssa_defined_by_minus_one_stmt_p (_1, b_11))
-    ;
-  else
-    return false;
-  /* Check the recurrence:
-   ... = PHI <b_5(2), b_6(3)>.  */
-  gimple *phi = SSA_NAME_DEF_STMT (b_11);
-  if (gimple_code (phi) != GIMPLE_PHI
-      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
-      || (gimple_assign_lhs (and_stmt)
-	  != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
-    return false;
-
-  /* We found a match. Get the corresponding popcount builtin.  */
-  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
-  if (TYPE_PRECISION (TREE_TYPE (src)) <= TYPE_PRECISION (integer_type_node))
-    fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
-  else if (TYPE_PRECISION (TREE_TYPE (src))
-	   == TYPE_PRECISION (long_integer_type_node))
-    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
-  else if (TYPE_PRECISION (TREE_TYPE (src))
-	   == TYPE_PRECISION (long_long_integer_type_node)
-	   || (TYPE_PRECISION (TREE_TYPE (src))
-	       == 2 * TYPE_PRECISION (long_long_integer_type_node)))
-    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL);
-
-  if (!fn)
-    return false;
-
-  /* Update NITER params accordingly  */
-  tree utype = unsigned_type_for (TREE_TYPE (src));
-  src = fold_convert (utype, src);
-  if (TYPE_PRECISION (TREE_TYPE (src)) < TYPE_PRECISION (integer_type_node))
-    src = fold_convert (unsigned_type_node, src);
-  tree call;
-  if (TYPE_PRECISION (TREE_TYPE (src))
-      == 2 * TYPE_PRECISION (long_long_integer_type_node))
-    {
-      int prec = TYPE_PRECISION (long_long_integer_type_node);
-      tree src1 = fold_convert (long_long_unsigned_type_node,
-				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
-					     unshare_expr (src),
-					     build_int_cst (integer_type_node,
-							    prec)));
-      tree src2 = fold_convert (long_long_unsigned_type_node, src);
-      call = build_call_expr (fn, 1, src1);
-      call = fold_build2 (PLUS_EXPR, TREE_TYPE (call), call,
-			  build_call_expr (fn, 1, src2));
-      call = fold_convert (utype, call);
-    }
-  else
-    call = fold_convert (utype, build_call_expr (fn, 1, src));
-  if (adjust)
-    iter = fold_build2 (MINUS_EXPR, utype, call, build_int_cst (utype, 1));
-  else
-    iter = call;
-
-  if (TREE_CODE (call) == INTEGER_CST)
-    max = tree_to_uhwi (call);
-  else
-    max = TYPE_PRECISION (TREE_TYPE (src));
-  if (adjust)
-    max = max - 1;
-
-  niter->niter = iter;
-  niter->assumptions = boolean_true_node;
-
-  if (adjust)
-    {
-      tree may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
-				      build_zero_cst (TREE_TYPE (src)));
-      niter->may_be_zero
-	= simplify_using_initial_conditions (loop, may_be_zero);
-    }
-  else
-    niter->may_be_zero = boolean_false_node;
-
-  niter->max = max;
-  niter->bound = NULL_TREE;
-  niter->cmp = ERROR_MARK;
-  return true;
-}
-
-
 /* Like number_of_iterations_exit_assumptions, but return TRUE only if
    the niter information holds unconditionally.  */
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 3/8] middle-end: Refactor number_of_iterations_popcount
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
  2022-11-11 13:39 ` [PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit Andrew Carlotti
  2022-11-11 13:46 ` [PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount Andrew Carlotti
@ 2022-11-11 13:52 ` Andrew Carlotti
  2022-11-14 14:52   ` Richard Biener
  2022-11-11 18:43 ` [PATCH 4/8] Modify test, to prevent the next patch breaking it Andrew Carlotti
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 13:52 UTC (permalink / raw)
  To: gcc-patches

This includes various changes to improve clarity, and to enable the code
to be more similar to the clz and ctz idiom recognition added in
subsequent patches.

We create a new number_of_iterations_bitcount function, which will be used
to call the other bit-counting recognition functions added in subsequent
patches, as well as a generic comment describing the loop structures
that are common to each idiom. Some of the variables in
number_of_iterations_popcount are given more descriptive names, and the
popcount expression builder is extracted into a separate function.

As part of the refactoring, we also fix a bug where the max loop count
for modes shorter than an integer would be incorrectly computed as if
the input mode were actually an integer.

We also ensure that niter->max takes into account the final value for
niter->niter (after any folding and simplifying), since if the latter is a
constant, then record_estimate mandates that the two values are equivalent.

gcc/ChangeLog:

	* tree-ssa-loop-niter.cc
	(number_of_iterations_exit_assumptions): Modify to call...
	(number_of_iterations_bitcount): ...this new function.
	(number_of_iterations_popcount): Now called by the above.
	Refactor, and extract popcount expression builder to...
	(build_popcount_expr): ...this new function.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/popcount-max.c: New test.


--


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c
new file mode 100644
index 0000000000000000000000000000000000000000..ca7204cbc3cea636183408e24d7dd36d702ffdb2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int count1 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b &= b - 1;
+	c++;
+    }
+    if (c <= PREC)
+      return 0;
+    else
+      return 34567;
+}
+
+int count2 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b &= b - 1;
+	c++;
+    }
+    if (c <= PREC - 1)
+      return 0;
+    else
+      return 76543;
+}
+
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 0af34e46580bb9a6f9b40e09c9f29b8454a4aaf6..fece876099c1687569d6351e7d2416ea6acae5b5 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -2026,6 +2026,48 @@ number_of_iterations_cond (class loop *loop,
   return ret;
 }
 
+/* Return an expression that computes the popcount of src.  */
+
+static tree
+build_popcount_expr (tree src)
+{
+  tree fn;
+  int prec = TYPE_PRECISION (TREE_TYPE (src));
+  int i_prec = TYPE_PRECISION (integer_type_node);
+  int li_prec = TYPE_PRECISION (long_integer_type_node);
+  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
+  if (prec <= i_prec)
+    fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
+  else if (prec == li_prec)
+    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
+  else if (prec == lli_prec || prec == 2 * lli_prec)
+    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL);
+  else
+    return NULL_TREE;
+
+  tree utype = unsigned_type_for (TREE_TYPE (src));
+  src = fold_convert (utype, src);
+  if (prec < i_prec)
+    src = fold_convert (unsigned_type_node, src);
+  tree call;
+  if (prec == 2 * lli_prec)
+    {
+      tree src1 = fold_convert (long_long_unsigned_type_node,
+				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
+					     unshare_expr (src),
+					     build_int_cst (integer_type_node,
+							    lli_prec)));
+      tree src2 = fold_convert (long_long_unsigned_type_node, src);
+      tree call1 = build_call_expr (fn, 1, src1);
+      tree call2 = build_call_expr (fn, 1, src2);
+      call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2);
+    }
+  else
+    call = build_call_expr (fn, 1, src);
+
+  return call;
+}
+
 /* Utility function to check if OP is defined by a stmt
    that is a val - 1.  */
 
@@ -2041,45 +2083,18 @@ ssa_defined_by_minus_one_stmt_p (tree op, tree val)
 	  && integer_minus_onep (gimple_assign_rhs2 (stmt)));
 }
 
-/* See if LOOP is a popcout implementation, determine NITER for the loop
+/* See comment below for number_of_iterations_bitcount.
+   For popcount, we have:
 
-   We match:
-   <bb 2>
-   goto <bb 4>
+   modify:
+   _1 = iv_1 + -1
+   iv_2 = iv_1 & _1
 
-   <bb 3>
-   _1 = b_11 + -1
-   b_6 = _1 & b_11
-
-   <bb 4>
-   b_11 = PHI <b_5(D)(2), b_6(3)>
+   test:
+   if (iv != 0)
 
-   exit block
-   if (b_11 != 0)
-	goto <bb 3>
-   else
-	goto <bb 5>
-
-   OR we match copy-header version:
-   if (b_5 != 0)
-	goto <bb 3>
-   else
-	goto <bb 4>
-
-   <bb 3>
-   b_11 = PHI <b_5(2), b_6(3)>
-   _1 = b_11 + -1
-   b_6 = _1 & b_11
-
-   exit block
-   if (b_6 != 0)
-	goto <bb 3>
-   else
-	goto <bb 4>
-
-   If popcount pattern, update NITER accordingly.
-   i.e., set NITER to  __builtin_popcount (b)
-   return true if we did, false otherwise.
+   modification count:
+   popcount (src)
 
  */
 
@@ -2088,138 +2103,150 @@ number_of_iterations_popcount (loop_p loop, edge exit,
 			       enum tree_code code,
 			       class tree_niter_desc *niter)
 {
-  bool adjust = true;
-  tree iter;
+  bool modify_before_test = true;
   HOST_WIDE_INT max;
-  adjust = true;
-  tree fn = NULL_TREE;
-
-  /* Check loop terminating branch is like
-     if (b != 0).  */
-  gimple *stmt = last_stmt (exit->src);
-  if (!stmt
-      || gimple_code (stmt) != GIMPLE_COND
+
+  /* Check that condition for staying inside the loop is like
+     if (iv != 0).  */
+  gimple *cond_stmt = last_stmt (exit->src);
+  if (!cond_stmt
+      || gimple_code (cond_stmt) != GIMPLE_COND
       || code != NE_EXPR
-      || !integer_zerop (gimple_cond_rhs (stmt))
-      || TREE_CODE (gimple_cond_lhs (stmt)) != SSA_NAME)
+      || !integer_zerop (gimple_cond_rhs (cond_stmt))
+      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
     return false;
 
-  gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (stmt));
+  tree iv_2 = gimple_cond_lhs (cond_stmt);
+  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
 
-  /* Depending on copy-header is performed, feeding PHI stmts might be in
-     the loop header or loop latch, handle this.  */
-  if (gimple_code (and_stmt) == GIMPLE_PHI
-      && gimple_bb (and_stmt) == loop->header
-      && gimple_phi_num_args (and_stmt) == 2
-      && (TREE_CODE (gimple_phi_arg_def (and_stmt,
+  /* If the test comes before the iv modification, then these will actually be
+     iv_1 and a phi node.  */
+  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
+      && gimple_bb (iv_2_stmt) == loop->header
+      && gimple_phi_num_args (iv_2_stmt) == 2
+      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
 					 loop_latch_edge (loop)->dest_idx))
 	  == SSA_NAME))
     {
-      /* SSA used in exit condition is defined by PHI stmt
-	b_11 = PHI <b_5(D)(2), b_6(3)>
-	from the PHI stmt, get the and_stmt
-	b_6 = _1 & b_11.  */
-      tree t = gimple_phi_arg_def (and_stmt, loop_latch_edge (loop)->dest_idx);
-      and_stmt = SSA_NAME_DEF_STMT (t);
-      adjust = false;
+      /* iv_2 is actually one of the inputs to the phi.  */
+      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
+      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+      modify_before_test = false;
     }
 
-  /* Make sure it is indeed an and stmt (b_6 = _1 & b_11).  */
-  if (!is_gimple_assign (and_stmt)
-      || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR)
+  /* Make sure iv_2_stmt is an and stmt (iv_2 = _1 & iv_1).  */
+  if (!is_gimple_assign (iv_2_stmt)
+      || gimple_assign_rhs_code (iv_2_stmt) != BIT_AND_EXPR)
     return false;
 
-  tree b_11 = gimple_assign_rhs1 (and_stmt);
-  tree _1 = gimple_assign_rhs2 (and_stmt);
+  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
+  tree _1 = gimple_assign_rhs2 (iv_2_stmt);
 
-  /* Check that _1 is defined by _b11 + -1 (_1 = b_11 + -1).
-     Also make sure that b_11 is the same in and_stmt and _1 defining stmt.
+  /* Check that _1 is defined by (_1 = iv_1 + -1).
+     Also make sure that iv_1 is the same in iv_2_stmt and _1 defining stmt.
      Also canonicalize if _1 and _b11 are revrsed.  */
-  if (ssa_defined_by_minus_one_stmt_p (b_11, _1))
-    std::swap (b_11, _1);
-  else if (ssa_defined_by_minus_one_stmt_p (_1, b_11))
+  if (ssa_defined_by_minus_one_stmt_p (iv_1, _1))
+    std::swap (iv_1, _1);
+  else if (ssa_defined_by_minus_one_stmt_p (_1, iv_1))
     ;
   else
     return false;
-  /* Check the recurrence:
-   ... = PHI <b_5(2), b_6(3)>.  */
-  gimple *phi = SSA_NAME_DEF_STMT (b_11);
+
+  /* Check the recurrence.  */
+  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
   if (gimple_code (phi) != GIMPLE_PHI
       || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
-      || (gimple_assign_lhs (and_stmt)
-	  != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
+      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
     return false;
 
-  /* We found a match. Get the corresponding popcount builtin.  */
+  /* We found a match.  */
   tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
-  if (TYPE_PRECISION (TREE_TYPE (src)) <= TYPE_PRECISION (integer_type_node))
-    fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
-  else if (TYPE_PRECISION (TREE_TYPE (src))
-	   == TYPE_PRECISION (long_integer_type_node))
-    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
-  else if (TYPE_PRECISION (TREE_TYPE (src))
-	   == TYPE_PRECISION (long_long_integer_type_node)
-	   || (TYPE_PRECISION (TREE_TYPE (src))
-	       == 2 * TYPE_PRECISION (long_long_integer_type_node)))
-    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL);
+  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
 
-  if (!fn)
+  /* Get the corresponding popcount builtin.  */
+  tree expr = build_popcount_expr (src);
+
+  if (!expr)
     return false;
 
-  /* Update NITER params accordingly  */
-  tree utype = unsigned_type_for (TREE_TYPE (src));
-  src = fold_convert (utype, src);
-  if (TYPE_PRECISION (TREE_TYPE (src)) < TYPE_PRECISION (integer_type_node))
-    src = fold_convert (unsigned_type_node, src);
-  tree call;
-  if (TYPE_PRECISION (TREE_TYPE (src))
-      == 2 * TYPE_PRECISION (long_long_integer_type_node))
+  max = src_precision;
+
+  tree may_be_zero = boolean_false_node;
+
+  if (modify_before_test)
     {
-      int prec = TYPE_PRECISION (long_long_integer_type_node);
-      tree src1 = fold_convert (long_long_unsigned_type_node,
-				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
-					     unshare_expr (src),
-					     build_int_cst (integer_type_node,
-							    prec)));
-      tree src2 = fold_convert (long_long_unsigned_type_node, src);
-      call = build_call_expr (fn, 1, src1);
-      call = fold_build2 (PLUS_EXPR, TREE_TYPE (call), call,
-			  build_call_expr (fn, 1, src2));
-      call = fold_convert (utype, call);
+      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
+			  integer_one_node);
+      max = max - 1;
+      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
+				      build_zero_cst (TREE_TYPE (src)));
     }
-  else
-    call = fold_convert (utype, build_call_expr (fn, 1, src));
-  if (adjust)
-    iter = fold_build2 (MINUS_EXPR, utype, call, build_int_cst (utype, 1));
-  else
-    iter = call;
 
-  if (TREE_CODE (call) == INTEGER_CST)
-    max = tree_to_uhwi (call);
-  else
-    max = TYPE_PRECISION (TREE_TYPE (src));
-  if (adjust)
-    max = max - 1;
+  expr = fold_convert (unsigned_type_node, expr);
 
-  niter->niter = iter;
   niter->assumptions = boolean_true_node;
+  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
+  niter->niter = simplify_using_initial_conditions(loop, expr);
 
-  if (adjust)
-    {
-      tree may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
-				      build_zero_cst (TREE_TYPE (src)));
-      niter->may_be_zero
-	= simplify_using_initial_conditions (loop, may_be_zero);
-    }
+  if (TREE_CODE (niter->niter) == INTEGER_CST)
+    niter->max = tree_to_uhwi (niter->niter);
   else
-    niter->may_be_zero = boolean_false_node;
+    niter->max = max;
 
-  niter->max = max;
   niter->bound = NULL_TREE;
   niter->cmp = ERROR_MARK;
   return true;
 }
 
+/* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
+   1. A modification to the induction variable.
+   2. A test to determine whether or not to exit the loop.
+
+   These can come in either order - i.e.:
+
+   <bb 3>
+   iv_1 = PHI <src(2), iv_2(4)>
+   if (test (iv_1))
+     goto <bb 4>
+   else
+     goto <bb 5>
+
+   <bb 4>
+   iv_2 = modify (iv_1)
+   goto <bb 3>
+
+   OR
+
+   <bb 3>
+   iv_1 = PHI <src(2), iv_2(4)>
+   iv_2 = modify (iv_1)
+
+   <bb 4>
+   if (test (iv_2))
+     goto <bb 3>
+   else
+     goto <bb 5>
+
+   The second form can be generated by copying the loop header out of the loop.
+
+   In the first case, the number of latch executions will be equal to the
+   number of induction variable modifications required before the test fails.
+
+   In the second case (modify_before_test), if we assume that the number of
+   modifications required before the test fails is nonzero, then the number of
+   latch executions will be one less than this number.
+
+   If we recognise the pattern, then we update niter accordingly, and return
+   true.  */
+
+static bool
+number_of_iterations_bitcount (loop_p loop, edge exit,
+			       enum tree_code code,
+			       class tree_niter_desc *niter)
+{
+  return number_of_iterations_popcount (loop, exit, code, niter);
+}
+
 /* Substitute NEW_TREE for OLD in EXPR and fold the result.
    If VALUEIZE is non-NULL then OLD and NEW_TREE are ignored and instead
    all SSA names are replaced with the result of calling the VALUEIZE
@@ -2758,7 +2785,7 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
   tree iv0_niters = NULL_TREE;
   if (!simple_iv_with_niters (loop, loop_containing_stmt (stmt),
 			      op0, &iv0, safe ? &iv0_niters : NULL, false))
-    return number_of_iterations_popcount (loop, exit, code, niter);
+    return number_of_iterations_bitcount (loop, exit, code, niter);
   tree iv1_niters = NULL_TREE;
   if (!simple_iv_with_niters (loop, loop_containing_stmt (stmt),
 			      op1, &iv1, safe ? &iv1_niters : NULL, false))

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 4/8] Modify test, to prevent the next patch breaking it
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
                   ` (2 preceding siblings ...)
  2022-11-11 13:52 ` [PATCH 3/8] middle-end: Refactor number_of_iterations_popcount Andrew Carlotti
@ 2022-11-11 18:43 ` Andrew Carlotti
  2022-11-14 10:18   ` Richard Biener
  2022-11-11 18:50 ` [PATCH 5/8] middle-end: Add cltz_complement idiom recognition Andrew Carlotti
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 18:43 UTC (permalink / raw)
  To: gcc-patches

The upcoming c[lt]z idiom recognition patch eliminates the need for a
brute force computation of the iteration count of these loops. The test
is intended to verify that ivcanon can determine the loop count when the
condition is given by a chain of constant computations.

We replace the constant operations with a more complicated chain that should
resist future idiom recognition.

gcc/testsuite/ChangeLog:

	* gcc.dg/pr77975.c: Make tests more robust.


--


diff --git a/gcc/testsuite/gcc.dg/pr77975.c b/gcc/testsuite/gcc.dg/pr77975.c
index 148cebdded964da7fce148abdf2a430c55650513..a187ce2b50c2821841e71b5b6cb243a37a66fb57 100644
--- a/gcc/testsuite/gcc.dg/pr77975.c
+++ b/gcc/testsuite/gcc.dg/pr77975.c
@@ -7,10 +7,11 @@
 unsigned int
 foo (unsigned int *b)
 {
-  unsigned int a = 3;
+  unsigned int a = 8;
   while (a)
     {
-      a >>= 1;
+      a += 5;
+      a &= 44;
       *b += a;
     }
   return a; 
@@ -21,10 +22,11 @@ foo (unsigned int *b)
 unsigned int
 bar (unsigned int *b)
 {
-  unsigned int a = 7;
+  unsigned int a = 3;
   while (a)
     {
-      a >>= 1;
+      a += 5;
+      a &= 44;
       *b += a;
     }
   return a; 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 5/8] middle-end: Add cltz_complement idiom recognition
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
                   ` (3 preceding siblings ...)
  2022-11-11 18:43 ` [PATCH 4/8] Modify test, to prevent the next patch breaking it Andrew Carlotti
@ 2022-11-11 18:50 ` Andrew Carlotti
  2022-11-14 15:10   ` Richard Biener
  2022-11-11 18:54 ` [PATCH 6/8] docs: Add popcount, clz and ctz target attributes Andrew Carlotti
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 18:50 UTC (permalink / raw)
  To: gcc-patches

This recognises patterns of the form:
while (n) { n >>= 1 }

This patch results in improved (but still suboptimal) codegen:

foo (unsigned int b) {
    int c = 0;

    while (b) {
        b >>= 1;
        c++;
    }

    return c;
}

foo:
.LFB11:
        .cfi_startproc
        cbz     w0, .L3
        clz     w1, w0
        tst     x0, 1
        mov     w0, 32
        sub     w0, w0, w1
        csel    w0, w0, wzr, ne
        ret

The conditional is unnecessary. phiopt could recognise a redundant csel
(using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
clz call, but it cannot recognise the redundancy when the input is (e.g.)
(32 - clz).

I could perhaps extend this function to recognise this pattern in a later
patch, if this is a good place to recognise more patterns.

gcc/ChangeLog:

	* tree-scalar-evolution.cc (expression_expensive_p): Add checks
	for c[lt]z optabs.
	* tree-ssa-loop-niter.cc (build_cltz_expr): New.
	(number_of_iterations_cltz_complement): New.
	(number_of_iterations_bitcount): Add call to the above.

gcc/testsuite/ChangeLog:

	* lib/target-supports.exp (check_effective_target_clz)
	(check_effective_target_clzl, check_effective_target_clzll)
	(check_effective_target_ctz, check_effective_target_ctzl)
	(check_effective_target_ctzll): New.
	* gcc.dg/tree-ssa/cltz-complement-max.c: New test.
	* gcc.dg/tree-ssa/clz-complement-char.c: New test.
	* gcc.dg/tree-ssa/clz-complement-int.c: New test.
	* gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
	* gcc.dg/tree-ssa/clz-complement-long.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-char.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-int.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-long.c: New test.


--


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
new file mode 100644
index 0000000000000000000000000000000000000000..1a29ca52e42e50822e4e3213b2cb008b766d0318
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int clz_complement_count1 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+    if (c <= PREC)
+      return 0;
+    else
+      return 34567;
+}
+
+int clz_complement_count2 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+    if (c <= PREC - 1)
+      return 0;
+    else
+      return 76543;
+}
+
+int ctz_complement_count1 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+    if (c <= PREC)
+      return 0;
+    else
+      return 23456;
+}
+
+int ctz_complement_count2 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+    if (c <= PREC - 1)
+      return 0;
+    else
+      return 65432;
+}
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(255) != 8)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
new file mode 100644
index 0000000000000000000000000000000000000000..f2c5c23f6a7d84ecb637c6961698b0fc30d7426b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(1 << (PREC - 1)) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..7f7793f0efac1f0d793e6e99b84988e5cc5221c9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzll } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(1LL << (PREC - 1)) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..97161bb7a74260bea20e325ebab64acb33a2b696
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzl } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(1L << (PREC - 1)) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
new file mode 100644
index 0000000000000000000000000000000000000000..b9afe8852d8ffbc7ee9a0760cf04b8f98af293a2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
new file mode 100644
index 0000000000000000000000000000000000000000..d2702a65daf34db66550d2255395db68a29a4797
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..1ea0d5d7d9f8be1824c4177c33edd91e66b4ddab
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzll } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..80fb02dcfa68bc022ae69b26fb189323e01fc6fc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzl } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index c7f583d6d1498401a7c106ed3f539dcd04f95451..325f12d62324793d6b2cf55b074ef6cc9cf4dd4d 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -8687,6 +8687,72 @@ proc check_effective_target_popcount { } {
     } "" ]
 }
 
+# Return 1 if the target supports clz on int.
+
+proc check_effective_target_clz { } {
+    return [check_no_messages_and_pattern clz "!\\(call" rtl-expand {
+        int foo (int b)
+          {
+            return __builtin_clz (b);
+          }
+    } "" ]
+}
+
+# Return 1 if the target supports clz on long.
+
+proc check_effective_target_clzl { } {
+    return [check_no_messages_and_pattern clzl "!\\(call" rtl-expand {
+	int foo (long b)
+	  {
+	    return __builtin_clzl (b);
+	  }
+    } "" ]
+}
+
+# Return 1 if the target supports clz on long long.
+
+proc check_effective_target_clzll { } {
+    return [check_no_messages_and_pattern clzll "!\\(call" rtl-expand {
+        int foo (long long b)
+          {
+            return __builtin_clzll (b);
+          }
+    } "" ]
+}
+
+# Return 1 if the target supports ctz on int.
+
+proc check_effective_target_ctz { } {
+    return [check_no_messages_and_pattern ctz "!\\(call" rtl-expand {
+        int foo (int b)
+          {
+            return __builtin_ctz (b);
+          }
+    } "" ]
+}
+
+# Return 1 if the target supports ctz on long.
+
+proc check_effective_target_ctzl { } {
+    return [check_no_messages_and_pattern ctzl "!\\(call" rtl-expand {
+	int foo (long b)
+	  {
+	    return __builtin_ctzl (b);
+	  }
+    } "" ]
+}
+
+# Return 1 if the target supports ctz on long long.
+
+proc check_effective_target_ctzll { } {
+    return [check_no_messages_and_pattern ctzll "!\\(call" rtl-expand {
+        int foo (long long b)
+          {
+            return __builtin_ctzll (b);
+          }
+    } "" ]
+}
+
 # Return 1 if the target supports atomic operations on "long long"
 # and can execute them.
 #
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
 	 library call for popcount when backend does not have an instruction
 	 to do so.  We consider this to be expensive and generate
 	 __builtin_popcount only when backend defines it.  */
+      optab optab;
       combined_fn cfn = get_call_combined_fn (expr);
       switch (cfn)
 	{
 	CASE_CFN_POPCOUNT:
+	  optab = popcount_optab;
+	  goto bitcount_call;
+	CASE_CFN_CLZ:
+	  optab = clz_optab;
+	  goto bitcount_call;
+	CASE_CFN_CTZ:
+	  optab = ctz_optab;
+bitcount_call:
 	  /* Check if opcode for popcount is available in the mode required.  */
-	  if (optab_handler (popcount_optab,
+	  if (optab_handler (optab,
 			     TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
 	      == CODE_FOR_nothing)
 	    {
@@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
 		 instructions.  */
 	      if (is_a <scalar_int_mode> (mode, &int_mode)
 		  && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
-		  && (optab_handler (popcount_optab, word_mode)
+		  && (optab_handler (optab, word_mode)
 		      != CODE_FOR_nothing))
 		  break;
 	      return true;
diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
   return true;
 }
 
+/* Return an expression that counts the leading/trailing zeroes of src.  */
+
+static tree
+build_cltz_expr (tree src, bool leading, bool defined_at_zero)
+{
+  tree fn;
+  int prec = TYPE_PRECISION (TREE_TYPE (src));
+  int i_prec = TYPE_PRECISION (integer_type_node);
+  int li_prec = TYPE_PRECISION (long_integer_type_node);
+  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
+  if (prec <= i_prec)
+    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
+		 : builtin_decl_implicit (BUILT_IN_CTZ);
+  else if (prec == li_prec)
+    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
+		 : builtin_decl_implicit (BUILT_IN_CTZL);
+  else if (prec == lli_prec || prec == 2 * lli_prec)
+    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
+		 : builtin_decl_implicit (BUILT_IN_CTZLL);
+  else
+    return NULL_TREE;
+
+  tree utype = unsigned_type_for (TREE_TYPE (src));
+  src = fold_convert (utype, src);
+  if (prec < i_prec)
+    src = fold_convert (unsigned_type_node, src);
+
+  tree call;
+  if (prec == 2 * lli_prec)
+    {
+      tree src1 = fold_convert (long_long_unsigned_type_node,
+				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
+					     unshare_expr (src),
+					     build_int_cst (integer_type_node,
+							    lli_prec)));
+      tree src2 = fold_convert (long_long_unsigned_type_node, src);
+      /* We count the zeroes in src1, and add the number in src2 when src1
+	 is 0.  */
+      if (!leading)
+	std::swap(src1, src2);
+      tree call1 = build_call_expr (fn, 1, src1);
+      tree call2 = build_call_expr (fn, 1, src2);
+      if (defined_at_zero)
+	{
+	  tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
+				       build_zero_cst (TREE_TYPE (src2)));
+	  call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
+			      build_int_cst (integer_type_node, lli_prec));
+	}
+      tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
+				   build_zero_cst (TREE_TYPE (src1)));
+      call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
+			 fold_build2 (PLUS_EXPR, integer_type_node, call2,
+				      build_int_cst (integer_type_node,
+						     lli_prec)));
+    }
+  else
+    {
+      call = build_call_expr (fn, 1, src);
+      if (defined_at_zero)
+	{
+	  tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
+				      build_zero_cst (TREE_TYPE (src)));
+	  call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
+			     build_int_cst (integer_type_node, prec));
+	}
+    }
+
+  if (leading && prec < i_prec)
+    call = fold_build2(MINUS_EXPR, integer_type_node, call,
+		       build_int_cst (integer_type_node,
+				      i_prec - prec));
+
+  return call;
+}
+
+/* See comment below for number_of_iterations_bitcount.
+   For c[lt]z complement, we have:
+
+   modify:
+   iv_2 = iv_1 >> 1 OR iv_1 << 1
+
+   test:
+   if (iv != 0)
+
+   modification count:
+   src precision - c[lt]z (src)
+
+ */
+
+static bool
+number_of_iterations_cltz_complement (loop_p loop, edge exit,
+			       enum tree_code code,
+			       class tree_niter_desc *niter)
+{
+  bool modify_before_test = true;
+  HOST_WIDE_INT max;
+
+  /* Check that condition for staying inside the loop is like
+     if (iv != 0).  */
+  gimple *cond_stmt = last_stmt (exit->src);
+  if (!cond_stmt
+      || gimple_code (cond_stmt) != GIMPLE_COND
+      || code != NE_EXPR
+      || !integer_zerop (gimple_cond_rhs (cond_stmt))
+      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
+    return false;
+
+  tree iv_2 = gimple_cond_lhs (cond_stmt);
+  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+
+  /* If the test comes before the iv modification, then these will actually be
+     iv_1 and a phi node.  */
+  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
+      && gimple_bb (iv_2_stmt) == loop->header
+      && gimple_phi_num_args (iv_2_stmt) == 2
+      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
+					 loop_latch_edge (loop)->dest_idx))
+	  == SSA_NAME))
+    {
+      /* iv_2 is actually one of the inputs to the phi.  */
+      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
+      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+      modify_before_test = false;
+    }
+
+  /* Make sure iv_2_stmt is a logical shift by one stmt:
+     iv_2 = iv_1 {>>|<<} 1  */
+  if (!is_gimple_assign (iv_2_stmt)
+      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
+	  && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
+	      || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
+      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
+    return false;
+
+  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
+
+  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
+
+  /* Check the recurrence.  */
+  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
+  if (gimple_code (phi) != GIMPLE_PHI
+      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
+      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
+    return false;
+
+  /* We found a match.  */
+  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
+  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
+
+  /* Get the corresponding c[lt]z builtin.  */
+  tree expr = build_cltz_expr (src, !left_shift, true);
+
+  if (!expr)
+    return false;
+
+  expr = fold_build2 (MINUS_EXPR, integer_type_node,
+		      build_int_cst (integer_type_node, src_precision),
+		      expr);
+
+  max = src_precision;
+
+  tree may_be_zero = boolean_false_node;
+
+  if (modify_before_test)
+    {
+      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
+			  integer_one_node);
+      max = max - 1;
+      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
+				      build_zero_cst (TREE_TYPE (src)));
+    }
+
+  expr = fold_convert (unsigned_type_node, expr);
+
+  niter->assumptions = boolean_true_node;
+  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
+  niter->niter = simplify_using_initial_conditions (loop, expr);
+
+  if (TREE_CODE (niter->niter) == INTEGER_CST)
+    niter->max = tree_to_uhwi (niter->niter);
+  else
+    niter->max = max;
+
+  niter->bound = NULL_TREE;
+  niter->cmp = ERROR_MARK;
+  return true;
+}
+
 /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
    1. A modification to the induction variabler;.
    2. A test to determine whether or not to exit the loop.
@@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
 			       enum tree_code code,
 			       class tree_niter_desc *niter)
 {
-  return number_of_iterations_popcount (loop, exit, code, niter);
+  return (number_of_iterations_popcount (loop, exit, code, niter)
+	  || number_of_iterations_cltz_complement (loop, exit, code, niter));
 }
 
 /* Substitute NEW_TREE for OLD in EXPR and fold the result.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 6/8] docs: Add popcount, clz and ctz target attributes
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
                   ` (4 preceding siblings ...)
  2022-11-11 18:50 ` [PATCH 5/8] middle-end: Add cltz_complement idiom recognition Andrew Carlotti
@ 2022-11-11 18:54 ` Andrew Carlotti
  2022-11-14 14:52   ` Jeff Law
  2022-11-11 19:01 ` [PATCH 7/8] middle-end: Add c[lt]z idiom recognition Andrew Carlotti
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 18:54 UTC (permalink / raw)
  To: gcc-patches

gcc/ChangeLog:

	* doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst:
	Add missing target attributes.


--


diff --git a/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst b/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst
index 709e4ea2b903cfad4faed40899020b29bc9b5811..8410c40d38fceb83ea8c6ba3bbf0fba5db7929e5 100644
--- a/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst
+++ b/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst
@@ -1075,6 +1075,24 @@ Other hardware attributes
 ``cell_hw``
   Test system can execute AltiVec and Cell PPU instructions.
 
+``clz``
+  Target supports a clz optab on int.
+
+``clzl``
+  Target supports a clz optab on long.
+
+``clzll``
+  Target supports a clz optab on long long.
+
+``ctz``
+  Target supports a ctz optab on int.
+
+``ctzl``
+  Target supports a ctz optab on long.
+
+``ctzll``
+  Target supports a ctz optab on long long.
+
 ``cmpccxadd``
   Target supports the execution of ``cmpccxadd`` instructions.
 
@@ -1096,6 +1114,15 @@ Other hardware attributes
 ``pie_copyreloc``
   The x86-64 target linker supports PIE with copy reloc.
 
+``popcount``
+  Target supports a popcount optab on int.
+
+``popcountl``
+  Target supports a popcount optab on long.
+
+``popcountll``
+  Target supports a popcount optab on long long.
+
 ``prefetchi``
   Target supports the execution of ``prefetchi`` instructions.
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 7/8] middle-end: Add c[lt]z idiom recognition
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
                   ` (5 preceding siblings ...)
  2022-11-11 18:54 ` [PATCH 6/8] docs: Add popcount, clz and ctz target attributes Andrew Carlotti
@ 2022-11-11 19:01 ` Andrew Carlotti
  2022-11-14 15:22   ` Richard Biener
  2022-11-11 19:07 ` [PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max Andrew Carlotti
  2022-12-22 17:43 ` [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN Andrew Carlotti
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 19:01 UTC (permalink / raw)
  To: gcc-patches

This recognises the patterns of the form:
  while (n & 1) { n >>= 1 }

Unfortunately there are currently two issues relating to this patch.

Firstly, simplify_using_initial_conditions does not recognise that
	(n != 0) and ((n & 1) == 0) implies that ((n >> 1) != 0).

These preconditions arise following the loop copy-header pass, and the
assumptions returned by number_of_iterations_exit_assumptions then
prevent final value replacement from using the niter result.

I'm not sure what is the best way to fix this - one approach could be to
modify simplify_using_initial_conditions to handle this sort of case,
but it seems that it basically wants the information that ranger could
give anyway, so would something like that be a better option?

The second issue arises in the vectoriser, which is able to determine
that the niter->assumptions are always true.
When building with -march=armv8.4-a+sve -S -O3, we get this codegen:

foo (unsigned int b) {
    int c = 0;

    if (b == 0)
      return PREC;

    while (!(b & (1 << (PREC - 1)))) {
        b <<= 1;
        c++;
    }

    return c;
}

foo:
.LFB0:
        .cfi_startproc
        cmp     w0, 0
        cbz     w0, .L6
        blt     .L7
        lsl     w1, w0, 1
        clz     w2, w1
        cmp     w2, 14
        bls     .L8
        mov     x0, 0
        cntw    x3
        add     w1, w2, 1
        index   z1.s, #0, #1
        whilelo p0.s, wzr, w1
.L4:
        add     x0, x0, x3
        mov     p1.b, p0.b
        mov     z0.d, z1.d
        whilelo p0.s, w0, w1
        incw    z1.s
        b.any   .L4
        add     z0.s, z0.s, #1
        lastb   w0, p1, z0.s
        ret
        .p2align 2,,3
.L8:
        mov     w0, 0
        b       .L3
        .p2align 2,,3
.L13:
        lsl     w1, w1, 1
.L3:
        add     w0, w0, 1
        tbz     w1, #31, .L13
        ret
        .p2align 2,,3
.L6:
        mov     w0, 32
        ret
        .p2align 2,,3
.L7:
        mov     w0, 0
        ret
        .cfi_endproc

In essence, the vectoriser uses the niter information to determine
exactly how many iterations of the loop it needs to run. It then uses
SVE whilelo instructions to run this number of iterations. The original
loop counter is also vectorised, despite only being used in the final
iteration, and then the final value of this counter is used as the
return value (which is the same as the number of iterations it computed
in the first place).

This vectorisation is obviously bad, and I think it exposes a latent
bug in the vectoriser, rather than being an issue caused by this
specific patch.

gcc/ChangeLog:

	* tree-ssa-loop-niter.cc (number_of_iterations_cltz): New.
	(number_of_iterations_bitcount): Add call to the above.
	(number_of_iterations_exit_assumptions): Add EQ_EXPR case for
	c[lt]z idiom recognition.

gcc/testsuite/ChangeLog:

	* gcc.dg/tree-ssa/cltz-max.c: New test.
	* gcc.dg/tree-ssa/clz-char.c: New test.
	* gcc.dg/tree-ssa/clz-int.c: New test.
	* gcc.dg/tree-ssa/clz-long-long.c: New test.
	* gcc.dg/tree-ssa/clz-long.c: New test.
	* gcc.dg/tree-ssa/ctz-char.c: New test.
	* gcc.dg/tree-ssa/ctz-int.c: New test.
	* gcc.dg/tree-ssa/ctz-long-long.c: New test.
	* gcc.dg/tree-ssa/ctz-long.c: New test.


--


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c
new file mode 100644
index 0000000000000000000000000000000000000000..a6bea3d338940efee2e7e1c95a5941525945af9e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int clz_count1 (unsigned char b) {
+    int c = 0;
+
+    if (b == 0)
+      return 0;
+
+    while (!(b & (1 << (PREC - 1)))) {
+	b <<= 1;
+	c++;
+    }
+    if (c <= PREC - 1)
+      return 0;
+    else
+      return 34567;
+}
+
+int clz_count2 (unsigned char b) {
+    int c = 0;
+
+    if (b == 0)
+      return 0;
+
+    while (!(b & (1 << PREC - 1))) {
+	b <<= 1;
+	c++;
+    }
+    if (c <= PREC - 2)
+      return 0;
+    else
+      return 76543;
+}
+
+int ctz_count1 (unsigned char b) {
+    int c = 0;
+
+    if (b == 0)
+      return 0;
+
+    while (!(b & 1)) {
+	b >>= 1;
+	c++;
+    }
+    if (c <= PREC - 1)
+      return 0;
+    else
+      return 23456;
+}
+
+int ctz_count2 (unsigned char b) {
+    int c = 0;
+
+    if (b == 0)
+      return 0;
+
+    while (!(b & 1)) {
+	b >>= 1;
+	c++;
+    }
+    if (c <= PREC - 2)
+      return 0;
+    else
+      return 65432;
+}
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c
new file mode 100644
index 0000000000000000000000000000000000000000..4a122db95bbb576b4ade706bd3b1ca809d2f1e3b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzl } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+
+    while (!(b & (1 << (PREC - 1)))) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(1 << (PREC - 1)) != 0)
+    __builtin_abort ();
+  if (foo(35) != PREC - 6)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-int.c
new file mode 100644
index 0000000000000000000000000000000000000000..96646f8e19cd5b2342acb88949b3ef6e3e2abd5a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-int.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+/* PREC is the bit width of the type passed to foo.  */
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+    /* Use an unsigned constant: 1 << (PREC - 1) would overflow int.  */
+    while (!(b & (1U << (PREC - 1)))) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(1U << (PREC - 1)) != 0)
+    __builtin_abort ();
+  if (foo(35) != PREC - 6)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-long-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..80d3edc1dab2e74fc3271ba9d97640839b3a3786
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-long-long.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzll } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+    /* Use an unsigned constant: 1LL << (PREC - 1) would overflow.  */
+    while (!(b & (1ULL << (PREC - 1)))) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(1ULL << (PREC - 1)) != 0)
+    __builtin_abort ();
+  if (foo(35) != PREC - 6)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..1c8037f93b9c9d42f580a172267c65723a46ef8b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-long.c
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzl } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+    /* Use an unsigned constant: 1L << (PREC - 1) would overflow long.  */
+    while (!(b & (1UL << (PREC - 1)))) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(1UL << (PREC - 1)) != 0)
+    __builtin_abort ();
+  if (foo(35) != PREC - 6)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-char.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-char.c
new file mode 100644
index 0000000000000000000000000000000000000000..3cd166acbd4670e175d79a2403de2d5a4fd38665
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-char.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+
+    while (!(b & 1)) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(128) != 7)
+    __builtin_abort ();
+  if (foo(96) != 5)
+    __builtin_abort ();
+  if (foo(35) != 0)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-int.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-int.c
new file mode 100644
index 0000000000000000000000000000000000000000..7f63493eb7389a18516f8f126c3c55dc80f0bde6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-int.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+
+    while (!(b & 1)) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(1U << (PREC - 1)) != PREC - 1)
+    __builtin_abort ();
+  if (foo(96) != 5)
+    __builtin_abort ();
+  if (foo(35) != 0)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..924f61b76f01c77a40b9fff64af3b629ab1418c0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long-long.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzll } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+
+    while (!(b & 1)) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(1ULL << (PREC - 1)) != PREC - 1)
+    __builtin_abort ();
+  if (foo(96) != 5)
+    __builtin_abort ();
+  if (foo(35) != 0)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..178945daa8a2697989f1a1a0804ce33d768dcc55
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzl } */
+/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+    int c = 0;
+
+    if (b == 0)
+      return PREC;
+
+    while (!(b & 1)) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != PREC)
+    __builtin_abort ();
+  if (foo(1UL << (PREC - 1)) != PREC - 1)
+    __builtin_abort ();
+  if (foo(96) != 5)
+    __builtin_abort ();
+  if (foo(35) != 0)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 16e8e25919d808cea27adbd72f0be01ae2f0e547..87e6fe81d68fc3352e450688ef79e6fc68854d8a 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -2274,6 +2274,167 @@ build_cltz_expr (tree src, bool leading, bool defined_at_zero)
   return call;
 }
 
+/* See comment below for number_of_iterations_bitcount.
+   For c[lt]z, we have:
+
+   modify:
+   iv_2 = iv_1 << 1 OR iv_1 >> 1
+
+   test:
+   if ((iv & 1 << (prec-1)) == 0) OR ((iv & 1) == 0)
+
+   modification count:
+   src precision - c[lt]z (src)
+
+ */
+
+static bool
+number_of_iterations_cltz (loop_p loop, edge exit,
+			       enum tree_code code,
+			       class tree_niter_desc *niter)
+{
+  bool modify_before_test = true;
+  HOST_WIDE_INT max;
+  int checked_bit;
+  tree iv_2;
+
+  /* Check that the condition for staying inside the loop compares an SSA
+     name against zero, i.e. is like if ((iv & bit) == 0) or if (iv >= 0).  */
+  gimple *cond_stmt = last_stmt (exit->src);
+  if (!cond_stmt
+      || gimple_code (cond_stmt) != GIMPLE_COND
+      || (code != EQ_EXPR && code != GE_EXPR)
+      || !integer_zerop (gimple_cond_rhs (cond_stmt))
+      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
+    return false;
+
+  if (code == EQ_EXPR)
+    {
+      /* Make sure we test a bitwise AND with a power-of-two constant.  */
+      gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (cond_stmt));
+      if (!is_gimple_assign (and_stmt)
+	  || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR
+	  || !integer_pow2p (gimple_assign_rhs2 (and_stmt)))
+	return false;
+
+      checked_bit = tree_log2 (gimple_assign_rhs2 (and_stmt));
+
+      iv_2 = gimple_assign_rhs1 (and_stmt);
+    }
+  else
+    {
+      /* We have a GE_EXPR - a signed comparison with zero is equivalent to
+	 testing the leading bit, so check for this pattern too.  */
+
+      iv_2 = gimple_cond_lhs (cond_stmt);
+      tree test_value_type = TREE_TYPE (iv_2);
+
+      if (TYPE_UNSIGNED (test_value_type))
+	return false;
+
+      gimple *test_value_stmt = SSA_NAME_DEF_STMT (iv_2);
+
+      if (is_gimple_assign (test_value_stmt)
+	  && gimple_assign_rhs_code (test_value_stmt) == NOP_EXPR)
+	{
+	  /* If the test value comes from a NOP_EXPR, then we need to unwrap
+	     this.  We conservatively require that both types have the same
+	     precision.  */
+	  iv_2 = gimple_assign_rhs1 (test_value_stmt);
+	  tree rhs_type = TREE_TYPE (iv_2);
+	  if (TREE_CODE (rhs_type) != INTEGER_TYPE
+	      || (TYPE_PRECISION (rhs_type)
+		  != TYPE_PRECISION (test_value_type)))
+	    return false;
+	}
+
+      checked_bit = TYPE_PRECISION (test_value_type) - 1;
+    }
+
+  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+
+  /* If the test comes before the iv modification, then these will actually be
+     iv_1 and a phi node.  */
+  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
+      && gimple_bb (iv_2_stmt) == loop->header
+      && gimple_phi_num_args (iv_2_stmt) == 2
+      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
+					 loop_latch_edge (loop)->dest_idx))
+	  == SSA_NAME))
+    {
+      /* iv_2 is actually one of the inputs to the phi.  */
+      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
+      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+      modify_before_test = false;
+    }
+
+  /* Make sure iv_2_stmt is a logical shift by one stmt (a right shift is
+     only accepted on an unsigned type): iv_2 = iv_1 {<<|>>} 1  */
+  if (!is_gimple_assign (iv_2_stmt)
+      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
+	  && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
+	      || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
+      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
+    return false;
+
+  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
+
+  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
+
+  /* Check the recurrence: iv_1 = PHI <src (preheader), iv_2 (latch)>.  */
+  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
+  if (gimple_code (phi) != GIMPLE_PHI
+      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
+      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
+    return false;
+
+  /* We found a match.  */
+  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
+  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
+
+  /* Shift out the bits of src that the loop never gets to test.  */
+  int num_ignored_bits;
+  if (left_shift)
+    num_ignored_bits = src_precision - checked_bit - 1;
+  else
+    num_ignored_bits = checked_bit;
+
+  if (modify_before_test)
+    num_ignored_bits++;
+
+  if (num_ignored_bits != 0)
+    src = fold_build2 (left_shift ? LSHIFT_EXPR : RSHIFT_EXPR,
+		       TREE_TYPE (src), src,
+		       build_int_cst (integer_type_node, num_ignored_bits));
+
+  /* Get the corresponding c[lt]z builtin.  */
+  tree expr = build_cltz_expr (src, left_shift, false);
+
+  if (!expr)
+    return false;
+
+  max = src_precision - num_ignored_bits - 1;
+
+  expr = fold_convert (unsigned_type_node, expr);
+
+  tree assumptions = fold_build2 (NE_EXPR, boolean_type_node, src,
+				  build_zero_cst (TREE_TYPE (src)));
+
+  niter->assumptions = simplify_using_initial_conditions (loop, assumptions);
+  niter->may_be_zero = boolean_false_node;
+  niter->niter = simplify_using_initial_conditions (loop, expr);
+
+  if (TREE_CODE (niter->niter) == INTEGER_CST)
+    niter->max = tree_to_uhwi (niter->niter);
+  else
+    niter->max = max;
+
+  niter->bound = NULL_TREE;
+  niter->cmp = ERROR_MARK;
+
+  return true;
+}
+
 /* See comment below for number_of_iterations_bitcount.
    For c[lt]z complement, we have:
 
@@ -2434,6 +2595,7 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
 			       class tree_niter_desc *niter)
 {
   return (number_of_iterations_popcount (loop, exit, code, niter)
+	  || number_of_iterations_cltz (loop, exit, code, niter)
 	  || number_of_iterations_cltz_complement (loop, exit, code, niter));
 }
 
@@ -2960,6 +3122,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
     case NE_EXPR:
       break;
 
+    case EQ_EXPR:
+      return number_of_iterations_cltz (loop, exit, code, niter);
+
     default:
       return false;
     }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
                   ` (6 preceding siblings ...)
  2022-11-11 19:01 ` [PATCH 7/8] middle-end: Add c[lt]z idiom recognition Andrew Carlotti
@ 2022-11-11 19:07 ` Andrew Carlotti
  2022-11-14 14:51   ` Jeff Law
  2022-12-22 17:43 ` [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN Andrew Carlotti
  8 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-11 19:07 UTC (permalink / raw)
  To: gcc-patches

This requirement is enforced by a gcc_checking_assert in
record_estimate.

gcc/ChangeLog:

	* tree-ssa-loop.h (tree_niter_desc): Update comment.


--


diff --git a/gcc/tree-ssa-loop.h b/gcc/tree-ssa-loop.h
index 6c70f795d171f22b3ed75873fec4920fea75255b..c24215be8822c31a05eaedcf4d3a26db0feab6cf 100644
--- a/gcc/tree-ssa-loop.h
+++ b/gcc/tree-ssa-loop.h
@@ -52,7 +52,8 @@ public:
 			   may_be_zero == false), more precisely the number
 			   of executions of the latch of the loop.  */
   widest_int max;	/* The upper bound on the number of iterations of
-			   the loop.  */
+			   the loop.  If niter is constant, then these values
+			   must agree.  */
 
   /* The simplified shape of the exit condition.  This information is used by
      loop unrolling.  If CMP is ERROR_MARK, then the loop cannot be unrolled.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] Modify test, to prevent the next patch breaking it
  2022-11-11 18:43 ` [PATCH 4/8] Modify test, to prevent the next patch breaking it Andrew Carlotti
@ 2022-11-14 10:18   ` Richard Biener
  0 siblings, 0 replies; 28+ messages in thread
From: Richard Biener @ 2022-11-14 10:18 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: gcc-patches

On Fri, Nov 11, 2022 at 7:48 PM Andrew Carlotti via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The upcoming c[lt]z idiom recognition patch eliminates the need for a
> brute force computation of the iteration count of these loops. The test
> is intended to verify that ivcanon can determine the loop count when the
> condition is given by a chain of constant computations.
>
> We replace the constant operations with a more complicated chain that should
> resist future idiom recognition.

OK.

> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/pr77975.c: Make tests more robust.
>
>
> --
>
>
> diff --git a/gcc/testsuite/gcc.dg/pr77975.c b/gcc/testsuite/gcc.dg/pr77975.c
> index 148cebdded964da7fce148abdf2a430c55650513..a187ce2b50c2821841e71b5b6cb243a37a66fb57 100644
> --- a/gcc/testsuite/gcc.dg/pr77975.c
> +++ b/gcc/testsuite/gcc.dg/pr77975.c
> @@ -7,10 +7,11 @@
>  unsigned int
>  foo (unsigned int *b)
>  {
> -  unsigned int a = 3;
> +  unsigned int a = 8;
>    while (a)
>      {
> -      a >>= 1;
> +      a += 5;
> +      a &= 44;
>        *b += a;
>      }
>    return a;
> @@ -21,10 +22,11 @@ foo (unsigned int *b)
>  unsigned int
>  bar (unsigned int *b)
>  {
> -  unsigned int a = 7;
> +  unsigned int a = 3;
>    while (a)
>      {
> -      a >>= 1;
> +      a += 5;
> +      a &= 44;
>        *b += a;
>      }
>    return a;

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit
  2022-11-11 13:39 ` [PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit Andrew Carlotti
@ 2022-11-14 14:23   ` Jeff Law
  0 siblings, 0 replies; 28+ messages in thread
From: Jeff Law @ 2022-11-14 14:23 UTC (permalink / raw)
  To: Andrew Carlotti, gcc-patches


On 11/11/22 06:39, Andrew Carlotti via Gcc-patches wrote:
> This prevents a null dereference error when outputting debug information
> following an early exit from number_of_iterations_exit_assumptions.
>
> gcc/ChangeLog:
>
> 	* tree-ssa-loop-niter.cc (number_of_iterations_exit_assumptions):
> 	Move at_stmt assignment.

OK

jeff



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount
  2022-11-11 13:46 ` [PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount Andrew Carlotti
@ 2022-11-14 14:24   ` Jeff Law
  0 siblings, 0 replies; 28+ messages in thread
From: Jeff Law @ 2022-11-14 14:24 UTC (permalink / raw)
  To: Andrew Carlotti, gcc-patches


On 11/11/22 06:46, Andrew Carlotti via Gcc-patches wrote:
> gcc/ChangeLog:
>
> 	* tree-ssa-loop-niter.c (ssa_defined_by_minus_one_stmt_p): Move
> 	(number_of_iterations_popcount): Move, and remove separate prototype.

OK.

jeff



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max
  2022-11-11 19:07 ` [PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max Andrew Carlotti
@ 2022-11-14 14:51   ` Jeff Law
  0 siblings, 0 replies; 28+ messages in thread
From: Jeff Law @ 2022-11-14 14:51 UTC (permalink / raw)
  To: Andrew Carlotti, gcc-patches


On 11/11/22 12:07, Andrew Carlotti via Gcc-patches wrote:
> This requirement is enforced by a gcc_checking_assert in
> record_estimate.
>
> gcc/ChangeLog:
>
> 	* tree-ssa-loop.h (tree_niter_desc): Update comment.

OK

jeff



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/8] middle-end: Refactor number_of_iterations_popcount
  2022-11-11 13:52 ` [PATCH 3/8] middle-end: Refactor number_of_iterations_popcount Andrew Carlotti
@ 2022-11-14 14:52   ` Richard Biener
  0 siblings, 0 replies; 28+ messages in thread
From: Richard Biener @ 2022-11-14 14:52 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: gcc-patches

On Fri, Nov 11, 2022 at 2:58 PM Andrew Carlotti via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This includes various changes to improve clarity, and to enable the code
> to be more similar to the clz and ctz idiom recognition added in
> subsequent patches.
>
> We create new number_of_iterations_bitcount function, which will be used
> to call the other bit-counting recognition functions added in subsequent
> patches, as well as a generic comment describing the loop structures
> that are common to each idiom. Some of the variables in
> number_of_iterations_popcount are given more descriptive names, and the
> popcount expression builder is extracted into a separate function.
>
> As part of the refactoring, we also fix a bug where the max loop count
> for modes shorter than an integer would be incorrectly computed as if
> the input mode were actually an integer.
>
> We also ensure that niter->max takes into account the final value for
> niter->niter (after any folding and simplifying), since if the latter is a
> constant, then record_estimate mandates that the two values are equivalent.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         * tree-ssa-loop-niter.cc
>         (number_of_iterations_exit_assumptions): Modify to call...
>         (number_of_iterations_bitcount): ...this new function.
>         (number_of_iterations_popcount): Now called by the above.
>         Refactor, and extract popcount expression builder to...
>         (build_popcount_expr): this new function.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/popcount-max.c: New test.
>
>
> --
>
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ca7204cbc3cea636183408e24d7dd36d702ffdb2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int count1 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b &= b - 1;
> +       c++;
> +    }
> +    if (c <= PREC)
> +      return 0;
> +    else
> +      return 34567;
> +}
> +
> +int count2 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b &= b - 1;
> +       c++;
> +    }
> +    if (c <= PREC - 1)
> +      return 0;
> +    else
> +      return 76543;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index 0af34e46580bb9a6f9b40e09c9f29b8454a4aaf6..fece876099c1687569d6351e7d2416ea6acae5b5 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -2026,6 +2026,48 @@ number_of_iterations_cond (class loop *loop,
>    return ret;
>  }
>
> +/* Return an expression that computes the popcount of src.  */
> +
> +static tree
> +build_popcount_expr (tree src)
> +{
> +  tree fn;
> +  int prec = TYPE_PRECISION (TREE_TYPE (src));
> +  int i_prec = TYPE_PRECISION (integer_type_node);
> +  int li_prec = TYPE_PRECISION (long_integer_type_node);
> +  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> +  if (prec <= i_prec)
> +    fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
> +  else if (prec == li_prec)
> +    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
> +  else if (prec == lli_prec || prec == 2 * lli_prec)
> +    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL);
> +  else
> +    return NULL_TREE;
> +
> +  tree utype = unsigned_type_for (TREE_TYPE (src));
> +  src = fold_convert (utype, src);
> +  if (prec < i_prec)
> +    src = fold_convert (unsigned_type_node, src);
> +  tree call;
> +  if (prec == 2 * lli_prec)
> +    {
> +      tree src1 = fold_convert (long_long_unsigned_type_node,
> +                               fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> +                                            unshare_expr (src),
> +                                            build_int_cst (integer_type_node,
> +                                                           lli_prec)));
> +      tree src2 = fold_convert (long_long_unsigned_type_node, src);
> +      tree call1 = build_call_expr (fn, 1, src1);
> +      tree call2 = build_call_expr (fn, 1, src2);
> +      call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2);
> +    }
> +  else
> +    call = build_call_expr (fn, 1, src);
> +
> +  return call;
> +}
> +
>  /* Utility function to check if OP is defined by a stmt
>     that is a val - 1.  */
>
> @@ -2041,45 +2083,18 @@ ssa_defined_by_minus_one_stmt_p (tree op, tree val)
>           && integer_minus_onep (gimple_assign_rhs2 (stmt)));
>  }
>
> -/* See if LOOP is a popcout implementation, determine NITER for the loop
> +/* See comment below for number_of_iterations_bitcount.
> +   For popcount, we have:
>
> -   We match:
> -   <bb 2>
> -   goto <bb 4>
> +   modify:
> +   _1 = iv_1 + -1
> +   iv_2 = iv_1 & _1
>
> -   <bb 3>
> -   _1 = b_11 + -1
> -   b_6 = _1 & b_11
> -
> -   <bb 4>
> -   b_11 = PHI <b_5(D)(2), b_6(3)>
> +   test:
> +   if (iv != 0)
>
> -   exit block
> -   if (b_11 != 0)
> -       goto <bb 3>
> -   else
> -       goto <bb 5>
> -
> -   OR we match copy-header version:
> -   if (b_5 != 0)
> -       goto <bb 3>
> -   else
> -       goto <bb 4>
> -
> -   <bb 3>
> -   b_11 = PHI <b_5(2), b_6(3)>
> -   _1 = b_11 + -1
> -   b_6 = _1 & b_11
> -
> -   exit block
> -   if (b_6 != 0)
> -       goto <bb 3>
> -   else
> -       goto <bb 4>
> -
> -   If popcount pattern, update NITER accordingly.
> -   i.e., set NITER to  __builtin_popcount (b)
> -   return true if we did, false otherwise.
> +   modification count:
> +   popcount (src)
>
>   */
>
> @@ -2088,138 +2103,150 @@ number_of_iterations_popcount (loop_p loop, edge exit,
>                                enum tree_code code,
>                                class tree_niter_desc *niter)
>  {
> -  bool adjust = true;
> -  tree iter;
> +  bool modify_before_test = true;
>    HOST_WIDE_INT max;
> -  adjust = true;
> -  tree fn = NULL_TREE;
> -
> -  /* Check loop terminating branch is like
> -     if (b != 0).  */
> -  gimple *stmt = last_stmt (exit->src);
> -  if (!stmt
> -      || gimple_code (stmt) != GIMPLE_COND
> +
> +  /* Check that condition for staying inside the loop is like
> +     if (iv != 0).  */
> +  gimple *cond_stmt = last_stmt (exit->src);
> +  if (!cond_stmt
> +      || gimple_code (cond_stmt) != GIMPLE_COND
>        || code != NE_EXPR
> -      || !integer_zerop (gimple_cond_rhs (stmt))
> -      || TREE_CODE (gimple_cond_lhs (stmt)) != SSA_NAME)
> +      || !integer_zerop (gimple_cond_rhs (cond_stmt))
> +      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
>      return false;
>
> -  gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (stmt));
> +  tree iv_2 = gimple_cond_lhs (cond_stmt);
> +  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
>
> -  /* Depending on copy-header is performed, feeding PHI stmts might be in
> -     the loop header or loop latch, handle this.  */
> -  if (gimple_code (and_stmt) == GIMPLE_PHI
> -      && gimple_bb (and_stmt) == loop->header
> -      && gimple_phi_num_args (and_stmt) == 2
> -      && (TREE_CODE (gimple_phi_arg_def (and_stmt,
> +  /* If the test comes before the iv modification, then these will actually be
> +     iv_1 and a phi node.  */
> +  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> +      && gimple_bb (iv_2_stmt) == loop->header
> +      && gimple_phi_num_args (iv_2_stmt) == 2
> +      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
>                                          loop_latch_edge (loop)->dest_idx))
>           == SSA_NAME))
>      {
> -      /* SSA used in exit condition is defined by PHI stmt
> -       b_11 = PHI <b_5(D)(2), b_6(3)>
> -       from the PHI stmt, get the and_stmt
> -       b_6 = _1 & b_11.  */
> -      tree t = gimple_phi_arg_def (and_stmt, loop_latch_edge (loop)->dest_idx);
> -      and_stmt = SSA_NAME_DEF_STMT (t);
> -      adjust = false;
> +      /* iv_2 is actually one of the inputs to the phi.  */
> +      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> +      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +      modify_before_test = false;
>      }
>
> -  /* Make sure it is indeed an and stmt (b_6 = _1 & b_11).  */
> -  if (!is_gimple_assign (and_stmt)
> -      || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR)
> +  /* Make sure iv_2_stmt is an and stmt (iv_2 = _1 & iv_1).  */
> +  if (!is_gimple_assign (iv_2_stmt)
> +      || gimple_assign_rhs_code (iv_2_stmt) != BIT_AND_EXPR)
>      return false;
>
> -  tree b_11 = gimple_assign_rhs1 (and_stmt);
> -  tree _1 = gimple_assign_rhs2 (and_stmt);
> +  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> +  tree _1 = gimple_assign_rhs2 (iv_2_stmt);
>
> -  /* Check that _1 is defined by _b11 + -1 (_1 = b_11 + -1).
> -     Also make sure that b_11 is the same in and_stmt and _1 defining stmt.
> +  /* Check that _1 is defined by (_1 = iv_1 + -1).
> +     Also make sure that _1 is the same in and_stmt and _1 defining stmt.
>       Also canonicalize if _1 and _b11 are revrsed.  */
> -  if (ssa_defined_by_minus_one_stmt_p (b_11, _1))
> -    std::swap (b_11, _1);
> -  else if (ssa_defined_by_minus_one_stmt_p (_1, b_11))
> +  if (ssa_defined_by_minus_one_stmt_p (iv_1, _1))
> +    std::swap (iv_1, _1);
> +  else if (ssa_defined_by_minus_one_stmt_p (_1, iv_1))
>      ;
>    else
>      return false;
> -  /* Check the recurrence:
> -   ... = PHI <b_5(2), b_6(3)>.  */
> -  gimple *phi = SSA_NAME_DEF_STMT (b_11);
> +
> +  /* Check the recurrence.  */
> +  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
>    if (gimple_code (phi) != GIMPLE_PHI
>        || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> -      || (gimple_assign_lhs (and_stmt)
> -         != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> +      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
>      return false;
>
> -  /* We found a match. Get the corresponding popcount builtin.  */
> +  /* We found a match.  */
>    tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> -  if (TYPE_PRECISION (TREE_TYPE (src)) <= TYPE_PRECISION (integer_type_node))
> -    fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
> -  else if (TYPE_PRECISION (TREE_TYPE (src))
> -          == TYPE_PRECISION (long_integer_type_node))
> -    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
> -  else if (TYPE_PRECISION (TREE_TYPE (src))
> -          == TYPE_PRECISION (long_long_integer_type_node)
> -          || (TYPE_PRECISION (TREE_TYPE (src))
> -              == 2 * TYPE_PRECISION (long_long_integer_type_node)))
> -    fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL);
> +  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
>
> -  if (!fn)
> +  /* Get the corresponding popcount builtin.  */
> +  tree expr = build_popcount_expr (src);
> +
> +  if (!expr)
>      return false;
>
> -  /* Update NITER params accordingly  */
> -  tree utype = unsigned_type_for (TREE_TYPE (src));
> -  src = fold_convert (utype, src);
> -  if (TYPE_PRECISION (TREE_TYPE (src)) < TYPE_PRECISION (integer_type_node))
> -    src = fold_convert (unsigned_type_node, src);
> -  tree call;
> -  if (TYPE_PRECISION (TREE_TYPE (src))
> -      == 2 * TYPE_PRECISION (long_long_integer_type_node))
> +  max = src_precision;
> +
> +  tree may_be_zero = boolean_false_node;
> +
> +  if (modify_before_test)
>      {
> -      int prec = TYPE_PRECISION (long_long_integer_type_node);
> -      tree src1 = fold_convert (long_long_unsigned_type_node,
> -                               fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> -                                            unshare_expr (src),
> -                                            build_int_cst (integer_type_node,
> -                                                           prec)));
> -      tree src2 = fold_convert (long_long_unsigned_type_node, src);
> -      call = build_call_expr (fn, 1, src1);
> -      call = fold_build2 (PLUS_EXPR, TREE_TYPE (call), call,
> -                         build_call_expr (fn, 1, src2));
> -      call = fold_convert (utype, call);
> +      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> +                         integer_one_node);
> +      max = max - 1;
> +      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> +                                     build_zero_cst (TREE_TYPE (src)));
>      }
> -  else
> -    call = fold_convert (utype, build_call_expr (fn, 1, src));
> -  if (adjust)
> -    iter = fold_build2 (MINUS_EXPR, utype, call, build_int_cst (utype, 1));
> -  else
> -    iter = call;
>
> -  if (TREE_CODE (call) == INTEGER_CST)
> -    max = tree_to_uhwi (call);
> -  else
> -    max = TYPE_PRECISION (TREE_TYPE (src));
> -  if (adjust)
> -    max = max - 1;
> +  expr = fold_convert (unsigned_type_node, expr);
>
> -  niter->niter = iter;
>    niter->assumptions = boolean_true_node;
> +  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> +  niter->niter = simplify_using_initial_conditions(loop, expr);
>
> -  if (adjust)
> -    {
> -      tree may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> -                                     build_zero_cst (TREE_TYPE (src)));
> -      niter->may_be_zero
> -       = simplify_using_initial_conditions (loop, may_be_zero);
> -    }
> +  if (TREE_CODE (niter->niter) == INTEGER_CST)
> +    niter->max = tree_to_uhwi (niter->niter);
>    else
> -    niter->may_be_zero = boolean_false_node;
> +    niter->max = max;
>
> -  niter->max = max;
>    niter->bound = NULL_TREE;
>    niter->cmp = ERROR_MARK;
>    return true;
>  }
>
> +/* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
> +   1. A modification to the induction variable.
> +   2. A test to determine whether or not to exit the loop.
> +
> +   These can come in either order - i.e.:
> +
> +   <bb 3>
> +   iv_1 = PHI <src(2), iv_2(4)>
> +   if (test (iv_1))
> +     goto <bb 4>
> +   else
> +     goto <bb 5>
> +
> +   <bb 4>
> +   iv_2 = modify (iv_1)
> +   goto <bb 3>
> +
> +   OR
> +
> +   <bb 3>
> +   iv_1 = PHI <src(2), iv_2(4)>
> +   iv_2 = modify (iv_1)
> +
> +   <bb 4>
> +   if (test (iv_2))
> +     goto <bb 3>
> +   else
> +     goto <bb 5>
> +
> +   The second form can be generated by copying the loop header out of the loop.
> +
> +   In the first case, the number of latch executions will be equal to the
> +   number of induction variable modifications required before the test fails.
> +
> +   In the second case (modify_before_test), if we assume that the number of
> +   modifications required before the test fails is nonzero, then the number of
> +   latch executions will be one less than this number.
> +
> +   If we recognise the pattern, then we update niter accordingly, and return
> +   true.  */
> +
> +static bool
> +number_of_iterations_bitcount (loop_p loop, edge exit,
> +                              enum tree_code code,
> +                              class tree_niter_desc *niter)
> +{
> +  return number_of_iterations_popcount (loop, exit, code, niter);
> +}
> +
>  /* Substitute NEW_TREE for OLD in EXPR and fold the result.
>     If VALUEIZE is non-NULL then OLD and NEW_TREE are ignored and instead
>     all SSA names are replaced with the result of calling the VALUEIZE
> @@ -2758,7 +2785,7 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
>    tree iv0_niters = NULL_TREE;
>    if (!simple_iv_with_niters (loop, loop_containing_stmt (stmt),
>                               op0, &iv0, safe ? &iv0_niters : NULL, false))
> -    return number_of_iterations_popcount (loop, exit, code, niter);
> +    return number_of_iterations_bitcount (loop, exit, code, niter);
>    tree iv1_niters = NULL_TREE;
>    if (!simple_iv_with_niters (loop, loop_containing_stmt (stmt),
>                               op1, &iv1, safe ? &iv1_niters : NULL, false))

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 6/8] docs: Add popcount, clz and ctz target attributes
  2022-11-11 18:54 ` [PATCH 6/8] docs: Add popcount, clz and ctz target attributes Andrew Carlotti
@ 2022-11-14 14:52   ` Jeff Law
  2022-12-22 17:42     ` [PATCH 6/8 v2] " Andrew Carlotti
  0 siblings, 1 reply; 28+ messages in thread
From: Jeff Law @ 2022-11-14 14:52 UTC (permalink / raw)
  To: Andrew Carlotti, gcc-patches


On 11/11/22 11:54, Andrew Carlotti via Gcc-patches wrote:
> gcc/ChangeLog:
>
> 	* doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst:
> 	Add missing target attributes.

OK

jeff



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8] middle-end: Add cltz_complement idiom recognition
  2022-11-11 18:50 ` [PATCH 5/8] middle-end: Add cltz_complement idiom recognition Andrew Carlotti
@ 2022-11-14 15:10   ` Richard Biener
  2022-11-21 15:53     ` Andrew Carlotti
  0 siblings, 1 reply; 28+ messages in thread
From: Richard Biener @ 2022-11-14 15:10 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: gcc-patches

On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This recognises patterns of the form:
> while (n) { n >>= 1 }
>
> This patch results in improved (but still suboptimal) codegen:
>
> foo (unsigned int b) {
>     int c = 0;
>
>     while (b) {
>         b >>= 1;
>         c++;
>     }
>
>     return c;
> }
>
> foo:
> .LFB11:
>         .cfi_startproc
>         cbz     w0, .L3
>         clz     w1, w0
>         tst     x0, 1
>         mov     w0, 32
>         sub     w0, w0, w1
>         csel    w0, w0, wzr, ne
>         ret
>
> The conditional is unnecessary. phiopt could recognise a redundant csel
> (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> clz call, but it cannot recognise the redundancy when the input is (e.g.)
> (32 - clz).
>
> I could perhaps extend this function to recognise this pattern in a later
> patch, if this is a good place to recognise more patterns.
>
> gcc/ChangeLog:
>
>         * tree-scalar-evolution.cc (expression_expensive_p): Add checks
>         for c[lt]z optabs.
>         * tree-ssa-loop-niter.cc (build_cltz_expr): New.
>         (number_of_iterations_cltz_complement): New.
>         (number_of_iterations_bitcount): Add call to the above.
>
> gcc/testsuite/ChangeLog:
>
>         * lib/target-supports.exp (check_effective_target_clz)
>         (check_effective_target_clzl, check_effective_target_clzll)
>         (check_effective_target_ctz, check_effective_target_ctzl)
>         (check_effective_target_ctzll): New.
>         * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-char.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-int.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-long.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
>
>
> --
>
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1a29ca52e42e50822e4e3213b2cb008b766d0318
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
> @@ -0,0 +1,60 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int clz_complement_count1 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +    if (c <= PREC)
> +      return 0;
> +    else
> +      return 34567;
> +}
> +
> +int clz_complement_count2 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 1)
> +      return 0;
> +    else
> +      return 76543;
> +}
> +
> +int ctz_complement_count1 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +    if (c <= PREC)
> +      return 0;
> +    else
> +      return 23456;
> +}
> +
> +int ctz_complement_count2 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 1)
> +      return 0;
> +    else
> +      return 65432;
> +}
> +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(255) != 8)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f2c5c23f6a7d84ecb637c6961698b0fc30d7426b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(1 << (PREC - 1)) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..7f7793f0efac1f0d793e6e99b84988e5cc5221c9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzll } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(1LL << (PREC - 1)) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..97161bb7a74260bea20e325ebab64acb33a2b696
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(1L << (PREC - 1)) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b9afe8852d8ffbc7ee9a0760cf04b8f98af293a2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d2702a65daf34db66550d2255395db68a29a4797
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1ea0d5d7d9f8be1824c4177c33edd91e66b4ddab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzll } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..80fb02dcfa68bc022ae69b26fb189323e01fc6fc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index c7f583d6d1498401a7c106ed3f539dcd04f95451..325f12d62324793d6b2cf55b074ef6cc9cf4dd4d 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -8687,6 +8687,72 @@ proc check_effective_target_popcount { } {
>      } "" ]
>  }
>
> +# Return 1 if the target supports clz on int.
> +
> +proc check_effective_target_clz { } {
> +    return [check_no_messages_and_pattern clz "!\\(call" rtl-expand {
> +        int foo (int b)
> +          {
> +            return __builtin_clz (b);
> +          }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports clz on long.
> +
> +proc check_effective_target_clzl { } {
> +    return [check_no_messages_and_pattern clzl "!\\(call" rtl-expand {
> +       int foo (long b)
> +         {
> +           return __builtin_clzl (b);
> +         }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports clz on long long.
> +
> +proc check_effective_target_clzll { } {
> +    return [check_no_messages_and_pattern clzll "!\\(call" rtl-expand {
> +        int foo (long long b)
> +          {
> +            return __builtin_clzll (b);
> +          }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on int.
> +
> +proc check_effective_target_ctz { } {
> +    return [check_no_messages_and_pattern ctz "!\\(call" rtl-expand {
> +        int foo (int b)
> +          {
> +            return __builtin_ctz (b);
> +          }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on long.
> +
> +proc check_effective_target_ctzl { } {
> +    return [check_no_messages_and_pattern ctzl "!\\(call" rtl-expand {
> +       int foo (long b)
> +         {
> +           return __builtin_ctzl (b);
> +         }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on long long.
> +
> +proc check_effective_target_ctzll { } {
> +    return [check_no_messages_and_pattern ctzll "!\\(call" rtl-expand {
> +        int foo (long long b)
> +          {
> +            return __builtin_ctzll (b);
> +          }
> +    } "" ]
> +}
> +
>  # Return 1 if the target supports atomic operations on "long long"
>  # and can execute them.
>  #
> diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> index 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 100644
> --- a/gcc/tree-scalar-evolution.cc
> +++ b/gcc/tree-scalar-evolution.cc
> @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
>          library call for popcount when backend does not have an instruction
>          to do so.  We consider this to be expensive and generate
>          __builtin_popcount only when backend defines it.  */
> +      optab optab;
>        combined_fn cfn = get_call_combined_fn (expr);
>        switch (cfn)
>         {
>         CASE_CFN_POPCOUNT:
> +         optab = popcount_optab;
> +         goto bitcount_call;
> +       CASE_CFN_CLZ:
> +         optab = clz_optab;
> +         goto bitcount_call;
> +       CASE_CFN_CTZ:
> +         optab = ctz_optab;
> +bitcount_call:
>           /* Check if opcode for popcount is available in the mode required.  */
> -         if (optab_handler (popcount_optab,
> +         if (optab_handler (optab,
>                              TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
>               == CODE_FOR_nothing)
>             {
> @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
>                  instructions.  */
>               if (is_a <scalar_int_mode> (mode, &int_mode)
>                   && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> -                 && (optab_handler (popcount_optab, word_mode)
> +                 && (optab_handler (optab, word_mode)
>                       != CODE_FOR_nothing))
>                   break;
>               return true;
> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
>    return true;
>  }
>
> +/* Return an expression that counts the leading/trailing zeroes of src.  */

Can you expand the comment on how you handle defined_at_zero and how
that relates to the C[LT]Z_DEFINED_VALUE_AT_ZERO target macros?
The loop examples you gave above all have a defined value for zero, I'm
not sure how you'd write a C loop which has that undefined.

> +static tree
> +build_cltz_expr (tree src, bool leading, bool defined_at_zero)
> +{
> +  tree fn;
> +  int prec = TYPE_PRECISION (TREE_TYPE (src));
> +  int i_prec = TYPE_PRECISION (integer_type_node);
> +  int li_prec = TYPE_PRECISION (long_integer_type_node);
> +  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> +  if (prec <= i_prec)

I don't think we can use <= for both CLZ and CTZ, no?  You probably
need a GIMPLE testcase or a language that doesn't promote char/short
to int for a testcase that fails though.

> +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
> +                : builtin_decl_implicit (BUILT_IN_CTZ);
> +  else if (prec == li_prec)
> +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
> +                : builtin_decl_implicit (BUILT_IN_CTZL);
> +  else if (prec == lli_prec || prec == 2 * lli_prec)
> +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
> +                : builtin_decl_implicit (BUILT_IN_CTZLL);
> +  else
> +    return NULL_TREE;
> +
> +  tree utype = unsigned_type_for (TREE_TYPE (src));
> +  src = fold_convert (utype, src);
> +  if (prec < i_prec)
> +    src = fold_convert (unsigned_type_node, src);
> +
> +  tree call;
> +  if (prec == 2 * lli_prec)
> +    {
> +      tree src1 = fold_convert (long_long_unsigned_type_node,
> +                               fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> +                                            unshare_expr (src),
> +                                            build_int_cst (integer_type_node,
> +                                                           lli_prec)));
> +      tree src2 = fold_convert (long_long_unsigned_type_node, src);
> +      /* We count the zeroes in src1, and add the number in src2 when src1
> +        is 0.  */
> +      if (!leading)
> +       std::swap(src1, src2);
> +      tree call1 = build_call_expr (fn, 1, src1);
> +      tree call2 = build_call_expr (fn, 1, src2);
> +      if (defined_at_zero)
> +       {
> +         tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
> +                                      build_zero_cst (TREE_TYPE (src2)));
> +         call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
> +                             build_int_cst (integer_type_node, lli_prec));
> +       }
> +      tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
> +                                  build_zero_cst (TREE_TYPE (src1)));
> +      call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
> +                        fold_build2 (PLUS_EXPR, integer_type_node, call2,
> +                                     build_int_cst (integer_type_node,
> +                                                    lli_prec)));
> +    }
> +  else
> +    {
> +      call = build_call_expr (fn, 1, src);
> +      if (defined_at_zero)
> +       {
> +         tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> +                                     build_zero_cst (TREE_TYPE (src)));
> +         call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> +                            build_int_cst (integer_type_node, prec));
> +       }
> +    }
> +
> +  if (leading && prec < i_prec)
> +    call = fold_build2(MINUS_EXPR, integer_type_node, call,
> +                      build_int_cst (integer_type_node,
> +                                     i_prec - prec));
> +
> +  return call;
> +}
> +
> +/* See comment below for number_of_iterations_bitcount.
> +   For c[lt]z complement, we have:
> +
> +   modify:
> +   iv_2 = iv_1 >> 1 OR iv_1 << 1
> +
> +   test:
> +   if (iv != 0)
> +
> +   modification count:
> +   src precision - c[lt]z (src)
> +
> + */
> +
> +static bool
> +number_of_iterations_cltz_complement (loop_p loop, edge exit,
> +                              enum tree_code code,
> +                              class tree_niter_desc *niter)
> +{
> +  bool modify_before_test = true;
> +  HOST_WIDE_INT max;
> +
> +  /* Check that condition for staying inside the loop is like
> +     if (iv != 0).  */
> +  gimple *cond_stmt = last_stmt (exit->src);
> +  if (!cond_stmt
> +      || gimple_code (cond_stmt) != GIMPLE_COND
> +      || code != NE_EXPR
> +      || !integer_zerop (gimple_cond_rhs (cond_stmt))
> +      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> +    return false;
> +
> +  tree iv_2 = gimple_cond_lhs (cond_stmt);
> +  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +
> +  /* If the test comes before the iv modification, then these will actually be
> +     iv_1 and a phi node.  */
> +  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> +      && gimple_bb (iv_2_stmt) == loop->header
> +      && gimple_phi_num_args (iv_2_stmt) == 2
> +      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> +                                        loop_latch_edge (loop)->dest_idx))
> +         == SSA_NAME))
> +    {
> +      /* iv_2 is actually one of the inputs to the phi.  */
> +      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> +      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +      modify_before_test = false;
> +    }
> +
> +  /* Make sure iv_2_stmt is a logical shift by one stmt:
> +     iv_2 = iv_1 {>>|<<} 1  */
> +  if (!is_gimple_assign (iv_2_stmt)
> +      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> +         && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> +             || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> +      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> +    return false;
> +
> +  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> +
> +  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> +
> +  /* Check the recurrence.  */
> +  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> +  if (gimple_code (phi) != GIMPLE_PHI
> +      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> +      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> +    return false;
> +
> +  /* We found a match.  */
> +  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> +  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> +
> +  /* Get the corresponding c[lt]z builtin.  */
> +  tree expr = build_cltz_expr (src, !left_shift, true);

So we always have defined_at_zero == true?

> +
> +  if (!expr)
> +    return false;
> +
> +  expr = fold_build2 (MINUS_EXPR, integer_type_node,
> +                     build_int_cst (integer_type_node, src_precision),
> +                     expr);
> +
> +  max = src_precision;
> +
> +  tree may_be_zero = boolean_false_node;
> +
> +  if (modify_before_test)
> +    {
> +      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> +                         integer_one_node);
> +      max = max - 1;
> +      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> +                                     build_zero_cst (TREE_TYPE (src)));
> +    }
> +
> +  expr = fold_convert (unsigned_type_node, expr);
> +
> +  niter->assumptions = boolean_true_node;
> +  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> +  niter->niter = simplify_using_initial_conditions (loop, expr);
> +
> +  if (TREE_CODE (niter->niter) == INTEGER_CST)
> +    niter->max = tree_to_uhwi (niter->niter);
> +  else
> +    niter->max = max;
> +
> +  niter->bound = NULL_TREE;
> +  niter->cmp = ERROR_MARK;
> +  return true;
> +}
> +
>  /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
>     1. A modification to the induction variabler;.
>     2. A test to determine whether or not to exit the loop.
> @@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
>                                enum tree_code code,
>                                class tree_niter_desc *niter)
>  {
> -  return number_of_iterations_popcount (loop, exit, code, niter);
> +  return (number_of_iterations_popcount (loop, exit, code, niter)
> +         || number_of_iterations_cltz_complement (loop, exit, code, niter));

I'm kind-of missing the non-complement variant ;)

Otherwise looks OK to me.

Thanks,
Richard.

>  }
>
>  /* Substitute NEW_TREE for OLD in EXPR and fold the result.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/8] middle-end: Add c[lt]z idiom recognition
  2022-11-11 19:01 ` [PATCH 7/8] middle-end: Add c[lt]z idiom recognition Andrew Carlotti
@ 2022-11-14 15:22   ` Richard Biener
  0 siblings, 0 replies; 28+ messages in thread
From: Richard Biener @ 2022-11-14 15:22 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: gcc-patches

On Fri, Nov 11, 2022 at 8:06 PM Andrew Carlotti via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This recognises the patterns of the form:
>   while (n & 1) { n >>= 1 }
>
> Unfortunately there are currently two issues relating to this patch.
>
> Firstly, simplify_using_initial_conditions does not recognise that
>         (n != 0) and ((n & 1) == 0) implies that ((n >> 1) != 0).
>
> These preconditions arise following the loop copy-header pass, and the
> assumptions returned by number_of_iterations_exit_assumptions then
> prevent final value replacement from using the niter result.
>
> I'm not sure what is the best way to fix this - one approach could be to
> modify simplify_using_initial_conditions to handle this sort of case,
> but it seems that it basically wants the information that ranger could
> give anyway, so would something like that be a better option?

I've noted elsewhere that simplify_using_initial_conditions should be
rewritten to use (path) ranger somehow.  But I've also worked around
that for the case in PR100756, though not in this function.

> The second issue arises in the vectoriser, which is able to determine
> that the niter->assumptions are always true.
> When building with -march=armv8.4-a+sve -S -O3, we get this codegen:
>
> foo (unsigned int b) {
>     int c = 0;
>
>     if (b == 0)
>       return PREC;
>
>     while (!(b & (1 << (PREC - 1)))) {
>         b <<= 1;
>         c++;
>     }
>
>     return c;
> }
>
> foo:
> .LFB0:
>         .cfi_startproc
>         cmp     w0, 0
>         cbz     w0, .L6
>         blt     .L7
>         lsl     w1, w0, 1
>         clz     w2, w1
>         cmp     w2, 14
>         bls     .L8
>         mov     x0, 0
>         cntw    x3
>         add     w1, w2, 1
>         index   z1.s, #0, #1
>         whilelo p0.s, wzr, w1
> .L4:
>         add     x0, x0, x3
>         mov     p1.b, p0.b
>         mov     z0.d, z1.d
>         whilelo p0.s, w0, w1
>         incw    z1.s
>         b.any   .L4
>         add     z0.s, z0.s, #1
>         lastb   w0, p1, z0.s
>         ret
>         .p2align 2,,3
> .L8:
>         mov     w0, 0
>         b       .L3
>         .p2align 2,,3
> .L13:
>         lsl     w1, w1, 1
> .L3:
>         add     w0, w0, 1
>         tbz     w1, #31, .L13
>         ret
>         .p2align 2,,3
> .L6:
>         mov     w0, 32
>         ret
>         .p2align 2,,3
> .L7:
>         mov     w0, 0
>         ret
>         .cfi_endproc
>
> In essence, the vectoriser uses the niter information to determine
> exactly how many iterations of the loop it needs to run. It then uses
> SVE whilelo instructions to run this number of iterations. The original
> loop counter is also vectorised, despite only being used in the final
> iteration, and then the final value of this counter is used as the
> return value (which is the same as the number of iterations it computed
> in the first place).
>
> This vectorisation is obviously bad, and I think it exposes a latent
> bug in the vectoriser, rather than being an issue caused by this
> specific patch.

The main issue is that we use niter analysis to detect popcount and
friends but final value replacement doesn't always apply.  When other
optimizations pick up this niter result the final values are not
replaced as aggressively.  Ideally we'd replace the loop's IV with
a counting one, but sometimes the intermediate values of the
popcounted variable are still used.

This patch looks OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         * tree-ssa-loop-niter.cc (number_of_iterations_cltz): New.
>         (number_of_iterations_bitcount): Add call to the above.
>         (number_of_iterations_exit_assumptions): Add EQ_EXPR case for
>         c[lt]z idiom recognition.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/cltz-max.c: New test.
>         * gcc.dg/tree-ssa/clz-char.c: New test.
>         * gcc.dg/tree-ssa/clz-int.c: New test.
>         * gcc.dg/tree-ssa/clz-long-long.c: New test.
>         * gcc.dg/tree-ssa/clz-long.c: New test.
>         * gcc.dg/tree-ssa/ctz-char.c: New test.
>         * gcc.dg/tree-ssa/ctz-int.c: New test.
>         * gcc.dg/tree-ssa/ctz-long-long.c: New test.
>         * gcc.dg/tree-ssa/ctz-long.c: New test.
>
>
> --
>
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a6bea3d338940efee2e7e1c95a5941525945af9e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c
> @@ -0,0 +1,72 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int clz_count1 (unsigned char b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return 0;
> +
> +    while (!(b & (1 << (PREC - 1)))) {
> +       b <<= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 1)
> +      return 0;
> +    else
> +      return 34567;
> +}
> +
> +int clz_count2 (unsigned char b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return 0;
> +
> +    while (!(b & (1 << PREC - 1))) {
> +       b <<= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 2)
> +      return 0;
> +    else
> +      return 76543;
> +}
> +
> +int ctz_count1 (unsigned char b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return 0;
> +
> +    while (!(b & 1)) {
> +       b >>= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 1)
> +      return 0;
> +    else
> +      return 23456;
> +}
> +
> +int ctz_count2 (unsigned char b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return 0;
> +
> +    while (!(b & 1)) {
> +       b >>= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 2)
> +      return 0;
> +    else
> +      return 65432;
> +}
> +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..4a122db95bbb576b4ade706bd3b1ca809d2f1e3b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c
> @@ -0,0 +1,34 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzl } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & (1 << (PREC - 1)))) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(1 << (PREC - 1)) != 0)
> +    __builtin_abort ();
> +  if (foo(35) != PREC - 6)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..96646f8e19cd5b2342acb88949b3ef6e3e2abd5a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-int.c
> @@ -0,0 +1,34 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzl } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & (1 << (PREC - 1)))) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(1 << (PREC - 1)) != 0)
> +    __builtin_abort ();
> +  if (foo(35) != PREC - 6)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..80d3edc1dab2e74fc3271ba9d97640839b3a3786
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-long-long.c
> @@ -0,0 +1,34 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzll } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & (1LL << (PREC - 1)))) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(1LL << (PREC - 1)) != 0)
> +    __builtin_abort ();
> +  if (foo(35) != PREC - 6)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1c8037f93b9c9d42f580a172267c65723a46ef8b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-long.c
> @@ -0,0 +1,34 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzl } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & (1L << (PREC - 1)))) {
> +       b <<= 1;
> +       c++;
> +}
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(1L << (PREC - 1)) != 0)
> +    __builtin_abort ();
> +  if (foo(35) != PREC - 6)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-char.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..3cd166acbd4670e175d79a2403de2d5a4fd38665
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-char.c
> @@ -0,0 +1,36 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & 1)) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(128) != 7)
> +    __builtin_abort ();
> +  if (foo(96) != 5)
> +    __builtin_abort ();
> +  if (foo(35) != 0)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-int.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..7f63493eb7389a18516f8f126c3c55dc80f0bde6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-int.c
> @@ -0,0 +1,36 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & 1)) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(1 << (PREC - 1)) != PREC - 1)
> +    __builtin_abort ();
> +  if (foo(96) != 5)
> +    __builtin_abort ();
> +  if (foo(35) != 0)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..924f61b76f01c77a40b9fff64af3b629ab1418c0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long-long.c
> @@ -0,0 +1,36 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzll } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & 1)) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(1LL << (PREC - 1)) != PREC - 1)
> +    __builtin_abort ();
> +  if (foo(96) != 5)
> +    __builtin_abort ();
> +  if (foo(35) != 0)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..178945daa8a2697989f1a1a0804ce33d768dcc55
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-long.c
> @@ -0,0 +1,36 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzl } */
> +/* { dg-options "-O2 -fno-tree-ch -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> +    int c = 0;
> +
> +    if (b == 0)
> +      return PREC;
> +
> +    while (!(b & 1)) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != PREC)
> +    __builtin_abort ();
> +  if (foo(1L << (PREC - 1)) != PREC - 1)
> +    __builtin_abort ();
> +  if (foo(96) != 5)
> +    __builtin_abort ();
> +  if (foo(35) != 0)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index 16e8e25919d808cea27adbd72f0be01ae2f0e547..87e6fe81d68fc3352e450688ef79e6fc68854d8a 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -2274,6 +2274,167 @@ build_cltz_expr (tree src, bool leading, bool defined_at_zero)
>    return call;
>  }
>
> +/* See comment below for number_of_iterations_bitcount.
> +   For c[lt]z, we have:
> +
> +   modify:
> +   iv_2 = iv_1 << 1 OR iv_1 >> 1
> +
> +   test:
> +   if (iv & 1 << (prec-1)) OR (iv & 1)
> +
> +   modification count:
> +   c[lt]z (src)
> +
> + */
> +
> +static bool
> +number_of_iterations_cltz (loop_p loop, edge exit,
> +                              enum tree_code code,
> +                              class tree_niter_desc *niter)
> +{
> +  bool modify_before_test = true;
> +  HOST_WIDE_INT max;
> +  int checked_bit;
> +  tree iv_2;
> +
> +  /* Check that condition for staying inside the loop is like
> +     if (iv == 0).  */
> +  gimple *cond_stmt = last_stmt (exit->src);
> +  if (!cond_stmt
> +      || gimple_code (cond_stmt) != GIMPLE_COND
> +      || (code != EQ_EXPR && code != GE_EXPR)
> +      || !integer_zerop (gimple_cond_rhs (cond_stmt))
> +      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> +    return false;
> +
> +  if (code == EQ_EXPR)
> +    {
> +      /* Make sure we check a bitwise and with a suitable constant */
> +      gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (cond_stmt));
> +      if (!is_gimple_assign (and_stmt)
> +         || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR
> +         || !integer_pow2p (gimple_assign_rhs2 (and_stmt)))
> +       return false;
> +
> +      checked_bit = tree_log2 (gimple_assign_rhs2 (and_stmt));
> +
> +      iv_2 = gimple_assign_rhs1 (and_stmt);
> +    }
> +  else
> +    {
> +      /* We have a GE_EXPR - a signed comparison with zero is equivalent to
> +        testing the leading bit, so check for this pattern too.  */
> +
> +      iv_2 = gimple_cond_lhs (cond_stmt);
> +      tree test_value_type = TREE_TYPE (iv_2);
> +
> +      if (TYPE_UNSIGNED (test_value_type))
> +       return false;
> +
> +      gimple *test_value_stmt = SSA_NAME_DEF_STMT (iv_2);
> +
> +      if (is_gimple_assign (test_value_stmt)
> +         && gimple_assign_rhs_code (test_value_stmt) == NOP_EXPR)
> +       {
> +         /* If the test value comes from a NOP_EXPR, then we need to unwrap
> +            this.  We conservatively require that both types have the same
> +            precision.  */
> +         iv_2 = gimple_assign_rhs1 (test_value_stmt);
> +         tree rhs_type = TREE_TYPE (iv_2);
> +         if (TREE_CODE (rhs_type) != INTEGER_TYPE
> +             || (TYPE_PRECISION (rhs_type)
> +                 != TYPE_PRECISION (test_value_type)))
> +           return false;
> +       }
> +
> +      checked_bit = TYPE_PRECISION (test_value_type) - 1;
> +    }
> +
> +  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +
> +  /* If the test comes before the iv modification, then these will actually be
> +     iv_1 and a phi node.  */
> +  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> +      && gimple_bb (iv_2_stmt) == loop->header
> +      && gimple_phi_num_args (iv_2_stmt) == 2
> +      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> +                                        loop_latch_edge (loop)->dest_idx))
> +         == SSA_NAME))
> +    {
> +      /* iv_2 is actually one of the inputs to the phi.  */
> +      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> +      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +      modify_before_test = false;
> +    }
> +
> +  /* Make sure iv_2_stmt is a logical shift by one stmt:
> +     iv_2 = iv_1 {<<|>>} 1  */
> +  if (!is_gimple_assign (iv_2_stmt)
> +      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> +         && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> +             || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> +      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> +    return false;
> +
> +  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> +
> +  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> +
> +  /* Check the recurrence.  */
> +  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> +  if (gimple_code (phi) != GIMPLE_PHI
> +      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> +      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> +    return false;
> +
> +  /* We found a match.  */
> +  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> +  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> +
> +  /* Apply any needed preprocessing to src.  */
> +  int num_ignored_bits;
> +  if (left_shift)
> +    num_ignored_bits = src_precision - checked_bit - 1;
> +  else
> +    num_ignored_bits = checked_bit;
> +
> +  if (modify_before_test)
> +    num_ignored_bits++;
> +
> +  if (num_ignored_bits != 0)
> +    src = fold_build2 (left_shift ? LSHIFT_EXPR : RSHIFT_EXPR,
> +                      TREE_TYPE (src), src,
> +                      build_int_cst (integer_type_node, num_ignored_bits));
> +
> +  /* Get the corresponding c[lt]z builtin.  */
> +  tree expr = build_cltz_expr (src, left_shift, false);
> +
> +  if (!expr)
> +    return false;
> +
> +  max = src_precision - num_ignored_bits - 1;
> +
> +  expr = fold_convert (unsigned_type_node, expr);
> +
> +  tree assumptions = fold_build2 (NE_EXPR, boolean_type_node, src,
> +                                 build_zero_cst (TREE_TYPE (src)));
> +
> +  niter->assumptions = simplify_using_initial_conditions (loop, assumptions);
> +  niter->may_be_zero = boolean_false_node;
> +  niter->niter = simplify_using_initial_conditions (loop, expr);
> +
> +  if (TREE_CODE (niter->niter) == INTEGER_CST)
> +    niter->max = tree_to_uhwi (niter->niter);
> +  else
> +    niter->max = max;
> +
> +  niter->bound = NULL_TREE;
> +  niter->cmp = ERROR_MARK;
> +
> +  return true;
> +}
> +
>  /* See comment below for number_of_iterations_bitcount.
>     For c[lt]z complement, we have:
>
> @@ -2434,6 +2595,7 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
>                                class tree_niter_desc *niter)
>  {
>    return (number_of_iterations_popcount (loop, exit, code, niter)
> +         || number_of_iterations_cltz (loop, exit, code, niter)
>           || number_of_iterations_cltz_complement (loop, exit, code, niter));
>  }
>
> @@ -2960,6 +3122,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit,
>      case NE_EXPR:
>        break;
>
> +    case EQ_EXPR:
> +      return number_of_iterations_cltz (loop, exit, code, niter);
> +
>      default:
>        return false;
>      }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8] middle-end: Add cltz_complement idiom recognition
  2022-11-14 15:10   ` Richard Biener
@ 2022-11-21 15:53     ` Andrew Carlotti
  2022-11-24 10:41       ` Richard Biener
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2022-11-21 15:53 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches

On Mon, Nov 14, 2022 at 04:10:22PM +0100, Richard Biener wrote:
> On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > This recognises patterns of the form:
> > while (n) { n >>= 1 }
> >
> > This patch results in improved (but still suboptimal) codegen:
> >
> > foo (unsigned int b) {
> >     int c = 0;
> >
> >     while (b) {
> >         b >>= 1;
> >         c++;
> >     }
> >
> >     return c;
> > }
> >
> > foo:
> > .LFB11:
> >         .cfi_startproc
> >         cbz     w0, .L3
> >         clz     w1, w0
> >         tst     x0, 1
> >         mov     w0, 32
> >         sub     w0, w0, w1
> >         csel    w0, w0, wzr, ne
> >         ret
> >
> > The conditional is unnecessary. phiopt could recognise a redundant csel
> > (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> > clz call, but it cannot recognise the redundancy when the input is (e.g.)
> > (32 - clz).
> >
> > I could perhaps extend this function to recognise this pattern in a later
> > patch, if this is a good place to recognise more patterns.
> >
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/94793
> >         * tree-scalar-evolution.cc (expression_expensive_p): Add checks
> >         for c[lt]z optabs.
> >         * tree-ssa-loop-niter.cc (build_cltz_expr): New.
> >         (number_of_iterations_cltz_complement): New.
> >         (number_of_iterations_bitcount): Add call to the above.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * lib/target-supports.exp (check_effective_target_clz)
> >         (check_effective_target_clzl, check_effective_target_clzll)
> >         (check_effective_target_ctz, check_effective_target_ctzl)
> >         (check_effective_target_ctzll): New.
> >         * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
> >         * gcc.dg/tree-ssa/clz-complement-char.c: New test.
> >         * gcc.dg/tree-ssa/clz-complement-int.c: New test.
> >         * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
> >         * gcc.dg/tree-ssa/clz-complement-long.c: New test.
> >         * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
> >         * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
> >         * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
> >         * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
> >
> >
> > --
> >
> >

[snip test diffs]

> > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > index 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 100644
> > --- a/gcc/tree-scalar-evolution.cc
> > +++ b/gcc/tree-scalar-evolution.cc
> > @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> >          library call for popcount when backend does not have an instruction
> >          to do so.  We consider this to be expensive and generate
> >          __builtin_popcount only when backend defines it.  */
> > +      optab optab;
> >        combined_fn cfn = get_call_combined_fn (expr);
> >        switch (cfn)
> >         {
> >         CASE_CFN_POPCOUNT:
> > +         optab = popcount_optab;
> > +         goto bitcount_call;
> > +       CASE_CFN_CLZ:
> > +         optab = clz_optab;
> > +         goto bitcount_call;
> > +       CASE_CFN_CTZ:
> > +         optab = ctz_optab;
> > +bitcount_call:
> >           /* Check if opcode for popcount is available in the mode required.  */
> > -         if (optab_handler (popcount_optab,
> > +         if (optab_handler (optab,
> >                              TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
> >               == CODE_FOR_nothing)
> >             {
> > @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> >                  instructions.  */
> >               if (is_a <scalar_int_mode> (mode, &int_mode)
> >                   && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> > -                 && (optab_handler (popcount_optab, word_mode)
> > +                 && (optab_handler (optab, word_mode)
> >                       != CODE_FOR_nothing))
> >                   break;
> >               return true;
> > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> > index fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 100644
> > --- a/gcc/tree-ssa-loop-niter.cc
> > +++ b/gcc/tree-ssa-loop-niter.cc
> > @@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
> >    return true;
> >  }
> >
> > +/* Return an expression that counts the leading/trailing zeroes of src.  */
> 
> Can you expand the comment on how you handle defined_at_zero and how
> that relates to the C[LT]Z_DEFINED_VALUE_AT_ZERO target macros?
> The loop examples you gave above all have a defined value for zero, I'm
> not sure how you'd write a C loop which has that undefined.
> 

How about:

/* Return an expression that counts the leading/trailing zeroes of src.

   If defined_at_zero is true, then the built expression uses a conditional
   expression to return the precision of src when src == 0.
   Otherwise, we can elide the conditional expression and let src = 0 invoke
   undefined behaviour.  */


> > +static tree
> > +build_cltz_expr (tree src, bool leading, bool defined_at_zero)
> > +{
> > +  tree fn;
> > +  int prec = TYPE_PRECISION (TREE_TYPE (src));
> > +  int i_prec = TYPE_PRECISION (integer_type_node);
> > +  int li_prec = TYPE_PRECISION (long_integer_type_node);
> > +  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> > +  if (prec <= i_prec)
> 
> I don't think we can use <= for both CLZ and CTZ, no?  You probably
> need a GIMPLE testcase or a language that doesn't promote char/short
> to int for a testcase that fails though.

I don't see an issue here.  I've ensured that src is the mode used in
the iterator, and if a longer mode is used for the __builtin_clz
call then I account for the difference below...

> > +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
> > +                : builtin_decl_implicit (BUILT_IN_CTZ);
> > +  else if (prec == li_prec)
> > +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
> > +                : builtin_decl_implicit (BUILT_IN_CTZL);
> > +  else if (prec == lli_prec || prec == 2 * lli_prec)
> > +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
> > +                : builtin_decl_implicit (BUILT_IN_CTZLL);
> > +  else
> > +    return NULL_TREE;
> > +
> > +  tree utype = unsigned_type_for (TREE_TYPE (src));
> > +  src = fold_convert (utype, src);
> > +  if (prec < i_prec)
> > +    src = fold_convert (unsigned_type_node, src);
> > +
> > +  tree call;
> > +  if (prec == 2 * lli_prec)
> > +    {
> > +      tree src1 = fold_convert (long_long_unsigned_type_node,
> > +                               fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> > +                                            unshare_expr (src),
> > +                                            build_int_cst (integer_type_node,
> > +                                                           lli_prec)));
> > +      tree src2 = fold_convert (long_long_unsigned_type_node, src);
> > +      /* We count the zeroes in src1, and add the number in src2 when src1
> > +        is 0.  */
> > +      if (!leading)
> > +       std::swap(src1, src2);
> > +      tree call1 = build_call_expr (fn, 1, src1);
> > +      tree call2 = build_call_expr (fn, 1, src2);
> > +      if (defined_at_zero)
> > +       {
> > +         tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
> > +                                      build_zero_cst (TREE_TYPE (src2)));
> > +         call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
> > +                             build_int_cst (integer_type_node, lli_prec));
> > +       }
> > +      tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
> > +                                  build_zero_cst (TREE_TYPE (src1)));
> > +      call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
> > +                        fold_build2 (PLUS_EXPR, integer_type_node, call2,
> > +                                     build_int_cst (integer_type_node,
> > +                                                    lli_prec)));
> > +    }
> > +  else
> > +    {
> > +      call = build_call_expr (fn, 1, src);
> > +      if (defined_at_zero)
> > +       {
> > +         tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> > +                                     build_zero_cst (TREE_TYPE (src)));
> > +         call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> > +                            build_int_cst (integer_type_node, prec));
> > +       }
> > +    }
> > +

...with this code:

> > +  if (leading && prec < i_prec)
> > +    call = fold_build2(MINUS_EXPR, integer_type_node, call,
> > +                      build_int_cst (integer_type_node,
> > +                                     i_prec - prec));
> > +
> > +  return call;
> > +}
> > +
> > +/* See comment below for number_of_iterations_bitcount.
> > +   For c[lt]z complement, we have:
> > +
> > +   modify:
> > +   iv_2 = iv_1 >> 1 OR iv_1 << 1
> > +
> > +   test:
> > +   if (iv != 0)
> > +
> > +   modification count:
> > +   src precision - c[lt]z (src)
> > +
> > + */
> > +
> > +static bool
> > +number_of_iterations_cltz_complement (loop_p loop, edge exit,
> > +                              enum tree_code code,
> > +                              class tree_niter_desc *niter)
> > +{
> > +  bool modify_before_test = true;
> > +  HOST_WIDE_INT max;
> > +
> > +  /* Check that condition for staying inside the loop is like
> > +     if (iv != 0).  */
> > +  gimple *cond_stmt = last_stmt (exit->src);
> > +  if (!cond_stmt
> > +      || gimple_code (cond_stmt) != GIMPLE_COND
> > +      || code != NE_EXPR
> > +      || !integer_zerop (gimple_cond_rhs (cond_stmt))
> > +      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> > +    return false;
> > +
> > +  tree iv_2 = gimple_cond_lhs (cond_stmt);
> > +  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > +
> > +  /* If the test comes before the iv modification, then these will actually be
> > +     iv_1 and a phi node.  */
> > +  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> > +      && gimple_bb (iv_2_stmt) == loop->header
> > +      && gimple_phi_num_args (iv_2_stmt) == 2
> > +      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> > +                                        loop_latch_edge (loop)->dest_idx))
> > +         == SSA_NAME))
> > +    {
> > +      /* iv_2 is actually one of the inputs to the phi.  */
> > +      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> > +      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > +      modify_before_test = false;
> > +    }
> > +
> > +  /* Make sure iv_2_stmt is a logical shift by one stmt:
> > +     iv_2 = iv_1 {>>|<<} 1  */
> > +  if (!is_gimple_assign (iv_2_stmt)
> > +      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> > +         && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> > +             || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> > +      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> > +    return false;
> > +
> > +  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> > +
> > +  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> > +
> > +  /* Check the recurrence.  */
> > +  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> > +  if (gimple_code (phi) != GIMPLE_PHI
> > +      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> > +      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> > +    return false;
> > +
> > +  /* We found a match.  */
> > +  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> > +  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> > +
> > +  /* Get the corresponding c[lt]z builtin.  */
> > +  tree expr = build_cltz_expr (src, !left_shift, true);
> 
> So we always have defined_at_zero == true?
 
In the patch: yes.  The next patch uses defined_at_zero == false, and I
don't think there's any point in submitting a simpler build_cltz_expr
for just this patch.

> > +
> > +  if (!expr)
> > +    return false;
> > +
> > +  expr = fold_build2 (MINUS_EXPR, integer_type_node,
> > +                     build_int_cst (integer_type_node, src_precision),
> > +                     expr);
> > +
> > +  max = src_precision;
> > +
> > +  tree may_be_zero = boolean_false_node;
> > +
> > +  if (modify_before_test)
> > +    {
> > +      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> > +                         integer_one_node);
> > +      max = max - 1;
> > +      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> > +                                     build_zero_cst (TREE_TYPE (src)));
> > +    }
> > +
> > +  expr = fold_convert (unsigned_type_node, expr);
> > +
> > +  niter->assumptions = boolean_true_node;
> > +  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> > +  niter->niter = simplify_using_initial_conditions (loop, expr);
> > +
> > +  if (TREE_CODE (niter->niter) == INTEGER_CST)
> > +    niter->max = tree_to_uhwi (niter->niter);
> > +  else
> > +    niter->max = max;
> > +
> > +  niter->bound = NULL_TREE;
> > +  niter->cmp = ERROR_MARK;
> > +  return true;
> > +}
> > +
> >  /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
> >     1. A modification to the induction variabler;.
> >     2. A test to determine whether or not to exit the loop.
> > @@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
> >                                enum tree_code code,
> >                                class tree_niter_desc *niter)
> >  {
> > -  return number_of_iterations_popcount (loop, exit, code, niter);
> > +  return (number_of_iterations_popcount (loop, exit, code, niter)
> > +         || number_of_iterations_cltz_complement (loop, exit, code, niter));
> 
> I'm kind-of missing the non-complement variant ;)

See next patch :)

> Otherwise looks OK to me.
> 
> Thanks,
> Richard.
> 
> >  }
> >
> >  /* Substitute NEW_TREE for OLD in EXPR and fold the result.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8] middle-end: Add cltz_complement idiom recognition
  2022-11-21 15:53     ` Andrew Carlotti
@ 2022-11-24 10:41       ` Richard Biener
  2022-12-22 17:42         ` [PATCH 5/8 v2] " Andrew Carlotti
  0 siblings, 1 reply; 28+ messages in thread
From: Richard Biener @ 2022-11-24 10:41 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: gcc-patches

On Mon, Nov 21, 2022 at 4:53 PM Andrew Carlotti <andrew.carlotti@arm.com> wrote:
>
> On Mon, Nov 14, 2022 at 04:10:22PM +0100, Richard Biener wrote:
> > On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > This recognises patterns of the form:
> > > while (n) { n >>= 1 }
> > >
> > > This patch results in improved (but still suboptimal) codegen:
> > >
> > > foo (unsigned int b) {
> > >     int c = 0;
> > >
> > >     while (b) {
> > >         b >>= 1;
> > >         c++;
> > >     }
> > >
> > >     return c;
> > > }
> > >
> > > foo:
> > > .LFB11:
> > >         .cfi_startproc
> > >         cbz     w0, .L3
> > >         clz     w1, w0
> > >         tst     x0, 1
> > >         mov     w0, 32
> > >         sub     w0, w0, w1
> > >         csel    w0, w0, wzr, ne
> > >         ret
> > >
> > > The conditional is unnecessary. phiopt could recognise a redundant csel
> > > (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> > > > clz call, but it cannot recognise the redundancy when the input is (e.g.)
> > > (32 - clz).
> > >
> > > I could perhaps extend this function to recognise this pattern in a later
> > > patch, if this is a good place to recognise more patterns.
> > >
> > > gcc/ChangeLog:
> > >
> +           PR tree-optimization/94793
> > >         * tree-scalar-evolution.cc (expression_expensive_p): Add checks
> > >         for c[lt]z optabs.
> > >         * tree-ssa-loop-niter.cc (build_cltz_expr): New.
> > >         (number_of_iterations_cltz_complement): New.
> > >         (number_of_iterations_bitcount): Add call to the above.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * lib/target-supports.exp (check_effective_target_clz)
> > >         (check_effective_target_clzl, check_effective_target_clzll)
> > > >         (check_effective_target_ctz, check_effective_target_ctzl)
> > >         (check_effective_target_ctzll): New.
> > >         * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
> > >         * gcc.dg/tree-ssa/clz-complement-char.c: New test.
> > >         * gcc.dg/tree-ssa/clz-complement-int.c: New test.
> > >         * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
> > >         * gcc.dg/tree-ssa/clz-complement-long.c: New test.
> > >         * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
> > >         * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
> > >         * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
> > >         * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
> > >
> > >
> > > --
> > >
> > >
>
> [snip test diffs]
>
> > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > > index 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 100644
> > > --- a/gcc/tree-scalar-evolution.cc
> > > +++ b/gcc/tree-scalar-evolution.cc
> > > @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> > >          library call for popcount when backend does not have an instruction
> > >          to do so.  We consider this to be expensive and generate
> > >          __builtin_popcount only when backend defines it.  */
> > > +      optab optab;
> > >        combined_fn cfn = get_call_combined_fn (expr);
> > >        switch (cfn)
> > >         {
> > >         CASE_CFN_POPCOUNT:
> > > +         optab = popcount_optab;
> > > +         goto bitcount_call;
> > > +       CASE_CFN_CLZ:
> > > +         optab = clz_optab;
> > > +         goto bitcount_call;
> > > +       CASE_CFN_CTZ:
> > > +         optab = ctz_optab;
> > > +bitcount_call:
> > >           /* Check if opcode for popcount is available in the mode required.  */
> > > -         if (optab_handler (popcount_optab,
> > > +         if (optab_handler (optab,
> > >                              TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
> > >               == CODE_FOR_nothing)
> > >             {
> > > @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> > >                  instructions.  */
> > >               if (is_a <scalar_int_mode> (mode, &int_mode)
> > >                   && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> > > -                 && (optab_handler (popcount_optab, word_mode)
> > > +                 && (optab_handler (optab, word_mode)
> > >                       != CODE_FOR_nothing))
> > >                   break;
> > >               return true;
> > > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> > > index fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 100644
> > > --- a/gcc/tree-ssa-loop-niter.cc
> > > +++ b/gcc/tree-ssa-loop-niter.cc
> > > @@ -2198,6 +2198,195 @@ number_of_iterations_popcount (loop_p loop, edge exit,
> > >    return true;
> > >  }
> > >
> > > +/* Return an expression that counts the leading/trailing zeroes of src.  */
> >
> > Can you expand the comment on how you handle defined_at_zero and how
> > that relates to the C[LT]Z_DEFINED_VALUE_AT_ZERO target macros?
> > The loop examples you gave above all have a defined value for zero, I'm
> > not sure how you'd write a C loop which has that undefined.
> >
>
> How about:
>
> /* Return an expression that counts the leading/trailing zeroes of src.
>
>    If defined_at_zero is true, then the built expression uses a conditional
>    expression to return the precision of src when src == 0.
>    Otherwise, we can elide the conditional expression and let src = 0 invoke
>    undefined behaviour.  */

Ah, yes - that makes things clearer.

>
> > > +static tree
> > > +build_cltz_expr (tree src, bool leading, bool defined_at_zero)
> > > +{
> > > +  tree fn;
> > > +  int prec = TYPE_PRECISION (TREE_TYPE (src));
> > > +  int i_prec = TYPE_PRECISION (integer_type_node);
> > > +  int li_prec = TYPE_PRECISION (long_integer_type_node);
> > > +  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> > > +  if (prec <= i_prec)
> >
> > I don't think we can use <= for both CLZ and CTZ, no?  You probably
> > need a GIMPLE testcase or a language that doesn't promote char/short
> > to int for a testcase that fails though.
>
> I don't see an issue here.  I've ensured that src is the mode used in
> the iterator, and if a longer mode is used for the __builtin_clz
> call then I account for the difference below...
>
> > > +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
> > > +                : builtin_decl_implicit (BUILT_IN_CTZ);
> > > +  else if (prec == li_prec)
> > > +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
> > > +                : builtin_decl_implicit (BUILT_IN_CTZL);
> > > +  else if (prec == lli_prec || prec == 2 * lli_prec)
> > > +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
> > > +                : builtin_decl_implicit (BUILT_IN_CTZLL);
> > > +  else
> > > +    return NULL_TREE;
> > > +
> > > +  tree utype = unsigned_type_for (TREE_TYPE (src));
> > > +  src = fold_convert (utype, src);
> > > +  if (prec < i_prec)
> > > +    src = fold_convert (unsigned_type_node, src);
> > > +

but if you have an unsigned short variable you promote
that to unsigned int here, no?

> > > +  tree call;
> > > +  if (prec == 2 * lli_prec)
> > > +    {
> > > +      tree src1 = fold_convert (long_long_unsigned_type_node,
> > > +                               fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> > > +                                            unshare_expr (src),
> > > +                                            build_int_cst (integer_type_node,
> > > +                                                           lli_prec)));
> > > +      tree src2 = fold_convert (long_long_unsigned_type_node, src);
> > > +      /* We count the zeroes in src1, and add the number in src2 when src1
> > > +        is 0.  */
> > > +      if (!leading)
> > > +       std::swap(src1, src2);
> > > +      tree call1 = build_call_expr (fn, 1, src1);
> > > +      tree call2 = build_call_expr (fn, 1, src2);
> > > +      if (defined_at_zero)
> > > +       {
> > > +         tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
> > > +                                      build_zero_cst (TREE_TYPE (src2)));
> > > +         call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
> > > +                             build_int_cst (integer_type_node, lli_prec));
> > > +       }
> > > +      tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
> > > +                                  build_zero_cst (TREE_TYPE (src1)));
> > > +      call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
> > > +                        fold_build2 (PLUS_EXPR, integer_type_node, call2,
> > > +                                     build_int_cst (integer_type_node,
> > > +                                                    lli_prec)));
> > > +    }
> > > +  else
> > > +    {
> > > +      call = build_call_expr (fn, 1, src);
> > > +      if (defined_at_zero)
> > > +       {
> > > +         tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> > > +                                     build_zero_cst (TREE_TYPE (src)));
> > > +         call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> > > +                            build_int_cst (integer_type_node, prec));
> > > +       }
> > > +    }
> > > +
>
> ...with this code:
>
> > > +  if (leading && prec < i_prec)
> > > +    call = fold_build2(MINUS_EXPR, integer_type_node, call,
> > > +                      build_int_cst (integer_type_node,
> > > +                                     i_prec - prec));

... ah, OK.  Indeed, that should work.  Note we do have CTZ and CLZ
optabs and internal functions - in case there's a HImode CLZ this
could be elided.  More generally, you can avoid using the __builtin_
functions with their fixed types in favor of using IFN_C[TL]Z which
are type agnostic (but require optab support - you should be able
to check this via direct_internal_fn_supported_p).

> > > +  return call;
> > > +}
> > > +
> > > +/* See comment below for number_of_iterations_bitcount.
> > > +   For c[lt]z complement, we have:
> > > +
> > > +   modify:
> > > +   iv_2 = iv_1 >> 1 OR iv_1 << 1
> > > +
> > > +   test:
> > > +   if (iv != 0)
> > > +
> > > +   modification count:
> > > +   src precision - c[lt]z (src)
> > > +
> > > + */
> > > +
> > > +static bool
> > > +number_of_iterations_cltz_complement (loop_p loop, edge exit,
> > > +                              enum tree_code code,
> > > +                              class tree_niter_desc *niter)
> > > +{
> > > +  bool modify_before_test = true;
> > > +  HOST_WIDE_INT max;
> > > +
> > > +  /* Check that condition for staying inside the loop is like
> > > +     if (iv != 0).  */
> > > +  gimple *cond_stmt = last_stmt (exit->src);
> > > +  if (!cond_stmt
> > > +      || gimple_code (cond_stmt) != GIMPLE_COND
> > > +      || code != NE_EXPR
> > > +      || !integer_zerop (gimple_cond_rhs (cond_stmt))
> > > +      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> > > +    return false;
> > > +
> > > +  tree iv_2 = gimple_cond_lhs (cond_stmt);
> > > +  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > > +
> > > +  /* If the test comes before the iv modification, then these will actually be
> > > +     iv_1 and a phi node.  */
> > > +  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> > > +      && gimple_bb (iv_2_stmt) == loop->header
> > > +      && gimple_phi_num_args (iv_2_stmt) == 2
> > > +      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> > > +                                        loop_latch_edge (loop)->dest_idx))
> > > +         == SSA_NAME))
> > > +    {
> > > +      /* iv_2 is actually one of the inputs to the phi.  */
> > > +      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> > > +      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> > > +      modify_before_test = false;
> > > +    }
> > > +
> > > +  /* Make sure iv_2_stmt is a logical shift by one stmt:
> > > +     iv_2 = iv_1 {>>|<<} 1  */
> > > +  if (!is_gimple_assign (iv_2_stmt)
> > > +      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> > > +         && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> > > +             || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> > > +      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> > > +    return false;
> > > +
> > > +  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> > > +
> > > +  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> > > +
> > > +  /* Check the recurrence.  */
> > > +  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> > > +  if (gimple_code (phi) != GIMPLE_PHI
> > > +      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> > > +      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> > > +    return false;
> > > +
> > > +  /* We found a match.  */
> > > +  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> > > +  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> > > +
> > > +  /* Get the corresponding c[lt]z builtin.  */
> > > +  tree expr = build_cltz_expr (src, !left_shift, true);
> >
> > So we always have defined_at_zero == true?
>
> In the patch: yes.  The next patch uses defined_at_zero == false, and I
> don't think there's any point in submitting a simpler build_cltz_expr
> for just this patch.
>
> > > +
> > > +  if (!expr)
> > > +    return false;
> > > +
> > > +  expr = fold_build2 (MINUS_EXPR, integer_type_node,
> > > +                     build_int_cst (integer_type_node, src_precision),
> > > +                     expr);
> > > +
> > > +  max = src_precision;
> > > +
> > > +  tree may_be_zero = boolean_false_node;
> > > +
> > > +  if (modify_before_test)
> > > +    {
> > > +      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> > > +                         integer_one_node);
> > > +      max = max - 1;
> > > +      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> > > +                                     build_zero_cst (TREE_TYPE (src)));
> > > +    }
> > > +
> > > +  expr = fold_convert (unsigned_type_node, expr);
> > > +
> > > +  niter->assumptions = boolean_true_node;
> > > +  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> > > +  niter->niter = simplify_using_initial_conditions (loop, expr);
> > > +
> > > +  if (TREE_CODE (niter->niter) == INTEGER_CST)
> > > +    niter->max = tree_to_uhwi (niter->niter);
> > > +  else
> > > +    niter->max = max;
> > > +
> > > +  niter->bound = NULL_TREE;
> > > +  niter->cmp = ERROR_MARK;
> > > +  return true;
> > > +}
> > > +
> > >  /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
> > >     1. A modification to the induction variabler;.
> > >     2. A test to determine whether or not to exit the loop.
> > > @@ -2244,7 +2433,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
> > >                                enum tree_code code,
> > >                                class tree_niter_desc *niter)
> > >  {
> > > -  return number_of_iterations_popcount (loop, exit, code, niter);
> > > +  return (number_of_iterations_popcount (loop, exit, code, niter)
> > > +         || number_of_iterations_cltz_complement (loop, exit, code, niter));
> >
> > I'm kind-of missing the non-complement variant ;)
>
> See next patch :)
>
> > Otherwise looks OK to me.
> >
> > Thanks,
> > Richard.
> >
> > >  }
> > >
> > >  /* Substitute NEW_TREE for OLD in EXPR and fold the result.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 5/8 v2] middle-end: Add cltz_complement idiom recognition
  2022-11-24 10:41       ` Richard Biener
@ 2022-12-22 17:42         ` Andrew Carlotti
  2023-01-12 13:19           ` Richard Biener
  2023-01-19  9:19           ` Jan-Benedict Glaw
  0 siblings, 2 replies; 28+ messages in thread
From: Andrew Carlotti @ 2022-12-22 17:42 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches

On Thu, Nov 24, 2022 at 11:41:31AM +0100, Richard Biener wrote:
> Note we do have CTZ and CLZ
> optabs and internal functions - in case there's a HImode CLZ this
> could be elided.  More generally, you can avoid using the __builtin_
> functions with their fixed types in favor of using IFN_C[TL]Z which
> are type agnostic (but require optab support - you should be able
> to check this via direct_internal_fn_supported_p).

IFN support added. I've also renamed the defined_at_zero parameter to
define_at_zero, since this is a request for the expression to define it,
rather than a guarantee that it is already defined.

New patch below, bootstrapped and regression tested on
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu - ok to merge?

---

This recognises patterns of the form:
while (n) { n >>= 1 }

This patch results in improved (but still suboptimal) codegen:

foo (unsigned int b) {
    int c = 0;

    while (b) {
        b >>= 1;
        c++;
    }

    return c;
}

foo:
.LFB11:
        .cfi_startproc
        cbz     w0, .L3
        clz     w1, w0
        tst     x0, 1
        mov     w0, 32
        sub     w0, w0, w1
        csel    w0, w0, wzr, ne
        ret

The conditional is unnecessary. phiopt could recognise a redundant csel
(using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
clz call, but it cannot recognise the redundancy when the input is (e.g.)
(32 - clz).

I could perhaps extend this function to recognise this pattern in a later
patch, if this is a good place to recognise more patterns.

gcc/ChangeLog:

	PR tree-optimization/94793
	* tree-scalar-evolution.cc (expression_expensive_p): Add checks
	for c[lt]z optabs.
	* tree-ssa-loop-niter.cc (build_cltz_expr): New.
	(number_of_iterations_cltz_complement): New.
	(number_of_iterations_bitcount): Add call to the above.

gcc/testsuite/ChangeLog:

	* lib/target-supports.exp (check_effective_target_clz)
	(check_effective_target_clzl, check_effective_target_clzll)
	(check_effective_target_ctz, check_effective_target_ctzl)
	(check_effective_target_ctzll): New.
	* gcc.dg/tree-ssa/cltz-complement-max.c: New test.
	* gcc.dg/tree-ssa/clz-complement-char.c: New test.
	* gcc.dg/tree-ssa/clz-complement-int.c: New test.
	* gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
	* gcc.dg/tree-ssa/clz-complement-long.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-char.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-int.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
	* gcc.dg/tree-ssa/ctz-complement-long.c: New test.

---

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
new file mode 100644
index 0000000000000000000000000000000000000000..1a29ca52e42e50822e4e3213b2cb008b766d0318
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int clz_complement_count1 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+    if (c <= PREC)
+      return 0;
+    else
+      return 34567;
+}
+
+int clz_complement_count2 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+    if (c <= PREC - 1)
+      return 0;
+    else
+      return 76543;
+}
+
+int ctz_complement_count1 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+    if (c <= PREC)
+      return 0;
+    else
+      return 23456;
+}
+
+int ctz_complement_count2 (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+    if (c <= PREC - 1)
+      return 0;
+    else
+      return 65432;
+}
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
new file mode 100644
index 0000000000000000000000000000000000000000..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(255) != 8)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
new file mode 100644
index 0000000000000000000000000000000000000000..f2c5c23f6a7d84ecb637c6961698b0fc30d7426b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(1 << (PREC - 1)) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..7f7793f0efac1f0d793e6e99b84988e5cc5221c9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzll } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(1LL << (PREC - 1)) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..97161bb7a74260bea20e325ebab64acb33a2b696
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clzl } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+    int c = 0;
+
+    while (b) {
+	b >>= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(1L << (PREC - 1)) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
new file mode 100644
index 0000000000000000000000000000000000000000..b9afe8852d8ffbc7ee9a0760cf04b8f98af293a2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
new file mode 100644
index 0000000000000000000000000000000000000000..d2702a65daf34db66550d2255395db68a29a4797
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..1ea0d5d7d9f8be1824c4177c33edd91e66b4ddab
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzll } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long long b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
new file mode 100644
index 0000000000000000000000000000000000000000..80fb02dcfa68bc022ae69b26fb189323e01fc6fc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target ctzl } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned long b) {
+    int c = 0;
+
+    while (b) {
+	b <<= 1;
+	c++;
+    }
+
+    return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(96) != PREC - 5)
+    __builtin_abort ();
+  if (foo(35) != PREC)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 2a058c67c53466fe41b748d37ab660afd4e3403f..c745202624da672d1bdc21b8e74c1daac6ad9933 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -8701,6 +8701,72 @@ proc check_effective_target_popcount { } {
     } "" ]
 }
 
+# Return 1 if the target supports clz on int.
+
+proc check_effective_target_clz { } {
+    return [check_no_messages_and_pattern clz "!\\(call" rtl-expand {
+        int foo (int b)
+          {
+            return __builtin_clz (b);
+          }
+    } "" ]
+}
+
+# Return 1 if the target supports clz on long.
+
+proc check_effective_target_clzl { } {
+    return [check_no_messages_and_pattern clzl "!\\(call" rtl-expand {
+	int foo (long b)
+	  {
+	    return __builtin_clzl (b);
+	  }
+    } "" ]
+}
+
+# Return 1 if the target supports clz on long long.
+
+proc check_effective_target_clzll { } {
+    return [check_no_messages_and_pattern clzll "!\\(call" rtl-expand {
+        int foo (long long b)
+          {
+            return __builtin_clzll (b);
+          }
+    } "" ]
+}
+
+# Return 1 if the target supports ctz on int.
+
+proc check_effective_target_ctz { } {
+    return [check_no_messages_and_pattern ctz "!\\(call" rtl-expand {
+        int foo (int b)
+          {
+            return __builtin_ctz (b);
+          }
+    } "" ]
+}
+
+# Return 1 if the target supports ctz on long.
+
+proc check_effective_target_ctzl { } {
+    return [check_no_messages_and_pattern ctzl "!\\(call" rtl-expand {
+	int foo (long b)
+	  {
+	    return __builtin_ctzl (b);
+	  }
+    } "" ]
+}
+
+# Return 1 if the target supports ctz on long long.
+
+proc check_effective_target_ctzll { } {
+    return [check_no_messages_and_pattern ctzll "!\\(call" rtl-expand {
+        int foo (long long b)
+          {
+            return __builtin_ctzll (b);
+          }
+    } "" ]
+}
+
 # Return 1 if the target supports atomic operations on "long long"
 # and can execute them.
 #
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index f75398afb7c9fdf42e69e940e2232942143049f6..0e4e87aea34622c8ee21f5c8e29dae2d0cdd2643 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3397,12 +3397,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
 	 library call for popcount when backend does not have an instruction
 	 to do so.  We consider this to be expensive and generate
 	 __builtin_popcount only when backend defines it.  */
+      optab optab;
       combined_fn cfn = get_call_combined_fn (expr);
       switch (cfn)
 	{
 	CASE_CFN_POPCOUNT:
+	  optab = popcount_optab;
+	  goto bitcount_call;
+	CASE_CFN_CLZ:
+	  optab = clz_optab;
+	  goto bitcount_call;
+	CASE_CFN_CTZ:
+	  optab = ctz_optab;
+bitcount_call:
 	  /* Check if opcode for popcount is available in the mode required.  */
-	  if (optab_handler (popcount_optab,
+	  if (optab_handler (optab,
 			     TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
 	      == CODE_FOR_nothing)
 	    {
@@ -3415,7 +3424,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
 		 instructions.  */
 	      if (is_a <scalar_int_mode> (mode, &int_mode)
 		  && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
-		  && (optab_handler (popcount_optab, word_mode)
+		  && (optab_handler (optab, word_mode)
 		      != CODE_FOR_nothing))
 		  break;
 	      return true;
diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index fece876099c1687569d6351e7d2416ea6acae5b5..ce2441f2a6dbdf2d8fe42755d5d1abd8a631bb5c 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -42,6 +42,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-chrec.h"
 #include "tree-scalar-evolution.h"
 #include "tree-dfa.h"
+#include "internal-fn.h"
 #include "gimple-range.h"
 
 
@@ -2198,6 +2199,224 @@ number_of_iterations_popcount (loop_p loop, edge exit,
   return true;
 }
 
+/* Return an expression that counts the leading (if LEADING) or trailing
+   zeroes of SRC, or NULL_TREE if no suitable builtin or internal function
+   exists for the precision of SRC.
+
+   If DEFINE_AT_ZERO is true, the built expression evaluates to the
+   precision of SRC when SRC == 0 (via a conditional expression or a suitably
+   defined internal function); otherwise SRC == 0 is undefined behaviour.  */
+
+static tree
+build_cltz_expr (tree src, bool leading, bool define_at_zero)
+{
+  tree fn = NULL_TREE;
+  internal_fn ifn = leading ? IFN_CLZ : IFN_CTZ;
+  bool use_ifn = false;
+  int prec = TYPE_PRECISION (TREE_TYPE (src));
+  int i_prec = TYPE_PRECISION (integer_type_node);
+  int li_prec = TYPE_PRECISION (long_integer_type_node);
+  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
+
+  tree utype = unsigned_type_for (TREE_TYPE (src));
+  src = fold_convert (utype, src);
+
+  if (direct_internal_fn_supported_p (ifn, utype, OPTIMIZE_FOR_BOTH))
+    use_ifn = true;
+  else if (prec <= i_prec)
+    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
+		 : builtin_decl_implicit (BUILT_IN_CTZ);
+  else if (prec == li_prec)
+    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
+		 : builtin_decl_implicit (BUILT_IN_CTZL);
+  else if (prec == lli_prec || prec == 2 * lli_prec)
+    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
+		 : builtin_decl_implicit (BUILT_IN_CTZLL);
+  else
+    return NULL_TREE;
+
+  tree call;
+  if (use_ifn)
+    {
+      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, ifn,
+					   integer_type_node, 1, src);
+      int val;
+      scalar_int_mode mode = SCALAR_INT_TYPE_MODE (utype);
+      int optab_defined_at_zero
+	= leading ? CLZ_DEFINED_VALUE_AT_ZERO (mode, val)
+		  : CTZ_DEFINED_VALUE_AT_ZERO (mode, val);
+      if (define_at_zero && !(optab_defined_at_zero == 2 && val == prec))
+	{
+	  tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
+				      build_zero_cst (TREE_TYPE (src)));
+	  call = fold_build3 (COND_EXPR, integer_type_node, is_zero, call,
+			      build_int_cst (integer_type_node, prec));
+	}
+    }
+  else if (prec == 2 * lli_prec)
+    {
+      tree src1 = fold_convert (long_long_unsigned_type_node,
+				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
+					     unshare_expr (src),
+					     build_int_cst (integer_type_node,
+							    lli_prec)));
+      tree src2 = fold_convert (long_long_unsigned_type_node, src);
+      /* We count the zeroes in src1, and add the number in src2 when src1
+	 is 0.  */
+      if (!leading)
+	std::swap (src1, src2);
+      tree call1 = build_call_expr (fn, 1, src1);
+      tree call2 = build_call_expr (fn, 1, src2);
+      if (define_at_zero)
+	{
+	  tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
+				       build_zero_cst (TREE_TYPE (src2)));
+	  call2 = fold_build3 (COND_EXPR, integer_type_node, is_zero2, call2,
+			       build_int_cst (integer_type_node, lli_prec));
+	}
+      tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
+				   build_zero_cst (TREE_TYPE (src1)));
+      call = fold_build3 (COND_EXPR, integer_type_node, is_zero1, call1,
+			  fold_build2 (PLUS_EXPR, integer_type_node, call2,
+				       build_int_cst (integer_type_node,
+						      lli_prec)));
+    }
+  else
+    {
+      if (prec < i_prec)
+	src = fold_convert (unsigned_type_node, src);
+
+      call = build_call_expr (fn, 1, src);
+      /* The builtin counted zeroes in the widened value; remove the extra
+	 ones before selecting PREC for zero, which needs no adjustment.  */
+      if (leading && prec < i_prec)
+	call = fold_build2 (MINUS_EXPR, integer_type_node, call,
+			    build_int_cst (integer_type_node, i_prec - prec));
+      if (define_at_zero)
+	{
+	  tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
+				      build_zero_cst (TREE_TYPE (src)));
+	  call = fold_build3 (COND_EXPR, integer_type_node, is_zero, call,
+			      build_int_cst (integer_type_node, prec));
+	}
+    }
+
+  return call;
+}
+
+/* See comment below for number_of_iterations_bitcount.
+   For c[lt]z complement, we have:
+
+   modify:
+   iv_2 = iv_1 >> 1 OR iv_1 << 1
+
+   test:
+   if (iv != 0)
+
+   modification count:
+   src precision - c[lt]z (src)
+
+ */
+
+static bool
+number_of_iterations_cltz_complement (loop_p loop, edge exit,
+			       enum tree_code code,
+			       class tree_niter_desc *niter)
+{
+  bool modify_before_test = true;
+  HOST_WIDE_INT max;
+
+  /* Check that condition for staying inside the loop is like
+     if (iv != 0).  */
+  gimple *cond_stmt = last_stmt (exit->src);
+  if (!cond_stmt
+      || gimple_code (cond_stmt) != GIMPLE_COND
+      || code != NE_EXPR
+      || !integer_zerop (gimple_cond_rhs (cond_stmt))
+      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
+    return false;
+
+  tree iv_2 = gimple_cond_lhs (cond_stmt);
+  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+
+  /* If the test comes before the iv modification, then these will actually be
+     iv_1 and a phi node.  */
+  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
+      && gimple_bb (iv_2_stmt) == loop->header
+      && gimple_phi_num_args (iv_2_stmt) == 2
+      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
+					 loop_latch_edge (loop)->dest_idx))
+	  == SSA_NAME))
+    {
+      /* iv_2 is actually one of the inputs to the phi.  */
+      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
+      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
+      modify_before_test = false;
+    }
+
+  /* Make sure iv_2_stmt is a logical shift by one stmt:
+     iv_2 = iv_1 {>>|<<} 1  */
+  if (!is_gimple_assign (iv_2_stmt)
+      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
+	  && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
+	      || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
+      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
+    return false;
+
+  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
+
+  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
+
+  /* Check the recurrence.  */
+  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
+  if (gimple_code (phi) != GIMPLE_PHI
+      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
+      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
+    return false;
+
+  /* We found a match.  */
+  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
+  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
+
+  /* Get the corresponding c[lt]z builtin.  */
+  tree expr = build_cltz_expr (src, !left_shift, true);
+
+  if (!expr)
+    return false;
+
+  expr = fold_build2 (MINUS_EXPR, integer_type_node,
+		      build_int_cst (integer_type_node, src_precision),
+		      expr);
+
+  max = src_precision;
+
+  tree may_be_zero = boolean_false_node;
+
+  if (modify_before_test)
+    {
+      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
+			  integer_one_node);
+      max = max - 1;
+      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
+				      build_zero_cst (TREE_TYPE (src)));
+    }
+
+  expr = fold_convert (unsigned_type_node, expr);
+
+  niter->assumptions = boolean_true_node;
+  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
+  niter->niter = simplify_using_initial_conditions (loop, expr);
+
+  if (TREE_CODE (niter->niter) == INTEGER_CST)
+    niter->max = tree_to_uhwi (niter->niter);
+  else
+    niter->max = max;
+
+  niter->bound = NULL_TREE;
+  niter->cmp = ERROR_MARK;
+  return true;
+}
+
 /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
    1. A modification to the induction variabler;.
    2. A test to determine whether or not to exit the loop.
@@ -2244,7 +2463,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
 			       enum tree_code code,
 			       class tree_niter_desc *niter)
 {
-  return number_of_iterations_popcount (loop, exit, code, niter);
+  return (number_of_iterations_popcount (loop, exit, code, niter)
+	  || number_of_iterations_cltz_complement (loop, exit, code, niter));
 }
 
 /* Substitute NEW_TREE for OLD in EXPR and fold the result.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 6/8 v2] docs: Add popcount, clz and ctz target attributes
  2022-11-14 14:52   ` Jeff Law
@ 2022-12-22 17:42     ` Andrew Carlotti
  0 siblings, 0 replies; 28+ messages in thread
From: Andrew Carlotti @ 2022-12-22 17:42 UTC (permalink / raw)
  To: gcc-patches

Updated to reflect Sphinx revert; I'll commit this once the
cltz_complement patch is merged.

gcc/ChangeLog:

	* doc/sourcebuild.texi: Add missing target attributes.

---

diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index ffe69d6fcb9c46cf97ba570e85b56e586a0c9b99..1036b185ee289bbf7883bd14956a41da9a6d677b 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2511,6 +2511,24 @@ Target supports the execution of @code{amx-fp16} instructions.
 @item cell_hw
 Test system can execute AltiVec and Cell PPU instructions.
 
+@item clz
+Target supports a clz optab on int.
+
+@item clzl
+Target supports a clz optab on long.
+
+@item clzll
+Target supports a clz optab on long long.
+
+@item ctz
+Target supports a ctz optab on int.
+
+@item ctzl
+Target supports a ctz optab on long.
+
+@item ctzll
+Target supports a ctz optab on long long.
+
 @item cmpccxadd
 Target supports the execution of @code{cmpccxadd} instructions.
 
@@ -2532,6 +2550,15 @@ Target does not require strict alignment.
 @item pie_copyreloc
 The x86-64 target linker supports PIE with copy reloc.
 
+@item popcount
+Target supports a popcount optab on int.
+
+@item popcountl
+Target supports a popcount optab on long.
+
+@item popcountll
+Target supports a popcount optab on long long.
+
 @item prefetchi
 Target supports the execution of @code{prefetchi} instructions.
 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN
  2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
                   ` (7 preceding siblings ...)
  2022-11-11 19:07 ` [PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max Andrew Carlotti
@ 2022-12-22 17:43 ` Andrew Carlotti
  2023-01-12 13:20   ` Richard Biener
  2023-01-16 14:03   ` Andrew Carlotti
  8 siblings, 2 replies; 28+ messages in thread
From: Andrew Carlotti @ 2022-12-22 17:43 UTC (permalink / raw)
  To: gcc-patches

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
x86_64-pc-linux-gnu - ok to merge?

gcc/ChangeLog:

	* tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support.

gcc/testsuite/ChangeLog:

	* g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex.
	* gcc.dg/tree-ssa/popcount.c: Likewise.
	* gcc.dg/tree-ssa/popcount2.c: Likewise.
	* gcc.dg/tree-ssa/popcount3.c: Likewise.
	* gcc.target/aarch64/popcount4.c: Likewise.
	* gcc.target/i386/pr95771.c: Likewise, and...
	* gcc.target/i386/pr95771-2.c: ...split int128 test from above,
	since this would emit just a single IFN if a TI optab is added.

---

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
@@ -12,5 +12,5 @@ int PopCount (long b) {
     return c;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
 /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
index b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
@@ -39,4 +39,4 @@ void PopCount3 (long b1) {
       }
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
index ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
@@ -26,4 +26,4 @@ int main()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
@@ -12,5 +12,5 @@ int PopCount (long b) {
     return c;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
 /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c b/gcc/testsuite/gcc.target/aarch64/popcount4.c
index ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0 100644
--- a/gcc/testsuite/gcc.target/aarch64/popcount4.c
+++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c
@@ -11,4 +11,4 @@ int PopCount (long b) {
     return c;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c b/gcc/testsuite/gcc.target/i386/pr95771-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..1db9dc94d0b66477667624012221d6844c141a26
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95771-2.c
@@ -0,0 +1,17 @@
+/* PR tree-optimization/95771 */
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump " = __builtin_popcount| = \\.POPCOUNT" "optimized" } } */
+
+int
+corge (unsigned __int128 x)
+{
+  int i = 0;
+  while (x)
+    {
+      x &= x - 1;
+      ++i;
+    }
+  return i;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95771.c b/gcc/testsuite/gcc.target/i386/pr95771.c
index d7b67017800b705b9854f561916c20901ea76803..d41be445f4a68613a082b8956fea3ceaf33d7e0f 100644
--- a/gcc/testsuite/gcc.target/i386/pr95771.c
+++ b/gcc/testsuite/gcc.target/i386/pr95771.c
@@ -1,8 +1,7 @@
 /* PR tree-optimization/95771 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
-/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 6 "optimized" { target int128 } } } */
-/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 4 "optimized" { target { ! int128 } } } } */
+/* { dg-final { scan-tree-dump-times " = __builtin_popcount| = \\.POPCOUNT" 4 "optimized" } } */
 
 int
 foo (unsigned char x)
@@ -51,17 +50,3 @@ qux (unsigned long long x)
     }
   return i;
 }
-
-#ifdef __SIZEOF_INT128__
-int
-corge (unsigned __int128 x)
-{
-  int i = 0;
-  while (x)
-    {
-      x &= x - 1;
-      ++i;
-    }
-  return i;
-}
-#endif
diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 9c2f9f3d5f6205bb5e7f490257800c660fdd0b8d..cc53b27329f8518bc2cacef1830768a140331b31 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -2033,11 +2033,18 @@ static tree
 build_popcount_expr (tree src)
 {
   tree fn;
+  bool use_ifn = false;
   int prec = TYPE_PRECISION (TREE_TYPE (src));
   int i_prec = TYPE_PRECISION (integer_type_node);
   int li_prec = TYPE_PRECISION (long_integer_type_node);
   int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
-  if (prec <= i_prec)
+
+  tree utype = unsigned_type_for (TREE_TYPE (src));
+  src = fold_convert (utype, src);
+
+  if (direct_internal_fn_supported_p (IFN_POPCOUNT, utype, OPTIMIZE_FOR_BOTH))
+    use_ifn = true;
+  else if (prec <= i_prec)
     fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
   else if (prec == li_prec)
     fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
@@ -2046,12 +2053,11 @@ build_popcount_expr (tree src)
   else
     return NULL_TREE;
 
-  tree utype = unsigned_type_for (TREE_TYPE (src));
-  src = fold_convert (utype, src);
-  if (prec < i_prec)
-    src = fold_convert (unsigned_type_node, src);
   tree call;
-  if (prec == 2 * lli_prec)
+  if (use_ifn)
+      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_POPCOUNT,
+					   integer_type_node, 1, src);
+  else if (prec == 2 * lli_prec)
     {
       tree src1 = fold_convert (long_long_unsigned_type_node,
 				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
@@ -2064,7 +2070,12 @@ build_popcount_expr (tree src)
       call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2);
     }
   else
-    call = build_call_expr (fn, 1, src);
+    {
+      if (prec < i_prec)
+	src = fold_convert (unsigned_type_node, src);
+
+      call = build_call_expr (fn, 1, src);
+    }
 
   return call;
 }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8 v2] middle-end: Add cltz_complement idiom recognition
  2022-12-22 17:42         ` [PATCH 5/8 v2] " Andrew Carlotti
@ 2023-01-12 13:19           ` Richard Biener
  2023-01-19  9:19           ` Jan-Benedict Glaw
  1 sibling, 0 replies; 28+ messages in thread
From: Richard Biener @ 2023-01-12 13:19 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: gcc-patches

On Thu, Dec 22, 2022 at 6:42 PM Andrew Carlotti <andrew.carlotti@arm.com> wrote:
>
> On Thu, Nov 24, 2022 at 11:41:31AM +0100, Richard Biener wrote:
> > Note we do have CTZ and CLZ
> > optabs and internal functions - in case there's a HImode CLZ this
> > could be elided.  More general you can avoid using the __builtin_
> > functions with their fixed types in favor of using IFN_C[TL]Z which
> > are type agnostic (but require optab support - you should be able
> > to check this via direct_internal_fn_supported_p).
>
> IFN support added. I've also renamed the defined_at_zero parameter to
> define_at_zero, since this is a request for the expression to define it,
> rather than a guarantee that it is already defined.
>
> New patch below, bootstrapped and regression tested on
> aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu - ok to merge?

OK, and sorry for the delay.

Richard.

> ---
>
> This recognises patterns of the form:
> while (n) { n >>= 1 }
>
> This patch results in improved (but still suboptimal) codegen:
>
> foo (unsigned int b) {
>     int c = 0;
>
>     while (b) {
>         b >>= 1;
>         c++;
>     }
>
>     return c;
> }
>
> foo:
> .LFB11:
>         .cfi_startproc
>         cbz     w0, .L3
>         clz     w1, w0
>         tst     x0, 1
>         mov     w0, 32
>         sub     w0, w0, w1
>         csel    w0, w0, wzr, ne
>         ret
>
> The conditional is unnecessary. phiopt could recognise a redundant csel
> (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> clz call, but it cannot recognise the redundancy when the input is (e.g.)
> (32 - clz).
>
> I could perhaps extend this function to recognise this pattern in a later
> patch, if this is a good place to recognise more patterns.
>
> gcc/ChangeLog:
>
>         PR tree-optimization/94793
>         * tree-scalar-evolution.cc (expression_expensive_p): Add checks
>         for c[lt]z optabs.
>         * tree-ssa-loop-niter.cc (build_cltz_expr): New.
>         (number_of_iterations_cltz_complement): New.
>         (number_of_iterations_bitcount): Add call to the above.
>
> gcc/testsuite/ChangeLog:
>
>         * lib/target-supports.exp (check_effective_target_clz)
>         (check_effective_target_clzl, check_effective_target_clzll)
>         (check_effective_target_ctz, check_effective_target_ctzl)
>         (check_effective_target_ctzll): New.
>         * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-char.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-int.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
>         * gcc.dg/tree-ssa/clz-complement-long.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
>         * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
>
> ---
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1a29ca52e42e50822e4e3213b2cb008b766d0318
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
> @@ -0,0 +1,60 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int clz_complement_count1 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +    if (c <= PREC)
> +      return 0;
> +    else
> +      return 34567;
> +}
> +
> +int clz_complement_count2 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 1)
> +      return 0;
> +    else
> +      return 76543;
> +}
> +
> +int ctz_complement_count1 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +    if (c <= PREC)
> +      return 0;
> +    else
> +      return 23456;
> +}
> +
> +int ctz_complement_count2 (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +    if (c <= PREC - 1)
> +      return 0;
> +    else
> +      return 65432;
> +}
> +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(255) != 8)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f2c5c23f6a7d84ecb637c6961698b0fc30d7426b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(1 << (PREC - 1)) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..7f7793f0efac1f0d793e6e99b84988e5cc5221c9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzll } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(1LL << (PREC - 1)) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..97161bb7a74260bea20e325ebab64acb33a2b696
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target clzl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b >>= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(5) != 3)
> +    __builtin_abort ();
> +  if (foo(1L << (PREC - 1)) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b9afe8852d8ffbc7ee9a0760cf04b8f98af293a2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-char.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned char b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d2702a65daf34db66550d2255395db68a29a4797
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-int.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctz } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned int b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1ea0d5d7d9f8be1824c4177c33edd91e66b4ddab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzll } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..80fb02dcfa68bc022ae69b26fb189323e01fc6fc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ctz-complement-long.c
> @@ -0,0 +1,31 @@
> +/* { dg-do run } */
> +/* { dg-require-effective-target ctzl } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +#define PREC (__CHAR_BIT__ * __SIZEOF_LONG__)
> +
> +int
> +__attribute__ ((noinline, noclone))
> +foo (unsigned long b) {
> +    int c = 0;
> +
> +    while (b) {
> +       b <<= 1;
> +       c++;
> +    }
> +
> +    return c;
> +}
> +
> +int main()
> +{
> +  if (foo(0) != 0)
> +    __builtin_abort ();
> +  if (foo(96) != PREC - 5)
> +    __builtin_abort ();
> +  if (foo(35) != PREC)
> +    __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "__builtin_ctz|\\.CTZ" 1 "optimized" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index 2a058c67c53466fe41b748d37ab660afd4e3403f..c745202624da672d1bdc21b8e74c1daac6ad9933 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -8701,6 +8701,72 @@ proc check_effective_target_popcount { } {
>      } "" ]
>  }
>
> +# Return 1 if the target supports clz on int.
> +
> +proc check_effective_target_clz { } {
> +    return [check_no_messages_and_pattern clz "!\\(call" rtl-expand {
> +        int foo (int b)
> +          {
> +            return __builtin_clz (b);
> +          }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports clz on long.
> +
> +proc check_effective_target_clzl { } {
> +    return [check_no_messages_and_pattern clzl "!\\(call" rtl-expand {
> +       int foo (long b)
> +         {
> +           return __builtin_clzl (b);
> +         }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports clz on long long.
> +
> +proc check_effective_target_clzll { } {
> +    return [check_no_messages_and_pattern clzll "!\\(call" rtl-expand {
> +        int foo (long long b)
> +          {
> +            return __builtin_clzll (b);
> +          }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on int.
> +
> +proc check_effective_target_ctz { } {
> +    return [check_no_messages_and_pattern ctz "!\\(call" rtl-expand {
> +        int foo (int b)
> +          {
> +            return __builtin_ctz (b);
> +          }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on long.
> +
> +proc check_effective_target_ctzl { } {
> +    return [check_no_messages_and_pattern ctzl "!\\(call" rtl-expand {
> +       int foo (long b)
> +         {
> +           return __builtin_ctzl (b);
> +         }
> +    } "" ]
> +}
> +
> +# Return 1 if the target supports ctz on long long.
> +
> +proc check_effective_target_ctzll { } {
> +    return [check_no_messages_and_pattern ctzll "!\\(call" rtl-expand {
> +        int foo (long long b)
> +          {
> +            return __builtin_ctzll (b);
> +          }
> +    } "" ]
> +}
> +
>  # Return 1 if the target supports atomic operations on "long long"
>  # and can execute them.
>  #
> diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> index f75398afb7c9fdf42e69e940e2232942143049f6..0e4e87aea34622c8ee21f5c8e29dae2d0cdd2643 100644
> --- a/gcc/tree-scalar-evolution.cc
> +++ b/gcc/tree-scalar-evolution.cc
> @@ -3397,12 +3397,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
>          library call for popcount when backend does not have an instruction
>          to do so.  We consider this to be expensive and generate
>          __builtin_popcount only when backend defines it.  */
> +      optab optab;
>        combined_fn cfn = get_call_combined_fn (expr);
>        switch (cfn)
>         {
>         CASE_CFN_POPCOUNT:
> +         optab = popcount_optab;
> +         goto bitcount_call;
> +       CASE_CFN_CLZ:
> +         optab = clz_optab;
> +         goto bitcount_call;
> +       CASE_CFN_CTZ:
> +         optab = ctz_optab;
> +bitcount_call:
>           /* Check if opcode for popcount is available in the mode required.  */
> -         if (optab_handler (popcount_optab,
> +         if (optab_handler (optab,
>                              TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 0))))
>               == CODE_FOR_nothing)
>             {
> @@ -3415,7 +3424,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
>                  instructions.  */
>               if (is_a <scalar_int_mode> (mode, &int_mode)
>                   && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> -                 && (optab_handler (popcount_optab, word_mode)
> +                 && (optab_handler (optab, word_mode)
>                       != CODE_FOR_nothing))
>                   break;
>               return true;
> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index fece876099c1687569d6351e7d2416ea6acae5b5..ce2441f2a6dbdf2d8fe42755d5d1abd8a631bb5c 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-chrec.h"
>  #include "tree-scalar-evolution.h"
>  #include "tree-dfa.h"
> +#include "internal-fn.h"
>  #include "gimple-range.h"
>
>
> @@ -2198,6 +2199,224 @@ number_of_iterations_popcount (loop_p loop, edge exit,
>    return true;
>  }
>
> +/* Return an expression that counts the leading/trailing zeroes of src.
> +
> +   If define_at_zero is true, then the built expression will be defined to
> +   return the precision of src when src == 0 (using either a conditional
> +   expression or a suitable internal function).
> +   Otherwise, we can elide the conditional expression and let src = 0 invoke
> +   undefined behaviour.  */
> +
> +static tree
> +build_cltz_expr (tree src, bool leading, bool define_at_zero)
> +{
> +  tree fn;
> +  internal_fn ifn = leading ? IFN_CLZ : IFN_CTZ;
> +  bool use_ifn = false;
> +  int prec = TYPE_PRECISION (TREE_TYPE (src));
> +  int i_prec = TYPE_PRECISION (integer_type_node);
> +  int li_prec = TYPE_PRECISION (long_integer_type_node);
> +  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> +
> +  tree utype = unsigned_type_for (TREE_TYPE (src));
> +  src = fold_convert (utype, src);
> +
> +  if (direct_internal_fn_supported_p (ifn, utype, OPTIMIZE_FOR_BOTH))
> +    use_ifn = true;
> +  else if (prec <= i_prec)
> +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZ)
> +                : builtin_decl_implicit (BUILT_IN_CTZ);
> +  else if (prec == li_prec)
> +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZL)
> +                : builtin_decl_implicit (BUILT_IN_CTZL);
> +  else if (prec == lli_prec || prec == 2 * lli_prec)
> +    fn = leading ? builtin_decl_implicit (BUILT_IN_CLZLL)
> +                : builtin_decl_implicit (BUILT_IN_CTZLL);
> +  else
> +    return NULL_TREE;
> +
> +  tree call;
> +  if (use_ifn)
> +    {
> +      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, ifn,
> +                                          integer_type_node, 1, src);
> +      int val;
> +      scalar_int_mode mode = SCALAR_INT_TYPE_MODE (utype);
> +      int optab_defined_at_zero
> +       = leading ? CLZ_DEFINED_VALUE_AT_ZERO (mode, val)
> +                 : CTZ_DEFINED_VALUE_AT_ZERO (mode, val);
> +      if (define_at_zero && !(optab_defined_at_zero == 2 && val == prec))
> +       {
> +         tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> +                                     build_zero_cst (TREE_TYPE (src)));
> +         call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> +                            build_int_cst (integer_type_node, prec));
> +       }
> +    }
> +  else if (prec == 2 * lli_prec)
> +    {
> +      tree src1 = fold_convert (long_long_unsigned_type_node,
> +                               fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> +                                            unshare_expr (src),
> +                                            build_int_cst (integer_type_node,
> +                                                           lli_prec)));
> +      tree src2 = fold_convert (long_long_unsigned_type_node, src);
> +      /* We count the zeroes in src1, and add the number in src2 when src1
> +        is 0.  */
> +      if (!leading)
> +       std::swap(src1, src2);
> +      tree call1 = build_call_expr (fn, 1, src1);
> +      tree call2 = build_call_expr (fn, 1, src2);
> +      if (define_at_zero)
> +       {
> +         tree is_zero2 = fold_build2 (NE_EXPR, boolean_type_node, src2,
> +                                      build_zero_cst (TREE_TYPE (src2)));
> +         call2 = fold_build3(COND_EXPR, integer_type_node, is_zero2, call2,
> +                             build_int_cst (integer_type_node, lli_prec));
> +       }
> +      tree is_zero1 = fold_build2 (NE_EXPR, boolean_type_node, src1,
> +                                  build_zero_cst (TREE_TYPE (src1)));
> +      call = fold_build3(COND_EXPR, integer_type_node, is_zero1, call1,
> +                        fold_build2 (PLUS_EXPR, integer_type_node, call2,
> +                                     build_int_cst (integer_type_node,
> +                                                    lli_prec)));
> +    }
> +  else
> +    {
> +      if (prec < i_prec)
> +       src = fold_convert (unsigned_type_node, src);
> +
> +      call = build_call_expr (fn, 1, src);
> +      if (define_at_zero)
> +       {
> +         tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> +                                     build_zero_cst (TREE_TYPE (src)));
> +         call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> +                            build_int_cst (integer_type_node, prec));
> +       }
> +
> +      if (leading && prec < i_prec)
> +       call = fold_build2(MINUS_EXPR, integer_type_node, call,
> +                          build_int_cst (integer_type_node,
> +                                         i_prec - prec));
> +    }
> +
> +  return call;
> +}
> +
> +/* See comment below for number_of_iterations_bitcount.
> +   For c[lt]z complement, we have:
> +
> +   modify:
> +   iv_2 = iv_1 >> 1 OR iv_1 << 1
> +
> +   test:
> +   if (iv != 0)
> +
> +   modification count:
> +   src precision - c[lt]z (src)
> +
> + */
> +
> +static bool
> +number_of_iterations_cltz_complement (loop_p loop, edge exit,
> +                              enum tree_code code,
> +                              class tree_niter_desc *niter)
> +{
> +  bool modify_before_test = true;
> +  HOST_WIDE_INT max;
> +
> +  /* Check that condition for staying inside the loop is like
> +     if (iv != 0).  */
> +  gimple *cond_stmt = last_stmt (exit->src);
> +  if (!cond_stmt
> +      || gimple_code (cond_stmt) != GIMPLE_COND
> +      || code != NE_EXPR
> +      || !integer_zerop (gimple_cond_rhs (cond_stmt))
> +      || TREE_CODE (gimple_cond_lhs (cond_stmt)) != SSA_NAME)
> +    return false;
> +
> +  tree iv_2 = gimple_cond_lhs (cond_stmt);
> +  gimple *iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +
> +  /* If the test comes before the iv modification, then these will actually be
> +     iv_1 and a phi node.  */
> +  if (gimple_code (iv_2_stmt) == GIMPLE_PHI
> +      && gimple_bb (iv_2_stmt) == loop->header
> +      && gimple_phi_num_args (iv_2_stmt) == 2
> +      && (TREE_CODE (gimple_phi_arg_def (iv_2_stmt,
> +                                        loop_latch_edge (loop)->dest_idx))
> +         == SSA_NAME))
> +    {
> +      /* iv_2 is actually one of the inputs to the phi.  */
> +      iv_2 = gimple_phi_arg_def (iv_2_stmt, loop_latch_edge (loop)->dest_idx);
> +      iv_2_stmt = SSA_NAME_DEF_STMT (iv_2);
> +      modify_before_test = false;
> +    }
> +
> +  /* Make sure iv_2_stmt is a logical shift by one stmt:
> +     iv_2 = iv_1 {>>|<<} 1  */
> +  if (!is_gimple_assign (iv_2_stmt)
> +      || (gimple_assign_rhs_code (iv_2_stmt) != LSHIFT_EXPR
> +         && (gimple_assign_rhs_code (iv_2_stmt) != RSHIFT_EXPR
> +             || !TYPE_UNSIGNED (TREE_TYPE (gimple_assign_lhs (iv_2_stmt)))))
> +      || !integer_onep (gimple_assign_rhs2 (iv_2_stmt)))
> +    return false;
> +
> +  bool left_shift = (gimple_assign_rhs_code (iv_2_stmt) == LSHIFT_EXPR);
> +
> +  tree iv_1 = gimple_assign_rhs1 (iv_2_stmt);
> +
> +  /* Check the recurrence.  */
> +  gimple *phi = SSA_NAME_DEF_STMT (iv_1);
> +  if (gimple_code (phi) != GIMPLE_PHI
> +      || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
> +      || (iv_2 != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
> +    return false;
> +
> +  /* We found a match.  */
> +  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
> +  int src_precision = TYPE_PRECISION (TREE_TYPE (src));
> +
> +  /* Get the corresponding c[lt]z builtin.  */
> +  tree expr = build_cltz_expr (src, !left_shift, true);
> +
> +  if (!expr)
> +    return false;
> +
> +  expr = fold_build2 (MINUS_EXPR, integer_type_node,
> +                     build_int_cst (integer_type_node, src_precision),
> +                     expr);
> +
> +  max = src_precision;
> +
> +  tree may_be_zero = boolean_false_node;
> +
> +  if (modify_before_test)
> +    {
> +      expr = fold_build2 (MINUS_EXPR, integer_type_node, expr,
> +                         integer_one_node);
> +      max = max - 1;
> +      may_be_zero = fold_build2 (EQ_EXPR, boolean_type_node, src,
> +                                     build_zero_cst (TREE_TYPE (src)));
> +    }
> +
> +  expr = fold_convert (unsigned_type_node, expr);
> +
> +  niter->assumptions = boolean_true_node;
> +  niter->may_be_zero = simplify_using_initial_conditions (loop, may_be_zero);
> +  niter->niter = simplify_using_initial_conditions (loop, expr);
> +
> +  if (TREE_CODE (niter->niter) == INTEGER_CST)
> +    niter->max = tree_to_uhwi (niter->niter);
> +  else
> +    niter->max = max;
> +
> +  niter->bound = NULL_TREE;
> +  niter->cmp = ERROR_MARK;
> +  return true;
> +}
> +
>  /* See if LOOP contains a bit counting idiom. The idiom consists of two parts:
>     1. A modification to the induction variabler;.
>     2. A test to determine whether or not to exit the loop.
> @@ -2244,7 +2463,8 @@ number_of_iterations_bitcount (loop_p loop, edge exit,
>                                enum tree_code code,
>                                class tree_niter_desc *niter)
>  {
> -  return number_of_iterations_popcount (loop, exit, code, niter);
> +  return (number_of_iterations_popcount (loop, exit, code, niter)
> +         || number_of_iterations_cltz_complement (loop, exit, code, niter));
>  }
>
>  /* Substitute NEW_TREE for OLD in EXPR and fold the result.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN
  2022-12-22 17:43 ` [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN Andrew Carlotti
@ 2023-01-12 13:20   ` Richard Biener
  2023-01-16 14:03   ` Andrew Carlotti
  1 sibling, 0 replies; 28+ messages in thread
From: Richard Biener @ 2023-01-12 13:20 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: gcc-patches

On Thu, Dec 22, 2022 at 6:44 PM Andrew Carlotti via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
> x86_64-pc-linux-gnu - ok to merge?

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         * tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support.
>
> gcc/testsuite/ChangeLog:
>
>         * g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex.
>         * gcc.dg/tree-ssa/popcount.c: Likewise.
>         * gcc.dg/tree-ssa/popcount2.c: Likewise.
>         * gcc.dg/tree-ssa/popcount3.c: Likewise.
>         * gcc.target/aarch64/popcount4.c: Likewise.
>         * gcc.target/i386/pr95771.c: Likewise, and...
>         * gcc.target/i386/pr95771-2.c: ...split int128 test from above,
>         since this would emit just a single IFN if a TI optab is added.
>
> ---
>
> diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
> --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> @@ -12,5 +12,5 @@ int PopCount (long b) {
>      return c;
>  }
>
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
>  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> index b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> @@ -39,4 +39,4 @@ void PopCount3 (long b1) {
>        }
>  }
>
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> index ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> @@ -26,4 +26,4 @@ int main()
>    return 0;
>  }
>
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> @@ -12,5 +12,5 @@ int PopCount (long b) {
>      return c;
>  }
>
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
>  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> index ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/popcount4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> @@ -11,4 +11,4 @@ int PopCount (long b) {
>      return c;
>  }
>
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c b/gcc/testsuite/gcc.target/i386/pr95771-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1db9dc94d0b66477667624012221d6844c141a26
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr95771-2.c
> @@ -0,0 +1,17 @@
> +/* PR tree-optimization/95771 */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target int128 } */
> +/* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump " = __builtin_popcount| = \\.POPCOUNT" "optimized" } } */
> +
> +int
> +corge (unsigned __int128 x)
> +{
> +  int i = 0;
> +  while (x)
> +    {
> +      x &= x - 1;
> +      ++i;
> +    }
> +  return i;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr95771.c b/gcc/testsuite/gcc.target/i386/pr95771.c
> index d7b67017800b705b9854f561916c20901ea76803..d41be445f4a68613a082b8956fea3ceaf33d7e0f 100644
> --- a/gcc/testsuite/gcc.target/i386/pr95771.c
> +++ b/gcc/testsuite/gcc.target/i386/pr95771.c
> @@ -1,8 +1,7 @@
>  /* PR tree-optimization/95771 */
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
> -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 6 "optimized" { target int128 } } } */
> -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 4 "optimized" { target { ! int128 } } } } */
> +/* { dg-final { scan-tree-dump-times " = __builtin_popcount| = \\.POPCOUNT" 4 "optimized" } } */
>
>  int
>  foo (unsigned char x)
> @@ -51,17 +50,3 @@ qux (unsigned long long x)
>      }
>    return i;
>  }
> -
> -#ifdef __SIZEOF_INT128__
> -int
> -corge (unsigned __int128 x)
> -{
> -  int i = 0;
> -  while (x)
> -    {
> -      x &= x - 1;
> -      ++i;
> -    }
> -  return i;
> -}
> -#endif
> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index 9c2f9f3d5f6205bb5e7f490257800c660fdd0b8d..cc53b27329f8518bc2cacef1830768a140331b31 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -2033,11 +2033,18 @@ static tree
>  build_popcount_expr (tree src)
>  {
>    tree fn;
> +  bool use_ifn = false;
>    int prec = TYPE_PRECISION (TREE_TYPE (src));
>    int i_prec = TYPE_PRECISION (integer_type_node);
>    int li_prec = TYPE_PRECISION (long_integer_type_node);
>    int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> -  if (prec <= i_prec)
> +
> +  tree utype = unsigned_type_for (TREE_TYPE (src));
> +  src = fold_convert (utype, src);
> +
> +  if (direct_internal_fn_supported_p (IFN_POPCOUNT, utype, OPTIMIZE_FOR_BOTH))
> +    use_ifn = true;
> +  else if (prec <= i_prec)
>      fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>    else if (prec == li_prec)
>      fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
> @@ -2046,12 +2053,11 @@ build_popcount_expr (tree src)
>    else
>      return NULL_TREE;
>
> -  tree utype = unsigned_type_for (TREE_TYPE (src));
> -  src = fold_convert (utype, src);
> -  if (prec < i_prec)
> -    src = fold_convert (unsigned_type_node, src);
>    tree call;
> -  if (prec == 2 * lli_prec)
> +  if (use_ifn)
> +      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_POPCOUNT,
> +                                          integer_type_node, 1, src);
> +  else if (prec == 2 * lli_prec)
>      {
>        tree src1 = fold_convert (long_long_unsigned_type_node,
>                                 fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> @@ -2064,7 +2070,12 @@ build_popcount_expr (tree src)
>        call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2);
>      }
>    else
> -    call = build_call_expr (fn, 1, src);
> +    {
> +      if (prec < i_prec)
> +       src = fold_convert (unsigned_type_node, src);
> +
> +      call = build_call_expr (fn, 1, src);
> +    }
>
>    return call;
>  }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN
  2022-12-22 17:43 ` [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN Andrew Carlotti
  2023-01-12 13:20   ` Richard Biener
@ 2023-01-16 14:03   ` Andrew Carlotti
  2023-01-16 14:07     ` Andrew Carlotti
  1 sibling, 1 reply; 28+ messages in thread
From: Andrew Carlotti @ 2023-01-16 14:03 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Biener

Hi Richard

I accidentally pushed this patch earlier in the mistaken belief that
you'd already approved it. It looks uncontroversial to me - it just adds
IFN support to build_popcount_expr, analogous to the changes you
suggested and approved for build_cltz_expr (and adjusts testcases
accordingly). I might have incorporated it into an earlier patch in this
series, if I hadn't already pushed that earlier patch.

Is this OK to leave in master now?

Thanks,
Andrew

On Thu, Dec 22, 2022 at 05:43:21PM +0000, Andrew Carlotti via Gcc-patches wrote:
> Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
> x86_64-pc-linux-gnu - ok to merge?
> 
> gcc/ChangeLog:
> 
> 	* tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex.
> 	* gcc.dg/tree-ssa/popcount.c: Likewise.
> 	* gcc.dg/tree-ssa/popcount2.c: Likewise.
> 	* gcc.dg/tree-ssa/popcount3.c: Likewise.
> 	* gcc.target/aarch64/popcount4.c: Likewise.
> 	* gcc.target/i386/pr95771.c: Likewise, and...
> 	* gcc.target/i386/pr95771-2.c: ...split int128 test from above,
> 	since this would emit just a single IFN if a TI optab is added.
> 
> ---
> 
> diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
> --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> @@ -12,5 +12,5 @@ int PopCount (long b) {
>      return c;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
>  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> index b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> @@ -39,4 +39,4 @@ void PopCount3 (long b1) {
>        }
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> index ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> @@ -26,4 +26,4 @@ int main()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> @@ -12,5 +12,5 @@ int PopCount (long b) {
>      return c;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
>  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> index ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/popcount4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> @@ -11,4 +11,4 @@ int PopCount (long b) {
>      return c;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c b/gcc/testsuite/gcc.target/i386/pr95771-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1db9dc94d0b66477667624012221d6844c141a26
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr95771-2.c
> @@ -0,0 +1,17 @@
> +/* PR tree-optimization/95771 */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target int128 } */
> +/* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump " = __builtin_popcount| = \\.POPCOUNT" "optimized" } } */
> +
> +int
> +corge (unsigned __int128 x)
> +{
> +  int i = 0;
> +  while (x)
> +    {
> +      x &= x - 1;
> +      ++i;
> +    }
> +  return i;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr95771.c b/gcc/testsuite/gcc.target/i386/pr95771.c
> index d7b67017800b705b9854f561916c20901ea76803..d41be445f4a68613a082b8956fea3ceaf33d7e0f 100644
> --- a/gcc/testsuite/gcc.target/i386/pr95771.c
> +++ b/gcc/testsuite/gcc.target/i386/pr95771.c
> @@ -1,8 +1,7 @@
>  /* PR tree-optimization/95771 */
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
> -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 6 "optimized" { target int128 } } } */
> -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 4 "optimized" { target { ! int128 } } } } */
> +/* { dg-final { scan-tree-dump-times " = __builtin_popcount| = \\.POPCOUNT" 4 "optimized" } } */
>  
>  int
>  foo (unsigned char x)
> @@ -51,17 +50,3 @@ qux (unsigned long long x)
>      }
>    return i;
>  }
> -
> -#ifdef __SIZEOF_INT128__
> -int
> -corge (unsigned __int128 x)
> -{
> -  int i = 0;
> -  while (x)
> -    {
> -      x &= x - 1;
> -      ++i;
> -    }
> -  return i;
> -}
> -#endif
> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index 9c2f9f3d5f6205bb5e7f490257800c660fdd0b8d..cc53b27329f8518bc2cacef1830768a140331b31 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -2033,11 +2033,18 @@ static tree
>  build_popcount_expr (tree src)
>  {
>    tree fn;
> +  bool use_ifn = false;
>    int prec = TYPE_PRECISION (TREE_TYPE (src));
>    int i_prec = TYPE_PRECISION (integer_type_node);
>    int li_prec = TYPE_PRECISION (long_integer_type_node);
>    int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> -  if (prec <= i_prec)
> +
> +  tree utype = unsigned_type_for (TREE_TYPE (src));
> +  src = fold_convert (utype, src);
> +
> +  if (direct_internal_fn_supported_p (IFN_POPCOUNT, utype, OPTIMIZE_FOR_BOTH))
> +    use_ifn = true;
> +  else if (prec <= i_prec)
>      fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
>    else if (prec == li_prec)
>      fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
> @@ -2046,12 +2053,11 @@ build_popcount_expr (tree src)
>    else
>      return NULL_TREE;
>  
> -  tree utype = unsigned_type_for (TREE_TYPE (src));
> -  src = fold_convert (utype, src);
> -  if (prec < i_prec)
> -    src = fold_convert (unsigned_type_node, src);
>    tree call;
> -  if (prec == 2 * lli_prec)
> +  if (use_ifn)
> +      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_POPCOUNT,
> +					   integer_type_node, 1, src);
> +  else if (prec == 2 * lli_prec)
>      {
>        tree src1 = fold_convert (long_long_unsigned_type_node,
>  				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> @@ -2064,7 +2070,12 @@ build_popcount_expr (tree src)
>        call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2);
>      }
>    else
> -    call = build_call_expr (fn, 1, src);
> +    {
> +      if (prec < i_prec)
> +	src = fold_convert (unsigned_type_node, src);
> +
> +      call = build_call_expr (fn, 1, src);
> +    }
>  
>    return call;
>  }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN
  2023-01-16 14:03   ` Andrew Carlotti
@ 2023-01-16 14:07     ` Andrew Carlotti
  0 siblings, 0 replies; 28+ messages in thread
From: Andrew Carlotti @ 2023-01-16 14:07 UTC (permalink / raw)
  To: gcc-patches, Richard Biener

Erm, ignore this - I just rediscovered the approval in a different mail
folder. I forgot that Outlook's automatic email deduplication meant that
messages CC'd to me end up in one of two different folders at random
when I want them in both.


On Mon, Jan 16, 2023 at 02:03:29PM +0000, Andrew Carlotti via Gcc-patches wrote:
> Hi Richard
> 
> I accidentally pushed this patch earlier in the mistaken belief that
> you'd already approved it. It looks uncontroversial to me - it just adds
> IFN support to build_popcount_expr, analogous to the changes you
> suggested and approved for build_cltz_expr (and adjusts testcases
> accordingly). I might have incorporated it into an earlier patch in this
> series, if I hadn't already pushed that earlier patch.
> 
> Is this OK to leave in master now?
> 
> Thanks,
> Andrew
> 
> On Thu, Dec 22, 2022 at 05:43:21PM +0000, Andrew Carlotti via Gcc-patches wrote:
> > Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
> > x86_64-pc-linux-gnu - ok to merge?
> > 
> > gcc/ChangeLog:
> > 
> > 	* tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 	* g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex.
> > 	* gcc.dg/tree-ssa/popcount.c: Likewise.
> > 	* gcc.dg/tree-ssa/popcount2.c: Likewise.
> > 	* gcc.dg/tree-ssa/popcount3.c: Likewise.
> > 	* gcc.target/aarch64/popcount4.c: Likewise.
> > 	* gcc.target/i386/pr95771.c: Likewise, and...
> > 	* gcc.target/i386/pr95771-2.c: ...split int128 test from above,
> > 	since this would emit just a single IFN if a TI optab is added.
> > 
> > ---
> > 
> > diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> > index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
> > --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> > +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> > @@ -12,5 +12,5 @@ int PopCount (long b) {
> >      return c;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
> >  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> > index b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6 100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> > @@ -39,4 +39,4 @@ void PopCount3 (long b1) {
> >        }
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> > index ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be 100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> > @@ -26,4 +26,4 @@ int main()
> >    return 0;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> > index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> > @@ -12,5 +12,5 @@ int PopCount (long b) {
> >      return c;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */
> >  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> > index ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/popcount4.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> > @@ -11,4 +11,4 @@ int PopCount (long b) {
> >      return c;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c b/gcc/testsuite/gcc.target/i386/pr95771-2.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..1db9dc94d0b66477667624012221d6844c141a26
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr95771-2.c
> > @@ -0,0 +1,17 @@
> > +/* PR tree-optimization/95771 */
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target int128 } */
> > +/* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
> > +/* { dg-final { scan-tree-dump " = __builtin_popcount| = \\.POPCOUNT" "optimized" } } */
> > +
> > +int
> > +corge (unsigned __int128 x)
> > +{
> > +  int i = 0;
> > +  while (x)
> > +    {
> > +      x &= x - 1;
> > +      ++i;
> > +    }
> > +  return i;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr95771.c b/gcc/testsuite/gcc.target/i386/pr95771.c
> > index d7b67017800b705b9854f561916c20901ea76803..d41be445f4a68613a082b8956fea3ceaf33d7e0f 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr95771.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr95771.c
> > @@ -1,8 +1,7 @@
> >  /* PR tree-optimization/95771 */
> >  /* { dg-do compile } */
> >  /* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
> > -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 6 "optimized" { target int128 } } } */
> > -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 4 "optimized" { target { ! int128 } } } } */
> > +/* { dg-final { scan-tree-dump-times " = __builtin_popcount| = \\.POPCOUNT" 4 "optimized" } } */
> >  
> >  int
> >  foo (unsigned char x)
> > @@ -51,17 +50,3 @@ qux (unsigned long long x)
> >      }
> >    return i;
> >  }
> > -
> > -#ifdef __SIZEOF_INT128__
> > -int
> > -corge (unsigned __int128 x)
> > -{
> > -  int i = 0;
> > -  while (x)
> > -    {
> > -      x &= x - 1;
> > -      ++i;
> > -    }
> > -  return i;
> > -}
> > -#endif
> > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> > index 9c2f9f3d5f6205bb5e7f490257800c660fdd0b8d..cc53b27329f8518bc2cacef1830768a140331b31 100644
> > --- a/gcc/tree-ssa-loop-niter.cc
> > +++ b/gcc/tree-ssa-loop-niter.cc
> > @@ -2033,11 +2033,18 @@ static tree
> >  build_popcount_expr (tree src)
> >  {
> >    tree fn;
> > +  bool use_ifn = false;
> >    int prec = TYPE_PRECISION (TREE_TYPE (src));
> >    int i_prec = TYPE_PRECISION (integer_type_node);
> >    int li_prec = TYPE_PRECISION (long_integer_type_node);
> >    int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
> > -  if (prec <= i_prec)
> > +
> > +  tree utype = unsigned_type_for (TREE_TYPE (src));
> > +  src = fold_convert (utype, src);
> > +
> > +  if (direct_internal_fn_supported_p (IFN_POPCOUNT, utype, OPTIMIZE_FOR_BOTH))
> > +    use_ifn = true;
> > +  else if (prec <= i_prec)
> >      fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
> >    else if (prec == li_prec)
> >      fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
> > @@ -2046,12 +2053,11 @@ build_popcount_expr (tree src)
> >    else
> >      return NULL_TREE;
> >  
> > -  tree utype = unsigned_type_for (TREE_TYPE (src));
> > -  src = fold_convert (utype, src);
> > -  if (prec < i_prec)
> > -    src = fold_convert (unsigned_type_node, src);
> >    tree call;
> > -  if (prec == 2 * lli_prec)
> > +  if (use_ifn)
> > +      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, IFN_POPCOUNT,
> > +					   integer_type_node, 1, src);
> > +  else if (prec == 2 * lli_prec)
> >      {
> >        tree src1 = fold_convert (long_long_unsigned_type_node,
> >  				fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
> > @@ -2064,7 +2070,12 @@ build_popcount_expr (tree src)
> >        call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2);
> >      }
> >    else
> > -    call = build_call_expr (fn, 1, src);
> > +    {
> > +      if (prec < i_prec)
> > +	src = fold_convert (unsigned_type_node, src);
> > +
> > +      call = build_call_expr (fn, 1, src);
> > +    }
> >  
> >    return call;
> >  }

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8 v2] middle-end: Add cltz_complement idiom recognition
  2022-12-22 17:42         ` [PATCH 5/8 v2] " Andrew Carlotti
  2023-01-12 13:19           ` Richard Biener
@ 2023-01-19  9:19           ` Jan-Benedict Glaw
  2023-01-19  9:43             ` Richard Biener
  1 sibling, 1 reply; 28+ messages in thread
From: Jan-Benedict Glaw @ 2023-01-19  9:19 UTC (permalink / raw)
  To: Andrew Carlotti; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2151 bytes --]

On Thu, 2022-12-22 17:42:16 +0000, Andrew Carlotti via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> New patch below, bootstrapped and regression tested on
> aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu - ok to merge?

> diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> index fece876099c1687569d6351e7d2416ea6acae5b5..ce2441f2a6dbdf2d8fe42755d5d1abd8a631bb5c 100644
> --- a/gcc/tree-ssa-loop-niter.cc
> +++ b/gcc/tree-ssa-loop-niter.cc
> @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-chrec.h"
>  #include "tree-scalar-evolution.h"
>  #include "tree-dfa.h"
> +#include "internal-fn.h"
>  #include "gimple-range.h"
>  
>  
> @@ -2198,6 +2199,224 @@ number_of_iterations_popcount (loop_p loop, edge exit,
>    return true;
>  }
>  
> +/* Return an expression that counts the leading/trailing zeroes of src.
> +
> +   If define_at_zero is true, then the built expression will be defined to
> +   return the precision of src when src == 0 (using either a conditional
> +   expression or a suitable internal function).
> +   Otherwise, we can elide the conditional expression and let src = 0 invoke
> +   undefined behaviour.  */
> +
> +static tree
> +build_cltz_expr (tree src, bool leading, bool define_at_zero)
> +{
[...]
> +
> +  tree call;
> +  if (use_ifn)
> +    {
> +      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, ifn,
> +					   integer_type_node, 1, src);
> +      int val;
> +      scalar_int_mode mode = SCALAR_INT_TYPE_MODE (utype);
         ^^^^^^^^^^^^^^^^^^^^

This will give us a new unused variable warning.

> +      int optab_defined_at_zero
> +	= leading ? CLZ_DEFINED_VALUE_AT_ZERO (mode, val)
> +		  : CTZ_DEFINED_VALUE_AT_ZERO (mode, val);
> +      if (define_at_zero && !(optab_defined_at_zero == 2 && val == prec))
> +	{
> +	  tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> +				      build_zero_cst (TREE_TYPE (src)));
> +	  call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> +			     build_int_cst (integer_type_node, prec));
> +	}
> +    }

MfG, JBG

-- 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8 v2] middle-end: Add cltz_complement idiom recognition
  2023-01-19  9:19           ` Jan-Benedict Glaw
@ 2023-01-19  9:43             ` Richard Biener
  0 siblings, 0 replies; 28+ messages in thread
From: Richard Biener @ 2023-01-19  9:43 UTC (permalink / raw)
  To: Jan-Benedict Glaw; +Cc: Andrew Carlotti, gcc-patches

On Thu, Jan 19, 2023 at 10:19 AM Jan-Benedict Glaw <jbglaw@lug-owl.de> wrote:
>
> On Thu, 2022-12-22 17:42:16 +0000, Andrew Carlotti via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> > New patch below, bootstrapped and regression tested on
> > aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu - ok to merge?
>
> > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> > index fece876099c1687569d6351e7d2416ea6acae5b5..ce2441f2a6dbdf2d8fe42755d5d1abd8a631bb5c 100644
> > --- a/gcc/tree-ssa-loop-niter.cc
> > +++ b/gcc/tree-ssa-loop-niter.cc
> > @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "tree-chrec.h"
> >  #include "tree-scalar-evolution.h"
> >  #include "tree-dfa.h"
> > +#include "internal-fn.h"
> >  #include "gimple-range.h"
> >
> >
> > @@ -2198,6 +2199,224 @@ number_of_iterations_popcount (loop_p loop, edge exit,
> >    return true;
> >  }
> >
> > +/* Return an expression that counts the leading/trailing zeroes of src.
> > +
> > +   If define_at_zero is true, then the built expression will be defined to
> > +   return the precision of src when src == 0 (using either a conditional
> > +   expression or a suitable internal function).
> > +   Otherwise, we can elide the conditional expression and let src = 0 invoke
> > +   undefined behaviour.  */
> > +
> > +static tree
> > +build_cltz_expr (tree src, bool leading, bool define_at_zero)
> > +{
> [...]
> > +
> > +  tree call;
> > +  if (use_ifn)
> > +    {
> > +      call = build_call_expr_internal_loc (UNKNOWN_LOCATION, ifn,
> > +                                        integer_type_node, 1, src);
> > +      int val;
> > +      scalar_int_mode mode = SCALAR_INT_TYPE_MODE (utype);
>          ^^^^^^^^^^^^^^^^^^^^
>
> This will give us a new unused variable warning.

I wonder if hardening the defaults.h macros like

#define CLZ_DEFINED_VALUE_AT_ZERO(MODE, VALUE)  (((MODE), (VALUE)), 0)

fixes that and makes sense (also to avoid losing side-effects for the arguments)

Richard.


> > +      int optab_defined_at_zero
> > +     = leading ? CLZ_DEFINED_VALUE_AT_ZERO (mode, val)
> > +               : CTZ_DEFINED_VALUE_AT_ZERO (mode, val);
> > +      if (define_at_zero && !(optab_defined_at_zero == 2 && val == prec))
> > +     {
> > +       tree is_zero = fold_build2 (NE_EXPR, boolean_type_node, src,
> > +                                   build_zero_cst (TREE_TYPE (src)));
> > +       call = fold_build3(COND_EXPR, integer_type_node, is_zero, call,
> > +                          build_int_cst (integer_type_node, prec));
> > +     }
> > +    }
>
> MfG, JBG
>
> --

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2023-01-19  9:43 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-11 13:29 [PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements Andrew Carlotti
2022-11-11 13:39 ` [PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit Andrew Carlotti
2022-11-14 14:23   ` Jeff Law
2022-11-11 13:46 ` [PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount Andrew Carlotti
2022-11-14 14:24   ` Jeff Law
2022-11-11 13:52 ` [PATCH 3/8] middle-end: Refactor number_of_iterations_popcount Andrew Carlotti
2022-11-14 14:52   ` Richard Biener
2022-11-11 18:43 ` [PATCH 4/8] Modify test, to prevent the next patch breaking it Andrew Carlotti
2022-11-14 10:18   ` Richard Biener
2022-11-11 18:50 ` [PATCH 5/8] middle-end: Add cltz_complement idiom recognition Andrew Carlotti
2022-11-14 15:10   ` Richard Biener
2022-11-21 15:53     ` Andrew Carlotti
2022-11-24 10:41       ` Richard Biener
2022-12-22 17:42         ` [PATCH 5/8 v2] " Andrew Carlotti
2023-01-12 13:19           ` Richard Biener
2023-01-19  9:19           ` Jan-Benedict Glaw
2023-01-19  9:43             ` Richard Biener
2022-11-11 18:54 ` [PATCH 6/8] docs: Add popcount, clz and ctz target attributes Andrew Carlotti
2022-11-14 14:52   ` Jeff Law
2022-12-22 17:42     ` [PATCH 6/8 v2] " Andrew Carlotti
2022-11-11 19:01 ` [PATCH 7/8] middle-end: Add c[lt]z idiom recognition Andrew Carlotti
2022-11-14 15:22   ` Richard Biener
2022-11-11 19:07 ` [PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max Andrew Carlotti
2022-11-14 14:51   ` Jeff Law
2022-12-22 17:43 ` [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN Andrew Carlotti
2023-01-12 13:20   ` Richard Biener
2023-01-16 14:03   ` Andrew Carlotti
2023-01-16 14:07     ` Andrew Carlotti

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).