public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-5084] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior.
@ 2021-11-10  8:28 hongtao Liu
  0 siblings, 0 replies; only message in thread
From: hongtao Liu @ 2021-11-10  8:28 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:249b4eeef1fe30237acb4d8e1832243b39d61e7e

commit r12-5084-g249b4eeef1fe30237acb4d8e1832243b39d61e7e
Author: liuhongt <hongtao.liu@intel.com>
Date:   Mon Nov 8 15:49:17 2021 +0800

    Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior.
    
    This will enable transformation like
    
    -  # sum1_50 = PHI <prephitmp_64(13), 0(4)>
    -  # sum2_52 = PHI <sum2_21(13), 0(4)>
    +  # sum1_50 = PHI <_87(13), 0(4)>
    +  # sum2_52 = PHI <_89(13), 0(4)>
       # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
       i.2_7 = (long unsigned int) i_49;
       _8 = i.2_7 * 8;
    ...
       vec1_i_38 = vec1_29 >> _10;
       vec2_i_39 = vec2_31 >> _10;
       _11 = vec1_i_38 & 1;
    -  _63 = tmp_37 ^ sum1_50;
    -  prephitmp_64 = _11 == 0 ? sum1_50 : _63;
    +  _ifc__86 = _11 != 0 ? tmp_37 : 0;
    +  _87 = sum1_50 ^ _ifc__86;
       _12 = vec2_i_39 & 1;
    :
    
    so that the vectorizer won't fail due to
    
      /* If this isn't a nested cycle or if the nested cycle reduction value
         is used ouside of the inner loop we cannot handle uses of the reduction
         value.  */
      if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "reduction used in loop.\n");
          return NULL;
        }
    
    gcc/ChangeLog:
    
            PR tree-optimization/103126
            * tree-vect-loop.c (neutral_op_for_reduction): Remove static.
            * tree-vectorizer.h (neutral_op_for_reduction): Declare.
            * tree-if-conv.c : Include tree-vectorizer.h.
            (is_cond_scalar_reduction): Handle
            BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
            (convert_scalar_cond_reduction): Ditto.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/ifcvt-reduction-logic-op.c: New test.

Diff:
---
 .../gcc.target/i386/ifcvt-reduction-logic-op.c     | 80 ++++++++++++++++++++++
 gcc/tree-if-conv.c                                 | 20 ++++--
 gcc/tree-vect-loop.c                               |  2 +-
 gcc/tree-vectorizer.h                              |  1 +
 4 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
new file mode 100644
index 00000000000..eeb822d5d43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
@@ -0,0 +1,80 @@
+/* PR tree-optimization/103126.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
+#include<stdint.h>
+
+void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+    int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = 0;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 ^= (vec1_i & 1) ? tmp : 0;
+      if (vec2_i&1) sum2 ^= tmp;
+    }
+    *ans++ ^= sum1;  n--;
+    *ans++ ^= sum2;  n--;
+  }
+}
+
+void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+    int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = 0;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 |= (vec1_i & 1) ? tmp : 0;
+      if (vec2_i&1) sum2 |= tmp;
+    }
+    *ans++ |= sum1;  n--;
+    *ans++ |= sum2;  n--;
+  }
+}
+
+void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+    int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = -1;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 &= (vec1_i & 1) ? tmp : -1;
+      if (vec2_i&1) sum2 &= tmp;
+    }
+    *ans++ &= sum1;  n--;
+    *ans++ &= sum2;  n--;
+  }
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index b165dc0c17f..e88ddc9f788 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -120,6 +120,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-sccvn.h"
 #include "tree-cfgcleanup.h"
 #include "tree-ssa-dse.h"
+#include "tree-vectorizer.h"
 
 /* Only handle PHIs with no more arguments unless we are asked to by
    simd pragma.  */
@@ -1732,7 +1733,11 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
       reduction_op = gimple_assign_rhs_code (stmt);
     }
 
-  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
+  if (reduction_op != PLUS_EXPR
+      && reduction_op != MINUS_EXPR
+      && reduction_op != BIT_IOR_EXPR
+      && reduction_op != BIT_XOR_EXPR
+      && reduction_op != BIT_AND_EXPR)
     return false;
   r_op1 = gimple_assign_rhs1 (stmt);
   r_op2 = gimple_assign_rhs2 (stmt);
@@ -1742,7 +1747,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
 
   /* Make R_OP1 to hold reduction variable.  */
   if (r_nop2 == PHI_RESULT (header_phi)
-      && reduction_op == PLUS_EXPR)
+      && commutative_tree_code (reduction_op))
     {
       std::swap (r_op1, r_op2);
       std::swap (r_nop1, r_nop2);
@@ -1811,7 +1816,8 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   tree rhs1 = gimple_assign_rhs1 (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
-  tree zero = build_zero_cst (TREE_TYPE (rhs1));
+  enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
+  tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1824,14 +1830,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
      of reduction rhs.  */
   c = fold_build_cond_expr (TREE_TYPE (rhs1),
 			    unshare_expr (cond),
-			    swap ? zero : op1,
-			    swap ? op1 : zero);
+			    swap ? op_nochange : op1,
+			    swap ? op1 : op_nochange);
 
   /* Create assignment stmt and insert it at GSI.  */
   new_assign = gimple_build_assign (tmp, c);
   gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement.  */
-  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
+  /* Build rhs for unconditional increment/decrement/logic_operation.  */
+  rhs = gimple_build (&stmts, reduction_op,
 		      TREE_TYPE (rhs1), op0, tmp);
 
   if (has_nop)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a28bb6321d7..fa4cf88ce51 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3330,7 +3330,7 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
    of the scalar elements.  If the reduction has just a single initial value
    then INITIAL_VALUE is that value, otherwise it is null.  */
 
-static tree
+tree
 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
 {
   switch (code)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b552e9dccce..51ab21896aa 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2120,6 +2120,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
 						  tree);
 
 /* In tree-vect-loop.c.  */
+extern tree neutral_op_for_reduction (tree, tree_code, tree);
 extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
 bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
 /* Used in tree-vect-loop-manip.c */


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-11-10  8:28 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-10  8:28 [gcc r12-5084] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).