* [PATCH] [pass_if_conversion] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior.
@ 2021-11-09 2:09 liuhongt
2021-11-09 10:19 ` Richard Biener
0 siblings, 1 reply; 3+ messages in thread
From: liuhongt @ 2021-11-09 2:09 UTC (permalink / raw)
To: gcc-patches
This will enable transformation like
- # sum1_50 = PHI <prephitmp_64(13), 0(4)>
- # sum2_52 = PHI <sum2_21(13), 0(4)>
+ # sum1_50 = PHI <_87(13), 0(4)>
+ # sum2_52 = PHI <_89(13), 0(4)>
# ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
i.2_7 = (long unsigned int) i_49;
_8 = i.2_7 * 8;
...
vec1_i_38 = vec1_29 >> _10;
vec2_i_39 = vec2_31 >> _10;
_11 = vec1_i_38 & 1;
- _63 = tmp_37 ^ sum1_50;
- prephitmp_64 = _11 == 0 ? sum1_50 : _63;
+ _ifc__86 = _11 != 0 ? tmp_37 : 0;
+ _87 = sum1_50 ^ _ifc__86;
_12 = vec2_i_39 & 1;
:
so that the vectorizer won't fail due to
/* If this isn't a nested cycle or if the nested cycle reduction value
is used ouside of the inner loop we cannot handle uses of the reduction
value. */
if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"reduction used in loop.\n");
return NULL;
}
Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?
gcc/ChangeLog:
PR tree-optimization/103126
* tree-if-conv.c (is_cond_scalar_reduction): Handle
BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
(convert_scalar_cond_reduction): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
---
.../i386/ifcvt-reduction-logic-op.c | 80 +++++++++++++++++++
gcc/tree-if-conv.c | 19 +++--
2 files changed, 92 insertions(+), 7 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
new file mode 100644
index 00000000000..eeb822d5d43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
@@ -0,0 +1,80 @@
+/* PR tree-optimization/103126. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
+#include<stdint.h>
+
+void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+ int64_t n)
+{
+ int64_t i;
+ uint64_t vec1, sum1;
+ uint64_t vec2, sum2;
+
+ while (n > 0) {
+ sum1 = 0;
+ vec1 = a[n];
+ sum2 = 0;
+ vec2 = b[n];
+
+ for (i = 0; i < 64; i++) {
+ uint64_t tmp = mat[i];
+ uint64_t vec1_i = (vec1 >> i);
+ uint64_t vec2_i = (vec2 >> i);
+ sum1 ^= (vec1_i & 1) ? tmp : 0;
+ if (vec2_i&1) sum2 ^= tmp;
+ }
+ *ans++ ^= sum1; n--;
+ *ans++ ^= sum2; n--;
+ }
+}
+
+void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+ int64_t n)
+{
+ int64_t i;
+ uint64_t vec1, sum1;
+ uint64_t vec2, sum2;
+
+ while (n > 0) {
+ sum1 = 0;
+ vec1 = a[n];
+ sum2 = 0;
+ vec2 = b[n];
+
+ for (i = 0; i < 64; i++) {
+ uint64_t tmp = mat[i];
+ uint64_t vec1_i = (vec1 >> i);
+ uint64_t vec2_i = (vec2 >> i);
+ sum1 |= (vec1_i & 1) ? tmp : 0;
+ if (vec2_i&1) sum2 |= tmp;
+ }
+ *ans++ |= sum1; n--;
+ *ans++ |= sum2; n--;
+ }
+}
+
+void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+ int64_t n)
+{
+ int64_t i;
+ uint64_t vec1, sum1;
+ uint64_t vec2, sum2;
+
+ while (n > 0) {
+ sum1 = -1;
+ vec1 = a[n];
+ sum2 = 0;
+ vec2 = b[n];
+
+ for (i = 0; i < 64; i++) {
+ uint64_t tmp = mat[i];
+ uint64_t vec1_i = (vec1 >> i);
+ uint64_t vec2_i = (vec2 >> i);
+ sum1 &= (vec1_i & 1) ? tmp : -1;
+ if (vec2_i&1) sum2 &= tmp;
+ }
+ *ans++ &= sum1; n--;
+ *ans++ &= sum2; n--;
+ }
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index b165dc0c17f..7df1103ff89 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -1732,7 +1732,9 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
reduction_op = gimple_assign_rhs_code (stmt);
}
- if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
+ if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR
+ && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR
+ && reduction_op != BIT_AND_EXPR)
return false;
r_op1 = gimple_assign_rhs1 (stmt);
r_op2 = gimple_assign_rhs2 (stmt);
@@ -1742,7 +1744,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
/* Make R_OP1 to hold reduction variable. */
if (r_nop2 == PHI_RESULT (header_phi)
- && reduction_op == PLUS_EXPR)
+ && commutative_tree_code (reduction_op))
{
std::swap (r_op1, r_op2);
std::swap (r_nop1, r_nop2);
@@ -1811,7 +1813,10 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
tree rhs1 = gimple_assign_rhs1 (reduc);
tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
tree c;
- tree zero = build_zero_cst (TREE_TYPE (rhs1));
+ enum tree_code reduction_op = gimple_assign_rhs_code (reduc);
+ tree op_nochange = reduction_op != BIT_AND_EXPR
+ ? build_zero_cst (TREE_TYPE (rhs1))
+ : build_minus_one_cst (TREE_TYPE (rhs1));
gimple_seq stmts = NULL;
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1824,14 +1829,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
of reduction rhs. */
c = fold_build_cond_expr (TREE_TYPE (rhs1),
unshare_expr (cond),
- swap ? zero : op1,
- swap ? op1 : zero);
+ swap ? op_nochange : op1,
+ swap ? op1 : op_nochange);
/* Create assignment stmt and insert it at GSI. */
new_assign = gimple_build_assign (tmp, c);
gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
- /* Build rhs for unconditional increment/decrement. */
- rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
+ /* Build rhs for unconditional increment/decrement/logic_operation. */
+ rhs = gimple_build (&stmts, reduction_op,
TREE_TYPE (rhs1), op0, tmp);
if (has_nop)
--
2.18.1
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] [pass_if_conversion] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior.
2021-11-09 2:09 [PATCH] [pass_if_conversion] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior liuhongt
@ 2021-11-09 10:19 ` Richard Biener
2021-11-10 7:26 ` Hongtao Liu
0 siblings, 1 reply; 3+ messages in thread
From: Richard Biener @ 2021-11-09 10:19 UTC (permalink / raw)
To: liuhongt; +Cc: GCC Patches
On Tue, Nov 9, 2021 at 3:09 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> This will enable transformation like
>
> - # sum1_50 = PHI <prephitmp_64(13), 0(4)>
> - # sum2_52 = PHI <sum2_21(13), 0(4)>
> + # sum1_50 = PHI <_87(13), 0(4)>
> + # sum2_52 = PHI <_89(13), 0(4)>
> # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
> i.2_7 = (long unsigned int) i_49;
> _8 = i.2_7 * 8;
> ...
> vec1_i_38 = vec1_29 >> _10;
> vec2_i_39 = vec2_31 >> _10;
> _11 = vec1_i_38 & 1;
> - _63 = tmp_37 ^ sum1_50;
> - prephitmp_64 = _11 == 0 ? sum1_50 : _63;
> + _ifc__86 = _11 != 0 ? tmp_37 : 0;
> + _87 = sum1_50 ^ _ifc__86;
> _12 = vec2_i_39 & 1;
> :
>
> so that the vectorizer won't fail due to
>
> /* If this isn't a nested cycle or if the nested cycle reduction value
> is used ouside of the inner loop we cannot handle uses of the reduction
> value. */
> if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
> {
> if (dump_enabled_p ())
> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> "reduction used in loop.\n");
> return NULL;
> }
>
> Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR tree-optimization/103126
> * tree-if-conv.c (is_cond_scalar_reduction): Handle
> BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
> (convert_scalar_cond_reduction): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
> ---
> .../i386/ifcvt-reduction-logic-op.c | 80 +++++++++++++++++++
> gcc/tree-if-conv.c | 19 +++--
> 2 files changed, 92 insertions(+), 7 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> new file mode 100644
> index 00000000000..eeb822d5d43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> @@ -0,0 +1,80 @@
> +/* PR tree-optimization/103126. */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
> +#include<stdint.h>
> +
> +void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> + int64_t n)
> +{
> + int64_t i;
> + uint64_t vec1, sum1;
> + uint64_t vec2, sum2;
> +
> + while (n > 0) {
> + sum1 = 0;
> + vec1 = a[n];
> + sum2 = 0;
> + vec2 = b[n];
> +
> + for (i = 0; i < 64; i++) {
> + uint64_t tmp = mat[i];
> + uint64_t vec1_i = (vec1 >> i);
> + uint64_t vec2_i = (vec2 >> i);
> + sum1 ^= (vec1_i & 1) ? tmp : 0;
> + if (vec2_i&1) sum2 ^= tmp;
> + }
> + *ans++ ^= sum1; n--;
> + *ans++ ^= sum2; n--;
> + }
> +}
> +
> +void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> + int64_t n)
> +{
> + int64_t i;
> + uint64_t vec1, sum1;
> + uint64_t vec2, sum2;
> +
> + while (n > 0) {
> + sum1 = 0;
> + vec1 = a[n];
> + sum2 = 0;
> + vec2 = b[n];
> +
> + for (i = 0; i < 64; i++) {
> + uint64_t tmp = mat[i];
> + uint64_t vec1_i = (vec1 >> i);
> + uint64_t vec2_i = (vec2 >> i);
> + sum1 |= (vec1_i & 1) ? tmp : 0;
> + if (vec2_i&1) sum2 |= tmp;
> + }
> + *ans++ |= sum1; n--;
> + *ans++ |= sum2; n--;
> + }
> +}
> +
> +void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> + int64_t n)
> +{
> + int64_t i;
> + uint64_t vec1, sum1;
> + uint64_t vec2, sum2;
> +
> + while (n > 0) {
> + sum1 = -1;
> + vec1 = a[n];
> + sum2 = 0;
> + vec2 = b[n];
> +
> + for (i = 0; i < 64; i++) {
> + uint64_t tmp = mat[i];
> + uint64_t vec1_i = (vec1 >> i);
> + uint64_t vec2_i = (vec2 >> i);
> + sum1 &= (vec1_i & 1) ? tmp : -1;
> + if (vec2_i&1) sum2 &= tmp;
> + }
> + *ans++ &= sum1; n--;
> + *ans++ &= sum2; n--;
> + }
> +}
> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> index b165dc0c17f..7df1103ff89 100644
> --- a/gcc/tree-if-conv.c
> +++ b/gcc/tree-if-conv.c
> @@ -1732,7 +1732,9 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
> reduction_op = gimple_assign_rhs_code (stmt);
> }
>
> - if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
> + if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR
> + && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR
> + && reduction_op != BIT_AND_EXPR)
Please put each && on a separate line
> return false;
> r_op1 = gimple_assign_rhs1 (stmt);
> r_op2 = gimple_assign_rhs2 (stmt);
> @@ -1742,7 +1744,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
>
> /* Make R_OP1 to hold reduction variable. */
> if (r_nop2 == PHI_RESULT (header_phi)
> - && reduction_op == PLUS_EXPR)
> + && commutative_tree_code (reduction_op))
> {
> std::swap (r_op1, r_op2);
> std::swap (r_nop1, r_nop2);
> @@ -1811,7 +1813,10 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> tree rhs1 = gimple_assign_rhs1 (reduc);
> tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
> tree c;
> - tree zero = build_zero_cst (TREE_TYPE (rhs1));
> + enum tree_code reduction_op = gimple_assign_rhs_code (reduc);
> + tree op_nochange = reduction_op != BIT_AND_EXPR
> + ? build_zero_cst (TREE_TYPE (rhs1))
> + : build_minus_one_cst (TREE_TYPE (rhs1));
maybe export neutral_op_for_reduction and use it here (supply NULL
initial_value)?
Otherwise looks OK.
Thanks,
Richard.
> gimple_seq stmts = NULL;
>
> if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1824,14 +1829,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> of reduction rhs. */
> c = fold_build_cond_expr (TREE_TYPE (rhs1),
> unshare_expr (cond),
> - swap ? zero : op1,
> - swap ? op1 : zero);
> + swap ? op_nochange : op1,
> + swap ? op1 : op_nochange);
>
> /* Create assignment stmt and insert it at GSI. */
> new_assign = gimple_build_assign (tmp, c);
> gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> - /* Build rhs for unconditional increment/decrement. */
> - rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> + /* Build rhs for unconditional increment/decrement/logic_operation. */
> + rhs = gimple_build (&stmts, reduction_op,
> TREE_TYPE (rhs1), op0, tmp);
>
> if (has_nop)
> --
> 2.18.1
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] [pass_if_conversion] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior.
2021-11-09 10:19 ` Richard Biener
@ 2021-11-10 7:26 ` Hongtao Liu
0 siblings, 0 replies; 3+ messages in thread
From: Hongtao Liu @ 2021-11-10 7:26 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, GCC Patches
[-- Attachment #1: Type: text/plain, Size: 7473 bytes --]
On Tue, Nov 9, 2021 at 6:22 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Tue, Nov 9, 2021 at 3:09 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > This will enable transformation like
> >
> > - # sum1_50 = PHI <prephitmp_64(13), 0(4)>
> > - # sum2_52 = PHI <sum2_21(13), 0(4)>
> > + # sum1_50 = PHI <_87(13), 0(4)>
> > + # sum2_52 = PHI <_89(13), 0(4)>
> > # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
> > i.2_7 = (long unsigned int) i_49;
> > _8 = i.2_7 * 8;
> > ...
> > vec1_i_38 = vec1_29 >> _10;
> > vec2_i_39 = vec2_31 >> _10;
> > _11 = vec1_i_38 & 1;
> > - _63 = tmp_37 ^ sum1_50;
> > - prephitmp_64 = _11 == 0 ? sum1_50 : _63;
> > + _ifc__86 = _11 != 0 ? tmp_37 : 0;
> > + _87 = sum1_50 ^ _ifc__86;
> > _12 = vec2_i_39 & 1;
> > :
> >
> > so that the vectorizer won't fail due to
> >
> > /* If this isn't a nested cycle or if the nested cycle reduction value
> > is used ouside of the inner loop we cannot handle uses of the reduction
> > value. */
> > if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
> > {
> > if (dump_enabled_p ())
> > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > "reduction used in loop.\n");
> > return NULL;
> > }
> >
> > Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/103126
> > * tree-if-conv.c (is_cond_scalar_reduction): Handle
> > BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
> > (convert_scalar_cond_reduction): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
> > ---
> > .../i386/ifcvt-reduction-logic-op.c | 80 +++++++++++++++++++
> > gcc/tree-if-conv.c | 19 +++--
> > 2 files changed, 92 insertions(+), 7 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> > new file mode 100644
> > index 00000000000..eeb822d5d43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> > @@ -0,0 +1,80 @@
> > +/* PR tree-optimization/103126. */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
> > +#include<stdint.h>
> > +
> > +void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > + int64_t n)
> > +{
> > + int64_t i;
> > + uint64_t vec1, sum1;
> > + uint64_t vec2, sum2;
> > +
> > + while (n > 0) {
> > + sum1 = 0;
> > + vec1 = a[n];
> > + sum2 = 0;
> > + vec2 = b[n];
> > +
> > + for (i = 0; i < 64; i++) {
> > + uint64_t tmp = mat[i];
> > + uint64_t vec1_i = (vec1 >> i);
> > + uint64_t vec2_i = (vec2 >> i);
> > + sum1 ^= (vec1_i & 1) ? tmp : 0;
> > + if (vec2_i&1) sum2 ^= tmp;
> > + }
> > + *ans++ ^= sum1; n--;
> > + *ans++ ^= sum2; n--;
> > + }
> > +}
> > +
> > +void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > + int64_t n)
> > +{
> > + int64_t i;
> > + uint64_t vec1, sum1;
> > + uint64_t vec2, sum2;
> > +
> > + while (n > 0) {
> > + sum1 = 0;
> > + vec1 = a[n];
> > + sum2 = 0;
> > + vec2 = b[n];
> > +
> > + for (i = 0; i < 64; i++) {
> > + uint64_t tmp = mat[i];
> > + uint64_t vec1_i = (vec1 >> i);
> > + uint64_t vec2_i = (vec2 >> i);
> > + sum1 |= (vec1_i & 1) ? tmp : 0;
> > + if (vec2_i&1) sum2 |= tmp;
> > + }
> > + *ans++ |= sum1; n--;
> > + *ans++ |= sum2; n--;
> > + }
> > +}
> > +
> > +void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > + int64_t n)
> > +{
> > + int64_t i;
> > + uint64_t vec1, sum1;
> > + uint64_t vec2, sum2;
> > +
> > + while (n > 0) {
> > + sum1 = -1;
> > + vec1 = a[n];
> > + sum2 = 0;
> > + vec2 = b[n];
> > +
> > + for (i = 0; i < 64; i++) {
> > + uint64_t tmp = mat[i];
> > + uint64_t vec1_i = (vec1 >> i);
> > + uint64_t vec2_i = (vec2 >> i);
> > + sum1 &= (vec1_i & 1) ? tmp : -1;
> > + if (vec2_i&1) sum2 &= tmp;
> > + }
> > + *ans++ &= sum1; n--;
> > + *ans++ &= sum2; n--;
> > + }
> > +}
> > diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> > index b165dc0c17f..7df1103ff89 100644
> > --- a/gcc/tree-if-conv.c
> > +++ b/gcc/tree-if-conv.c
> > @@ -1732,7 +1732,9 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
> > reduction_op = gimple_assign_rhs_code (stmt);
> > }
> >
> > - if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
> > + if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR
> > + && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR
> > + && reduction_op != BIT_AND_EXPR)
>
> Please put each && on a separate line
Changed.
>
> > return false;
> > r_op1 = gimple_assign_rhs1 (stmt);
> > r_op2 = gimple_assign_rhs2 (stmt);
> > @@ -1742,7 +1744,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
> >
> > /* Make R_OP1 to hold reduction variable. */
> > if (r_nop2 == PHI_RESULT (header_phi)
> > - && reduction_op == PLUS_EXPR)
> > + && commutative_tree_code (reduction_op))
> > {
> > std::swap (r_op1, r_op2);
> > std::swap (r_nop1, r_nop2);
> > @@ -1811,7 +1813,10 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> > tree rhs1 = gimple_assign_rhs1 (reduc);
> > tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
> > tree c;
> > - tree zero = build_zero_cst (TREE_TYPE (rhs1));
> > + enum tree_code reduction_op = gimple_assign_rhs_code (reduc);
> > + tree op_nochange = reduction_op != BIT_AND_EXPR
> > + ? build_zero_cst (TREE_TYPE (rhs1))
> > + : build_minus_one_cst (TREE_TYPE (rhs1));
>
> maybe export neutral_op_for_reduction and use it here (supply NULL
> initial_value)?
Changed (didn't know there was such a function).
>
> Otherwise looks OK.
>
> Thanks,
> Richard.
>
> > gimple_seq stmts = NULL;
> >
> > if (dump_file && (dump_flags & TDF_DETAILS))
> > @@ -1824,14 +1829,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> > of reduction rhs. */
> > c = fold_build_cond_expr (TREE_TYPE (rhs1),
> > unshare_expr (cond),
> > - swap ? zero : op1,
> > - swap ? op1 : zero);
> > + swap ? op_nochange : op1,
> > + swap ? op1 : op_nochange);
> >
> > /* Create assignment stmt and insert it at GSI. */
> > new_assign = gimple_build_assign (tmp, c);
> > gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> > - /* Build rhs for unconditional increment/decrement. */
> > - rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> > + /* Build rhs for unconditional increment/decrement/logic_operation. */
> > + rhs = gimple_build (&stmts, reduction_op,
> > TREE_TYPE (rhs1), op0, tmp);
> >
> > if (has_nop)
> > --
> > 2.18.1
> >
Here's the patch i'm going to check in.
--
BR,
Hongtao
[-- Attachment #2: 0001-pass_if_conversion-Extend-is_cond_scalar_reduction-t.patch --]
[-- Type: text/x-patch, Size: 7803 bytes --]
From 41b806b99bb4e7bf760bb0a4902ae426e2596fd5 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Mon, 8 Nov 2021 15:49:17 +0800
Subject: [PATCH] [pass_if_conversion] Extend is_cond_scalar_reduction to
handle bit_and/bit_xor/bit_ior.
This will enable transformation like
- # sum1_50 = PHI <prephitmp_64(13), 0(4)>
- # sum2_52 = PHI <sum2_21(13), 0(4)>
+ # sum1_50 = PHI <_87(13), 0(4)>
+ # sum2_52 = PHI <_89(13), 0(4)>
# ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
i.2_7 = (long unsigned int) i_49;
_8 = i.2_7 * 8;
...
vec1_i_38 = vec1_29 >> _10;
vec2_i_39 = vec2_31 >> _10;
_11 = vec1_i_38 & 1;
- _63 = tmp_37 ^ sum1_50;
- prephitmp_64 = _11 == 0 ? sum1_50 : _63;
+ _ifc__86 = _11 != 0 ? tmp_37 : 0;
+ _87 = sum1_50 ^ _ifc__86;
_12 = vec2_i_39 & 1;
:
so that the vectorizer won't fail due to
/* If this isn't a nested cycle or if the nested cycle reduction value
is used ouside of the inner loop we cannot handle uses of the reduction
value. */
if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"reduction used in loop.\n");
return NULL;
}
gcc/ChangeLog:
PR tree-optimization/103126
* tree-vect-loop.c (neutral_op_for_reduction): Remove static.
* tree-vectorizer.h (neutral_op_for_reduction): Declare.
* tree-if-conv.c : Include tree-vectorizer.h.
(is_cond_scalar_reduction): Handle
BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
(convert_scalar_cond_reduction): Ditto.
gcc/testsuite/ChangeLog:
* gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
---
.../i386/ifcvt-reduction-logic-op.c | 80 +++++++++++++++++++
gcc/tree-if-conv.c | 20 +++--
gcc/tree-vect-loop.c | 2 +-
gcc/tree-vectorizer.h | 1 +
4 files changed, 95 insertions(+), 8 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
new file mode 100644
index 00000000000..eeb822d5d43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
@@ -0,0 +1,80 @@
+/* PR tree-optimization/103126. */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
+#include<stdint.h>
+
+void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+ int64_t n)
+{
+ int64_t i;
+ uint64_t vec1, sum1;
+ uint64_t vec2, sum2;
+
+ while (n > 0) {
+ sum1 = 0;
+ vec1 = a[n];
+ sum2 = 0;
+ vec2 = b[n];
+
+ for (i = 0; i < 64; i++) {
+ uint64_t tmp = mat[i];
+ uint64_t vec1_i = (vec1 >> i);
+ uint64_t vec2_i = (vec2 >> i);
+ sum1 ^= (vec1_i & 1) ? tmp : 0;
+ if (vec2_i&1) sum2 ^= tmp;
+ }
+ *ans++ ^= sum1; n--;
+ *ans++ ^= sum2; n--;
+ }
+}
+
+void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+ int64_t n)
+{
+ int64_t i;
+ uint64_t vec1, sum1;
+ uint64_t vec2, sum2;
+
+ while (n > 0) {
+ sum1 = 0;
+ vec1 = a[n];
+ sum2 = 0;
+ vec2 = b[n];
+
+ for (i = 0; i < 64; i++) {
+ uint64_t tmp = mat[i];
+ uint64_t vec1_i = (vec1 >> i);
+ uint64_t vec2_i = (vec2 >> i);
+ sum1 |= (vec1_i & 1) ? tmp : 0;
+ if (vec2_i&1) sum2 |= tmp;
+ }
+ *ans++ |= sum1; n--;
+ *ans++ |= sum2; n--;
+ }
+}
+
+void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+ int64_t n)
+{
+ int64_t i;
+ uint64_t vec1, sum1;
+ uint64_t vec2, sum2;
+
+ while (n > 0) {
+ sum1 = -1;
+ vec1 = a[n];
+ sum2 = 0;
+ vec2 = b[n];
+
+ for (i = 0; i < 64; i++) {
+ uint64_t tmp = mat[i];
+ uint64_t vec1_i = (vec1 >> i);
+ uint64_t vec2_i = (vec2 >> i);
+ sum1 &= (vec1_i & 1) ? tmp : -1;
+ if (vec2_i&1) sum2 &= tmp;
+ }
+ *ans++ &= sum1; n--;
+ *ans++ &= sum2; n--;
+ }
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index b165dc0c17f..e88ddc9f788 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -120,6 +120,7 @@ along with GCC; see the file COPYING3. If not see
#include "tree-ssa-sccvn.h"
#include "tree-cfgcleanup.h"
#include "tree-ssa-dse.h"
+#include "tree-vectorizer.h"
/* Only handle PHIs with no more arguments unless we are asked to by
simd pragma. */
@@ -1732,7 +1733,11 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
reduction_op = gimple_assign_rhs_code (stmt);
}
- if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
+ if (reduction_op != PLUS_EXPR
+ && reduction_op != MINUS_EXPR
+ && reduction_op != BIT_IOR_EXPR
+ && reduction_op != BIT_XOR_EXPR
+ && reduction_op != BIT_AND_EXPR)
return false;
r_op1 = gimple_assign_rhs1 (stmt);
r_op2 = gimple_assign_rhs2 (stmt);
@@ -1742,7 +1747,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
/* Make R_OP1 to hold reduction variable. */
if (r_nop2 == PHI_RESULT (header_phi)
- && reduction_op == PLUS_EXPR)
+ && commutative_tree_code (reduction_op))
{
std::swap (r_op1, r_op2);
std::swap (r_nop1, r_nop2);
@@ -1811,7 +1816,8 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
tree rhs1 = gimple_assign_rhs1 (reduc);
tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
tree c;
- tree zero = build_zero_cst (TREE_TYPE (rhs1));
+ enum tree_code reduction_op = gimple_assign_rhs_code (reduc);
+ tree op_nochange = neutral_op_for_reduction (TREE_TYPE (rhs1), reduction_op, NULL);
gimple_seq stmts = NULL;
if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1824,14 +1830,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
of reduction rhs. */
c = fold_build_cond_expr (TREE_TYPE (rhs1),
unshare_expr (cond),
- swap ? zero : op1,
- swap ? op1 : zero);
+ swap ? op_nochange : op1,
+ swap ? op1 : op_nochange);
/* Create assignment stmt and insert it at GSI. */
new_assign = gimple_build_assign (tmp, c);
gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
- /* Build rhs for unconditional increment/decrement. */
- rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
+ /* Build rhs for unconditional increment/decrement/logic_operation. */
+ rhs = gimple_build (&stmts, reduction_op,
TREE_TYPE (rhs1), op0, tmp);
if (has_nop)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a28bb6321d7..fa4cf88ce51 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3330,7 +3330,7 @@ reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
of the scalar elements. If the reduction has just a single initial value
then INITIAL_VALUE is that value, otherwise it is null. */
-static tree
+tree
neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
{
switch (code)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b552e9dccce..51ab21896aa 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2120,6 +2120,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *,
tree);
/* In tree-vect-loop.c. */
+extern tree neutral_op_for_reduction (tree, tree_code, tree);
extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo);
bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *);
/* Used in tree-vect-loop-manip.c */
--
2.18.1
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2021-11-10 7:20 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-09 2:09 [PATCH] [pass_if_conversion] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior liuhongt
2021-11-09 10:19 ` Richard Biener
2021-11-10 7:26 ` Hongtao Liu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).