* [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction.
@ 2022-05-09 5:19 liuhongt
2022-05-11 8:45 ` Richard Biener
0 siblings, 1 reply; 6+ messages in thread
From: liuhongt @ 2022-05-09 5:19 UTC (permalink / raw)
To: gcc-patches
This patch will enable below optimization:
{
- int bit;
- long long unsigned int _1;
- long long unsigned int _2;
-
<bb 2> [local count: 46707768]:
-
- <bb 3> [local count: 1027034057]:
- # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
- # bit_13 = PHI <bit_9(3), 63(2)>
- _1 = 1 << bit_13;
- _2 = ~_1;
- tmp_8 = _2 & tmp_11;
- bit_9 = bit_13 + -3;
- if (bit_9 != -3(OVF))
- goto <bb 3>; [95.65%]
- else
- goto <bb 4>; [4.35%]
-
- <bb 4> [local count: 46707768]:
- return tmp_8;
+ tmp_12 = tmp_6(D) & 7905747460161236406;
+ return tmp_12;
}
Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?
gcc/ChangeLog:
PR middle-end/103462
* match.pd (bitwise_induction_p): New match.
* tree-scalar-evolution.c (gimple_bitwise_induction_p):
Declare.
(analyze_and_compute_bitwise_induction_effect): New function.
(enum bit_op_kind): New enum.
(final_value_replacement_loop): Enhanced to handle bitwise
induction.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr103462-1.c: New test.
* gcc.target/i386/pr103462-2.c: New test.
* gcc.target/i386/pr103462-3.c: New test.
* gcc.target/i386/pr103462-4.c: New test.
* gcc.target/i386/pr103462-5.c: New test.
* gcc.target/i386/pr103462-6.c: New test.
---
gcc/match.pd | 7 +
gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++-
8 files changed, 654 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b3..24ff5f9e6a8 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7746,3 +7746,10 @@ and,
== TYPE_UNSIGNED (TREE_TYPE (@3))))
&& single_use (@4)
&& single_use (@5))))
+
+(for bit_op (bit_and bit_ior bit_xor)
+ (match (bitwise_induction_p @0 @2 @3)
+ (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3)))
+
+(match (bitwise_induction_p @0 @2 @3)
+ (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
new file mode 100644
index 00000000000..1dc4c2acad6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned long long
+__attribute__((noipa))
+foo (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp &= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo1 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp &= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo2 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp &= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo3 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp &= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo4 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp |= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo5 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp |= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo6 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp |= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo7 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp |= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo8 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp ^= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo9 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp ^= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo10 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp ^= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo11 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp ^= (1ULL << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
new file mode 100644
index 00000000000..bc375cb78d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
@@ -0,0 +1,45 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-1.c"
+
+int main()
+{
+ unsigned long long tmp = 0x1111111111111111ULL;
+ if (foo (tmp) != 0x110110110110110ULL)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x110110110110110ULL)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0ULL)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0ULL)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffffffffffffffffULL)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffffffffffffffffULL)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x9359359359359359ULL)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x9359359359359359ULL)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
new file mode 100644
index 00000000000..4ba248a4872
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned int
+__attribute__((noipa))
+foo (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo1 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo2 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo3 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo4 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo5 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo6 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo7 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo8 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo9 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo10 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo11 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
new file mode 100644
index 00000000000..e2f4056f525
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-3.c"
+
+int main()
+{
+ unsigned int tmp = 0x11111111U;
+
+ if (foo (tmp) != 0x10110110U)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x1101101U)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0U)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0U)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffffffffU)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffffffffU)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x59359359U)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x93593593U)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0xa7ca7ca7U)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x7ca7ca7cU)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x58358358U)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x83583583U)
+ __builtin_abort ();
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
new file mode 100644
index 00000000000..1f4ffa34b48
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned short
+__attribute__((noipa))
+foo (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo1 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo2 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo3 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo4 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo5 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo6 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo7 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo8 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo9 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo10 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo11 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
new file mode 100644
index 00000000000..65426d71efe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-5.c"
+
+int main()
+{
+ unsigned short tmp = 0x1111U;
+
+ if (foo (tmp) != 0x110)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x110)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffff)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffff)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x9359)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x9359)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x8358)
+ __builtin_abort ();
+}
+
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 72ceb4001e3..9b0aec4fd09 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr)
|| expanded_size > cache.elements ());
}
+/* Match.pd function to match bitwise inductive expression.
+ .i.e.
+ _2 = 1 << _1;
+ _3 = ~_2;
+ tmp_9 = _3 & tmp_12; */
+extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
+
+/* Return the inductive expression of bitwise operation if possible,
+ otherwise returns DEF. */
+static tree
+analyze_and_compute_bitwise_induction_effect (class loop* ex_loop,
+ class loop* loop,
+ tree phidef,
+ tree def,
+ tree niter)
+{
+ tree match_op[3],inv, bitwise_scev, bitwise_res;
+ bool folded_casts = false;
+ tree type = TREE_TYPE (phidef);
+ gimple* header_phi = NULL;
+
+ /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
+
+ op2 = PHI <phidef, inv>
+ _1 = (int) bit_17;
+ _3 = 1 << _1;
+ op1 = ~_3;
+ phidef = op1 & op2; */
+ if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
+ || TREE_CODE (match_op[2]) != SSA_NAME
+ || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI
+ || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2)
+ return def;
+
+ header_phi = SSA_NAME_DEF_STMT (match_op[2]);
+ if (gimple_bb (header_phi) != loop->header)
+ return def;
+
+ if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
+ return def;
+
+ inv = gimple_phi_arg_def (header_phi, 0);
+ if (inv == phidef)
+ inv = gimple_phi_arg_def (header_phi, 1);
+
+ bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop,
+ match_op[1],
+ &folded_casts);
+
+ /* Make sure bits is in range of type precision. */
+ if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
+ || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
+ || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
+ || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
+ return def;
+
+ bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
+ if (!tree_fits_uhwi_p (bitwise_res)
+ || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type))
+ return def;
+
+enum bit_op_kind
+ {
+ INDUCTION_BIT_CLEAR,
+ INDUCTION_BIT_IOR,
+ INDUCTION_BIT_XOR,
+ INDUCTION_BIT_RESET,
+ INDUCTION_ZERO,
+ INDUCTION_ALL
+ };
+
+ enum bit_op_kind induction_kind;
+ enum tree_code code1
+ = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
+ enum tree_code code2
+ = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
+
+ /* BIT_CLEAR: A &= ~(1 << bit)
+ BIT_RESET: A ^= (1 << bit).
+ BIT_IOR: A |= (1 << bit)
+ BIT_ZERO: A &= (1 << bit)
+ BIT_ALL: A |= ~(1 << bit)
+ BIT_XOR: A ^= ~(1 << bit).
+ bit is induction variable. */
+ switch (code1)
+ {
+ case BIT_AND_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_BIT_CLEAR
+ : INDUCTION_ZERO;
+ break;
+ case BIT_IOR_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_ALL
+ : INDUCTION_BIT_IOR;
+ break;
+ case BIT_XOR_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_BIT_XOR
+ : INDUCTION_BIT_RESET;
+ break;
+ /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
+ case BIT_NOT_EXPR:
+ gcc_assert (code2 == BIT_XOR_EXPR);
+ induction_kind = INDUCTION_BIT_XOR;
+ break;
+ default:
+ gcc_unreachable();
+ }
+
+ if (induction_kind == INDUCTION_ZERO)
+ return build_zero_cst (type);
+ if (induction_kind == INDUCTION_ALL)
+ return build_all_ones_cst (type);
+
+ wide_int bits = wi::zero (TYPE_PRECISION (type));
+ HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
+ HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
+ unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter);
+
+ /* Loop tripcount should be num_iter + 1. */
+ for (unsigned i = 0; i != num_iter + 1; i++)
+ {
+ bits = wi::bit_or (bits,
+ wi::lshift (wi::one (TYPE_PRECISION (type)),
+ start));
+ start += step;
+ }
+
+ bool inverted = false;
+ switch (induction_kind)
+ {
+ case INDUCTION_BIT_CLEAR:
+ code1 = BIT_AND_EXPR;
+ inverted = true;
+ break;
+ case INDUCTION_BIT_IOR:
+ code1 = BIT_IOR_EXPR;
+ break;
+ case INDUCTION_BIT_RESET:
+ code1 = BIT_XOR_EXPR;
+ break;
+ /* A ^= ~(1 << bit) is special, when loop tripcount is even,
+ it's equal to A ^= bits, else A ^= ~bits. */
+ case INDUCTION_BIT_XOR:
+ code1 = BIT_XOR_EXPR;
+ if (num_iter % 2 == 0)
+ inverted = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (inverted)
+ bits = wi::bit_not (bits);
+ return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
+}
+
/* Do final value replacement for LOOP, return true if we did anything. */
bool
@@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop)
{
gphi *phi = psi.phi ();
tree rslt = PHI_RESULT (phi);
- tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+ tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+ tree def = phidef;
if (virtual_operand_p (def))
{
gsi_next (&psi);
@@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop)
def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
&folded_casts);
def = compute_overall_effect_of_inner_loop (ex_loop, def);
+
+ /* Handle bitwise induction expression.
+
+ .i.e.
+ for (int i = 0; i != 64; i+=3)
+ res &= ~(1UL << i);
+
+ RES can't be analyzed out by SCEV because it is not polynomially
+ expressible, but in fact final value of RES can be replaced by
+ RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
+ being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
+ if (tree_fits_uhwi_p (niter)
+ && tree_to_uhwi (niter)
+ && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U)
+ def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop,
+ phidef, def, niter);
+
if (!tree_does_not_contain_chrecs (def)
|| chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
/* Moving the computation from the loop may prolong life range
--
2.18.1
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction.
2022-05-09 5:19 [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction liuhongt
@ 2022-05-11 8:45 ` Richard Biener
2022-05-13 3:37 ` Hongtao Liu
0 siblings, 1 reply; 6+ messages in thread
From: Richard Biener @ 2022-05-11 8:45 UTC (permalink / raw)
To: liuhongt; +Cc: GCC Patches
On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> This patch will enable below optimization:
>
> {
> - int bit;
> - long long unsigned int _1;
> - long long unsigned int _2;
> -
> <bb 2> [local count: 46707768]:
> -
> - <bb 3> [local count: 1027034057]:
> - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
> - # bit_13 = PHI <bit_9(3), 63(2)>
> - _1 = 1 << bit_13;
> - _2 = ~_1;
> - tmp_8 = _2 & tmp_11;
> - bit_9 = bit_13 + -3;
> - if (bit_9 != -3(OVF))
> - goto <bb 3>; [95.65%]
> - else
> - goto <bb 4>; [4.35%]
> -
> - <bb 4> [local count: 46707768]:
> - return tmp_8;
> + tmp_12 = tmp_6(D) & 7905747460161236406;
> + return tmp_12;
>
> }
>
>
> Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR middle-end/103462
> * match.pd (bitwise_induction_p): New match.
> * tree-scalar-evolution.c (gimple_bitwise_induction_p):
> Declare.
> (analyze_and_compute_bitwise_induction_effect): New function.
> (enum bit_op_kind): New enum.
> (final_value_replacement_loop): Enhanced to handle bitwise
> induction.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr103462-1.c: New test.
> * gcc.target/i386/pr103462-2.c: New test.
> * gcc.target/i386/pr103462-3.c: New test.
> * gcc.target/i386/pr103462-4.c: New test.
> * gcc.target/i386/pr103462-5.c: New test.
> * gcc.target/i386/pr103462-6.c: New test.
> ---
> gcc/match.pd | 7 +
> gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
> gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
> gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
> gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
> gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++-
> 8 files changed, 654 insertions(+), 1 deletion(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6d691d302b3..24ff5f9e6a8 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -7746,3 +7746,10 @@ and,
> == TYPE_UNSIGNED (TREE_TYPE (@3))))
> && single_use (@4)
> && single_use (@5))))
> +
> +(for bit_op (bit_and bit_ior bit_xor)
> + (match (bitwise_induction_p @0 @2 @3)
> + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3)))
> +
> +(match (bitwise_induction_p @0 @2 @3)
> + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
> diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> new file mode 100644
> index 00000000000..1dc4c2acad6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> @@ -0,0 +1,111 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo (unsigned long long tmp)
> +{
> + for (int bit = 0; bit < 64; bit += 3)
> + tmp &= ~(1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo1 (unsigned long long tmp)
> +{
> + for (int bit = 63; bit >= 0; bit -= 3)
> + tmp &= ~(1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo2 (unsigned long long tmp)
> +{
> + for (int bit = 0; bit < 64; bit += 3)
> + tmp &= (1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo3 (unsigned long long tmp)
> +{
> + for (int bit = 63; bit >= 0; bit -= 3)
> + tmp &= (1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo4 (unsigned long long tmp)
> +{
> + for (int bit = 0; bit < 64; bit += 3)
> + tmp |= ~(1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo5 (unsigned long long tmp)
> +{
> + for (int bit = 63; bit >= 0; bit -= 3)
> + tmp |= ~(1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo6 (unsigned long long tmp)
> +{
> + for (int bit = 0; bit < 64; bit += 3)
> + tmp |= (1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo7 (unsigned long long tmp)
> +{
> + for (int bit = 63; bit >= 0; bit -= 3)
> + tmp |= (1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo8 (unsigned long long tmp)
> +{
> + for (int bit = 0; bit < 64; bit += 3)
> + tmp ^= ~(1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo9 (unsigned long long tmp)
> +{
> + for (int bit = 63; bit >= 0; bit -= 3)
> + tmp ^= ~(1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo10 (unsigned long long tmp)
> +{
> + for (int bit = 0; bit < 64; bit += 3)
> + tmp ^= (1ULL << bit);
> + return tmp;
> +}
> +
> +unsigned long long
> +__attribute__((noipa))
> +foo11 (unsigned long long tmp)
> +{
> + for (int bit = 63; bit >= 0; bit -= 3)
> + tmp ^= (1ULL << bit);
> + return tmp;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> new file mode 100644
> index 00000000000..bc375cb78d4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1" } */
> +
> +#include "pr103462-1.c"
> +
> +int main()
> +{
> + unsigned long long tmp = 0x1111111111111111ULL;
> + if (foo (tmp) != 0x110110110110110ULL)
> + __builtin_abort ();
> +
> + if (foo1 (tmp) != 0x110110110110110ULL)
> + __builtin_abort ();
> +
> + if (foo2 (tmp) != 0x0ULL)
> + __builtin_abort ();
> +
> + if (foo3 (tmp) != 0x0ULL)
> + __builtin_abort ();
> +
> + if (foo4 (tmp) != 0xffffffffffffffffULL)
> + __builtin_abort ();
> +
> + if (foo5 (tmp) != 0xffffffffffffffffULL)
> + __builtin_abort ();
> +
> + if (foo6 (tmp) != 0x9359359359359359ULL)
> + __builtin_abort ();
> +
> + if (foo7 (tmp) != 0x9359359359359359ULL)
> + __builtin_abort ();
> +
> + if (foo8 (tmp) != 0x8358358358358358ULL)
> + __builtin_abort ();
> +
> + if (foo9 (tmp) != 0x8358358358358358ULL)
> + __builtin_abort ();
> +
> + if (foo10 (tmp) != 0x8358358358358358ULL)
> + __builtin_abort ();
> +
> + if (foo11 (tmp) != 0x8358358358358358ULL)
> + __builtin_abort ();
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> new file mode 100644
> index 00000000000..4ba248a4872
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> @@ -0,0 +1,111 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> +
> +unsigned int
> +__attribute__((noipa))
> +foo (unsigned int tmp)
> +{
> + for (int bit = 0; bit < 32; bit += 3)
> + tmp &= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo1 (unsigned int tmp)
> +{
> + for (int bit = 31; bit >= 0; bit -= 3)
> + tmp &= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo2 (unsigned int tmp)
> +{
> + for (int bit = 0; bit < 32; bit += 3)
> + tmp &= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo3 (unsigned int tmp)
> +{
> + for (int bit = 31; bit >= 0; bit -= 3)
> + tmp &= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo4 (unsigned int tmp)
> +{
> + for (int bit = 0; bit < 32; bit += 3)
> + tmp |= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo5 (unsigned int tmp)
> +{
> + for (int bit = 31; bit >= 0; bit -= 3)
> + tmp |= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo6 (unsigned int tmp)
> +{
> + for (int bit = 0; bit < 32; bit += 3)
> + tmp |= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo7 (unsigned int tmp)
> +{
> + for (int bit = 31; bit >= 0; bit -= 3)
> + tmp |= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo8 (unsigned int tmp)
> +{
> + for (int bit = 0; bit < 32; bit += 3)
> + tmp ^= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo9 (unsigned int tmp)
> +{
> + for (int bit = 31; bit >= 0; bit -= 3)
> + tmp ^= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo10 (unsigned int tmp)
> +{
> + for (int bit = 0; bit < 32; bit += 3)
> + tmp ^= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned int
> +__attribute__((noipa))
> +foo11 (unsigned int tmp)
> +{
> + for (int bit = 31; bit >= 0; bit -= 3)
> + tmp ^= (1U << bit);
> + return tmp;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> new file mode 100644
> index 00000000000..e2f4056f525
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1" } */
> +
> +#include "pr103462-3.c"
> +
> +int main()
> +{
> + unsigned int tmp = 0x11111111U;
> +
> + if (foo (tmp) != 0x10110110U)
> + __builtin_abort ();
> +
> + if (foo1 (tmp) != 0x1101101U)
> + __builtin_abort ();
> +
> + if (foo2 (tmp) != 0x0U)
> + __builtin_abort ();
> +
> + if (foo3 (tmp) != 0x0U)
> + __builtin_abort ();
> +
> + if (foo4 (tmp) != 0xffffffffU)
> + __builtin_abort ();
> +
> + if (foo5 (tmp) != 0xffffffffU)
> + __builtin_abort ();
> +
> + if (foo6 (tmp) != 0x59359359U)
> + __builtin_abort ();
> +
> + if (foo7 (tmp) != 0x93593593U)
> + __builtin_abort ();
> +
> + if (foo8 (tmp) != 0xa7ca7ca7U)
> + __builtin_abort ();
> +
> + if (foo9 (tmp) != 0x7ca7ca7cU)
> + __builtin_abort ();
> +
> + if (foo10 (tmp) != 0x58358358U)
> + __builtin_abort ();
> +
> + if (foo11 (tmp) != 0x83583583U)
> + __builtin_abort ();
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> new file mode 100644
> index 00000000000..1f4ffa34b48
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> @@ -0,0 +1,111 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> +
> +unsigned short
> +__attribute__((noipa))
> +foo (unsigned short tmp)
> +{
> + for (int bit = 0; bit < 16; bit += 3)
> + tmp &= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo1 (unsigned short tmp)
> +{
> + for (int bit = 15; bit >= 0; bit -= 3)
> + tmp &= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo2 (unsigned short tmp)
> +{
> + for (int bit = 0; bit < 16; bit += 3)
> + tmp &= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo3 (unsigned short tmp)
> +{
> + for (int bit = 15; bit >= 0; bit -= 3)
> + tmp &= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo4 (unsigned short tmp)
> +{
> + for (int bit = 0; bit < 16; bit += 3)
> + tmp |= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo5 (unsigned short tmp)
> +{
> + for (int bit = 15; bit >= 0; bit -= 3)
> + tmp |= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo6 (unsigned short tmp)
> +{
> + for (int bit = 0; bit < 16; bit += 3)
> + tmp |= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo7 (unsigned short tmp)
> +{
> + for (int bit = 15; bit >= 0; bit -= 3)
> + tmp |= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo8 (unsigned short tmp)
> +{
> + for (int bit = 0; bit < 16; bit += 3)
> + tmp ^= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo9 (unsigned short tmp)
> +{
> + for (int bit = 15; bit >= 0; bit -= 3)
> + tmp ^= ~(1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo10 (unsigned short tmp)
> +{
> + for (int bit = 0; bit < 16; bit += 3)
> + tmp ^= (1U << bit);
> + return tmp;
> +}
> +
> +unsigned short
> +__attribute__((noipa))
> +foo11 (unsigned short tmp)
> +{
> + for (int bit = 15; bit >= 0; bit -= 3)
> + tmp ^= (1U << bit);
> + return tmp;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> new file mode 100644
> index 00000000000..65426d71efe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1" } */
> +
> +#include "pr103462-5.c"
> +
> +int main()
> +{
> + unsigned short tmp = 0x1111U;
> +
> + if (foo (tmp) != 0x110)
> + __builtin_abort ();
> +
> + if (foo1 (tmp) != 0x110)
> + __builtin_abort ();
> +
> + if (foo2 (tmp) != 0x0)
> + __builtin_abort ();
> +
> + if (foo3 (tmp) != 0x0)
> + __builtin_abort ();
> +
> + if (foo4 (tmp) != 0xffff)
> + __builtin_abort ();
> +
> + if (foo5 (tmp) != 0xffff)
> + __builtin_abort ();
> +
> + if (foo6 (tmp) != 0x9359)
> + __builtin_abort ();
> +
> + if (foo7 (tmp) != 0x9359)
> + __builtin_abort ();
> +
> + if (foo8 (tmp) != 0x8358)
> + __builtin_abort ();
> +
> + if (foo9 (tmp) != 0x8358)
> + __builtin_abort ();
> +
> + if (foo10 (tmp) != 0x8358)
> + __builtin_abort ();
> +
> + if (foo11 (tmp) != 0x8358)
> + __builtin_abort ();
> +}
> +
> diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> index 72ceb4001e3..9b0aec4fd09 100644
> --- a/gcc/tree-scalar-evolution.cc
> +++ b/gcc/tree-scalar-evolution.cc
> @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr)
> || expanded_size > cache.elements ());
> }
>
> +/* Match.pd function to match bitwise inductive expression.
> + .i.e.
> + _2 = 1 << _1;
> + _3 = ~_2;
> + tmp_9 = _3 & tmp_12; */
> +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
> +
> +/* Return the inductive expression of bitwise operation if possible,
> + otherwise returns DEF. */
> +static tree
> +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop,
> + class loop* loop,
> + tree phidef,
> + tree def,
> + tree niter)
> +{
> + tree match_op[3],inv, bitwise_scev, bitwise_res;
> + bool folded_casts = false;
> + tree type = TREE_TYPE (phidef);
> + gimple* header_phi = NULL;
use a gphi *
> +
> + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
> +
> + op2 = PHI <phidef, inv>
> + _1 = (int) bit_17;
> + _3 = 1 << _1;
> + op1 = ~_3;
> + phidef = op1 & op2; */
> + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
> + || TREE_CODE (match_op[2]) != SSA_NAME
> + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI
|| !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
> + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2)
and then use header_phi here, if you pass in the PHI to use you'll
always have 2 args
> + return def;
> +
> + header_phi = SSA_NAME_DEF_STMT (match_op[2]);
> + if (gimple_bb (header_phi) != loop->header)
> + return def;
I think passing in the PHI node and the exit edge instead of phidef
would make the above more clear
> + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
> + return def;
> +
> + inv = gimple_phi_arg_def (header_phi, 0);
> + if (inv == phidef)
> + inv = gimple_phi_arg_def (header_phi, 1);
inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop));
where is this used?
> +
> + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop,
> + match_op[1],
> + &folded_casts);
you want
bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
bitwise_scev = instantiate_parameters (loop, bitwise_scev);
instead since you are interested in in-loop behavior?
> +
> + /* Make sure bits is in range of type precision. */
> + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you
do not rule out pointers or vector types.
> + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
> + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
> + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
> + return def;
> +
> + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
> + if (!tree_fits_uhwi_p (bitwise_res)
> + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type))
> + return def;
Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct?
So name it bit_final?
> +
> +enum bit_op_kind
> + {
> + INDUCTION_BIT_CLEAR,
> + INDUCTION_BIT_IOR,
> + INDUCTION_BIT_XOR,
> + INDUCTION_BIT_RESET,
> + INDUCTION_ZERO,
> + INDUCTION_ALL
> + };
> +
> + enum bit_op_kind induction_kind;
> + enum tree_code code1
> + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
> + enum tree_code code2
> + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
> +
> + /* BIT_CLEAR: A &= ~(1 << bit)
> + BIT_RESET: A ^= (1 << bit).
> + BIT_IOR: A |= (1 << bit)
> + BIT_ZERO: A &= (1 << bit)
> + BIT_ALL: A |= ~(1 << bit)
> + BIT_XOR: A ^= ~(1 << bit).
> + bit is induction variable. */
> + switch (code1)
> + {
> + case BIT_AND_EXPR:
> + induction_kind = code2 == BIT_NOT_EXPR
> + ? INDUCTION_BIT_CLEAR
> + : INDUCTION_ZERO;
> + break;
> + case BIT_IOR_EXPR:
> + induction_kind = code2 == BIT_NOT_EXPR
> + ? INDUCTION_ALL
> + : INDUCTION_BIT_IOR;
> + break;
> + case BIT_XOR_EXPR:
> + induction_kind = code2 == BIT_NOT_EXPR
> + ? INDUCTION_BIT_XOR
> + : INDUCTION_BIT_RESET;
> + break;
> + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
> + case BIT_NOT_EXPR:
> + gcc_assert (code2 == BIT_XOR_EXPR);
> + induction_kind = INDUCTION_BIT_XOR;
> + break;
> + default:
> + gcc_unreachable();
> + }
> +
> + if (induction_kind == INDUCTION_ZERO)
> + return build_zero_cst (type);
is that true even when the loop doesn't iterate?
> + if (induction_kind == INDUCTION_ALL)
> + return build_all_ones_cst (type);
Likewise?
> +
> + wide_int bits = wi::zero (TYPE_PRECISION (type));
> + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
you checked fits_uhwi above, name it bit_start?
> + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
> + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter);
> +
> + /* Loop tripcount should be num_iter + 1. */
> + for (unsigned i = 0; i != num_iter + 1; i++)
> + {
> + bits = wi::bit_or (bits,
> + wi::lshift (wi::one (TYPE_PRECISION (type)),
> + start));
you can use bits = wi::set_bit (bits, start);
> + start += step;
> + }
Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that
the IV wraps and is unsigned char and you have a 256 bit integer the
above could run quite a long time. Please limit num_iter to TYPE_PRECISION
as well (do you have a testcase with a wrapping IV? try some != CST exit
condition and a large increment)
> +
> + bool inverted = false;
> + switch (induction_kind)
> + {
> + case INDUCTION_BIT_CLEAR:
> + code1 = BIT_AND_EXPR;
> + inverted = true;
> + break;
> + case INDUCTION_BIT_IOR:
> + code1 = BIT_IOR_EXPR;
> + break;
> + case INDUCTION_BIT_RESET:
> + code1 = BIT_XOR_EXPR;
> + break;
> + /* A ^= ~(1 << bit) is special, when loop tripcount is even,
> + it's equal to A ^= bits, else A ^= ~bits. */
> + case INDUCTION_BIT_XOR:
> + code1 = BIT_XOR_EXPR;
> + if (num_iter % 2 == 0)
> + inverted = true;
> + break;
> + default:
> + gcc_unreachable ();
> + }
> +
> + if (inverted)
> + bits = wi::bit_not (bits);
> + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
> +}
> +
> /* Do final value replacement for LOOP, return true if we did anything. */
>
> bool
> @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop)
> {
> gphi *phi = psi.phi ();
> tree rslt = PHI_RESULT (phi);
> - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> + tree def = phidef;
> if (virtual_operand_p (def))
> {
> gsi_next (&psi);
> @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop)
> def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
> &folded_casts);
> def = compute_overall_effect_of_inner_loop (ex_loop, def);
> +
> + /* Handle bitwise induction expression.
> +
> + .i.e.
> + for (int i = 0; i != 64; i+=3)
> + res &= ~(1UL << i);
> +
> + RES can't be analyzed out by SCEV because it is not polynomially
> + expressible, but in fact final value of RES can be replaced by
> + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
> + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
> + if (tree_fits_uhwi_p (niter)
> + && tree_to_uhwi (niter)
that's redundant - ah, it's the not iterating case that's excluded.
please write it
as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned
HOST_WIDE_INT
and pass it that way to analyze_and_compute_bitwise_induction_effect
> + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U)
> + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop,
> + phidef, def, niter);
> +
it seems you overwrite the SCEV analysis result unconditionally here, please
move this inside of the following if () so it is done as fallback only
(or add it as && !(def = ...) to the condition)
> if (!tree_does_not_contain_chrecs (def)
> || chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
> /* Moving the computation from the loop may prolong life range
> --
> 2.18.1
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction.
2022-05-11 8:45 ` Richard Biener
@ 2022-05-13 3:37 ` Hongtao Liu
2022-05-13 11:16 ` Richard Biener
0 siblings, 1 reply; 6+ messages in thread
From: Hongtao Liu @ 2022-05-13 3:37 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, GCC Patches
[-- Attachment #1: Type: text/plain, Size: 25543 bytes --]
On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > This patch will enable below optimization:
> >
> > {
> > - int bit;
> > - long long unsigned int _1;
> > - long long unsigned int _2;
> > -
> > <bb 2> [local count: 46707768]:
> > -
> > - <bb 3> [local count: 1027034057]:
> > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
> > - # bit_13 = PHI <bit_9(3), 63(2)>
> > - _1 = 1 << bit_13;
> > - _2 = ~_1;
> > - tmp_8 = _2 & tmp_11;
> > - bit_9 = bit_13 + -3;
> > - if (bit_9 != -3(OVF))
> > - goto <bb 3>; [95.65%]
> > - else
> > - goto <bb 4>; [4.35%]
> > -
> > - <bb 4> [local count: 46707768]:
> > - return tmp_8;
> > + tmp_12 = tmp_6(D) & 7905747460161236406;
> > + return tmp_12;
> >
> > }
> >
> >
> > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > PR middle-end/103462
> > * match.pd (bitwise_induction_p): New match.
> > * tree-scalar-evolution.c (gimple_bitwise_induction_p):
> > Declare.
> > (analyze_and_compute_bitwise_induction_effect): New function.
> > (enum bit_op_kind): New enum.
> > (final_value_replacement_loop): Enhanced to handle bitwise
> > induction.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr103462-1.c: New test.
> > * gcc.target/i386/pr103462-2.c: New test.
> > * gcc.target/i386/pr103462-3.c: New test.
> > * gcc.target/i386/pr103462-4.c: New test.
> > * gcc.target/i386/pr103462-5.c: New test.
> > * gcc.target/i386/pr103462-6.c: New test.
> > ---
> > gcc/match.pd | 7 +
> > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
> > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
> > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
> > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
> > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
> > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
> > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++-
> > 8 files changed, 654 insertions(+), 1 deletion(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 6d691d302b3..24ff5f9e6a8 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -7746,3 +7746,10 @@ and,
> > == TYPE_UNSIGNED (TREE_TYPE (@3))))
> > && single_use (@4)
> > && single_use (@5))))
> > +
> > +(for bit_op (bit_and bit_ior bit_xor)
> > + (match (bitwise_induction_p @0 @2 @3)
> > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3)))
> > +
> > +(match (bitwise_induction_p @0 @2 @3)
> > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > new file mode 100644
> > index 00000000000..1dc4c2acad6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > @@ -0,0 +1,111 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo (unsigned long long tmp)
> > +{
> > + for (int bit = 0; bit < 64; bit += 3)
> > + tmp &= ~(1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo1 (unsigned long long tmp)
> > +{
> > + for (int bit = 63; bit >= 0; bit -= 3)
> > + tmp &= ~(1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo2 (unsigned long long tmp)
> > +{
> > + for (int bit = 0; bit < 64; bit += 3)
> > + tmp &= (1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo3 (unsigned long long tmp)
> > +{
> > + for (int bit = 63; bit >= 0; bit -= 3)
> > + tmp &= (1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo4 (unsigned long long tmp)
> > +{
> > + for (int bit = 0; bit < 64; bit += 3)
> > + tmp |= ~(1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo5 (unsigned long long tmp)
> > +{
> > + for (int bit = 63; bit >= 0; bit -= 3)
> > + tmp |= ~(1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo6 (unsigned long long tmp)
> > +{
> > + for (int bit = 0; bit < 64; bit += 3)
> > + tmp |= (1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo7 (unsigned long long tmp)
> > +{
> > + for (int bit = 63; bit >= 0; bit -= 3)
> > + tmp |= (1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo8 (unsigned long long tmp)
> > +{
> > + for (int bit = 0; bit < 64; bit += 3)
> > + tmp ^= ~(1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo9 (unsigned long long tmp)
> > +{
> > + for (int bit = 63; bit >= 0; bit -= 3)
> > + tmp ^= ~(1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo10 (unsigned long long tmp)
> > +{
> > + for (int bit = 0; bit < 64; bit += 3)
> > + tmp ^= (1ULL << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned long long
> > +__attribute__((noipa))
> > +foo11 (unsigned long long tmp)
> > +{
> > + for (int bit = 63; bit >= 0; bit -= 3)
> > + tmp ^= (1ULL << bit);
> > + return tmp;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > new file mode 100644
> > index 00000000000..bc375cb78d4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > @@ -0,0 +1,45 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O1" } */
> > +
> > +#include "pr103462-1.c"
> > +
> > +int main()
> > +{
> > + unsigned long long tmp = 0x1111111111111111ULL;
> > + if (foo (tmp) != 0x110110110110110ULL)
> > + __builtin_abort ();
> > +
> > + if (foo1 (tmp) != 0x110110110110110ULL)
> > + __builtin_abort ();
> > +
> > + if (foo2 (tmp) != 0x0ULL)
> > + __builtin_abort ();
> > +
> > + if (foo3 (tmp) != 0x0ULL)
> > + __builtin_abort ();
> > +
> > + if (foo4 (tmp) != 0xffffffffffffffffULL)
> > + __builtin_abort ();
> > +
> > + if (foo5 (tmp) != 0xffffffffffffffffULL)
> > + __builtin_abort ();
> > +
> > + if (foo6 (tmp) != 0x9359359359359359ULL)
> > + __builtin_abort ();
> > +
> > + if (foo7 (tmp) != 0x9359359359359359ULL)
> > + __builtin_abort ();
> > +
> > + if (foo8 (tmp) != 0x8358358358358358ULL)
> > + __builtin_abort ();
> > +
> > + if (foo9 (tmp) != 0x8358358358358358ULL)
> > + __builtin_abort ();
> > +
> > + if (foo10 (tmp) != 0x8358358358358358ULL)
> > + __builtin_abort ();
> > +
> > + if (foo11 (tmp) != 0x8358358358358358ULL)
> > + __builtin_abort ();
> > +}
> > +
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > new file mode 100644
> > index 00000000000..4ba248a4872
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > @@ -0,0 +1,111 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo (unsigned int tmp)
> > +{
> > + for (int bit = 0; bit < 32; bit += 3)
> > + tmp &= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo1 (unsigned int tmp)
> > +{
> > + for (int bit = 31; bit >= 0; bit -= 3)
> > + tmp &= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo2 (unsigned int tmp)
> > +{
> > + for (int bit = 0; bit < 32; bit += 3)
> > + tmp &= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo3 (unsigned int tmp)
> > +{
> > + for (int bit = 31; bit >= 0; bit -= 3)
> > + tmp &= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo4 (unsigned int tmp)
> > +{
> > + for (int bit = 0; bit < 32; bit += 3)
> > + tmp |= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo5 (unsigned int tmp)
> > +{
> > + for (int bit = 31; bit >= 0; bit -= 3)
> > + tmp |= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo6 (unsigned int tmp)
> > +{
> > + for (int bit = 0; bit < 32; bit += 3)
> > + tmp |= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo7 (unsigned int tmp)
> > +{
> > + for (int bit = 31; bit >= 0; bit -= 3)
> > + tmp |= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo8 (unsigned int tmp)
> > +{
> > + for (int bit = 0; bit < 32; bit += 3)
> > + tmp ^= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo9 (unsigned int tmp)
> > +{
> > + for (int bit = 31; bit >= 0; bit -= 3)
> > + tmp ^= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo10 (unsigned int tmp)
> > +{
> > + for (int bit = 0; bit < 32; bit += 3)
> > + tmp ^= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned int
> > +__attribute__((noipa))
> > +foo11 (unsigned int tmp)
> > +{
> > + for (int bit = 31; bit >= 0; bit -= 3)
> > + tmp ^= (1U << bit);
> > + return tmp;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > new file mode 100644
> > index 00000000000..e2f4056f525
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > @@ -0,0 +1,46 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O1" } */
> > +
> > +#include "pr103462-3.c"
> > +
> > +int main()
> > +{
> > + unsigned int tmp = 0x11111111U;
> > +
> > + if (foo (tmp) != 0x10110110U)
> > + __builtin_abort ();
> > +
> > + if (foo1 (tmp) != 0x1101101U)
> > + __builtin_abort ();
> > +
> > + if (foo2 (tmp) != 0x0U)
> > + __builtin_abort ();
> > +
> > + if (foo3 (tmp) != 0x0U)
> > + __builtin_abort ();
> > +
> > + if (foo4 (tmp) != 0xffffffffU)
> > + __builtin_abort ();
> > +
> > + if (foo5 (tmp) != 0xffffffffU)
> > + __builtin_abort ();
> > +
> > + if (foo6 (tmp) != 0x59359359U)
> > + __builtin_abort ();
> > +
> > + if (foo7 (tmp) != 0x93593593U)
> > + __builtin_abort ();
> > +
> > + if (foo8 (tmp) != 0xa7ca7ca7U)
> > + __builtin_abort ();
> > +
> > + if (foo9 (tmp) != 0x7ca7ca7cU)
> > + __builtin_abort ();
> > +
> > + if (foo10 (tmp) != 0x58358358U)
> > + __builtin_abort ();
> > +
> > + if (foo11 (tmp) != 0x83583583U)
> > + __builtin_abort ();
> > +}
> > +
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > new file mode 100644
> > index 00000000000..1f4ffa34b48
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > @@ -0,0 +1,111 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo (unsigned short tmp)
> > +{
> > + for (int bit = 0; bit < 16; bit += 3)
> > + tmp &= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo1 (unsigned short tmp)
> > +{
> > + for (int bit = 15; bit >= 0; bit -= 3)
> > + tmp &= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo2 (unsigned short tmp)
> > +{
> > + for (int bit = 0; bit < 16; bit += 3)
> > + tmp &= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo3 (unsigned short tmp)
> > +{
> > + for (int bit = 15; bit >= 0; bit -= 3)
> > + tmp &= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo4 (unsigned short tmp)
> > +{
> > + for (int bit = 0; bit < 16; bit += 3)
> > + tmp |= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo5 (unsigned short tmp)
> > +{
> > + for (int bit = 15; bit >= 0; bit -= 3)
> > + tmp |= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo6 (unsigned short tmp)
> > +{
> > + for (int bit = 0; bit < 16; bit += 3)
> > + tmp |= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo7 (unsigned short tmp)
> > +{
> > + for (int bit = 15; bit >= 0; bit -= 3)
> > + tmp |= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo8 (unsigned short tmp)
> > +{
> > + for (int bit = 0; bit < 16; bit += 3)
> > + tmp ^= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo9 (unsigned short tmp)
> > +{
> > + for (int bit = 15; bit >= 0; bit -= 3)
> > + tmp ^= ~(1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo10 (unsigned short tmp)
> > +{
> > + for (int bit = 0; bit < 16; bit += 3)
> > + tmp ^= (1U << bit);
> > + return tmp;
> > +}
> > +
> > +unsigned short
> > +__attribute__((noipa))
> > +foo11 (unsigned short tmp)
> > +{
> > + for (int bit = 15; bit >= 0; bit -= 3)
> > + tmp ^= (1U << bit);
> > + return tmp;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > new file mode 100644
> > index 00000000000..65426d71efe
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > @@ -0,0 +1,46 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O1" } */
> > +
> > +#include "pr103462-5.c"
> > +
> > +int main()
> > +{
> > + unsigned short tmp = 0x1111U;
> > +
> > + if (foo (tmp) != 0x110)
> > + __builtin_abort ();
> > +
> > + if (foo1 (tmp) != 0x110)
> > + __builtin_abort ();
> > +
> > + if (foo2 (tmp) != 0x0)
> > + __builtin_abort ();
> > +
> > + if (foo3 (tmp) != 0x0)
> > + __builtin_abort ();
> > +
> > + if (foo4 (tmp) != 0xffff)
> > + __builtin_abort ();
> > +
> > + if (foo5 (tmp) != 0xffff)
> > + __builtin_abort ();
> > +
> > + if (foo6 (tmp) != 0x9359)
> > + __builtin_abort ();
> > +
> > + if (foo7 (tmp) != 0x9359)
> > + __builtin_abort ();
> > +
> > + if (foo8 (tmp) != 0x8358)
> > + __builtin_abort ();
> > +
> > + if (foo9 (tmp) != 0x8358)
> > + __builtin_abort ();
> > +
> > + if (foo10 (tmp) != 0x8358)
> > + __builtin_abort ();
> > +
> > + if (foo11 (tmp) != 0x8358)
> > + __builtin_abort ();
> > +}
> > +
> > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > index 72ceb4001e3..9b0aec4fd09 100644
> > --- a/gcc/tree-scalar-evolution.cc
> > +++ b/gcc/tree-scalar-evolution.cc
> > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr)
> > || expanded_size > cache.elements ());
> > }
> >
> > +/* Match.pd function to match bitwise inductive expression.
> > + .i.e.
> > + _2 = 1 << _1;
> > + _3 = ~_2;
> > + tmp_9 = _3 & tmp_12; */
> > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
> > +
> > +/* Return the inductive expression of bitwise operation if possible,
> > + otherwise returns DEF. */
> > +static tree
> > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop,
> > + class loop* loop,
> > + tree phidef,
> > + tree def,
> > + tree niter)
> > +{
> > + tree match_op[3],inv, bitwise_scev, bitwise_res;
> > + bool folded_casts = false;
> > + tree type = TREE_TYPE (phidef);
> > + gimple* header_phi = NULL;
>
> use a gphi *
>
Changed.
> > +
> > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
> > +
> > + op2 = PHI <phidef, inv>
> > + _1 = (int) bit_17;
> > + _3 = 1 << _1;
> > + op1 = ~_3;
> > + phidef = op1 & op2; */
> > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
> > + || TREE_CODE (match_op[2]) != SSA_NAME
> > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI
>
> || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
>
> > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2)
>
> and then use header_phi here, if you pass in the PHI to use you'll
> always have 2 args
changed.
>
> > + return def;
> > +
> > + header_phi = SSA_NAME_DEF_STMT (match_op[2]);
> > + if (gimple_bb (header_phi) != loop->header)
> > + return def;
>
> I think passing in the PHI node and the exit edge instead of phidef
> would make the above more clear
Remove the upper condition.
>
> > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
> > + return def;
> > +
> > + inv = gimple_phi_arg_def (header_phi, 0);
> > + if (inv == phidef)
> > + inv = gimple_phi_arg_def (header_phi, 1);
>
> inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop));
>
> where is this used?
Move to the place where it's used.
>
> > +
> > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop,
> > + match_op[1],
> > + &folded_casts);
>
> you want
>
> bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
> bitwise_scev = instantiate_parameters (loop, bitwise_scev);
>
> instead since you are interested in in-loop behavior?
Yes, changed.
>
> > +
> > + /* Make sure bits is in range of type precision. */
> > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
>
> I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you
> do not rule out pointers or vector types.
Changed.
>
> > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
> > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
> > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
> > + return def;
> > +
> > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
> > + if (!tree_fits_uhwi_p (bitwise_res)
> > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type))
> > + return def;
>
> Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct?
> So name it bit_final?
Changed.
>
> > +
> > +enum bit_op_kind
> > + {
> > + INDUCTION_BIT_CLEAR,
> > + INDUCTION_BIT_IOR,
> > + INDUCTION_BIT_XOR,
> > + INDUCTION_BIT_RESET,
> > + INDUCTION_ZERO,
> > + INDUCTION_ALL
> > + };
> > +
> > + enum bit_op_kind induction_kind;
> > + enum tree_code code1
> > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
> > + enum tree_code code2
> > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
> > +
> > + /* BIT_CLEAR: A &= ~(1 << bit)
> > + BIT_RESET: A ^= (1 << bit).
> > + BIT_IOR: A |= (1 << bit)
> > + BIT_ZERO: A &= (1 << bit)
> > + BIT_ALL: A |= ~(1 << bit)
> > + BIT_XOR: A ^= ~(1 << bit).
> > + bit is induction variable. */
> > + switch (code1)
> > + {
> > + case BIT_AND_EXPR:
> > + induction_kind = code2 == BIT_NOT_EXPR
> > + ? INDUCTION_BIT_CLEAR
> > + : INDUCTION_ZERO;
> > + break;
> > + case BIT_IOR_EXPR:
> > + induction_kind = code2 == BIT_NOT_EXPR
> > + ? INDUCTION_ALL
> > + : INDUCTION_BIT_IOR;
> > + break;
> > + case BIT_XOR_EXPR:
> > + induction_kind = code2 == BIT_NOT_EXPR
> > + ? INDUCTION_BIT_XOR
> > + : INDUCTION_BIT_RESET;
> > + break;
> > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
> > + case BIT_NOT_EXPR:
> > + gcc_assert (code2 == BIT_XOR_EXPR);
> > + induction_kind = INDUCTION_BIT_XOR;
> > + break;
> > + default:
> > + gcc_unreachable();
> > + }
> > +
> > + if (induction_kind == INDUCTION_ZERO)
> > + return build_zero_cst (type);
>
> is that true even when the loop doesn't iterate?
>
> > + if (induction_kind == INDUCTION_ALL)
> > + return build_all_ones_cst (type);
>
> Likewise?
the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it
must be iterated.
>
> > +
> > + wide_int bits = wi::zero (TYPE_PRECISION (type));
> > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
>
> you checked fits_uhwi above, name it bit_start?
>
> > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
> > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter);
> > +
> > + /* Loop tripcount should be num_iter + 1. */
> > + for (unsigned i = 0; i != num_iter + 1; i++)
> > + {
> > + bits = wi::bit_or (bits,
> > + wi::lshift (wi::one (TYPE_PRECISION (type)),
> > + start));
>
> you can use bits = wi::set_bit (bits, start);
Changed.
>
> > + start += step;
> > + }
>
> Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that
> the IV wraps and is unsigned char and you have a 256 bit integer the
> above could run quite a long time. Please limit num_iter to TYPE_PRECISION
> as well (do you have a testcase with a wrapping IV? try some != CST exit
> condition and a large increment)
Limit to > 0 and < TYPE_SIZE.
>
> > +
> > + bool inverted = false;
> > + switch (induction_kind)
> > + {
> > + case INDUCTION_BIT_CLEAR:
> > + code1 = BIT_AND_EXPR;
> > + inverted = true;
> > + break;
> > + case INDUCTION_BIT_IOR:
> > + code1 = BIT_IOR_EXPR;
> > + break;
> > + case INDUCTION_BIT_RESET:
> > + code1 = BIT_XOR_EXPR;
> > + break;
> > + /* A ^= ~(1 << bit) is special, when loop tripcount is even,
> > + it's equal to A ^= bits, else A ^= ~bits. */
> > + case INDUCTION_BIT_XOR:
> > + code1 = BIT_XOR_EXPR;
> > + if (num_iter % 2 == 0)
> > + inverted = true;
> > + break;
> > + default:
> > + gcc_unreachable ();
> > + }
> > +
> > + if (inverted)
> > + bits = wi::bit_not (bits);
> > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
> > +}
> > +
> > /* Do final value replacement for LOOP, return true if we did anything. */
> >
> > bool
> > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop)
> > {
> > gphi *phi = psi.phi ();
> > tree rslt = PHI_RESULT (phi);
> > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > + tree def = phidef;
> > if (virtual_operand_p (def))
> > {
> > gsi_next (&psi);
> > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop)
> > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
> > &folded_casts);
> > def = compute_overall_effect_of_inner_loop (ex_loop, def);
> > +
> > + /* Handle bitwise induction expression.
> > +
> > + .i.e.
> > + for (int i = 0; i != 64; i+=3)
> > + res &= ~(1UL << i);
> > +
> > + RES can't be analyzed out by SCEV because it is not polynomially
> > + expressible, but in fact final value of RES can be replaced by
> > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
> > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
> > + if (tree_fits_uhwi_p (niter)
> > + && tree_to_uhwi (niter)
>
> that's redundant - ah, it's the not iterating case that's excluded.
> please write it
> as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned
> HOST_WIDE_INT
> and pass it that way to analyze_and_compute_bitwise_induction_effect
Changed.
>
> > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U)
> > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop,
> > + phidef, def, niter);
> > +
>
> it seems you overwrite the SCEV analysis result unconditionally here, please
> move this inside of the following if () so it is done as fallback only
> (or add it as && !(def = ...) to the condition)
original def will be returned when bitwise induction fails which means
rewriting is ok, but i think it's better to not pass original def and
use your suggestion.
Changed.
>
> > if (!tree_does_not_contain_chrecs (def)
> > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
> > /* Moving the computation from the loop may prolong life range
> > --
> > 2.18.1
> >
Here's an updated V2 patch.
--
BR,
Hongtao
[-- Attachment #2: v2-0001-Middle-end-Enhance-final_value_replacement_loop-t.patch --]
[-- Type: application/octet-stream, Size: 19430 bytes --]
From 3d902b9283288f6564f57c95c3996d094940d46e Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Tue, 7 Dec 2021 15:41:52 +0800
Subject: [PATCH v2] [Middle-end] Enhance final_value_replacement_loop to
handle bitwise induction.
This patch will enable below optimization:
{
- int bit;
- long long unsigned int _1;
- long long unsigned int _2;
-
<bb 2> [local count: 46707768]:
-
- <bb 3> [local count: 1027034057]:
- # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
- # bit_13 = PHI <bit_9(3), 63(2)>
- _1 = 1 << bit_13;
- _2 = ~_1;
- tmp_8 = _2 & tmp_11;
- bit_9 = bit_13 + -3;
- if (bit_9 != -3(OVF))
- goto <bb 3>; [95.65%]
- else
- goto <bb 4>; [4.35%]
-
- <bb 4> [local count: 46707768]:
- return tmp_8;
+ tmp_12 = tmp_6(D) & 7905747460161236406;
+ return tmp_12;
}
gcc/ChangeLog:
PR middle-end/103462
* match.pd (bitwise_induction_p): New match.
* tree-scalar-evolution.c (gimple_bitwise_induction_p):
Declare.
(analyze_and_compute_bitwise_induction_effect): New function.
(enum bit_op_kind): New enum.
(final_value_replacement_loop): Enhanced to handle bitwise
induction.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr103462-1.c: New test.
* gcc.target/i386/pr103462-2.c: New test.
* gcc.target/i386/pr103462-3.c: New test.
* gcc.target/i386/pr103462-4.c: New test.
* gcc.target/i386/pr103462-5.c: New test.
* gcc.target/i386/pr103462-6.c: New test.
---
gcc/match.pd | 10 ++
gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
gcc/tree-scalar-evolution.cc | 173 ++++++++++++++++++++-
8 files changed, 652 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b3..e8379696c6e 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7746,3 +7746,13 @@ and,
== TYPE_UNSIGNED (TREE_TYPE (@3))))
&& single_use (@4)
&& single_use (@5))))
+
+(for bit_op (bit_and bit_ior bit_xor)
+ (match (bitwise_induction_p @0 @2 @3)
+ (bit_op:c
+ (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2))))
+ @3)))
+
+(match (bitwise_induction_p @0 @2 @3)
+ (bit_not
+ (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
new file mode 100644
index 00000000000..1dc4c2acad6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned long long
+__attribute__((noipa))
+foo (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp &= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo1 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp &= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo2 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp &= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo3 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp &= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo4 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp |= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo5 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp |= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo6 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp |= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo7 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp |= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo8 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp ^= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo9 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp ^= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo10 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp ^= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo11 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp ^= (1ULL << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
new file mode 100644
index 00000000000..bc375cb78d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
@@ -0,0 +1,45 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-1.c"
+
+int main()
+{
+ unsigned long long tmp = 0x1111111111111111ULL;
+ if (foo (tmp) != 0x110110110110110ULL)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x110110110110110ULL)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0ULL)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0ULL)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffffffffffffffffULL)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffffffffffffffffULL)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x9359359359359359ULL)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x9359359359359359ULL)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
new file mode 100644
index 00000000000..4ba248a4872
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned int
+__attribute__((noipa))
+foo (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo1 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo2 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo3 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo4 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo5 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo6 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo7 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo8 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo9 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo10 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo11 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
new file mode 100644
index 00000000000..e2f4056f525
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-3.c"
+
+int main()
+{
+ unsigned int tmp = 0x11111111U;
+
+ if (foo (tmp) != 0x10110110U)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x1101101U)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0U)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0U)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffffffffU)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffffffffU)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x59359359U)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x93593593U)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0xa7ca7ca7U)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x7ca7ca7cU)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x58358358U)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x83583583U)
+ __builtin_abort ();
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
new file mode 100644
index 00000000000..1f4ffa34b48
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned short
+__attribute__((noipa))
+foo (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo1 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo2 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo3 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo4 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo5 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo6 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo7 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo8 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo9 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo10 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo11 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
new file mode 100644
index 00000000000..65426d71efe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-5.c"
+
+int main()
+{
+ unsigned short tmp = 0x1111U;
+
+ if (foo (tmp) != 0x110)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x110)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffff)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffff)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x9359)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x9359)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x8358)
+ __builtin_abort ();
+}
+
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 72ceb4001e3..096ce653823 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3487,6 +3487,153 @@ expression_expensive_p (tree expr)
|| expanded_size > cache.elements ());
}
+/* Match.pd function to match bitwise inductive expression.
+ .i.e.
+ _2 = 1 << _1;
+ _3 = ~_2;
+ tmp_9 = _3 & tmp_12; */
+extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
+
+/* Return the inductive expression of bitwise operation if possible,
+ otherwise returns DEF. */
+static tree
+analyze_and_compute_bitwise_induction_effect (class loop* ex_loop,
+ class loop* loop,
+ tree phidef,
+ unsigned HOST_WIDE_INT niter)
+{
+ tree match_op[3],inv, bitwise_scev, bit_final;
+ tree type = TREE_TYPE (phidef);
+ gphi* header_phi = NULL;
+
+ /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
+
+ op2 = PHI <phidef, inv>
+ _1 = (int) bit_17;
+ _3 = 1 << _1;
+ op1 = ~_3;
+ phidef = op1 & op2; */
+ if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
+ || TREE_CODE (match_op[2]) != SSA_NAME
+ || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
+ || header_phi->nargs != 2)
+ return NULL_TREE;
+
+ if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
+ return NULL_TREE;
+
+ bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
+ bitwise_scev = instantiate_parameters (loop, bitwise_scev);
+
+ /* Make sure bits is in range of type precision. */
+ if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
+ || !INTEGRAL_TYPE_P (TREE_TYPE (bitwise_scev))
+ || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
+ || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
+ || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
+ return NULL_TREE;
+
+ bit_final = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
+ if (!tree_fits_uhwi_p (bit_final)
+ || tree_to_uhwi (bit_final) >= TYPE_PRECISION (type))
+ return NULL_TREE;
+
+enum bit_op_kind
+ {
+ INDUCTION_BIT_CLEAR,
+ INDUCTION_BIT_IOR,
+ INDUCTION_BIT_XOR,
+ INDUCTION_BIT_RESET,
+ INDUCTION_ZERO,
+ INDUCTION_ALL
+ };
+
+ enum bit_op_kind induction_kind;
+ enum tree_code code1
+ = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
+ enum tree_code code2
+ = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
+
+ /* BIT_CLEAR: A &= ~(1 << bit)
+ BIT_RESET: A ^= (1 << bit).
+ BIT_IOR: A |= (1 << bit)
+ BIT_ZERO: A &= (1 << bit)
+ BIT_ALL: A |= ~(1 << bit)
+ BIT_XOR: A ^= ~(1 << bit).
+ bit is induction variable. */
+ switch (code1)
+ {
+ case BIT_AND_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_BIT_CLEAR
+ : INDUCTION_ZERO;
+ break;
+ case BIT_IOR_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_ALL
+ : INDUCTION_BIT_IOR;
+ break;
+ case BIT_XOR_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_BIT_XOR
+ : INDUCTION_BIT_RESET;
+ break;
+ /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
+ case BIT_NOT_EXPR:
+ gcc_assert (code2 == BIT_XOR_EXPR);
+ induction_kind = INDUCTION_BIT_XOR;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (induction_kind == INDUCTION_ZERO)
+ return build_zero_cst (type);
+ if (induction_kind == INDUCTION_ALL)
+ return build_all_ones_cst (type);
+
+ wide_int bits = wi::zero (TYPE_PRECISION (type));
+ HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
+ HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
+
+ /* Loop tripcount should be niter + 1. */
+ for (unsigned i = 0; i != niter + 1; i++)
+ {
+ bits = wi::set_bit (bits, bit_start);
+ bit_start += step;
+ }
+
+ bool inverted = false;
+ switch (induction_kind)
+ {
+ case INDUCTION_BIT_CLEAR:
+ code1 = BIT_AND_EXPR;
+ inverted = true;
+ break;
+ case INDUCTION_BIT_IOR:
+ code1 = BIT_IOR_EXPR;
+ break;
+ case INDUCTION_BIT_RESET:
+ code1 = BIT_XOR_EXPR;
+ break;
+ /* A ^= ~(1 << bit) is special, when loop tripcount is even,
+ it's equal to A ^= bits, else A ^= ~bits. */
+ case INDUCTION_BIT_XOR:
+ code1 = BIT_XOR_EXPR;
+ if (niter % 2 == 0)
+ inverted = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (inverted)
+ bits = wi::bit_not (bits);
+
+ inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge (loop));
+ return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
+}
+
/* Do final value replacement for LOOP, return true if we did anything. */
bool
@@ -3519,7 +3666,8 @@ final_value_replacement_loop (class loop *loop)
{
gphi *phi = psi.phi ();
tree rslt = PHI_RESULT (phi);
- tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+ tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+ tree def = phidef;
if (virtual_operand_p (def))
{
gsi_next (&psi);
@@ -3537,6 +3685,29 @@ final_value_replacement_loop (class loop *loop)
def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
&folded_casts);
def = compute_overall_effect_of_inner_loop (ex_loop, def);
+
+ /* Handle bitwise induction expression.
+
+ .i.e.
+ for (int i = 0; i != 64; i+=3)
+ res &= ~(1UL << i);
+
+ RES can't be analyzed out by SCEV because it is not polynomially
+ expressible, but in fact final value of RES can be replaced by
+ RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
+ being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
+ unsigned HOST_WIDE_INT niter_num;
+ tree bit_def;
+ if (tree_fits_uhwi_p (niter)
+ && (niter_num = tree_to_uhwi (niter)) != 0
+ && niter_num < TYPE_PRECISION (TREE_TYPE (phidef))
+ && (bit_def
+ = analyze_and_compute_bitwise_induction_effect (ex_loop,
+ loop,
+ phidef,
+ niter_num)))
+ def = bit_def;
+
if (!tree_does_not_contain_chrecs (def)
|| chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
/* Moving the computation from the loop may prolong life range
--
2.18.1
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction.
2022-05-13 3:37 ` Hongtao Liu
@ 2022-05-13 11:16 ` Richard Biener
2022-05-18 2:45 ` Hongtao Liu
0 siblings, 1 reply; 6+ messages in thread
From: Richard Biener @ 2022-05-13 11:16 UTC (permalink / raw)
To: Hongtao Liu; +Cc: liuhongt, GCC Patches
On Fri, May 13, 2022 at 5:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > This patch will enable below optimization:
> > >
> > > {
> > > - int bit;
> > > - long long unsigned int _1;
> > > - long long unsigned int _2;
> > > -
> > > <bb 2> [local count: 46707768]:
> > > -
> > > - <bb 3> [local count: 1027034057]:
> > > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
> > > - # bit_13 = PHI <bit_9(3), 63(2)>
> > > - _1 = 1 << bit_13;
> > > - _2 = ~_1;
> > > - tmp_8 = _2 & tmp_11;
> > > - bit_9 = bit_13 + -3;
> > > - if (bit_9 != -3(OVF))
> > > - goto <bb 3>; [95.65%]
> > > - else
> > > - goto <bb 4>; [4.35%]
> > > -
> > > - <bb 4> [local count: 46707768]:
> > > - return tmp_8;
> > > + tmp_12 = tmp_6(D) & 7905747460161236406;
> > > + return tmp_12;
> > >
> > > }
> > >
> > >
> > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > > PR middle-end/103462
> > > * match.pd (bitwise_induction_p): New match.
> > > * tree-scalar-evolution.c (gimple_bitwise_induction_p):
> > > Declare.
> > > (analyze_and_compute_bitwise_induction_effect): New function.
> > > (enum bit_op_kind): New enum.
> > > (final_value_replacement_loop): Enhanced to handle bitwise
> > > induction.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > * gcc.target/i386/pr103462-1.c: New test.
> > > * gcc.target/i386/pr103462-2.c: New test.
> > > * gcc.target/i386/pr103462-3.c: New test.
> > > * gcc.target/i386/pr103462-4.c: New test.
> > > * gcc.target/i386/pr103462-5.c: New test.
> > > * gcc.target/i386/pr103462-6.c: New test.
> > > ---
> > > gcc/match.pd | 7 +
> > > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
> > > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
> > > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
> > > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++-
> > > 8 files changed, 654 insertions(+), 1 deletion(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index 6d691d302b3..24ff5f9e6a8 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -7746,3 +7746,10 @@ and,
> > > == TYPE_UNSIGNED (TREE_TYPE (@3))))
> > > && single_use (@4)
> > > && single_use (@5))))
> > > +
> > > +(for bit_op (bit_and bit_ior bit_xor)
> > > + (match (bitwise_induction_p @0 @2 @3)
> > > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3)))
> > > +
> > > +(match (bitwise_induction_p @0 @2 @3)
> > > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > new file mode 100644
> > > index 00000000000..1dc4c2acad6
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > @@ -0,0 +1,111 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo (unsigned long long tmp)
> > > +{
> > > + for (int bit = 0; bit < 64; bit += 3)
> > > + tmp &= ~(1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo1 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > + tmp &= ~(1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo2 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 0; bit < 64; bit += 3)
> > > + tmp &= (1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo3 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > + tmp &= (1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo4 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 0; bit < 64; bit += 3)
> > > + tmp |= ~(1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo5 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > + tmp |= ~(1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo6 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 0; bit < 64; bit += 3)
> > > + tmp |= (1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo7 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > + tmp |= (1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo8 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 0; bit < 64; bit += 3)
> > > + tmp ^= ~(1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo9 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > + tmp ^= ~(1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo10 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 0; bit < 64; bit += 3)
> > > + tmp ^= (1ULL << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned long long
> > > +__attribute__((noipa))
> > > +foo11 (unsigned long long tmp)
> > > +{
> > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > + tmp ^= (1ULL << bit);
> > > + return tmp;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > new file mode 100644
> > > index 00000000000..bc375cb78d4
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > @@ -0,0 +1,45 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O1" } */
> > > +
> > > +#include "pr103462-1.c"
> > > +
> > > +int main()
> > > +{
> > > + unsigned long long tmp = 0x1111111111111111ULL;
> > > + if (foo (tmp) != 0x110110110110110ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo1 (tmp) != 0x110110110110110ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo2 (tmp) != 0x0ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo3 (tmp) != 0x0ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo4 (tmp) != 0xffffffffffffffffULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo5 (tmp) != 0xffffffffffffffffULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo6 (tmp) != 0x9359359359359359ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo7 (tmp) != 0x9359359359359359ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo8 (tmp) != 0x8358358358358358ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo9 (tmp) != 0x8358358358358358ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo10 (tmp) != 0x8358358358358358ULL)
> > > + __builtin_abort ();
> > > +
> > > + if (foo11 (tmp) != 0x8358358358358358ULL)
> > > + __builtin_abort ();
> > > +}
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > new file mode 100644
> > > index 00000000000..4ba248a4872
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > @@ -0,0 +1,111 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo (unsigned int tmp)
> > > +{
> > > + for (int bit = 0; bit < 32; bit += 3)
> > > + tmp &= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo1 (unsigned int tmp)
> > > +{
> > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > + tmp &= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo2 (unsigned int tmp)
> > > +{
> > > + for (int bit = 0; bit < 32; bit += 3)
> > > + tmp &= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo3 (unsigned int tmp)
> > > +{
> > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > + tmp &= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo4 (unsigned int tmp)
> > > +{
> > > + for (int bit = 0; bit < 32; bit += 3)
> > > + tmp |= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo5 (unsigned int tmp)
> > > +{
> > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > + tmp |= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo6 (unsigned int tmp)
> > > +{
> > > + for (int bit = 0; bit < 32; bit += 3)
> > > + tmp |= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo7 (unsigned int tmp)
> > > +{
> > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > + tmp |= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo8 (unsigned int tmp)
> > > +{
> > > + for (int bit = 0; bit < 32; bit += 3)
> > > + tmp ^= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo9 (unsigned int tmp)
> > > +{
> > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > + tmp ^= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo10 (unsigned int tmp)
> > > +{
> > > + for (int bit = 0; bit < 32; bit += 3)
> > > + tmp ^= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned int
> > > +__attribute__((noipa))
> > > +foo11 (unsigned int tmp)
> > > +{
> > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > + tmp ^= (1U << bit);
> > > + return tmp;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > new file mode 100644
> > > index 00000000000..e2f4056f525
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > @@ -0,0 +1,46 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O1" } */
> > > +
> > > +#include "pr103462-3.c"
> > > +
> > > +int main()
> > > +{
> > > + unsigned int tmp = 0x11111111U;
> > > +
> > > + if (foo (tmp) != 0x10110110U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo1 (tmp) != 0x1101101U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo2 (tmp) != 0x0U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo3 (tmp) != 0x0U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo4 (tmp) != 0xffffffffU)
> > > + __builtin_abort ();
> > > +
> > > + if (foo5 (tmp) != 0xffffffffU)
> > > + __builtin_abort ();
> > > +
> > > + if (foo6 (tmp) != 0x59359359U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo7 (tmp) != 0x93593593U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo8 (tmp) != 0xa7ca7ca7U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo9 (tmp) != 0x7ca7ca7cU)
> > > + __builtin_abort ();
> > > +
> > > + if (foo10 (tmp) != 0x58358358U)
> > > + __builtin_abort ();
> > > +
> > > + if (foo11 (tmp) != 0x83583583U)
> > > + __builtin_abort ();
> > > +}
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > new file mode 100644
> > > index 00000000000..1f4ffa34b48
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > @@ -0,0 +1,111 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo (unsigned short tmp)
> > > +{
> > > + for (int bit = 0; bit < 16; bit += 3)
> > > + tmp &= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo1 (unsigned short tmp)
> > > +{
> > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > + tmp &= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo2 (unsigned short tmp)
> > > +{
> > > + for (int bit = 0; bit < 16; bit += 3)
> > > + tmp &= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo3 (unsigned short tmp)
> > > +{
> > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > + tmp &= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo4 (unsigned short tmp)
> > > +{
> > > + for (int bit = 0; bit < 16; bit += 3)
> > > + tmp |= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo5 (unsigned short tmp)
> > > +{
> > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > + tmp |= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo6 (unsigned short tmp)
> > > +{
> > > + for (int bit = 0; bit < 16; bit += 3)
> > > + tmp |= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo7 (unsigned short tmp)
> > > +{
> > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > + tmp |= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo8 (unsigned short tmp)
> > > +{
> > > + for (int bit = 0; bit < 16; bit += 3)
> > > + tmp ^= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo9 (unsigned short tmp)
> > > +{
> > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > + tmp ^= ~(1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo10 (unsigned short tmp)
> > > +{
> > > + for (int bit = 0; bit < 16; bit += 3)
> > > + tmp ^= (1U << bit);
> > > + return tmp;
> > > +}
> > > +
> > > +unsigned short
> > > +__attribute__((noipa))
> > > +foo11 (unsigned short tmp)
> > > +{
> > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > + tmp ^= (1U << bit);
> > > + return tmp;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > new file mode 100644
> > > index 00000000000..65426d71efe
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > @@ -0,0 +1,46 @@
> > > +/* { dg-do run } */
> > > +/* { dg-options "-O1" } */
> > > +
> > > +#include "pr103462-5.c"
> > > +
> > > +int main()
> > > +{
> > > + unsigned short tmp = 0x1111U;
> > > +
> > > + if (foo (tmp) != 0x110)
> > > + __builtin_abort ();
> > > +
> > > + if (foo1 (tmp) != 0x110)
> > > + __builtin_abort ();
> > > +
> > > + if (foo2 (tmp) != 0x0)
> > > + __builtin_abort ();
> > > +
> > > + if (foo3 (tmp) != 0x0)
> > > + __builtin_abort ();
> > > +
> > > + if (foo4 (tmp) != 0xffff)
> > > + __builtin_abort ();
> > > +
> > > + if (foo5 (tmp) != 0xffff)
> > > + __builtin_abort ();
> > > +
> > > + if (foo6 (tmp) != 0x9359)
> > > + __builtin_abort ();
> > > +
> > > + if (foo7 (tmp) != 0x9359)
> > > + __builtin_abort ();
> > > +
> > > + if (foo8 (tmp) != 0x8358)
> > > + __builtin_abort ();
> > > +
> > > + if (foo9 (tmp) != 0x8358)
> > > + __builtin_abort ();
> > > +
> > > + if (foo10 (tmp) != 0x8358)
> > > + __builtin_abort ();
> > > +
> > > + if (foo11 (tmp) != 0x8358)
> > > + __builtin_abort ();
> > > +}
> > > +
> > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > > index 72ceb4001e3..9b0aec4fd09 100644
> > > --- a/gcc/tree-scalar-evolution.cc
> > > +++ b/gcc/tree-scalar-evolution.cc
> > > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr)
> > > || expanded_size > cache.elements ());
> > > }
> > >
> > > +/* Match.pd function to match bitwise inductive expression.
> > > + .i.e.
> > > + _2 = 1 << _1;
> > > + _3 = ~_2;
> > > + tmp_9 = _3 & tmp_12; */
> > > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
> > > +
> > > +/* Return the inductive expression of bitwise operation if possible,
> > > + otherwise returns DEF. */
> > > +static tree
> > > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop,
> > > + class loop* loop,
> > > + tree phidef,
> > > + tree def,
> > > + tree niter)
> > > +{
> > > + tree match_op[3],inv, bitwise_scev, bitwise_res;
> > > + bool folded_casts = false;
> > > + tree type = TREE_TYPE (phidef);
> > > + gimple* header_phi = NULL;
> >
> > use a gphi *
> >
> Changed.
> > > +
> > > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
> > > +
> > > + op2 = PHI <phidef, inv>
> > > + _1 = (int) bit_17;
> > > + _3 = 1 << _1;
> > > + op1 = ~_3;
> > > + phidef = op1 & op2; */
> > > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
> > > + || TREE_CODE (match_op[2]) != SSA_NAME
> > > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI
> >
> > || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
> >
> > > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2)
> >
> > and then use header_phi here, if you pass in the PHI to use you'll
> > always have 2 args
> changed.
> >
> > > + return def;
> > > +
> > > + header_phi = SSA_NAME_DEF_STMT (match_op[2]);
> > > + if (gimple_bb (header_phi) != loop->header)
> > > + return def;
> >
> > I think passing in the PHI node and the exit edge instead of phidef
> > would make the above more clear
> Remove the upper condition.
> >
> > > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
> > > + return def;
> > > +
> > > + inv = gimple_phi_arg_def (header_phi, 0);
> > > + if (inv == phidef)
> > > + inv = gimple_phi_arg_def (header_phi, 1);
> >
> > inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop));
> >
> > where is this used?
> Move to the place where it's used.
> >
> > > +
> > > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop,
> > > + match_op[1],
> > > + &folded_casts);
> >
> > you want
> >
> > bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
> > bitwise_scev = instantiate_parameters (loop, bitwise_scev);
> >
> > instead since you are interested in in-loop behavior?
> Yes, changed.
> >
> > > +
> > > + /* Make sure bits is in range of type precision. */
> > > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
> >
> > I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you
> > do not rule out pointers or vector types.
> Changed.
> >
> > > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
> > > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
> > > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
> > > + return def;
> > > +
> > > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
> > > + if (!tree_fits_uhwi_p (bitwise_res)
> > > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type))
> > > + return def;
> >
> > Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct?
> > So name it bit_final?
> Changed.
> >
> > > +
> > > +enum bit_op_kind
> > > + {
> > > + INDUCTION_BIT_CLEAR,
> > > + INDUCTION_BIT_IOR,
> > > + INDUCTION_BIT_XOR,
> > > + INDUCTION_BIT_RESET,
> > > + INDUCTION_ZERO,
> > > + INDUCTION_ALL
> > > + };
> > > +
> > > + enum bit_op_kind induction_kind;
> > > + enum tree_code code1
> > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
> > > + enum tree_code code2
> > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
> > > +
> > > + /* BIT_CLEAR: A &= ~(1 << bit)
> > > + BIT_RESET: A ^= (1 << bit).
> > > + BIT_IOR: A |= (1 << bit)
> > > + BIT_ZERO: A &= (1 << bit)
> > > + BIT_ALL: A |= ~(1 << bit)
> > > + BIT_XOR: A ^= ~(1 << bit).
> > > + bit is induction variable. */
> > > + switch (code1)
> > > + {
> > > + case BIT_AND_EXPR:
> > > + induction_kind = code2 == BIT_NOT_EXPR
> > > + ? INDUCTION_BIT_CLEAR
> > > + : INDUCTION_ZERO;
> > > + break;
> > > + case BIT_IOR_EXPR:
> > > + induction_kind = code2 == BIT_NOT_EXPR
> > > + ? INDUCTION_ALL
> > > + : INDUCTION_BIT_IOR;
> > > + break;
> > > + case BIT_XOR_EXPR:
> > > + induction_kind = code2 == BIT_NOT_EXPR
> > > + ? INDUCTION_BIT_XOR
> > > + : INDUCTION_BIT_RESET;
> > > + break;
> > > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
> > > + case BIT_NOT_EXPR:
> > > + gcc_assert (code2 == BIT_XOR_EXPR);
> > > + induction_kind = INDUCTION_BIT_XOR;
> > > + break;
> > > + default:
> > > + gcc_unreachable();
> > > + }
> > > +
> > > + if (induction_kind == INDUCTION_ZERO)
> > > + return build_zero_cst (type);
> >
> > is that true even when the loop doesn't iterate?
> >
> > > + if (induction_kind == INDUCTION_ALL)
> > > + return build_all_ones_cst (type);
> >
> > Likewise?
> the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it
> must be iterated.
> >
> > > +
> > > + wide_int bits = wi::zero (TYPE_PRECISION (type));
> > > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
> >
> > you checked fits_uhwi above, name it bit_start?
> >
> > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
> > > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter);
> > > +
> > > + /* Loop tripcount should be num_iter + 1. */
> > > + for (unsigned i = 0; i != num_iter + 1; i++)
> > > + {
> > > + bits = wi::bit_or (bits,
> > > + wi::lshift (wi::one (TYPE_PRECISION (type)),
> > > + start));
> >
> > you can use bits = wi::set_bit (bits, start);
> Changed.
> >
> > > + start += step;
> > > + }
> >
> > Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that
> > the IV wraps and is unsigned char and you have a 256 bit integer the
> > above could run quite a long time. Please limit num_iter to TYPE_PRECISION
> > as well (do you have a testcase with a wrapping IV? try some != CST exit
> > condition and a large increment)
> Limit to > 0 and < TYPE_SIZE.
> >
> > > +
> > > + bool inverted = false;
> > > + switch (induction_kind)
> > > + {
> > > + case INDUCTION_BIT_CLEAR:
> > > + code1 = BIT_AND_EXPR;
> > > + inverted = true;
> > > + break;
> > > + case INDUCTION_BIT_IOR:
> > > + code1 = BIT_IOR_EXPR;
> > > + break;
> > > + case INDUCTION_BIT_RESET:
> > > + code1 = BIT_XOR_EXPR;
> > > + break;
> > > + /* A ^= ~(1 << bit) is special, when loop tripcount is even,
> > > + it's equal to A ^= bits, else A ^= ~bits. */
> > > + case INDUCTION_BIT_XOR:
> > > + code1 = BIT_XOR_EXPR;
> > > + if (num_iter % 2 == 0)
> > > + inverted = true;
> > > + break;
> > > + default:
> > > + gcc_unreachable ();
> > > + }
> > > +
> > > + if (inverted)
> > > + bits = wi::bit_not (bits);
> > > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
> > > +}
> > > +
> > > /* Do final value replacement for LOOP, return true if we did anything. */
> > >
> > > bool
> > > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop)
> > > {
> > > gphi *phi = psi.phi ();
> > > tree rslt = PHI_RESULT (phi);
> > > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > > + tree def = phidef;
> > > if (virtual_operand_p (def))
> > > {
> > > gsi_next (&psi);
> > > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop)
> > > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
> > > &folded_casts);
> > > def = compute_overall_effect_of_inner_loop (ex_loop, def);
> > > +
> > > + /* Handle bitwise induction expression.
> > > +
> > > + .i.e.
> > > + for (int i = 0; i != 64; i+=3)
> > > + res &= ~(1UL << i);
> > > +
> > > + RES can't be analyzed out by SCEV because it is not polynomially
> > > + expressible, but in fact final value of RES can be replaced by
> > > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
> > > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
> > > + if (tree_fits_uhwi_p (niter)
> > > + && tree_to_uhwi (niter)
> >
> > that's redundant - ah, it's the not iterating case that's excluded.
> > please write it
> > as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned
> > HOST_WIDE_INT
> > and pass it that way to analyze_and_compute_bitwise_induction_effect
> Changed.
> >
> > > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U)
> > > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop,
> > > + phidef, def, niter);
> > > +
> >
> > it seems you overwrite the SCEV analysis result unconditionally here, please
> > move this inside of the following if () so it is done as fallback only
> > (or add it as && !(def = ...) to the condition)
> original def will be returned when bitwise induction fails which means
> rewriting is ok, but i think it's better to not pass original def and
> use your suggestion.
> Changed.
> >
> > > if (!tree_does_not_contain_chrecs (def)
> > > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
> > > /* Moving the computation from the loop may prolong life range
> > > --
> > > 2.18.1
> > >
>
> Here's an updated V2 patch.
+ || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
+ || header_phi->nargs != 2)
please use gimple_phi_num_args (header_phi) != 2
+ bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
+ bitwise_scev = instantiate_parameters (loop, bitwise_scev);
+
so that's now OK to get the evolution of the bit IV
+ bit_final = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
but this might not, for the final value you probably want to re-analyze the
SCEV wrt the outer loop using your original analyze_scalar_evolution_in_loop
query. Of course the final value could be also computed with
CHREC_LEFT + niter * CHREC_RIGHT, so it might be more convenient
to remove the above and re-formulate the check it is used in after
+ wide_int bits = wi::zero (TYPE_PRECISION (type));
+ HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
+ HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
?
OK with that changes.
Thanks,
Richard.
>
>
> --
> BR,
> Hongtao
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction.
2022-05-13 11:16 ` Richard Biener
@ 2022-05-18 2:45 ` Hongtao Liu
2022-05-18 6:43 ` Richard Biener
0 siblings, 1 reply; 6+ messages in thread
From: Hongtao Liu @ 2022-05-18 2:45 UTC (permalink / raw)
To: Richard Biener; +Cc: liuhongt, GCC Patches
[-- Attachment #1: Type: text/plain, Size: 30659 bytes --]
On Fri, May 13, 2022 at 7:16 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Fri, May 13, 2022 at 5:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > This patch will enable below optimization:
> > > >
> > > > {
> > > > - int bit;
> > > > - long long unsigned int _1;
> > > > - long long unsigned int _2;
> > > > -
> > > > <bb 2> [local count: 46707768]:
> > > > -
> > > > - <bb 3> [local count: 1027034057]:
> > > > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
> > > > - # bit_13 = PHI <bit_9(3), 63(2)>
> > > > - _1 = 1 << bit_13;
> > > > - _2 = ~_1;
> > > > - tmp_8 = _2 & tmp_11;
> > > > - bit_9 = bit_13 + -3;
> > > > - if (bit_9 != -3(OVF))
> > > > - goto <bb 3>; [95.65%]
> > > > - else
> > > > - goto <bb 4>; [4.35%]
> > > > -
> > > > - <bb 4> [local count: 46707768]:
> > > > - return tmp_8;
> > > > + tmp_12 = tmp_6(D) & 7905747460161236406;
> > > > + return tmp_12;
> > > >
> > > > }
> > > >
> > > >
> > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> > > > Ok for trunk?
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR middle-end/103462
> > > > * match.pd (bitwise_induction_p): New match.
> > > > * tree-scalar-evolution.c (gimple_bitwise_induction_p):
> > > > Declare.
> > > > (analyze_and_compute_bitwise_induction_effect): New function.
> > > > (enum bit_op_kind): New enum.
> > > > (final_value_replacement_loop): Enhanced to handle bitwise
> > > > induction.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > * gcc.target/i386/pr103462-1.c: New test.
> > > > * gcc.target/i386/pr103462-2.c: New test.
> > > > * gcc.target/i386/pr103462-3.c: New test.
> > > > * gcc.target/i386/pr103462-4.c: New test.
> > > > * gcc.target/i386/pr103462-5.c: New test.
> > > > * gcc.target/i386/pr103462-6.c: New test.
> > > > ---
> > > > gcc/match.pd | 7 +
> > > > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
> > > > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
> > > > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
> > > > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
> > > > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
> > > > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
> > > > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++-
> > > > 8 files changed, 654 insertions(+), 1 deletion(-)
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > >
> > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > index 6d691d302b3..24ff5f9e6a8 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -7746,3 +7746,10 @@ and,
> > > > == TYPE_UNSIGNED (TREE_TYPE (@3))))
> > > > && single_use (@4)
> > > > && single_use (@5))))
> > > > +
> > > > +(for bit_op (bit_and bit_ior bit_xor)
> > > > + (match (bitwise_induction_p @0 @2 @3)
> > > > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3)))
> > > > +
> > > > +(match (bitwise_induction_p @0 @2 @3)
> > > > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > > new file mode 100644
> > > > index 00000000000..1dc4c2acad6
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > > @@ -0,0 +1,111 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > + tmp &= ~(1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo1 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > + tmp &= ~(1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo2 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > + tmp &= (1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo3 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > + tmp &= (1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo4 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > + tmp |= ~(1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo5 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > + tmp |= ~(1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo6 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > + tmp |= (1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo7 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > + tmp |= (1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo8 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > + tmp ^= ~(1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo9 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > + tmp ^= ~(1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo10 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > + tmp ^= (1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned long long
> > > > +__attribute__((noipa))
> > > > +foo11 (unsigned long long tmp)
> > > > +{
> > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > + tmp ^= (1ULL << bit);
> > > > + return tmp;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > > new file mode 100644
> > > > index 00000000000..bc375cb78d4
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > > @@ -0,0 +1,45 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O1" } */
> > > > +
> > > > +#include "pr103462-1.c"
> > > > +
> > > > +int main()
> > > > +{
> > > > + unsigned long long tmp = 0x1111111111111111ULL;
> > > > + if (foo (tmp) != 0x110110110110110ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo1 (tmp) != 0x110110110110110ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo2 (tmp) != 0x0ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo3 (tmp) != 0x0ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo4 (tmp) != 0xffffffffffffffffULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo5 (tmp) != 0xffffffffffffffffULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo6 (tmp) != 0x9359359359359359ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo7 (tmp) != 0x9359359359359359ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo8 (tmp) != 0x8358358358358358ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo9 (tmp) != 0x8358358358358358ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo10 (tmp) != 0x8358358358358358ULL)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo11 (tmp) != 0x8358358358358358ULL)
> > > > + __builtin_abort ();
> > > > +}
> > > > +
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > > new file mode 100644
> > > > index 00000000000..4ba248a4872
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > > @@ -0,0 +1,111 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > + tmp &= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo1 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > + tmp &= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo2 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > + tmp &= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo3 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > + tmp &= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo4 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > + tmp |= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo5 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > + tmp |= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo6 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > + tmp |= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo7 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > + tmp |= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo8 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > + tmp ^= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo9 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > + tmp ^= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo10 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > + tmp ^= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned int
> > > > +__attribute__((noipa))
> > > > +foo11 (unsigned int tmp)
> > > > +{
> > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > + tmp ^= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > > new file mode 100644
> > > > index 00000000000..e2f4056f525
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > > @@ -0,0 +1,46 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O1" } */
> > > > +
> > > > +#include "pr103462-3.c"
> > > > +
> > > > +int main()
> > > > +{
> > > > + unsigned int tmp = 0x11111111U;
> > > > +
> > > > + if (foo (tmp) != 0x10110110U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo1 (tmp) != 0x1101101U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo2 (tmp) != 0x0U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo3 (tmp) != 0x0U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo4 (tmp) != 0xffffffffU)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo5 (tmp) != 0xffffffffU)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo6 (tmp) != 0x59359359U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo7 (tmp) != 0x93593593U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo8 (tmp) != 0xa7ca7ca7U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo9 (tmp) != 0x7ca7ca7cU)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo10 (tmp) != 0x58358358U)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo11 (tmp) != 0x83583583U)
> > > > + __builtin_abort ();
> > > > +}
> > > > +
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > > new file mode 100644
> > > > index 00000000000..1f4ffa34b48
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > > @@ -0,0 +1,111 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > + tmp &= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo1 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > + tmp &= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo2 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > + tmp &= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo3 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > + tmp &= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo4 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > + tmp |= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo5 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > + tmp |= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo6 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > + tmp |= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo7 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > + tmp |= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo8 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > + tmp ^= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo9 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > + tmp ^= ~(1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo10 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > + tmp ^= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > +
> > > > +unsigned short
> > > > +__attribute__((noipa))
> > > > +foo11 (unsigned short tmp)
> > > > +{
> > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > + tmp ^= (1U << bit);
> > > > + return tmp;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > > new file mode 100644
> > > > index 00000000000..65426d71efe
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > > @@ -0,0 +1,46 @@
> > > > +/* { dg-do run } */
> > > > +/* { dg-options "-O1" } */
> > > > +
> > > > +#include "pr103462-5.c"
> > > > +
> > > > +int main()
> > > > +{
> > > > + unsigned short tmp = 0x1111U;
> > > > +
> > > > + if (foo (tmp) != 0x110)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo1 (tmp) != 0x110)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo2 (tmp) != 0x0)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo3 (tmp) != 0x0)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo4 (tmp) != 0xffff)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo5 (tmp) != 0xffff)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo6 (tmp) != 0x9359)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo7 (tmp) != 0x9359)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo8 (tmp) != 0x8358)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo9 (tmp) != 0x8358)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo10 (tmp) != 0x8358)
> > > > + __builtin_abort ();
> > > > +
> > > > + if (foo11 (tmp) != 0x8358)
> > > > + __builtin_abort ();
> > > > +}
> > > > +
> > > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > > > index 72ceb4001e3..9b0aec4fd09 100644
> > > > --- a/gcc/tree-scalar-evolution.cc
> > > > +++ b/gcc/tree-scalar-evolution.cc
> > > > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr)
> > > > || expanded_size > cache.elements ());
> > > > }
> > > >
> > > > +/* Match.pd function to match bitwise inductive expression.
> > > > + .i.e.
> > > > + _2 = 1 << _1;
> > > > + _3 = ~_2;
> > > > + tmp_9 = _3 & tmp_12; */
> > > > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
> > > > +
> > > > +/* Return the inductive expression of bitwise operation if possible,
> > > > + otherwise returns DEF. */
> > > > +static tree
> > > > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop,
> > > > + class loop* loop,
> > > > + tree phidef,
> > > > + tree def,
> > > > + tree niter)
> > > > +{
> > > > + tree match_op[3],inv, bitwise_scev, bitwise_res;
> > > > + bool folded_casts = false;
> > > > + tree type = TREE_TYPE (phidef);
> > > > + gimple* header_phi = NULL;
> > >
> > > use a gphi *
> > >
> > Changed.
> > > > +
> > > > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
> > > > +
> > > > + op2 = PHI <phidef, inv>
> > > > + _1 = (int) bit_17;
> > > > + _3 = 1 << _1;
> > > > + op1 = ~_3;
> > > > + phidef = op1 & op2; */
> > > > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
> > > > + || TREE_CODE (match_op[2]) != SSA_NAME
> > > > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI
> > >
> > > || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
> > >
> > > > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2)
> > >
> > > and then use header_phi here, if you pass in the PHI to use you'll
> > > always have 2 args
> > changed.
> > >
> > > > + return def;
> > > > +
> > > > + header_phi = SSA_NAME_DEF_STMT (match_op[2]);
> > > > + if (gimple_bb (header_phi) != loop->header)
> > > > + return def;
> > >
> > > I think passing in the PHI node and the exit edge instead of phidef
> > > would make the above more clear
> > Remove the upper condition.
> > >
> > > > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
> > > > + return def;
> > > > +
> > > > + inv = gimple_phi_arg_def (header_phi, 0);
> > > > + if (inv == phidef)
> > > > + inv = gimple_phi_arg_def (header_phi, 1);
> > >
> > > inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop));
> > >
> > > where is this used?
> > Move to the place where it's used.
> > >
> > > > +
> > > > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop,
> > > > + match_op[1],
> > > > + &folded_casts);
> > >
> > > you want
> > >
> > > bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
> > > bitwise_scev = instantiate_parameters (loop, bitwise_scev);
> > >
> > > instead since you are interested in in-loop behavior?
> > Yes, changed.
> > >
> > > > +
> > > > + /* Make sure bits is in range of type precision. */
> > > > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
> > >
> > > I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you
> > > do not rule out pointers or vector types.
> > Changed.
> > >
> > > > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
> > > > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
> > > > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
> > > > + return def;
> > > > +
> > > > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
> > > > + if (!tree_fits_uhwi_p (bitwise_res)
> > > > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type))
> > > > + return def;
> > >
> > > Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct?
> > > So name it bit_final?
> > Changed.
> > >
> > > > +
> > > > +enum bit_op_kind
> > > > + {
> > > > + INDUCTION_BIT_CLEAR,
> > > > + INDUCTION_BIT_IOR,
> > > > + INDUCTION_BIT_XOR,
> > > > + INDUCTION_BIT_RESET,
> > > > + INDUCTION_ZERO,
> > > > + INDUCTION_ALL
> > > > + };
> > > > +
> > > > + enum bit_op_kind induction_kind;
> > > > + enum tree_code code1
> > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
> > > > + enum tree_code code2
> > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
> > > > +
> > > > + /* BIT_CLEAR: A &= ~(1 << bit)
> > > > + BIT_RESET: A ^= (1 << bit).
> > > > + BIT_IOR: A |= (1 << bit)
> > > > + BIT_ZERO: A &= (1 << bit)
> > > > + BIT_ALL: A |= ~(1 << bit)
> > > > + BIT_XOR: A ^= ~(1 << bit).
> > > > + bit is induction variable. */
> > > > + switch (code1)
> > > > + {
> > > > + case BIT_AND_EXPR:
> > > > + induction_kind = code2 == BIT_NOT_EXPR
> > > > + ? INDUCTION_BIT_CLEAR
> > > > + : INDUCTION_ZERO;
> > > > + break;
> > > > + case BIT_IOR_EXPR:
> > > > + induction_kind = code2 == BIT_NOT_EXPR
> > > > + ? INDUCTION_ALL
> > > > + : INDUCTION_BIT_IOR;
> > > > + break;
> > > > + case BIT_XOR_EXPR:
> > > > + induction_kind = code2 == BIT_NOT_EXPR
> > > > + ? INDUCTION_BIT_XOR
> > > > + : INDUCTION_BIT_RESET;
> > > > + break;
> > > > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
> > > > + case BIT_NOT_EXPR:
> > > > + gcc_assert (code2 == BIT_XOR_EXPR);
> > > > + induction_kind = INDUCTION_BIT_XOR;
> > > > + break;
> > > > + default:
> > > > + gcc_unreachable();
> > > > + }
> > > > +
> > > > + if (induction_kind == INDUCTION_ZERO)
> > > > + return build_zero_cst (type);
> > >
> > > is that true even when the loop doesn't iterate?
> > >
> > > > + if (induction_kind == INDUCTION_ALL)
> > > > + return build_all_ones_cst (type);
> > >
> > > Likewise?
> > the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it
> > must be iterated.
> > >
> > > > +
> > > > + wide_int bits = wi::zero (TYPE_PRECISION (type));
> > > > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
> > >
> > > you checked fits_uhwi above, name it bit_start?
> > >
> > > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
> > > > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter);
> > > > +
> > > > + /* Loop tripcount should be num_iter + 1. */
> > > > + for (unsigned i = 0; i != num_iter + 1; i++)
> > > > + {
> > > > + bits = wi::bit_or (bits,
> > > > + wi::lshift (wi::one (TYPE_PRECISION (type)),
> > > > + start));
> > >
> > > you can use bits = wi::set_bit (bits, start);
> > Changed.
> > >
> > > > + start += step;
> > > > + }
> > >
> > > Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that
> > > the IV wraps and is unsigned char and you have a 256 bit integer the
> > > above could run quite a long time. Please limit num_iter to TYPE_PRECISION
> > > as well (do you have a testcase with a wrapping IV? try some != CST exit
> > > condition and a large increment)
> > Limit to > 0 and < TYPE_SIZE.
> > >
> > > > +
> > > > + bool inverted = false;
> > > > + switch (induction_kind)
> > > > + {
> > > > + case INDUCTION_BIT_CLEAR:
> > > > + code1 = BIT_AND_EXPR;
> > > > + inverted = true;
> > > > + break;
> > > > + case INDUCTION_BIT_IOR:
> > > > + code1 = BIT_IOR_EXPR;
> > > > + break;
> > > > + case INDUCTION_BIT_RESET:
> > > > + code1 = BIT_XOR_EXPR;
> > > > + break;
> > > > + /* A ^= ~(1 << bit) is special, when loop tripcount is even,
> > > > + it's equal to A ^= bits, else A ^= ~bits. */
> > > > + case INDUCTION_BIT_XOR:
> > > > + code1 = BIT_XOR_EXPR;
> > > > + if (num_iter % 2 == 0)
> > > > + inverted = true;
> > > > + break;
> > > > + default:
> > > > + gcc_unreachable ();
> > > > + }
> > > > +
> > > > + if (inverted)
> > > > + bits = wi::bit_not (bits);
> > > > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
> > > > +}
> > > > +
> > > > /* Do final value replacement for LOOP, return true if we did anything. */
> > > >
> > > > bool
> > > > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop)
> > > > {
> > > > gphi *phi = psi.phi ();
> > > > tree rslt = PHI_RESULT (phi);
> > > > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > > > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > > > + tree def = phidef;
> > > > if (virtual_operand_p (def))
> > > > {
> > > > gsi_next (&psi);
> > > > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop)
> > > > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
> > > > &folded_casts);
> > > > def = compute_overall_effect_of_inner_loop (ex_loop, def);
> > > > +
> > > > + /* Handle bitwise induction expression.
> > > > +
> > > > + .i.e.
> > > > + for (int i = 0; i != 64; i+=3)
> > > > + res &= ~(1UL << i);
> > > > +
> > > > + RES can't be analyzed out by SCEV because it is not polynomially
> > > > + expressible, but in fact final value of RES can be replaced by
> > > > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
> > > > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
> > > > + if (tree_fits_uhwi_p (niter)
> > > > + && tree_to_uhwi (niter)
> > >
> > > that's redundant - ah, it's the not iterating case that's excluded.
> > > please write it
> > > as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned
> > > HOST_WIDE_INT
> > > and pass it that way to analyze_and_compute_bitwise_induction_effect
> > Changed.
> > >
> > > > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U)
> > > > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop,
> > > > + phidef, def, niter);
> > > > +
> > >
> > > it seems you overwrite the SCEV analysis result unconditionally here, please
> > > move this inside of the following if () so it is done as fallback only
> > > (or add it as && !(def = ...) to the condition)
> > original def will be returned when bitwise induction fails which means
> > rewriting is ok, but i think it's better to not pass original def and
> > use your suggestion.
> > Changed.
> > >
> > > > if (!tree_does_not_contain_chrecs (def)
> > > > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
> > > > /* Moving the computation from the loop may prolong life range
> > > > --
> > > > 2.18.1
> > > >
> >
> > Here's an updated V2 patch.
>
> + || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
> + || header_phi->nargs != 2)
>
> please use gimple_phi_num_args (header_phi) != 2
Changed.
>
> + bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
> + bitwise_scev = instantiate_parameters (loop, bitwise_scev);
> +
>
> so that's now OK to get the evolution of the bit IV
>
> + bit_final = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
>
> but this might not, for the final value you probably want to re-analyze the
> SCEV wrt the outer loop using your original analyze_scalar_evolution_in_loop
> query. Of course the final value could be also computed with
> CHREC_LEFT + niter * CHREC_RIGHT, so it might be more convenient
> to remove the above and re-formulate the check it is used in after
>
> + wide_int bits = wi::zero (TYPE_PRECISION (type));
> + HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
> + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
>
> ?
>
How about
+ HOST_WIDE_INT bit_final = bit_start + step * niter;
+
+ /* bit_start, bit_final in range of [0,TYPE_PRECISION)
+ implies all bits are set in range. */
+ if (bit_final >= TYPE_PRECISION (type)
+ || bit_final < 0)
+ return NULL_TREE;
Also remove ex_loop from the parameter.
Updated patch.
> OK with that changes.
>
> Thanks,
> Richard.
>
> >
> >
> > --
> > BR,
> > Hongtao
--
BR,
Hongtao
[-- Attachment #2: v3-0001-Middle-end-Enhance-final_value_replacement_loop-t.patch --]
[-- Type: application/octet-stream, Size: 19428 bytes --]
From 27008cd73fc16d2e4898fead2c4425d80164d9a6 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Tue, 7 Dec 2021 15:41:52 +0800
Subject: [PATCH v3] [Middle-end] Enhance final_value_replacement_loop to
handle bitwise induction.
This patch will enable below optimization:
{
- int bit;
- long long unsigned int _1;
- long long unsigned int _2;
-
<bb 2> [local count: 46707768]:
-
- <bb 3> [local count: 1027034057]:
- # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
- # bit_13 = PHI <bit_9(3), 63(2)>
- _1 = 1 << bit_13;
- _2 = ~_1;
- tmp_8 = _2 & tmp_11;
- bit_9 = bit_13 + -3;
- if (bit_9 != -3(OVF))
- goto <bb 3>; [95.65%]
- else
- goto <bb 4>; [4.35%]
-
- <bb 4> [local count: 46707768]:
- return tmp_8;
+ tmp_12 = tmp_6(D) & 7905747460161236406;
+ return tmp_12;
}
gcc/ChangeLog:
PR middle-end/103462
* match.pd (bitwise_induction_p): New match.
* tree-scalar-evolution.c (gimple_bitwise_induction_p):
Declare.
(analyze_and_compute_bitwise_induction_effect): New function.
(enum bit_op_kind): New enum.
(final_value_replacement_loop): Enhanced to handle bitwise
induction.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr103462-1.c: New test.
* gcc.target/i386/pr103462-2.c: New test.
* gcc.target/i386/pr103462-3.c: New test.
* gcc.target/i386/pr103462-4.c: New test.
* gcc.target/i386/pr103462-5.c: New test.
* gcc.target/i386/pr103462-6.c: New test.
---
gcc/match.pd | 10 ++
gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
gcc/tree-scalar-evolution.cc | 173 ++++++++++++++++++++-
8 files changed, 652 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
diff --git a/gcc/match.pd b/gcc/match.pd
index f5efa77560c..11bf252a17b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7782,3 +7782,13 @@ and,
== TYPE_UNSIGNED (TREE_TYPE (@3))))
&& single_use (@4)
&& single_use (@5))))
+
+(for bit_op (bit_and bit_ior bit_xor)
+ (match (bitwise_induction_p @0 @2 @3)
+ (bit_op:c
+ (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2))))
+ @3)))
+
+(match (bitwise_induction_p @0 @2 @3)
+ (bit_not
+ (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
new file mode 100644
index 00000000000..1dc4c2acad6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned long long
+__attribute__((noipa))
+foo (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp &= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo1 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp &= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo2 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp &= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo3 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp &= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo4 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp |= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo5 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp |= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo6 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp |= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo7 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp |= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo8 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp ^= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo9 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp ^= ~(1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo10 (unsigned long long tmp)
+{
+ for (int bit = 0; bit < 64; bit += 3)
+ tmp ^= (1ULL << bit);
+ return tmp;
+}
+
+unsigned long long
+__attribute__((noipa))
+foo11 (unsigned long long tmp)
+{
+ for (int bit = 63; bit >= 0; bit -= 3)
+ tmp ^= (1ULL << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
new file mode 100644
index 00000000000..bc375cb78d4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
@@ -0,0 +1,45 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-1.c"
+
+int main()
+{
+ unsigned long long tmp = 0x1111111111111111ULL;
+ if (foo (tmp) != 0x110110110110110ULL)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x110110110110110ULL)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0ULL)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0ULL)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffffffffffffffffULL)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffffffffffffffffULL)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x9359359359359359ULL)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x9359359359359359ULL)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x8358358358358358ULL)
+ __builtin_abort ();
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
new file mode 100644
index 00000000000..4ba248a4872
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned int
+__attribute__((noipa))
+foo (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo1 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo2 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo3 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo4 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo5 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo6 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo7 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo8 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo9 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo10 (unsigned int tmp)
+{
+ for (int bit = 0; bit < 32; bit += 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
+
+unsigned int
+__attribute__((noipa))
+foo11 (unsigned int tmp)
+{
+ for (int bit = 31; bit >= 0; bit -= 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
new file mode 100644
index 00000000000..e2f4056f525
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-3.c"
+
+int main()
+{
+ unsigned int tmp = 0x11111111U;
+
+ if (foo (tmp) != 0x10110110U)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x1101101U)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0U)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0U)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffffffffU)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffffffffU)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x59359359U)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x93593593U)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0xa7ca7ca7U)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x7ca7ca7cU)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x58358358U)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x83583583U)
+ __builtin_abort ();
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
new file mode 100644
index 00000000000..1f4ffa34b48
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
@@ -0,0 +1,111 @@
+/* { dg-do compile } */
+/* { dg-options "-O1 -fdump-tree-sccp-details" } */
+/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
+
+unsigned short
+__attribute__((noipa))
+foo (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo1 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp &= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo2 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo3 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp &= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo4 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo5 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp |= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo6 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo7 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp |= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo8 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo9 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp ^= ~(1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo10 (unsigned short tmp)
+{
+ for (int bit = 0; bit < 16; bit += 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
+
+unsigned short
+__attribute__((noipa))
+foo11 (unsigned short tmp)
+{
+ for (int bit = 15; bit >= 0; bit -= 3)
+ tmp ^= (1U << bit);
+ return tmp;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
new file mode 100644
index 00000000000..65426d71efe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+#include "pr103462-5.c"
+
+int main()
+{
+ unsigned short tmp = 0x1111U;
+
+ if (foo (tmp) != 0x110)
+ __builtin_abort ();
+
+ if (foo1 (tmp) != 0x110)
+ __builtin_abort ();
+
+ if (foo2 (tmp) != 0x0)
+ __builtin_abort ();
+
+ if (foo3 (tmp) != 0x0)
+ __builtin_abort ();
+
+ if (foo4 (tmp) != 0xffff)
+ __builtin_abort ();
+
+ if (foo5 (tmp) != 0xffff)
+ __builtin_abort ();
+
+ if (foo6 (tmp) != 0x9359)
+ __builtin_abort ();
+
+ if (foo7 (tmp) != 0x9359)
+ __builtin_abort ();
+
+ if (foo8 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo9 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo10 (tmp) != 0x8358)
+ __builtin_abort ();
+
+ if (foo11 (tmp) != 0x8358)
+ __builtin_abort ();
+}
+
diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 72ceb4001e3..fc59d035b19 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -3487,6 +3487,154 @@ expression_expensive_p (tree expr)
|| expanded_size > cache.elements ());
}
+/* Match.pd function to match bitwise inductive expression.
+ .i.e.
+ _2 = 1 << _1;
+ _3 = ~_2;
+ tmp_9 = _3 & tmp_12; */
+extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
+
+/* Return the inductive expression of bitwise operation if possible,
+ otherwise returns DEF. */
+static tree
+analyze_and_compute_bitwise_induction_effect (class loop* loop,
+ tree phidef,
+ unsigned HOST_WIDE_INT niter)
+{
+ tree match_op[3],inv, bitwise_scev;
+ tree type = TREE_TYPE (phidef);
+ gphi* header_phi = NULL;
+
+ /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
+
+ op2 = PHI <phidef, inv>
+ _1 = (int) bit_17;
+ _3 = 1 << _1;
+ op1 = ~_3;
+ phidef = op1 & op2; */
+ if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
+ || TREE_CODE (match_op[2]) != SSA_NAME
+ || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
+ || gimple_phi_num_args (header_phi) != 2)
+ return NULL_TREE;
+
+ if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
+ return NULL_TREE;
+
+ bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
+ bitwise_scev = instantiate_parameters (loop, bitwise_scev);
+
+ /* Make sure bits is in range of type precision. */
+ if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
+ || !INTEGRAL_TYPE_P (TREE_TYPE (bitwise_scev))
+ || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
+ || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
+ || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
+ return NULL_TREE;
+
+enum bit_op_kind
+ {
+ INDUCTION_BIT_CLEAR,
+ INDUCTION_BIT_IOR,
+ INDUCTION_BIT_XOR,
+ INDUCTION_BIT_RESET,
+ INDUCTION_ZERO,
+ INDUCTION_ALL
+ };
+
+ enum bit_op_kind induction_kind;
+ enum tree_code code1
+ = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
+ enum tree_code code2
+ = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
+
+ /* BIT_CLEAR: A &= ~(1 << bit)
+ BIT_RESET: A ^= (1 << bit).
+ BIT_IOR: A |= (1 << bit)
+ BIT_ZERO: A &= (1 << bit)
+ BIT_ALL: A |= ~(1 << bit)
+ BIT_XOR: A ^= ~(1 << bit).
+ bit is induction variable. */
+ switch (code1)
+ {
+ case BIT_AND_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_BIT_CLEAR
+ : INDUCTION_ZERO;
+ break;
+ case BIT_IOR_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_ALL
+ : INDUCTION_BIT_IOR;
+ break;
+ case BIT_XOR_EXPR:
+ induction_kind = code2 == BIT_NOT_EXPR
+ ? INDUCTION_BIT_XOR
+ : INDUCTION_BIT_RESET;
+ break;
+ /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
+ case BIT_NOT_EXPR:
+ gcc_assert (code2 == BIT_XOR_EXPR);
+ induction_kind = INDUCTION_BIT_XOR;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (induction_kind == INDUCTION_ZERO)
+ return build_zero_cst (type);
+ if (induction_kind == INDUCTION_ALL)
+ return build_all_ones_cst (type);
+
+ wide_int bits = wi::zero (TYPE_PRECISION (type));
+ HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
+ HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
+ HOST_WIDE_INT bit_final = bit_start + step * niter;
+
+ /* bit_start, bit_final in range of [0,TYPE_PRECISION)
+ implies all bits are set in range. */
+ if (bit_final >= TYPE_PRECISION (type)
+ || bit_final < 0)
+ return NULL_TREE;
+
+ /* Loop tripcount should be niter + 1. */
+ for (unsigned i = 0; i != niter + 1; i++)
+ {
+ bits = wi::set_bit (bits, bit_start);
+ bit_start += step;
+ }
+
+ bool inverted = false;
+ switch (induction_kind)
+ {
+ case INDUCTION_BIT_CLEAR:
+ code1 = BIT_AND_EXPR;
+ inverted = true;
+ break;
+ case INDUCTION_BIT_IOR:
+ code1 = BIT_IOR_EXPR;
+ break;
+ case INDUCTION_BIT_RESET:
+ code1 = BIT_XOR_EXPR;
+ break;
+ /* A ^= ~(1 << bit) is special, when loop tripcount is even,
+ it's equal to A ^= bits, else A ^= ~bits. */
+ case INDUCTION_BIT_XOR:
+ code1 = BIT_XOR_EXPR;
+ if (niter % 2 == 0)
+ inverted = true;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ if (inverted)
+ bits = wi::bit_not (bits);
+
+ inv = PHI_ARG_DEF_FROM_EDGE (header_phi, loop_preheader_edge (loop));
+ return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
+}
+
/* Do final value replacement for LOOP, return true if we did anything. */
bool
@@ -3519,7 +3667,8 @@ final_value_replacement_loop (class loop *loop)
{
gphi *phi = psi.phi ();
tree rslt = PHI_RESULT (phi);
- tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+ tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
+ tree def = phidef;
if (virtual_operand_p (def))
{
gsi_next (&psi);
@@ -3537,6 +3686,28 @@ final_value_replacement_loop (class loop *loop)
def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
&folded_casts);
def = compute_overall_effect_of_inner_loop (ex_loop, def);
+
+ /* Handle bitwise induction expression.
+
+ .i.e.
+ for (int i = 0; i != 64; i+=3)
+ res &= ~(1UL << i);
+
+ RES can't be analyzed out by SCEV because it is not polynomially
+ expressible, but in fact final value of RES can be replaced by
+ RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
+ being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
+ unsigned HOST_WIDE_INT niter_num;
+ tree bit_def;
+ if (tree_fits_uhwi_p (niter)
+ && (niter_num = tree_to_uhwi (niter)) != 0
+ && niter_num < TYPE_PRECISION (TREE_TYPE (phidef))
+ && (bit_def
+ = analyze_and_compute_bitwise_induction_effect (loop,
+ phidef,
+ niter_num)))
+ def = bit_def;
+
if (!tree_does_not_contain_chrecs (def)
|| chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
/* Moving the computation from the loop may prolong life range
--
2.18.1
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction.
2022-05-18 2:45 ` Hongtao Liu
@ 2022-05-18 6:43 ` Richard Biener
0 siblings, 0 replies; 6+ messages in thread
From: Richard Biener @ 2022-05-18 6:43 UTC (permalink / raw)
To: Hongtao Liu; +Cc: liuhongt, GCC Patches
On Wed, May 18, 2022 at 4:45 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Fri, May 13, 2022 at 7:16 PM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Fri, May 13, 2022 at 5:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Wed, May 11, 2022 at 4:45 PM Richard Biener via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > On Mon, May 9, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > >
> > > > > This patch will enable below optimization:
> > > > >
> > > > > {
> > > > > - int bit;
> > > > > - long long unsigned int _1;
> > > > > - long long unsigned int _2;
> > > > > -
> > > > > <bb 2> [local count: 46707768]:
> > > > > -
> > > > > - <bb 3> [local count: 1027034057]:
> > > > > - # tmp_11 = PHI <tmp_8(3), tmp_6(D)(2)>
> > > > > - # bit_13 = PHI <bit_9(3), 63(2)>
> > > > > - _1 = 1 << bit_13;
> > > > > - _2 = ~_1;
> > > > > - tmp_8 = _2 & tmp_11;
> > > > > - bit_9 = bit_13 + -3;
> > > > > - if (bit_9 != -3(OVF))
> > > > > - goto <bb 3>; [95.65%]
> > > > > - else
> > > > > - goto <bb 4>; [4.35%]
> > > > > -
> > > > > - <bb 4> [local count: 46707768]:
> > > > > - return tmp_8;
> > > > > + tmp_12 = tmp_6(D) & 7905747460161236406;
> > > > > + return tmp_12;
> > > > >
> > > > > }
> > > > >
> > > > >
> > > > > Boostrapped and regtested on x86_64-pc-linux-gnu{-m32,}
> > > > > Ok for trunk?
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > PR middle-end/103462
> > > > > * match.pd (bitwise_induction_p): New match.
> > > > > * tree-scalar-evolution.c (gimple_bitwise_induction_p):
> > > > > Declare.
> > > > > (analyze_and_compute_bitwise_induction_effect): New function.
> > > > > (enum bit_op_kind): New enum.
> > > > > (final_value_replacement_loop): Enhanced to handle bitwise
> > > > > induction.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > * gcc.target/i386/pr103462-1.c: New test.
> > > > > * gcc.target/i386/pr103462-2.c: New test.
> > > > > * gcc.target/i386/pr103462-3.c: New test.
> > > > > * gcc.target/i386/pr103462-4.c: New test.
> > > > > * gcc.target/i386/pr103462-5.c: New test.
> > > > > * gcc.target/i386/pr103462-6.c: New test.
> > > > > ---
> > > > > gcc/match.pd | 7 +
> > > > > gcc/testsuite/gcc.target/i386/pr103462-1.c | 111 +++++++++++++
> > > > > gcc/testsuite/gcc.target/i386/pr103462-2.c | 45 ++++++
> > > > > gcc/testsuite/gcc.target/i386/pr103462-3.c | 111 +++++++++++++
> > > > > gcc/testsuite/gcc.target/i386/pr103462-4.c | 46 ++++++
> > > > > gcc/testsuite/gcc.target/i386/pr103462-5.c | 111 +++++++++++++
> > > > > gcc/testsuite/gcc.target/i386/pr103462-6.c | 46 ++++++
> > > > > gcc/tree-scalar-evolution.cc | 178 ++++++++++++++++++++-
> > > > > 8 files changed, 654 insertions(+), 1 deletion(-)
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > > >
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > index 6d691d302b3..24ff5f9e6a8 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -7746,3 +7746,10 @@ and,
> > > > > == TYPE_UNSIGNED (TREE_TYPE (@3))))
> > > > > && single_use (@4)
> > > > > && single_use (@5))))
> > > > > +
> > > > > +(for bit_op (bit_and bit_ior bit_xor)
> > > > > + (match (bitwise_induction_p @0 @2 @3)
> > > > > + (bit_op:c (nop_convert1? (bit_not2?@0 (convert3? (lshift integer_onep@1 @2)))) @3)))
> > > > > +
> > > > > +(match (bitwise_induction_p @0 @2 @3)
> > > > > + (bit_not (nop_convert1? (bit_xor@0 (convert2? (lshift integer_onep@1 @2)) @3))))
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-1.c b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > > > new file mode 100644
> > > > > index 00000000000..1dc4c2acad6
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-1.c
> > > > > @@ -0,0 +1,111 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > > + tmp &= ~(1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo1 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > > + tmp &= ~(1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo2 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > > + tmp &= (1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo3 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > > + tmp &= (1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo4 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > > + tmp |= ~(1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo5 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > > + tmp |= ~(1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo6 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > > + tmp |= (1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo7 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > > + tmp |= (1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo8 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > > + tmp ^= ~(1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo9 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > > + tmp ^= ~(1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo10 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 64; bit += 3)
> > > > > + tmp ^= (1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned long long
> > > > > +__attribute__((noipa))
> > > > > +foo11 (unsigned long long tmp)
> > > > > +{
> > > > > + for (int bit = 63; bit >= 0; bit -= 3)
> > > > > + tmp ^= (1ULL << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-2.c b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > > > new file mode 100644
> > > > > index 00000000000..bc375cb78d4
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-2.c
> > > > > @@ -0,0 +1,45 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O1" } */
> > > > > +
> > > > > +#include "pr103462-1.c"
> > > > > +
> > > > > +int main()
> > > > > +{
> > > > > + unsigned long long tmp = 0x1111111111111111ULL;
> > > > > + if (foo (tmp) != 0x110110110110110ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo1 (tmp) != 0x110110110110110ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo2 (tmp) != 0x0ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo3 (tmp) != 0x0ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo4 (tmp) != 0xffffffffffffffffULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo5 (tmp) != 0xffffffffffffffffULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo6 (tmp) != 0x9359359359359359ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo7 (tmp) != 0x9359359359359359ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo8 (tmp) != 0x8358358358358358ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo9 (tmp) != 0x8358358358358358ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo10 (tmp) != 0x8358358358358358ULL)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo11 (tmp) != 0x8358358358358358ULL)
> > > > > + __builtin_abort ();
> > > > > +}
> > > > > +
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-3.c b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > > > new file mode 100644
> > > > > index 00000000000..4ba248a4872
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-3.c
> > > > > @@ -0,0 +1,111 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > > + tmp &= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo1 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > > + tmp &= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo2 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > > + tmp &= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo3 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > > + tmp &= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo4 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > > + tmp |= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo5 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > > + tmp |= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo6 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > > + tmp |= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo7 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > > + tmp |= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo8 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > > + tmp ^= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo9 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > > + tmp ^= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo10 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 32; bit += 3)
> > > > > + tmp ^= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned int
> > > > > +__attribute__((noipa))
> > > > > +foo11 (unsigned int tmp)
> > > > > +{
> > > > > + for (int bit = 31; bit >= 0; bit -= 3)
> > > > > + tmp ^= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-4.c b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > > > new file mode 100644
> > > > > index 00000000000..e2f4056f525
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-4.c
> > > > > @@ -0,0 +1,46 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O1" } */
> > > > > +
> > > > > +#include "pr103462-3.c"
> > > > > +
> > > > > +int main()
> > > > > +{
> > > > > + unsigned int tmp = 0x11111111U;
> > > > > +
> > > > > + if (foo (tmp) != 0x10110110U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo1 (tmp) != 0x1101101U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo2 (tmp) != 0x0U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo3 (tmp) != 0x0U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo4 (tmp) != 0xffffffffU)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo5 (tmp) != 0xffffffffU)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo6 (tmp) != 0x59359359U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo7 (tmp) != 0x93593593U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo8 (tmp) != 0xa7ca7ca7U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo9 (tmp) != 0x7ca7ca7cU)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo10 (tmp) != 0x58358358U)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo11 (tmp) != 0x83583583U)
> > > > > + __builtin_abort ();
> > > > > +}
> > > > > +
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-5.c b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > > > new file mode 100644
> > > > > index 00000000000..1f4ffa34b48
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-5.c
> > > > > @@ -0,0 +1,111 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O1 -fdump-tree-sccp-details" } */
> > > > > +/* { dg-final { scan-tree-dump-times {final value replacement} 12 "sccp" } } */
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > > + tmp &= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo1 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > > + tmp &= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo2 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > > + tmp &= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo3 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > > + tmp &= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo4 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > > + tmp |= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo5 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > > + tmp |= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo6 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > > + tmp |= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo7 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > > + tmp |= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo8 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > > + tmp ^= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo9 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > > + tmp ^= ~(1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo10 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 0; bit < 16; bit += 3)
> > > > > + tmp ^= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > +
> > > > > +unsigned short
> > > > > +__attribute__((noipa))
> > > > > +foo11 (unsigned short tmp)
> > > > > +{
> > > > > + for (int bit = 15; bit >= 0; bit -= 3)
> > > > > + tmp ^= (1U << bit);
> > > > > + return tmp;
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr103462-6.c b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > > > new file mode 100644
> > > > > index 00000000000..65426d71efe
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/i386/pr103462-6.c
> > > > > @@ -0,0 +1,46 @@
> > > > > +/* { dg-do run } */
> > > > > +/* { dg-options "-O1" } */
> > > > > +
> > > > > +#include "pr103462-5.c"
> > > > > +
> > > > > +int main()
> > > > > +{
> > > > > + unsigned short tmp = 0x1111U;
> > > > > +
> > > > > + if (foo (tmp) != 0x110)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo1 (tmp) != 0x110)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo2 (tmp) != 0x0)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo3 (tmp) != 0x0)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo4 (tmp) != 0xffff)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo5 (tmp) != 0xffff)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo6 (tmp) != 0x9359)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo7 (tmp) != 0x9359)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo8 (tmp) != 0x8358)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo9 (tmp) != 0x8358)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo10 (tmp) != 0x8358)
> > > > > + __builtin_abort ();
> > > > > +
> > > > > + if (foo11 (tmp) != 0x8358)
> > > > > + __builtin_abort ();
> > > > > +}
> > > > > +
> > > > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > > > > index 72ceb4001e3..9b0aec4fd09 100644
> > > > > --- a/gcc/tree-scalar-evolution.cc
> > > > > +++ b/gcc/tree-scalar-evolution.cc
> > > > > @@ -3487,6 +3487,164 @@ expression_expensive_p (tree expr)
> > > > > || expanded_size > cache.elements ());
> > > > > }
> > > > >
> > > > > +/* Match.pd function to match bitwise inductive expression.
> > > > > + .i.e.
> > > > > + _2 = 1 << _1;
> > > > > + _3 = ~_2;
> > > > > + tmp_9 = _3 & tmp_12; */
> > > > > +extern bool gimple_bitwise_induction_p (tree, tree *, tree (*)(tree));
> > > > > +
> > > > > +/* Return the inductive expression of bitwise operation if possible,
> > > > > + otherwise returns DEF. */
> > > > > +static tree
> > > > > +analyze_and_compute_bitwise_induction_effect (class loop* ex_loop,
> > > > > + class loop* loop,
> > > > > + tree phidef,
> > > > > + tree def,
> > > > > + tree niter)
> > > > > +{
> > > > > + tree match_op[3],inv, bitwise_scev, bitwise_res;
> > > > > + bool folded_casts = false;
> > > > > + tree type = TREE_TYPE (phidef);
> > > > > + gimple* header_phi = NULL;
> > > >
> > > > use a gphi *
> > > >
> > > Changed.
> > > > > +
> > > > > + /* Match things like op2(MATCH_OP[2]), op1(MATCH_OP[1]), phidef(PHIDEF)
> > > > > +
> > > > > + op2 = PHI <phidef, inv>
> > > > > + _1 = (int) bit_17;
> > > > > + _3 = 1 << _1;
> > > > > + op1 = ~_3;
> > > > > + phidef = op1 & op2; */
> > > > > + if (!gimple_bitwise_induction_p (phidef, &match_op[0], NULL)
> > > > > + || TREE_CODE (match_op[2]) != SSA_NAME
> > > > > + || gimple_code (SSA_NAME_DEF_STMT (match_op[2])) != GIMPLE_PHI
> > > >
> > > > || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
> > > >
> > > > > + || gimple_phi_num_args (SSA_NAME_DEF_STMT (match_op[2])) != 2)
> > > >
> > > > and then use header_phi here, if you pass in the PHI to use you'll
> > > > always have 2 args
> > > changed.
> > > >
> > > > > + return def;
> > > > > +
> > > > > + header_phi = SSA_NAME_DEF_STMT (match_op[2]);
> > > > > + if (gimple_bb (header_phi) != loop->header)
> > > > > + return def;
> > > >
> > > > I think passing in the PHI node and the exit edge instead of phidef
> > > > would make the above more clear
> > > Remove the upper condition.
> > > >
> > > > > + if (PHI_ARG_DEF_FROM_EDGE (header_phi, loop_latch_edge (loop)) != phidef)
> > > > > + return def;
> > > > > +
> > > > > + inv = gimple_phi_arg_def (header_phi, 0);
> > > > > + if (inv == phidef)
> > > > > + inv = gimple_phi_arg_def (header_phi, 1);
> > > >
> > > > inv = gimple_phi_arg_def_from_edge (header_phi, loop_preheader_edge (loop));
> > > >
> > > > where is this used?
> > > Move to the place where it's used.
> > > >
> > > > > +
> > > > > + bitwise_scev = analyze_scalar_evolution_in_loop (ex_loop, loop,
> > > > > + match_op[1],
> > > > > + &folded_casts);
> > > >
> > > > you want
> > > >
> > > > bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
> > > > bitwise_scev = instantiate_parameters (loop, bitwise_scev);
> > > >
> > > > instead since you are interested in in-loop behavior?
> > > Yes, changed.
> > > >
> > > > > +
> > > > > + /* Make sure bits is in range of type precision. */
> > > > > + if (TREE_CODE (bitwise_scev) != POLYNOMIAL_CHREC
> > > >
> > > > I think you want to test || !INTEGRAL_TYPE_P (bitwise_scev) since you
> > > > do not rule out pointers or vector types.
> > > Changed.
> > > >
> > > > > + || !tree_fits_uhwi_p (CHREC_LEFT (bitwise_scev))
> > > > > + || tree_to_uhwi (CHREC_LEFT (bitwise_scev)) >= TYPE_PRECISION (type)
> > > > > + || !tree_fits_shwi_p (CHREC_RIGHT (bitwise_scev)))
> > > > > + return def;
> > > > > +
> > > > > + bitwise_res = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
> > > > > + if (!tree_fits_uhwi_p (bitwise_res)
> > > > > + || tree_to_uhwi (bitwise_res) >= TYPE_PRECISION (type))
> > > > > + return def;
> > > >
> > > > Hmm, so I think that bitwise_res is the final value of the 'bit' IV, correct?
> > > > So name it bit_final?
> > > Changed.
> > > >
> > > > > +
> > > > > +enum bit_op_kind
> > > > > + {
> > > > > + INDUCTION_BIT_CLEAR,
> > > > > + INDUCTION_BIT_IOR,
> > > > > + INDUCTION_BIT_XOR,
> > > > > + INDUCTION_BIT_RESET,
> > > > > + INDUCTION_ZERO,
> > > > > + INDUCTION_ALL
> > > > > + };
> > > > > +
> > > > > + enum bit_op_kind induction_kind;
> > > > > + enum tree_code code1
> > > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (phidef));
> > > > > + enum tree_code code2
> > > > > + = gimple_assign_rhs_code (SSA_NAME_DEF_STMT (match_op[0]));
> > > > > +
> > > > > + /* BIT_CLEAR: A &= ~(1 << bit)
> > > > > + BIT_RESET: A ^= (1 << bit).
> > > > > + BIT_IOR: A |= (1 << bit)
> > > > > + BIT_ZERO: A &= (1 << bit)
> > > > > + BIT_ALL: A |= ~(1 << bit)
> > > > > + BIT_XOR: A ^= ~(1 << bit).
> > > > > + bit is induction variable. */
> > > > > + switch (code1)
> > > > > + {
> > > > > + case BIT_AND_EXPR:
> > > > > + induction_kind = code2 == BIT_NOT_EXPR
> > > > > + ? INDUCTION_BIT_CLEAR
> > > > > + : INDUCTION_ZERO;
> > > > > + break;
> > > > > + case BIT_IOR_EXPR:
> > > > > + induction_kind = code2 == BIT_NOT_EXPR
> > > > > + ? INDUCTION_ALL
> > > > > + : INDUCTION_BIT_IOR;
> > > > > + break;
> > > > > + case BIT_XOR_EXPR:
> > > > > + induction_kind = code2 == BIT_NOT_EXPR
> > > > > + ? INDUCTION_BIT_XOR
> > > > > + : INDUCTION_BIT_RESET;
> > > > > + break;
> > > > > + /* A ^ ~(1 << bit) is equal to ~(A ^ (1 << bit)). */
> > > > > + case BIT_NOT_EXPR:
> > > > > + gcc_assert (code2 == BIT_XOR_EXPR);
> > > > > + induction_kind = INDUCTION_BIT_XOR;
> > > > > + break;
> > > > > + default:
> > > > > + gcc_unreachable();
> > > > > + }
> > > > > +
> > > > > + if (induction_kind == INDUCTION_ZERO)
> > > > > + return build_zero_cst (type);
> > > >
> > > > is that true even when the loop doesn't iterate?
> > > >
> > > > > + if (induction_kind == INDUCTION_ALL)
> > > > > + return build_all_ones_cst (type);
> > > >
> > > > Likewise?
> > > the niter is known > 0 and < TYPE_PRECISION (TYPE_SIZE (type), so it
> > > must be iterated.
> > > >
> > > > > +
> > > > > + wide_int bits = wi::zero (TYPE_PRECISION (type));
> > > > > + HOST_WIDE_INT start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
> > > >
> > > > you checked fits_uhwi above, name it bit_start?
> > > >
> > > > > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
> > > > > + unsigned HOST_WIDE_INT num_iter = tree_to_uhwi (niter);
> > > > > +
> > > > > + /* Loop tripcount should be num_iter + 1. */
> > > > > + for (unsigned i = 0; i != num_iter + 1; i++)
> > > > > + {
> > > > > + bits = wi::bit_or (bits,
> > > > > + wi::lshift (wi::one (TYPE_PRECISION (type)),
> > > > > + start));
> > > >
> > > > you can use bits = wi::set_bit (bits, start);
> > > Changed.
> > > >
> > > > > + start += step;
> > > > > + }
> > > >
> > > > Uh. You don't limit 'num_iter' but you limit bit_final. So supposed that
> > > > the IV wraps and is unsigned char and you have a 256 bit integer the
> > > > above could run quite a long time. Please limit num_iter to TYPE_PRECISION
> > > > as well (do you have a testcase with a wrapping IV? try some != CST exit
> > > > condition and a large increment)
> > > Limit to > 0 and < TYPE_SIZE.
> > > >
> > > > > +
> > > > > + bool inverted = false;
> > > > > + switch (induction_kind)
> > > > > + {
> > > > > + case INDUCTION_BIT_CLEAR:
> > > > > + code1 = BIT_AND_EXPR;
> > > > > + inverted = true;
> > > > > + break;
> > > > > + case INDUCTION_BIT_IOR:
> > > > > + code1 = BIT_IOR_EXPR;
> > > > > + break;
> > > > > + case INDUCTION_BIT_RESET:
> > > > > + code1 = BIT_XOR_EXPR;
> > > > > + break;
> > > > > + /* A ^= ~(1 << bit) is special, when loop tripcount is even,
> > > > > + it's equal to A ^= bits, else A ^= ~bits. */
> > > > > + case INDUCTION_BIT_XOR:
> > > > > + code1 = BIT_XOR_EXPR;
> > > > > + if (num_iter % 2 == 0)
> > > > > + inverted = true;
> > > > > + break;
> > > > > + default:
> > > > > + gcc_unreachable ();
> > > > > + }
> > > > > +
> > > > > + if (inverted)
> > > > > + bits = wi::bit_not (bits);
> > > > > + return fold_build2 (code1, type, inv, wide_int_to_tree (type, bits));
> > > > > +}
> > > > > +
> > > > > /* Do final value replacement for LOOP, return true if we did anything. */
> > > > >
> > > > > bool
> > > > > @@ -3519,7 +3677,8 @@ final_value_replacement_loop (class loop *loop)
> > > > > {
> > > > > gphi *phi = psi.phi ();
> > > > > tree rslt = PHI_RESULT (phi);
> > > > > - tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > > > > + tree phidef = PHI_ARG_DEF_FROM_EDGE (phi, exit);
> > > > > + tree def = phidef;
> > > > > if (virtual_operand_p (def))
> > > > > {
> > > > > gsi_next (&psi);
> > > > > @@ -3537,6 +3696,23 @@ final_value_replacement_loop (class loop *loop)
> > > > > def = analyze_scalar_evolution_in_loop (ex_loop, loop, def,
> > > > > &folded_casts);
> > > > > def = compute_overall_effect_of_inner_loop (ex_loop, def);
> > > > > +
> > > > > + /* Handle bitwise induction expression.
> > > > > +
> > > > > + .i.e.
> > > > > + for (int i = 0; i != 64; i+=3)
> > > > > + res &= ~(1UL << i);
> > > > > +
> > > > > + RES can't be analyzed out by SCEV because it is not polynomially
> > > > > + expressible, but in fact final value of RES can be replaced by
> > > > > + RES & CONSTANT where CONSTANT all ones with bit {0,3,6,9,... ,63}
> > > > > + being cleared, similar for BIT_IOR_EXPR/BIT_XOR_EXPR. */
> > > > > + if (tree_fits_uhwi_p (niter)
> > > > > + && tree_to_uhwi (niter)
> > > >
> > > > that's redundant - ah, it's the not iterating case that's excluded.
> > > > please write it
> > > > as tree_to_uhwi (niter) != 0, also consider assigning to an unsigned
> > > > HOST_WIDE_INT
> > > > and pass it that way to analyze_and_compute_bitwise_induction_effect
> > > Changed.
> > > >
> > > > > + && tree_to_uhwi (niter) != HOST_WIDE_INT_M1U)
> > > > > + def = analyze_and_compute_bitwise_induction_effect (ex_loop, loop,
> > > > > + phidef, def, niter);
> > > > > +
> > > >
> > > > it seems you overwrite the SCEV analysis result unconditionally here, please
> > > > move this inside of the following if () so it is done as fallback only
> > > > (or add it as && !(def = ...) to the condition)
> > > original def will be returned when bitwise induction fails which means
> > > rewriting is ok, but i think it's better to not pass original def and
> > > use your suggestion.
> > > Changed.
> > > >
> > > > > if (!tree_does_not_contain_chrecs (def)
> > > > > || chrec_contains_symbols_defined_in_loop (def, ex_loop->num)
> > > > > /* Moving the computation from the loop may prolong life range
> > > > > --
> > > > > 2.18.1
> > > > >
> > >
> > > Here's an updated V2 patch.
> >
> > + || !(header_phi = dyn_cast <gphi *> (SSA_NAME_DEF_STMT (match_op[2])))
> > + || header_phi->nargs != 2)
> >
> > please use gimple_phi_num_args (header_phi) != 2
> Changed.
> >
> > + bitwise_scev = analyze_scalar_evolution (loop, match_op[1]);
> > + bitwise_scev = instantiate_parameters (loop, bitwise_scev);
> > +
> >
> > so that's now OK to get the evolution of the bit IV
> >
> > + bit_final = compute_overall_effect_of_inner_loop (ex_loop, bitwise_scev);
> >
> > but this might not, for the final value you probably want to re-analyze the
> > SCEV wrt the outer loop using your original analyze_scalar_evolution_in_loop
> > query. Of course the final value could be also computed with
> > CHREC_LEFT + niter * CHREC_RIGHT, so it might be more convenient
> > to remove the above and re-formulate the check it is used in after
> >
> > + wide_int bits = wi::zero (TYPE_PRECISION (type));
> > + HOST_WIDE_INT bit_start = tree_to_shwi (CHREC_LEFT (bitwise_scev));
> > + HOST_WIDE_INT step = tree_to_shwi (CHREC_RIGHT (bitwise_scev));
> >
> > ?
> >
> How about
> + HOST_WIDE_INT bit_final = bit_start + step * niter;
> +
> + /* bit_start, bit_final in range of [0,TYPE_PRECISION)
> + implies all bits are set in range. */
> + if (bit_final >= TYPE_PRECISION (type)
> + || bit_final < 0)
> + return NULL_TREE;
>
> Also remove ex_loop from the parameter.
OK.
Thanks,
Richard.
> Updated patch.
> > OK with that changes.
> >
> > Thanks,
> > Richard.
> >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao
>
>
>
> --
> BR,
> Hongtao
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2022-05-18 6:43 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-09 5:19 [PATCH] [Middle-end] Enhance final_value_replacement_loop to handle bitwise induction liuhongt
2022-05-11 8:45 ` Richard Biener
2022-05-13 3:37 ` Hongtao Liu
2022-05-13 11:16 ` Richard Biener
2022-05-18 2:45 ` Hongtao Liu
2022-05-18 6:43 ` Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).