* [PATCH] tree-optimization/103345: Improved load merging
@ 2021-11-22 8:39 Roger Sayle
2021-11-22 12:28 ` Richard Biener
2021-11-24 8:35 ` [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376] Jakub Jelinek
0 siblings, 2 replies; 7+ messages in thread
From: Roger Sayle @ 2021-11-22 8:39 UTC (permalink / raw)
To: 'GCC Patches'
[-- Attachment #1: Type: text/plain, Size: 1100 bytes --]
This patch implements PR tree-optimization/103345 to merge adjacent
loads when combined with addition or bitwise xor. The current code
in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in
the same way as BIT_IOR_EXPR. Many thanks to Andrew Pinski for
pointing out that this also resolves PR target/98953.
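All three operators can be treated alike here because, after the
shifts, each loaded byte occupies a disjoint bit range, and whenever
two operands share no set bits, |, ^ and + produce identical results.
A minimal standalone check of that identity (an illustration only, not
part of the patch):

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    uint32_t lo = 0x12;                  /* byte 0 of the merged load */
    uint32_t hi = (uint32_t) 0x34 << 8;  /* byte 1, disjoint bit range */
    assert ((lo & hi) == 0);             /* no set bit in common...    */
    assert ((lo | hi) == (lo ^ hi));     /* ...so ior, xor...          */
    assert ((lo ^ hi) == (lo + hi));     /* ...and plus all agree      */
    return 0;
  }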
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures. The new testcases should also
pass (but haven't been tested) on other endian targets.
Ok for mainline?
2021-11-22 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR tree-optimization/98953
PR tree-optimization/103345
* gimple-ssa-store-merging.c (find_bswap_or_nop_1): Handle
BIT_XOR_EXPR and PLUS_EXPR the same as BIT_IOR_EXPR.
(pass_optimize_bswap::execute): Likewise.
gcc/testsuite/ChangeLog
PR tree-optimization/98953
PR tree-optimization/103345
* gcc.dg/tree-ssa/pr98953.c: New test case.
* gcc.dg/tree-ssa/pr103345.c: New test case.
Thanks in advance,
Roger
--
[-- Attachment #2: patchx.txt --]
[-- Type: text/plain, Size: 3381 bytes --]
diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c
index 4efa200..1740c9e 100644
--- a/gcc/gimple-ssa-store-merging.c
+++ b/gcc/gimple-ssa-store-merging.c
@@ -742,10 +742,7 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
struct symbolic_number n1, n2;
gimple *source_stmt, *source_stmt2;
- if (code != BIT_IOR_EXPR)
- return NULL;
-
- if (TREE_CODE (rhs2) != SSA_NAME)
+ if (!rhs2 || TREE_CODE (rhs2) != SSA_NAME)
return NULL;
rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
@@ -753,6 +750,8 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
switch (code)
{
case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ case PLUS_EXPR:
source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
if (!source_stmt1)
@@ -1495,6 +1494,8 @@ pass_optimize_bswap::execute (function *fun)
continue;
/* Fall through. */
case BIT_IOR_EXPR:
+ case BIT_XOR_EXPR:
+ case PLUS_EXPR:
break;
case CONSTRUCTOR:
{
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c b/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c
new file mode 100644
index 0000000..94388b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-bswap-details" } */
+
+typedef unsigned int uint32_t;
+typedef unsigned char uint8_t;
+
+uint32_t load_le_32_or(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return ((uint32_t)ptr[0]) |
+ ((uint32_t)ptr[1] << 8) |
+ ((uint32_t)ptr[2] << 16) |
+ ((uint32_t)ptr[3] << 24);
+#else
+ return ((uint32_t)ptr[3]) |
+ ((uint32_t)ptr[2] << 8) |
+ ((uint32_t)ptr[1] << 16) |
+ ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+uint32_t load_le_32_add(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return ((uint32_t)ptr[0]) +
+ ((uint32_t)ptr[1] << 8) +
+ ((uint32_t)ptr[2] << 16) +
+ ((uint32_t)ptr[3] << 24);
+#else
+ return ((uint32_t)ptr[3]) +
+ ((uint32_t)ptr[2] << 8) +
+ ((uint32_t)ptr[1] << 16) +
+ ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+uint32_t load_le_32_xor(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return ((uint32_t)ptr[0]) ^
+ ((uint32_t)ptr[1] << 8) ^
+ ((uint32_t)ptr[2] << 16) ^
+ ((uint32_t)ptr[3] << 24);
+#else
+ return ((uint32_t)ptr[3]) ^
+ ((uint32_t)ptr[2] << 8) ^
+ ((uint32_t)ptr[1] << 16) ^
+ ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in target endianness found" 3 "bswap" } } */
+
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c b/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c
new file mode 100644
index 0000000..7687dc2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-bswap-details" } */
+
+int foo(unsigned char *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return ptr[0] + (ptr[1] << 8);
+#else
+ return ptr[1] + (ptr[0] << 8);
+#endif
+}
+
+/* { dg-final { scan-tree-dump "16 bit load in target endianness found" "bswap" } } */
+
* Re: [PATCH] tree-optimization/103345: Improved load merging
2021-11-22 8:39 [PATCH] tree-optimization/103345: Improved load merging Roger Sayle
@ 2021-11-22 12:28 ` Richard Biener
2021-11-24 8:35 ` [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376] Jakub Jelinek
1 sibling, 0 replies; 7+ messages in thread
From: Richard Biener @ 2021-11-22 12:28 UTC (permalink / raw)
To: Roger Sayle; +Cc: GCC Patches
On Mon, Nov 22, 2021 at 9:40 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch implements PR tree-optimization/103345 to merge adjacent
> loads when combined with addition or bitwise xor. The current code
> in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in
> the same way as BIT_IOR_EXPR. Many thanks to Andrew Pinski for
> pointing out that this also resolves PR target/98953.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures. The new testcases should also
> pass (but haven't been tested) on other endian targets.
>
> Ok for mainline?
OK.
Thanks,
Richard.
>
> 2021-11-22 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> PR tree-optimization/98953
> PR tree-optimization/103345
> * gimple-ssa-store-merging.c (find_bswap_or_nop_1): Handle
> BIT_XOR_EXPR and PLUS_EXPR the same as BIT_IOR_EXPR.
> (pass_optimize_bswap::execute): Likewise.
>
> gcc/testsuite/ChangeLog
> PR tree-optimization/98953
> PR tree-optimization/103345
> * gcc.dg/tree-ssa/pr98953.c: New test case.
> * gcc.dg/tree-ssa/pr103345.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>
* [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376]
2021-11-22 8:39 [PATCH] tree-optimization/103345: Improved load merging Roger Sayle
2021-11-22 12:28 ` Richard Biener
@ 2021-11-24 8:35 ` Jakub Jelinek
2021-11-24 8:45 ` Richard Biener
1 sibling, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2021-11-24 8:35 UTC (permalink / raw)
To: Richard Biener, Roger Sayle; +Cc: gcc-patches
On Mon, Nov 22, 2021 at 08:39:42AM -0000, Roger Sayle wrote:
> This patch implements PR tree-optimization/103345 to merge adjacent
> loads when combined with addition or bitwise xor. The current code
> in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in
> the same way as BIT_IOR_EXPR.
Unfortunately they aren't exactly the same. They behave identically
only when at least one operand (or the corresponding byte in it) is
known to be 0: 0 | 0 = 0 ^ 0 = 0 + 0 = 0. But | additionally
satisfies x | x = x for any other x, so perform_symbolic_merge has
been accepting either that at least one of the bytes is 0 or that
both are the same; the latter is wrong for ^ and +.
The following patch fixes that by passing the code of the binary
operation through to perform_symbolic_merge and allowing non-zero
masked1 == masked2 only for BIT_IOR_EXPR.
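To make the rule concrete: each byte of a symbolic number's n field is
a marker (0 meaning the byte is known to be zero, MARKER_BYTE_UNKNOWN =
0xff meaning unknown), and the per-byte acceptance check after this
patch amounts to the following standalone sketch (not the GCC sources;
the op_code enum is a stand-in for the real tree codes):

  #include <stdbool.h>
  #include <stdint.h>

  enum op_code { OP_IOR, OP_XOR, OP_PLUS };  /* stand-ins for tree codes */

  bool
  byte_merge_ok (uint8_t masked1, uint8_t masked2, enum op_code code)
  {
    /* If either byte is known zero: 0 | x == 0 ^ x == 0 + x == x.  */
    if (masked1 == 0 || masked2 == 0)
      return true;
    /* Only ior tolerates two identical non-zero markers: x | x == x.  */
    return code == OP_IOR && masked1 == masked2;
  }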
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
We could allow the masked1 == masked2 case for it, but we would need
to do something different than the
n->n = n1->n | n2->n;
we do on all the bytes together.
In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
both variants are the same) and masked1 != 0xff, we would need to
clear the corresponding n->n byte instead of setting it to the input,
as x ^ x = 0 (but if we don't know what x and y are, x ^ y is also
unknown). Now, for plus it is much harder, because not only do we
not know the result for non-zero operands, the addition can also
carry into upper bytes (e.g. 0xff + 0x01 = 0x100). So perhaps, when
the current byte has both masked1 and masked2 non-zero, set the
resulting byte to 0xff (unknown) only if the byte above it is 0 in
both operands, and set that upper byte to 0xff too.
Also, even for | we could, instead of returning NULL, just set the
resulting byte to 0xff if the two bytes differ; perhaps it will be
masked off later on.
Ok to handle that incrementally?
2021-11-24 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/103376
* gimple-ssa-store-merging.c (perform_symbolic_merge): Add CODE
argument. If CODE is not BIT_IOR_EXPR, ensure that one of masked1
or masked2 is 0.
(find_bswap_or_nop_1, find_bswap_or_nop,
imm_store_chain_info::try_coalesce_bswap): Adjust
perform_symbolic_merge callers.
* gcc.c-torture/execute/pr103376.c: New test.
--- gcc/gimple-ssa-store-merging.c.jj 2021-11-23 10:26:30.000000000 +0100
+++ gcc/gimple-ssa-store-merging.c 2021-11-23 11:49:33.806168782 +0100
@@ -434,14 +434,14 @@ find_bswap_or_nop_load (gimple *stmt, tr
return true;
}
-/* Compute the symbolic number N representing the result of a bitwise OR on 2
- symbolic number N1 and N2 whose source statements are respectively
- SOURCE_STMT1 and SOURCE_STMT2. */
+/* Compute the symbolic number N representing the result of a bitwise OR,
+ bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
+ are respectively SOURCE_STMT1 and SOURCE_STMT2. CODE is the operation. */
gimple *
perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
gimple *source_stmt2, struct symbolic_number *n2,
- struct symbolic_number *n)
+ struct symbolic_number *n, enum tree_code code)
{
int i, size;
uint64_t mask;
@@ -563,7 +563,9 @@ perform_symbolic_merge (gimple *source_s
masked1 = n1->n & mask;
masked2 = n2->n & mask;
- if (masked1 && masked2 && masked1 != masked2)
+ /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
+ has to be 0, for BIT_IOR_EXPR x | x is still x. */
+ if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
return NULL;
}
n->n = n1->n | n2->n;
@@ -769,7 +771,8 @@ find_bswap_or_nop_1 (gimple *stmt, struc
return NULL;
source_stmt
- = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
+ = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n,
+ code);
if (!source_stmt)
return NULL;
@@ -943,7 +946,8 @@ find_bswap_or_nop (gimple *stmt, struct
else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
return NULL;
ins_stmt
- = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n);
+ = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n,
+ BIT_IOR_EXPR);
if (!ins_stmt)
return NULL;
@@ -2881,7 +2885,7 @@ imm_store_chain_info::try_coalesce_bswap
end = MAX (end, info->bitpos + info->bitsize);
ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
- &this_n, &n);
+ &this_n, &n, BIT_IOR_EXPR);
if (ins_stmt == NULL)
return false;
}
--- gcc/testsuite/gcc.c-torture/execute/pr103376.c.jj 2021-11-23 12:03:38.339948150 +0100
+++ gcc/testsuite/gcc.c-torture/execute/pr103376.c 2021-11-23 12:02:44.668723595 +0100
@@ -0,0 +1,29 @@
+/* PR tree-optimization/103376 */
+
+long long a = 0x123456789abcdef0LL, f;
+int b, c, *d;
+
+__attribute__((noipa)) void
+foo (int x)
+{
+ asm volatile ("" : : "r" (x));
+}
+
+int
+main ()
+{
+ long long e;
+ e = a;
+ if (b)
+ {
+ foo (c);
+ d = (int *) 0;
+ while (*d)
+ ;
+ }
+ f = a ^ e;
+ asm volatile ("" : "+m" (f));
+ if (f != 0)
+ __builtin_abort ();
+ return 0;
+}
Jakub
* Re: [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376]
2021-11-24 8:35 ` [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376] Jakub Jelinek
@ 2021-11-24 8:45 ` Richard Biener
2021-11-25 7:54 ` [PATCH] bswap: Improve perform_symbolic_merge [PR103376] Jakub Jelinek
0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2021-11-24 8:45 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Roger Sayle, gcc-patches
On Wed, 24 Nov 2021, Jakub Jelinek wrote:
> On Mon, Nov 22, 2021 at 08:39:42AM -0000, Roger Sayle wrote:
> > This patch implements PR tree-optimization/103345 to merge adjacent
> > loads when combined with addition or bitwise xor. The current code
> > in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> > so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in
> > the same way as BIT_IOR_EXPR.
>
> Unfortunately they aren't exactly the same. They behave identically
> only when at least one operand (or the corresponding byte in it) is
> known to be 0: 0 | 0 = 0 ^ 0 = 0 + 0 = 0. But | additionally
> satisfies x | x = x for any other x, so perform_symbolic_merge has
> been accepting either that at least one of the bytes is 0 or that
> both are the same; the latter is wrong for ^ and +.
>
> The following patch fixes that by passing the code of the binary
> operation through to perform_symbolic_merge and allowing non-zero
> masked1 == masked2 only for BIT_IOR_EXPR.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK.
> Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> We could allow the masked1 == masked2 case for it, but we would need
> to do something different than the
> n->n = n1->n | n2->n;
> we do on all the bytes together.
> In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
> both variants are the same) and masked1 != 0xff, we would need to
> clear the corresponding n->n byte instead of setting it to the input,
> as x ^ x = 0 (but if we don't know what x and y are, x ^ y is also
> unknown). Now, for plus it is much harder, because not only do we
> not know the result for non-zero operands, the addition can also
> carry into upper bytes (e.g. 0xff + 0x01 = 0x100). So perhaps, when
> the current byte has both masked1 and masked2 non-zero, set the
> resulting byte to 0xff (unknown) only if the byte above it is 0 in
> both operands, and set that upper byte to 0xff too.
> Also, even for | we could, instead of returning NULL, just set the
> resulting byte to 0xff if the two bytes differ; perhaps it will be
> masked off later on.
> Ok to handle that incrementally?
Not sure if it is worth the trouble - the XOR handling sounds
straightforward at least. But sure, the merging routine could
simply be conservatively correct here.
Thanks,
Richard.
> 2021-11-24 Jakub Jelinek <jakub@redhat.com>
>
> PR tree-optimization/103376
> * gimple-ssa-store-merging.c (perform_symbolic_merge): Add CODE
> argument. If CODE is not BIT_IOR_EXPR, ensure that one of masked1
> or masked2 is 0.
> (find_bswap_or_nop_1, find_bswap_or_nop,
> imm_store_chain_info::try_coalesce_bswap): Adjust
> perform_symbolic_merge callers.
>
> * gcc.c-torture/execute/pr103376.c: New test.
>
> --- gcc/gimple-ssa-store-merging.c.jj 2021-11-23 10:26:30.000000000 +0100
> +++ gcc/gimple-ssa-store-merging.c 2021-11-23 11:49:33.806168782 +0100
> @@ -434,14 +434,14 @@ find_bswap_or_nop_load (gimple *stmt, tr
> return true;
> }
>
> -/* Compute the symbolic number N representing the result of a bitwise OR on 2
> - symbolic number N1 and N2 whose source statements are respectively
> - SOURCE_STMT1 and SOURCE_STMT2. */
> +/* Compute the symbolic number N representing the result of a bitwise OR,
> + bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
> + are respectively SOURCE_STMT1 and SOURCE_STMT2. CODE is the operation. */
>
> gimple *
> perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
> gimple *source_stmt2, struct symbolic_number *n2,
> - struct symbolic_number *n)
> + struct symbolic_number *n, enum tree_code code)
> {
> int i, size;
> uint64_t mask;
> @@ -563,7 +563,9 @@ perform_symbolic_merge (gimple *source_s
>
> masked1 = n1->n & mask;
> masked2 = n2->n & mask;
> - if (masked1 && masked2 && masked1 != masked2)
> + /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
> + has to be 0, for BIT_IOR_EXPR x | x is still x. */
> + if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
> return NULL;
> }
> n->n = n1->n | n2->n;
> @@ -769,7 +771,8 @@ find_bswap_or_nop_1 (gimple *stmt, struc
> return NULL;
>
> source_stmt
> - = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
> + = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n,
> + code);
>
> if (!source_stmt)
> return NULL;
> @@ -943,7 +946,8 @@ find_bswap_or_nop (gimple *stmt, struct
> else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
> return NULL;
> ins_stmt
> - = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n);
> + = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n,
> + BIT_IOR_EXPR);
>
> if (!ins_stmt)
> return NULL;
> @@ -2881,7 +2885,7 @@ imm_store_chain_info::try_coalesce_bswap
> end = MAX (end, info->bitpos + info->bitsize);
>
> ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
> - &this_n, &n);
> + &this_n, &n, BIT_IOR_EXPR);
> if (ins_stmt == NULL)
> return false;
> }
> --- gcc/testsuite/gcc.c-torture/execute/pr103376.c.jj 2021-11-23 12:03:38.339948150 +0100
> +++ gcc/testsuite/gcc.c-torture/execute/pr103376.c 2021-11-23 12:02:44.668723595 +0100
> @@ -0,0 +1,29 @@
> +/* PR tree-optimization/103376 */
> +
> +long long a = 0x123456789abcdef0LL, f;
> +int b, c, *d;
> +
> +__attribute__((noipa)) void
> +foo (int x)
> +{
> + asm volatile ("" : : "r" (x));
> +}
> +
> +int
> +main ()
> +{
> + long long e;
> + e = a;
> + if (b)
> + {
> + foo (c);
> + d = (int *) 0;
> + while (*d)
> + ;
> + }
> + f = a ^ e;
> + asm volatile ("" : "+m" (f));
> + if (f != 0)
> + __builtin_abort ();
> + return 0;
> +}
>
>
> Jakub
>
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)
* [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
2021-11-24 8:45 ` Richard Biener
@ 2021-11-25 7:54 ` Jakub Jelinek
2021-11-25 8:21 ` Richard Biener
0 siblings, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2021-11-25 7:54 UTC (permalink / raw)
To: Richard Biener; +Cc: Roger Sayle, gcc-patches
On Wed, Nov 24, 2021 at 09:45:16AM +0100, Richard Biener wrote:
> > Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> > We could allow the masked1 == masked2 case for it, but we would need
> > to do something different than the
> > n->n = n1->n | n2->n;
> > we do on all the bytes together.
> > In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
> > both variants are the same) and masked1 != 0xff, we would need to
> > clear the corresponding n->n byte instead of setting it to the input,
> > as x ^ x = 0 (but if we don't know what x and y are, x ^ y is also
> > unknown). Now, for plus it is much harder, because not only do we
> > not know the result for non-zero operands, the addition can also
> > carry into upper bytes (e.g. 0xff + 0x01 = 0x100). So perhaps, when
> > the current byte has both masked1 and masked2 non-zero, set the
> > resulting byte to 0xff (unknown) only if the byte above it is 0 in
> > both operands, and set that upper byte to 0xff too.
> > Also, even for | we could, instead of returning NULL, just set the
> > resulting byte to 0xff if the two bytes differ; perhaps it will be
> > masked off later on.
> > Ok to handle that incrementally?
>
> Not sure if it is worth the trouble - the XOR handling sounds
> straightforward at least. But sure, the merging routine could
> simply be conservatively correct here.
This patch implements that (except that for + it still punts, as
before, whenever both operand bytes are non-zero).
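In the same standalone style as the earlier sketch (again not the GCC
sources, with a hypothetical op_code enum standing in for the tree
codes), the refined per-byte rule is roughly:

  #include <stdint.h>

  #define UNKNOWN 0xff  /* stands in for MARKER_BYTE_UNKNOWN */

  enum op_code { OP_IOR, OP_XOR, OP_PLUS };

  /* Return the merged marker byte, or -1 when the merge must punt.  */
  int
  byte_merge (uint8_t m1, uint8_t m2, enum op_code code)
  {
    if (m1 == 0 || m2 == 0)
      return m1 | m2;            /* 0 op x == x for |, ^ and + alike */
    if (code == OP_PLUS)
      return -1;                 /* carries can spill into upper bytes */
    if (m1 == m2)
      {
        if (code == OP_IOR)
          return m1;             /* x | x == x */
        if (code == OP_XOR && m1 != UNKNOWN)
          return 0;              /* x ^ x == 0 for a known byte */
      }
    return UNKNOWN;              /* unknown; may still be masked off later */
  }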
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
2021-11-25 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/103376
* gimple-ssa-store-merging.c (perform_symbolic_merge): For
BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte to
0.
--- gcc/gimple-ssa-store-merging.c.jj 2021-11-24 09:54:37.684365460 +0100
+++ gcc/gimple-ssa-store-merging.c 2021-11-24 11:18:54.422226266 +0100
@@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
n->bytepos = n_start->bytepos;
n->type = n_start->type;
size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
+ uint64_t res_n = n1->n | n2->n;
for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
{
@@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
masked1 = n1->n & mask;
masked2 = n2->n & mask;
- /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
- has to be 0, for BIT_IOR_EXPR x | x is still x. */
- if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
- return NULL;
+ /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x. */
+ if (masked1 && masked2)
+ {
+ /* + can carry into upper bits, just punt. */
+ if (code == PLUS_EXPR)
+ return NULL;
+ /* x | x is still x. */
+ if (code == BIT_IOR_EXPR && masked1 == masked2)
+ continue;
+ if (code == BIT_XOR_EXPR)
+ {
+ /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
+ unknown values and unknown ^ unknown is unknown. */
+ if (masked1 == masked2
+ && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
+ << i * BITS_PER_MARKER))
+ {
+ res_n &= ~mask;
+ continue;
+ }
+ }
+ /* Otherwise set the byte to unknown, it might still be
+ later masked off. */
+ res_n |= mask;
+ }
}
- n->n = n1->n | n2->n;
+ n->n = res_n;
n->n_ops = n1->n_ops + n2->n_ops;
return source_stmt;
Jakub
* Re: [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
2021-11-25 7:54 ` [PATCH] bswap: Improve perform_symbolic_merge [PR103376] Jakub Jelinek
@ 2021-11-25 8:21 ` Richard Biener
2021-11-25 9:44 ` Jakub Jelinek
0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2021-11-25 8:21 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Roger Sayle, gcc-patches
On Thu, 25 Nov 2021, Jakub Jelinek wrote:
> On Wed, Nov 24, 2021 at 09:45:16AM +0100, Richard Biener wrote:
> > > Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> > > We could allow the masked1 == masked2 case for it, but we would need
> > > to do something different than the
> > > n->n = n1->n | n2->n;
> > > we do on all the bytes together.
> > > In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
> > > both variants are the same) and masked1 != 0xff, we would need to
> > > clear the corresponding n->n byte instead of setting it to the input,
> > > as x ^ x = 0 (but if we don't know what x and y are, x ^ y is also
> > > unknown). Now, for plus it is much harder, because not only do we
> > > not know the result for non-zero operands, the addition can also
> > > carry into upper bytes (e.g. 0xff + 0x01 = 0x100). So perhaps, when
> > > the current byte has both masked1 and masked2 non-zero, set the
> > > resulting byte to 0xff (unknown) only if the byte above it is 0 in
> > > both operands, and set that upper byte to 0xff too.
> > > Also, even for | we could, instead of returning NULL, just set the
> > > resulting byte to 0xff if the two bytes differ; perhaps it will be
> > > masked off later on.
> > > Ok to handle that incrementally?
> >
> > Not sure if it is worth the trouble - the XOR handling sounds
> > straightforward at least. But sure, the merging routine could
> > simply be conservatively correct here.
>
> This patch implements that (except that for + it just punts whenever
> both operand bytes aren't 0 like before).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK if you can add a testcase that exercises this "feature".
Thanks,
Richard.
> 2021-11-25 Jakub Jelinek <jakub@redhat.com>
>
> PR tree-optimization/103376
> * gimple-ssa-store-merging.c (perform_symbolic_merge): For
> BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
> punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
> For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
> byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte to
> 0.
>
> --- gcc/gimple-ssa-store-merging.c.jj 2021-11-24 09:54:37.684365460 +0100
> +++ gcc/gimple-ssa-store-merging.c 2021-11-24 11:18:54.422226266 +0100
> @@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
> n->bytepos = n_start->bytepos;
> n->type = n_start->type;
> size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
> + uint64_t res_n = n1->n | n2->n;
>
> for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
> {
> @@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
>
> masked1 = n1->n & mask;
> masked2 = n2->n & mask;
> - /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
> - has to be 0, for BIT_IOR_EXPR x | x is still x. */
> - if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
> - return NULL;
> + /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x. */
> + if (masked1 && masked2)
> + {
> + /* + can carry into upper bits, just punt. */
> + if (code == PLUS_EXPR)
> + return NULL;
> + /* x | x is still x. */
> + if (code == BIT_IOR_EXPR && masked1 == masked2)
> + continue;
> + if (code == BIT_XOR_EXPR)
> + {
> + /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
> + unknown values and unknown ^ unknown is unknown. */
> + if (masked1 == masked2
> + && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
> + << i * BITS_PER_MARKER))
> + {
> + res_n &= ~mask;
> + continue;
> + }
> + }
> + /* Otherwise set the byte to unknown, it might still be
> + later masked off. */
> + res_n |= mask;
> + }
> }
> - n->n = n1->n | n2->n;
> + n->n = res_n;
> n->n_ops = n1->n_ops + n2->n_ops;
>
> return source_stmt;
>
>
> Jakub
>
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)
* Re: [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
2021-11-25 8:21 ` Richard Biener
@ 2021-11-25 9:44 ` Jakub Jelinek
0 siblings, 0 replies; 7+ messages in thread
From: Jakub Jelinek @ 2021-11-25 9:44 UTC (permalink / raw)
To: Richard Biener; +Cc: Roger Sayle, gcc-patches
On Thu, Nov 25, 2021 at 09:21:37AM +0100, Richard Biener wrote:
> OK if you can add a testcase that exercises this "feature".
Sure, that is easy.
Here is what I've committed. f2 tests the x | x = x handling, f3
tests x | y = unknown instead of punting, f4 tests x ^ x = 0, and f5
tests x ^ y = unknown. Without the patch only f2 is optimized to
__builtin_bswap32; with the patch all of them are.
2021-11-25 Jakub Jelinek <jakub@redhat.com>
PR tree-optimization/103376
* gimple-ssa-store-merging.c (perform_symbolic_merge): For
BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte to
0.
* gcc.dg/optimize-bswapsi-7.c: New test.
--- gcc/gimple-ssa-store-merging.c.jj 2021-11-24 09:54:37.684365460 +0100
+++ gcc/gimple-ssa-store-merging.c 2021-11-24 11:18:54.422226266 +0100
@@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
n->bytepos = n_start->bytepos;
n->type = n_start->type;
size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
+ uint64_t res_n = n1->n | n2->n;
for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
{
@@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
masked1 = n1->n & mask;
masked2 = n2->n & mask;
- /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
- has to be 0, for BIT_IOR_EXPR x | x is still x. */
- if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
- return NULL;
+ /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x. */
+ if (masked1 && masked2)
+ {
+ /* + can carry into upper bits, just punt. */
+ if (code == PLUS_EXPR)
+ return NULL;
+ /* x | x is still x. */
+ if (code == BIT_IOR_EXPR && masked1 == masked2)
+ continue;
+ if (code == BIT_XOR_EXPR)
+ {
+ /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
+ unknown values and unknown ^ unknown is unknown. */
+ if (masked1 == masked2
+ && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
+ << i * BITS_PER_MARKER))
+ {
+ res_n &= ~mask;
+ continue;
+ }
+ }
+ /* Otherwise set the byte to unknown, it might still be
+ later masked off. */
+ res_n |= mask;
+ }
}
- n->n = n1->n | n2->n;
+ n->n = res_n;
n->n_ops = n1->n_ops + n2->n_ops;
return source_stmt;
--- gcc/testsuite/gcc.dg/optimize-bswapsi-7.c.jj 2021-11-25 10:36:03.847529686 +0100
+++ gcc/testsuite/gcc.dg/optimize-bswapsi-7.c 2021-11-25 10:35:46.522778192 +0100
@@ -0,0 +1,37 @@
+/* PR tree-optimization/103376 */
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap } */
+/* { dg-options "-O2 -fno-tree-vectorize -fdump-tree-optimized" } */
+/* { dg-additional-options "-march=z900" { target s390-*-* } } */
+
+static unsigned int
+f1 (unsigned int x)
+{
+ return (x << 24) | (x >> 8);
+}
+
+unsigned int
+f2 (unsigned *p)
+{
+ return ((f1 (p[0]) | (p[0] >> 8)) & 0xff000000U) | (p[0] >> 24) | ((p[0] & 0xff00U) << 8) | ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f3 (unsigned *p)
+{
+ return ((f1 (p[0]) | (p[0] & 0x00ff00ffU)) & 0xff00ff00U) | (f1 (f1 (f1 (p[0]))) & 0x00ff00ffU);
+}
+
+unsigned int
+f4 (unsigned *p)
+{
+ return (f1 (p[0]) ^ (p[0] >> 8)) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f5 (unsigned *p)
+{
+ return (((f1 (p[0]) | (p[0] >> 16)) ^ (p[0] >> 8)) & 0xffff0000U) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+/* { dg-final { scan-tree-dump-times "= __builtin_bswap32 \\\(" 4 "optimized" } } */
Jakub