* [PATCH] tree-optimization/103345: Improved load merging
From: Roger Sayle
To: 'GCC Patches'
Date: 2021-11-22  8:39 UTC

This patch implements PR tree-optimization/103345 to merge adjacent
loads when combined with addition or bitwise xor.  The current code
in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in the
same way as BIT_IOR_EXPR.  Many thanks to Andrew Pinski for pointing
out that this also resolves PR target/98953.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures.  The new testcases should also
pass (but haven't been tested) on other-endian targets.

Ok for mainline?

2021-11-22  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	PR tree-optimization/98953
	PR tree-optimization/103345
	* gimple-ssa-store-merging.c (find_bswap_or_nop_1): Handle
	BIT_XOR_EXPR and PLUS_EXPR the same as BIT_IOR_EXPR.
	(pass_optimize_bswap::execute): Likewise.

gcc/testsuite/ChangeLog
	PR tree-optimization/98953
	PR tree-optimization/103345
	* gcc.dg/tree-ssa/pr98953.c: New test case.
	* gcc.dg/tree-ssa/pr103345.c: New test case.

Thanks in advance,
Roger
--

diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c
index 4efa200..1740c9e 100644
--- a/gcc/gimple-ssa-store-merging.c
+++ b/gcc/gimple-ssa-store-merging.c
@@ -742,10 +742,7 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
       struct symbolic_number n1, n2;
       gimple *source_stmt, *source_stmt2;
 
-      if (code != BIT_IOR_EXPR)
-	return NULL;
-
-      if (TREE_CODE (rhs2) != SSA_NAME)
+      if (!rhs2 || TREE_CODE (rhs2) != SSA_NAME)
	return NULL;
 
       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
@@ -753,6 +750,8 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
       switch (code)
	{
	case BIT_IOR_EXPR:
+	case BIT_XOR_EXPR:
+	case PLUS_EXPR:
	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
 
	  if (!source_stmt1)
@@ -1495,6 +1494,8 @@ pass_optimize_bswap::execute (function *fun)
	    continue;
	  /* Fall through.  */
	case BIT_IOR_EXPR:
+	case BIT_XOR_EXPR:
+	case PLUS_EXPR:
	  break;
	case CONSTRUCTOR:
	  {
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c b/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c
new file mode 100644
index 0000000..94388b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-bswap-details" } */
+
+typedef unsigned int uint32_t;
+typedef unsigned char uint8_t;
+
+uint32_t load_le_32_or(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return ((uint32_t)ptr[0]) |
+         ((uint32_t)ptr[1] << 8) |
+         ((uint32_t)ptr[2] << 16) |
+         ((uint32_t)ptr[3] << 24);
+#else
+  return ((uint32_t)ptr[3]) |
+         ((uint32_t)ptr[2] << 8) |
+         ((uint32_t)ptr[1] << 16) |
+         ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+uint32_t load_le_32_add(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return ((uint32_t)ptr[0]) +
+         ((uint32_t)ptr[1] << 8) +
+         ((uint32_t)ptr[2] << 16) +
+         ((uint32_t)ptr[3] << 24);
+#else
+  return ((uint32_t)ptr[3]) +
+         ((uint32_t)ptr[2] << 8) +
+         ((uint32_t)ptr[1] << 16) +
+         ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+uint32_t load_le_32_xor(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return ((uint32_t)ptr[0]) ^
+         ((uint32_t)ptr[1] << 8) ^
+         ((uint32_t)ptr[2] << 16) ^
+         ((uint32_t)ptr[3] << 24);
+#else
+  return ((uint32_t)ptr[0]) ^
+         ((uint32_t)ptr[1] << 8) ^
+         ((uint32_t)ptr[2] << 16) ^
+         ((uint32_t)ptr[3] << 24);
+#endif
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in target endianness found" 3 "bswap" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c b/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c
new file mode 100644
index 0000000..7687dc2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-bswap-details" } */
+
+int foo(unsigned char *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return ptr[0] + (ptr[1] << 8);
+#else
+  return ptr[1] + (ptr[0] << 8);
+#endif
+}
+
+/* { dg-final { scan-tree-dump "16 bit load in target endianness found" "bswap" } } */
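
As a minimal standalone sketch (an illustration for the reader, not
part of the patch), the identity the patch relies on can be checked
directly: when the set bytes of two values never overlap, as with the
shifted byte loads in the testcases above, bitwise ior, bitwise xor
and plus all compute the same result, which is why the bswap pass may
treat PLUS_EXPR and BIT_XOR_EXPR exactly like BIT_IOR_EXPR here.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint8_t bytes[4] = { 0x12, 0x34, 0x56, 0x78 };
  uint32_t b0 = (uint32_t) bytes[0];
  uint32_t b1 = (uint32_t) bytes[1] << 8;
  uint32_t b2 = (uint32_t) bytes[2] << 16;
  uint32_t b3 = (uint32_t) bytes[3] << 24;

  /* Each bN occupies its own byte lane, so bI & bJ == 0 for I != J
     and addition cannot carry from one lane into another.  */
  assert ((b0 | b1 | b2 | b3) == (b0 ^ b1 ^ b2 ^ b3));
  assert ((b0 | b1 | b2 | b3) == b0 + b1 + b2 + b3);
  return 0;
}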
* Re: [PATCH] tree-optimization/103345: Improved load merging
From: Richard Biener
To: Roger Sayle; Cc: GCC Patches
Date: 2021-11-22 12:28 UTC

On Mon, Nov 22, 2021 at 9:40 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
> This patch implements PR tree-optimization/103345 to merge adjacent
> loads when combined with addition or bitwise xor.  The current code
> in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in the
> same way as BIT_IOR_EXPR.  Many thanks to Andrew Pinski for pointing
> out that this also resolves PR target/98953.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures.  The new testcases should also
> pass (but haven't been tested) on other-endian targets.
>
> Ok for mainline?

OK.

Thanks,
Richard.
* [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376]
From: Jakub Jelinek
To: Richard Biener, Roger Sayle; Cc: gcc-patches
Date: 2021-11-24  8:35 UTC

On Mon, Nov 22, 2021 at 08:39:42AM -0000, Roger Sayle wrote:
> This patch implements PR tree-optimization/103345 to merge adjacent
> loads when combined with addition or bitwise xor.  The current code
> in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in the
> same way as BIT_IOR_EXPR.

Unfortunately they aren't exactly the same.  They behave the same only
when at least one operand (or the corresponding byte in it) is known
to be 0: 0 | 0 = 0 ^ 0 = 0 + 0 = 0.  But for | we additionally have
x | x = x for any other x, so perform_symbolic_merge has been accepting
either that at least one of the two bytes is 0 or that both are the
same; that is wrong for ^ and +.

The following patch fixes that by passing the code of the binary
operation through and allowing non-zero masked1 == masked2 through only
for BIT_IOR_EXPR.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
We could allow the masked1 == masked2 case for it, but we would need
to do something different than the
  n->n = n1->n | n2->n;
we do on all the bytes together.  In particular, for masked1 == masked2,
if masked1 != 0 (well, for 0 both variants are the same) and
masked1 != 0xff, we would need to clear the corresponding n->n byte
instead of setting it to the input, as x ^ x = 0 (but if we don't know
what x and y are, the result is also unknown).  Now, for plus it is much
harder: not only do we not know the result for non-zero operands, but
the carry can modify upper bytes as well.  So for + perhaps we could,
when the current byte has masked1 && masked2, set the resulting byte to
0xff (unknown) only when the byte above it is 0 in both operands, and
set that byte above to 0xff too (to account for the possible carry).
Also, even for | we could, instead of returning NULL, just set the
resulting byte to 0xff when the two differ; perhaps it will be masked
off later on.
Ok to handle that incrementally?

2021-11-24  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/103376
	* gimple-ssa-store-merging.c (perform_symbolic_merge): Add CODE
	argument.  If CODE is not BIT_IOR_EXPR, ensure that one of
	masked1 or masked2 is 0.
	(find_bswap_or_nop_1, find_bswap_or_nop,
	imm_store_chain_info::try_coalesce_bswap): Adjust
	perform_symbolic_merge callers.

	* gcc.c-torture/execute/pr103376.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj	2021-11-23 10:26:30.000000000 +0100
+++ gcc/gimple-ssa-store-merging.c	2021-11-23 11:49:33.806168782 +0100
@@ -434,14 +434,14 @@ find_bswap_or_nop_load (gimple *stmt, tr
   return true;
 }
 
-/* Compute the symbolic number N representing the result of a bitwise OR on 2
-   symbolic number N1 and N2 whose source statements are respectively
-   SOURCE_STMT1 and SOURCE_STMT2.  */
+/* Compute the symbolic number N representing the result of a bitwise OR,
+   bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
+   are respectively SOURCE_STMT1 and SOURCE_STMT2.  CODE is the operation.  */
 
 gimple *
 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
 			gimple *source_stmt2, struct symbolic_number *n2,
-			struct symbolic_number *n)
+			struct symbolic_number *n, enum tree_code code)
 {
   int i, size;
   uint64_t mask;
@@ -563,7 +563,9 @@ perform_symbolic_merge (gimple *source_s
 
       masked1 = n1->n & mask;
       masked2 = n2->n & mask;
-      if (masked1 && masked2 && masked1 != masked2)
+      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
+	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
+      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
 	return NULL;
     }
   n->n = n1->n | n2->n;
@@ -769,7 +771,8 @@ find_bswap_or_nop_1 (gimple *stmt, struc
 	return NULL;
 
       source_stmt
-	= perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
+	= perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n,
+				  code);
 
       if (!source_stmt)
 	return NULL;
@@ -943,7 +946,8 @@ find_bswap_or_nop (gimple *stmt, struct
       else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
 	return NULL;
       ins_stmt
-	= perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n);
+	= perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n,
+				  BIT_IOR_EXPR);
 
       if (!ins_stmt)
 	return NULL;
@@ -2881,7 +2885,7 @@ imm_store_chain_info::try_coalesce_bswap
 	  end = MAX (end, info->bitpos + info->bitsize);
 
 	  ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
-					     &this_n, &n);
+					     &this_n, &n, BIT_IOR_EXPR);
 	  if (ins_stmt == NULL)
 	    return false;
 	}
--- gcc/testsuite/gcc.c-torture/execute/pr103376.c.jj	2021-11-23 12:03:38.339948150 +0100
+++ gcc/testsuite/gcc.c-torture/execute/pr103376.c	2021-11-23 12:02:44.668723595 +0100
@@ -0,0 +1,29 @@
+/* PR tree-optimization/103376 */
+
+long long a = 0x123456789abcdef0LL, f;
+int b, c, *d;
+
+__attribute__((noipa)) void
+foo (int x)
+{
+  asm volatile ("" : : "r" (x));
+}
+
+int
+main ()
+{
+  long long e;
+  e = a;
+  if (b)
+    {
+      foo (c);
+      d = (int *) 0;
+      while (*d)
+	;
+    }
+  f = a ^ e;
+  asm volatile ("" : "+m" (f));
+  if (f != 0)
+    __builtin_abort ();
+  return 0;
+}

	Jakub
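
A standalone sketch (again an illustration, not part of the patch) of
the failure mode being fixed: when both operands describe the same
bytes of the same value, | returns the value unchanged, while ^ yields
0 and + doubles it with carries across byte lanes, so accepting
non-zero masked1 == masked2 is only sound for BIT_IOR_EXPR.  The
pr103376.c testcase above triggers exactly the xor case: f = a ^ e
with e == a must be 0.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t x = 0x123456789abcdef0ULL;
  assert ((x | x) == x);  /* Idempotent; merging both sides as "x" is fine.  */
  assert ((x ^ x) == 0);  /* Not x; this is what PR103376 miscompiled.  */
  assert (x + x != x);    /* Doubled, with carries crossing byte lanes.  */
  return 0;
}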
* Re: [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376]
From: Richard Biener
To: Jakub Jelinek; Cc: Roger Sayle, gcc-patches
Date: 2021-11-24  8:45 UTC

On Wed, 24 Nov 2021, Jakub Jelinek wrote:

> Unfortunately they aren't exactly the same.
> [...]
> The following patch fixes that by passing the code of the binary
> operation through and allowing non-zero masked1 == masked2 through
> only for BIT_IOR_EXPR.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

> Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> [...]
> Ok to handle that incrementally?

Not sure if it is worth the trouble - the XOR handling sounds
straightforward at least.  But sure, the merging routine could
simply be conservatively correct here.

Thanks,
Richard.

--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)
* [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
From: Jakub Jelinek
To: Richard Biener; Cc: Roger Sayle, gcc-patches
Date: 2021-11-25  7:54 UTC

On Wed, Nov 24, 2021 at 09:45:16AM +0100, Richard Biener wrote:
> > Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> > [...]
> > Ok to handle that incrementally?
>
> Not sure if it is worth the trouble - the XOR handling sounds
> straightforward at least.  But sure, the merging routine could
> simply be conservatively correct here.

This patch implements that (except that for + it just punts whenever
both operand bytes aren't 0, as before).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-11-25  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/103376
	* gimple-ssa-store-merging.c (perform_symbolic_merge): For
	BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
	punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
	For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
	byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte
	to 0.

--- gcc/gimple-ssa-store-merging.c.jj	2021-11-24 09:54:37.684365460 +0100
+++ gcc/gimple-ssa-store-merging.c	2021-11-24 11:18:54.422226266 +0100
@@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
   n->bytepos = n_start->bytepos;
   n->type = n_start->type;
   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
+  uint64_t res_n = n1->n | n2->n;
 
   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
     {
@@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
 
       masked1 = n1->n & mask;
       masked2 = n2->n & mask;
-      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
-	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
-      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
-	return NULL;
+      /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x.  */
+      if (masked1 && masked2)
+	{
+	  /* + can carry into upper bits, just punt.  */
+	  if (code == PLUS_EXPR)
+	    return NULL;
+	  /* x | x is still x.  */
+	  if (code == BIT_IOR_EXPR && masked1 == masked2)
+	    continue;
+	  if (code == BIT_XOR_EXPR)
+	    {
+	      /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
+		 unknown values and unknown ^ unknown is unknown.  */
+	      if (masked1 == masked2
+		  && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
+				 << i * BITS_PER_MARKER))
+		{
+		  res_n &= ~mask;
+		  continue;
+		}
+	    }
+	  /* Otherwise set the byte to unknown, it might still be
+	     later masked off.  */
+	  res_n |= mask;
+	}
     }
-  n->n = n1->n | n2->n;
+  n->n = res_n;
   n->n_ops = n1->n_ops + n2->n_ops;
 
   return source_stmt;

	Jakub
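
The per-byte rules this patch implements can be summarized by the
following simplified model (a sketch for illustration, not GCC code;
merge_byte, the op enumeration and UNKNOWN are made-up names).  A
marker byte is 0 for a known zero byte, 0xff for MARKER_BYTE_UNKNOWN,
and any other value names a particular source byte; -1 stands in for
the real code returning NULL.

#include <assert.h>

enum op { OP_IOR, OP_XOR, OP_PLUS };
#define UNKNOWN 0xff

static int
merge_byte (enum op code, int m1, int m2)
{
  if (m1 == 0 || m2 == 0)	/* 0 | x == 0 ^ x == 0 + x == x.  */
    return m1 | m2;
  if (code == OP_PLUS)		/* + can carry into upper bytes: punt.  */
    return -1;
  if (code == OP_IOR && m1 == m2)
    return m1;			/* x | x is still x.  */
  if (code == OP_XOR && m1 == m2 && m1 != UNKNOWN)
    return 0;			/* x ^ x == 0 for a known byte.  */
  return UNKNOWN;		/* Might still be masked off later.  */
}

int
main (void)
{
  assert (merge_byte (OP_IOR, 3, 3) == 3);
  assert (merge_byte (OP_IOR, 3, 4) == UNKNOWN);
  assert (merge_byte (OP_XOR, 3, 3) == 0);
  assert (merge_byte (OP_XOR, UNKNOWN, UNKNOWN) == UNKNOWN);
  assert (merge_byte (OP_PLUS, 3, 3) == -1);
  return 0;
}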
* Re: [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
From: Richard Biener
To: Jakub Jelinek; Cc: Roger Sayle, gcc-patches
Date: 2021-11-25  8:21 UTC

On Thu, 25 Nov 2021, Jakub Jelinek wrote:

> This patch implements that (except that for + it just punts whenever
> both operand bytes aren't 0, as before).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK if you can add a testcase that exercises this "feature".

Thanks,
Richard.

--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)
* Re: [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
From: Jakub Jelinek
To: Richard Biener; Cc: Roger Sayle, gcc-patches
Date: 2021-11-25  9:44 UTC

On Thu, Nov 25, 2021 at 09:21:37AM +0100, Richard Biener wrote:
> OK if you can add a testcase that exercises this "feature".

Sure, that is easy.  Here is what I've committed.  f2 tests the
x | x = x handling in it, f3 tests x | y = unknown instead of punting,
f4 tests x ^ x = 0 and f5 tests x ^ y = unknown.  Without the patch
only f2 is optimized to __builtin_bswap32; with the patch all of them
are.

2021-11-25  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/103376
	* gimple-ssa-store-merging.c (perform_symbolic_merge): For
	BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
	punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
	For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
	byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte
	to 0.

	* gcc.dg/optimize-bswapsi-7.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj	2021-11-24 09:54:37.684365460 +0100
+++ gcc/gimple-ssa-store-merging.c	2021-11-24 11:18:54.422226266 +0100
@@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
   n->bytepos = n_start->bytepos;
   n->type = n_start->type;
   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
+  uint64_t res_n = n1->n | n2->n;
 
   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
     {
@@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
 
       masked1 = n1->n & mask;
       masked2 = n2->n & mask;
-      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
-	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
-      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
-	return NULL;
+      /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x.  */
+      if (masked1 && masked2)
+	{
+	  /* + can carry into upper bits, just punt.  */
+	  if (code == PLUS_EXPR)
+	    return NULL;
+	  /* x | x is still x.  */
+	  if (code == BIT_IOR_EXPR && masked1 == masked2)
+	    continue;
+	  if (code == BIT_XOR_EXPR)
+	    {
+	      /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
+		 unknown values and unknown ^ unknown is unknown.  */
+	      if (masked1 == masked2
+		  && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
+				 << i * BITS_PER_MARKER))
+		{
+		  res_n &= ~mask;
+		  continue;
+		}
+	    }
+	  /* Otherwise set the byte to unknown, it might still be
+	     later masked off.  */
+	  res_n |= mask;
+	}
     }
-  n->n = n1->n | n2->n;
+  n->n = res_n;
   n->n_ops = n1->n_ops + n2->n_ops;
 
   return source_stmt;
--- gcc/testsuite/gcc.dg/optimize-bswapsi-7.c.jj	2021-11-25 10:36:03.847529686 +0100
+++ gcc/testsuite/gcc.dg/optimize-bswapsi-7.c	2021-11-25 10:35:46.522778192 +0100
@@ -0,0 +1,37 @@
+/* PR tree-optimization/103376 */
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap } */
+/* { dg-options "-O2 -fno-tree-vectorize -fdump-tree-optimized" } */
+/* { dg-additional-options "-march=z900" { target s390-*-* } } */
+
+static unsigned int
+f1 (unsigned int x)
+{
+  return (x << 24) | (x >> 8);
+}
+
+unsigned int
+f2 (unsigned *p)
+{
+  return ((f1 (p[0]) | (p[0] >> 8)) & 0xff000000U) | (p[0] >> 24) | ((p[0] & 0xff00U) << 8) | ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f3 (unsigned *p)
+{
+  return ((f1 (p[0]) | (p[0] & 0x00ff00ffU)) & 0xff00ff00U) | (f1 (f1 (f1 (p[0]))) & 0x00ff00ffU);
+}
+
+unsigned int
+f4 (unsigned *p)
+{
+  return (f1 (p[0]) ^ (p[0] >> 8)) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f5 (unsigned *p)
+{
+  return (((f1 (p[0]) | (p[0] >> 16)) ^ (p[0] >> 8)) & 0xffff0000U) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+/* { dg-final { scan-tree-dump-times "= __builtin_bswap32 \\\(" 4 "optimized" } } */

	Jakub
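
For completeness, a hypothetical standalone harness (a sketch, not part
of the commit) that sanity-checks the semantic claim above, namely that
f2..f5 each compute __builtin_bswap32, independent of whether the bswap
pass fires.  It assumes the new testcase has been saved alongside as
optimize-bswapsi-7.c and that a GCC-compatible compiler provides
__builtin_bswap32.

#include <assert.h>
#include "optimize-bswapsi-7.c"

int
main (void)
{
  unsigned int v = 0x12345678U;
  assert (f2 (&v) == __builtin_bswap32 (v));
  assert (f3 (&v) == __builtin_bswap32 (v));
  assert (f4 (&v) == __builtin_bswap32 (v));
  assert (f5 (&v) == __builtin_bswap32 (v));
  return 0;
}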