public inbox for gcc-patches@gcc.gnu.org
* [PATCH] tree-optimization/103345: Improved load merging
@ 2021-11-22  8:39 Roger Sayle
  2021-11-22 12:28 ` Richard Biener
  2021-11-24  8:35 ` [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376] Jakub Jelinek
  0 siblings, 2 replies; 7+ messages in thread
From: Roger Sayle @ 2021-11-22  8:39 UTC (permalink / raw)
  To: 'GCC Patches'

[-- Attachment #1: Type: text/plain, Size: 1100 bytes --]


This patch fixes PR tree-optimization/103345 by merging adjacent loads
when they are combined with addition or bitwise xor.  The current code
in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in the
same way as BIT_IOR_EXPR.  Many thanks to Andrew Pinski for pointing
out that this also resolves PR target/98953.
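
For reference, here is a minimal sketch (not part of the patch; the
attached pr103345.c testcase is the authoritative version and the
function name below is only illustrative) of the kind of sequence the
pass can now merge into a single 32-bit load on a little-endian target:

  unsigned int
  load_u32_le (const unsigned char *p)
  {
    /* Four adjacent byte loads combined with '+' (or '^'); previously
       the bswap pass only recognised '|' here.  */
    return (unsigned int) p[0]
           + ((unsigned int) p[1] << 8)
           + ((unsigned int) p[2] << 16)
           + ((unsigned int) p[3] << 24);
  }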

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures.  The new testcases should also
pass (but haven't been tested) on other endian targets.

Ok for mainline?


2021-11-22  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	PR tree-optimization/98953
	PR tree-optimization/103345
	* gimple-ssa-store-merging.c (find_bswap_or_nop_1): Handle
	BIT_XOR_EXPR and PLUS_EXPR the same as BIT_IOR_EXPR.
	(pass_optimize_bswap::execute): Likewise.

gcc/testsuite/ChangeLog
	PR tree-optimization/98953
	PR tree-optimization/103345
	* gcc.dg/tree-ssa/pr98953.c: New test case.
	* gcc.dg/tree-ssa/pr103345.c: New test case.


Thanks in advance,
Roger
--


[-- Attachment #2: patchx.txt --]
[-- Type: text/plain, Size: 3381 bytes --]

diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c
index 4efa200..1740c9e 100644
--- a/gcc/gimple-ssa-store-merging.c
+++ b/gcc/gimple-ssa-store-merging.c
@@ -742,10 +742,7 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
       struct symbolic_number n1, n2;
       gimple *source_stmt, *source_stmt2;
 
-      if (code != BIT_IOR_EXPR)
-	return NULL;
-
-      if (TREE_CODE (rhs2) != SSA_NAME)
+      if (!rhs2 || TREE_CODE (rhs2) != SSA_NAME)
 	return NULL;
 
       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
@@ -753,6 +750,8 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
       switch (code)
 	{
 	case BIT_IOR_EXPR:
+	case BIT_XOR_EXPR:
+	case PLUS_EXPR:
 	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
 
 	  if (!source_stmt1)
@@ -1495,6 +1494,8 @@ pass_optimize_bswap::execute (function *fun)
 		continue;
 	      /* Fall through.  */
 	    case BIT_IOR_EXPR:
+	    case BIT_XOR_EXPR:
+	    case PLUS_EXPR:
 	      break;
 	    case CONSTRUCTOR:
 	      {
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c b/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c
new file mode 100644
index 0000000..94388b5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr103345.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-bswap-details" } */
+
+typedef unsigned int uint32_t;
+typedef unsigned char uint8_t;
+
+uint32_t load_le_32_or(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return ((uint32_t)ptr[0]) |
+         ((uint32_t)ptr[1] << 8) |
+         ((uint32_t)ptr[2] << 16) |
+         ((uint32_t)ptr[3] << 24);
+#else
+  return ((uint32_t)ptr[3]) |
+         ((uint32_t)ptr[2] << 8) |
+         ((uint32_t)ptr[1] << 16) |
+         ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+uint32_t load_le_32_add(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return ((uint32_t)ptr[0]) +
+         ((uint32_t)ptr[1] << 8) +
+         ((uint32_t)ptr[2] << 16) +
+         ((uint32_t)ptr[3] << 24);
+#else
+  return ((uint32_t)ptr[3]) +
+         ((uint32_t)ptr[2] << 8) +
+         ((uint32_t)ptr[1] << 16) +
+         ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+uint32_t load_le_32_xor(const uint8_t *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  return ((uint32_t)ptr[0]) ^
+         ((uint32_t)ptr[1] << 8) ^
+         ((uint32_t)ptr[2] << 16) ^
+         ((uint32_t)ptr[3] << 24);
+#else
+  return ((uint32_t)ptr[3]) ^
+         ((uint32_t)ptr[2] << 8) ^
+         ((uint32_t)ptr[1] << 16) ^
+         ((uint32_t)ptr[0] << 24);
+#endif
+}
+
+/* { dg-final { scan-tree-dump-times "32 bit load in target endianness found" 3 "bswap" } } */
+
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c b/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c
new file mode 100644
index 0000000..7687dc2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr98953.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-bswap-details" } */
+
+int foo(unsigned char *ptr)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    return ptr[0] + (ptr[1] << 8);
+#else
+    return ptr[1] + (ptr[0] << 8);
+#endif
+}
+
+/* { dg-final { scan-tree-dump "16 bit load in target endianness found" "bswap" } } */
+

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] tree-optimization/103345: Improved load merging
  2021-11-22  8:39 [PATCH] tree-optimization/103345: Improved load merging Roger Sayle
@ 2021-11-22 12:28 ` Richard Biener
  2021-11-24  8:35 ` [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376] Jakub Jelinek
  1 sibling, 0 replies; 7+ messages in thread
From: Richard Biener @ 2021-11-22 12:28 UTC (permalink / raw)
  To: Roger Sayle; +Cc: GCC Patches

On Mon, Nov 22, 2021 at 9:40 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch fixes PR tree-optimization/103345 by merging adjacent loads
> when they are combined with addition or bitwise xor.  The current code
> in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in the
> same way as BIT_IOR_EXPR.  Many thanks to Andrew Pinski for pointing
> out that this also resolves PR target/98953.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures.  The new testcases should also
> pass (but haven't been tested) on other endian targets.
>
> Ok for mainline?

OK.

Thanks,
Richard.

>
> 2021-11-22  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         PR tree-optimization/98953
>         PR tree-optimization/103345
>         * gimple-ssa-store-merging.c (find_bswap_or_nop_1): Handle
>         BIT_XOR_EXPR and PLUS_EXPR the same as BIT_IOR_EXPR.
>         (pass_optimize_bswap::execute): Likewise.
>
> gcc/testsuite/ChangeLog
>         PR tree-optimization/98953
>         PR tree-optimization/103345
>         * gcc.dg/tree-ssa/pr98953.c: New test case.
>         * gcc.dg/tree-ssa/pr103345.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376]
  2021-11-22  8:39 [PATCH] tree-optimization/103345: Improved load merging Roger Sayle
  2021-11-22 12:28 ` Richard Biener
@ 2021-11-24  8:35 ` Jakub Jelinek
  2021-11-24  8:45   ` Richard Biener
  1 sibling, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2021-11-24  8:35 UTC (permalink / raw)
  To: Richard Biener, Roger Sayle; +Cc: gcc-patches

On Mon, Nov 22, 2021 at 08:39:42AM -0000, Roger Sayle wrote:
> This patch fixes PR tree-optimization/103345 by merging adjacent loads
> when they are combined with addition or bitwise xor.  The current code
> in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in the
> same way as BIT_IOR_EXPR.

Unfortunately they aren't exactly the same.  They behave the same only
when at least one operand (or the corresponding byte in it) is known to
be 0, since 0 | 0 = 0 ^ 0 = 0 + 0 = 0.  But | additionally satisfies
x | x = x for any other x, so perform_symbolic_merge has been accepting
either that at least one of the bytes is 0 or that both are the same;
that is wrong for ^ and +.

The following patch fixes that by passing the code of the binary
operation through to perform_symbolic_merge and allowing a non-zero
masked1 == masked2 only for BIT_IOR_EXPR.
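
To make the difference concrete, a minimal sketch of the case the old
check got wrong for ^ (not from the patch; the variable names just
mirror the pr103376.c testcase below):

  long long a = 0x123456789abcdef0LL;
  long long e = a;        /* same symbolic bytes as a */
  long long f = a ^ e;    /* runtime value is always 0 */

Both operands of the merge describe the same loaded bytes, so the old
masked1 == masked2 acceptance lets the pass treat f as a plain copy of
a, which only holds for |.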

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
We could allow the masked1 == masked2 case for it, but we would need to
do something different from the
  n->n = n1->n | n2->n;
we do on all the bytes together.
In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
both variants are the same) and masked1 != 0xff, we would need to clear
the corresponding n->n byte instead of setting it to the input, because
x ^ x = 0 (but if we don't know what the values are, the result is also
unknown).  Now, for plus it is much harder, because for non-zero
operands we not only don't know what the result is, the operation can
also modify upper bytes.  So perhaps, when the current byte has both
masked1 and masked2 non-zero, set the resulting byte to 0xff (unknown)
only if the byte above it is 0 in both operands, and set that upper
byte to 0xff too.  Also, even for | we could, instead of returning
NULL, just set the resulting byte to 0xff if the two bytes differ;
perhaps it will be masked off later on.
Ok to handle that incrementally?
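
For reference, a compact restatement of the per-byte rules sketched
above (a summary sketch only; the follow-up patch later in the thread
implements the | and ^ rows and keeps the punt for +):

  /* 0 = byte known to be zero, 0xff = MARKER_BYTE_UNKNOWN,
     b/c = other, distinct symbolic bytes:

       code   byte1  byte2   result
       any      0      x        x     0 op x == x
        |       b      b        b     x | x == x
        |       b      c      0xff    unknown, may be masked off later
        ^       b      b        0     x ^ x == 0
        ^       b      c      0xff    unknown
        ^     0xff   0xff     0xff    unknown ^ unknown is unknown
        +       b      c      punt    a carry can spill into upper bytes  */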

2021-11-24  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/103376
	* gimple-ssa-store-merging.c (perform_symbolic_merge): Add CODE
	argument.  If CODE is not BIT_IOR_EXPR, ensure that one of masked1
	or masked2 is 0.
	(find_bswap_or_nop_1, find_bswap_or_nop,
	imm_store_chain_info::try_coalesce_bswap): Adjust
	perform_symbolic_merge callers.

	* gcc.c-torture/execute/pr103376.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj	2021-11-23 10:26:30.000000000 +0100
+++ gcc/gimple-ssa-store-merging.c	2021-11-23 11:49:33.806168782 +0100
@@ -434,14 +434,14 @@ find_bswap_or_nop_load (gimple *stmt, tr
   return true;
 }
 
-/* Compute the symbolic number N representing the result of a bitwise OR on 2
-   symbolic number N1 and N2 whose source statements are respectively
-   SOURCE_STMT1 and SOURCE_STMT2.  */
+/* Compute the symbolic number N representing the result of a bitwise OR,
+   bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
+   are respectively SOURCE_STMT1 and SOURCE_STMT2.  CODE is the operation.  */
 
 gimple *
 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
 			gimple *source_stmt2, struct symbolic_number *n2,
-			struct symbolic_number *n)
+			struct symbolic_number *n, enum tree_code code)
 {
   int i, size;
   uint64_t mask;
@@ -563,7 +563,9 @@ perform_symbolic_merge (gimple *source_s
 
       masked1 = n1->n & mask;
       masked2 = n2->n & mask;
-      if (masked1 && masked2 && masked1 != masked2)
+      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
+	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
+      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
 	return NULL;
     }
   n->n = n1->n | n2->n;
@@ -769,7 +771,8 @@ find_bswap_or_nop_1 (gimple *stmt, struc
 	    return NULL;
 
 	  source_stmt
-	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
+	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n,
+				      code);
 
 	  if (!source_stmt)
 	    return NULL;
@@ -943,7 +946,8 @@ find_bswap_or_nop (gimple *stmt, struct
 	      else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
 		return NULL;
 	      ins_stmt
-		= perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n);
+		= perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n,
+					  BIT_IOR_EXPR);
 
 	      if (!ins_stmt)
 		return NULL;
@@ -2881,7 +2885,7 @@ imm_store_chain_info::try_coalesce_bswap
 	  end = MAX (end, info->bitpos + info->bitsize);
 
 	  ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
-					     &this_n, &n);
+					     &this_n, &n, BIT_IOR_EXPR);
 	  if (ins_stmt == NULL)
 	    return false;
 	}
--- gcc/testsuite/gcc.c-torture/execute/pr103376.c.jj	2021-11-23 12:03:38.339948150 +0100
+++ gcc/testsuite/gcc.c-torture/execute/pr103376.c	2021-11-23 12:02:44.668723595 +0100
@@ -0,0 +1,29 @@
+/* PR tree-optimization/103376 */
+
+long long a = 0x123456789abcdef0LL, f;
+int b, c, *d;
+
+__attribute__((noipa)) void
+foo (int x)
+{
+  asm volatile ("" : : "r" (x));
+}
+
+int
+main ()
+{
+  long long e;
+  e = a;
+  if (b)
+    {
+      foo (c);
+      d = (int *) 0;
+      while (*d)
+	;
+    }
+  f = a ^ e;
+  asm volatile ("" : "+m" (f));
+  if (f != 0)
+    __builtin_abort ();
+  return 0;
+}


	Jakub


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376]
  2021-11-24  8:35 ` [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376] Jakub Jelinek
@ 2021-11-24  8:45   ` Richard Biener
  2021-11-25  7:54     ` [PATCH] bswap: Improve perform_symbolic_merge [PR103376] Jakub Jelinek
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2021-11-24  8:45 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Roger Sayle, gcc-patches

On Wed, 24 Nov 2021, Jakub Jelinek wrote:

> On Mon, Nov 22, 2021 at 08:39:42AM -0000, Roger Sayle wrote:
> > This patch fixes PR tree-optimization/103345 by merging adjacent loads
> > when they are combined with addition or bitwise xor.  The current code
> > in gimple-ssa-store-merging.c's find_bswap_or_nop already handles ior,
> > so all that's required is to treat PLUS_EXPR and BIT_XOR_EXPR in the
> > same way as BIT_IOR_EXPR.
> 
> Unfortunately they aren't exactly the same.  They behave the same only
> when at least one operand (or the corresponding byte in it) is known to
> be 0, since 0 | 0 = 0 ^ 0 = 0 + 0 = 0.  But | additionally satisfies
> x | x = x for any other x, so perform_symbolic_merge has been accepting
> either that at least one of the bytes is 0 or that both are the same;
> that is wrong for ^ and +.
> 
> The following patch fixes that by passing the code of the binary
> operation through to perform_symbolic_merge and allowing a non-zero
> masked1 == masked2 only for BIT_IOR_EXPR.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

> Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> We could allow the masked1 == masked2 case for it, but we would need to
> do something different from the
>   n->n = n1->n | n2->n;
> we do on all the bytes together.
> In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
> both variants are the same) and masked1 != 0xff, we would need to clear
> the corresponding n->n byte instead of setting it to the input, because
> x ^ x = 0 (but if we don't know what the values are, the result is also
> unknown).  Now, for plus it is much harder, because for non-zero
> operands we not only don't know what the result is, the operation can
> also modify upper bytes.  So perhaps, when the current byte has both
> masked1 and masked2 non-zero, set the resulting byte to 0xff (unknown)
> only if the byte above it is 0 in both operands, and set that upper
> byte to 0xff too.  Also, even for | we could, instead of returning
> NULL, just set the resulting byte to 0xff if the two bytes differ;
> perhaps it will be masked off later on.
> Ok to handle that incrementally?

Not sure if it is worth the trouble - the XOR handling sounds
straightforward at least.  But sure, the merging routine could
simply be conservatively correct here.

Thanks,
Richard.

> 2021-11-24  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR tree-optimization/103376
> 	* gimple-ssa-store-merging.c (perform_symbolic_merge): Add CODE
> 	argument.  If CODE is not BIT_IOR_EXPR, ensure that one of masked1
> 	or masked2 is 0.
> 	(find_bswap_or_nop_1, find_bswap_or_nop,
> 	imm_store_chain_info::try_coalesce_bswap): Adjust
> 	perform_symbolic_merge callers.
> 
> 	* gcc.c-torture/execute/pr103376.c: New test.
> 
> --- gcc/gimple-ssa-store-merging.c.jj	2021-11-23 10:26:30.000000000 +0100
> +++ gcc/gimple-ssa-store-merging.c	2021-11-23 11:49:33.806168782 +0100
> @@ -434,14 +434,14 @@ find_bswap_or_nop_load (gimple *stmt, tr
>    return true;
>  }
>  
> -/* Compute the symbolic number N representing the result of a bitwise OR on 2
> -   symbolic number N1 and N2 whose source statements are respectively
> -   SOURCE_STMT1 and SOURCE_STMT2.  */
> +/* Compute the symbolic number N representing the result of a bitwise OR,
> +   bitwise XOR or plus on 2 symbolic number N1 and N2 whose source statements
> +   are respectively SOURCE_STMT1 and SOURCE_STMT2.  CODE is the operation.  */
>  
>  gimple *
>  perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
>  			gimple *source_stmt2, struct symbolic_number *n2,
> -			struct symbolic_number *n)
> +			struct symbolic_number *n, enum tree_code code)
>  {
>    int i, size;
>    uint64_t mask;
> @@ -563,7 +563,9 @@ perform_symbolic_merge (gimple *source_s
>  
>        masked1 = n1->n & mask;
>        masked2 = n2->n & mask;
> -      if (masked1 && masked2 && masked1 != masked2)
> +      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
> +	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
> +      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
>  	return NULL;
>      }
>    n->n = n1->n | n2->n;
> @@ -769,7 +771,8 @@ find_bswap_or_nop_1 (gimple *stmt, struc
>  	    return NULL;
>  
>  	  source_stmt
> -	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
> +	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n,
> +				      code);
>  
>  	  if (!source_stmt)
>  	    return NULL;
> @@ -943,7 +946,8 @@ find_bswap_or_nop (gimple *stmt, struct
>  	      else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
>  		return NULL;
>  	      ins_stmt
> -		= perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n);
> +		= perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n,
> +					  BIT_IOR_EXPR);
>  
>  	      if (!ins_stmt)
>  		return NULL;
> @@ -2881,7 +2885,7 @@ imm_store_chain_info::try_coalesce_bswap
>  	  end = MAX (end, info->bitpos + info->bitsize);
>  
>  	  ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt,
> -					     &this_n, &n);
> +					     &this_n, &n, BIT_IOR_EXPR);
>  	  if (ins_stmt == NULL)
>  	    return false;
>  	}
> --- gcc/testsuite/gcc.c-torture/execute/pr103376.c.jj	2021-11-23 12:03:38.339948150 +0100
> +++ gcc/testsuite/gcc.c-torture/execute/pr103376.c	2021-11-23 12:02:44.668723595 +0100
> @@ -0,0 +1,29 @@
> +/* PR tree-optimization/103376 */
> +
> +long long a = 0x123456789abcdef0LL, f;
> +int b, c, *d;
> +
> +__attribute__((noipa)) void
> +foo (int x)
> +{
> +  asm volatile ("" : : "r" (x));
> +}
> +
> +int
> +main ()
> +{
> +  long long e;
> +  e = a;
> +  if (b)
> +    {
> +      foo (c);
> +      d = (int *) 0;
> +      while (*d)
> +	;
> +    }
> +  f = a ^ e;
> +  asm volatile ("" : "+m" (f));
> +  if (f != 0)
> +    __builtin_abort ();
> +  return 0;
> +}
> 
> 
> 	Jakub
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
  2021-11-24  8:45   ` Richard Biener
@ 2021-11-25  7:54     ` Jakub Jelinek
  2021-11-25  8:21       ` Richard Biener
  0 siblings, 1 reply; 7+ messages in thread
From: Jakub Jelinek @ 2021-11-25  7:54 UTC (permalink / raw)
  To: Richard Biener; +Cc: Roger Sayle, gcc-patches

On Wed, Nov 24, 2021 at 09:45:16AM +0100, Richard Biener wrote:
> > Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> > We could allow the masked1 == masked2 case for it, but we would need to
> > do something different from the
> >   n->n = n1->n | n2->n;
> > we do on all the bytes together.
> > In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
> > both variants are the same) and masked1 != 0xff, we would need to clear
> > the corresponding n->n byte instead of setting it to the input, because
> > x ^ x = 0 (but if we don't know what the values are, the result is also
> > unknown).  Now, for plus it is much harder, because for non-zero
> > operands we not only don't know what the result is, the operation can
> > also modify upper bytes.  So perhaps, when the current byte has both
> > masked1 and masked2 non-zero, set the resulting byte to 0xff (unknown)
> > only if the byte above it is 0 in both operands, and set that upper
> > byte to 0xff too.  Also, even for | we could, instead of returning
> > NULL, just set the resulting byte to 0xff if the two bytes differ;
> > perhaps it will be masked off later on.
> > Ok to handle that incrementally?
> 
> Not sure if it is worth the trouble - the XOR handling sounds
> straightforward at least.  But sure, the merging routine could
> simply be conservatively correct here.

This patch implements that (except that for + it still just punts
whenever both operand bytes are non-zero, as before).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-11-25  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/103376
	* gimple-ssa-store-merging.c (perform_symbolic_merge): For
	BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
	punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
	For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
	byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte to
	0.

--- gcc/gimple-ssa-store-merging.c.jj	2021-11-24 09:54:37.684365460 +0100
+++ gcc/gimple-ssa-store-merging.c	2021-11-24 11:18:54.422226266 +0100
@@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
   n->bytepos = n_start->bytepos;
   n->type = n_start->type;
   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
+  uint64_t res_n = n1->n | n2->n;
 
   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
     {
@@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
 
       masked1 = n1->n & mask;
       masked2 = n2->n & mask;
-      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
-	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
-      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
-	return NULL;
+      /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x.  */
+      if (masked1 && masked2)
+	{
+	  /* + can carry into upper bits, just punt.  */
+	  if (code == PLUS_EXPR)
+	    return NULL;
+	  /* x | x is still x.  */
+	  if (code == BIT_IOR_EXPR && masked1 == masked2)
+	    continue;
+	  if (code == BIT_XOR_EXPR)
+	    {
+	      /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
+		 unknown values and unknown ^ unknown is unknown.  */
+	      if (masked1 == masked2
+		  && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
+				 << i * BITS_PER_MARKER))
+		{
+		  res_n &= ~mask;
+		  continue;
+		}
+	    }
+	  /* Otherwise set the byte to unknown, it might still be
+	     later masked off.  */
+	  res_n |= mask;
+	}
     }
-  n->n = n1->n | n2->n;
+  n->n = res_n;
   n->n_ops = n1->n_ops + n2->n_ops;
 
   return source_stmt;


	Jakub


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
  2021-11-25  7:54     ` [PATCH] bswap: Improve perform_symbolic_merge [PR103376] Jakub Jelinek
@ 2021-11-25  8:21       ` Richard Biener
  2021-11-25  9:44         ` Jakub Jelinek
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Biener @ 2021-11-25  8:21 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Roger Sayle, gcc-patches

On Thu, 25 Nov 2021, Jakub Jelinek wrote:

> On Wed, Nov 24, 2021 at 09:45:16AM +0100, Richard Biener wrote:
> > > Thinking more about it, perhaps we could do more for BIT_XOR_EXPR.
> > > We could allow the masked1 == masked2 case for it, but we would need to
> > > do something different from the
> > >   n->n = n1->n | n2->n;
> > > we do on all the bytes together.
> > > In particular, for masked1 == masked2 with masked1 != 0 (well, for 0
> > > both variants are the same) and masked1 != 0xff, we would need to clear
> > > the corresponding n->n byte instead of setting it to the input, because
> > > x ^ x = 0 (but if we don't know what the values are, the result is also
> > > unknown).  Now, for plus it is much harder, because for non-zero
> > > operands we not only don't know what the result is, the operation can
> > > also modify upper bytes.  So perhaps, when the current byte has both
> > > masked1 and masked2 non-zero, set the resulting byte to 0xff (unknown)
> > > only if the byte above it is 0 in both operands, and set that upper
> > > byte to 0xff too.  Also, even for | we could, instead of returning
> > > NULL, just set the resulting byte to 0xff if the two bytes differ;
> > > perhaps it will be masked off later on.
> > > Ok to handle that incrementally?
> > 
> > Not sure if it is worth the trouble - the XOR handling sounds
> > straightforward at least.  But sure, the merging routine could
> > simply be conservatively correct here.
> 
> This patch implements that (except that for + it still just punts
> whenever both operand bytes are non-zero, as before).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK if you can add a testcase that exercises this "feature".

Thanks,
Richard.

> 2021-11-25  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR tree-optimization/103376
> 	* gimple-ssa-store-merging.c (perform_symbolic_merge): For
> 	BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
> 	punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
> 	For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
> 	byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte to
> 	0.
> 
> --- gcc/gimple-ssa-store-merging.c.jj	2021-11-24 09:54:37.684365460 +0100
> +++ gcc/gimple-ssa-store-merging.c	2021-11-24 11:18:54.422226266 +0100
> @@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
>    n->bytepos = n_start->bytepos;
>    n->type = n_start->type;
>    size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
> +  uint64_t res_n = n1->n | n2->n;
>  
>    for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
>      {
> @@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
>  
>        masked1 = n1->n & mask;
>        masked2 = n2->n & mask;
> -      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
> -	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
> -      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
> -	return NULL;
> +      /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x.  */
> +      if (masked1 && masked2)
> +	{
> +	  /* + can carry into upper bits, just punt.  */
> +	  if (code == PLUS_EXPR)
> +	    return NULL;
> +	  /* x | x is still x.  */
> +	  if (code == BIT_IOR_EXPR && masked1 == masked2)
> +	    continue;
> +	  if (code == BIT_XOR_EXPR)
> +	    {
> +	      /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
> +		 unknown values and unknown ^ unknown is unknown.  */
> +	      if (masked1 == masked2
> +		  && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
> +				 << i * BITS_PER_MARKER))
> +		{
> +		  res_n &= ~mask;
> +		  continue;
> +		}
> +	    }
> +	  /* Otherwise set the byte to unknown, it might still be
> +	     later masked off.  */
> +	  res_n |= mask;
> +	}
>      }
> -  n->n = n1->n | n2->n;
> +  n->n = res_n;
>    n->n_ops = n1->n_ops + n2->n_ops;
>  
>    return source_stmt;
> 
> 
> 	Jakub
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bswap: Improve perform_symbolic_merge [PR103376]
  2021-11-25  8:21       ` Richard Biener
@ 2021-11-25  9:44         ` Jakub Jelinek
  0 siblings, 0 replies; 7+ messages in thread
From: Jakub Jelinek @ 2021-11-25  9:44 UTC (permalink / raw)
  To: Richard Biener; +Cc: Roger Sayle, gcc-patches

On Thu, Nov 25, 2021 at 09:21:37AM +0100, Richard Biener wrote:
> OK if you can add a testcase that exercises this "feature".

Sure, that is easy.

Here is what I've committed.  f2 tests the x | x = x handling in it,
f3 tests x | y = unknown instead of punting, f4 tests x ^ x = 0
and f5 tests x ^ y = unknown.  Without the patch only f2 is optimized
to __builtin_bswap32; with the patch, all of them are.

2021-11-25  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/103376
	* gimple-ssa-store-merging.c (perform_symbolic_merge): For
	BIT_IOR_EXPR, if masked1 && masked2 && masked1 != masked2, don't
	punt, but set the corresponding result byte to MARKER_BYTE_UNKNOWN.
	For BIT_XOR_EXPR similarly and if masked1 == masked2 and the
	byte isn't MARKER_BYTE_UNKNOWN, set the corresponding result byte to
	0.

	* gcc.dg/optimize-bswapsi-7.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj	2021-11-24 09:54:37.684365460 +0100
+++ gcc/gimple-ssa-store-merging.c	2021-11-24 11:18:54.422226266 +0100
@@ -556,6 +556,7 @@ perform_symbolic_merge (gimple *source_s
   n->bytepos = n_start->bytepos;
   n->type = n_start->type;
   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
+  uint64_t res_n = n1->n | n2->n;
 
   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
     {
@@ -563,12 +564,33 @@ perform_symbolic_merge (gimple *source_s
 
       masked1 = n1->n & mask;
       masked2 = n2->n & mask;
-      /* For BIT_XOR_EXPR or PLUS_EXPR, at least one of masked1 and masked2
-	 has to be 0, for BIT_IOR_EXPR x | x is still x.  */
-      if (masked1 && masked2 && (code != BIT_IOR_EXPR || masked1 != masked2))
-	return NULL;
+      /* If at least one byte is 0, all of 0 | x == 0 ^ x == 0 + x == x.  */
+      if (masked1 && masked2)
+	{
+	  /* + can carry into upper bits, just punt.  */
+	  if (code == PLUS_EXPR)
+	    return NULL;
+	  /* x | x is still x.  */
+	  if (code == BIT_IOR_EXPR && masked1 == masked2)
+	    continue;
+	  if (code == BIT_XOR_EXPR)
+	    {
+	      /* x ^ x is 0, but MARKER_BYTE_UNKNOWN stands for
+		 unknown values and unknown ^ unknown is unknown.  */
+	      if (masked1 == masked2
+		  && masked1 != ((uint64_t) MARKER_BYTE_UNKNOWN
+				 << i * BITS_PER_MARKER))
+		{
+		  res_n &= ~mask;
+		  continue;
+		}
+	    }
+	  /* Otherwise set the byte to unknown, it might still be
+	     later masked off.  */
+	  res_n |= mask;
+	}
     }
-  n->n = n1->n | n2->n;
+  n->n = res_n;
   n->n_ops = n1->n_ops + n2->n_ops;
 
   return source_stmt;
--- gcc/testsuite/gcc.dg/optimize-bswapsi-7.c.jj	2021-11-25 10:36:03.847529686 +0100
+++ gcc/testsuite/gcc.dg/optimize-bswapsi-7.c	2021-11-25 10:35:46.522778192 +0100
@@ -0,0 +1,37 @@
+/* PR tree-optimization/103376 */
+/* { dg-do compile } */
+/* { dg-require-effective-target bswap } */
+/* { dg-options "-O2 -fno-tree-vectorize -fdump-tree-optimized" } */
+/* { dg-additional-options "-march=z900" { target s390-*-* } } */
+
+static unsigned int
+f1 (unsigned int x)
+{
+  return (x << 24) | (x >> 8);
+}
+
+unsigned int
+f2 (unsigned *p)
+{
+  return ((f1 (p[0]) | (p[0] >> 8)) & 0xff000000U) | (p[0] >> 24) | ((p[0] & 0xff00U) << 8) | ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f3 (unsigned *p)
+{
+  return ((f1 (p[0]) | (p[0] & 0x00ff00ffU)) & 0xff00ff00U) | (f1 (f1 (f1 (p[0]))) & 0x00ff00ffU);
+}
+
+unsigned int
+f4 (unsigned *p)
+{
+  return (f1 (p[0]) ^ (p[0] >> 8)) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+unsigned int
+f5 (unsigned *p)
+{
+  return (((f1 (p[0]) | (p[0] >> 16)) ^ (p[0] >> 8)) & 0xffff0000U) ^ (p[0] >> 24) ^ ((p[0] & 0xff00U) << 8) ^ ((p[0] & 0xff0000U) >> 8);
+}
+
+/* { dg-final { scan-tree-dump-times "= __builtin_bswap32 \\\(" 4 "optimized" } } */


	Jakub


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-11-25  9:44 UTC | newest]

Thread overview: 7+ messages
2021-11-22  8:39 [PATCH] tree-optimization/103345: Improved load merging Roger Sayle
2021-11-22 12:28 ` Richard Biener
2021-11-24  8:35 ` [PATCH] bswap: Fix up symbolic merging for xor and plus [PR103376] Jakub Jelinek
2021-11-24  8:45   ` Richard Biener
2021-11-25  7:54     ` [PATCH] bswap: Improve perform_symbolic_merge [PR103376] Jakub Jelinek
2021-11-25  8:21       ` Richard Biener
2021-11-25  9:44         ` Jakub Jelinek
