public inbox for gcc-patches@gcc.gnu.org
* [PATCH] Improve integer bit test on atomic builtin return
@ 2021-10-04 13:53 H.J. Lu
  2021-10-05 10:07 ` Richard Biener
  0 siblings, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2021-10-04 13:53 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek, Richard Biener

commit adedd5c173388ae505470df152b9cb3947339566
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Tue May 3 13:37:25 2016 +0200

    re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')

optimized bit test on atomic builtin return with lock bts/btr/btc.  But
it works only for unsigned integers since atomic builtins operate on the
'uintptr_t' type.  It fails on bool:

  _1 = atomic builtin;
  _4 = (_Bool) _1;

and signed integers:

  _1 = atomic builtin;
  _2 = (int) _1;
  _5 = _2 & (1 << N);

Improve bit test on atomic builtin return by converting:

  _1 = atomic builtin;
  _4 = (_Bool) _1;

to

  _1 = atomic builtin;
  _5 = _1 & (1 << 0);
  _4 = (_Bool) _5;

and converting:

  _1 = atomic builtin;
  _2 = (int) _1;
  _5 = _2 & (1 << N);

to

  _1 = atomic builtin;
  _6 = _1 & (1 << N);
  _5 = (int) _6;
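
For illustration (not part of the patch; the function name is mine),
the bool pattern arises from a bit-0 test like the new pr102566-2.c
test below:

  _Bool
  set_bit_0 (int *v)
  {
    /* Folding merges the "& 1" into the _Bool cast, since a cast to
       _Bool truncates to bit 0 in GIMPLE, leaving only
         _1 = __atomic_fetch_or_4 (v, 1, 0);
         _4 = (_Bool) _1;  */
    return __atomic_fetch_or (v, 1, __ATOMIC_RELAXED) & 1;
  }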

gcc/

	PR middle-end/102566
	* tree-ssa-ccp.c (optimize_atomic_bit_test_and): Handle cast
	between atomic builtin and bit test.

gcc/testsuite/

	PR middle-end/102566
	* g++.target/i386/pr102566-1.C: New test.
	* gcc.target/i386/pr102566-1a.c: Likewise.
	* gcc.target/i386/pr102566-1b.c: Likewise.
	* gcc.target/i386/pr102566-2.c: Likewise.
---
 gcc/testsuite/g++.target/i386/pr102566-1.C  |  12 ++
 gcc/testsuite/gcc.target/i386/pr102566-1a.c | 188 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-1b.c | 107 +++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-2.c  |  14 ++
 gcc/tree-ssa-ccp.c                          | 136 +++++++++++++-
 5 files changed, 452 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2.c

diff --git a/gcc/testsuite/g++.target/i386/pr102566-1.C b/gcc/testsuite/g++.target/i386/pr102566-1.C
new file mode 100644
index 00000000000..6e33298d8bf
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-1.C
@@ -0,0 +1,12 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool tbit(std::atomic<int> &i)
+{
+  return i.fetch_or(1, std::memory_order_relaxed) & 1;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
new file mode 100644
index 00000000000..a915de354e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
@@ -0,0 +1,188 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void bar (void);
+
+__attribute__((noinline, noclone)) int
+f1 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f2 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  int t1 = __atomic_fetch_or (a, mask, __ATOMIC_RELAXED);
+  int t2 = t1 & mask;
+  return t2 != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f3 (long int *a, int bit)
+{
+  long int mask = 1l << bit;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
+}
+
+__attribute__((noinline, noclone)) int
+f4 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f5 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f6 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  if ((__sync_fetch_and_xor (a, mask) & mask) != 0)
+    bar ();
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  if ((__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) == 0)
+    bar ();
+}
+
+__attribute__((noinline, noclone)) int
+f9 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f10 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_xor (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f11 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f12 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f13 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f14 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f15 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f16 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f17 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f18 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f19 (long int *a, int bit)
+{
+  long int mask = 1l << bit;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f20 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
+}
+
+__attribute__((noinline, noclone)) int
+f21 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_or (a, mask) & mask);
+}
+
+__attribute__((noinline, noclone)) long int
+f22 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask);
+}
+
+__attribute__((noinline, noclone)) long int
+f23 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask);
+}
+
+__attribute__((noinline, noclone)) short int
+f24 (short int *a)
+{
+  short int mask = 1 << 7;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) short int
+f25 (short int *a)
+{
+  short int mask = 1 << 7;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 9 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 10 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1b.c b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
new file mode 100644
index 00000000000..c4dab8135c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
@@ -0,0 +1,107 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -g" } */
+
+int cnt;
+
+__attribute__((noinline, noclone)) void
+bar (void)
+{
+  cnt++;
+}
+
+#include "pr102566-1a.c"
+
+int a;
+long int b;
+unsigned long int c;
+unsigned short int d;
+
+int
+main ()
+{
+  __atomic_store_n (&a, 15, __ATOMIC_RELAXED);
+  if (f1 (&a, 2) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 15
+      || f1 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31)
+    __builtin_abort ();
+  if (f2 (&a, 1) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31
+      || f2 (&a, 5) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 63)
+    __builtin_abort ();
+  __atomic_store_n (&b, 24, __ATOMIC_RELAXED);
+  if (f3 (&b, 2) != 1 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28
+      || f3 (&b, 3) != 0 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28)
+    __builtin_abort ();
+  __atomic_store_n (&a, 0, __ATOMIC_RELAXED);
+  if (f4 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128
+      || f4 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128)
+    __builtin_abort ();
+  if (f5 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
+      || f5 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320)
+    __builtin_abort ();
+  if (f6 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321
+      || f6 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (cnt != 0
+      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if ((f8 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || (f8 (&a, 7), cnt) != 2 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f9 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
+      || f9 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f10 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f10 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f11 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
+      || f11 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f12 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
+      || f12 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f13 (&a, 7) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f13 (&a, 7) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
+    __builtin_abort ();
+  if (f14 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
+      || f14 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f15 (&a, 0) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
+      || f15 (&a, 0) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  __atomic_store_n (&a, 8321, __ATOMIC_RELAXED);
+  if (f16 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f16 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
+    __builtin_abort ();
+  if (f17 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
+      || f17 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f18 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
+      || f18 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  if (f19 (&c, 7) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
+      || f19 (&c, 7) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  if (f20 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
+      || f20 (&c) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  __atomic_store_n (&a, 128, __ATOMIC_RELAXED);
+  if (f21 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144
+      || f21 (&a, 4) != 16 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144)
+    __builtin_abort ();
+  __atomic_store_n (&c, 1, __ATOMIC_RELAXED);
+  if (f22 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
+      || f22 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f23 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
+      || f23 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f24 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128
+      || f24 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128)
+    __builtin_abort ();
+  __atomic_store_n (&d, 1, __ATOMIC_RELAXED);
+  if (f25 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
+      || f25 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
+      || cnt != 2)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2.c b/gcc/testsuite/gcc.target/i386/pr102566-2.c
new file mode 100644
index 00000000000..d1c30315353
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic int *v)
+{
+  return atomic_fetch_or_explicit (v, 1, memory_order_relaxed) & 1;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c
index 70ce6a4d5b8..a3f7b7f233e 100644
--- a/gcc/tree-ssa-ccp.c
+++ b/gcc/tree-ssa-ccp.c
@@ -3279,10 +3279,115 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
       || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)
       || !single_imm_use (lhs, &use_p, &use_stmt)
       || !is_gimple_assign (use_stmt)
-      || gimple_assign_rhs_code (use_stmt) != BIT_AND_EXPR
       || !gimple_vdef (call))
     return;
 
+  mask = gimple_call_arg (call, 1);
+  tree_code rhs_code = gimple_assign_rhs_code (use_stmt);
+  if (rhs_code != BIT_AND_EXPR)
+    {
+      if (rhs_code != NOP_EXPR)
+	return;
+
+      tree nop_lhs = gimple_assign_lhs (use_stmt);
+      if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (nop_lhs))
+	return;
+
+      tree nop_rhs = gimple_assign_rhs1 (use_stmt);
+
+      gimple *g;
+      gimple_stmt_iterator gsi;
+      tree var;
+
+      if (TREE_CODE (TREE_TYPE (nop_lhs)) == BOOLEAN_TYPE)
+	{
+	  /* Convert
+	     _1 = atomic bit op;
+	     _4 = (_Bool) _1;
+	     to
+	     _1 = atomic bit op;
+	     _5 = _1 & 1;
+	     _4 = (_Bool) _5;
+	   */
+	  var = make_ssa_name (TREE_TYPE (nop_rhs));
+	  replace_uses_by (nop_rhs, var);
+	  g = gimple_build_assign (var, BIT_AND_EXPR, nop_rhs,
+				   build_int_cst (TREE_TYPE (lhs), 1));
+	  gsi = gsi_for_stmt (use_stmt);
+	  gsi_insert_before (&gsi, g, GSI_NEW_STMT);
+	  use_stmt = g;
+	}
+      else if (TYPE_PRECISION (TREE_TYPE (nop_lhs))
+	       == TYPE_PRECISION (TREE_TYPE (nop_rhs)))
+	{
+	  gimple *use_nop_stmt;
+	  if (!single_imm_use (nop_lhs, &use_p, &use_nop_stmt)
+	      || !is_gimple_assign (use_nop_stmt)
+	      || gimple_assign_rhs_code (use_nop_stmt) != BIT_AND_EXPR)
+	    return;
+
+	  tree op_mask = mask;
+	  if (TREE_CODE (op_mask) == SSA_NAME)
+	    {
+	      g = SSA_NAME_DEF_STMT (op_mask);
+	      if (gimple_assign_rhs_code (g) == NOP_EXPR)
+		{
+		  tree mask_nop_lhs = gimple_assign_lhs (g);
+
+		  if (SSA_NAME_OCCURS_IN_ABNORMAL_PHI (mask_nop_lhs))
+		    return;
+
+		  tree mask_nop_rhs = gimple_assign_rhs1 (g);
+		  if (TYPE_PRECISION (TREE_TYPE (mask_nop_lhs))
+		      != TYPE_PRECISION (TREE_TYPE (mask_nop_rhs)))
+		    return;
+		  op_mask = mask_nop_rhs;
+		  g = SSA_NAME_DEF_STMT (op_mask);
+		}
+
+	      if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
+		{
+		  if (!is_gimple_assign (g)
+		      || gimple_assign_rhs_code (g) != BIT_NOT_EXPR)
+		    return;
+		  tree reset_mask = gimple_assign_rhs1 (g);
+		  if (TREE_CODE (op_mask) != SSA_NAME)
+		    return;
+		  g = SSA_NAME_DEF_STMT (reset_mask);
+		}
+
+	      if (!is_gimple_assign (g)
+		  || gimple_assign_rhs_code (g) != LSHIFT_EXPR
+		  || !integer_onep (gimple_assign_rhs1 (g)))
+		return;
+	    }
+
+	  /* Convert
+	     _1 = atomic bit op;
+	     _2 = (int) _1;
+	     _5 = _2 & N;
+	     to
+	     _1 = atomic bit op;
+	     _6 = _1 & N;
+	     _5 = (int) _6;
+	   */
+	  replace_uses_by (nop_lhs, lhs);
+	  tree use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
+	  var = make_ssa_name (TREE_TYPE (use_nop_lhs));
+	  gimple_assign_set_lhs (use_nop_stmt, var);
+	  gsi = gsi_for_stmt (use_stmt);
+	  gsi_remove (&gsi, true);
+	  release_defs (use_stmt);
+	  gsi_remove (gsip, true);
+	  var = build1 (NOP_EXPR, TREE_TYPE (use_nop_lhs), var);
+	  gsi = gsi_for_stmt (use_nop_stmt);
+	  g = gimple_build_assign (use_nop_lhs, var);
+	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	  use_stmt = use_nop_stmt;
+	  mask = op_mask;
+	}
+    }
+
   switch (fn)
     {
     case IFN_ATOMIC_BIT_TEST_AND_SET:
@@ -3301,7 +3406,6 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   if (optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs))) == CODE_FOR_nothing)
     return;
 
-  mask = gimple_call_arg (call, 1);
   tree use_lhs = gimple_assign_lhs (use_stmt);
   if (!use_lhs)
     return;
@@ -3434,18 +3538,40 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
 	 of the specified bit after the atomic operation (makes only sense
 	 for xor, otherwise the bit content is compile time known),
 	 we need to invert the bit.  */
+      tree mask_convert = mask;
+      gimple *g_convert = nullptr;
+      if (!use_bool && TREE_TYPE (lhs) != TREE_TYPE (mask))
+	{
+	  mask_convert = make_ssa_name (TREE_TYPE (lhs));
+	  tree var = build1 (NOP_EXPR, TREE_TYPE (lhs), mask);
+	  g_convert = gimple_build_assign (mask_convert, var);
+	}
       g = gimple_build_assign (make_ssa_name (TREE_TYPE (lhs)),
 			       BIT_XOR_EXPR, new_lhs,
 			       use_bool ? build_int_cst (TREE_TYPE (lhs), 1)
-					: mask);
+					: mask_convert);
       new_lhs = gimple_assign_lhs (g);
       if (throws)
 	{
-	  gsi_insert_on_edge_immediate (e, g);
+	  if (g_convert)
+	    {
+	      gsi_insert_on_edge_immediate (e, g_convert);
+	      gsi = gsi_for_stmt (g_convert);
+	      gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	    }
+	  else
+	    gsi_insert_on_edge_immediate (e, g);
 	  gsi = gsi_for_stmt (g);
 	}
       else
-	gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	{
+	  if (g_convert)
+	    {
+	      gsi_insert_after (&gsi, g_convert, GSI_NEW_STMT);
+	      gsi = gsi_for_stmt (g_convert);
+	    }
+	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	}
     }
   if (use_bool && has_debug_uses)
     {
-- 
2.31.1


* Re: [PATCH] Improve integer bit test on atomic builtin return
  2021-10-04 13:53 [PATCH] Improve integer bit test on atomic builtin return H.J. Lu
@ 2021-10-05 10:07 ` Richard Biener
  2021-10-05 16:40   ` H.J. Lu
  0 siblings, 1 reply; 10+ messages in thread
From: Richard Biener @ 2021-10-05 10:07 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek

On Mon, 4 Oct 2021, H.J. Lu wrote:

> commit adedd5c173388ae505470df152b9cb3947339566
> Author: Jakub Jelinek <jakub@redhat.com>
> Date:   Tue May 3 13:37:25 2016 +0200
> 
>     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> 
> optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> it works only for unsigned integers since atomic builtins operate on the
> 'uintptr_t' type.  It fails on bool:
> 
>   _1 = atomic builtin;
>   _4 = (_Bool) _1;
> 
> and signed integers:
> 
>   _1 = atomic builtin;
>   _2 = (int) _1;
>   _5 = _2 & (1 << N);
> 
> Improve bit test on atomic builtin return by converting:
> 
>   _1 = atomic builtin;
>   _4 = (_Bool) _1;
> 
> to
> 
>   _1 = atomic builtin;
>   _5 = _1 & (1 << 0);
>   _4 = (_Bool) _5;
> 
> and converting:
> 
>   _1 = atomic builtin;
>   _2 = (int) _1;
>   _5 = _2 & (1 << N);
> 
> to
> 
>   _1 = atomic builtin;
>   _6 = _1 & (1 << N);
>   _5 = (int) _6;

Why not do this last bit with match.pd patterns (and independent of
whether _1 is defined by an atomic builtin)?  For the first suggested
transform that's likely going to be undone by folding, no?

Richard.

> [...]

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

* Re: [PATCH] Improve integer bit test on atomic builtin return
  2021-10-05 10:07 ` Richard Biener
@ 2021-10-05 16:40   ` H.J. Lu
  2021-10-05 23:54     ` [PATCH v2] Improve integer bit test on __atomic_fetch_[or|and]_* returns H.J. Lu
  2021-10-08  7:16     ` [PATCH] Improve integer bit test on atomic builtin return Richard Biener
  0 siblings, 2 replies; 10+ messages in thread
From: H.J. Lu @ 2021-10-05 16:40 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Jakub Jelinek

On Tue, Oct 5, 2021 at 3:07 AM Richard Biener <rguenther@suse.de> wrote:
>
> On Mon, 4 Oct 2021, H.J. Lu wrote:
>
> > commit adedd5c173388ae505470df152b9cb3947339566
> > Author: Jakub Jelinek <jakub@redhat.com>
> > Date:   Tue May 3 13:37:25 2016 +0200
> >
> >     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> >
> > optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> > it works only for unsigned integers since atomic builtins operate on the
> > 'uintptr_t' type.  It fails on bool:
> >
> >   _1 = atomic builtin;
> >   _4 = (_Bool) _1;
> >
> > and signed integers:
> >
> >   _1 = atomic builtin;
> >   _2 = (int) _1;
> >   _5 = _2 & (1 << N);
> >
> > Improve bit test on atomic builtin return by converting:
> >
> >   _1 = atomic builtin;
> >   _4 = (_Bool) _1;
> >
> > to
> >
> >   _1 = atomic builtin;
> >   _5 = _1 & (1 << 0);
> >   _4 = (_Bool) _5;
> >
> > and converting:
> >
> >   _1 = atomic builtin;
> >   _2 = (int) _1;
> >   _5 = _2 & (1 << N);
> >
> > to
> >
> >   _1 = atomic builtin;
> >   _6 = _1 & (1 << N);
> >   _5 = (int) _6;
>
> Why not do this last bit with match.pd patterns (and independent of
> whether _1 is defined by an atomic builtin)?  For the first suggested

The full picture is

  _1 = __atomic_fetch_or_* (ptr_6, mask, _3);
  _2 = (int) _1;
  _5 = _2 & mask;

to

  _1 = __atomic_fetch_or_* (ptr_6, mask, _3);
  _6 = _1 & mask;
  _5 = (int) _6;

It is useful only if the two masks are the same.
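
For example (a sketch; function and parameter names are mine):

  int
  same_mask (unsigned int *p, int mask)
  {
    /* The AND uses the same mask as the atomic OR, so this can
       become a bit test.  */
    return (int) __atomic_fetch_or (p, mask, __ATOMIC_RELAXED) & mask;
  }

  int
  different_mask (unsigned int *p, int mask)
  {
    /* The AND mask is unrelated to the OR mask; no bit test here.  */
    return (int) __atomic_fetch_or (p, mask, __ATOMIC_RELAXED) & 2;
  }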

> transform that's likely going to be undone by folding, no?
>

The bool case is

  _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
  _4 = (_Bool) _1;

to

  _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
  _5 = _1 & 1;
  _4 = (_Bool) _5;

Without __atomic_fetch_or_*, the added conversion isn't needed.
After the conversion, optimize_atomic_bit_test_and immediately
optimizes the code sequence to

  _6 = .ATOMIC_BIT_TEST_AND_SET (&v, 0, 0, 0);
  _4 = (_Bool) _6;

and there is nothing to fold after it.
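
On x86-64 the bool case is then expected to become roughly (a sketch,
not compiler output from this thread)

  lock btsl $0, (%rdi)
  setc  %al
  ret

instead of a cmpxchg retry loop.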

-- 
H.J.

* [PATCH v2] Improve integer bit test on __atomic_fetch_[or|and]_* returns
  2021-10-05 16:40   ` H.J. Lu
@ 2021-10-05 23:54     ` H.J. Lu
  2021-10-08  7:16     ` [PATCH] Improve integer bit test on atomic builtin return Richard Biener
  1 sibling, 0 replies; 10+ messages in thread
From: H.J. Lu @ 2021-10-05 23:54 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 2250 bytes --]

On Tue, Oct 5, 2021 at 9:40 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> [...]

Here is the v2 patch to handle more cases.


-- 
H.J.

[-- Attachment #2: v2-0001-Improve-integer-bit-test-on-__atomic_fetch_-or-an.patch --]
[-- Type: text/x-patch, Size: 47623 bytes --]

From 5e72aa847ddc73754d821b96c508c3892f523d4b Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 3 Oct 2021 09:43:00 -0700
Subject: [PATCH v2] Improve integer bit test on __atomic_fetch_[or|and]_*
 returns

commit adedd5c173388ae505470df152b9cb3947339566
Author: Jakub Jelinek <jakub@redhat.com>
Date:   Tue May 3 13:37:25 2016 +0200

    re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')

optimized bit test on __atomic_fetch_or_* and __atomic_fetch_and_* returns
with lock bts/btr/btc by turning

  mask_2 = 1 << cnt_1;
  _4 = __atomic_fetch_or_* (ptr_6, mask_2, _3);
  _5 = _4 & mask_2;

into

  _4 = .ATOMIC_BIT_TEST_AND_SET (ptr_6, cnt_1, 0, _3);
  _5 = _4;

and

  mask_6 = 1 << bit_5(D);
  _1 = ~mask_6;
  _2 = __atomic_fetch_and_4 (v_8(D), _1, 0);
  _3 = _2 & mask_6;
  _4 = _3 != 0;

into

  mask_6 = 1 << bit_5(D);
  _1 = ~mask_6;
  _11 = .ATOMIC_BIT_TEST_AND_RESET (v_8(D), bit_5(D), 1, 0);
  _4 = _11 != 0;

But it fails to optimize cases where a cast intervenes.  Improve this
optimization by transforming the IR to meet the required conditions if
possible (illustrative source examples follow the list of transforms):

1. Transform

  _1 = __atomic_fetch_or_4 (ptr_6, 1, _3);
  _4 = (_Bool) _1;

to

  _1 = __atomic_fetch_or_4 (ptr_6, 1, _3);
  _5 = _1 & 1;
  _4 = (_Bool) _5;

or

  _1 = __atomic_fetch_and_4 (ptr_6, ~1, _3);
  _4 = (_Bool) _1;

to

  _1 = __atomic_fetch_and_4 (ptr_6, ~1, _3);
  _5 = _1 & 1;
  _4 = (_Bool) _5;

2. Transform

  _1 = __atomic_fetch_or_4 (ptr_6, 1, _3);
  _7 = ~_1;
  _5 = (_Bool) _7;

to

  _1 = __atomic_fetch_or_4 (ptr_6, 1, _3);
  _7 = _1 & 1;
  _5 = _7 == 0;

or

  _1 = __atomic_fetch_and_4 (ptr_6, ~1, _3);
  _7 = ~_1;
  _5 = (_Bool) _7;

to

  _1 = __atomic_fetch_and_4 (ptr_6, ~1, _3);
  _7 = _1 & 1;
  _5 = _7 == 0;

3. Transform

  _1 = __atomic_fetch_or_4 (ptr_6, 1, _3);
  _2 = (int) _1;
  _7 = ~_2;
  _5 = (_Bool) _7;

to

  _1 = __atomic_fetch_or_4 (ptr_6, 1, _3);
  _7 = _1 & 1;
  _5 = _7 == 0;

or

  _1 = __atomic_fetch_and_4 (ptr_6, ~1, _3);
  _2 = (int) _1;
  _7 = ~_2;
  _5 = (_Bool) _7;

to

  _1 = __atomic_fetch_and_4 (ptr_6, ~1, _3);
  _7 = _1 & 1;
  _5 = _7 == 0;

4. Transform

  _1 = __atomic_fetch_or_4 (ptr_6, mask, _3);
  _2 = (int) _1;
  _5 = _2 & mask;

to

  _1 = __atomic_fetch_or_4 (ptr_6, mask, _3);
  _6 = _1 & mask;
  _5 = (int) _6;

5. Transform

  _1 = __atomic_fetch_or_4 (ptr_6, 0x80000000, _3);
  _5 = (signed int) _1;
  _4 = _5 < 0;

to

  _1 = __atomic_fetch_or_4 (ptr_6, 0x80000000, _3);
  _6 = _1 & 0x80000000;
  _4 = _6 != 0;

or

  _1 = __atomic_fetch_and_4 (ptr_6, 0x7fffffff, _3);
  _5 = (signed int) _1;
  _4 = _5 < 0;

to

  _1 = __atomic_fetch_and_4 (ptr_6, 0x7fffffff, _3);
  _6 = _1 & 0x80000000;
  _4 = _6 != 0;

6. Extract the bit mask from

  _1 = 1 << bit_4(D);
  mask_5 = (unsigned int) _1;
  _2 = __atomic_fetch_or_4 (v_7(D), mask_5, 0);
  _3 = _2 & mask_5;

and

  mask_7 = 1 << bit_6(D);
  _1 = ~mask_7;
  _2 = (unsigned int) _1;
  _3 = __atomic_fetch_and_4 (v_9(D), _2, 0);
  _4 = (int) _3;
  _5 = _4 & mask_7;
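
As illustrations (sketches; function and parameter names are mine),
transforms 1, 5 and 6 correspond to source like

  _Bool
  clear_bit_0 (int *v)
  {
    /* Transform 1, fetch_and form: folding merges the "& 1" into
       the _Bool cast before this pass runs.  */
    return __atomic_fetch_and (v, ~1, __ATOMIC_RELAXED) & 1;
  }

  _Bool
  set_and_test_sign_bit (unsigned int *v)
  {
    /* Transform 5: a sign-bit test on the signed cast of the
       return value.  */
    return (int) __atomic_fetch_or (v, 0x80000000u, __ATOMIC_RELAXED) < 0;
  }

  int
  test_and_set_bit (unsigned int *v, int bit)
  {
    /* Transform 6: the signed "1 << bit" is converted to unsigned
       for the builtin, so the mask is tracked through the cast.  */
    int mask = 1 << bit;
    return __atomic_fetch_or (v, mask, __ATOMIC_RELAXED) & mask;
  }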

gcc/

	PR middle-end/102566
	* tree-ssa-ccp.c (optimize_atomic_bit_test_and): Handle cast
	before/after atomic builtin and bit test.

gcc/testsuite/

	PR middle-end/102566
	* g++.target/i386/pr102566-1.C: New test.
	* g++.target/i386/pr102566-2.C: Likewise.
	* g++.target/i386/pr102566-3.C: Likewise.
	* g++.target/i386/pr102566-4.C: Likewise.
	* g++.target/i386/pr102566-5a.C: Likewise.
	* g++.target/i386/pr102566-5b.C: Likewise.
	* g++.target/i386/pr102566-6a.C: Likewise.
	* g++.target/i386/pr102566-6b.C: Likewise.
	* gcc.target/i386/pr102566-1a.c: Likewise.
	* gcc.target/i386/pr102566-1b.c: Likewise.
	* gcc.target/i386/pr102566-2.c: Likewise.
	* gcc.target/i386/pr102566-3a.c: Likewise.
	* gcc.target/i386/pr102566-3b.c: Likewise.
	* gcc.target/i386/pr102566-4.c: Likewise.
	* gcc.target/i386/pr102566-5.c: Likewise.
	* gcc.target/i386/pr102566-6.c: Likewise.
	* gcc.target/i386/pr102566-7.c: Likewise.
	* gcc.target/i386/pr102566-8a.c: Likewise.
	* gcc.target/i386/pr102566-8b.c: Likewise.
	* gcc.target/i386/pr102566-9a.c: Likewise.
	* gcc.target/i386/pr102566-9b.c: Likewise.
	* gcc.target/i386/pr102566-10a.c: Likewise.
	* gcc.target/i386/pr102566-10b.c: Likewise.
---
 gcc/testsuite/g++.target/i386/pr102566-1.C   |  31 ++
 gcc/testsuite/g++.target/i386/pr102566-2.C   |  31 ++
 gcc/testsuite/g++.target/i386/pr102566-3.C   |  31 ++
 gcc/testsuite/g++.target/i386/pr102566-4.C   |  29 ++
 gcc/testsuite/g++.target/i386/pr102566-5a.C  |  31 ++
 gcc/testsuite/g++.target/i386/pr102566-5b.C  |  31 ++
 gcc/testsuite/g++.target/i386/pr102566-6a.C  |  31 ++
 gcc/testsuite/g++.target/i386/pr102566-6b.C  |  31 ++
 gcc/testsuite/gcc.target/i386/pr102566-10a.c |  15 +
 gcc/testsuite/gcc.target/i386/pr102566-10b.c |  15 +
 gcc/testsuite/gcc.target/i386/pr102566-1a.c  | 188 +++++++++
 gcc/testsuite/gcc.target/i386/pr102566-1b.c  | 107 +++++
 gcc/testsuite/gcc.target/i386/pr102566-2.c   |  32 ++
 gcc/testsuite/gcc.target/i386/pr102566-3a.c  |  15 +
 gcc/testsuite/gcc.target/i386/pr102566-3b.c  |  15 +
 gcc/testsuite/gcc.target/i386/pr102566-4.c   |  15 +
 gcc/testsuite/gcc.target/i386/pr102566-5.c   |  15 +
 gcc/testsuite/gcc.target/i386/pr102566-6.c   |  32 ++
 gcc/testsuite/gcc.target/i386/pr102566-7.c   |  30 ++
 gcc/testsuite/gcc.target/i386/pr102566-8a.c  |  32 ++
 gcc/testsuite/gcc.target/i386/pr102566-8b.c  |  32 ++
 gcc/testsuite/gcc.target/i386/pr102566-9a.c  |  32 ++
 gcc/testsuite/gcc.target/i386/pr102566-9b.c  |  32 ++
 gcc/tree-ssa-ccp.c                           | 406 ++++++++++++++++++-
 24 files changed, 1251 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-1.C
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-2.C
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-3.C
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-4.C
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-5a.C
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-5b.C
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-6a.C
 create mode 100644 gcc/testsuite/g++.target/i386/pr102566-6b.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-10a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-10b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-3a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-3b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-7.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-8a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-8b.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-9a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-9b.c

diff --git a/gcc/testsuite/g++.target/i386/pr102566-1.C b/gcc/testsuite/g++.target/i386/pr102566-1.C
new file mode 100644
index 00000000000..94a66d717cc
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-1.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool
+tbit0 (std::atomic<int> &i)
+{
+#define BIT (1 << 0)
+  return i.fetch_or(BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit30 (std::atomic<int> &i)
+{
+#define BIT (1 << 30)
+  return i.fetch_or(BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit31 (std::atomic<int> &i)
+{
+#define BIT (1 << 31)
+  return i.fetch_or(BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr102566-2.C b/gcc/testsuite/g++.target/i386/pr102566-2.C
new file mode 100644
index 00000000000..4f2aea961c2
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-2.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool
+tbit0 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 0)
+  return i.fetch_or(BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit30 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 30)
+  return i.fetch_or(BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit31 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 31)
+  return i.fetch_or(BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr102566-3.C b/gcc/testsuite/g++.target/i386/pr102566-3.C
new file mode 100644
index 00000000000..e88921dd155
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-3.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool
+tbit0 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 0)
+  return !(i.fetch_or(BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+bool
+tbit30 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 30)
+  return !(i.fetch_or(BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+bool
+tbit31 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 31)
+  return !(i.fetch_or(BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr102566-4.C b/gcc/testsuite/g++.target/i386/pr102566-4.C
new file mode 100644
index 00000000000..44d1362ac2e
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-4.C
@@ -0,0 +1,29 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+typedef int __attribute__ ((mode (__word__))) int_type;
+
+#define BIT (1 << 0)
+
+bool
+tbit0 (std::atomic<int_type> &i)
+{
+  return i.fetch_or(BIT, std::memory_order_relaxed) & ~1;
+}
+
+bool
+tbit30 (std::atomic<int_type> &i)
+{
+  return i.fetch_or(BIT, std::memory_order_relaxed) & ~2;
+}
+
+bool
+tbit31 (std::atomic<int_type> &i)
+{
+  return i.fetch_or(BIT, std::memory_order_relaxed) & ~4;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*cmpxchg" 3 } } */
+/* { dg-final { scan-assembler-not "bts" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr102566-5a.C b/gcc/testsuite/g++.target/i386/pr102566-5a.C
new file mode 100644
index 00000000000..f9595bee2ab
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-5a.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool
+tbit0 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 0)
+  return i.fetch_and(~BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit30 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 30)
+  return i.fetch_and(~BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit31 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 31)
+  return i.fetch_and(~BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr102566-5b.C b/gcc/testsuite/g++.target/i386/pr102566-5b.C
new file mode 100644
index 00000000000..d917b27a918
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-5b.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { c++11 && { ! ia32 } } } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool
+tbit0 (std::atomic<unsigned long long> &i)
+{
+#define BIT (1ll << 0)
+  return i.fetch_and(~BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit30 (std::atomic<unsigned long long> &i)
+{
+#define BIT (1ll << 30)
+  return i.fetch_and(~BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+bool
+tbit31 (std::atomic<unsigned long long> &i)
+{
+#define BIT (1ll << 63)
+  return i.fetch_and(~BIT, std::memory_order_relaxed) & BIT;
+#undef BIT 
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrq" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr102566-6a.C b/gcc/testsuite/g++.target/i386/pr102566-6a.C
new file mode 100644
index 00000000000..01d495eda23
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-6a.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target c++11 } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool
+tbit0 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 0)
+  return !(i.fetch_and(~BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+bool
+tbit30 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 30)
+  return !(i.fetch_and(~BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+bool
+tbit31 (std::atomic<unsigned int> &i)
+{
+#define BIT (1 << 31)
+  return !(i.fetch_and(~BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/g++.target/i386/pr102566-6b.C b/gcc/testsuite/g++.target/i386/pr102566-6b.C
new file mode 100644
index 00000000000..adc11fcbf2d
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr102566-6b.C
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { c++11 && { ! ia32 } } } } */
+/* { dg-options "-O2" } */
+
+#include <atomic>
+
+bool
+tbit0 (std::atomic<unsigned long long> &i)
+{
+#define BIT (1ll << 0)
+  return !(i.fetch_and(~BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+bool
+tbit30 (std::atomic<unsigned long long> &i)
+{
+#define BIT (1ll << 30)
+  return !(i.fetch_and(~BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+bool
+tbit31 (std::atomic<unsigned long long> &i)
+{
+#define BIT (1ll << 63)
+  return !(i.fetch_and(~BIT, std::memory_order_relaxed) & BIT);
+#undef BIT 
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrq" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-10a.c b/gcc/testsuite/gcc.target/i386/pr102566-10a.c
new file mode 100644
index 00000000000..1c1f86a9659
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-10a.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic int *v, int bit)
+{
+  int mask = 1 << bit;
+  return atomic_fetch_and_explicit (v, ~mask, memory_order_relaxed) & mask;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-10b.c b/gcc/testsuite/gcc.target/i386/pr102566-10b.c
new file mode 100644
index 00000000000..0bf39824ea6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-10b.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic long long int *v, int bit)
+{
+  long long int mask = 1ll << bit;
+  return atomic_fetch_and_explicit (v, ~mask, memory_order_relaxed) & mask;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrq" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
new file mode 100644
index 00000000000..a915de354e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
@@ -0,0 +1,188 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+void bar (void);
+
+__attribute__((noinline, noclone)) int
+f1 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f2 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  int t1 = __atomic_fetch_or (a, mask, __ATOMIC_RELAXED);
+  int t2 = t1 & mask;
+  return t2 != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f3 (long int *a, int bit)
+{
+  long int mask = 1l << bit;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
+}
+
+__attribute__((noinline, noclone)) int
+f4 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f5 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f6 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) void
+f7 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  if ((__sync_fetch_and_xor (a, mask) & mask) != 0)
+    bar ();
+}
+
+__attribute__((noinline, noclone)) void
+f8 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  if ((__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) == 0)
+    bar ();
+}
+
+__attribute__((noinline, noclone)) int
+f9 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f10 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_xor (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f11 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f12 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f13 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f14 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f15 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f16 (int *a)
+{
+  int mask = 1 << 7;
+  return (__sync_fetch_and_and (a, ~mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f17 (int *a)
+{
+  int mask = 1 << 13;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) int
+f18 (int *a)
+{
+  int mask = 1 << 0;
+  return (__atomic_fetch_and (a, ~mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f19 (long int *a, int bit)
+{
+  long int mask = 1l << bit;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) long int
+f20 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask) == 0;
+}
+
+__attribute__((noinline, noclone)) int
+f21 (int *a, int bit)
+{
+  int mask = 1 << bit;
+  return (__sync_fetch_and_or (a, mask) & mask);
+}
+
+__attribute__((noinline, noclone)) long int
+f22 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_xor_fetch (a, mask, __ATOMIC_SEQ_CST) & mask);
+}
+
+__attribute__((noinline, noclone)) long int
+f23 (long int *a)
+{
+  long int mask = 1l << 7;
+  return (__atomic_fetch_xor (a, mask, __ATOMIC_SEQ_CST) & mask);
+}
+
+__attribute__((noinline, noclone)) short int
+f24 (short int *a)
+{
+  short int mask = 1 << 7;
+  return (__sync_fetch_and_or (a, mask) & mask) != 0;
+}
+
+__attribute__((noinline, noclone)) short int
+f25 (short int *a)
+{
+  short int mask = 1 << 7;
+  return (__atomic_fetch_or (a, mask, __ATOMIC_SEQ_CST) & mask) != 0;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 9 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 10 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1b.c b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
new file mode 100644
index 00000000000..c4dab8135c7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1b.c
@@ -0,0 +1,107 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -g" } */
+
+int cnt;
+
+__attribute__((noinline, noclone)) void
+bar (void)
+{
+  cnt++;
+}
+
+#include "pr102566-1a.c"
+
+int a;
+long int b;
+unsigned long int c;
+unsigned short int d;
+
+int
+main ()
+{
+  __atomic_store_n (&a, 15, __ATOMIC_RELAXED);
+  if (f1 (&a, 2) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 15
+      || f1 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31)
+    __builtin_abort ();
+  if (f2 (&a, 1) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 31
+      || f2 (&a, 5) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 63)
+    __builtin_abort ();
+  __atomic_store_n (&b, 24, __ATOMIC_RELAXED);
+  if (f3 (&b, 2) != 1 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28
+      || f3 (&b, 3) != 0 || __atomic_load_n (&b, __ATOMIC_RELAXED) != 28)
+    __builtin_abort ();
+  __atomic_store_n (&a, 0, __ATOMIC_RELAXED);
+  if (f4 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128
+      || f4 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 128)
+    __builtin_abort ();
+  if (f5 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
+      || f5 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320)
+    __builtin_abort ();
+  if (f6 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321
+      || f6 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (cnt != 0
+      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || (f7 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if ((f8 (&a, 7), cnt) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || (f8 (&a, 7), cnt) != 2 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f9 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
+      || f9 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f10 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f10 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f11 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 129
+      || f11 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f12 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8320
+      || f12 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8321)
+    __builtin_abort ();
+  if (f13 (&a, 7) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f13 (&a, 7) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
+    __builtin_abort ();
+  if (f14 (&a, 13) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
+      || f14 (&a, 13) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f15 (&a, 0) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
+      || f15 (&a, 0) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  __atomic_store_n (&a, 8321, __ATOMIC_RELAXED);
+  if (f16 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193
+      || f16 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 8193)
+    __builtin_abort ();
+  if (f17 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1
+      || f17 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f18 (&a) != 1 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0
+      || f18 (&a) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  if (f19 (&c, 7) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
+      || f19 (&c, 7) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  if (f20 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 128
+      || f20 (&c) != 1 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 0)
+    __builtin_abort ();
+  __atomic_store_n (&a, 128, __ATOMIC_RELAXED);
+  if (f21 (&a, 4) != 0 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144
+      || f21 (&a, 4) != 16 || __atomic_load_n (&a, __ATOMIC_RELAXED) != 144)
+    __builtin_abort ();
+  __atomic_store_n (&c, 1, __ATOMIC_RELAXED);
+  if (f22 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
+      || f22 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f23 (&c) != 0 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 129
+      || f23 (&c) != 128 || __atomic_load_n (&c, __ATOMIC_RELAXED) != 1)
+    __builtin_abort ();
+  if (f24 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128
+      || f24 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 128)
+    __builtin_abort ();
+  __atomic_store_n (&d, 1, __ATOMIC_RELAXED);
+  if (f25 (&d) != 0 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
+      || f25 (&d) != 1 || __atomic_load_n (&d, __ATOMIC_RELAXED) != 129
+      || cnt != 2)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2.c b/gcc/testsuite/gcc.target/i386/pr102566-2.c
new file mode 100644
index 00000000000..00a7c349f2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo0 (_Atomic int *v)
+{
+#define BIT (1 << 0)
+  return atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+bool
+foo30 (_Atomic int *v)
+{
+#define BIT (1 << 30)
+  return atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+bool
+foo31 (_Atomic int *v)
+{
+#define BIT (1 << 31)
+  return atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-3a.c b/gcc/testsuite/gcc.target/i386/pr102566-3a.c
new file mode 100644
index 00000000000..8bf1cd6e1bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-3a.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic int *v, int bit)
+{
+  int mask = 1 << bit;
+  return atomic_fetch_or_explicit (v, mask, memory_order_relaxed) & mask;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-3b.c b/gcc/testsuite/gcc.target/i386/pr102566-3b.c
new file mode 100644
index 00000000000..d155ed367a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-3b.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic long long int *v, int bit)
+{
+  long long int mask = 1ll << bit;
+  return atomic_fetch_or_explicit (v, mask, memory_order_relaxed) & mask;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsq" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-4.c b/gcc/testsuite/gcc.target/i386/pr102566-4.c
new file mode 100644
index 00000000000..2668ccf827c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-4.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic int *v, int bit)
+{
+  unsigned int mask = 1 << bit;
+  return atomic_fetch_or_explicit (v, mask, memory_order_relaxed) & mask;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-5.c b/gcc/testsuite/gcc.target/i386/pr102566-5.c
new file mode 100644
index 00000000000..8bf1cd6e1bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-5.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo (_Atomic int *v, int bit)
+{
+  int mask = 1 << bit;
+  return atomic_fetch_or_explicit (v, mask, memory_order_relaxed) & mask;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 1 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-6.c b/gcc/testsuite/gcc.target/i386/pr102566-6.c
new file mode 100644
index 00000000000..3dfe55ac683
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-6.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo0 (_Atomic int *v)
+{
+#define BIT (1 << 0)
+  return !(atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+bool
+foo30 (_Atomic int *v)
+{
+#define BIT (1 << 30)
+  return !(atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+bool
+foo31 (_Atomic int *v)
+{
+#define BIT (1 << 31)
+  return !(atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btsl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-7.c b/gcc/testsuite/gcc.target/i386/pr102566-7.c
new file mode 100644
index 00000000000..6bc0ae0f320
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-7.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+typedef int __attribute__ ((mode (__word__))) int_type;
+
+#define BIT (1 << 0)
+
+bool
+foo0 (_Atomic int_type *v)
+{
+  return atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & ~1;
+}
+
+bool
+foo1 (_Atomic int_type *v)
+{
+  return atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & ~2;
+}
+
+bool
+foo2 (_Atomic int_type *v)
+{
+  return atomic_fetch_or_explicit (v, BIT, memory_order_relaxed) & ~3;
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*cmpxchg" 3 } } */
+/* { dg-final { scan-assembler-not "bts" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-8a.c b/gcc/testsuite/gcc.target/i386/pr102566-8a.c
new file mode 100644
index 00000000000..168e3db78c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-8a.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo0 (_Atomic int *v)
+{
+#define BIT (1 << 0)
+  return atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+bool
+foo30 (_Atomic int *v)
+{
+#define BIT (1 << 30)
+  return atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+bool
+foo31 (_Atomic int *v)
+{
+#define BIT (1 << 31)
+  return atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-8b.c b/gcc/testsuite/gcc.target/i386/pr102566-8b.c
new file mode 100644
index 00000000000..392da3098e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-8b.c
@@ -0,0 +1,32 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo0 (_Atomic long long *v)
+{
+#define BIT (1ll << 0)
+  return atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+bool
+foo30 (_Atomic long long *v)
+{
+#define BIT (1ll << 62)
+  return atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+bool
+foo31 (_Atomic long long *v)
+{
+#define BIT (1ll << 63)
+  return atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT;
+#undef BIT
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrq" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-9a.c b/gcc/testsuite/gcc.target/i386/pr102566-9a.c
new file mode 100644
index 00000000000..3fa2a3ef043
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-9a.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo0 (_Atomic int *v)
+{
+#define BIT (1 << 0)
+  return !(atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+bool
+foo30 (_Atomic int *v)
+{
+#define BIT (1 << 30)
+  return !(atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+bool
+foo31 (_Atomic int *v)
+{
+#define BIT (1 << 31)
+  return !(atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrl" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-9b.c b/gcc/testsuite/gcc.target/i386/pr102566-9b.c
new file mode 100644
index 00000000000..38ddbdc630f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-9b.c
@@ -0,0 +1,32 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+
+bool
+foo0 (_Atomic long long *v)
+{
+#define BIT (1ll << 0)
+  return !(atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+bool
+foo30 (_Atomic long long *v)
+{
+#define BIT (1ll << 62)
+  return !(atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+bool
+foo31 (_Atomic long long *v)
+{
+#define BIT (1ll << 63)
+  return !(atomic_fetch_and_explicit (v, ~BIT, memory_order_relaxed) & BIT);
+#undef BIT
+}
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btrq" 3 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c
index 70ce6a4d5b8..4a011414f9a 100644
--- a/gcc/tree-ssa-ccp.c
+++ b/gcc/tree-ssa-ccp.c
@@ -3243,6 +3243,81 @@ optimize_unreachable (gimple_stmt_iterator i)
   return ret;
 }
 
+/* Convert
+   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
+   _7 = ~_1;
+   _5 = (_Bool) _7;
+   to
+   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
+   _8 = _1 & 1;
+   _5 = _8 == 0;
+   and convert
+   _1 = __atomic_fetch_and_* (ptr_6, ~1, _3);
+   _7 = ~_1;
+   _4 = (_Bool) _7;
+   to
+   _1 = __atomic_fetch_and_* (ptr_6, ~1, _3);
+   _8 = _1 & 1;
+   _4 = (_Bool) _8;
+
+   USE_STMT is the gimple statement which uses the return value of
+   __atomic_fetch_or_*.  LHS is the return value of __atomic_fetch_or_*.
+   MASK is the mask passed to __atomic_fetch_or_*.
+ */
+
+static gimple *
+convert_atomic_bit_not (enum internal_fn fn, gimple *use_stmt,
+			tree lhs, tree mask)
+{
+  tree and_mask;
+  if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
+    {
+      /* MASK must be ~1.  */
+      if (!operand_equal_p (build_int_cst (TREE_TYPE (lhs),
+					   ~HOST_WIDE_INT_1), mask, 0))
+	return nullptr;
+      and_mask = build_int_cst (TREE_TYPE (lhs), 1);
+    }
+  else
+    {
+      /* MASK must be 1.  */
+      if (!operand_equal_p (build_int_cst (TREE_TYPE (lhs), 1), mask, 0))
+	return nullptr;
+      and_mask = mask;
+    }
+
+  tree use_lhs = gimple_assign_lhs (use_stmt);
+
+  use_operand_p use_p;
+  gimple *use_not_stmt;
+
+  if (!single_imm_use (use_lhs, &use_p, &use_not_stmt)
+      || !is_gimple_assign (use_not_stmt))
+    return nullptr;
+
+  if (gimple_assign_rhs_code (use_not_stmt) != NOP_EXPR)
+    return nullptr;
+
+  tree use_not_lhs = gimple_assign_lhs (use_not_stmt);
+  if (TREE_CODE (TREE_TYPE (use_not_lhs)) != BOOLEAN_TYPE)
+    return nullptr;
+
+  gimple_stmt_iterator gsi;
+  gsi = gsi_for_stmt (use_stmt);
+  gsi_remove (&gsi, true);
+  tree var = make_ssa_name (TREE_TYPE (lhs));
+  use_stmt = gimple_build_assign (var, BIT_AND_EXPR, lhs, and_mask);
+  gsi = gsi_for_stmt (use_not_stmt);
+  gsi_insert_before (&gsi, use_stmt, GSI_NEW_STMT);
+  lhs = gimple_assign_lhs (use_not_stmt);
+  gimple *g = gimple_build_assign (lhs, EQ_EXPR, var,
+				   build_zero_cst (TREE_TYPE (mask)));
+  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+  gsi = gsi_for_stmt (use_not_stmt);
+  gsi_remove (&gsi, true);
+  return use_stmt;
+}
+
 /* Optimize
      mask_2 = 1 << cnt_1;
      _4 = __atomic_fetch_or_* (ptr_6, mask_2, _3);
@@ -3279,10 +3354,283 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
       || SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs)
       || !single_imm_use (lhs, &use_p, &use_stmt)
       || !is_gimple_assign (use_stmt)
-      || gimple_assign_rhs_code (use_stmt) != BIT_AND_EXPR
       || !gimple_vdef (call))
     return;
 
+  mask = gimple_call_arg (call, 1);
+  tree_code rhs_code = gimple_assign_rhs_code (use_stmt);
+  if (rhs_code != BIT_AND_EXPR)
+    {
+      if (rhs_code != NOP_EXPR && rhs_code != BIT_NOT_EXPR)
+	return;
+
+      tree use_lhs = gimple_assign_lhs (use_stmt);
+      if (TREE_CODE (use_lhs) == SSA_NAME
+	  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use_lhs))
+	return;
+
+      tree use_rhs = gimple_assign_rhs1 (use_stmt);
+      if (lhs != use_rhs)
+	return;
+
+      gimple *g;
+      gimple_stmt_iterator gsi;
+      tree var;
+
+      if (rhs_code == BIT_NOT_EXPR)
+	{
+	  g = convert_atomic_bit_not (fn, use_stmt, lhs, mask);
+	  if (!g)
+	    return;
+	  use_stmt = g;
+	}
+      else if (TREE_CODE (TREE_TYPE (use_lhs)) == BOOLEAN_TYPE)
+	{
+	  tree and_mask;
+	  if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
+	    {
+	      /* MASK must be ~1.  */
+	      if (!operand_equal_p (build_int_cst (TREE_TYPE (lhs),
+						   ~HOST_WIDE_INT_1),
+				    mask, 0))
+		return;
+
+	      /* Convert
+		 _1 = __atomic_fetch_and_* (ptr_6, ~1, _3);
+		 _4 = (_Bool) _1;
+		 to
+		 _1 = __atomic_fetch_and_* (ptr_6, ~1, _3);
+		 _5 = _1 & 1;
+		 _4 = (_Bool) _5;
+	       */
+	      and_mask = build_int_cst (TREE_TYPE (lhs), 1);
+	    }
+	  else
+	    {
+	      and_mask = build_int_cst (TREE_TYPE (lhs), 1);
+	      if (!operand_equal_p (and_mask, mask, 0))
+		return;
+
+	      /* Convert
+		 _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
+		 _4 = (_Bool) _1;
+		 to
+		 _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
+		 _5 = _1 & 1;
+		 _4 = (_Bool) _5;
+	       */
+	    }
+	  var = make_ssa_name (TREE_TYPE (use_rhs));
+	  replace_uses_by (use_rhs, var);
+	  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
+				   and_mask);
+	  gsi = gsi_for_stmt (use_stmt);
+	  gsi_insert_before (&gsi, g, GSI_NEW_STMT);
+	  use_stmt = g;
+	}
+      else if (TYPE_PRECISION (TREE_TYPE (use_lhs))
+	       == TYPE_PRECISION (TREE_TYPE (use_rhs)))
+	{
+	  gimple *use_nop_stmt;
+	  if (!single_imm_use (use_lhs, &use_p, &use_nop_stmt)
+	      || !is_gimple_assign (use_nop_stmt))
+	    return;
+	  rhs_code = gimple_assign_rhs_code (use_nop_stmt);
+	  if (rhs_code != BIT_AND_EXPR)
+	    {
+	      tree use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
+	      if (TREE_CODE (use_nop_lhs) == SSA_NAME
+		  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (use_nop_lhs))
+		return;
+	      if (rhs_code == BIT_NOT_EXPR)
+		{
+		  g = convert_atomic_bit_not (fn, use_nop_stmt, lhs,
+					      mask);
+		  if (!g)
+		    return;
+		  /* Convert
+		     _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
+		     _2 = (int) _1;
+		     _7 = ~_2;
+		     _5 = (_Bool) _7;
+		     to
+		     _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
+		     _8 = _1 & 1;
+		     _5 = _8 == 0;
+		     and convert
+		     _1 = __atomic_fetch_and_* (ptr_6, ~1, _3);
+		     _2 = (int) _1;
+		     _7 = ~_2;
+		     _5 = (_Bool) _7;
+		     to
+		     _1 = __atomic_fetch_and_* (ptr_6, ~1, _3);
+		     _8 = _1 & 1;
+		     _5 = _8 == 0;
+		   */
+		  gsi = gsi_for_stmt (use_stmt);
+		  gsi_remove (&gsi, true);
+		  use_stmt = g;
+		}
+	      else
+		{
+		  if (TREE_CODE (TREE_TYPE (use_nop_lhs)) != BOOLEAN_TYPE)
+		    return;
+		  if (rhs_code != GE_EXPR && rhs_code != LT_EXPR)
+		    return;
+		  tree cmp_rhs1 = gimple_assign_rhs1 (use_nop_stmt);
+		  if (use_lhs != cmp_rhs1)
+		    return;
+		  tree cmp_rhs2 = gimple_assign_rhs2 (use_nop_stmt);
+		  if (!integer_zerop (cmp_rhs2))
+		    return;
+
+		  tree and_mask;
+
+		  unsigned HOST_WIDE_INT bytes
+		    = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (use_rhs)));
+		  unsigned HOST_WIDE_INT highest
+		    = HOST_WIDE_INT_1U << (bytes * BITS_PER_UNIT - 1);
+
+		  if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
+		    {
+		      /* Get the signed maximum of the USE_RHS type.  */
+		      and_mask = build_int_cst (TREE_TYPE (use_rhs),
+						highest - 1);
+		      if (!operand_equal_p (and_mask, mask, 0))
+			return;
+
+		      /* Convert
+			 _1 = __atomic_fetch_and_* (ptr_6, 0x7fffffff, _3);
+			 _5 = (signed int) _1;
+			 _4 = _5 < 0 or _5 >= 0;
+			 to
+			 _1 = __atomic_fetch_and_* (ptr_6, 0x7fffffff, _3);
+			 _6 = _1 & 0x80000000;
+			 _4 = _6 != 0 or _6 == 0;
+		       */
+		      and_mask = build_int_cst (TREE_TYPE (use_rhs),
+						highest);
+		    }
+		  else
+		    {
+		      /* Get the signed minimum of the USE_RHS type.  */
+		      and_mask = build_int_cst (TREE_TYPE (use_rhs),
+						highest);
+		      if (!operand_equal_p (and_mask, mask, 0))
+			return;
+
+		      /* Convert
+			 _1 = __atomic_fetch_or_* (ptr_6, 0x80000000, _3);
+			 _5 = (signed int) _1;
+			 _4 = _5 < 0 or _5 >= 0;
+			 to
+			 _1 = __atomic_fetch_or_* (ptr_6, 0x80000000, _3);
+			 _6 = _1 & 0x80000000;
+			 _4 = _6 != 0 or _6 == 0;
+		       */
+		    }
+		  var = make_ssa_name (TREE_TYPE (use_rhs));
+		  gsi = gsi_for_stmt (use_stmt);
+		  gsi_remove (&gsi, true);
+		  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
+					   and_mask);
+		  gsi = gsi_for_stmt (use_nop_stmt);
+		  gsi_insert_before (&gsi, g, GSI_NEW_STMT);
+		  use_stmt = g;
+		  g = gimple_build_assign (use_nop_lhs,
+					   (rhs_code == GE_EXPR
+					    ? EQ_EXPR : NE_EXPR),
+					   var,
+					   build_zero_cst (TREE_TYPE (use_rhs)));
+		  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+		  gsi = gsi_for_stmt (use_nop_stmt);
+		  gsi_remove (&gsi, true);
+		}
+	    }
+	  else
+	    {
+	      tree op_mask = mask;
+	      tree check_mask = op_mask;
+	      if (TREE_CODE (op_mask) == SSA_NAME)
+		{
+		  g = SSA_NAME_DEF_STMT (op_mask);
+		  if (gimple_assign_rhs_code (g) == NOP_EXPR)
+		    {
+		      tree mask_nop_lhs = gimple_assign_lhs (g);
+
+		      if (TREE_CODE (mask_nop_lhs) == SSA_NAME
+			  && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (mask_nop_lhs))
+			return;
+
+		      tree mask_nop_rhs = gimple_assign_rhs1 (g);
+		      if (TYPE_PRECISION (TREE_TYPE (mask_nop_lhs))
+			  != TYPE_PRECISION (TREE_TYPE (mask_nop_rhs)))
+			return;
+		      op_mask = mask_nop_rhs;
+		      check_mask = op_mask;
+		      g = SSA_NAME_DEF_STMT (op_mask);
+		    }
+
+		  if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
+		    {
+		      if (!is_gimple_assign (g)
+			  || gimple_assign_rhs_code (g) != BIT_NOT_EXPR)
+			return;
+		      check_mask = gimple_assign_rhs1 (g);
+		      if (TREE_CODE (check_mask) != SSA_NAME)
+			return;
+		      g = SSA_NAME_DEF_STMT (check_mask);
+		    }
+
+		  if (!is_gimple_assign (g)
+		      || gimple_assign_rhs_code (g) != LSHIFT_EXPR
+		      || !integer_onep (gimple_assign_rhs1 (g)))
+		    return;
+		}
+
+	      if (TREE_CODE (check_mask) == INTEGER_CST)
+		{
+		  if (fn == IFN_ATOMIC_BIT_TEST_AND_RESET)
+		    check_mask = const_unop (BIT_NOT_EXPR,
+					     TREE_TYPE (check_mask),
+					     check_mask);
+		  check_mask = fold_convert (TREE_TYPE (lhs),
+					     check_mask);
+		}
+
+	      tree use_nop_rhs1 = gimple_assign_rhs1 (use_nop_stmt);
+	      tree use_nop_rhs2 = gimple_assign_rhs2 (use_nop_stmt);
+	      if (!operand_equal_p (use_nop_rhs1, check_mask, 0)
+		  && !operand_equal_p (use_nop_rhs2, check_mask, 0))
+		return;
+
+	      /* Convert
+		 _1 = __atomic_fetch_or_* (ptr_6, mask, _3);
+		 _2 = (int) _1;
+		 _5 = _2 & mask;
+		 to
+		 _1 = __atomic_fetch_or_* (ptr_6, mask, _3);
+		 _6 = _1 & mask;
+		 _5 = (int) _6;
+	       */
+	      replace_uses_by (use_lhs, lhs);
+	      tree use_nop_lhs = gimple_assign_lhs (use_nop_stmt);
+	      var = make_ssa_name (TREE_TYPE (use_nop_lhs));
+	      gimple_assign_set_lhs (use_nop_stmt, var);
+	      gsi = gsi_for_stmt (use_stmt);
+	      gsi_remove (&gsi, true);
+	      release_defs (use_stmt);
+	      gsi_remove (gsip, true);
+	      var = build1 (NOP_EXPR, TREE_TYPE (use_nop_lhs), var);
+	      gsi = gsi_for_stmt (use_nop_stmt);
+	      g = gimple_build_assign (use_nop_lhs, var);
+	      gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	      use_stmt = use_nop_stmt;
+	      mask = op_mask;
+	    }
+	}
+    }
+
   switch (fn)
     {
     case IFN_ATOMIC_BIT_TEST_AND_SET:
@@ -3301,7 +3649,6 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   if (optab_handler (optab, TYPE_MODE (TREE_TYPE (lhs))) == CODE_FOR_nothing)
     return;
 
-  mask = gimple_call_arg (call, 1);
   tree use_lhs = gimple_assign_lhs (use_stmt);
   if (!use_lhs)
     return;
@@ -3329,9 +3676,30 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
 	    return;
 	  g = SSA_NAME_DEF_STMT (mask);
 	}
-      if (!is_gimple_assign (g)
-	  || gimple_assign_rhs_code (g) != LSHIFT_EXPR
-	  || !integer_onep (gimple_assign_rhs1 (g)))
+      if (!is_gimple_assign (g))
+	return;
+      rhs_code = gimple_assign_rhs_code (g);
+      if (rhs_code != LSHIFT_EXPR)
+	{
+	  if (rhs_code != NOP_EXPR)
+	    return;
+
+	  /* Handle
+	     _1 = 1 << bit_4(D);
+	     mask_5 = (unsigned int) _1;
+	     _2 = __atomic_fetch_or_4 (v_7(D), mask_5, 0);
+	     _3 = _2 & mask_5;
+	   */
+	  tree nop_lhs = gimple_assign_lhs (g);
+	  tree nop_rhs = gimple_assign_rhs1 (g);
+	  if (TYPE_PRECISION (TREE_TYPE (nop_lhs))
+	      != TYPE_PRECISION (TREE_TYPE (nop_rhs)))
+	    return;
+	  g = SSA_NAME_DEF_STMT (nop_rhs);
+	  if (gimple_assign_rhs_code (g) != LSHIFT_EXPR)
+	    return;
+	}
+      if (!integer_onep (gimple_assign_rhs1 (g)))
 	return;
       bit = gimple_assign_rhs2 (g);
     }
@@ -3434,18 +3802,40 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
 	 of the specified bit after the atomic operation (makes only sense
 	 for xor, otherwise the bit content is compile time known),
 	 we need to invert the bit.  */
+      tree mask_convert = mask;
+      gimple *g_convert = nullptr;
+      if (!use_bool && TREE_TYPE (lhs) != TREE_TYPE (mask))
+	{
+	  mask_convert = make_ssa_name (TREE_TYPE (lhs));
+	  tree var = build1 (NOP_EXPR, TREE_TYPE (lhs), mask);
+	  g_convert = gimple_build_assign (mask_convert, var);
+	}
       g = gimple_build_assign (make_ssa_name (TREE_TYPE (lhs)),
 			       BIT_XOR_EXPR, new_lhs,
 			       use_bool ? build_int_cst (TREE_TYPE (lhs), 1)
-					: mask);
+					: mask_convert);
       new_lhs = gimple_assign_lhs (g);
       if (throws)
 	{
-	  gsi_insert_on_edge_immediate (e, g);
+	  if (g_convert)
+	    {
+	      gsi_insert_on_edge_immediate (e, g_convert);
+	      gsi = gsi_for_stmt (g_convert);
+	      gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	    }
+	  else
+	    gsi_insert_on_edge_immediate (e, g);
 	  gsi = gsi_for_stmt (g);
 	}
       else
-	gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	{
+	  if (g_convert)
+	    {
+	      gsi_insert_after (&gsi, g_convert, GSI_NEW_STMT);
+	      gsi = gsi_for_stmt (g_convert);
+	    }
+	  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
+	}
     }
   if (use_bool && has_debug_uses)
     {
-- 
2.31.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Improve integer bit test on atomic builtin return
  2021-10-05 16:40   ` H.J. Lu
  2021-10-05 23:54     ` [PATCH v2] Improve integer bit test on __atomic_fetch_[or|and]_* returns H.J. Lu
@ 2021-10-08  7:16     ` Richard Biener
  2021-10-08 14:55       ` H.J. Lu
  1 sibling, 1 reply; 10+ messages in thread
From: Richard Biener @ 2021-10-08  7:16 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Patches, Jakub Jelinek

On Tue, 5 Oct 2021, H.J. Lu wrote:

> On Tue, Oct 5, 2021 at 3:07 AM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Mon, 4 Oct 2021, H.J. Lu wrote:
> >
> > > commit adedd5c173388ae505470df152b9cb3947339566
> > > Author: Jakub Jelinek <jakub@redhat.com>
> > > Date:   Tue May 3 13:37:25 2016 +0200
> > >
> > >     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> > >
> > > optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> > > it works only for unsigned integers since atomic builtins operate on the
> > > 'uintptr_t' type.  It fails on bool:
> > >
> > >   _1 = atomic builtin;
> > >   _4 = (_Bool) _1;
> > >
> > > and signed integers:
> > >
> > >   _1 = atomic builtin;
> > >   _2 = (int) _1;
> > >   _5 = _2 & (1 << N);
> > >
> > > Improve bit test on atomic builtin return by converting:
> > >
> > >   _1 = atomic builtin;
> > >   _4 = (_Bool) _1;
> > >
> > > to
> > >
> > >   _1 = atomic builtin;
> > >   _5 = _1 & (1 << 0);
> > >   _4 = (_Bool) _5;
> > >
> > > and converting:
> > >
> > >   _1 = atomic builtin;
> > >   _2 = (int) _1;
> > >   _5 = _2 & (1 << N);
> > >
> > > to
> > >   _1 = atomic builtin;
> > >   _6 = _1 & (1 << N);
> > >   _5 = (int) _6;
> >
> > Why not do this last bit with match.pd patterns (and independent on
> > whether _1 is defined by an atomic builtin)?  For the first suggested
> 
> The full picture is
> 
>  _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
>   _2 = (int) _1;
>   _5 = _2 & mask;
> 
> to
> 
>   _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
>   _6 = _1 & mask;
>   _5 = (int) _6;
> 
> It is useful only if 2 masks are the same.
> 
> > transform that's likely going to be undone by folding, no?
> >
> 
> The bool case is
> 
>   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
>   _4 = (_Bool) _1;
> 
> to
> 
>   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
>   _5 = _1 & 1;
>   _4 = (_Bool) _5;
> 
> Without __atomic_fetch_or_*, the conversion isn't needed.
> After the conversion, optimize_atomic_bit_test_and will
> immediately optimize the code sequence to
> 
>   _6 = .ATOMIC_BIT_TEST_AND_SET (&v, 0, 0, 0);
>   _4 = (_Bool) _6;
> 
> and there is nothing to fold after it.

Hmm, I see - so how about teaching the code that produces the
.ATOMIC_BIT_TEST_AND_SET the alternate forms, instead of doing the
intermediate step separately?

Sorry for the delay btw, I've been busy all week ...

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Improve integer bit test on atomic builtin return
  2021-10-08  7:16     ` [PATCH] Improve integer bit test on atomic builtin return Richard Biener
@ 2021-10-08 14:55       ` H.J. Lu
  2021-10-22  5:48         ` [PATCH] Canonicalize __atomic/sync_fetch_or/xor/and for constant mask liuhongt
  0 siblings, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2021-10-08 14:55 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches, Jakub Jelinek

On Fri, Oct 8, 2021 at 12:16 AM Richard Biener <rguenther@suse.de> wrote:
>
> On Tue, 5 Oct 2021, H.J. Lu wrote:
>
> > On Tue, Oct 5, 2021 at 3:07 AM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > On Mon, 4 Oct 2021, H.J. Lu wrote:
> > >
> > > > commit adedd5c173388ae505470df152b9cb3947339566
> > > > Author: Jakub Jelinek <jakub@redhat.com>
> > > > Date:   Tue May 3 13:37:25 2016 +0200
> > > >
> > > >     re PR target/49244 (__sync or __atomic builtins will not emit 'lock bts/btr/btc')
> > > >
> > > > optimized bit test on atomic builtin return with lock bts/btr/btc.  But
> > > > it works only for unsigned integers since atomic builtins operate on the
> > > > 'uintptr_t' type.  It fails on bool:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _4 = (_Bool) _1;
> > > >
> > > > and signed integers:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _2 = (int) _1;
> > > >   _5 = _2 & (1 << N);
> > > >
> > > > Improve bit test on atomic builtin return by converting:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _4 = (_Bool) _1;
> > > >
> > > > to
> > > >
> > > >   _1 = atomic builtin;
> > > >   _5 = _1 & (1 << 0);
> > > >   _4 = (_Bool) _5;
> > > >
> > > > and converting:
> > > >
> > > >   _1 = atomic builtin;
> > > >   _2 = (int) _1;
> > > >   _5 = _2 & (1 << N);
> > > >
> > > > to
> > > >   _1 = atomic builtin;
> > > >   _6 = _1 & (1 << N);
> > > >   _5 = (int) _6;
> > >
> > > Why not do this last bit with match.pd patterns (and independent on
> > > whether _1 is defined by an atomic builtin)?  For the first suggested
> >
> > The full picture is
> >
> >  _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
> >   _2 = (int) _1;
> >   _5 = _2 & mask;
> >
> > to
> >
> >   _1 = _atomic_fetch_or_* (ptr_6, mask, _3);
> >   _6 = _1 & mask;
> >   _5 = (int) _6;
> >
> > It is useful only if 2 masks are the same.
> >
> > > transform that's likely going to be undone by folding, no?
> > >
> >
> > The bool case is
> >
> >   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
> >   _4 = (_Bool) _1;
> >
> > to
> >
> >   _1 = __atomic_fetch_or_* (ptr_6, 1, _3);
> >   _5 = _1 & 1;
> >   _4 = (_Bool) _5;
> >
> > Without __atomic_fetch_or_*, the conversion isn't needed.
> > After the conversion, optimize_atomic_bit_test_and will
> > immediately optimize the code sequence to
> >
> >   _6 = .ATOMIC_BIT_TEST_AND_SET (&v, 0, 0, 0);
> >   _4 = (_Bool) _6;
> >
> > and there is nothing to fold after it.
>
> Hmm, I see - so how about teaching the code that produces the
> .ATOMIC_BIT_TEST_AND_SET the alternate forms, instead of doing the
> intermediate step separately?
>

The old algorithm is

1.  Check gimple forms.  Return if the form isn't supported.
2.  Do transformation.

My current approach treats the gimple forms accepted by the
old algorithm as canonical forms and changes the algorithm
to

1.  If gimple forms aren't canonical, then
       a. If gimple forms can't be transformed to canonical forms,
           return;
       b. Transform to canonical form.
   endif
2.  Check gimple forms.  Return if the form isn't supported.
3.  Do transformation.

The #2 check is redundant when gimple forms have been
transformed to canonical forms.

I can change my patch to

1.  If gimple forms aren't canonical, then
       a. If gimple forms can't be transformed to canonical forms,
           return;
       b. Transform to canonical form.
    else
      Check gimple forms. Return if the form isn't supported.
    endif
2.  Do transformation.

The advantage of canonical forms is that we don't have to handle
each of the different forms separately.

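To make the canonical forms concrete, here is a minimal C sketch of two
non-canonical inputs (the function names are invented for this mail;
both shapes are taken from the new tests).  After step 1 both are
canonicalized and then turned into .ATOMIC_BIT_TEST_AND_SET:

  #include <stdatomic.h>
  #include <stdbool.h>

  /* gimple: _1 = __atomic_fetch_or_4 (v, 1, 0); _4 = (_Bool) _1;  */
  bool
  bool_form (_Atomic int *v)
  {
    return atomic_fetch_or_explicit (v, 1, memory_order_relaxed) & 1;
  }

  /* gimple: _1 = __atomic_fetch_or_4 (v, mask, 0);
     _2 = (unsigned int) _1;
     _5 = _2 & mask;  */
  bool
  cast_form (_Atomic int *v, int bit)
  {
    unsigned int mask = 1 << bit;
    return atomic_fetch_or_explicit (v, mask, memory_order_relaxed) & mask;
  }
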
Does it sound OK?

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] Canonicalize __atomic/sync_fetch_or/xor/and for constant mask.
  2021-10-08 14:55       ` H.J. Lu
@ 2021-10-22  5:48         ` liuhongt
  2021-10-22 13:12           ` H.J. Lu
  0 siblings, 1 reply; 10+ messages in thread
From: liuhongt @ 2021-10-22  5:48 UTC (permalink / raw)
  To: gcc-patches

Hi:
 This patch tries to canonicalize the bit_and and nop_convert order for
__atomic_fetch_or_*, __atomic_fetch_xor_*,
__atomic_xor_fetch_*, __sync_fetch_and_or_*,
__sync_fetch_and_xor_*, __sync_xor_and_fetch_*,
__atomic_fetch_and_* and __sync_fetch_and_and_* when the mask is constant.

i.e.

+/* Canonicalize
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = (int) _1;
+  _5 = _2 & 1;
+
+to
+
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = _1 & 1;
+  _5 = (int) _2;

+/* Convert
+  _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
+  _2 = (int) _1;
+  _3 = _2 & 8192;
+to
+  _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
+  _7 = _1 & 8192;
+  _6 = (int) _7;
+  So it can be handled by optimize_atomic_bit_test_and.  */

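At the source level this cast shows up, for example, when the operand is
narrower than int, because the & is applied after integer promotion.  A
minimal sketch (the function name is invented; the shape matches the
FOO (short, 7) case in the testcase below):

  /* gimple before canonicalization:
       _1 = __atomic_fetch_or_2 (a, 128, 0);
       _2 = (int) _1;
       _5 = _2 & 128;  */
  short
  set_bit7 (_Atomic short *a)
  {
    short mask = 1 << 7;
    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;
  }
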
I tried to rewrite the match part in match.pd and found that the
canonicalization is fine when the mask is constant, but not for a
variable mask, since it would be simplified back by
 /* In GIMPLE, getting rid of 2 conversions for one new results
    in smaller IL.  */
 (simplify
  (convert (bitop:cs@2 (nop_convert:s @0) @1))
  (if (GIMPLE
       && TREE_CODE (@1) != INTEGER_CST
       && tree_nop_conversion_p (type, TREE_TYPE (@2))
       && types_match (type, @0))
   (bitop @0 (convert @1)))))

The canonicalization for a variable mask would be

convert
  _1 = ~mask_7;
  _2 = (unsigned int) _1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
  _4 = (int) _3;
  _5 = _4 & mask_7;

to
  _1 = ~mask_7;
  _2 = (unsigned int) _1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
  _4 = (unsigned int) mask_7;
  _6 = _3 & _4;
  _5 = (int) _6;

which would then be simplified back.
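
For reference, the variable-mask source producing the gimple above looks
like this sketch (the function name is invented):

  /* ~mask is computed in int, converted to unsigned for the builtin,
     and the result is converted back to int for the bit test.  */
  int
  clear_and_test (unsigned int *ptr, int mask)
  {
    return (int) __atomic_fetch_and (ptr, (unsigned int) ~mask,
                                     __ATOMIC_RELAXED) & mask;
  }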

I've also tried another way of simplification:

convert
  _1 = ~mask_7;
  _2 = (unsigned int) _1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
  _4 = (int) _3;
  _5 = _4 & mask_7;

to
  _1 = (unsigned int) mask_7;
  _2 = ~_1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
  _6 = _3 & _1;
  _5 = (int) _6;

but this is prevented by the check below, since __atomic_fetch_and_4 is
not CONST and the call would have to be regenerated with the updated
parameter.

  /* We can't and should not emit calls to non-const functions.  */
  if (!(flags_from_decl_or_type (decl) & ECF_CONST))
    return NULL;


  Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

	* match.pd: Canonicalize bit_and and nop_convert order for
	__atomic/sync_fetch_or/xor/and when the mask is constant.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102566-1a.c: New test.
	* gcc.target/i386/pr102566-2a.c: New test.
---
 gcc/match.pd                                | 118 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-1a.c |  66 +++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-2a.c |  65 +++++++++++
 3 files changed, 249 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2a.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 5bed2e12715..06b369d1ab1 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -104,6 +104,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (define_operator_list COND_TERNARY
   IFN_COND_FMA IFN_COND_FMS IFN_COND_FNMA IFN_COND_FNMS)
 
+/* __atomic_fetch_or_*, __atomic_fetch_xor_*, __atomic_xor_fetch_*  */
+(define_operator_list ATOMIC_FETCH_OR_XOR_N
+  BUILT_IN_ATOMIC_FETCH_OR_1 BUILT_IN_ATOMIC_FETCH_OR_2
+  BUILT_IN_ATOMIC_FETCH_OR_4 BUILT_IN_ATOMIC_FETCH_OR_8
+  BUILT_IN_ATOMIC_FETCH_OR_16
+  BUILT_IN_ATOMIC_FETCH_XOR_1 BUILT_IN_ATOMIC_FETCH_XOR_2
+  BUILT_IN_ATOMIC_FETCH_XOR_4 BUILT_IN_ATOMIC_FETCH_XOR_8
+  BUILT_IN_ATOMIC_FETCH_XOR_16
+  BUILT_IN_ATOMIC_XOR_FETCH_1 BUILT_IN_ATOMIC_XOR_FETCH_2
+  BUILT_IN_ATOMIC_XOR_FETCH_4 BUILT_IN_ATOMIC_XOR_FETCH_8
+  BUILT_IN_ATOMIC_XOR_FETCH_16)
+/* __sync_fetch_and_or_*, __sync_fetch_and_xor_*, __sync_xor_and_fetch_*  */
+(define_operator_list SYNC_FETCH_OR_XOR_N
+  BUILT_IN_SYNC_FETCH_AND_OR_1 BUILT_IN_SYNC_FETCH_AND_OR_2
+  BUILT_IN_SYNC_FETCH_AND_OR_4 BUILT_IN_SYNC_FETCH_AND_OR_8
+  BUILT_IN_SYNC_FETCH_AND_OR_16
+  BUILT_IN_SYNC_FETCH_AND_XOR_1 BUILT_IN_SYNC_FETCH_AND_XOR_2
+  BUILT_IN_SYNC_FETCH_AND_XOR_4 BUILT_IN_SYNC_FETCH_AND_XOR_8
+  BUILT_IN_SYNC_FETCH_AND_XOR_16
+  BUILT_IN_SYNC_XOR_AND_FETCH_1 BUILT_IN_SYNC_XOR_AND_FETCH_2
+  BUILT_IN_SYNC_XOR_AND_FETCH_4 BUILT_IN_SYNC_XOR_AND_FETCH_8
+  BUILT_IN_SYNC_XOR_AND_FETCH_16)
+/* __atomic_fetch_and_*.  */
+(define_operator_list ATOMIC_FETCH_AND_N
+  BUILT_IN_ATOMIC_FETCH_AND_1 BUILT_IN_ATOMIC_FETCH_AND_2
+  BUILT_IN_ATOMIC_FETCH_AND_4 BUILT_IN_ATOMIC_FETCH_AND_8
+  BUILT_IN_ATOMIC_FETCH_AND_16)
+/* __sync_fetch_and_and_*.  */
+(define_operator_list SYNC_FETCH_AND_AND_N
+  BUILT_IN_SYNC_FETCH_AND_AND_1 BUILT_IN_SYNC_FETCH_AND_AND_2
+  BUILT_IN_SYNC_FETCH_AND_AND_4 BUILT_IN_SYNC_FETCH_AND_AND_8
+  BUILT_IN_SYNC_FETCH_AND_AND_16)
+
 /* With nop_convert? combine convert? and view_convert? in one pattern
    plus conditionalize on tree_nop_conversion_p conversions.  */
 (match (nop_convert @0)
@@ -3907,6 +3940,91 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (vec_cond @0 (op! @3 @1) (op! @3 @2))))
 #endif
 
+#if GIMPLE
+/* Canonicalize
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = (int) _1;
+  _5 = _2 & 1;
+
+to
+
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = _1 & 1;
+  _5 = (int) _2;
+
+  So it can be handled by optimize_atomic_bit_test_and.  */
+(simplify
+  (bit_and
+    (nop_convert@5 (ATOMIC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1 @2))
+    INTEGER_CST@4)
+    (with { int ibit = tree_log2 (@1);
+	    int ibit2 = tree_log2 (@4); }
+      (if (ibit >= 0 && ibit == ibit2
+	   && single_use (@5))
+      /* Make sure the second operand has the same type as @3,
+	 or else we will hit a gcc_assert.  */
+	 (convert:type
+	   (bit_and @3
+		    { build_int_cst (TREE_TYPE (@3),
+				     HOST_WIDE_INT_1U << ibit);})))))
+
+(simplify
+  (bit_and
+    (nop_convert@4 (SYNC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1))
+    INTEGER_CST@2)
+    (with { int ibit = tree_log2 (@1);
+	    int ibit2 = tree_log2 (@2); }
+      (if (ibit >= 0 && ibit == ibit2
+	   && single_use (@4))
+      /* Make sure the second operand has the same type as @3,
+	 or else we will hit a gcc_assert.  */
+	 (convert:type
+	   (bit_and @3
+		    { build_int_cst (TREE_TYPE (@3),
+				     HOST_WIDE_INT_1U << ibit);})))))
+/* Convert
+  _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
+  _2 = (int) _1;
+  _3 = _2 & 8192;
+to
+  _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
+  _7 = _1 & 8192;
+  _6 = (int) _7;
+  So it can be handled by optimize_atomic_bit_test_and.  */
+
+(simplify
+  (bit_and
+    (nop_convert@5 (ATOMIC_FETCH_AND_N@3 @0 INTEGER_CST@1 @2))
+    INTEGER_CST@4)
+    (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
+						 TYPE_PRECISION (type)));
+	    int ibit2 = tree_log2 (@4); }
+      (if (ibit >= 0 && ibit == ibit2
+	   && single_use (@5))
+      /* Make sure the second operand has the same type as @3,
+	 or else we will hit a gcc_assert.  */
+	 (convert:type
+	   (bit_and @3
+		    { build_int_cst (TREE_TYPE (@3),
+				     HOST_WIDE_INT_1U << ibit);})))))
+
+(simplify
+  (bit_and
+    (nop_convert@4 (SYNC_FETCH_AND_AND_N@3 @0 @1))
+    INTEGER_CST@2)
+    (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
+						 TYPE_PRECISION (type)));
+	    int ibit2 = tree_log2 (@2); }
+      (if (ibit >= 0 && ibit == ibit2
+	   && single_use (@4))
+      /* Make sure the second operand has the same type as @3,
+	 or else we will hit a gcc_assert.  */
+	 (convert:type
+	   (bit_and @3
+		    { build_int_cst (TREE_TYPE (@3),
+				     HOST_WIDE_INT_1U << ibit);})))))
+#endif
+
 /* (v ? w : 0) ? a : b is just (v & w) ? a : b
    Currently disabled after pass lvec because ARM understands
    VEC_COND_EXPR<v==w,-1,0> but not a plain v==w fed to BIT_IOR_EXPR.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
new file mode 100644
index 00000000000..2657a2f62ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+#include <stdatomic.h>
+#include <stdbool.h>
+
+#define FOO(TYPE,MASK)							\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_fetch_and_or (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_fetch_and_xor (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_xor_and_fetch (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_fetch_and_and (a, ~mask) & mask;			\
+  }									\
+
+FOO(short, 0);
+FOO(short, 7);
+FOO(short, 15);
+FOO(int, 0);
+FOO(int, 15);
+FOO(int, 31);
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 12 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 24 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 12 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2a.c b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
new file mode 100644
index 00000000000..24681c1da18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
@@ -0,0 +1,65 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+#include <stdatomic.h>
+#include <stdbool.h>
+typedef long long int64;
+
+#define FOO(TYPE,MASK)							\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_fetch_and_or (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_fetch_and_xor (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_xor_and_fetch (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_fetch_and_and (a, ~mask) & mask;			\
+  }									\
+
+
+FOO(int64, 0);
+FOO(int64, 32);
+FOO(int64, 63);
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 6 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 12 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
-- 
2.18.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Canonicalize __atomic/sync_fetch_or/xor/and for constant mask.
  2021-10-22  5:48         ` [PATCH] Canonicalize __atomic/sync_fetch_or/xor/and for constant mask liuhongt
@ 2021-10-22 13:12           ` H.J. Lu
  2021-10-25  5:59             ` liuhongt
  0 siblings, 1 reply; 10+ messages in thread
From: H.J. Lu @ 2021-10-22 13:12 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches, Hongtao Liu, Richard Biener

On Thu, Oct 21, 2021 at 10:48 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> Hi:
>  This patch tries to canonicalize the bit_and and nop_convert order for
> __atomic_fetch_or_*, __atomic_fetch_xor_*,
> __atomic_xor_fetch_*, __sync_fetch_and_or_*,
> __sync_fetch_and_xor_*, __sync_xor_and_fetch_*,
> __atomic_fetch_and_* and __sync_fetch_and_and_* when the mask is constant.
>
> i.e.
>
> +/* Canonicalize
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = (int) _1;
> +  _5 = _2 & 1;
> +
> +to
> +
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = _1 & 1;
> +  _5 = (int) _2;
>
> +/* Convert
> +  _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
> +  _2 = (int) _1;
> +  _3 = _2 & 8192;
> +to
> +  _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
> +  _7 = _1 & 8192;
> +  _6 = (int) _7;
> +  So it can be handled by optimize_atomic_bit_test_and.  */
>
> I tried to rewrite the match part in match.pd and found that the
> canonicalization is fine when the mask is constant, but not for a
> variable mask, since it would be simplified back by
>  /* In GIMPLE, getting rid of 2 conversions for one new results
>     in smaller IL.  */
>  (simplify
>   (convert (bitop:cs@2 (nop_convert:s @0) @1))
>   (if (GIMPLE
>        && TREE_CODE (@1) != INTEGER_CST
>        && tree_nop_conversion_p (type, TREE_TYPE (@2))
>        && types_match (type, @0))
>    (bitop @0 (convert @1)))))
>
> The canonicalization for a variable mask would be
>
> convert
>   _1 = ~mask_7;
>   _2 = (unsigned int) _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>   _4 = (int) _3;
>   _5 = _4 & mask_7;
>
> to
>   _1 = ~mask_7;
>   _2 = (unsigned int) _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>   _4 = (unsigned int) mask_7;
>   _6 = _3 & _4;
>   _5 = (int) _6;
>
> which would then be simplified back.
>
> I've also tried another way of simplication like
>
> convert
>   _1 = ~mask_7;
>   _2 = (unsigned int) _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>  _4 = (int) _3;
>  _5 = _4 & mask_7;
>
> to
>   _1 = (unsigned int)mask_7;
>   _2 = ~ _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>    _6 = _3 & _1
>   _5 = (int) _6
>
> but it's prevented by the check below, since __atomic_fetch_and_4 is not
> CONST and we would need to regenerate the call with an updated parameter.
>
>   /* We can't and should not emit calls to non-const functions.  */
>   if (!(flags_from_decl_or_type (decl) & ECF_CONST))
>     return NULL;
>
>
>   Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
>   Ok for trunk?
>
> gcc/ChangeLog:
>
>         * match.pd: Canonicalize bit_and and nop_convert order for
>         __atomic/sync_fetch_or/xor/and when the mask is constant.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102566-1a.c: New test.
>         * gcc.target/i386/pr102566-2a.c: New test.
> ---
>  gcc/match.pd                                | 118 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-1a.c |  66 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-2a.c |  65 +++++++++++
>  3 files changed, 249 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2a.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5bed2e12715..06b369d1ab1 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -104,6 +104,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (define_operator_list COND_TERNARY
>    IFN_COND_FMA IFN_COND_FMS IFN_COND_FNMA IFN_COND_FNMS)
>
> +/* __atomic_fetch_or_*, __atomic_fetch_xor_*, __atomic_xor_fetch_*  */
> +(define_operator_list ATOMIC_FETCH_OR_XOR_N
> +  BUILT_IN_ATOMIC_FETCH_OR_1 BUILT_IN_ATOMIC_FETCH_OR_2
> +  BUILT_IN_ATOMIC_FETCH_OR_4 BUILT_IN_ATOMIC_FETCH_OR_8
> +  BUILT_IN_ATOMIC_FETCH_OR_16
> +  BUILT_IN_ATOMIC_FETCH_XOR_1 BUILT_IN_ATOMIC_FETCH_XOR_2
> +  BUILT_IN_ATOMIC_FETCH_XOR_4 BUILT_IN_ATOMIC_FETCH_XOR_8
> +  BUILT_IN_ATOMIC_FETCH_XOR_16
> +  BUILT_IN_ATOMIC_XOR_FETCH_1 BUILT_IN_ATOMIC_XOR_FETCH_2
> +  BUILT_IN_ATOMIC_XOR_FETCH_4 BUILT_IN_ATOMIC_XOR_FETCH_8
> +  BUILT_IN_ATOMIC_XOR_FETCH_16)
> +/* __sync_fetch_and_or_*, __sync_fetch_and_xor_*, __sync_xor_and_fetch_*  */
> +(define_operator_list SYNC_FETCH_OR_XOR_N
> +  BUILT_IN_SYNC_FETCH_AND_OR_1 BUILT_IN_SYNC_FETCH_AND_OR_2
> +  BUILT_IN_SYNC_FETCH_AND_OR_4 BUILT_IN_SYNC_FETCH_AND_OR_8
> +  BUILT_IN_SYNC_FETCH_AND_OR_16
> +  BUILT_IN_SYNC_FETCH_AND_XOR_1 BUILT_IN_SYNC_FETCH_AND_XOR_2
> +  BUILT_IN_SYNC_FETCH_AND_XOR_4 BUILT_IN_SYNC_FETCH_AND_XOR_8
> +  BUILT_IN_SYNC_FETCH_AND_XOR_16
> +  BUILT_IN_SYNC_XOR_AND_FETCH_1 BUILT_IN_SYNC_XOR_AND_FETCH_2
> +  BUILT_IN_SYNC_XOR_AND_FETCH_4 BUILT_IN_SYNC_XOR_AND_FETCH_8
> +  BUILT_IN_SYNC_XOR_AND_FETCH_16)
> +/* __atomic_fetch_and_*.  */
> +(define_operator_list ATOMIC_FETCH_AND_N
> +  BUILT_IN_ATOMIC_FETCH_AND_1 BUILT_IN_ATOMIC_FETCH_AND_2
> +  BUILT_IN_ATOMIC_FETCH_AND_4 BUILT_IN_ATOMIC_FETCH_AND_8
> +  BUILT_IN_ATOMIC_FETCH_AND_16)
> +/* __sync_fetch_and_and_*.  */
> +(define_operator_list SYNC_FETCH_AND_AND_N
> +  BUILT_IN_SYNC_FETCH_AND_AND_1 BUILT_IN_SYNC_FETCH_AND_AND_2
> +  BUILT_IN_SYNC_FETCH_AND_AND_4 BUILT_IN_SYNC_FETCH_AND_AND_8
> +  BUILT_IN_SYNC_FETCH_AND_AND_16)
> +
>  /* With nop_convert? combine convert? and view_convert? in one pattern
>     plus conditionalize on tree_nop_conversion_p conversions.  */
>  (match (nop_convert @0)
> @@ -3907,6 +3940,91 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>    (vec_cond @0 (op! @3 @1) (op! @3 @2))))
>  #endif
>
> +#if GIMPLE
> +/* Canonicalize
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = (int) _1;
> +  _5 = _2 & 1;
> +
> +to
> +
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = _1 & 1;
> +  _5 = (int) _2;
> +
> +  So it can be handled by optimize_atomic_bit_test_and.  */
> +(simplify
> +  (bit_and
> +    (nop_convert@5 (ATOMIC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1 @2))
> +    INTEGER_CST@4)
> +    (with { int ibit = tree_log2 (@1);
> +           int ibit2 = tree_log2 (@4); }
> +      (if (ibit >= 0 && ibit == ibit2
> +          && single_use (@5))

Is it possible to check single_use first before checking constants?
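
For example, something along these lines (just a sketch of the
reordering; it hoists the cheap single_use test so the pattern bails
out before computing the bit positions):

 (simplify
  (bit_and
   (nop_convert@5 (ATOMIC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1 @2))
   INTEGER_CST@4)
  (if (single_use (@5))
   (with { int ibit = tree_log2 (@1);
           int ibit2 = tree_log2 (@4); }
    (if (ibit >= 0 && ibit == ibit2)
     (convert:type
      (bit_and @3
               { build_int_cst (TREE_TYPE (@3),
                                HOST_WIDE_INT_1U << ibit); }))))))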

> +      /* Make sure the second operand has the same type as @3,
> +        or else we will hit a gcc_assert.  */
> +        (convert:type
> +          (bit_and @3
> +                   { build_int_cst (TREE_TYPE (@3),
> +                                    HOST_WIDE_INT_1U << ibit);})))))
> +
> +(simplify
> +  (bit_and
> +    (nop_convert@4 (SYNC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1))
> +    INTEGER_CST@2)
> +    (with { int ibit = tree_log2 (@1);
> +           int ibit2 = tree_log2 (@2); }
> +      (if (ibit >= 0 && ibit == ibit2
> +          && single_use (@4))
> +      /* Make sure the second operand has the same type as @3,
> +        or else we will hit a gcc_assert.  */
> +        (convert:type
> +          (bit_and @3
> +                   { build_int_cst (TREE_TYPE (@3),
> +                                    HOST_WIDE_INT_1U << ibit);})))))
> +/* Convert
> + _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
> + _2 = (int) _1;
> + _3 = _2 & 8192;
> +to
> +  _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
> +  _7 = _1 & 8192;
> +  _6 = (int) _7;
> + So it can be handled by  optimize_atomic_bit_test_and.  */
> +
> +(simplify
> +  (bit_and
> +    (nop_convert@5 (ATOMIC_FETCH_AND_N@3 @0 INTEGER_CST@1 @2))
> +    INTEGER_CST@4)
> +    (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
> +                                                TYPE_PRECISION(type)));
> +           int ibit2 = tree_log2 (@4); }
> +      (if (ibit >= 0 && ibit == ibit2
> +          && single_use (@5))
> +      /* Make sure the second operand has the same type as @3,
> +        or else we will hit a gcc_assert.  */
> +        (convert:type
> +          (bit_and @3
> +                   { build_int_cst (TREE_TYPE (@3),
> +                                    HOST_WIDE_INT_1U << ibit);})))))
> +
> +(simplify
> +  (bit_and
> +    (nop_convert@4 (SYNC_FETCH_AND_AND_N@3 @0 @1))
> +    INTEGER_CST@2)
> +    (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
> +                                                TYPE_PRECISION(type)));
> +           int ibit2 = tree_log2 (@2); }
> +      (if (ibit >= 0 && ibit == ibit2
> +          && single_use (@4))
> +      /* Make sure the second operand has the same type as @3,
> +        or else we will hit a gcc_assert.  */
> +        (convert:type
> +          (bit_and @3
> +                   { build_int_cst (TREE_TYPE (@3),
> +                                    HOST_WIDE_INT_1U << ibit);})))))
> +#endif
> +
>  /* (v ? w : 0) ? a : b is just (v & w) ? a : b
>     Currently disabled after pass lvec because ARM understands
>     VEC_COND_EXPR<v==w,-1,0> but not a plain v==w fed to BIT_IOR_EXPR.  */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> new file mode 100644
> index 00000000000..2657a2f62ae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> @@ -0,0 +1,66 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +
> +#define FOO(TYPE,MASK)                                                 \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)                    \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;     \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)                  \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_fetch_and_or (a, mask) & mask;                       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_fetch_and_xor (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_xor_and_fetch (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_fetch_and_and (a, ~mask) & mask;                     \
> +  }                                                                    \
> +
> +FOO(short, 0);
> +FOO(short, 7);
> +FOO(short, 15);
> +FOO(int, 0);
> +FOO(int, 15);
> +FOO(int, 31);
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 12 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 24 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 12 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2a.c b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
> new file mode 100644
> index 00000000000..24681c1da18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
> @@ -0,0 +1,65 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2" } */
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +typedef long long int64;
> +
> +#define FOO(TYPE,MASK)                                                 \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)                    \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;     \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)                  \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_fetch_and_or (a, mask) & mask;                       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_fetch_and_xor (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_xor_and_fetch (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_fetch_and_and (a, ~mask) & mask;                     \
> +  }                                                                    \
> +
> +
> +FOO(int64, 0);
> +FOO(int64, 32);
> +FOO(int64, 63);
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 6 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 12 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> --
> 2.18.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH] Canonicalize __atomic/sync_fetch_or/xor/and for constant mask.
  2021-10-22 13:12           ` H.J. Lu
@ 2021-10-25  5:59             ` liuhongt
  2021-10-25  9:07               ` Hongtao Liu
  0 siblings, 1 reply; 10+ messages in thread
From: liuhongt @ 2021-10-25  5:59 UTC (permalink / raw)
  To: gcc-patches

Canonicalize the & and nop_convert order for
__atomic_fetch_or_*, __atomic_fetch_xor_*,
__atomic_xor_fetch_*,__sync_fetch_and_or_*,
__sync_fetch_and_xor_*,__sync_xor_and_fetch_*,
__atomic_fetch_and_*,__sync_fetch_and_and_* when mask is constant.

i.e.

+/* Canonicalize
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = (int) _1;
+  _5 = _2 & 1;
+
+to
+
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = _1 & 1;
+  _5 = (int) _2;

+/* Convert
+ _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
+ _2 = (int) _1;
+ _3 = _2 & 8192;
+to
+  _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
+  _7 = _1 & 8192;
+  _6 = (int) _7;
+ So it can be handled by  optimize_atomic_bit_test_and.  */
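
For reference, a minimal user-level example of the shape this targets
(the function name and bit number are made up for illustration); with
the canonicalization in place it compiles to a single "lock btsl" at
-O2 on x86_64 instead of a cmpxchg loop, which is what the new tests
scan for:

#include <stdatomic.h>

int
tbit13 (_Atomic int *p)
{
  /* In GIMPLE the builtin returns unsigned int, so an (int)
     conversion sits between the call and the bit_and and used to
     block optimize_atomic_bit_test_and.  */
  int mask = 1 << 13;
  return __atomic_fetch_or (p, mask, __ATOMIC_RELAXED) & mask;
}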

I'm trying to rewrite the match part in match.pd and find that the
canonicalization is fine when the mask is constant, but not for a
variable mask, since it will be simplified back by
 /* In GIMPLE, getting rid of 2 conversions for one new results
    in smaller IL.  */
 (simplify
  (convert (bitop:cs@2 (nop_convert:s @0) @1))
  (if (GIMPLE
       && TREE_CODE (@1) != INTEGER_CST
       && tree_nop_conversion_p (type, TREE_TYPE (@2))
       && types_match (type, @0))
   (bitop @0 (convert @1)))))

The canonicalization for a variable mask is like

convert
  _1 = ~mask_7;
  _2 = (unsigned int) _1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
 _4 = (int) _3;
 _5 = _4 & mask_7;

to
  _1 = ~mask_7;
  _2 = (unsigned int) _1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
  _4 = (unsigned int) mask_7
  _6 = _3 & _4
  _5 = (int) _6

and be simplified back.

I've also tried another way of simplification, like

convert
  _1 = ~mask_7;
  _2 = (unsigned int) _1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
 _4 = (int) _3;
 _5 = _4 & mask_7;

to
  _1 = (unsigned int)mask_7;
  _2 = ~ _1;
  _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
   _6 = _3 & _1
  _5 = (int) _6

but it's prevented by the check below, since __atomic_fetch_and_4 is not
CONST and we would need to regenerate the call with an updated parameter.

  /* We can't and should not emit calls to non-const functions.  */
  if (!(flags_from_decl_or_type (decl) & ECF_CONST))
    return NULL;

gcc/ChangeLog:

	* match.pd: Canonicalize __atomic/sync_fetch_or/xor/and for
	constant mask.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102566-1a.c: New test.
	* gcc.target/i386/pr102566-2a.c: New test.
---
 gcc/match.pd                                | 114 ++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-1a.c |  66 ++++++++++++
 gcc/testsuite/gcc.target/i386/pr102566-2a.c |  65 +++++++++++
 3 files changed, 245 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2a.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 5bed2e12715..545a243eae6 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -104,6 +104,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (define_operator_list COND_TERNARY
   IFN_COND_FMA IFN_COND_FMS IFN_COND_FNMA IFN_COND_FNMS)
 
+/* __atomic_fetch_or_*, __atomic_fetch_xor_*, __atomic_xor_fetch_*  */
+(define_operator_list ATOMIC_FETCH_OR_XOR_N
+  BUILT_IN_ATOMIC_FETCH_OR_1 BUILT_IN_ATOMIC_FETCH_OR_2
+  BUILT_IN_ATOMIC_FETCH_OR_4 BUILT_IN_ATOMIC_FETCH_OR_8
+  BUILT_IN_ATOMIC_FETCH_OR_16
+  BUILT_IN_ATOMIC_FETCH_XOR_1 BUILT_IN_ATOMIC_FETCH_XOR_2
+  BUILT_IN_ATOMIC_FETCH_XOR_4 BUILT_IN_ATOMIC_FETCH_XOR_8
+  BUILT_IN_ATOMIC_FETCH_XOR_16
+  BUILT_IN_ATOMIC_XOR_FETCH_1 BUILT_IN_ATOMIC_XOR_FETCH_2
+  BUILT_IN_ATOMIC_XOR_FETCH_4 BUILT_IN_ATOMIC_XOR_FETCH_8
+  BUILT_IN_ATOMIC_XOR_FETCH_16)
+/* __sync_fetch_and_or_*, __sync_fetch_and_xor_*, __sync_xor_and_fetch_*  */
+(define_operator_list SYNC_FETCH_OR_XOR_N
+  BUILT_IN_SYNC_FETCH_AND_OR_1 BUILT_IN_SYNC_FETCH_AND_OR_2
+  BUILT_IN_SYNC_FETCH_AND_OR_4 BUILT_IN_SYNC_FETCH_AND_OR_8
+  BUILT_IN_SYNC_FETCH_AND_OR_16
+  BUILT_IN_SYNC_FETCH_AND_XOR_1 BUILT_IN_SYNC_FETCH_AND_XOR_2
+  BUILT_IN_SYNC_FETCH_AND_XOR_4 BUILT_IN_SYNC_FETCH_AND_XOR_8
+  BUILT_IN_SYNC_FETCH_AND_XOR_16
+  BUILT_IN_SYNC_XOR_AND_FETCH_1 BUILT_IN_SYNC_XOR_AND_FETCH_2
+  BUILT_IN_SYNC_XOR_AND_FETCH_4 BUILT_IN_SYNC_XOR_AND_FETCH_8
+  BUILT_IN_SYNC_XOR_AND_FETCH_16)
+/* __atomic_fetch_and_*.  */
+(define_operator_list ATOMIC_FETCH_AND_N
+  BUILT_IN_ATOMIC_FETCH_AND_1 BUILT_IN_ATOMIC_FETCH_AND_2
+  BUILT_IN_ATOMIC_FETCH_AND_4 BUILT_IN_ATOMIC_FETCH_AND_8
+  BUILT_IN_ATOMIC_FETCH_AND_16)
+/* __sync_fetch_and_and_*.  */
+(define_operator_list SYNC_FETCH_AND_AND_N
+  BUILT_IN_SYNC_FETCH_AND_AND_1 BUILT_IN_SYNC_FETCH_AND_AND_2
+  BUILT_IN_SYNC_FETCH_AND_AND_4 BUILT_IN_SYNC_FETCH_AND_AND_8
+  BUILT_IN_SYNC_FETCH_AND_AND_16)
+
 /* With nop_convert? combine convert? and view_convert? in one pattern
    plus conditionalize on tree_nop_conversion_p conversions.  */
 (match (nop_convert @0)
@@ -3907,6 +3940,87 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (vec_cond @0 (op! @3 @1) (op! @3 @2))))
 #endif
 
+#if GIMPLE
+/* Canonicalize
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = (int) _1;
+  _5 = _2 & 1;
+
+to
+
+  _1 = __atomic_fetch_or_4 (&v, 1, 0);
+  _2 = _1 & 1;
+  _5 = (int) _2;
+
+  So it can be handled by optimize_atomic_bit_test_and.  */
+(simplify
+ (bit_and
+  (nop_convert@5 (ATOMIC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1 @2))
+  INTEGER_CST@4)
+ (if (single_use (@5))
+  (with { int ibit = tree_log2 (@1);
+	  int ibit2 = tree_log2 (@4); }
+   (if (ibit >= 0 && ibit == ibit2)
+   /* Make sure the second operand has the same type as @3,
+      or else we will hit a gcc_assert.  */
+    (convert:type
+     (bit_and @3
+	      { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
+
+(simplify
+ (bit_and
+  (nop_convert@4 (SYNC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1))
+  INTEGER_CST@2)
+ (if (single_use (@4))
+  (with { int ibit = tree_log2 (@1);
+	  int ibit2 = tree_log2 (@2); }
+   (if (ibit >= 0 && ibit == ibit2)
+   /* Make sure the second operand has the same type as @3,
+      or else we will hit a gcc_assert.  */
+    (convert:type
+     (bit_and @3
+	      { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
+/* Convert
+ _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
+ _2 = (int) _1;
+ _3 = _2 & 8192;
+to
+  _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
+  _7 = _1 & 8192;
+  _6 = (int) _7;
+ So it can be handled by  optimize_atomic_bit_test_and.  */
+
+(simplify
+ (bit_and
+  (nop_convert@5 (ATOMIC_FETCH_AND_N@3 @0 INTEGER_CST@1 @2))
+  INTEGER_CST@4)
+ (if (single_use (@5))
+  (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
+					       TYPE_PRECISION(type)));
+	  int ibit2 = tree_log2 (@4); }
+   (if (ibit >= 0 && ibit == ibit2)
+   /* Make sure the second operand has the same type as @3,
+      or else we will hit a gcc_assert.  */
+    (convert:type
+     (bit_and @3
+	      { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
+
+(simplify
+ (bit_and
+  (nop_convert@4 (SYNC_FETCH_AND_AND_N@3 @0 @1))
+  INTEGER_CST@2)
+ (if (single_use(@4))
+  (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
+					       TYPE_PRECISION(type)));
+	  int ibit2 = tree_log2 (@2); }
+   (if (ibit >= 0 && ibit == ibit2)
+   /* Make sure the second operand has the same type as @3,
+      or else we will hit a gcc_assert.  */
+    (convert:type
+     (bit_and @3
+	      { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
+#endif
+
 /* (v ? w : 0) ? a : b is just (v & w) ? a : b
    Currently disabled after pass lvec because ARM understands
    VEC_COND_EXPR<v==w,-1,0> but not a plain v==w fed to BIT_IOR_EXPR.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
new file mode 100644
index 00000000000..2657a2f62ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
@@ -0,0 +1,66 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+#include <stdatomic.h>
+#include <stdbool.h>
+
+#define FOO(TYPE,MASK)							\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_fetch_and_or (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_fetch_and_xor (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_xor_and_fetch (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1 << MASK;						\
+    return __sync_fetch_and_and (a, ~mask) & mask;			\
+  }									\
+
+FOO(short, 0);
+FOO(short, 7);
+FOO(short, 15);
+FOO(int, 0);
+FOO(int, 15);
+FOO(int, 31);
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 12 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 24 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 12 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2a.c b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
new file mode 100644
index 00000000000..24681c1da18
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
@@ -0,0 +1,65 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+#include <stdatomic.h>
+#include <stdbool.h>
+typedef long long int64;
+
+#define FOO(TYPE,MASK)							\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;	\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_fetch_and_or (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_fetch_and_xor (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_xor_and_fetch (a, mask) & mask;			\
+  }									\
+  __attribute__((noinline,noclone)) TYPE				\
+  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)			\
+  {									\
+    TYPE mask = 1ll << MASK;						\
+    return __sync_fetch_and_and (a, ~mask) & mask;			\
+  }									\
+
+
+FOO(int64, 0);
+FOO(int64, 32);
+FOO(int64, 63);
+
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 6 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 12 } } */
+/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
+/* { dg-final { scan-assembler-not "cmpxchg" } } */
-- 
2.18.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Canonicalize __atomic/sync_fetch_or/xor/and for constant mask.
  2021-10-25  5:59             ` liuhongt
@ 2021-10-25  9:07               ` Hongtao Liu
  0 siblings, 0 replies; 10+ messages in thread
From: Hongtao Liu @ 2021-10-25  9:07 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches, H. J. Lu

On Mon, Oct 25, 2021 at 1:59 PM liuhongt <hongtao.liu@intel.com> wrote:
>
> Canonicalize the & and nop_convert order for
> __atomic_fetch_or_*, __atomic_fetch_xor_*,
> __atomic_xor_fetch_*,__sync_fetch_and_or_*,
> __sync_fetch_and_xor_*,__sync_xor_and_fetch_*,
> __atomic_fetch_and_*,__sync_fetch_and_and_* when mask is constant.
>
> i.e.
>
> +/* Canonicalize
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = (int) _1;
> +  _5 = _2 & 1;
> +
> +to
> +
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = _1 & 1;
> +  _5 = (int) _2;
>
> +/* Convert
> + _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
> + _2 = (int) _1;
> + _3 = _2 & 8192;
> +to
> +  _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
> +  _7 = _1 & 8192;
> +  _6 = (int) _7;
> + So it can be handled by  optimize_atomic_bit_test_and.  */
>
> I'm trying to rewrite the match part in match.pd and find that the
> canonicalization is fine when the mask is constant, but not for a
> variable mask, since it will be simplified back by
>  /* In GIMPLE, getting rid of 2 conversions for one new results
>     in smaller IL.  */
>  (simplify
>   (convert (bitop:cs@2 (nop_convert:s @0) @1))
>   (if (GIMPLE
>        && TREE_CODE (@1) != INTEGER_CST
>        && tree_nop_conversion_p (type, TREE_TYPE (@2))
>        && types_match (type, @0))
>    (bitop @0 (convert @1)))))
>
> The canonicalization for a variable mask is like
>
> convert
>   _1 = ~mask_7;
>   _2 = (unsigned int) _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>  _4 = (int) _3;
>  _5 = _4 & mask_7;
>
> to
>   _1 = ~mask_7;
>   _2 = (unsigned int) _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>   _4 = (unsigned int) mask_7
>   _6 = _3 & _4
>   _5 = (int) _6
>
> and be simplified back.
>
> I've also tried another way of simplification, like
>
> convert
>   _1 = ~mask_7;
>   _2 = (unsigned int) _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>  _4 = (int) _3;
>  _5 = _4 & mask_7;
>
> to
>   _1 = (unsigned int)mask_7;
>   _2 = ~ _1;
>   _3 = __atomic_fetch_and_4 (ptr_6, _2, 0);
>    _6 = _3 & _1
>   _5 = (int) _6
>
> but it's prevented by the check below, since __atomic_fetch_and_4 is not
> CONST and we would need to regenerate the call with an updated parameter.
>
>   /* We can't and should not emit calls to non-const functions.  */
>   if (!(flags_from_decl_or_type (decl) & ECF_CONST))
>     return NULL;
>
> gcc/ChangeLog:
>
>         * match.pd: Canonicalize __atomic/sync_fetch_or/xor/and for
>         constant mask.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102566-1a.c: New test.
>         * gcc.target/i386/pr102566-2a.c: New test.
> ---
>  gcc/match.pd                                | 114 ++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-1a.c |  66 ++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102566-2a.c |  65 +++++++++++
>  3 files changed, 245 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102566-2a.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 5bed2e12715..545a243eae6 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -104,6 +104,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (define_operator_list COND_TERNARY
>    IFN_COND_FMA IFN_COND_FMS IFN_COND_FNMA IFN_COND_FNMS)
>
> +/* __atomic_fetch_or_*, __atomic_fetch_xor_*, __atomic_xor_fetch_*  */
> +(define_operator_list ATOMIC_FETCH_OR_XOR_N
> +  BUILT_IN_ATOMIC_FETCH_OR_1 BUILT_IN_ATOMIC_FETCH_OR_2
> +  BUILT_IN_ATOMIC_FETCH_OR_4 BUILT_IN_ATOMIC_FETCH_OR_8
> +  BUILT_IN_ATOMIC_FETCH_OR_16
> +  BUILT_IN_ATOMIC_FETCH_XOR_1 BUILT_IN_ATOMIC_FETCH_XOR_2
> +  BUILT_IN_ATOMIC_FETCH_XOR_4 BUILT_IN_ATOMIC_FETCH_XOR_8
> +  BUILT_IN_ATOMIC_FETCH_XOR_16
> +  BUILT_IN_ATOMIC_XOR_FETCH_1 BUILT_IN_ATOMIC_XOR_FETCH_2
> +  BUILT_IN_ATOMIC_XOR_FETCH_4 BUILT_IN_ATOMIC_XOR_FETCH_8
> +  BUILT_IN_ATOMIC_XOR_FETCH_16)
> +/* __sync_fetch_and_or_*, __sync_fetch_and_xor_*, __sync_xor_and_fetch_*  */
> +(define_operator_list SYNC_FETCH_OR_XOR_N
> +  BUILT_IN_SYNC_FETCH_AND_OR_1 BUILT_IN_SYNC_FETCH_AND_OR_2
> +  BUILT_IN_SYNC_FETCH_AND_OR_4 BUILT_IN_SYNC_FETCH_AND_OR_8
> +  BUILT_IN_SYNC_FETCH_AND_OR_16
> +  BUILT_IN_SYNC_FETCH_AND_XOR_1 BUILT_IN_SYNC_FETCH_AND_XOR_2
> +  BUILT_IN_SYNC_FETCH_AND_XOR_4 BUILT_IN_SYNC_FETCH_AND_XOR_8
> +  BUILT_IN_SYNC_FETCH_AND_XOR_16
> +  BUILT_IN_SYNC_XOR_AND_FETCH_1 BUILT_IN_SYNC_XOR_AND_FETCH_2
> +  BUILT_IN_SYNC_XOR_AND_FETCH_4 BUILT_IN_SYNC_XOR_AND_FETCH_8
> +  BUILT_IN_SYNC_XOR_AND_FETCH_16)
> +/* __atomic_fetch_and_*.  */
> +(define_operator_list ATOMIC_FETCH_AND_N
> +  BUILT_IN_ATOMIC_FETCH_AND_1 BUILT_IN_ATOMIC_FETCH_AND_2
> +  BUILT_IN_ATOMIC_FETCH_AND_4 BUILT_IN_ATOMIC_FETCH_AND_8
> +  BUILT_IN_ATOMIC_FETCH_AND_16)
> +/* __sync_fetch_and_and_*.  */
> +(define_operator_list SYNC_FETCH_AND_AND_N
> +  BUILT_IN_SYNC_FETCH_AND_AND_1 BUILT_IN_SYNC_FETCH_AND_AND_2
> +  BUILT_IN_SYNC_FETCH_AND_AND_4 BUILT_IN_SYNC_FETCH_AND_AND_8
> +  BUILT_IN_SYNC_FETCH_AND_AND_16)
> +
>  /* With nop_convert? combine convert? and view_convert? in one pattern
>     plus conditionalize on tree_nop_conversion_p conversions.  */
>  (match (nop_convert @0)
> @@ -3907,6 +3940,87 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>    (vec_cond @0 (op! @3 @1) (op! @3 @2))))
>  #endif
>
> +#if GIMPLE
> +/* Canonicalize
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = (int) _1;
> +  _5 = _2 & 1;
> +
> +to
> +
> +  _1 = __atomic_fetch_or_4 (&v, 1, 0);
> +  _2 = _1 & 1;
> +  _5 = (int) _2;
> +
> +  So it can be handled by optimize_atomic_bit_test_and.  */
> +(simplify
> + (bit_and
> +  (nop_convert@5 (ATOMIC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1 @2))
> +  INTEGER_CST@4)
> + (if (single_use (@5))
> +  (with { int ibit = tree_log2 (@1);
> +         int ibit2 = tree_log2 (@4); }
> +   (if (ibit >= 0 && ibit == ibit2)
> +   /* Make sure the second operand has the same type as @3,
> +      or else we will hit a gcc_assert.  */
> +    (convert:type
> +     (bit_and @3
> +             { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +
> +(simplify
> + (bit_and
> +  (nop_convert@4 (SYNC_FETCH_OR_XOR_N@3 @0 INTEGER_CST@1))
> +  INTEGER_CST@2)
> + (if (single_use (@4))
> +  (with { int ibit = tree_log2 (@1);
> +         int ibit2 = tree_log2 (@2); }
> +   (if (ibit >= 0 && ibit == ibit2)
> +   /* Make sure the second operand has the same type as @3,
> +      or else we will hit a gcc_assert.  */
> +    (convert:type
> +     (bit_and @3
> +             { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +/* Convert
> + _1 = __atomic_fetch_and_4 (a_6(D), 4294959103, 0);
> + _2 = (int) _1;
> + _3 = _2 & 8192;
> +to
> +  _1 = __atomic_fetch_and_4 (a_4(D), 4294959103, 0);
> +  _7 = _1 & 8192;
> +  _6 = (int) _7;
> + So it can be handled by  optimize_atomic_bit_test_and.  */
> +
> +(simplify
> + (bit_and
> +  (nop_convert@5 (ATOMIC_FETCH_AND_N@3 @0 INTEGER_CST@1 @2))
> +  INTEGER_CST@4)
> + (if (single_use (@5))
> +  (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
> +                                              TYPE_PRECISION(type)));
> +         int ibit2 = tree_log2 (@4); }
> +   (if (ibit >= 0 && ibit == ibit2)
> +   /* Make sure the second operand has the same type as @3,
> +      or else we will hit a gcc_assert.  */
> +    (convert:type
> +     (bit_and @3
> +             { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +
Now we have the single_use condition checked first; the generated matcher
below bails out before computing ibit/ibit2 when the conversion has other
uses:

static bool
gimple_simplify_469 (gimple_match_op *res_op, gimple_seq *seq,
                 tree (*valueize)(tree) ATTRIBUTE_UNUSED,
                 const tree ARG_UNUSED (type), tree *ARG_UNUSED (captures),
                 const combined_fn ARG_UNUSED (ATOMIC_FETCH_OR_XOR_N))
{
/* #line 3960 "/export/users2/liuhongt/gcc/intel-innersource/lock/gcc/match.pd"
*/
  if (single_use (captures[0])
)
    {
      {
/* #line 3961 "/export/users2/liuhongt/gcc/intel-innersource/lock/gcc/match.pd"
*/
 int ibit = tree_log2 (captures[3]);
 int ibit2 = tree_log2 (captures[5]);
/* #line 3963 "/export/users2/liuhongt/gcc/intel-innersource/lock/gcc/match.pd"
*/
          if (ibit >= 0 && ibit == ibit2
)
            {
              gimple_seq *lseq = seq;
              if (__builtin_expect (!dbg_cnt (match), 0)) goto next_after_fail763;
              if (__builtin_expect (dump_file && (dump_flags & TDF_FOLDING), 0)) fprintf (dump_file, "Applying pattern %s:%d, %s:%d\n", "match.pd", 3966, __FILE__, __LINE__);
              {
                res_op->set_op (NOP_EXPR, type, 1);
                {
                  tree _o1[2], _r1;
                  _o1[0] = captures[1];
                  _o1[1] =  build_int_cst (TREE_TYPE (captures[1]), HOST_WIDE_INT_1U << ibit);
                  gimple_match_op tem_op (res_op->cond.any_else (), BIT_AND_EXPR, TREE_TYPE (_o1[0]), _o1[0], _o1[1]);
                  tem_op.resimplify (lseq, valueize);
                  _r1 = maybe_push_res_to_seq (&tem_op, lseq);
                  if (!_r1) goto next_after_fail763;
                  res_op->ops[0] = _r1;
                }
                res_op->resimplify (lseq, valueize);
                return true;
              }
next_after_fail763:;
> +(simplify
> + (bit_and
> +  (nop_convert@4 (SYNC_FETCH_AND_AND_N@3 @0 @1))
> +  INTEGER_CST@2)
> + (if (single_use(@4))
> +  (with { int ibit = wi::exact_log2 (wi::zext (wi::bit_not (wi::to_wide (@1)),
> +                                              TYPE_PRECISION(type)));
> +         int ibit2 = tree_log2 (@2); }
> +   (if (ibit >= 0 && ibit == ibit2)
> +   /* Make sure the second operand has the same type as @3,
> +      or else we will hit a gcc_assert.  */
> +    (convert:type
> +     (bit_and @3
> +             { build_int_cst (TREE_TYPE (@3), HOST_WIDE_INT_1U << ibit);}))))))
> +#endif
> +
>  /* (v ? w : 0) ? a : b is just (v & w) ? a : b
>     Currently disabled after pass lvec because ARM understands
>     VEC_COND_EXPR<v==w,-1,0> but not a plain v==w fed to BIT_IOR_EXPR.  */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-1a.c b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> new file mode 100644
> index 00000000000..2657a2f62ae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-1a.c
> @@ -0,0 +1,66 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +
> +#define FOO(TYPE,MASK)                                                 \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)                    \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;     \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)                  \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_fetch_and_or (a, mask) & mask;                       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_fetch_and_xor (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_xor_and_fetch (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1 << MASK;                                             \
> +    return __sync_fetch_and_and (a, ~mask) & mask;                     \
> +  }                                                                    \
> +
> +FOO(short, 0);
> +FOO(short, 7);
> +FOO(short, 15);
> +FOO(int, 0);
> +FOO(int, 15);
> +FOO(int, 31);
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 12 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 24 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 12 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr102566-2a.c b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
> new file mode 100644
> index 00000000000..24681c1da18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102566-2a.c
> @@ -0,0 +1,65 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2" } */
> +#include <stdatomic.h>
> +#include <stdbool.h>
> +typedef long long int64;
> +
> +#define FOO(TYPE,MASK)                                                 \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_or_##TYPE##_##MASK (_Atomic TYPE* a)                    \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_fetch_or (a, mask, __ATOMIC_RELAXED) & mask;       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_xor_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_fetch_xor (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_xor_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_xor_fetch (a, mask, __ATOMIC_RELAXED) & mask;      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  atomic_fetch_and_##TYPE##_##MASK (_Atomic TYPE* a)                   \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __atomic_fetch_and (a, ~mask, __ATOMIC_RELAXED) & mask;     \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_or_##TYPE##_##MASK (_Atomic TYPE* a)                  \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_fetch_and_or (a, mask) & mask;                       \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_xor_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_fetch_and_xor (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_xor_and_fetch_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_xor_and_fetch (a, mask) & mask;                      \
> +  }                                                                    \
> +  __attribute__((noinline,noclone)) TYPE                               \
> +  sync_fetch_and_and_##TYPE##_##MASK (_Atomic TYPE* a)                 \
> +  {                                                                    \
> +    TYPE mask = 1ll << MASK;                                           \
> +    return __sync_fetch_and_and (a, ~mask) & mask;                     \
> +  }                                                                    \
> +
> +
> +FOO(int64, 0);
> +FOO(int64, 32);
> +FOO(int64, 63);
> +
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*bts" 6 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btc" 12 } } */
> +/* { dg-final { scan-assembler-times "lock;?\[ \t\]*btr" 6 } } */
> +/* { dg-final { scan-assembler-not "cmpxchg" } } */
> --
> 2.18.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-10-25  9:00 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-04 13:53 [PATCH] Improve integer bit test on atomic builtin return H.J. Lu
2021-10-05 10:07 ` Richard Biener
2021-10-05 16:40   ` H.J. Lu
2021-10-05 23:54     ` [PATCH v2] Improve integer bit test on __atomic_fetch_[or|and]_* returns H.J. Lu
2021-10-08  7:16     ` [PATCH] Improve integer bit test on atomic builtin return Richard Biener
2021-10-08 14:55       ` H.J. Lu
2021-10-22  5:48         ` [PATCH] Canonicalize __atomic/sync_fetch_or/xor/and for constant mask liuhongt
2021-10-22 13:12           ` H.J. Lu
2021-10-25  5:59             ` liuhongt
2021-10-25  9:07               ` Hongtao Liu
