public inbox for gcc-patches@gcc.gnu.org
* [PATCH, AArch64 v2 08/11] aarch64: Implement -matomic-ool
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (6 preceding siblings ...)
  2018-10-02 16:19 ` [PATCH, AArch64 v2 03/11] aarch64: Improve swp generation Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-02 16:19 ` [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors Richard Henderson
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

	* config/aarch64/aarch64.opt (-matomic-ool): New.
	* config/aarch64/aarch64.c (aarch64_atomic_ool_func): New.
	(aarch64_ool_cas_names, aarch64_ool_swp_names): New.
	(aarch64_ool_ldadd_names, aarch64_ool_ldset_names): New.
	(aarch64_ool_ldclr_names, aarch64_ool_ldeor_names): New.
	(aarch64_ool_stadd_names, aarch64_ool_stset_names): New.
	(aarch64_ool_stclr_names, aarch64_ool_steor_names): New.
	(aarch64_expand_compare_and_swap): Honor TARGET_ATOMIC_OOL.
	* config/aarch64/atomics.md (atomic_exchange<ALLI>): Likewise.
	(atomic_<atomic_op><ALLI>): Likewise.
	(atomic_fetch_<atomic_op><ALLI>): Likewise.
	(atomic_<atomic_op>_fetch<ALLI>): Likewise.
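
As a quick illustration (not part of the patch; the helper name follows the
__aa64_<op><size>_<model> scheme defined here, and the exact expansion depends
on the rest of the series), a relaxed fetch-add compiled with -matomic-ool is
expected to become a call to a hidden out-of-line helper rather than an inline
LL/SC or LSE sequence:

    #include <stdatomic.h>

    int
    fetch_add_relaxed (atomic_int *p, int v)
    {
      /* Expected to expand to a call to __aa64_ldadd4_relax (v, p).  */
      return atomic_fetch_add_explicit (p, v, memory_order_relaxed);
    }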
---
 gcc/config/aarch64/aarch64-protos.h           | 17 ++++
 gcc/config/aarch64/aarch64.c                  | 95 +++++++++++++++++++
 .../atomic-comp-swap-release-acquire.c        |  2 +-
 .../gcc.target/aarch64/atomic-op-acq_rel.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-acquire.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-char.c       |  2 +-
 .../gcc.target/aarch64/atomic-op-consume.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-imm.c        |  2 +-
 .../gcc.target/aarch64/atomic-op-int.c        |  2 +-
 .../gcc.target/aarch64/atomic-op-long.c       |  2 +-
 .../gcc.target/aarch64/atomic-op-relaxed.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-release.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-seq_cst.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-short.c      |  2 +-
 .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |  2 +-
 .../atomic_cmp_exchange_zero_strong_1.c       |  2 +-
 .../gcc.target/aarch64/sync-comp-swap.c       |  2 +-
 .../gcc.target/aarch64/sync-op-acquire.c      |  2 +-
 .../gcc.target/aarch64/sync-op-full.c         |  2 +-
 gcc/config/aarch64/aarch64.opt                |  4 +
 gcc/config/aarch64/atomics.md                 | 94 ++++++++++++++++--
 gcc/doc/invoke.texi                           | 14 ++-
 22 files changed, 232 insertions(+), 26 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1d2f8487d1a..c7b96b12bbe 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -624,4 +624,21 @@ poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
 bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT);
 
+struct atomic_ool_names
+{
+    const char *str[4][4];
+};
+
+rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+			    const atomic_ool_names *names);
+extern const atomic_ool_names aarch64_ool_swp_names;
+extern const atomic_ool_names aarch64_ool_stadd_names;
+extern const atomic_ool_names aarch64_ool_stset_names;
+extern const atomic_ool_names aarch64_ool_stclr_names;
+extern const atomic_ool_names aarch64_ool_steor_names;
+extern const atomic_ool_names aarch64_ool_ldadd_names;
+extern const atomic_ool_names aarch64_ool_ldset_names;
+extern const atomic_ool_names aarch64_ool_ldclr_names;
+extern const atomic_ool_names aarch64_ool_ldeor_names;
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 867759f7e80..49b47382b5d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14160,6 +14160,90 @@ aarch64_emit_unlikely_jump (rtx insn)
   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }
 
+/* We store the names of the various atomic helpers in a 4x4 array.
+   Return the libcall function given MODE, MODEL and NAMES.  */
+
+rtx
+aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+			const atomic_ool_names *names)
+{
+  memmodel model = memmodel_base (INTVAL (model_rtx));
+  int mode_idx, model_idx;
+
+  switch (mode)
+    {
+    case E_QImode:
+      mode_idx = 0;
+      break;
+    case E_HImode:
+      mode_idx = 1;
+      break;
+    case E_SImode:
+      mode_idx = 2;
+      break;
+    case E_DImode:
+      mode_idx = 3;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  switch (model)
+    {
+    case MEMMODEL_RELAXED:
+      model_idx = 0;
+      break;
+    case MEMMODEL_CONSUME:
+    case MEMMODEL_ACQUIRE:
+      model_idx = 1;
+      break;
+    case MEMMODEL_RELEASE:
+      model_idx = 2;
+      break;
+    case MEMMODEL_ACQ_REL:
+    case MEMMODEL_SEQ_CST:
+      model_idx = 3;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
+				      VISIBILITY_HIDDEN);
+}
+
+#define DEF0(B, N) \
+  { "__aa64_" #B #N "_relax", \
+    "__aa64_" #B #N "_acq", \
+    "__aa64_" #B #N "_rel", \
+    "__aa64_" #B #N "_acq_rel" }
+
+#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8)
+
+static const atomic_ool_names aarch64_ool_cas_names = { { DEF4(cas) } };
+const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
+const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
+const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
+const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
+const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
+
+/* Note that the store forms are only available for relax and release
+   memory models.  For the other models, re-use the load forms.  */
+#undef DEF0
+#define DEF0(B, N) \
+  { "__aa64_st" #B #N "_relax", \
+    "__aa64_ld" #B #N "_acq", \
+    "__aa64_st" #B #N "_rel", \
+    "__aa64_ld" #B #N "_acq_rel" }
+
+const atomic_ool_names aarch64_ool_stadd_names = { { DEF4(add) } };
+const atomic_ool_names aarch64_ool_stset_names = { { DEF4(set) } };
+const atomic_ool_names aarch64_ool_stclr_names = { { DEF4(clr) } };
+const atomic_ool_names aarch64_ool_steor_names = { { DEF4(eor) } };
+
+#undef DEF0
+#undef DEF4
+
 /* Expand a compare and swap pattern.  */
 
 void
@@ -14206,6 +14290,17 @@ aarch64_expand_compare_and_swap (rtx operands[])
 						   newval, mod_s));
       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
     }
+  else if (TARGET_ATOMIC_OOL)
+    {
+      /* Oldval must satisfy compare afterward.  */
+      if (!aarch64_plus_operand (oldval, mode))
+	oldval = force_reg (mode, oldval);
+      rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
+      rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
+				      oldval, mode, newval, mode,
+				      XEXP (mem, 0), ptr_mode);
+      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
+    }
   else
     {
       /* The oldval predicate varies by mode.  Test it and force to reg.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c
index 49ca5d0d09c..e92f205c3a8 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-atomic-ool" } */
 
 #include "atomic-comp-swap-release-acquire.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c
index 74f26348e42..6965431f7d9 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-acq_rel.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c
index 66c1b1efe20..07dbca49d56 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-acquire.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c
index c09d0434ecf..73bfbb7afc9 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-char.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c
index 5783ab84f5c..c7945b3a22d 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-consume.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c
index 18b8f0b04e9..e46bb3de7c1 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 int v = 0;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c
index 8520f0839ba..9b55deb5225 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-int.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c
index d011f8c5ce2..2622f75331f 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 long v = 0;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c
index ed96bfdb978..f118a37a352 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-relaxed.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c
index fc4be17de89..579634b08e8 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-release.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c
index 613000fe490..016b0d6619f 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-seq_cst.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c
index e82c8118ece..978bd1d8377 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-short.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c
index f2a21ddf2e1..77430ecdbce 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=armv8-a+nolse" } */
+/* { dg-options "-O2 -march=armv8-a+nolse -mno-atomic-ool" } */
 /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */
 
 int
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
index 8d2ae67dfbe..7d58b2f6bd0 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=armv8-a+nolse" } */
+/* { dg-options "-O2 -march=armv8-a+nolse -mno-atomic-ool" } */
 /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */
 
 int
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c
index e571b2f13b3..7fc5885d0fd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c
+++ b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-atomic-ool" } */
 
 #include "sync-comp-swap.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c
index 357bf1be3b2..6ad0daa8998 100644
--- a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c
+++ b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "sync-op-acquire.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c
index c6ba1629965..9a7afeb70d3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c
+++ b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "sync-op-full.x"
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index b2e80cbf6f1..83166834165 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -218,3 +218,7 @@ Enables verbose cost model dumping in the debug dump files.
 mtrack-speculation
 Target Var(aarch64_track_speculation)
 Generate code to track when the CPU might be speculating incorrectly.
+
+matomic-ool
+Target Report Mask(ATOMIC_OOL) Save
+Generate local calls to out-of-line atomic operations.
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index ee662a4480e..72f9962fe55 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -140,16 +140,27 @@
   (match_operand:SI 3 "const_int_operand" "")]
   ""
   {
-    rtx (*gen) (rtx, rtx, rtx, rtx);
-
     /* Use an atomic SWP when available.  */
     if (TARGET_LSE)
-      gen = gen_aarch64_atomic_exchange<mode>_lse;
+      {
+	emit_insn (gen_aarch64_atomic_exchange<mode>_lse
+		   (operands[0], operands[1], operands[2], operands[3]));
+      }
+    else if (TARGET_ATOMIC_OOL)
+      {
+	machine_mode mode = <MODE>mode;
+	rtx func = aarch64_atomic_ool_func (mode, operands[3],
+					    &aarch64_ool_swp_names);
+	rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL,
+					    mode, operands[2], mode,
+					    XEXP (operands[1], 0), ptr_mode);
+        emit_move_insn (operands[0], rval);
+      }
     else
-      gen = gen_aarch64_atomic_exchange<mode>;
-
-    emit_insn (gen (operands[0], operands[1], operands[2], operands[3]));
-
+      {
+	emit_insn (gen_aarch64_atomic_exchange<mode>
+		   (operands[0], operands[1], operands[2], operands[3]));
+      }
     DONE;
   }
 )
@@ -234,6 +245,39 @@
 	  }
 	operands[1] = force_reg (<MODE>mode, operands[1]);
       }
+    else if (TARGET_ATOMIC_OOL)
+      {
+        const atomic_ool_names *names;
+	switch (<CODE>)
+	  {
+	  case MINUS:
+	    operands[1] = expand_simple_unop (<MODE>mode, NEG, operands[1],
+					      NULL, 1);
+	    /* fallthru */
+	  case PLUS:
+	    names = &aarch64_ool_stadd_names;
+	    break;
+	  case IOR:
+	    names = &aarch64_ool_stset_names;
+	    break;
+	  case XOR:
+	    names = &aarch64_ool_steor_names;
+	    break;
+	  case AND:
+	    operands[1] = expand_simple_unop (<MODE>mode, NOT, operands[1],
+					      NULL, 1);
+	    names = &aarch64_ool_stclr_names;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
+        machine_mode mode = <MODE>mode;
+	rtx func = aarch64_atomic_ool_func (mode, operands[2], names);
+	emit_library_call_value (func, NULL_RTX, LCT_NORMAL, mode,
+				 operands[1], mode,
+				 XEXP (operands[0], 0), ptr_mode);
+        DONE;
+      }
     else
       gen = gen_aarch64_atomic_<atomic_optab><mode>;
 
@@ -350,6 +394,40 @@
 	}
       operands[2] = force_reg (<MODE>mode, operands[2]);
     }
+  else if (TARGET_ATOMIC_OOL)
+    {
+      const atomic_ool_names *names;
+      switch (<CODE>)
+	{
+	case MINUS:
+	  operands[2] = expand_simple_unop (<MODE>mode, NEG, operands[2],
+					    NULL, 1);
+	  /* fallthru */
+	case PLUS:
+	  names = &aarch64_ool_ldadd_names;
+	  break;
+	case IOR:
+	  names = &aarch64_ool_ldset_names;
+	  break;
+	case XOR:
+	  names = &aarch64_ool_ldeor_names;
+	  break;
+	case AND:
+	  operands[2] = expand_simple_unop (<MODE>mode, NOT, operands[2],
+					    NULL, 1);
+	  names = &aarch64_ool_ldclr_names;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      machine_mode mode = <MODE>mode;
+      rtx func = aarch64_atomic_ool_func (mode, operands[3], names);
+      rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL, mode,
+					  operands[2], mode,
+					  XEXP (operands[1], 0), ptr_mode);
+      emit_move_insn (operands[0], rval);
+      DONE;
+    }
   else
     gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>;
 
@@ -439,7 +517,7 @@
 {
   /* Use an atomic load-operate instruction when possible.  In this case
      we will re-compute the result from the original mem value. */
-  if (TARGET_LSE)
+  if (TARGET_LSE || TARGET_ATOMIC_OOL)
     {
       rtx tmp = gen_reg_rtx (<MODE>mode);
       operands[2] = force_reg (<MODE>mode, operands[2]);
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5c95f674e0c..c947abea899 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -621,7 +621,7 @@ Objective-C and Objective-C++ Dialects}.
 -mpc-relative-literal-loads @gol
 -msign-return-address=@var{scope} @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}  @gol
--moverride=@var{string}  -mverbose-cost-dump -mtrack-speculation} 
+-moverride=@var{string}  -mverbose-cost-dump -mtrack-speculation -matomic-ool} 
 
 @emph{Adapteva Epiphany Options}
 @gccoptlist{-mhalf-reg-file  -mprefer-short-insn-regs @gol
@@ -15057,6 +15057,18 @@ be used by the compiler when expanding calls to
 @code{__builtin_speculation_safe_copy} to permit a more efficient code
 sequence to be generated.
 
+@item -matomic-ool
+@itemx -mno-atomic-ool
+Enable or disable calls to out-of-line helpers to implement atomic operations.
+These helpers will, at runtime, determine if ARMv8.1-Atomics instructions
+should be used; if not, they will use the load/store-exclusive instructions
+that are present in the base ARMv8.0 ISA.
+
+This option is only applicable when compiling for the base ARMv8.0
+instruction set.  If using a later revision, e.g. @option{-march=armv8.1-a}
+or @option{-march=armv8-a+lse}, the ARMv8.1-Atomics instructions will be
+used directly.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
2.17.1

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH, AArch64 v2 03/11] aarch64: Improve swp generation
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (5 preceding siblings ...)
  2018-10-02 16:19 ` [PATCH, AArch64 v2 09/11] aarch64: Force TImode values into even registers Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-30 20:50   ` James Greenhalgh
  2018-10-02 16:19 ` [PATCH, AArch64 v2 08/11] aarch64: Implement -matomic-ool Richard Henderson
                   ` (4 subsequent siblings)
  11 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

Allow zero as an input; fix constraints; avoid unnecessary split.
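
For illustration (not part of the patch): accepting zero as the input means an
exchange of the constant 0 can use the zero register directly, e.g. something
like "swp wzr, w0, [x1]" for the relaxed case (assumed encoding), instead of
first moving 0 into a scratch register:

    #include <stdatomic.h>

    int
    exchange_zero (atomic_int *p)
    {
      return atomic_exchange_explicit (p, 0, memory_order_relaxed);
    }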

	* config/aarch64/aarch64.c (aarch64_emit_atomic_swap): Remove.
	(aarch64_gen_atomic_ldop): Don't call it.
	* config/aarch64/atomics.md (atomic_exchange<ALLI>):
	Use aarch64_reg_or_zero.
	(aarch64_atomic_exchange<ALLI>): Likewise.
	(aarch64_atomic_exchange<ALLI>_lse): Remove split; remove & from
	operand 0; use aarch64_reg_or_zero for input; merge ...
	(@aarch64_atomic_swp<ALLI>): ... this and remove.
---
 gcc/config/aarch64/aarch64.c  | 13 ----------
 gcc/config/aarch64/atomics.md | 49 +++++++++++------------------------
 2 files changed, 15 insertions(+), 47 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0e2b85de1e3..f7b0af2589e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14403,15 +14403,6 @@ aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
   emit_insn (gen (dst, s2, shift_rtx, s1));
 }
 
-/* Emit an atomic swap.  */
-
-static void
-aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
-			  rtx mem, rtx model)
-{
-  emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
-}
-
 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
    location to store the data read from memory.  OUT_RESULT is the location to
    store the result of the operation.  MEM is the memory location to read and
@@ -14452,10 +14443,6 @@ aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
      a SET then emit a swap instruction and finish.  */
   switch (code)
     {
-    case SET:
-      aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
-      return;
-
     case MINUS:
       /* Negate the value and treat it as a PLUS.  */
       {
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index e44301b40c7..bc9e396dc96 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -136,7 +136,7 @@
 (define_expand "atomic_exchange<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "")
-  (match_operand:ALLI 2 "register_operand" "")
+  (match_operand:ALLI 2 "aarch64_reg_or_zero" "")
   (match_operand:SI 3 "const_int_operand" "")]
   ""
   {
@@ -156,10 +156,10 @@
 
 (define_insn_and_split "aarch64_atomic_exchange<mode>"
   [(set (match_operand:ALLI 0 "register_operand" "=&r")		;; output
-    (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+    (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))	;; memory
    (set (match_dup 1)
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 2 "register_operand" "r")	;; input
+      [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")	;; input
        (match_operand:SI 3 "const_int_operand" "")]		;; model
       UNSPECV_ATOMIC_EXCHG))
    (clobber (reg:CC CC_REGNUM))
@@ -175,22 +175,25 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_exchange<mode>_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
+(define_insn "aarch64_atomic_exchange<mode>_lse"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
     (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
    (set (match_dup 1)
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 2 "register_operand" "r")
+      [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")
        (match_operand:SI 3 "const_int_operand" "")]
       UNSPECV_ATOMIC_EXCHG))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-    aarch64_gen_atomic_ldop (SET, operands[0], NULL, operands[1],
-			     operands[2], operands[3]);
-    DONE;
+    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+    if (is_mm_relaxed (model))
+      return "swp<atomic_sfx>\t%<w>2, %<w>0, %1";
+    else if (is_mm_acquire (model) || is_mm_consume (model))
+      return "swpa<atomic_sfx>\t%<w>2, %<w>0, %1";
+    else if (is_mm_release (model))
+      return "swpl<atomic_sfx>\t%<w>2, %<w>0, %1";
+    else
+      return "swpal<atomic_sfx>\t%<w>2, %<w>0, %1";
   }
 )
 
@@ -585,28 +588,6 @@
 
 ;; ARMv8.1-A LSE instructions.
 
-;; Atomic swap with memory.
-(define_insn "@aarch64_atomic_swp<mode>"
- [(set (match_operand:ALLI 0 "register_operand" "+&r")
-   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
-  (set (match_dup 1)
-   (unspec_volatile:ALLI
-    [(match_operand:ALLI 2 "register_operand" "r")
-     (match_operand:SI 3 "const_int_operand" "")]
-    UNSPECV_ATOMIC_SWP))]
-  "TARGET_LSE && reload_completed"
-  {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-    if (is_mm_relaxed (model))
-      return "swp<atomic_sfx>\t%<w>2, %<w>0, %1";
-    else if (is_mm_acquire (model) || is_mm_consume (model))
-      return "swpa<atomic_sfx>\t%<w>2, %<w>0, %1";
-    else if (is_mm_release (model))
-      return "swpl<atomic_sfx>\t%<w>2, %<w>0, %1";
-    else
-      return "swpal<atomic_sfx>\t%<w>2, %<w>0, %1";
-  })
-
 ;; Atomic load-op: Load data, operate, store result, keep data.
 
 (define_insn "@aarch64_atomic_load<atomic_ldop><mode>"
-- 
2.17.1

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (7 preceding siblings ...)
  2018-10-02 16:19 ` [PATCH, AArch64 v2 08/11] aarch64: Implement -matomic-ool Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-30 21:46   ` James Greenhalgh
                     ` (2 more replies)
  2018-10-02 16:36 ` [PATCH, AArch64 v2 10/11] aarch64: Implement TImode compare-and-swap Richard Henderson
                   ` (2 subsequent siblings)
  11 siblings, 3 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

	* optabs-libfuncs.c (build_libfunc_function_visibility):
	New, split out from...
	(build_libfunc_function): ... here.
	(init_one_libfunc_visibility): New, split out from ...
	(init_one_libfunc): ... here.
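
A sketch of how the new entry point is meant to be used (hypothetical caller
inside GCC; the helper name shown is one defined later in this series):

    /* Get a SYMBOL_REF for the helper, forcing hidden visibility on the
       decl so the call binds within the current DSO.  */
    static rtx
    get_cas4_acq_libfunc (void)
    {
      return init_one_libfunc_visibility ("__aa64_cas4_acq",
                                          VISIBILITY_HIDDEN);
    }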
---
 gcc/optabs-libfuncs.h |  2 ++
 gcc/optabs-libfuncs.c | 26 ++++++++++++++++++++------
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/gcc/optabs-libfuncs.h b/gcc/optabs-libfuncs.h
index 0669ea1fdd7..cf39da36887 100644
--- a/gcc/optabs-libfuncs.h
+++ b/gcc/optabs-libfuncs.h
@@ -63,7 +63,9 @@ void gen_satfract_conv_libfunc (convert_optab, const char *,
 void gen_satfractuns_conv_libfunc (convert_optab, const char *,
 				   machine_mode, machine_mode);
 
+tree build_libfunc_function_visibility (const char *, symbol_visibility);
 tree build_libfunc_function (const char *);
+rtx init_one_libfunc_visibility (const char *, symbol_visibility);
 rtx init_one_libfunc (const char *);
 rtx set_user_assembler_libfunc (const char *, const char *);
 
diff --git a/gcc/optabs-libfuncs.c b/gcc/optabs-libfuncs.c
index bd0df8baa37..73a28e9ca7a 100644
--- a/gcc/optabs-libfuncs.c
+++ b/gcc/optabs-libfuncs.c
@@ -719,10 +719,10 @@ struct libfunc_decl_hasher : ggc_ptr_hash<tree_node>
 /* A table of previously-created libfuncs, hashed by name.  */
 static GTY (()) hash_table<libfunc_decl_hasher> *libfunc_decls;
 
-/* Build a decl for a libfunc named NAME.  */
+/* Build a decl for a libfunc named NAME with visibility VIS.  */
 
 tree
-build_libfunc_function (const char *name)
+build_libfunc_function_visibility (const char *name, symbol_visibility vis)
 {
   /* ??? We don't have any type information; pretend this is "int foo ()".  */
   tree decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
@@ -731,7 +731,7 @@ build_libfunc_function (const char *name)
   DECL_EXTERNAL (decl) = 1;
   TREE_PUBLIC (decl) = 1;
   DECL_ARTIFICIAL (decl) = 1;
-  DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT;
+  DECL_VISIBILITY (decl) = vis;
   DECL_VISIBILITY_SPECIFIED (decl) = 1;
   gcc_assert (DECL_ASSEMBLER_NAME (decl));
 
@@ -742,11 +742,19 @@ build_libfunc_function (const char *name)
   return decl;
 }
 
+/* Build a decl for a libfunc named NAME.  */
+
+tree
+build_libfunc_function (const char *name)
+{
+  return build_libfunc_function_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Return a libfunc for NAME, creating one if we don't already have one.
-   The returned rtx is a SYMBOL_REF.  */
+   The decl is given visibility VIS.  The returned rtx is a SYMBOL_REF.  */
 
 rtx
-init_one_libfunc (const char *name)
+init_one_libfunc_visibility (const char *name, symbol_visibility vis)
 {
   tree id, decl;
   hashval_t hash;
@@ -763,12 +771,18 @@ init_one_libfunc (const char *name)
     {
       /* Create a new decl, so that it can be passed to
 	 targetm.encode_section_info.  */
-      decl = build_libfunc_function (name);
+      decl = build_libfunc_function_visibility (name, vis);
       *slot = decl;
     }
   return XEXP (DECL_RTL (decl), 0);
 }
 
+rtx
+init_one_libfunc (const char *name)
+{
+  return init_one_libfunc_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Adjust the assembler name of libfunc NAME to ASMSPEC.  */
 
 rtx
-- 
2.17.1

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH, AArch64 v2 04/11] aarch64: Improve atomic-op lse generation
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (3 preceding siblings ...)
  2018-10-02 16:19 ` [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-30 21:40   ` James Greenhalgh
  2018-10-02 16:19 ` [PATCH, AArch64 v2 09/11] aarch64: Force TImode values into even registers Richard Henderson
                   ` (6 subsequent siblings)
  11 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

Fix constraints; avoid unnecessary split.  Drop the use of the atomic_op
iterator in favor of the ATOMIC_LDOP iterator; this is simpler and more
logical for ldclr aka bic.
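
For reference, a sketch (not from the patch) of the identities the expander
relies on: an atomic AND of VAL is an atomic bit-clear (ldclr) of ~VAL, and an
atomic SUB of VAL is an atomic ADD of -VAL, so the operand is complemented or
negated up front:

    unsigned int
    fetch_and (unsigned int *p, unsigned int val)
    {
      /* Expected to become roughly  mvn wT, wVAL ; ldclral wT, wOLD, [p]
         for the seq_cst case (assumed encoding).  */
      return __atomic_fetch_and (p, val, __ATOMIC_SEQ_CST);
    }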

	* config/aarch64/aarch64.c (aarch64_emit_bic): Remove.
	(aarch64_atomic_ldop_supported_p): Remove.
	(aarch64_gen_atomic_ldop): Remove.
	* config/aarch64/atomic.md (atomic_<atomic_optab><ALLI>):
	Fully expand LSE operations here.
	(atomic_fetch_<atomic_optab><ALLI>): Likewise.
	(atomic_<atomic_optab>_fetch<ALLI>): Likewise.
	(aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse): Drop atomic_op iterator
	and use ATOMIC_LDOP instead; use register_operand for the input;
	drop the split and emit insns directly.
	(aarch64_atomic_fetch_<ATOMIC_LDOP><ALLI>_lse): Likewise.
	(aarch64_atomic_<atomic_op>_fetch<ALLI>_lse): Remove.
	(@aarch64_atomic_load<ATOMIC_LDOP><ALLI>): Remove.
---
 gcc/config/aarch64/aarch64-protos.h |   2 -
 gcc/config/aarch64/aarch64.c        | 176 -------------------------
 gcc/config/aarch64/atomics.md       | 197 +++++++++++++++-------------
 gcc/config/aarch64/iterators.md     |   5 +-
 4 files changed, 108 insertions(+), 272 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3d045cf43be..1d2f8487d1a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -563,8 +563,6 @@ rtx aarch64_load_tp (rtx);
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
 
-bool aarch64_atomic_ldop_supported_p (enum rtx_code);
-void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
 void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f7b0af2589e..867759f7e80 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14226,32 +14226,6 @@ aarch64_expand_compare_and_swap (rtx operands[])
   emit_insn (gen_rtx_SET (bval, x));
 }
 
-/* Test whether the target supports using a atomic load-operate instruction.
-   CODE is the operation and AFTER is TRUE if the data in memory after the
-   operation should be returned and FALSE if the data before the operation
-   should be returned.  Returns FALSE if the operation isn't supported by the
-   architecture.  */
-
-bool
-aarch64_atomic_ldop_supported_p (enum rtx_code code)
-{
-  if (!TARGET_LSE)
-    return false;
-
-  switch (code)
-    {
-    case SET:
-    case AND:
-    case IOR:
-    case XOR:
-    case MINUS:
-    case PLUS:
-      return true;
-    default:
-      return false;
-    }
-}
-
 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
    sequence implementing an atomic operation.  */
 
@@ -14384,156 +14358,6 @@ aarch64_split_compare_and_swap (rtx operands[])
     aarch64_emit_post_barrier (model);
 }
 
-/* Emit a BIC instruction.  */
-
-static void
-aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
-{
-  rtx shift_rtx = GEN_INT (shift);
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-
-  switch (mode)
-    {
-    case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
-    case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
-    default:
-      gcc_unreachable ();
-    }
-
-  emit_insn (gen (dst, s2, shift_rtx, s1));
-}
-
-/* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
-   location to store the data read from memory.  OUT_RESULT is the location to
-   store the result of the operation.  MEM is the memory location to read and
-   modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
-   operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
-   be NULL.  */
-
-void
-aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
-			 rtx mem, rtx value, rtx model_rtx)
-{
-  machine_mode mode = GET_MODE (mem);
-  machine_mode wmode = (mode == DImode ? DImode : SImode);
-  const bool short_mode = (mode < SImode);
-  int ldop_code;
-  rtx src;
-  rtx x;
-
-  if (out_data)
-    out_data = gen_lowpart (mode, out_data);
-
-  if (out_result)
-    out_result = gen_lowpart (mode, out_result);
-
-  /* Make sure the value is in a register, putting it into a destination
-     register if it needs to be manipulated.  */
-  if (!register_operand (value, mode)
-      || code == AND || code == MINUS)
-    {
-      src = out_result ? out_result : out_data;
-      emit_move_insn (src, gen_lowpart (mode, value));
-    }
-  else
-    src = value;
-  gcc_assert (register_operand (src, mode));
-
-  /* Preprocess the data for the operation as necessary.  If the operation is
-     a SET then emit a swap instruction and finish.  */
-  switch (code)
-    {
-    case MINUS:
-      /* Negate the value and treat it as a PLUS.  */
-      {
-	rtx neg_src;
-
-	/* Resize the value if necessary.  */
-	if (short_mode)
-	  src = gen_lowpart (wmode, src);
-
-	neg_src = gen_rtx_NEG (wmode, src);
-	emit_insn (gen_rtx_SET (src, neg_src));
-
-	if (short_mode)
-	  src = gen_lowpart (mode, src);
-      }
-      /* Fall-through.  */
-    case PLUS:
-      ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
-      break;
-
-    case IOR:
-      ldop_code = UNSPECV_ATOMIC_LDOP_OR;
-      break;
-
-    case XOR:
-      ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
-      break;
-
-    case AND:
-      {
-	rtx not_src;
-
-	/* Resize the value if necessary.  */
-	if (short_mode)
-	  src = gen_lowpart (wmode, src);
-
-	not_src = gen_rtx_NOT (wmode, src);
-	emit_insn (gen_rtx_SET (src, not_src));
-
-	if (short_mode)
-	  src = gen_lowpart (mode, src);
-      }
-      ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
-      break;
-
-    default:
-      /* The operation can't be done with atomic instructions.  */
-      gcc_unreachable ();
-    }
-
-  emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
-				      out_data, mem, src, model_rtx));
-
-  /* If necessary, calculate the data in memory after the update by redoing the
-     operation from values in registers.  */
-  if (!out_result)
-    return;
-
-  if (short_mode)
-    {
-      src = gen_lowpart (wmode, src);
-      out_data = gen_lowpart (wmode, out_data);
-      out_result = gen_lowpart (wmode, out_result);
-    }
-
-  x = NULL_RTX;
-
-  switch (code)
-    {
-    case MINUS:
-    case PLUS:
-      x = gen_rtx_PLUS (wmode, out_data, src);
-      break;
-    case IOR:
-      x = gen_rtx_IOR (wmode, out_data, src);
-      break;
-    case XOR:
-      x = gen_rtx_XOR (wmode, out_data, src);
-      break;
-    case AND:
-      aarch64_emit_bic (wmode, out_result, out_data, src, 0);
-      return;
-    default:
-      gcc_unreachable ();
-    }
-
-  emit_set_insn (out_result, x);
-
-  return;
-}
-
 /* Split an atomic operation.  */
 
 void
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index bc9e396dc96..2198649b1be 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -207,13 +207,37 @@
     rtx (*gen) (rtx, rtx, rtx);
 
     /* Use an atomic load-operate instruction when possible.  */
-    if (aarch64_atomic_ldop_supported_p (<CODE>))
-      gen = gen_aarch64_atomic_<atomic_optab><mode>_lse;
+    if (TARGET_LSE)
+      {
+	switch (<CODE>)
+	  {
+	  case MINUS:
+	    operands[1] = expand_simple_unop (<MODE>mode, NEG, operands[1],
+					      NULL, 1);
+	    /* fallthru */
+	  case PLUS:
+	    gen = gen_aarch64_atomic_add<mode>_lse;
+	    break;
+	  case IOR:
+	    gen = gen_aarch64_atomic_ior<mode>_lse;
+	    break;
+	  case XOR:
+	    gen = gen_aarch64_atomic_xor<mode>_lse;
+	    break;
+	  case AND:
+	    operands[1] = expand_simple_unop (<MODE>mode, NOT, operands[1],
+					      NULL, 1);
+	    gen = gen_aarch64_atomic_bic<mode>_lse;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
+	operands[1] = force_reg (<MODE>mode, operands[1]);
+      }
     else
       gen = gen_aarch64_atomic_<atomic_optab><mode>;
 
     emit_insn (gen (operands[0], operands[1], operands[2]));
-
     DONE;
   }
 )
@@ -239,22 +263,25 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_<atomic_optab><mode>_lse"
+(define_insn "aarch64_atomic_<atomic_ldoptab><mode>_lse"
   [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
-    (unspec_volatile:ALLI
-      [(atomic_op:ALLI (match_dup 0)
-	(match_operand:ALLI 1 "<atomic_op_operand>" "r<const_atomic>"))
-       (match_operand:SI 2 "const_int_operand")]
-      UNSPECV_ATOMIC_OP))
+	(unspec_volatile:ALLI
+	  [(match_dup 0)
+	   (match_operand:ALLI 1 "register_operand" "r")
+	   (match_operand:SI 2 "const_int_operand")]
+      ATOMIC_LDOP))
    (clobber (match_scratch:ALLI 3 "=&r"))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-    aarch64_gen_atomic_ldop (<CODE>, operands[3], NULL, operands[0],
-			     operands[1], operands[2]);
-    DONE;
+   enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+   if (is_mm_relaxed (model))
+     return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0";
+   else if (is_mm_release (model))
+     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0";
+   else if (is_mm_acquire (model) || is_mm_consume (model))
+     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0";
+   else
+     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, %<w>3, %0";
   }
 )
 
@@ -280,7 +307,7 @@
   }
 )
 
-;; Load-operate-store, returning the updated memory data.
+;; Load-operate-store, returning the original memory data.
 
 (define_expand "atomic_fetch_<atomic_optab><mode>"
  [(match_operand:ALLI 0 "register_operand" "")
@@ -293,13 +320,37 @@
   rtx (*gen) (rtx, rtx, rtx, rtx);
 
   /* Use an atomic load-operate instruction when possible.  */
-  if (aarch64_atomic_ldop_supported_p (<CODE>))
-    gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>_lse;
+  if (TARGET_LSE)
+    {
+      switch (<CODE>)
+        {
+	case MINUS:
+	  operands[2] = expand_simple_unop (<MODE>mode, NEG, operands[2],
+					    NULL, 1);
+	  /* fallthru */
+	case PLUS:
+	  gen = gen_aarch64_atomic_fetch_add<mode>_lse;
+	  break;
+	case IOR:
+	  gen = gen_aarch64_atomic_fetch_ior<mode>_lse;
+	  break;
+	case XOR:
+	  gen = gen_aarch64_atomic_fetch_xor<mode>_lse;
+	  break;
+	case AND:
+	  operands[2] = expand_simple_unop (<MODE>mode, NOT, operands[2],
+					    NULL, 1);
+	  gen = gen_aarch64_atomic_fetch_bic<mode>_lse;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+    }
   else
     gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>;
 
   emit_insn (gen (operands[0], operands[1], operands[2], operands[3]));
-
   DONE;
 })
 
@@ -326,23 +377,26 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_fetch_<atomic_optab><mode>_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
-    (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
+(define_insn "aarch64_atomic_fetch_<atomic_ldoptab><mode>_lse"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
+	(match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
    (set (match_dup 1)
-    (unspec_volatile:ALLI
-      [(atomic_op:ALLI (match_dup 1)
-	(match_operand:ALLI 2 "<atomic_op_operand>" "r<const_atomic>"))
-       (match_operand:SI 3 "const_int_operand")]
-      UNSPECV_ATOMIC_LDOP))]
+	(unspec_volatile:ALLI
+	  [(match_dup 1)
+	   (match_operand:ALLI 2 "register_operand" "r")
+	   (match_operand:SI 3 "const_int_operand")]
+	  ATOMIC_LDOP))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-    aarch64_gen_atomic_ldop (<CODE>, operands[0], NULL, operands[1],
-			     operands[2], operands[3]);
-    DONE;
+   enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+   if (is_mm_relaxed (model))
+     return "ld<atomic_ldop><atomic_sfx>\t%<w>2, %<w>0, %1";
+   else if (is_mm_acquire (model) || is_mm_consume (model))
+     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>2, %<w>0, %1";
+   else if (is_mm_release (model))
+     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>2, %<w>0, %1";
+   else
+     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>2, %<w>0, %1";
   }
 )
 
@@ -370,7 +424,7 @@
   }
 )
 
-;; Load-operate-store, returning the original memory data.
+;; Load-operate-store, returning the updated memory data.
 
 (define_expand "atomic_<atomic_optab>_fetch<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
@@ -380,17 +434,23 @@
   (match_operand:SI 3 "const_int_operand")]
  ""
 {
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-  rtx value = operands[2];
-
-  /* Use an atomic load-operate instruction when possible.  */
-  if (aarch64_atomic_ldop_supported_p (<CODE>))
-    gen = gen_aarch64_atomic_<atomic_optab>_fetch<mode>_lse;
+  /* Use an atomic load-operate instruction when possible.  In this case
+     we will re-compute the result from the original mem value. */
+  if (TARGET_LSE)
+    {
+      rtx tmp = gen_reg_rtx (<MODE>mode);
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+      emit_insn (gen_atomic_fetch_<atomic_optab><mode>
+                 (tmp, operands[1], operands[2], operands[3]));
+      tmp = expand_simple_binop (<MODE>mode, <CODE>, tmp, operands[2],
+				 operands[0], 1, OPTAB_WIDEN);
+      emit_move_insn (operands[0], tmp);
+    }
   else
-    gen = gen_aarch64_atomic_<atomic_optab>_fetch<mode>;
-
-  emit_insn (gen (operands[0], operands[1], value, operands[3]));
-
+    {
+      emit_insn (gen_aarch64_atomic_<atomic_optab>_fetch<mode>
+                 (operands[0], operands[1], operands[2], operands[3]));
+    }
   DONE;
 })
 
@@ -417,29 +477,6 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_<atomic_optab>_fetch<mode>_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
-    (atomic_op:ALLI
-     (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")
-     (match_operand:ALLI 2 "<atomic_op_operand>" "r<const_atomic>")))
-   (set (match_dup 1)
-    (unspec_volatile:ALLI
-      [(match_dup 1)
-       (match_dup 2)
-       (match_operand:SI 3 "const_int_operand")]
-      UNSPECV_ATOMIC_LDOP))
-     (clobber (match_scratch:ALLI 4 "=&r"))]
-  "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-  {
-    aarch64_gen_atomic_ldop (<CODE>, operands[4], operands[0], operands[1],
-			     operands[2], operands[3]);
-    DONE;
-  }
-)
-
 (define_insn_and_split "atomic_nand_fetch<mode>"
   [(set (match_operand:ALLI 0 "register_operand" "=&r")
     (not:ALLI
@@ -585,29 +622,3 @@
       return "dmb\\tish";
   }
 )
-
-;; ARMv8.1-A LSE instructions.
-
-;; Atomic load-op: Load data, operate, store result, keep data.
-
-(define_insn "@aarch64_atomic_load<atomic_ldop><mode>"
- [(set (match_operand:ALLI 0 "register_operand" "=r")
-   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
-  (set (match_dup 1)
-   (unspec_volatile:ALLI
-    [(match_dup 1)
-     (match_operand:ALLI 2 "register_operand")
-     (match_operand:SI 3 "const_int_operand")]
-    ATOMIC_LDOP))]
- "TARGET_LSE && reload_completed"
- {
-   enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-   if (is_mm_relaxed (model))
-     return "ld<atomic_ldop><atomic_sfx>\t%<w>2, %<w>0, %1";
-   else if (is_mm_acquire (model) || is_mm_consume (model))
-     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>2, %<w>0, %1";
-   else if (is_mm_release (model))
-     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>2, %<w>0, %1";
-   else
-     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>2, %<w>0, %1";
- })
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a43956054e8..524e4e6929b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -503,7 +503,6 @@
     UNSPECV_ATOMIC_CAS		; Represent an atomic CAS.
     UNSPECV_ATOMIC_SWP		; Represent an atomic SWP.
     UNSPECV_ATOMIC_OP		; Represent an atomic operation.
-    UNSPECV_ATOMIC_LDOP		; Represent an atomic load-operation
     UNSPECV_ATOMIC_LDOP_OR	; Represent an atomic load-or
     UNSPECV_ATOMIC_LDOP_BIC	; Represent an atomic load-bic
     UNSPECV_ATOMIC_LDOP_XOR	; Represent an atomic load-xor
@@ -1591,6 +1590,10 @@
  [(UNSPECV_ATOMIC_LDOP_OR "set") (UNSPECV_ATOMIC_LDOP_BIC "clr")
   (UNSPECV_ATOMIC_LDOP_XOR "eor") (UNSPECV_ATOMIC_LDOP_PLUS "add")])
 
+(define_int_attr atomic_ldoptab
+ [(UNSPECV_ATOMIC_LDOP_OR "ior") (UNSPECV_ATOMIC_LDOP_BIC "bic")
+  (UNSPECV_ATOMIC_LDOP_XOR "xor") (UNSPECV_ATOMIC_LDOP_PLUS "add")])
+
 ;; -------------------------------------------------------------------
 ;; Int Iterators Attributes.
 ;; -------------------------------------------------------------------
-- 
2.17.1

^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH, AArch64 v2 07/11] aarch64: Add out-of-line functions for LSE atomics
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
  2018-10-02 16:19 ` [PATCH, AArch64 v2 11/11] Enable -matomic-ool by default Richard Henderson
  2018-10-02 16:19 ` [PATCH, AArch64 v2 02/11] aarch64: Improve cas generation Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-02 16:19 ` [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation Richard Henderson
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

This is the libgcc part of the interface -- providing the functions.
Rationale is provided at the top of libgcc/config/aarch64/lse.c.
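
For orientation, a sketch (not part of the patch): the helpers follow a
__aa64_<op><size>_<model> naming scheme, take the value operand(s) first and
the pointer last, and are given hidden visibility.  For the 4-byte variants
this amounts to declarations along these lines:

    unsigned int __aa64_ldadd4_relax (unsigned int val, unsigned int *ptr);
    unsigned int __aa64_cas4_acq (unsigned int cmp, unsigned int newval,
                                  unsigned int *ptr);
    void __aa64_stadd4_rel (unsigned int val, unsigned int *ptr);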

	* config/aarch64/lse.c: New file.
	* config/aarch64/t-lse: New file.
	* config.host: Add t-lse to all aarch64 tuples.
---
 libgcc/config/aarch64/lse.c | 260 ++++++++++++++++++++++++++++++++++++
 libgcc/config.host          |   4 +
 libgcc/config/aarch64/t-lse |  44 ++++++
 3 files changed, 308 insertions(+)
 create mode 100644 libgcc/config/aarch64/lse.c
 create mode 100644 libgcc/config/aarch64/t-lse

diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
new file mode 100644
index 00000000000..68ca7df667b
--- /dev/null
+++ b/libgcc/config/aarch64/lse.c
@@ -0,0 +1,260 @@
+/* Out-of-line LSE atomics for AArch64 architecture.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   Contributed by Linaro Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/*
+ * The problem that we are trying to solve is operating system deployment
+ * of ARMv8.1-Atomics, also known as Large System Extensions (LSE).
+ *
+ * There are a number of potential solutions for this problem which have
+ * been proposed and rejected for various reasons.  To recap:
+ *
+ * (1) Multiple builds.  The dynamic linker will examine /lib64/atomics/
+ * if HWCAP_ATOMICS is set, allowing entire libraries to be overwritten.
+ * However, not all Linux distributions are happy with multiple builds,
+ * and anyway it has no effect on main applications.
+ *
+ * (2) IFUNC.  We could put these functions into libgcc_s.so, and have
+ * a single copy of each function for all DSOs.  However, ARM is concerned
+ * that the branch-to-indirect-branch that is implied by using a PLT,
+ * as required by IFUNC, is too much overhead for smaller cpus.
+ *
+ * (3) Statically predicted direct branches.  This is the approach that
+ * is taken here.  These functions are linked into every DSO that uses them.
+ * All of the symbols are hidden, so that the functions are called via a
+ * direct branch.  The choice of LSE vs non-LSE is done via one byte load
+ * followed by a well-predicted direct branch.  The functions are compiled
+ * separately to minimize code size.
+ */
+
+/* Define or declare the symbol gating the LSE implementations.  */
+#ifndef L_have_atomics
+extern
+#endif
+_Bool __aa64_have_atomics __attribute__((visibility("hidden"), nocommon));
+
+/* The branch controlled by this test should be easily predicted, in that
+   it will, after constructors, always branch the same way.  The expectation
+   is that systems that implement ARMv8.1-Atomics are "beefier" than those
+   that omit the extension.  By arranging for the fall-through path to use
+   load-store-exclusive insns, we aid the branch predictor of the
+   smallest cpus.  */
+#define have_atomics  __builtin_expect (__aa64_have_atomics, 0)
+
+#ifdef L_have_atomics
+/* Disable initialization of __aa64_have_atomics during bootstrap.  */
+# ifndef inhibit_libc
+#  include <sys/auxv.h>
+/* Disable initialization if the system headers are too old.  */
+#  if defined(AT_HWCAP) && defined(HWCAP_ATOMICS)
+static void __attribute__((constructor))
+init_have_atomics (void)
+{
+  unsigned long hwcap = getauxval (AT_HWCAP);
+  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
+}
+#  endif /* HWCAP */
+# endif /* inhibit_libc */
+#else
+
+/* Tell the assembler to accept LSE instructions.  */
+asm(".arch armv8-a+lse");
+
+/* Turn size and memory model defines into mnemonic fragments.  */
+#if SIZE == 1
+# define S     "b"
+# define MASK  ", uxtb"
+#elif SIZE == 2
+# define S     "h"
+# define MASK  ", uxth"
+#elif SIZE == 4 || SIZE == 8
+# define S     ""
+# define MASK  ""
+#else
+# error
+#endif
+
+#if SIZE < 8
+# define T  unsigned int
+# define W  "w"
+#else
+# define T  unsigned long long
+# define W  ""
+#endif
+
+#if MODEL == 1
+# define SUFF  _relax
+# define A     ""
+# define L     ""
+#elif MODEL == 2
+# define SUFF  _acq
+# define A     "a"
+# define L     ""
+#elif MODEL == 3
+# define SUFF  _rel
+# define A     ""
+# define L     "l"
+#elif MODEL == 4
+# define SUFF  _acq_rel
+# define A     "a"
+# define L     "l"
+#else
+# error
+#endif
+
+#define NAME2(B, S, X)  __aa64_ ## B ## S ## X
+#define NAME1(B, S, X)  NAME2(B, S, X)
+#define NAME(BASE)	NAME1(BASE, SIZE, SUFF)
+
+#define str1(S)  #S
+#define str(S)   str1(S)
+
+#ifdef L_cas
+T NAME(cas) (T cmp, T new, T *ptr) __attribute__((visibility("hidden")));
+T NAME(cas) (T cmp, T new, T *ptr)
+{
+  T old;
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__("cas" A L S " %"W"0, %"W"2, %1"
+            : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+  else
+    __asm__(
+	"0: "
+	"ld" A "xr"S" %"W"0, %1\n\t"
+	"cmp %"W"0, %"W"4" MASK "\n\t"
+	"bne 1f\n\t"
+	"st" L "xr"S" %w2, %"W"3, %1\n\t"
+	"cbnz %w2, 0b\n"
+	"1:"
+	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
+
+  return old;
+}
+#endif
+
+#ifdef L_swp
+T NAME(swp) (T new, T *ptr) __attribute__((visibility("hidden")));
+T NAME(swp) (T new, T *ptr)
+{
+  T old;
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__("swp" A L S " %"W"2, %"W"0, %1"
+            : "=r"(old), "+m"(*ptr) : "r"(new));
+  else
+    __asm__(
+	"0: "
+	"ld" A "xr"S" %"W"0, %1\n\t"
+	"st" L "xr"S" %w2, %"W"3, %1\n\t"
+	"cbnz %w2, 0b\n"
+	"1:"
+	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new));
+
+  return old;
+}
+#endif
+
+#if defined(L_ldadd) || defined(L_ldclr) \
+    || defined(L_ldeor) || defined(L_ldset)
+
+#ifdef L_ldadd
+#define LDOP  ldadd
+#define OP    add
+#elif defined(L_ldclr)
+#define LDOP  ldclr
+#define OP    bic
+#elif defined(L_ldeor)
+#define LDOP  ldeor
+#define OP    eor
+#elif defined(L_ldset)
+#define LDOP  ldset
+#define OP    orr
+#else
+#error
+#endif
+
+T NAME(LDOP) (T val, T *ptr) __attribute__((visibility("hidden")));
+T NAME(LDOP) (T val, T *ptr)
+{
+  T old;
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__(str(LDOP) A L S " %"W"2, %"W"0, %1"
+            : "=r"(old), "+m"(*ptr) : "r"(val));
+  else
+    __asm__(
+	"0: "
+	"ld" A "xr"S" %"W"0, %1\n\t"
+	str(OP) " %"W"2, %"W"0, %"W"3\n\t"
+	"st" L "xr"S" %w2, %"W"2, %1\n\t"
+	"cbnz %w2, 0b\n"
+	"1:"
+	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(val));
+
+  return old;
+}
+#endif
+
+#if defined(L_stadd) || defined(L_stclr) \
+    || defined(L_steor) || defined(L_stset)
+
+#ifdef L_stadd
+#define STOP  stadd
+#define OP    add
+#elif defined(L_stclr)
+#define STOP  stclr
+#define OP    bic
+#elif defined(L_steor)
+#define STOP  steor
+#define OP    eor
+#elif defined(L_stset)
+#define STOP  stset
+#define OP    orr
+#else
+#error
+#endif
+
+void NAME(STOP) (T val, T *ptr) __attribute__((visibility("hidden")));
+void NAME(STOP) (T val, T *ptr)
+{
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__(str(STOP) L S " %"W"1, %0" : "+m"(*ptr) : "r"(val));
+  else
+    __asm__(
+	"0: "
+	"ldxr"S" %"W"1, %0\n\t"
+	str(OP) " %"W"1, %"W"1, %"W"2\n\t"
+	"st" L "xr"S" %w1, %"W"1, %0\n\t"
+	"cbnz %w1, 0b\n"
+	"1:"
+	: "+m"(*ptr), "=&r"(tmp) : "r"(val));
+}
+#endif
+#endif /* L_have_atomics */
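
(For orientation -- not part of the patch -- here is roughly what one
specialization of lse.c expands to when compiled with -DL_swp -DSIZE=4
-DMODEL=2.  The name and mnemonics follow from the NAME/A/L/S/W macros
above, so treat this as an illustrative sketch rather than generated
output:)

  unsigned int __aa64_swp4_acq (unsigned int new, unsigned int *ptr)
      __attribute__((visibility("hidden")));
  unsigned int __aa64_swp4_acq (unsigned int new, unsigned int *ptr)
  {
    unsigned int old;
    unsigned tmp;

    if (have_atomics)
      /* LSE path: acquire swap.  */
      __asm__("swpa %w2, %w0, %1" : "=r"(old), "+m"(*ptr) : "r"(new));
    else
      /* Fallback: load-acquire-exclusive / store-exclusive loop.  */
      __asm__("0: ldaxr %w0, %1\n\t"
              "stxr %w2, %w3, %1\n\t"
              "cbnz %w2, 0b\n"
              "1:"
              : "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new));
    return old;
  }
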
diff --git a/libgcc/config.host b/libgcc/config.host
index 029f6569caf..7e9a8b6bc8f 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -340,23 +340,27 @@ aarch64*-*-elf | aarch64*-*-rtems*)
 	extra_parts="$extra_parts crtbegin.o crtend.o crti.o crtn.o"
 	extra_parts="$extra_parts crtfastmath.o"
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
 	md_unwind_header=aarch64/aarch64-unwind.h
 	;;
 aarch64*-*-freebsd*)
 	extra_parts="$extra_parts crtfastmath.o"
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
 	md_unwind_header=aarch64/freebsd-unwind.h
 	;;
 aarch64*-*-fuchsia*)
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp"
 	;;
 aarch64*-*-linux*)
 	extra_parts="$extra_parts crtfastmath.o"
 	md_unwind_header=aarch64/linux-unwind.h
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
 	;;
 alpha*-*-linux*)
diff --git a/libgcc/config/aarch64/t-lse b/libgcc/config/aarch64/t-lse
new file mode 100644
index 00000000000..e862b0c2448
--- /dev/null
+++ b/libgcc/config/aarch64/t-lse
@@ -0,0 +1,44 @@
+# Out-of-line LSE atomics for AArch64 architecture.
+# Copyright (C) 2018 Free Software Foundation, Inc.
+# Contributed by Linaro Ltd.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# CAS, Swap, Load-and-operate have 4 sizes and 4 memory models
+S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), cas swp ldadd ldclr ldeor ldset))
+O1 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S1)))
+
+# Store-and-operate has 4 sizes but only 2 memory models (relaxed, release).
+S2 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), stadd stclr steor stset))
+O2 := $(foreach m, 1 3, $(addsuffix _$(m)$(objext), $(S2)))
+
+LSE_OBJS := $(O1) $(O2)
+
+libgcc-objects += $(LSE_OBJS) have_atomic$(objext)
+
+empty      =
+space      = $(empty) $(empty)
+PAT_SPLIT  = $(subst _,$(space),$(*F))
+PAT_BASE   = $(word 1,$(PAT_SPLIT))
+PAT_N      = $(word 2,$(PAT_SPLIT))
+PAT_M      = $(word 3,$(PAT_SPLIT))
+
+have_atomic$(objext): $(srcdir)/config/aarch64/lse.c
+	$(gcc_compile) -DL_have_atomics -c $<
+
+$(LSE_OBJS): $(srcdir)/config/aarch64/lse.c
+	$(gcc_compile) -DL_$(PAT_BASE) -DSIZE=$(PAT_N) -DMODEL=$(PAT_M) -c $<
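
(As a concrete illustration of the pattern rule above, not part of the
patch: the object cas_4_2.o is built with -DL_cas -DSIZE=4 -DMODEL=2
and therefore provides the hidden function __aa64_cas4_acq.)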
-- 
2.17.1


* [PATCH, AArch64 v2 11/11] Enable -matomic-ool by default
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-02 16:19 ` [PATCH, AArch64 v2 02/11] aarch64: Improve cas generation Richard Henderson
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

Do Not Merge Upstream.
This is for agraf and his testing within SLES.
---
 gcc/common/config/aarch64/aarch64-common.c | 6 ++++--
 gcc/config/aarch64/aarch64.c               | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
index 292fb818705..3bd1312a3f8 100644
--- a/gcc/common/config/aarch64/aarch64-common.c
+++ b/gcc/common/config/aarch64/aarch64-common.c
@@ -31,9 +31,11 @@
 #include "flags.h"
 #include "diagnostic.h"
 
-#ifdef  TARGET_BIG_ENDIAN_DEFAULT
 #undef  TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
+#ifdef  TARGET_BIG_ENDIAN_DEFAULT
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END | MASK_ATOMIC_OOL)
+#else
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_ATOMIC_OOL)
 #endif
 
 #undef  TARGET_HANDLE_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ac2f055a09e..b20d8bbf19b 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -17617,9 +17617,11 @@ aarch64_run_selftests (void)
 #undef TARGET_C_MODE_FOR_SUFFIX
 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
 
-#ifdef TARGET_BIG_ENDIAN_DEFAULT
 #undef  TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
+#ifdef  TARGET_BIG_ENDIAN_DEFAULT
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END | MASK_ATOMIC_OOL)
+#else
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_ATOMIC_OOL)
 #endif
 
 #undef TARGET_CLASS_MAX_NREGS
-- 
2.17.1


* [PATCH, AArch64 v2 00/11] LSE atomics out-of-line
@ 2018-10-02 16:19 Richard Henderson
  2018-10-02 16:19 ` [PATCH, AArch64 v2 11/11] Enable -matomic-ool by default Richard Henderson
                   ` (11 more replies)
  0 siblings, 12 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

Changes since v1:
  * Use config/t-slibgcc-libgcc instead of gcc.c changes.
  * Some style fixes.
  * Ifdefs to work with old glibc.

  * Force TImode registers into even regnos.
    Required by CASP, allowed by the ABI, and seen as a simpler
    solution than adding two new register classes.

  * Use match_dup instead of matching constraints for CAS{P}.
    Matching constraints result in lots of extraneous moves
    for TImode, and keeping the expander interface the same
    for non-TImode simplifies the code.


r~


Richard Henderson (11):
  aarch64: Simplify LSE cas generation
  aarch64: Improve cas generation
  aarch64: Improve swp generation
  aarch64: Improve atomic-op lse generation
  aarch64: Emit LSE st<op> instructions
  Add visibility to libfunc constructors
  aarch64: Add out-of-line functions for LSE atomics
  aarch64: Implement -matomic-ool
  aarch64: Force TImode values into even registers
  aarch64: Implement TImode compare-and-swap
  Enable -matomic-ool by default

 gcc/config/aarch64/aarch64-protos.h           |  20 +-
 gcc/optabs-libfuncs.h                         |   2 +
 gcc/common/config/aarch64/aarch64-common.c    |   6 +-
 gcc/config/aarch64/aarch64.c                  | 494 +++++++-------
 gcc/optabs-libfuncs.c                         |  26 +-
 .../atomic-comp-swap-release-acquire.c        |   2 +-
 .../gcc.target/aarch64/atomic-inst-ldadd.c    |  18 +-
 .../gcc.target/aarch64/atomic-inst-ldlogic.c  |  54 +-
 .../gcc.target/aarch64/atomic-op-acq_rel.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-acquire.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-char.c       |   2 +-
 .../gcc.target/aarch64/atomic-op-consume.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-imm.c        |   2 +-
 .../gcc.target/aarch64/atomic-op-int.c        |   2 +-
 .../gcc.target/aarch64/atomic-op-long.c       |   2 +-
 .../gcc.target/aarch64/atomic-op-relaxed.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-release.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-seq_cst.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-short.c      |   2 +-
 .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |   2 +-
 .../atomic_cmp_exchange_zero_strong_1.c       |   2 +-
 .../gcc.target/aarch64/sync-comp-swap.c       |   2 +-
 .../gcc.target/aarch64/sync-op-acquire.c      |   2 +-
 .../gcc.target/aarch64/sync-op-full.c         |   2 +-
 libgcc/config/aarch64/lse.c                   | 282 ++++++++
 gcc/config/aarch64/aarch64.opt                |   4 +
 gcc/config/aarch64/atomics.md                 | 608 ++++++++++--------
 gcc/config/aarch64/iterators.md               |   8 +-
 gcc/config/aarch64/predicates.md              |  12 +
 gcc/doc/invoke.texi                           |  14 +-
 libgcc/config.host                            |   4 +
 libgcc/config/aarch64/t-lse                   |  48 ++
 32 files changed, 1058 insertions(+), 576 deletions(-)
 create mode 100644 libgcc/config/aarch64/lse.c
 create mode 100644 libgcc/config/aarch64/t-lse

-- 
2.17.1


* [PATCH, AArch64 v2 02/11] aarch64: Improve cas generation
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
  2018-10-02 16:19 ` [PATCH, AArch64 v2 11/11] Enable -matomic-ool by default Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-30 20:37   ` James Greenhalgh
  2018-10-02 16:19 ` [PATCH, AArch64 v2 07/11] aarch64: Add out-of-line functions for LSE atomics Richard Henderson
                   ` (9 subsequent siblings)
  11 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

Do not zero-extend the input to the cas for subword operations;
instead, use the appropriate zero-extending compare insns.
Correct the predicates and constraints for the immediate expected operand.
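
A rough illustration of the sort of source this affects, not taken from
the patch (the function name is invented for the example):

  #include <stdatomic.h>

  _Bool
  cas_short (_Atomic unsigned short *p, unsigned short expected,
             unsigned short desired)
  {
    /* With this change the expander no longer zero-extends `expected'
       up front; the loaded value is instead compared against it with a
       zero-extending compare (or a suitably masked immediate).  */
    return atomic_compare_exchange_strong (p, &expected, desired);
  }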

	* config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze): New.
	(aarch64_split_compare_and_swap): Use it.
	(aarch64_expand_compare_and_swap): Likewise.  Remove convert_modes;
	test oldval against the proper predicate.
	* config/aarch64/atomics.md (@atomic_compare_and_swap<ALLI>):
	Use nonmemory_operand for expected.
	(cas_short_expected_pred): New.
	(@aarch64_compare_and_swap<SHORT>): Use it; use "rn" not "rI" to match.
	(@aarch64_compare_and_swap<GPI>): Use "rn" not "rI" for expected.
	* config/aarch64/predicates.md (aarch64_plushi_immediate): New.
	(aarch64_plushi_operand): New.
---
 gcc/config/aarch64/aarch64.c     | 90 +++++++++++++++++++-------------
 gcc/config/aarch64/atomics.md    | 19 ++++---
 gcc/config/aarch64/predicates.md | 12 +++++
 3 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index fbec54fe5da..0e2b85de1e3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1613,6 +1613,33 @@ aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
   return cc_reg;
 }
 
+/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
+
+static rtx
+aarch64_gen_compare_reg_maybe_ze(RTX_CODE code, rtx x, rtx y,
+                                 machine_mode y_mode)
+{
+  if (y_mode == E_QImode || y_mode == E_HImode)
+    {
+      if (CONST_INT_P (y))
+	y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+      else
+	{
+	  rtx t, cc_reg;
+	  machine_mode cc_mode;
+
+	  t = gen_rtx_ZERO_EXTEND (SImode, y);
+	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
+	  cc_mode = CC_SWPmode;
+	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+	  emit_set_insn (cc_reg, t);
+	  return cc_reg;
+	}
+    }
+
+  return aarch64_gen_compare_reg (code, x, y);
+}
+
 /* Build the SYMBOL_REF for __tls_get_addr.  */
 
 static GTY(()) rtx tls_get_addr_libfunc;
@@ -14138,8 +14165,8 @@ aarch64_emit_unlikely_jump (rtx insn)
 void
 aarch64_expand_compare_and_swap (rtx operands[])
 {
-  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
-  machine_mode mode, cmp_mode;
+  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
+  machine_mode mode, r_mode;
 
   bval = operands[0];
   rval = operands[1];
@@ -14150,36 +14177,19 @@ aarch64_expand_compare_and_swap (rtx operands[])
   mod_s = operands[6];
   mod_f = operands[7];
   mode = GET_MODE (mem);
-  cmp_mode = mode;
 
   /* Normally the succ memory model must be stronger than fail, but in the
      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
-
   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
 
-  switch (mode)
+  r_mode = mode;
+  if (mode == QImode || mode == HImode)
     {
-    case E_QImode:
-    case E_HImode:
-      /* For short modes, we're going to perform the comparison in SImode,
-	 so do the zero-extension now.  */
-      cmp_mode = SImode;
-      rval = gen_reg_rtx (SImode);
-      oldval = convert_modes (SImode, mode, oldval, true);
-      /* Fall through.  */
-
-    case E_SImode:
-    case E_DImode:
-      /* Force the value into a register if needed.  */
-      if (!aarch64_plus_operand (oldval, mode))
-	oldval = force_reg (cmp_mode, oldval);
-      break;
-
-    default:
-      gcc_unreachable ();
+      r_mode = SImode;
+      rval = gen_reg_rtx (r_mode);
     }
 
   if (TARGET_LSE)
@@ -14187,26 +14197,32 @@ aarch64_expand_compare_and_swap (rtx operands[])
       /* The CAS insn requires oldval and rval overlap, but we need to
 	 have a copy of oldval saved across the operation to tell if
 	 the operation is successful.  */
-      if (mode == QImode || mode == HImode)
-	rval = copy_to_mode_reg (SImode, gen_lowpart (SImode, oldval));
-      else if (reg_overlap_mentioned_p (rval, oldval))
-        rval = copy_to_mode_reg (mode, oldval);
-      else
-	emit_move_insn (rval, oldval);
+      if (reg_overlap_mentioned_p (rval, oldval))
+        rval = copy_to_mode_reg (r_mode, oldval);
+      else 
+	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
+
       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
 						   newval, mod_s));
-      aarch64_gen_compare_reg (EQ, rval, oldval);
+      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
     }
   else
-    emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
-					     is_weak, mod_s, mod_f));
+    {
+      /* The oldval predicate varies by mode.  Test it and force to reg.  */
+      insn_code code = code_for_aarch64_compare_and_swap (mode);
+      if (!insn_data[code].operand[2].predicate (oldval, mode))
+	oldval = force_reg (mode, oldval);
 
-  if (mode == QImode || mode == HImode)
+      emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
+				 is_weak, mod_s, mod_f));
+      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+    }
+
+  if (r_mode != mode)
     rval = gen_lowpart (mode, rval);
   emit_move_insn (operands[1], rval);
 
-  x = gen_rtx_REG (CCmode, CC_REGNUM);
-  x = gen_rtx_EQ (SImode, x, const0_rtx);
+  x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
   emit_insn (gen_rtx_SET (bval, x));
 }
 
@@ -14321,10 +14337,10 @@ aarch64_split_compare_and_swap (rtx operands[])
     }
   else
     {
-      cond = aarch64_gen_compare_reg (NE, rval, oldval);
+      cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-				 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
     }
 
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 22660850af1..e44301b40c7 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -24,8 +24,8 @@
   [(match_operand:SI 0 "register_operand" "")			;; bool out
    (match_operand:ALLI 1 "register_operand" "")			;; val out
    (match_operand:ALLI 2 "aarch64_sync_memory_operand" "")	;; memory
-   (match_operand:ALLI 3 "general_operand" "")			;; expected
-   (match_operand:ALLI 4 "aarch64_reg_or_zero" "")			;; desired
+   (match_operand:ALLI 3 "nonmemory_operand" "")		;; expected
+   (match_operand:ALLI 4 "aarch64_reg_or_zero" "")		;; desired
    (match_operand:SI 5 "const_int_operand")			;; is_weak
    (match_operand:SI 6 "const_int_operand")			;; mod_s
    (match_operand:SI 7 "const_int_operand")]			;; mod_f
@@ -36,19 +36,22 @@
   }
 )
 
+(define_mode_attr cas_short_expected_pred
+  [(QI "aarch64_reg_or_imm") (HI "aarch64_plushi_operand")])
+
 (define_insn_and_split "@aarch64_compare_and_swap<mode>"
   [(set (reg:CC CC_REGNUM)					;; bool out
     (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
-   (set (match_operand:SI 0 "register_operand" "=&r")	   ;; val out
+   (set (match_operand:SI 0 "register_operand" "=&r")		;; val out
     (zero_extend:SI
       (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
    (set (match_dup 1)
     (unspec_volatile:SHORT
-      [(match_operand:SI 2 "aarch64_plus_operand" "rI")	;; expected
+      [(match_operand:SHORT 2 "<cas_short_expected_pred>" "rn")	;; expected
        (match_operand:SHORT 3 "aarch64_reg_or_zero" "rZ")	;; desired
-       (match_operand:SI 4 "const_int_operand")		;; is_weak
-       (match_operand:SI 5 "const_int_operand")		;; mod_s
-       (match_operand:SI 6 "const_int_operand")]	;; mod_f
+       (match_operand:SI 4 "const_int_operand")			;; is_weak
+       (match_operand:SI 5 "const_int_operand")			;; mod_s
+       (match_operand:SI 6 "const_int_operand")]		;; mod_f
       UNSPECV_ATOMIC_CMPSW))
    (clobber (match_scratch:SI 7 "=&r"))]
   ""
@@ -68,7 +71,7 @@
     (match_operand:GPI 1 "aarch64_sync_memory_operand" "+Q"))   ;; memory
    (set (match_dup 1)
     (unspec_volatile:GPI
-      [(match_operand:GPI 2 "aarch64_plus_operand" "rI")	;; expect
+      [(match_operand:GPI 2 "aarch64_plus_operand" "rn")	;; expect
        (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")		;; desired
        (match_operand:SI 4 "const_int_operand")			;; is_weak
        (match_operand:SI 5 "const_int_operand")			;; mod_s
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 5b08b03c586..4c75eff3e5a 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -114,6 +114,18 @@
   (ior (match_operand 0 "register_operand")
        (match_operand 0 "aarch64_plus_immediate")))
 
+(define_predicate "aarch64_plushi_immediate"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT val = INTVAL (op);
+  /* The HImode value must be zero-extendable to an SImode plus_operand.  */
+  return ((val & 0xfff) == val || sext_hwi (val & 0xf000, 16) == val);
+})
+
+(define_predicate "aarch64_plushi_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "aarch64_plushi_immediate")))
+
 (define_predicate "aarch64_pluslong_immediate"
   (and (match_code "const_int")
        (match_test "(INTVAL (op) < 0xffffff && INTVAL (op) > -0xffffff)")))
-- 
2.17.1


* [PATCH, AArch64 v2 09/11] aarch64: Force TImode values into even registers
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (4 preceding siblings ...)
  2018-10-02 16:19 ` [PATCH, AArch64 v2 04/11] aarch64: Improve atomic-op lse generation Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-30 21:47   ` James Greenhalgh
  2018-10-02 16:19 ` [PATCH, AArch64 v2 03/11] aarch64: Improve swp generation Richard Henderson
                   ` (5 subsequent siblings)
  11 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

The LSE CASP instruction requires values to be placed in even
register pairs.  A solution involving two additional register
classes was rejected in favor of the much simpler solution of
requiring all TImode values held in general registers to start
at an even register number.

	* config/aarch64/aarch64.c (aarch64_hard_regno_mode_ok): Force
	16-byte modes held in GP registers to use an even regno.
---
 gcc/config/aarch64/aarch64.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 49b47382b5d..ce4d7e51d00 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1451,10 +1451,14 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
     return mode == Pmode;
 
-  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
-    return true;
-
-  if (FP_REGNUM_P (regno))
+  if (GP_REGNUM_P (regno))
+    {
+      if (known_le (GET_MODE_SIZE (mode), 8))
+	return true;
+      else if (known_le (GET_MODE_SIZE (mode), 16))
+	return (regno & 1) == 0;
+    }
+  else if (FP_REGNUM_P (regno))
     {
       if (vec_flags & VEC_STRUCT)
 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
-- 
2.17.1


* [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (2 preceding siblings ...)
  2018-10-02 16:19 ` [PATCH, AArch64 v2 07/11] aarch64: Add out-of-line functions for LSE atomics Richard Henderson
@ 2018-10-02 16:19 ` Richard Henderson
  2018-10-30 20:32   ` James Greenhalgh
  2018-10-02 16:19 ` [PATCH, AArch64 v2 04/11] aarch64: Improve atomic-op lse generation Richard Henderson
                   ` (7 subsequent siblings)
  11 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

The cas insn is a single insn, and if expanded properly need not
be split after reload.  Use the proper inputs for the insn.
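
A minimal sketch of code that exercises this path (not from the patch;
the function name is invented for the example):

  _Bool
  cas_int (int *p, int expected, int desired)
  {
    /* With LSE enabled this should now expand directly to a single
       casal instruction followed by a compare of the returned value,
       with no post-reload splitting.  */
    return __atomic_compare_exchange_n (p, &expected, desired, 0,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  }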

	* config/aarch64/aarch64.c (aarch64_expand_compare_and_swap):
	Force oldval into the rval register for TARGET_LSE; emit the compare
	during initial expansion so that it may be deleted if unused.
	(aarch64_gen_atomic_cas): Remove.
	* config/aarch64/atomics.md (@aarch64_compare_and_swap<SHORT>_lse):
	Change =&r to +r for operand 0; use match_dup for operand 2;
	remove is_weak and mod_f operands as unused.  Drop the split
	and merge with...
	(@aarch64_atomic_cas<SHORT>): ... this pattern's output; remove.
	(@aarch64_compare_and_swap<GPI>_lse): Similarly.
	(@aarch64_atomic_cas<GPI>): Similarly.
---
 gcc/config/aarch64/aarch64-protos.h |   1 -
 gcc/config/aarch64/aarch64.c        |  46 ++++-------
 gcc/config/aarch64/atomics.md       | 121 ++++++++--------------------
 3 files changed, 49 insertions(+), 119 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index caf1d2041f0..3d045cf43be 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -562,7 +562,6 @@ rtx aarch64_load_tp (rtx);
 
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
-void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_atomic_ldop_supported_p (enum rtx_code);
 void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 12f7dfe9a75..fbec54fe5da 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14183,16 +14183,27 @@ aarch64_expand_compare_and_swap (rtx operands[])
     }
 
   if (TARGET_LSE)
-    emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
-						 newval, is_weak, mod_s,
-						 mod_f));
+    {
+      /* The CAS insn requires oldval and rval overlap, but we need to
+	 have a copy of oldval saved across the operation to tell if
+	 the operation is successful.  */
+      if (mode == QImode || mode == HImode)
+	rval = copy_to_mode_reg (SImode, gen_lowpart (SImode, oldval));
+      else if (reg_overlap_mentioned_p (rval, oldval))
+        rval = copy_to_mode_reg (mode, oldval);
+      else
+	emit_move_insn (rval, oldval);
+      emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
+						   newval, mod_s));
+      aarch64_gen_compare_reg (EQ, rval, oldval);
+    }
   else
     emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
 					     is_weak, mod_s, mod_f));
 
-
   if (mode == QImode || mode == HImode)
-    emit_move_insn (operands[1], gen_lowpart (mode, rval));
+    rval = gen_lowpart (mode, rval);
+  emit_move_insn (operands[1], rval);
 
   x = gen_rtx_REG (CCmode, CC_REGNUM);
   x = gen_rtx_EQ (SImode, x, const0_rtx);
@@ -14242,31 +14253,6 @@ aarch64_emit_post_barrier (enum memmodel model)
     }
 }
 
-/* Emit an atomic compare-and-swap operation.  RVAL is the destination register
-   for the data in memory.  EXPECTED is the value expected to be in memory.
-   DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
-   is the memory ordering to use.  */
-
-void
-aarch64_gen_atomic_cas (rtx rval, rtx mem,
-			rtx expected, rtx desired,
-			rtx model)
-{
-  machine_mode mode;
-
-  mode = GET_MODE (mem);
-
-  /* Move the expected value into the CAS destination register.  */
-  emit_insn (gen_rtx_SET (rval, expected));
-
-  /* Emit the CAS.  */
-  emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
-
-  /* Compare the expected value with the value loaded by the CAS, to establish
-     whether the swap was made.  */
-  aarch64_gen_compare_reg (EQ, rval, expected);
-}
-
 /* Split a compare and swap pattern.  */
 
 void
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index bba8e9e9c8e..22660850af1 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -85,56 +85,50 @@
   }
 )
 
-(define_insn_and_split "@aarch64_compare_and_swap<mode>_lse"
-  [(set (reg:CC CC_REGNUM)					;; bool out
-    (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
-   (set (match_operand:SI 0 "register_operand" "=&r")		;; val out
+(define_insn "@aarch64_compare_and_swap<mode>_lse"
+  [(set (match_operand:SI 0 "register_operand" "+r")		;; val out
     (zero_extend:SI
-      (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
+     (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
    (set (match_dup 1)
     (unspec_volatile:SHORT
-      [(match_operand:SI 2 "aarch64_plus_operand" "rI")	;; expected
-       (match_operand:SHORT 3 "aarch64_reg_or_zero" "rZ")	;; desired
-       (match_operand:SI 4 "const_int_operand")		;; is_weak
-       (match_operand:SI 5 "const_int_operand")		;; mod_s
-       (match_operand:SI 6 "const_int_operand")]	;; mod_f
+      [(match_dup 0)						;; expected
+       (match_operand:SHORT 2 "aarch64_reg_or_zero" "rZ")	;; desired
+       (match_operand:SI 3 "const_int_operand")]		;; mod_s
       UNSPECV_ATOMIC_CMPSW))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-  {
-    aarch64_gen_atomic_cas (operands[0], operands[1],
-			    operands[2], operands[3],
-			    operands[5]);
-    DONE;
-  }
-)
+{
+  enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+  if (is_mm_relaxed (model))
+    return "cas<atomic_sfx>\t%<w>0, %<w>2, %1";
+  else if (is_mm_acquire (model) || is_mm_consume (model))
+    return "casa<atomic_sfx>\t%<w>0, %<w>2, %1";
+  else if (is_mm_release (model))
+    return "casl<atomic_sfx>\t%<w>0, %<w>2, %1";
+  else
+    return "casal<atomic_sfx>\t%<w>0, %<w>2, %1";
+})
 
-(define_insn_and_split "@aarch64_compare_and_swap<mode>_lse"
-  [(set (reg:CC CC_REGNUM)					;; bool out
-    (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
-   (set (match_operand:GPI 0 "register_operand" "=&r")		;; val out
+(define_insn "@aarch64_compare_and_swap<mode>_lse"
+  [(set (match_operand:GPI 0 "register_operand" "+r")		;; val out
     (match_operand:GPI 1 "aarch64_sync_memory_operand" "+Q"))   ;; memory
    (set (match_dup 1)
     (unspec_volatile:GPI
-      [(match_operand:GPI 2 "aarch64_plus_operand" "rI")	;; expect
-       (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")		;; desired
-       (match_operand:SI 4 "const_int_operand")			;; is_weak
-       (match_operand:SI 5 "const_int_operand")			;; mod_s
-       (match_operand:SI 6 "const_int_operand")]		;; mod_f
+      [(match_dup 0)						;; expected
+       (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")		;; desired
+       (match_operand:SI 3 "const_int_operand")]		;; mod_s
       UNSPECV_ATOMIC_CMPSW))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-  {
-    aarch64_gen_atomic_cas (operands[0], operands[1],
-			    operands[2], operands[3],
-			    operands[5]);
-    DONE;
-  }
-)
+{
+  enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+  if (is_mm_relaxed (model))
+    return "cas<atomic_sfx>\t%<w>0, %<w>2, %1";
+  else if (is_mm_acquire (model) || is_mm_consume (model))
+    return "casa<atomic_sfx>\t%<w>0, %<w>2, %1";
+  else if (is_mm_release (model))
+    return "casl<atomic_sfx>\t%<w>0, %<w>2, %1";
+  else
+    return "casal<atomic_sfx>\t%<w>0, %<w>2, %1";
+})
 
 (define_expand "atomic_exchange<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
@@ -610,55 +604,6 @@
       return "swpal<atomic_sfx>\t%<w>2, %<w>0, %1";
   })
 
-;; Atomic compare-and-swap: HI and smaller modes.
-
-(define_insn "@aarch64_atomic_cas<mode>"
- [(set (match_operand:SI 0 "register_operand" "+&r")		  ;; out
-   (zero_extend:SI
-    (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q")))  ;; memory.
-  (set (match_dup 1)
-   (unspec_volatile:SHORT
-    [(match_dup 0)
-     (match_operand:SHORT 2 "aarch64_reg_or_zero" "rZ")	;; value.
-     (match_operand:SI 3 "const_int_operand" "")]	;; model.
-    UNSPECV_ATOMIC_CAS))]
- "TARGET_LSE && reload_completed"
-{
-  enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-  if (is_mm_relaxed (model))
-    return "cas<atomic_sfx>\t%<w>0, %<w>2, %1";
-  else if (is_mm_acquire (model) || is_mm_consume (model))
-    return "casa<atomic_sfx>\t%<w>0, %<w>2, %1";
-  else if (is_mm_release (model))
-    return "casl<atomic_sfx>\t%<w>0, %<w>2, %1";
-  else
-    return "casal<atomic_sfx>\t%<w>0, %<w>2, %1";
-})
-
-;; Atomic compare-and-swap: SI and larger modes.
-
-(define_insn "@aarch64_atomic_cas<mode>"
- [(set (match_operand:GPI 0 "register_operand" "+&r")	      ;; out
-   (match_operand:GPI 1 "aarch64_sync_memory_operand" "+Q"))  ;; memory.
-  (set (match_dup 1)
-   (unspec_volatile:GPI
-    [(match_dup 0)
-     (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")	;; value.
-     (match_operand:SI 3 "const_int_operand" "")]	;; model.
-    UNSPECV_ATOMIC_CAS))]
-  "TARGET_LSE && reload_completed"
-{
-    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-    if (is_mm_relaxed (model))
-      return "cas<atomic_sfx>\t%<w>0, %<w>2, %1";
-    else if (is_mm_acquire (model) || is_mm_consume (model))
-      return "casa<atomic_sfx>\t%<w>0, %<w>2, %1";
-    else if (is_mm_release (model))
-      return "casl<atomic_sfx>\t%<w>0, %<w>2, %1";
-    else
-      return "casal<atomic_sfx>\t%<w>0, %<w>2, %1";
-})
-
 ;; Atomic load-op: Load data, operate, store result, keep data.
 
 (define_insn "@aarch64_atomic_load<atomic_ldop><mode>"
-- 
2.17.1


* [PATCH, AArch64 v2 10/11] aarch64: Implement TImode compare-and-swap
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (8 preceding siblings ...)
  2018-10-02 16:19 ` [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors Richard Henderson
@ 2018-10-02 16:36 ` Richard Henderson
  2018-10-02 16:37 ` [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions Richard Henderson
  2018-10-11 17:44 ` [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
  11 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:36 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

This pattern will only be used with the __sync functions, because
we do not yet have a bare TImode atomic load.
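
A hedged usage sketch (not part of the patch; the function name is
invented): the 16-byte pattern is reachable through the __sync builtins,
for example

  __int128
  cas16 (__int128 *p, __int128 expected, __int128 desired)
  {
    /* Expands to CASP under LSE, or to an LDXP/STXP loop otherwise;
       the __atomic forms are not affected because there is no bare
       TImode atomic load yet.  */
    return __sync_val_compare_and_swap (p, expected, desired);
  }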

	* config/aarch64/aarch64.c (aarch64_gen_compare_reg): Add support
	for NE comparison of TImode values.
	(aarch64_print_operand): Extend %R to handle general registers.
	(aarch64_emit_load_exclusive): Add support for TImode.
	(aarch64_emit_store_exclusive): Likewise.
	(aarch64_atomic_ool_func): Likewise.
	(aarch64_ool_cas_names): Likewise.
	* config/aarch64/atomics.md (@atomic_compare_and_swap<ALLI_TI>):
	Change iterator from ALLI to ALLI_TI.
	(@atomic_compare_and_swap<JUST_TI>): New.
	(@atomic_compare_and_swap<JUST_TI>_lse): New.
	(aarch64_load_exclusive_pair): New.
	(aarch64_store_exclusive_pair): New.
	* config/aarch64/iterators.md (JUST_TI): New.

	* config/aarch64/lse.c (cas): Add support for SIZE == 16.
	* config/aarch64/t-lse (S0, O0): Split out cas.
	(LSE_OBJS): Include $(O0).
---
 gcc/config/aarch64/aarch64-protos.h |  2 +-
 gcc/config/aarch64/aarch64.c        | 72 ++++++++++++++++++-----
 libgcc/config/aarch64/lse.c         | 48 ++++++++++-----
 gcc/config/aarch64/atomics.md       | 91 +++++++++++++++++++++++++++--
 gcc/config/aarch64/iterators.md     |  3 +
 libgcc/config/aarch64/t-lse         | 10 +++-
 6 files changed, 189 insertions(+), 37 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index c7b96b12bbe..f735c4e5ad8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -626,7 +626,7 @@ bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT);
 
 struct atomic_ool_names
 {
-    const char *str[4][4];
+    const char *str[5][4];
 };
 
 rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ce4d7e51d00..ac2f055a09e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1610,10 +1610,33 @@ emit_set_insn (rtx x, rtx y)
 rtx
 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 {
-  machine_mode mode = SELECT_CC_MODE (code, x, y);
-  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+  machine_mode cmp_mode = GET_MODE (x);
+  machine_mode cc_mode;
+  rtx cc_reg;
 
-  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+  if (cmp_mode == E_TImode)
+    {
+      gcc_assert (code == NE);
+
+      cc_mode = E_CCmode;
+      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+
+      rtx x_lo = operand_subword (x, 0, 0, TImode);
+      rtx y_lo = operand_subword (y, 0, 0, TImode);
+      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
+
+      rtx x_hi = operand_subword (x, 1, 0, TImode);
+      rtx y_hi = operand_subword (y, 1, 0, TImode);
+      emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
+			     gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
+			     GEN_INT (AARCH64_EQ)));
+    }
+  else
+    {
+      cc_mode = SELECT_CC_MODE (code, x, y);
+      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
+    }
   return cc_reg;
 }
 
@@ -6693,7 +6716,7 @@ sizetochar (int size)
      'S/T/U/V':		Print a FP/SIMD register name for a register list.
 			The register printed is the FP/SIMD register name
 			of X + 0/1/2/3 for S/T/U/V.
-     'R':		Print a scalar FP/SIMD register name + 1.
+     'R':		Print a scalar Integer/FP/SIMD register name + 1.
      'X':		Print bottom 16 bits of integer constant in hex.
      'w/x':		Print a general register name or the zero register
 			(32-bit or 64-bit).
@@ -6885,12 +6908,13 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;
 
     case 'R':
-      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
-	{
-	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
-	  return;
-	}
-      asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+      if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
+	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+      else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
+	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
+      else
+	output_operand_lossage ("incompatible register operand for '%%%c'",
+				code);
       break;
 
     case 'X':
@@ -14143,16 +14167,26 @@ static void
 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
 			     rtx mem, rtx model_rtx)
 {
-  emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
+  if (mode == E_TImode)
+    emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
+						gen_highpart (DImode, rval),
+						mem, model_rtx));
+  else
+    emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
 }
 
 /* Emit store exclusive.  */
 
 static void
 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
-			      rtx rval, rtx mem, rtx model_rtx)
+			      rtx mem, rtx val, rtx model_rtx)
 {
-  emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
+  if (mode == E_TImode)
+    emit_insn (gen_aarch64_store_exclusive_pair
+	       (bval, mem, operand_subword (val, 0, 0, TImode),
+		operand_subword (val, 1, 0, TImode), model_rtx));
+  else
+    emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, val, model_rtx));
 }
 
 /* Mark the previous jump instruction as unlikely.  */
@@ -14164,7 +14198,7 @@ aarch64_emit_unlikely_jump (rtx insn)
   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }
 
-/* We store the names of the various atomic helpers in a 4x4 array.
+/* We store the names of the various atomic helpers in a 5x4 array.
    Return the libcall function given MODE, MODEL and NAMES.  */
 
 rtx
@@ -14188,6 +14222,9 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
     case E_DImode:
       mode_idx = 3;
       break;
+    case E_TImode:
+      mode_idx = 4;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -14222,9 +14259,11 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
     "__aa64_" #B #N "_rel", \
     "__aa64_" #B #N "_acq_rel" }
 
-#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8)
+#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
+		 { NULL, NULL, NULL, NULL }
+#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
 
-static const atomic_ool_names aarch64_ool_cas_names = { { DEF4(cas) } };
+static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
@@ -14247,6 +14286,7 @@ const atomic_ool_names aarch64_ool_steor_names = { { DEF4(eor) } };
 
 #undef DEF0
 #undef DEF4
+#undef DEF5
 
 /* Expand a compare and swap pattern.  */
 
diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
index 68ca7df667b..f6114add71a 100644
--- a/libgcc/config/aarch64/lse.c
+++ b/libgcc/config/aarch64/lse.c
@@ -91,6 +91,7 @@ asm(".arch armv8-a+lse");
 #elif SIZE == 4 || SIZE == 8
 # define S     ""
 # define MASK  ""
+#elif SIZE == 16
 #else
 # error
 #endif
@@ -98,9 +99,11 @@ asm(".arch armv8-a+lse");
 #if SIZE < 8
 # define T  unsigned int
 # define W  "w"
-#else
+#elif SIZE == 8
 # define T  unsigned long long
 # define W  ""
+#else
+# define T  unsigned __int128
 #endif
 
 #if MODEL == 1
@@ -138,19 +141,38 @@ T NAME(cas) (T cmp, T new, T *ptr)
   unsigned tmp;
 
   if (have_atomics)
-    __asm__("cas" A L S " %"W"0, %"W"2, %1"
-            : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+    {
+#if SIZE == 16
+      __asm__("casp" A L " %0, %R0, %2, %R2, %1"
+              : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+#else
+      __asm__("cas" A L S " %"W"0, %"W"2, %1"
+              : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+#endif
+    }
   else
-    __asm__(
-	"0: "
-	"ld" A "xr"S" %"W"0, %1\n\t"
-	"cmp %"W"0, %"W"4" MASK "\n\t"
-	"bne 1f\n\t"
-	"st" L "xr"S" %w2, %"W"3, %1\n\t"
-	"cbnz %w2, 0b\n"
-	"1:"
-	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
-
+    {
+#if SIZE == 16
+      __asm__("0: "
+	      "ld" A "xp %0, %R0, %1\n\t"
+	      "cmp %0, %4\n\t"
+	      "ccmp %R0, %R4, #0, eq\n\t"
+	      "bne 1f\n\t"
+	      "st" L "xp %w2, %3, %R3, %1\n\t"
+	      "cbnz %w2, 0b\n"
+	      "1:"
+	      : "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
+#else
+      __asm__("0: "
+	      "ld" A "xr"S" %"W"0, %1\n\t"
+	      "cmp %"W"0, %"W"4" MASK "\n\t"
+	      "bne 1f\n\t"
+	      "st" L "xr"S" %w2, %"W"3, %1\n\t"
+	      "cbnz %w2, 0b\n"
+	      "1:"
+	      : "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
+#endif
+    }
   return old;
 }
 #endif
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 72f9962fe55..fe604606bdd 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -22,10 +22,10 @@
 
 (define_expand "@atomic_compare_and_swap<mode>"
   [(match_operand:SI 0 "register_operand" "")			;; bool out
-   (match_operand:ALLI 1 "register_operand" "")			;; val out
-   (match_operand:ALLI 2 "aarch64_sync_memory_operand" "")	;; memory
-   (match_operand:ALLI 3 "nonmemory_operand" "")		;; expected
-   (match_operand:ALLI 4 "aarch64_reg_or_zero" "")		;; desired
+   (match_operand:ALLI_TI 1 "register_operand" "")		;; val out
+   (match_operand:ALLI_TI 2 "aarch64_sync_memory_operand" "")	;; memory
+   (match_operand:ALLI_TI 3 "nonmemory_operand" "")		;; expected
+   (match_operand:ALLI_TI 4 "aarch64_reg_or_zero" "")		;; desired
    (match_operand:SI 5 "const_int_operand")			;; is_weak
    (match_operand:SI 6 "const_int_operand")			;; mod_s
    (match_operand:SI 7 "const_int_operand")]			;; mod_f
@@ -88,6 +88,30 @@
   }
 )
 
+(define_insn_and_split "@aarch64_compare_and_swap<mode>"
+  [(set (reg:CC CC_REGNUM)					;; bool out
+    (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
+   (set (match_operand:JUST_TI 0 "register_operand" "=&r")	;; val out
+    (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+   (set (match_dup 1)
+    (unspec_volatile:JUST_TI
+      [(match_operand:JUST_TI 2 "register_operand" "r")		;; expect
+       (match_operand:JUST_TI 3 "aarch64_reg_or_zero" "rZ")	;; desired
+       (match_operand:SI 4 "const_int_operand")			;; is_weak
+       (match_operand:SI 5 "const_int_operand")			;; mod_s
+       (match_operand:SI 6 "const_int_operand")]		;; mod_f
+      UNSPECV_ATOMIC_CMPSW))
+   (clobber (match_scratch:SI 7 "=&r"))]
+  ""
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_compare_and_swap (operands);
+    DONE;
+  }
+)
+
 (define_insn "@aarch64_compare_and_swap<mode>_lse"
   [(set (match_operand:SI 0 "register_operand" "+r")		;; val out
     (zero_extend:SI
@@ -133,6 +157,28 @@
     return "casal<atomic_sfx>\t%<w>0, %<w>2, %1";
 })
 
+(define_insn "@aarch64_compare_and_swap<mode>_lse"
+  [(set (match_operand:JUST_TI 0 "register_operand" "+r")	;; val out
+    (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+   (set (match_dup 1)
+    (unspec_volatile:JUST_TI
+      [(match_dup 0)						;; expect
+       (match_operand:JUST_TI 2 "register_operand" "r")		;; desired
+       (match_operand:SI 3 "const_int_operand")]		;; mod_s
+      UNSPECV_ATOMIC_CMPSW))]
+  "TARGET_LSE"
+{
+  enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+  if (is_mm_relaxed (model))
+    return "casp\t%0, %R0, %2, %R2, %1";
+  else if (is_mm_acquire (model) || is_mm_consume (model))
+    return "caspa\t%0, %R0, %2, %R2, %1";
+  else if (is_mm_release (model))
+    return "caspl\t%0, %R0, %2, %R2, %1";
+  else
+    return "caspal\t%0, %R0, %2, %R2, %1";
+})
+
 (define_expand "atomic_exchange<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "")
@@ -650,6 +696,24 @@
   }
 )
 
+(define_insn "aarch64_load_exclusive_pair"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec_volatile:DI
+	  [(match_operand:TI 2 "aarch64_sync_memory_operand" "Q")
+	   (match_operand:SI 3 "const_int_operand")]
+	  UNSPECV_LX))
+   (set (match_operand:DI 1 "register_operand" "=r")
+	(unspec_volatile:DI [(match_dup 2) (match_dup 3)] UNSPECV_LX))]
+  ""
+  {
+    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
+      return "ldxp\t%0, %1, %2";
+    else
+      return "ldaxp\t%0, %1, %2";
+  }
+)
+
 (define_insn "@aarch64_store_exclusive<mode>"
   [(set (match_operand:SI 0 "register_operand" "=&r")
     (unspec_volatile:SI [(const_int 0)] UNSPECV_SX))
@@ -668,6 +732,25 @@
   }
 )
 
+(define_insn "aarch64_store_exclusive_pair"
+  [(set (match_operand:SI 0 "register_operand" "=&r")
+	(unspec_volatile:SI [(const_int 0)] UNSPECV_SX))
+   (set (match_operand:TI 1 "aarch64_sync_memory_operand" "=Q")
+	(unspec_volatile:TI
+	  [(match_operand:DI 2 "aarch64_reg_or_zero" "rZ")
+	   (match_operand:DI 3 "aarch64_reg_or_zero" "rZ")
+	   (match_operand:SI 4 "const_int_operand")]
+	  UNSPECV_SX))]
+  ""
+  {
+    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
+      return "stxp\t%w0, %x2, %x3, %1";
+    else
+      return "stlxp\t%w0, %x2, %x3, %1";
+  }
+)
+
 (define_expand "mem_thread_fence"
   [(match_operand:SI 0 "const_int_operand" "")]
   ""
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 524e4e6929b..dd26bdbbc6b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -29,6 +29,9 @@
 ;; Iterator for HI, SI, DI, some instructions can only work on these modes.
 (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI])
 
+;; "Iterator" for just TI -- features like @pattern only work with iterators.
+(define_mode_iterator JUST_TI [TI])
+
 ;; Iterator for QI and HI modes
 (define_mode_iterator SHORT [QI HI])
 
diff --git a/libgcc/config/aarch64/t-lse b/libgcc/config/aarch64/t-lse
index e862b0c2448..534ff6efea8 100644
--- a/libgcc/config/aarch64/t-lse
+++ b/libgcc/config/aarch64/t-lse
@@ -18,15 +18,19 @@
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-# CAS, Swap, Load-and-operate have 4 sizes and 4 memory models
-S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), cas swp ldadd ldclr ldeor ldset))
+# Compare-and-swap has 5 sizes and 4 memory models.
+S0 := $(foreach s, 1 2 4 8 16, $(addsuffix _$(s), cas))
+O0 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S0)))
+
+# Swap, Load-and-operate have 4 sizes and 4 memory models
+S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), swp ldadd ldclr ldeor ldset))
 O1 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S1)))
 
 # Store-and-operate has 4 sizes but only 2 memory models (relaxed, release).
 S2 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), stadd stclr steor stset))
 O2 := $(foreach m, 1 3, $(addsuffix _$(m)$(objext), $(S2)))
 
-LSE_OBJS := $(O1) $(O2)
+LSE_OBJS := $(O0) $(O1) $(O2)
 
 libgcc-objects += $(LSE_OBJS) have_atomic$(objext)
 
-- 
2.17.1


* [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (9 preceding siblings ...)
  2018-10-02 16:36 ` [PATCH, AArch64 v2 10/11] aarch64: Implement TImode compare-and-swap Richard Henderson
@ 2018-10-02 16:37 ` Richard Henderson
  2018-10-30 21:42   ` James Greenhalgh
  2018-10-11 17:44 ` [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
  11 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-02 16:37 UTC (permalink / raw)
  To: gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

When the result of an operation is not used, we can discard it by
making XZR the destination of the LD<op>.  For two of the memory
models, LD<op> with XZR as its destination has a preferred assembler
alias, ST<op>.
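
A small illustration of the case this targets (not from the patch; the
function name is invented):

  void
  add_noreturn (unsigned *p, unsigned v)
  {
    /* The fetched value is unused, so for the relaxed and release
       models this can now be emitted as "stadd w1, [x0]" or
       "staddl w1, [x0]" rather than an LDADD with a dead destination.  */
    __atomic_fetch_add (p, v, __ATOMIC_RELAXED);
  }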

	* config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse):
	Use ST<op> for relaxed and release models; load to XZR otherwise;
	remove the now unnecessary scratch register.

	* gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}.
	* gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly.
---
 .../gcc.target/aarch64/atomic-inst-ldadd.c    | 18 ++++---
 .../gcc.target/aarch64/atomic-inst-ldlogic.c  | 54 ++++++++++++-------
 gcc/config/aarch64/atomics.md                 | 15 +++---
 3 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
index 4b2282c6861..db2206186b4 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
@@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN)
 TEST (sub_load, SUB_LOAD)
 TEST (sub_load_notreturn, SUB_LOAD_NORETURN)
 
-/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */
+/* { dg-final { scan-assembler-times "staddb\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlb\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */
+/* { dg-final { scan-assembler-times "staddh\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlh\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldadd\t" 32} } */
+/* { dg-final { scan-assembler-times "ldadd\t" 16} } */
 /* { dg-final { scan-assembler-times "ldadda\t" 64} } */
-/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */
+/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */
 /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */
+/* { dg-final { scan-assembler-times "stadd\t" 16} } */
+/* { dg-final { scan-assembler-times "staddl\t" 16} } */
 
 /* { dg-final { scan-assembler-not "ldaxr\t" } } */
 /* { dg-final { scan-assembler-not "stlxr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
index 4879d52b9b4..b8a53e0a676 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
@@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN)
 
 /* Load-OR.  */
 
-/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */
+/* { dg-final { scan-assembler-times "stsetb\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldseth\t" 8} } */
+/* { dg-final { scan-assembler-times "ldseth\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */
+/* { dg-final { scan-assembler-times "stseth\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldset\t" 16} } */
+/* { dg-final { scan-assembler-times "ldset\t" 8} } */
 /* { dg-final { scan-assembler-times "ldseta\t" 32} } */
-/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */
+/* { dg-final { scan-assembler-times "stset\t" 8} } */
+/* { dg-final { scan-assembler-times "stsetl\t" 8} } */
 
 /* Load-AND.  */
 
-/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrb\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclrah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclralh\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrh\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldclr\t" 16} */
+/* { dg-final { scan-assembler-times "ldclr\t" 8} */
 /* { dg-final { scan-assembler-times "ldclra\t" 32} } */
-/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldclral\t" 32} } */
+/* { dg-final { scan-assembler-times "stclr\t" 8} */
+/* { dg-final { scan-assembler-times "stclrl\t" 8} } */
 
 /* Load-XOR.  */
 
-/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeorab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */
+/* { dg-final { scan-assembler-times "steorb\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeorah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */
+/* { dg-final { scan-assembler-times "steorh\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldeor\t" 16} */
+/* { dg-final { scan-assembler-times "ldeor\t" 8} */
 /* { dg-final { scan-assembler-times "ldeora\t" 32} } */
-/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldeoral\t" 32} } */
+/* { dg-final { scan-assembler-times "steor\t" 8} */
+/* { dg-final { scan-assembler-times "steorl\t" 8} } */
 
 /* { dg-final { scan-assembler-not "ldaxr\t" } } */
 /* { dg-final { scan-assembler-not "stlxr\t" } } */
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 2198649b1be..ee662a4480e 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -269,19 +269,22 @@
 	  [(match_dup 0)
 	   (match_operand:ALLI 1 "register_operand" "r")
 	   (match_operand:SI 2 "const_int_operand")]
-      ATOMIC_LDOP))
-   (clobber (match_scratch:ALLI 3 "=&r"))]
+      ATOMIC_LDOP))]
   "TARGET_LSE"
   {
    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+   /* Note that while ST<op> is an alias for LD<op> with the second
+      operand as XZR, the assembler only defines them for the RELAXED
+      and REL models.  But there's nothing that prevents us from explicitly
+      using XZR with LD<op> for the ACQ and ACQ_REL models.  */
    if (is_mm_relaxed (model))
-     return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0";
    else if (is_mm_release (model))
-     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0";
    else if (is_mm_acquire (model) || is_mm_consume (model))
-     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, <w>zr, %0";
    else
-     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, <w>zr, %0";
   }
 )
 
-- 
2.17.1

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 00/11] LSE atomics out-of-line
  2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
                   ` (10 preceding siblings ...)
  2018-10-02 16:37 ` [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions Richard Henderson
@ 2018-10-11 17:44 ` Richard Henderson
  2018-10-22 14:01   ` Richard Henderson
  11 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-11 17:44 UTC (permalink / raw)
  To: Richard Henderson, gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

Ping.

On 10/2/18 9:19 AM, Richard Henderson wrote:
> Changes since v1:
>   * Use config/t-slibgcc-libgcc instead of gcc.c changes.
>   * Some style fixes.
>   * Ifdefs to work with old glibc.
> 
>   * Force TImode registers into even regnos.
>     Required by CASP, allowed by the ABI, and is seen as the
>     simpler solution than adding two new register classes.
> 
>   * Use match_dup instead of matching constraints for CAS{P}.
>     Matching constraints result in lots of extraneous moves
>     for TImode, and keeping the expander interface the same
>     for non-TImode simplifies the code.
> 
> 
> r~
> 
> 
> Richard Henderson (11):
>   aarch64: Simplify LSE cas generation
>   aarch64: Improve cas generation
>   aarch64: Improve swp generation
>   aarch64: Improve atomic-op lse generation
>   aarch64: Emit LSE st<op> instructions
>   Add visibility to libfunc constructors
>   aarch64: Add out-of-line functions for LSE atomics
>   aarch64: Implement -matomic-ool
>   aarch64: Force TImode values into even registers
>   aarch64: Implement TImode compare-and-swap
>   Enable -matomic-ool by default
> 
>  gcc/config/aarch64/aarch64-protos.h           |  20 +-
>  gcc/optabs-libfuncs.h                         |   2 +
>  gcc/common/config/aarch64/aarch64-common.c    |   6 +-
>  gcc/config/aarch64/aarch64.c                  | 494 +++++++-------
>  gcc/optabs-libfuncs.c                         |  26 +-
>  .../atomic-comp-swap-release-acquire.c        |   2 +-
>  .../gcc.target/aarch64/atomic-inst-ldadd.c    |  18 +-
>  .../gcc.target/aarch64/atomic-inst-ldlogic.c  |  54 +-
>  .../gcc.target/aarch64/atomic-op-acq_rel.c    |   2 +-
>  .../gcc.target/aarch64/atomic-op-acquire.c    |   2 +-
>  .../gcc.target/aarch64/atomic-op-char.c       |   2 +-
>  .../gcc.target/aarch64/atomic-op-consume.c    |   2 +-
>  .../gcc.target/aarch64/atomic-op-imm.c        |   2 +-
>  .../gcc.target/aarch64/atomic-op-int.c        |   2 +-
>  .../gcc.target/aarch64/atomic-op-long.c       |   2 +-
>  .../gcc.target/aarch64/atomic-op-relaxed.c    |   2 +-
>  .../gcc.target/aarch64/atomic-op-release.c    |   2 +-
>  .../gcc.target/aarch64/atomic-op-seq_cst.c    |   2 +-
>  .../gcc.target/aarch64/atomic-op-short.c      |   2 +-
>  .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |   2 +-
>  .../atomic_cmp_exchange_zero_strong_1.c       |   2 +-
>  .../gcc.target/aarch64/sync-comp-swap.c       |   2 +-
>  .../gcc.target/aarch64/sync-op-acquire.c      |   2 +-
>  .../gcc.target/aarch64/sync-op-full.c         |   2 +-
>  libgcc/config/aarch64/lse.c                   | 282 ++++++++
>  gcc/config/aarch64/aarch64.opt                |   4 +
>  gcc/config/aarch64/atomics.md                 | 608 ++++++++++--------
>  gcc/config/aarch64/iterators.md               |   8 +-
>  gcc/config/aarch64/predicates.md              |  12 +
>  gcc/doc/invoke.texi                           |  14 +-
>  libgcc/config.host                            |   4 +
>  libgcc/config/aarch64/t-lse                   |  48 ++
>  32 files changed, 1058 insertions(+), 576 deletions(-)
>  create mode 100644 libgcc/config/aarch64/lse.c
>  create mode 100644 libgcc/config/aarch64/t-lse
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 00/11] LSE atomics out-of-line
  2018-10-11 17:44 ` [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
@ 2018-10-22 14:01   ` Richard Henderson
  0 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-22 14:01 UTC (permalink / raw)
  To: Richard Henderson, gcc-patches
  Cc: ramana.radhakrishnan, agraf, marcus.shawcroft, james.greenhalgh,
	richard.earnshaw

Ping 2.

On 10/11/18 6:33 PM, Richard Henderson wrote:
> Ping.
> 
> On 10/2/18 9:19 AM, Richard Henderson wrote:
>> Changes since v1:
>>   * Use config/t-slibgcc-libgcc instead of gcc.c changes.
>>   * Some style fixes.
>>   * Ifdefs to work with old glibc.
>>
>>   * Force TImode registers into even regnos.
>>     Required by CASP, allowed by the ABI, and is seen as the
>>     simpler solution than adding two new register classes.
>>
>>   * Use match_dup instead of matching constraints for CAS{P}.
>>     Matching constraints result in lots of extraneous moves
>>     for TImode, and keeping the expander interface the same
>>     for non-TImode simplifies the code.
>>
>>
>> r~
>>
>>
>> Richard Henderson (11):
>>   aarch64: Simplify LSE cas generation
>>   aarch64: Improve cas generation
>>   aarch64: Improve swp generation
>>   aarch64: Improve atomic-op lse generation
>>   aarch64: Emit LSE st<op> instructions
>>   Add visibility to libfunc constructors
>>   aarch64: Add out-of-line functions for LSE atomics
>>   aarch64: Implement -matomic-ool
>>   aarch64: Force TImode values into even registers
>>   aarch64: Implement TImode compare-and-swap
>>   Enable -matomic-ool by default
>>
>>  gcc/config/aarch64/aarch64-protos.h           |  20 +-
>>  gcc/optabs-libfuncs.h                         |   2 +
>>  gcc/common/config/aarch64/aarch64-common.c    |   6 +-
>>  gcc/config/aarch64/aarch64.c                  | 494 +++++++-------
>>  gcc/optabs-libfuncs.c                         |  26 +-
>>  .../atomic-comp-swap-release-acquire.c        |   2 +-
>>  .../gcc.target/aarch64/atomic-inst-ldadd.c    |  18 +-
>>  .../gcc.target/aarch64/atomic-inst-ldlogic.c  |  54 +-
>>  .../gcc.target/aarch64/atomic-op-acq_rel.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-acquire.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-char.c       |   2 +-
>>  .../gcc.target/aarch64/atomic-op-consume.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-imm.c        |   2 +-
>>  .../gcc.target/aarch64/atomic-op-int.c        |   2 +-
>>  .../gcc.target/aarch64/atomic-op-long.c       |   2 +-
>>  .../gcc.target/aarch64/atomic-op-relaxed.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-release.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-seq_cst.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-short.c      |   2 +-
>>  .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |   2 +-
>>  .../atomic_cmp_exchange_zero_strong_1.c       |   2 +-
>>  .../gcc.target/aarch64/sync-comp-swap.c       |   2 +-
>>  .../gcc.target/aarch64/sync-op-acquire.c      |   2 +-
>>  .../gcc.target/aarch64/sync-op-full.c         |   2 +-
>>  libgcc/config/aarch64/lse.c                   | 282 ++++++++
>>  gcc/config/aarch64/aarch64.opt                |   4 +
>>  gcc/config/aarch64/atomics.md                 | 608 ++++++++++--------
>>  gcc/config/aarch64/iterators.md               |   8 +-
>>  gcc/config/aarch64/predicates.md              |  12 +
>>  gcc/doc/invoke.texi                           |  14 +-
>>  libgcc/config.host                            |   4 +
>>  libgcc/config/aarch64/t-lse                   |  48 ++
>>  32 files changed, 1058 insertions(+), 576 deletions(-)
>>  create mode 100644 libgcc/config/aarch64/lse.c
>>  create mode 100644 libgcc/config/aarch64/t-lse
>>

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation
  2018-10-02 16:19 ` [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation Richard Henderson
@ 2018-10-30 20:32   ` James Greenhalgh
  2018-10-31 10:35     ` Richard Henderson
  0 siblings, 1 reply; 32+ messages in thread
From: James Greenhalgh @ 2018-10-30 20:32 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd

On Tue, Oct 02, 2018 at 11:19:05AM -0500, Richard Henderson wrote:
> The cas insn is a single insn, and if expanded properly need not
> be split after reload.  Use the proper inputs for the insn.

OK.

Thanks,
James

> 
> 	* config/aarch64/aarch64.c (aarch64_expand_compare_and_swap):
> 	Force oldval into the rval register for TARGET_LSE; emit the compare
> 	during initial expansion so that it may be deleted if unused.
> 	(aarch64_gen_atomic_cas): Remove.
> 	* config/aarch64/atomics.md (@aarch64_compare_and_swap<SHORT>_lse):
> 	Change =&r to +r for operand 0; use match_dup for operand 2;
> 	remove is_weak and mod_f operands as unused.  Drop the split
> 	and merge with...
> 	(@aarch64_atomic_cas<SHORT>): ... this pattern's output; remove.
> 	(@aarch64_compare_and_swap<GPI>_lse): Similarly.
> 	(@aarch64_atomic_cas<GPI>): Similarly.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 02/11] aarch64: Improve cas generation
  2018-10-02 16:19 ` [PATCH, AArch64 v2 02/11] aarch64: Improve cas generation Richard Henderson
@ 2018-10-30 20:37   ` James Greenhalgh
  0 siblings, 0 replies; 32+ messages in thread
From: James Greenhalgh @ 2018-10-30 20:37 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd

On Tue, Oct 02, 2018 at 11:19:06AM -0500, Richard Henderson wrote:
> Do not zero-extend the input to the cas for subword operations;
> instead, use the appropriate zero-extending compare insns.
> Correct the predicates and constraints for immediate expected operand.

OK, modulo two very dull style comments.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index fbec54fe5da..0e2b85de1e3 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1613,6 +1613,33 @@ aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
>    return cc_reg;
>  }
>  
> +/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
> +
> +static rtx
> +aarch64_gen_compare_reg_maybe_ze(RTX_CODE code, rtx x, rtx y,
> +                                 machine_mode y_mode)

Space before the bracket: aarch64_gen_compare_reg_maybe_ze (RTX_CODE

> @@ -14187,26 +14197,32 @@ aarch64_expand_compare_and_swap (rtx operands[])
>        /* The CAS insn requires oldval and rval overlap, but we need to
>  	 have a copy of oldval saved across the operation to tell if
>  	 the operation is successful.  */
> -      if (mode == QImode || mode == HImode)
> -	rval = copy_to_mode_reg (SImode, gen_lowpart (SImode, oldval));
> -      else if (reg_overlap_mentioned_p (rval, oldval))
> -        rval = copy_to_mode_reg (mode, oldval);
> -      else
> -	emit_move_insn (rval, oldval);
> +      if (reg_overlap_mentioned_p (rval, oldval))
> +        rval = copy_to_mode_reg (r_mode, oldval);
> +      else 

Trailing space on else.

> +	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
> +
>        emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
>  						   newval, mod_s));
> -      aarch64_gen_compare_reg (EQ, rval, oldval);
> +      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
>      }

Thanks,
James

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 03/11] aarch64: Improve swp generation
  2018-10-02 16:19 ` [PATCH, AArch64 v2 03/11] aarch64: Improve swp generation Richard Henderson
@ 2018-10-30 20:50   ` James Greenhalgh
  0 siblings, 0 replies; 32+ messages in thread
From: James Greenhalgh @ 2018-10-30 20:50 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd

On Tue, Oct 02, 2018 at 11:19:07AM -0500, Richard Henderson wrote:
> Allow zero as an input; fix constraints; avoid unnecessary split.

OK.

James

> 
> 	* config/aarch64/aarch64.c (aarch64_emit_atomic_swap): Remove.
> 	(aarch64_gen_atomic_ldop): Don't call it.
> 	* config/aarch64/atomics.md (atomic_exchange<ALLI>):
> 	Use aarch64_reg_or_zero.
> 	(aarch64_atomic_exchange<ALLI>): Likewise.
> 	(aarch64_atomic_exchange<ALLI>_lse): Remove split; remove & from
> 	operand 0; use aarch64_reg_or_zero for input; merge ...
> 	(@aarch64_atomic_swp<ALLI>): ... this and remove.
> ---

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 04/11] aarch64: Improve atomic-op lse generation
  2018-10-02 16:19 ` [PATCH, AArch64 v2 04/11] aarch64: Improve atomic-op lse generation Richard Henderson
@ 2018-10-30 21:40   ` James Greenhalgh
  0 siblings, 0 replies; 32+ messages in thread
From: James Greenhalgh @ 2018-10-30 21:40 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd

On Tue, Oct 02, 2018 at 11:19:08AM -0500, Richard Henderson wrote:
> Fix constraints; avoid unnecessary split.  Drop the use of the atomic_op
> iterator in favor of the ATOMIC_LDOP iterator; this is simpler and more
> logical for ldclr aka bic.

OK.

Thanks,
James

> 
> 	* config/aarch64/aarch64.c (aarch64_emit_bic): Remove.
> 	(aarch64_atomic_ldop_supported_p): Remove.
> 	(aarch64_gen_atomic_ldop): Remove.
> 	* config/aarch64/atomic.md (atomic_<atomic_optab><ALLI>):
> 	Fully expand LSE operations here.
> 	(atomic_fetch_<atomic_optab><ALLI>): Likewise.
> 	(atomic_<atomic_optab>_fetch<ALLI>): Likewise.
> 	(aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse): Drop atomic_op iterator
> 	and use ATOMIC_LDOP instead; use register_operand for the input;
> 	drop the split and emit insns directly.
> 	(aarch64_atomic_fetch_<ATOMIC_LDOP><ALLI>_lse): Likewise.
> 	(aarch64_atomic_<atomic_op>_fetch<ALLI>_lse): Remove.
> 	(@aarch64_atomic_load<ATOMIC_LDOP><ALLI>): Remove.
> ---
>  gcc/config/aarch64/aarch64-protos.h |   2 -
>  gcc/config/aarch64/aarch64.c        | 176 -------------------------
>  gcc/config/aarch64/atomics.md       | 197 +++++++++++++++-------------
>  gcc/config/aarch64/iterators.md     |   5 +-
>  4 files changed, 108 insertions(+), 272 deletions(-)
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-02 16:37 ` [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions Richard Henderson
@ 2018-10-30 21:42   ` James Greenhalgh
  2018-10-31 11:13     ` Richard Henderson
  2018-10-31 11:25     ` Richard Henderson
  0 siblings, 2 replies; 32+ messages in thread
From: James Greenhalgh @ 2018-10-30 21:42 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd, will.deacon

On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote:
> When the result of an operation is not used, we can ignore the
> result by storing to XZR.  For two of the memory models, using
> XZR with LD<op> has a preferred assembler alias, ST<op>.

ST<op> has different semantics to LD<op>, in particular, ST<op> is not
ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics.

The relevant Arm Arm text is:

  If the destination register is not one of WZR or XZR, LDADDA and
  LDADDAL load from memory with acquire semantics

  LDADDL and LDADDAL store to memory with release semantics.

  LDADD has no memory ordering requirements.

I'm taking this to mean that even if the result is unused, using XZR is not
a valid transformation; it weakens the expected acquire semantics to
unordered.

The example I have from Will Deacon on an internal bug database is:

  P0 (atomic_int* y,atomic_int* x) {
    atomic_store_explicit(x,1,memory_order_relaxed);
    atomic_thread_fence(memory_order_release);
    atomic_store_explicit(y,1,memory_order_relaxed);
  }

  P1 (atomic_int* y,atomic_int* x) {
    int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed);
    atomic_thread_fence(memory_order_acquire);
    int r1 = atomic_load_explicit(x,memory_order_relaxed);
  }

  The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal.

This example comes from a while back in my memory; so copying Will for
any more detailed questions.

My impression is that this transformation is not safe, and so the patch is
not OK.

Thanks,
James

> 
> 	* config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse):
> 	Use ST<op> for relaxed and release models; load to XZR otherwise;
> 	remove the now unnecessary scratch register.
> 
> 	* gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}.
> 	* gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly.
> ---
>  .../gcc.target/aarch64/atomic-inst-ldadd.c    | 18 ++++---
>  .../gcc.target/aarch64/atomic-inst-ldlogic.c  | 54 ++++++++++++-------
>  gcc/config/aarch64/atomics.md                 | 15 +++---
>  3 files changed, 57 insertions(+), 30 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
> index 4b2282c6861..db2206186b4 100644
> --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
> @@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN)
>  TEST (sub_load, SUB_LOAD)
>  TEST (sub_load_notreturn, SUB_LOAD_NORETURN)
>  
> -/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */
> -/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */
> +/* { dg-final { scan-assembler-times "staddb\t" 8} } */
> +/* { dg-final { scan-assembler-times "staddlb\t" 8} } */
>  
> -/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */
> -/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */
> +/* { dg-final { scan-assembler-times "staddh\t" 8} } */
> +/* { dg-final { scan-assembler-times "staddlh\t" 8} } */
>  
> -/* { dg-final { scan-assembler-times "ldadd\t" 32} } */
> +/* { dg-final { scan-assembler-times "ldadd\t" 16} } */
>  /* { dg-final { scan-assembler-times "ldadda\t" 64} } */
> -/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */
> +/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */
>  /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */
> +/* { dg-final { scan-assembler-times "stadd\t" 16} } */
> +/* { dg-final { scan-assembler-times "staddl\t" 16} } */
>  
>  /* { dg-final { scan-assembler-not "ldaxr\t" } } */
>  /* { dg-final { scan-assembler-not "stlxr\t" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
> index 4879d52b9b4..b8a53e0a676 100644
> --- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
> +++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
> @@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN)
>  
>  /* Load-OR.  */
>  
> -/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */
> -/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */
> +/* { dg-final { scan-assembler-times "stsetb\t" 4} } */
> +/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */
>  
> -/* { dg-final { scan-assembler-times "ldseth\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldseth\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */
> -/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */
> +/* { dg-final { scan-assembler-times "stseth\t" 4} } */
> +/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */
>  
> -/* { dg-final { scan-assembler-times "ldset\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldset\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldseta\t" 32} } */
> -/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */
> +/* { dg-final { scan-assembler-times "stset\t" 8} } */
> +/* { dg-final { scan-assembler-times "stsetl\t" 8} } */
>  
>  /* Load-AND.  */
>  
> -/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */
> -/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */
> +/* { dg-final { scan-assembler-times "stclrb\t" 4} } */
> +/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */
>  
> -/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldclrah\t" 16} } */
> -/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldclralh\t" 16} } */
> +/* { dg-final { scan-assembler-times "stclrh\t" 4} } */
> +/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */
>  
> -/* { dg-final { scan-assembler-times "ldclr\t" 16} */
> +/* { dg-final { scan-assembler-times "ldclr\t" 8} */
>  /* { dg-final { scan-assembler-times "ldclra\t" 32} } */
> -/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldclral\t" 32} } */
> +/* { dg-final { scan-assembler-times "stclr\t" 8} */
> +/* { dg-final { scan-assembler-times "stclrl\t" 8} } */
>  
>  /* Load-XOR.  */
>  
> -/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldeorab\t" 16} } */
> -/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */
> +/* { dg-final { scan-assembler-times "steorb\t" 4} } */
> +/* { dg-final { scan-assembler-times "steorlb\t" 4} } */
>  
> -/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldeorah\t" 16} } */
> -/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */
> +/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */
>  /* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */
> +/* { dg-final { scan-assembler-times "steorh\t" 4} } */
> +/* { dg-final { scan-assembler-times "steorlh\t" 4} } */
>  
> -/* { dg-final { scan-assembler-times "ldeor\t" 16} */
> +/* { dg-final { scan-assembler-times "ldeor\t" 8} */
>  /* { dg-final { scan-assembler-times "ldeora\t" 32} } */
> -/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */
> +/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */
>  /* { dg-final { scan-assembler-times "ldeoral\t" 32} } */
> +/* { dg-final { scan-assembler-times "steor\t" 8} */
> +/* { dg-final { scan-assembler-times "steorl\t" 8} } */
>  
>  /* { dg-final { scan-assembler-not "ldaxr\t" } } */
>  /* { dg-final { scan-assembler-not "stlxr\t" } } */
> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> index 2198649b1be..ee662a4480e 100644
> --- a/gcc/config/aarch64/atomics.md
> +++ b/gcc/config/aarch64/atomics.md
> @@ -269,19 +269,22 @@
>  	  [(match_dup 0)
>  	   (match_operand:ALLI 1 "register_operand" "r")
>  	   (match_operand:SI 2 "const_int_operand")]
> -      ATOMIC_LDOP))
> -   (clobber (match_scratch:ALLI 3 "=&r"))]
> +      ATOMIC_LDOP))]
>    "TARGET_LSE"
>    {
>     enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
> +   /* Note that while ST<op> is an alias for LD<op> with the second
> +      operand as XZR, the assember only defines them for the RELAXED
> +      and REL models.  But there's nothing that prevents us from explicitly
> +      using XZR with LD<op> for the ACQ and ACQ_REL models.  */
>     if (is_mm_relaxed (model))
> -     return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0";
> +     return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0";
>     else if (is_mm_release (model))
> -     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0";
> +     return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0";
>     else if (is_mm_acquire (model) || is_mm_consume (model))
> -     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0";
> +     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, <w>zr, %0";
>     else
> -     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, %<w>3, %0";
> +     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, <w>zr, %0";
>    }
>  )
>  
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors
  2018-10-02 16:19 ` [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors Richard Henderson
@ 2018-10-30 21:46   ` James Greenhalgh
  2018-10-31 11:29   ` Richard Henderson
  2018-10-31 17:46   ` Eric Botcazou
  2 siblings, 0 replies; 32+ messages in thread
From: James Greenhalgh @ 2018-10-30 21:46 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd, law

This one needs some other reviewers copied in, who may have missed that
it is not an AArch64-only patch (it looks fine to me).

James

On Tue, Oct 02, 2018 at 11:19:10AM -0500, Richard Henderson wrote:
> 	* optabs-libfuncs.c (build_libfunc_function_visibility):
> 	New, split out from...
> 	(build_libfunc_function): ... here.
> 	(init_one_libfunc_visibility): New, split out from ...
> 	(init_one_libfunc): ... here.
> ---
>  gcc/optabs-libfuncs.h |  2 ++
>  gcc/optabs-libfuncs.c | 26 ++++++++++++++++++++------
>  2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/optabs-libfuncs.h b/gcc/optabs-libfuncs.h
> index 0669ea1fdd7..cf39da36887 100644
> --- a/gcc/optabs-libfuncs.h
> +++ b/gcc/optabs-libfuncs.h
> @@ -63,7 +63,9 @@ void gen_satfract_conv_libfunc (convert_optab, const char *,
>  void gen_satfractuns_conv_libfunc (convert_optab, const char *,
>  				   machine_mode, machine_mode);
>  
> +tree build_libfunc_function_visibility (const char *, symbol_visibility);
>  tree build_libfunc_function (const char *);
> +rtx init_one_libfunc_visibility (const char *, symbol_visibility);
>  rtx init_one_libfunc (const char *);
>  rtx set_user_assembler_libfunc (const char *, const char *);
>  
> diff --git a/gcc/optabs-libfuncs.c b/gcc/optabs-libfuncs.c
> index bd0df8baa37..73a28e9ca7a 100644
> --- a/gcc/optabs-libfuncs.c
> +++ b/gcc/optabs-libfuncs.c
> @@ -719,10 +719,10 @@ struct libfunc_decl_hasher : ggc_ptr_hash<tree_node>
>  /* A table of previously-created libfuncs, hashed by name.  */
>  static GTY (()) hash_table<libfunc_decl_hasher> *libfunc_decls;
>  
> -/* Build a decl for a libfunc named NAME.  */
> +/* Build a decl for a libfunc named NAME with visibility VIS.  */
>  
>  tree
> -build_libfunc_function (const char *name)
> +build_libfunc_function_visibility (const char *name, symbol_visibility vis)
>  {
>    /* ??? We don't have any type information; pretend this is "int foo ()".  */
>    tree decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
> @@ -731,7 +731,7 @@ build_libfunc_function (const char *name)
>    DECL_EXTERNAL (decl) = 1;
>    TREE_PUBLIC (decl) = 1;
>    DECL_ARTIFICIAL (decl) = 1;
> -  DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT;
> +  DECL_VISIBILITY (decl) = vis;
>    DECL_VISIBILITY_SPECIFIED (decl) = 1;
>    gcc_assert (DECL_ASSEMBLER_NAME (decl));
>  
> @@ -742,11 +742,19 @@ build_libfunc_function (const char *name)
>    return decl;
>  }
>  
> +/* Build a decl for a libfunc named NAME.  */
> +
> +tree
> +build_libfunc_function (const char *name)
> +{
> +  return build_libfunc_function_visibility (name, VISIBILITY_DEFAULT);
> +}
> +
>  /* Return a libfunc for NAME, creating one if we don't already have one.
> -   The returned rtx is a SYMBOL_REF.  */
> +   The decl is given visibility VIS.  The returned rtx is a SYMBOL_REF.  */
>  
>  rtx
> -init_one_libfunc (const char *name)
> +init_one_libfunc_visibility (const char *name, symbol_visibility vis)
>  {
>    tree id, decl;
>    hashval_t hash;
> @@ -763,12 +771,18 @@ init_one_libfunc (const char *name)
>      {
>        /* Create a new decl, so that it can be passed to
>  	 targetm.encode_section_info.  */
> -      decl = build_libfunc_function (name);
> +      decl = build_libfunc_function_visibility (name, vis);
>        *slot = decl;
>      }
>    return XEXP (DECL_RTL (decl), 0);
>  }
>  
> +rtx
> +init_one_libfunc (const char *name)
> +{
> +  return init_one_libfunc_visibility (name, VISIBILITY_DEFAULT);
> +}
> +
>  /* Adjust the assembler name of libfunc NAME to ASMSPEC.  */
>  
>  rtx
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 09/11] aarch64: Force TImode values into even registers
  2018-10-02 16:19 ` [PATCH, AArch64 v2 09/11] aarch64: Force TImode values into even registers Richard Henderson
@ 2018-10-30 21:47   ` James Greenhalgh
  0 siblings, 0 replies; 32+ messages in thread
From: James Greenhalgh @ 2018-10-30 21:47 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd

On Tue, Oct 02, 2018 at 11:19:13AM -0500, Richard Henderson wrote:
> The LSE CASP instruction requires values to be placed in even
> register pairs.  A solution involving two additional register
> classes was rejected in favor of the much simpler solution of
> simply requiring all TImode values to be aligned.

OK.

Thanks,
James

> 
> 	* config/aarch64/aarch64.c (aarch64_hard_regno_mode_ok): Force
> 	16-byte modes held in GP registers to use an even regno.
> ---
>  gcc/config/aarch64/aarch64.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 49b47382b5d..ce4d7e51d00 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1451,10 +1451,14 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
>    if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
>      return mode == Pmode;
>  
> -  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
> -    return true;
> -
> -  if (FP_REGNUM_P (regno))
> +  if (GP_REGNUM_P (regno))
> +    {
> +      if (known_le (GET_MODE_SIZE (mode), 8))
> +	return true;
> +      else if (known_le (GET_MODE_SIZE (mode), 16))
> +	return (regno & 1) == 0;
> +    }
> +  else if (FP_REGNUM_P (regno))
>      {
>        if (vec_flags & VEC_STRUCT)
>  	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
> -- 
> 2.17.1
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation
  2018-10-30 20:32   ` James Greenhalgh
@ 2018-10-31 10:35     ` Richard Henderson
  0 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-31 10:35 UTC (permalink / raw)
  To: James Greenhalgh, Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd

On 10/30/18 7:48 PM, James Greenhalgh wrote:
> On Tue, Oct 02, 2018 at 11:19:05AM -0500, Richard Henderson wrote:
>> The cas insn is a single insn, and if expanded properly need not
>> be split after reload.  Use the proper inputs for the insn.
> 
> OK.

Thanks.  Committed 1-4 & 9.


r~

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-30 21:42   ` James Greenhalgh
@ 2018-10-31 11:13     ` Richard Henderson
  2018-10-31 11:25     ` Richard Henderson
  1 sibling, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-31 11:13 UTC (permalink / raw)
  To: James Greenhalgh, Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd, will.deacon

On 10/30/18 8:32 PM, James Greenhalgh wrote:
> On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote:
>> When the result of an operation is not used, we can ignore the
>> result by storing to XZR.  For two of the memory models, using
>> XZR with LD<op> has a preferred assembler alias, ST<op>.
> 
> ST<op> has different semantics to LD<op>, in particular, ST<op> is not
> ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics.
> 
> The relevant Arm Arm text is:
> 
>   If the destination register is not one of WZR or XZR, LDADDA and
>   LDADDAL load from memory with acquire semantics

You're quite right.  I must have glossed over that clause when looking at this
before.  I'll make sure there's a temp register to clobber for v2.


r~

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-30 21:42   ` James Greenhalgh
  2018-10-31 11:13     ` Richard Henderson
@ 2018-10-31 11:25     ` Richard Henderson
  2018-10-31 15:49       ` Will Deacon
  1 sibling, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-31 11:25 UTC (permalink / raw)
  To: James Greenhalgh
  Cc: gcc-patches, Ramana Radhakrishnan, agraf, Marcus Shawcroft,
	Richard Earnshaw, nd, will.deacon

[-- Attachment #1: Type: text/plain, Size: 1910 bytes --]

On 10/30/18 8:32 PM, James Greenhalgh wrote:
> On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote:
>> When the result of an operation is not used, we can ignore the
>> result by storing to XZR.  For two of the memory models, using
>> XZR with LD<op> has a preferred assembler alias, ST<op>.
> 
> ST<op> has different semantics to LD<op>, in particular, ST<op> is not
> ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics.
> 
> The relevant Arm Arm text is:
> 
>   If the destination register is not one of WZR or XZR, LDADDA and
>   LDADDAL load from memory with acquire semantics
> 
>   LDADDL and LDADDAL store to memory with release semantics.
> 
>   LDADD has no memory ordering requirements.
> 
> I'm taking this to mean that even if the result is unused, using XZR is not
> a valid transformation; it weakens the expected acquire semantics to
> unordered.
> 
> The example I have from Will Deacon on an internal bug database is:
> 
>   P0 (atomic_int* y,atomic_int* x) {
>     atomic_store_explicit(x,1,memory_order_relaxed);
>     atomic_thread_fence(memory_order_release);
>     atomic_store_explicit(y,1,memory_order_relaxed);
>   }
> 
>   P1 (atomic_int* y,atomic_int* x) {
>     int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed);
>     atomic_thread_fence(memory_order_acquire);
>     int r1 = atomic_load_explicit(x,memory_order_relaxed);
>   }
> 
>   The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal.
> 
> This example comes from a while back in my memory; so copying Will for
> any more detailed questions.
> 
> My impression is that this transformation is not safe, and so the patch is
> not OK.

Here's a revised patch.

Use ST<op> for relaxed and release orderings, and retain the (non-xzr)
scratch register for other orderings.  But the scratch need not be
early-clobber, since there's no mid-point of half-consumed inputs.
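
At the source level, a rough sketch of which form each case should now
get (function names are made up; this assumes a discarded result and
LSE enabled, and the instruction named in each comment is only what the
pattern below is expected to pick):

  void add_relaxed (int *p)
  {
    /* Result unused, relaxed order: expected to become STADD.  */
    (void) __atomic_fetch_add (p, 1, __ATOMIC_RELAXED);
  }

  void add_release (int *p)
  {
    /* Result unused, release order: expected to become STADDL.  */
    (void) __atomic_fetch_add (p, 1, __ATOMIC_RELEASE);
  }

  void add_acquire (int *p)
  {
    /* Acquire or stronger: still LDADDA{L} into the (plain) scratch.  */
    (void) __atomic_fetch_add (p, 1, __ATOMIC_ACQUIRE);
  }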


r~


[-- Attachment #2: z --]
[-- Type: text/plain, Size: 8039 bytes --]

From 31a9f2fa6b73ecbca39c8d0b1a09f269a9d67f8d Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Wed, 19 Sep 2018 22:18:09 +0000
Subject: aarch64: Emit LSE st<op> instructions

	* config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse):
	Use ST<op> for relaxed and release models; the scratch register need
	not be early-clobber.

	* gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}.
	* gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 2198649b1be..00536d34dfd 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -270,14 +270,14 @@
 	   (match_operand:ALLI 1 "register_operand" "r")
 	   (match_operand:SI 2 "const_int_operand")]
       ATOMIC_LDOP))
-   (clobber (match_scratch:ALLI 3 "=&r"))]
+   (clobber (match_scratch:ALLI 3 "=r"))]
   "TARGET_LSE"
   {
    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
    if (is_mm_relaxed (model))
-     return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0";
    else if (is_mm_release (model))
-     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0";
    else if (is_mm_acquire (model) || is_mm_consume (model))
      return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0";
    else
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
index 4b2282c6861..db2206186b4 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
@@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN)
 TEST (sub_load, SUB_LOAD)
 TEST (sub_load_notreturn, SUB_LOAD_NORETURN)
 
-/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */
+/* { dg-final { scan-assembler-times "staddb\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlb\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */
+/* { dg-final { scan-assembler-times "staddh\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlh\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldadd\t" 32} } */
+/* { dg-final { scan-assembler-times "ldadd\t" 16} } */
 /* { dg-final { scan-assembler-times "ldadda\t" 64} } */
-/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */
+/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */
 /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */
+/* { dg-final { scan-assembler-times "stadd\t" 16} } */
+/* { dg-final { scan-assembler-times "staddl\t" 16} } */
 
 /* { dg-final { scan-assembler-not "ldaxr\t" } } */
 /* { dg-final { scan-assembler-not "stlxr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
index 4879d52b9b4..b8a53e0a676 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
@@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN)
 
 /* Load-OR.  */
 
-/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */
+/* { dg-final { scan-assembler-times "stsetb\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldseth\t" 8} } */
+/* { dg-final { scan-assembler-times "ldseth\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */
+/* { dg-final { scan-assembler-times "stseth\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldset\t" 16} } */
+/* { dg-final { scan-assembler-times "ldset\t" 8} } */
 /* { dg-final { scan-assembler-times "ldseta\t" 32} } */
-/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */
+/* { dg-final { scan-assembler-times "stset\t" 8} } */
+/* { dg-final { scan-assembler-times "stsetl\t" 8} } */
 
 /* Load-AND.  */
 
-/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrb\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclrah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclralh\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrh\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldclr\t" 16} */
+/* { dg-final { scan-assembler-times "ldclr\t" 8} */
 /* { dg-final { scan-assembler-times "ldclra\t" 32} } */
-/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldclral\t" 32} } */
+/* { dg-final { scan-assembler-times "stclr\t" 8} */
+/* { dg-final { scan-assembler-times "stclrl\t" 8} } */
 
 /* Load-XOR.  */
 
-/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeorab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */
+/* { dg-final { scan-assembler-times "steorb\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeorah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */
+/* { dg-final { scan-assembler-times "steorh\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldeor\t" 16} */
+/* { dg-final { scan-assembler-times "ldeor\t" 8} */
 /* { dg-final { scan-assembler-times "ldeora\t" 32} } */
-/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldeoral\t" 32} } */
+/* { dg-final { scan-assembler-times "steor\t" 8} */
+/* { dg-final { scan-assembler-times "steorl\t" 8} } */
 
 /* { dg-final { scan-assembler-not "ldaxr\t" } } */
 /* { dg-final { scan-assembler-not "stlxr\t" } } */

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors
  2018-10-02 16:19 ` [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors Richard Henderson
  2018-10-30 21:46   ` James Greenhalgh
@ 2018-10-31 11:29   ` Richard Henderson
  2018-10-31 17:46   ` Eric Botcazou
  2 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-31 11:29 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jeff Law, Eric Botcazou

Ping for rtl, middle-end, or global reviewers.


r~

On 10/2/18 5:19 PM, Richard Henderson wrote:
> 	* optabs-libfuncs.c (build_libfunc_function_visibility):
> 	New, split out from...
> 	(build_libfunc_function): ... here.
> 	(init_one_libfunc_visibility): New, split out from ...
> 	(init_one_libfunc): ... here.
> ---
>  gcc/optabs-libfuncs.h |  2 ++
>  gcc/optabs-libfuncs.c | 26 ++++++++++++++++++++------
>  2 files changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/optabs-libfuncs.h b/gcc/optabs-libfuncs.h
> index 0669ea1fdd7..cf39da36887 100644
> --- a/gcc/optabs-libfuncs.h
> +++ b/gcc/optabs-libfuncs.h
> @@ -63,7 +63,9 @@ void gen_satfract_conv_libfunc (convert_optab, const char *,
>  void gen_satfractuns_conv_libfunc (convert_optab, const char *,
>  				   machine_mode, machine_mode);
>  
> +tree build_libfunc_function_visibility (const char *, symbol_visibility);
>  tree build_libfunc_function (const char *);
> +rtx init_one_libfunc_visibility (const char *, symbol_visibility);
>  rtx init_one_libfunc (const char *);
>  rtx set_user_assembler_libfunc (const char *, const char *);
>  
> diff --git a/gcc/optabs-libfuncs.c b/gcc/optabs-libfuncs.c
> index bd0df8baa37..73a28e9ca7a 100644
> --- a/gcc/optabs-libfuncs.c
> +++ b/gcc/optabs-libfuncs.c
> @@ -719,10 +719,10 @@ struct libfunc_decl_hasher : ggc_ptr_hash<tree_node>
>  /* A table of previously-created libfuncs, hashed by name.  */
>  static GTY (()) hash_table<libfunc_decl_hasher> *libfunc_decls;
>  
> -/* Build a decl for a libfunc named NAME.  */
> +/* Build a decl for a libfunc named NAME with visibility VIS.  */
>  
>  tree
> -build_libfunc_function (const char *name)
> +build_libfunc_function_visibility (const char *name, symbol_visibility vis)
>  {
>    /* ??? We don't have any type information; pretend this is "int foo ()".  */
>    tree decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
> @@ -731,7 +731,7 @@ build_libfunc_function (const char *name)
>    DECL_EXTERNAL (decl) = 1;
>    TREE_PUBLIC (decl) = 1;
>    DECL_ARTIFICIAL (decl) = 1;
> -  DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT;
> +  DECL_VISIBILITY (decl) = vis;
>    DECL_VISIBILITY_SPECIFIED (decl) = 1;
>    gcc_assert (DECL_ASSEMBLER_NAME (decl));
>  
> @@ -742,11 +742,19 @@ build_libfunc_function (const char *name)
>    return decl;
>  }
>  
> +/* Build a decl for a libfunc named NAME.  */
> +
> +tree
> +build_libfunc_function (const char *name)
> +{
> +  return build_libfunc_function_visibility (name, VISIBILITY_DEFAULT);
> +}
> +
>  /* Return a libfunc for NAME, creating one if we don't already have one.
> -   The returned rtx is a SYMBOL_REF.  */
> +   The decl is given visibility VIS.  The returned rtx is a SYMBOL_REF.  */
>  
>  rtx
> -init_one_libfunc (const char *name)
> +init_one_libfunc_visibility (const char *name, symbol_visibility vis)
>  {
>    tree id, decl;
>    hashval_t hash;
> @@ -763,12 +771,18 @@ init_one_libfunc (const char *name)
>      {
>        /* Create a new decl, so that it can be passed to
>  	 targetm.encode_section_info.  */
> -      decl = build_libfunc_function (name);
> +      decl = build_libfunc_function_visibility (name, vis);
>        *slot = decl;
>      }
>    return XEXP (DECL_RTL (decl), 0);
>  }
>  
> +rtx
> +init_one_libfunc (const char *name)
> +{
> +  return init_one_libfunc_visibility (name, VISIBILITY_DEFAULT);
> +}
> +
>  /* Adjust the assembler name of libfunc NAME to ASMSPEC.  */
>  
>  rtx
> 

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-31 11:25     ` Richard Henderson
@ 2018-10-31 15:49       ` Will Deacon
  2018-10-31 17:35         ` Richard Henderson
  0 siblings, 1 reply; 32+ messages in thread
From: Will Deacon @ 2018-10-31 15:49 UTC (permalink / raw)
  To: Richard Henderson
  Cc: James Greenhalgh, gcc-patches, Ramana Radhakrishnan, agraf,
	Marcus Shawcroft, Richard Earnshaw, nd

Hi Richard,

On Wed, Oct 31, 2018 at 10:27:59AM +0000, Richard Henderson wrote:
> On 10/30/18 8:32 PM, James Greenhalgh wrote:
> > On Tue, Oct 02, 2018 at 11:19:09AM -0500, Richard Henderson wrote:
> >> When the result of an operation is not used, we can ignore the
> >> result by storing to XZR.  For two of the memory models, using
> >> XZR with LD<op> has a preferred assembler alias, ST<op>.
> > 
> > ST<op> has different semantics to LD<op>, in particular, ST<op> is not
> > ordered by a DMB LD; so this could weaken the LDADD and break C11 semantics.
> > 
> > The relevant Arm Arm text is:
> > 
> >   If the destination register is not one of WZR or XZR, LDADDA and
> >   LDADDAL load from memory with acquire semantics
> > 
> >   LDADDL and LDADDAL store to memory with release semantics.
> > 
> >   LDADD has no memory ordering requirements.
> > 
> > I'm taking this to mean that even if the result is unused, using XZR is not
> > a valid transformation; it weakens the expected acquire semantics to
> > unordered.
> > 
> > The example I have from Will Deacon on an internal bug database is:
> > 
> >   P0 (atomic_int* y,atomic_int* x) {
> >     atomic_store_explicit(x,1,memory_order_relaxed);
> >     atomic_thread_fence(memory_order_release);
> >     atomic_store_explicit(y,1,memory_order_relaxed);
> >   }
> > 
> >   P1 (atomic_int* y,atomic_int* x) {
> >     int r0 = atomic_fetch_add_explicit(y,1,memory_order_relaxed);
> >     atomic_thread_fence(memory_order_acquire);
> >     int r1 = atomic_load_explicit(x,memory_order_relaxed);
> >   }
> > 
> >   The outcome where y == 2 and P1 has r0 = 1 and r1 = 0 is illegal.
> > 
> > This example comes from a while back in my memory; so copying Will for
> > any more detailed questions.
> > 
> > My impression is that this transformation is not safe, and so the patch is
> > not OK.
> 
> Here's a revised patch.
> 
> Use ST<op> for relaxed and release orderings, retain the (non-xzr) scratch
> register for other orderings.  But the scratch need not be early-clobber, since
> there's no mid-point of half-consumed inputs.

The example test above uses relaxed atomics in conjunction with an acquire
fence, so I don't think we can actually use ST<op> at all without a change
to the language specification. I previously allocated P0861 for this purpose
but never got a chance to write it up...

Perhaps the issue is a bit clearer with an additional thread (not often I
say that!):


P0 (atomic_int* y,atomic_int* x) {
  atomic_store_explicit(x,1,memory_order_relaxed);
  atomic_thread_fence(memory_order_release);
  atomic_store_explicit(y,1,memory_order_relaxed);
}

P1 (atomic_int* y,atomic_int* x) {
  atomic_fetch_add_explicit(y,1,memory_order_relaxed);	// STADD
  atomic_thread_fence(memory_order_acquire);
  int r0 = atomic_load_explicit(x,memory_order_relaxed);
}

P2 (atomic_int* y) {
  int r1 = atomic_load_explicit(y,memory_order_relaxed);
}


My understanding is that it is forbidden for r0 == 0 and r1 == 2 after
this test has executed. However, if the relaxed add in P1 compiles to
STADD and the subsequent acquire fence is compiled as DMB LD, then we
don't have any ordering guarantees in P1 and the forbidden result could
be observed.
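
For concreteness, here is P1 again using the builtins the compiler
actually sees (just a sketch; names are illustrative, and the comments
assume the usual mapping of an acquire fence to DMB ISHLD):

  #include <stdatomic.h>

  void p1 (atomic_int *y, atomic_int *x, int *r0)
  {
    /* Unused result, relaxed order: with the patch this may become
       STADD, which does not count as a read for DMB LD purposes.  */
    (void) atomic_fetch_add_explicit (y, 1, memory_order_relaxed);

    /* Becomes DMB ISHLD, which only orders prior reads.  */
    atomic_thread_fence (memory_order_acquire);

    /* Hence nothing orders the add before this load.  */
    *r0 = atomic_load_explicit (x, memory_order_relaxed);
  }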

Will

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-31 15:49       ` Will Deacon
@ 2018-10-31 17:35         ` Richard Henderson
  2018-10-31 19:42           ` Will Deacon
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-31 17:35 UTC (permalink / raw)
  To: Will Deacon
  Cc: James Greenhalgh, gcc-patches, Ramana Radhakrishnan, agraf,
	Marcus Shawcroft, Richard Earnshaw, nd

On 10/31/18 3:04 PM, Will Deacon wrote:
> The example test above uses relaxed atomics in conjunction with an acquire
> fence, so I don't think we can actually use ST<op> at all without a change
> to the language specification. I previously allocated P0861 for this purpose
> but never got a chance to write it up...
> 
> Perhaps the issue is a bit clearer with an additional thread (not often I
> say that!):
> 
> 
> P0 (atomic_int* y,atomic_int* x) {
>   atomic_store_explicit(x,1,memory_order_relaxed);
>   atomic_thread_fence(memory_order_release);
>   atomic_store_explicit(y,1,memory_order_relaxed);
> }
> 
> P1 (atomic_int* y,atomic_int* x) {
>   atomic_fetch_add_explicit(y,1,memory_order_relaxed);	// STADD
>   atomic_thread_fence(memory_order_acquire);
>   int r0 = atomic_load_explicit(x,memory_order_relaxed);
> }
> 
> P2 (atomic_int* y) {
>   int r1 = atomic_load_explicit(y,memory_order_relaxed);
> }
> 
> 
> My understanding is that it is forbidden for r0 == 0 and r1 == 2 after
> this test has executed. However, if the relaxed add in P1 compiles to
> STADD and the subsequent acquire fence is compiled as DMB LD, then we
> don't have any ordering guarantees in P1 and the forbidden result could
> be observed.

I suppose I don't understand exactly what you're saying.

I can see that, yes, if you split the fetch-add from the acquire in P1 you get
the incorrect results you describe.  But isn't that a bug in the test itself?
Why would not the only correct version have

P1 (atomic_int* y, atomic_int* x) {
  atomic_fetch_add_explicit(y, 1, memory_order_acquire);
  int r0 = atomic_load_explicit(x, memory_order_relaxed);
}

at which point we won't use STADD for the fetch-add, but LDADDA.

If the problem is more fundamental than this, would you have another go at
explaining?  In particular, I don't see the difference between

	ldadd	val, scratch, [base]
  vs
	stadd	val, [base]

and

	ldaddl	val, scratch, [base]
  vs
	staddl	val, [base]

where both pairs of instructions have the same memory ordering semantics.
Currently we are always producing the ld version of each pair.


r~

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors
  2018-10-02 16:19 ` [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors Richard Henderson
  2018-10-30 21:46   ` James Greenhalgh
  2018-10-31 11:29   ` Richard Henderson
@ 2018-10-31 17:46   ` Eric Botcazou
  2018-10-31 19:04     ` Richard Henderson
  2 siblings, 1 reply; 32+ messages in thread
From: Eric Botcazou @ 2018-10-31 17:46 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, ramana.radhakrishnan, agraf, marcus.shawcroft,
	james.greenhalgh, richard.earnshaw

> 	* optabs-libfuncs.c (build_libfunc_function_visibility):
> 	New, split out from...
> 	(build_libfunc_function): ... here.
> 	(init_one_libfunc_visibility): New, split out from ...
> 	(init_one_libfunc): ... here.

Either that or add the parameter with a VISIBILITY_DEFAULT default argument.
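
I.e. roughly (a sketch of the declarations only):

  tree build_libfunc_function (const char *,
                               symbol_visibility = VISIBILITY_DEFAULT);
  rtx init_one_libfunc (const char *,
                        symbol_visibility = VISIBILITY_DEFAULT);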

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors
  2018-10-31 17:46   ` Eric Botcazou
@ 2018-10-31 19:04     ` Richard Henderson
  2018-10-31 19:43       ` Eric Botcazou
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Henderson @ 2018-10-31 19:04 UTC (permalink / raw)
  To: Eric Botcazou
  Cc: gcc-patches, ramana.radhakrishnan, agraf, marcus.shawcroft,
	james.greenhalgh, richard.earnshaw

On 10/31/18 5:32 PM, Eric Botcazou wrote:
>> 	* optabs-libfuncs.c (build_libfunc_function_visibility):
>> 	New, split out from...
>> 	(build_libfunc_function): ... here.
>> 	(init_one_libfunc_visibility): New, split out from ...
>> 	(init_one_libfunc): ... here.
> 
> Either that or add the parameter with a VISIBILITY_DEFAULT default argument.

I thought of that, but thought this was slightly clearer from usage.  I'm open
to the default option if you prefer.


r~

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-31 17:35         ` Richard Henderson
@ 2018-10-31 19:42           ` Will Deacon
  2018-10-31 22:39             ` Richard Henderson
  0 siblings, 1 reply; 32+ messages in thread
From: Will Deacon @ 2018-10-31 19:42 UTC (permalink / raw)
  To: Richard Henderson
  Cc: James Greenhalgh, gcc-patches, Ramana Radhakrishnan, agraf,
	Marcus Shawcroft, Richard Earnshaw, nd

On Wed, Oct 31, 2018 at 04:38:53PM +0000, Richard Henderson wrote:
> On 10/31/18 3:04 PM, Will Deacon wrote:
> > The example test above uses relaxed atomics in conjunction with an acquire
> > fence, so I don't think we can actually use ST<op> at all without a change
> > to the language specification. I previously allocated P0861 for this purpose
> > but never got a chance to write it up...
> > 
> > Perhaps the issue is a bit clearer with an additional thread (not often I
> > say that!):
> > 
> > 
> > P0 (atomic_int* y,atomic_int* x) {
> >   atomic_store_explicit(x,1,memory_order_relaxed);
> >   atomic_thread_fence(memory_order_release);
> >   atomic_store_explicit(y,1,memory_order_relaxed);
> > }
> > 
> > P1 (atomic_int* y,atomic_int* x) {
> >   atomic_fetch_add_explicit(y,1,memory_order_relaxed);	// STADD
> >   atomic_thread_fence(memory_order_acquire);
> >   int r0 = atomic_load_explicit(x,memory_order_relaxed);
> > }
> > 
> > P2 (atomic_int* y) {
> >   int r1 = atomic_load_explicit(y,memory_order_relaxed);
> > }
> > 
> > 
> > My understanding is that it is forbidden for r0 == 0 and r1 == 2 after
> > this test has executed. However, if the relaxed add in P1 compiles to
> > STADD and the subsequent acquire fence is compiled as DMB LD, then we
> > don't have any ordering guarantees in P1 and the forbidden result could
> > be observed.
> 
> I suppose I don't understand exactly what you're saying.

Apologies, I'm probably not explaining things very well. I'm trying to
avoid getting into the C11 memory model relations if I can help it, hence
the example.

> I can see that, yes, if you split the fetch-add from the acquire in P1 you get
> the incorrect results you describe.  But isn't that a bug in the test itself?

Per the C11 memory model, the test above is well-defined and if r1 == 2
then it is required that r0 == 1. With your proposal, this is not guaranteed
for AArch64, and it would be possible to end up with r1 == 2 and r0 == 0.

> Why would not the only correct version have
> 
> P1 (atomic_int* y, atomic_int* x) {
>   atomic_fetch_add_explicit(y, 1, memory_order_acquire);
>   int r0 = atomic_load_explicit(x, memory_order_relaxed);
> }
> 
> at which point we won't use STADD for the fetch-add, but LDADDA.

That would indeed work correctly, but the problem is that the C11 memory
model doesn't rule out the previous test: it remains portable, well-defined
code.

> If the problem is more fundamental than this, would you have another go at
> explaining?  In particular, I don't see the difference between
> 
> 	ldadd	val, scratch, [base]
>   vs
> 	stadd	val, [base]
> 
> and
> 
> 	ldaddl	val, scratch, [base]
>   vs
> 	staddl	val, [base]
> 
> where both pairs of instructions have the same memory ordering semantics.
> Currently we are always producing the ld version of each pair.

Aha, maybe this is the problem. An acquire fence on AArch64 is implemented
using a DMB LD instruction, which orders prior reads against subsequent
reads and writes. However, the architecture says:

  | The ST<OP> instructions, and LD<OP> instructions where the destination
  | register is WZR or XZR, are not regarded as doing a read for the purpose
  | of a DMB LD barrier.

and therefore an ST atomic is not affected by a subsequent acquire fence,
whereas an LD atomic is.
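
To make that concrete, the two shapes of P1 would look roughly like this
(a sketch in the style of your earlier examples, not actual compiler output):

	stadd	val, [y]		// relaxed fetch-add as an ST atomic
	dmb	ld			// acquire fence: orders prior reads only
	ldr	r0, [x]			// NOT ordered after the STADD, because the
					// STADD does not count as a read
  vs
	ldadd	val, scratch, [y]	// scratch is a real register, not XZR
	dmb	ld
	ldr	r0, [x]			// ordered after the read half of the LDADD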

Does that help at all?

Will

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors
  2018-10-31 19:04     ` Richard Henderson
@ 2018-10-31 19:43       ` Eric Botcazou
  0 siblings, 0 replies; 32+ messages in thread
From: Eric Botcazou @ 2018-10-31 19:43 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, ramana.radhakrishnan, agraf, marcus.shawcroft,
	james.greenhalgh, richard.earnshaw

> I thought of that, but thought this was slightly clearer from usage.  I'm
> open to the default option if you prefer.

No, either is fine as far as I'm concerned.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions
  2018-10-31 19:42           ` Will Deacon
@ 2018-10-31 22:39             ` Richard Henderson
  0 siblings, 0 replies; 32+ messages in thread
From: Richard Henderson @ 2018-10-31 22:39 UTC (permalink / raw)
  To: Will Deacon
  Cc: James Greenhalgh, gcc-patches, Ramana Radhakrishnan, agraf,
	Marcus Shawcroft, Richard Earnshaw, nd

[-- Attachment #1: Type: text/plain, Size: 640 bytes --]

On 10/31/18 5:51 PM, Will Deacon wrote:
> Aha, maybe this is the problem. An acquire fence on AArch64 is implemented
> using a DMB LD instruction, which orders prior reads against subsequent
> reads and writes. However, the architecture says:
> 
>   | The ST<OP> instructions, and LD<OP> instructions where the destination
>   | register is WZR or XZR, are not regarded as doing a read for the purpose
>   | of a DMB LD barrier.
> 
> and so therefore an ST atomic is not affected by a subsequent acquire fence,
> whereas an LD atomic is.
> 
> Does that help at all?

Yes it does, thanks.  Lest this come up again, let's document this.


r~

[-- Attachment #2: 0001-aarch64-Remove-early-clobber-from-ATOMIC_LDOP-scratc.patch --]
[-- Type: text/x-patch, Size: 1751 bytes --]

From 804f690fc8ebaa436b97ea4c9fef830f9cd2b873 Mon Sep 17 00:00:00 2001
From: Richard Henderson <richard.henderson@linaro.org>
Date: Wed, 19 Sep 2018 22:18:09 +0000
Subject: [PATCH] aarch64: Remove early clobber from ATOMIC_LDOP scratch

	* config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse):
	The scratch register need not be early-clobber.  Document the reason
	why we cannot use ST<OP>.
---
 gcc/config/aarch64/atomics.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 2198649b1be..8944b5690b5 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -263,6 +263,18 @@
   }
 )
 
+;; It is tempting to use ST<OP> here for relaxed and release memory
+;; models.  However, that is incompatible with the C++
+;; memory model for the following case:
+;;
+;;	atomic_fetch_add(ptr, 1, memory_order_relaxed);
+;;	atomic_thread_fence(memory_order_acquire);
+;;
+;; The problem is that the architecture says that ST<OP> (and LD<OP>
+;; insns where the destination is XZR) are not regarded as a read.
+;; However we also implement the acquire memory barrier with DMB LD,
+;; and so the ST<OP> is not blocked by the barrier.
+
 (define_insn "aarch64_atomic_<atomic_ldoptab><mode>_lse"
   [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
 	(unspec_volatile:ALLI
@@ -270,7 +282,7 @@
 	   (match_operand:ALLI 1 "register_operand" "r")
 	   (match_operand:SI 2 "const_int_operand")]
       ATOMIC_LDOP))
-   (clobber (match_scratch:ALLI 3 "=&r"))]
+   (clobber (match_scratch:ALLI 3 "=r"))]
   "TARGET_LSE"
   {
    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
-- 
2.17.2


^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2018-10-31 21:55 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-02 16:19 [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
2018-10-02 16:19 ` [PATCH, AArch64 v2 11/11] Enable -matomic-ool by default Richard Henderson
2018-10-02 16:19 ` [PATCH, AArch64 v2 02/11] aarch64: Improve cas generation Richard Henderson
2018-10-30 20:37   ` James Greenhalgh
2018-10-02 16:19 ` [PATCH, AArch64 v2 07/11] aarch64: Add out-of-line functions for LSE atomics Richard Henderson
2018-10-02 16:19 ` [PATCH, AArch64 v2 01/11] aarch64: Simplify LSE cas generation Richard Henderson
2018-10-30 20:32   ` James Greenhalgh
2018-10-31 10:35     ` Richard Henderson
2018-10-02 16:19 ` [PATCH, AArch64 v2 04/11] aarch64: Improve atomic-op lse generation Richard Henderson
2018-10-30 21:40   ` James Greenhalgh
2018-10-02 16:19 ` [PATCH, AArch64 v2 09/11] aarch64: Force TImode values into even registers Richard Henderson
2018-10-30 21:47   ` James Greenhalgh
2018-10-02 16:19 ` [PATCH, AArch64 v2 03/11] aarch64: Improve swp generation Richard Henderson
2018-10-30 20:50   ` James Greenhalgh
2018-10-02 16:19 ` [PATCH, AArch64 v2 08/11] aarch64: Implement -matomic-ool Richard Henderson
2018-10-02 16:19 ` [PATCH, AArch64 v2 06/11] Add visibility to libfunc constructors Richard Henderson
2018-10-30 21:46   ` James Greenhalgh
2018-10-31 11:29   ` Richard Henderson
2018-10-31 17:46   ` Eric Botcazou
2018-10-31 19:04     ` Richard Henderson
2018-10-31 19:43       ` Eric Botcazou
2018-10-02 16:36 ` [PATCH, AArch64 v2 10/11] aarch64: Implement TImode compare-and-swap Richard Henderson
2018-10-02 16:37 ` [PATCH, AArch64 v2 05/11] aarch64: Emit LSE st<op> instructions Richard Henderson
2018-10-30 21:42   ` James Greenhalgh
2018-10-31 11:13     ` Richard Henderson
2018-10-31 11:25     ` Richard Henderson
2018-10-31 15:49       ` Will Deacon
2018-10-31 17:35         ` Richard Henderson
2018-10-31 19:42           ` Will Deacon
2018-10-31 22:39             ` Richard Henderson
2018-10-11 17:44 ` [PATCH, AArch64 v2 00/11] LSE atomics out-of-line Richard Henderson
2018-10-22 14:01   ` Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).