public inbox for gcc-patches@gcc.gnu.org
* [PATCH, AArch64 09/11] aarch64: Implement -matomic-ool
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 04/11] aarch64: Improve atomic-op lse generation rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 11/11] Enable -matomic-ool by default rth7680
                   ` (11 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

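With -matomic-ool, the atomic expanders emit direct calls to the hidden
__aa64_* helpers from the previous patch instead of inline LDXR/STXR
loops.  As a sketch of the intended effect (the exact helper name
follows the size and memory-model tables below), a sequentially
consistent fetch-and-add such as

    int fetch_add (int *p, int v)
    {
      return __atomic_fetch_add (p, v, __ATOMIC_SEQ_CST);
    }

becomes, when LSE instructions are not enabled at compile time, a call
to __aa64_ldadd4_acq_rel (v, p).
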
	* config/aarch64/aarch64.opt (-matomic-ool): New.
	* config/aarch64/aarch64.c (aarch64_atomic_ool_func): New.
	(aarch64_ool_cas_names, aarch64_ool_swp_names): New.
	(aarch64_ool_ldadd_names, aarch64_ool_ldset_names): New.
	(aarch64_ool_ldclr_names, aarch64_ool_ldeor_names): New.
	(aarch64_ool_stadd_names, aarch64_ool_stset_names): New.
	(aarch64_ool_stclr_names, aarch64_ool_steor_names): New.
	(aarch64_expand_compare_and_swap): Honor TARGET_ATOMIC_OOL.
	* config/aarch64/atomics.md (atomic_exchange<ALLI>): Likewise.
	(atomic_<atomic_op><ALLI>): Likewise.
	(atomic_fetch_<atomic_op><ALLI>): Likewise.
	(atomic_<atomic_op>_fetch<ALLI>): Likewise.
---
 gcc/config/aarch64/aarch64-protos.h           | 17 ++++
 gcc/config/aarch64/aarch64.c                  | 95 +++++++++++++++++++
 .../atomic-comp-swap-release-acquire.c        |  2 +-
 .../gcc.target/aarch64/atomic-op-acq_rel.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-acquire.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-char.c       |  2 +-
 .../gcc.target/aarch64/atomic-op-consume.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-imm.c        |  2 +-
 .../gcc.target/aarch64/atomic-op-int.c        |  2 +-
 .../gcc.target/aarch64/atomic-op-long.c       |  2 +-
 .../gcc.target/aarch64/atomic-op-relaxed.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-release.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-seq_cst.c    |  2 +-
 .../gcc.target/aarch64/atomic-op-short.c      |  2 +-
 .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |  2 +-
 .../atomic_cmp_exchange_zero_strong_1.c       |  2 +-
 .../gcc.target/aarch64/sync-comp-swap.c       |  2 +-
 .../gcc.target/aarch64/sync-op-acquire.c      |  2 +-
 .../gcc.target/aarch64/sync-op-full.c         |  2 +-
 gcc/config/aarch64/aarch64.opt                |  4 +
 gcc/config/aarch64/atomics.md                 | 94 ++++++++++++++++--
 gcc/doc/invoke.texi                           | 14 ++-
 22 files changed, 232 insertions(+), 26 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1d2f8487d1a..c7b96b12bbe 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -624,4 +624,21 @@ poly_uint64 aarch64_regmode_natural_size (machine_mode);
 
 bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT);
 
+struct atomic_ool_names
+{
+    const char *str[4][4];
+};
+
+rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+			    const atomic_ool_names *names);
+extern const atomic_ool_names aarch64_ool_swp_names;
+extern const atomic_ool_names aarch64_ool_stadd_names;
+extern const atomic_ool_names aarch64_ool_stset_names;
+extern const atomic_ool_names aarch64_ool_stclr_names;
+extern const atomic_ool_names aarch64_ool_steor_names;
+extern const atomic_ool_names aarch64_ool_ldadd_names;
+extern const atomic_ool_names aarch64_ool_ldset_names;
+extern const atomic_ool_names aarch64_ool_ldclr_names;
+extern const atomic_ool_names aarch64_ool_ldeor_names;
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1e00fdc801c..78b30d68884 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14160,6 +14160,90 @@ aarch64_emit_unlikely_jump (rtx insn)
   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }
 
+/* We store the names of the various atomic helpers in a 4x4 array.
+   Return the libcall function given MODE, MODEL and NAMES.  */
+
+rtx
+aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
+			const atomic_ool_names *names)
+{
+  memmodel model = memmodel_base (INTVAL (model_rtx));
+  int mode_idx, model_idx;
+
+  switch (mode)
+    {
+    case E_QImode:
+      mode_idx = 0;
+      break;
+    case E_HImode:
+      mode_idx = 1;
+      break;
+    case E_SImode:
+      mode_idx = 2;
+      break;
+    case E_DImode:
+      mode_idx = 3;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  switch (model)
+    {
+    case MEMMODEL_RELAXED:
+      model_idx = 0;
+      break;
+    case MEMMODEL_CONSUME:
+    case MEMMODEL_ACQUIRE:
+      model_idx = 1;
+      break;
+    case MEMMODEL_RELEASE:
+      model_idx = 2;
+      break;
+    case MEMMODEL_ACQ_REL:
+    case MEMMODEL_SEQ_CST:
+      model_idx = 3;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
+				      VISIBILITY_HIDDEN);
+}
+
+#define DEF0(B, N) \
+  { "__aa64_" #B #N "_relax", \
+    "__aa64_" #B #N "_acq", \
+    "__aa64_" #B #N "_rel", \
+    "__aa64_" #B #N "_acq_rel" }
+
+#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8)
+
+static const atomic_ool_names aarch64_ool_cas_names = { { DEF4(cas) } };
+const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
+const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
+const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
+const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
+const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
+
+/* Note that the store forms are only available for relax and release
+   memory models.  For the other models, re-use the load forms.  */
+#undef DEF0
+#define DEF0(B, N) \
+  { "__aa64_st" #B #N "_relax", \
+    "__aa64_ld" #B #N "_acq", \
+    "__aa64_st" #B #N "_rel", \
+    "__aa64_ld" #B #N "_acq_rel" }
+
+const atomic_ool_names aarch64_ool_stadd_names = { { DEF4(add) } };
+const atomic_ool_names aarch64_ool_stset_names = { { DEF4(set) } };
+const atomic_ool_names aarch64_ool_stclr_names = { { DEF4(clr) } };
+const atomic_ool_names aarch64_ool_steor_names = { { DEF4(eor) } };
+
+#undef DEF0
+#undef DEF4
+
 /* Expand a compare and swap pattern.  */
 
 void
@@ -14204,6 +14288,17 @@ aarch64_expand_compare_and_swap (rtx operands[])
 						   newval, mod_s));
       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
     }
+  else if (TARGET_ATOMIC_OOL)
+    {
+      /* Oldval must satisfy compare afterward.  */
+      if (!aarch64_plus_operand (oldval, mode))
+	oldval = force_reg (mode, oldval);
+      rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
+      rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
+				      oldval, mode, newval, mode,
+				      XEXP (mem, 0), ptr_mode);
+      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
+    }
   else
     {
       /* The oldval predicate varies by mode.  Test it and force to reg.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c
index 49ca5d0d09c..e92f205c3a8 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-comp-swap-release-acquire.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-atomic-ool" } */
 
 #include "atomic-comp-swap-release-acquire.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c
index 74f26348e42..6965431f7d9 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acq_rel.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-acq_rel.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c
index 66c1b1efe20..07dbca49d56 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-acquire.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-acquire.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c
index c09d0434ecf..73bfbb7afc9 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-char.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-char.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c
index 5783ab84f5c..c7945b3a22d 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-consume.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-consume.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c
index 18b8f0b04e9..e46bb3de7c1 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-imm.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 int v = 0;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c
index 8520f0839ba..9b55deb5225 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-int.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-int.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c
index d011f8c5ce2..2622f75331f 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-long.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 long v = 0;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c
index ed96bfdb978..f118a37a352 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-relaxed.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-relaxed.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c
index fc4be17de89..579634b08e8 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-release.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-release.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c
index 613000fe490..016b0d6619f 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-seq_cst.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-seq_cst.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c
index e82c8118ece..978bd1d8377 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-op-short.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "atomic-op-short.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c
index f2a21ddf2e1..77430ecdbce 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_reg_1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=armv8-a+nolse" } */
+/* { dg-options "-O2 -march=armv8-a+nolse -mno-atomic-ool" } */
 /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */
 
 int
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
index 8d2ae67dfbe..7d58b2f6bd0 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic_cmp_exchange_zero_strong_1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=armv8-a+nolse" } */
+/* { dg-options "-O2 -march=armv8-a+nolse -mno-atomic-ool" } */
 /* { dg-skip-if "" { *-*-* } { "-mcpu=*" } { "" } } */
 
 int
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c
index e571b2f13b3..7fc5885d0fd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c
+++ b/gcc/testsuite/gcc.target/aarch64/sync-comp-swap.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -fno-ipa-icf -mno-atomic-ool" } */
 
 #include "sync-comp-swap.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c
index 357bf1be3b2..6ad0daa8998 100644
--- a/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c
+++ b/gcc/testsuite/gcc.target/aarch64/sync-op-acquire.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "sync-op-acquire.x"
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c
index c6ba1629965..9a7afeb70d3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sync-op-full.c
+++ b/gcc/testsuite/gcc.target/aarch64/sync-op-full.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8-a+nolse -O2" } */
+/* { dg-options "-march=armv8-a+nolse -O2 -mno-atomic-ool" } */
 
 #include "sync-op-full.x"
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index b2e80cbf6f1..83166834165 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -218,3 +218,7 @@ Enables verbose cost model dumping in the debug dump files.
 mtrack-speculation
 Target Var(aarch64_track_speculation)
 Generate code to track when the CPU might be speculating incorrectly.
+
+matomic-ool
+Target Report Mask(ATOMIC_OOL) Save
+Generate local calls to out-of-line atomic operations.
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index f74521f885d..470c85d6ac3 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -140,16 +140,27 @@
   (match_operand:SI 3 "const_int_operand" "")]
   ""
   {
-    rtx (*gen) (rtx, rtx, rtx, rtx);
-
     /* Use an atomic SWP when available.  */
     if (TARGET_LSE)
-      gen = gen_aarch64_atomic_exchange<mode>_lse;
+      {
+	emit_insn (gen_aarch64_atomic_exchange<mode>_lse
+		   (operands[0], operands[1], operands[2], operands[3]));
+      }
+    else if (TARGET_ATOMIC_OOL)
+      {
+	machine_mode mode = <MODE>mode;
+	rtx func = aarch64_atomic_ool_func (mode, operands[3],
+					    &aarch64_ool_swp_names);
+	rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL,
+					    mode, operands[2], mode,
+					    XEXP (operands[1], 0), ptr_mode);
+        emit_move_insn (operands[0], rval);
+      }
     else
-      gen = gen_aarch64_atomic_exchange<mode>;
-
-    emit_insn (gen (operands[0], operands[1], operands[2], operands[3]));
-
+      {
+	emit_insn (gen_aarch64_atomic_exchange<mode>
+		   (operands[0], operands[1], operands[2], operands[3]));
+      }
     DONE;
   }
 )
@@ -234,6 +245,39 @@
 	  }
 	operands[1] = force_reg (<MODE>mode, operands[1]);
       }
+    else if (TARGET_ATOMIC_OOL)
+      {
+        const atomic_ool_names *names;
+	switch (<CODE>)
+	  {
+	  case MINUS:
+	    operands[1] = expand_simple_unop (<MODE>mode, NEG, operands[1],
+					      NULL, 1);
+	    /* fallthru */
+	  case PLUS:
+	    names = &aarch64_ool_stadd_names;
+	    break;
+	  case IOR:
+	    names = &aarch64_ool_stset_names;
+	    break;
+	  case XOR:
+	    names = &aarch64_ool_steor_names;
+	    break;
+	  case AND:
+	    operands[1] = expand_simple_unop (<MODE>mode, NOT, operands[1],
+					      NULL, 1);
+	    names = &aarch64_ool_stclr_names;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
+        machine_mode mode = <MODE>mode;
+	rtx func = aarch64_atomic_ool_func (mode, operands[2], names);
+	emit_library_call_value (func, NULL_RTX, LCT_NORMAL, mode,
+				 operands[1], mode,
+				 XEXP (operands[0], 0), ptr_mode);
+        DONE;
+      }
     else
       gen = gen_aarch64_atomic_<atomic_optab><mode>;
 
@@ -350,6 +394,40 @@
 	}
       operands[2] = force_reg (<MODE>mode, operands[2]);
     }
+  else if (TARGET_ATOMIC_OOL)
+    {
+      const atomic_ool_names *names;
+      switch (<CODE>)
+	{
+	case MINUS:
+	  operands[2] = expand_simple_unop (<MODE>mode, NEG, operands[2],
+					    NULL, 1);
+	  /* fallthru */
+	case PLUS:
+	  names = &aarch64_ool_ldadd_names;
+	  break;
+	case IOR:
+	  names = &aarch64_ool_ldset_names;
+	  break;
+	case XOR:
+	  names = &aarch64_ool_ldeor_names;
+	  break;
+	case AND:
+	  operands[2] = expand_simple_unop (<MODE>mode, NOT, operands[2],
+					    NULL, 1);
+	  names = &aarch64_ool_ldclr_names;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      machine_mode mode = <MODE>mode;
+      rtx func = aarch64_atomic_ool_func (mode, operands[3], names);
+      rtx rval = emit_library_call_value (func, operands[0], LCT_NORMAL, mode,
+					  operands[2], mode,
+					  XEXP (operands[1], 0), ptr_mode);
+      emit_move_insn (operands[0], rval);
+      DONE;
+    }
   else
     gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>;
 
@@ -439,7 +517,7 @@
 {
   /* Use an atomic load-operate instruction when possible.  In this case
      we will re-compute the result from the original mem value. */
-  if (TARGET_LSE)
+  if (TARGET_LSE || TARGET_ATOMIC_OOL)
     {
       rtx tmp = gen_reg_rtx (<MODE>mode);
       operands[2] = force_reg (<MODE>mode, operands[2]);
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 7ef4e7a449b..229a6f06aae 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -621,7 +621,7 @@ Objective-C and Objective-C++ Dialects}.
 -mpc-relative-literal-loads @gol
 -msign-return-address=@var{scope} @gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}  @gol
--moverride=@var{string}  -mverbose-cost-dump -mtrack-speculation} 
+-moverride=@var{string}  -mverbose-cost-dump -mtrack-speculation -matomic-ool} 
 
 @emph{Adapteva Epiphany Options}
 @gccoptlist{-mhalf-reg-file  -mprefer-short-insn-regs @gol
@@ -15039,6 +15039,18 @@ be used by the compiler when expanding calls to
 @code{__builtin_speculation_safe_copy} to permit a more efficient code
 sequence to be generated.
 
+@item -matomic-ool
+@itemx -mno-atomic-ool
+Enable or disable calls to out-of-line helpers to implement atomic operations.
+These helpers will, at runtime, determine if ARMv8.1-Atomics instructions
+should be used; if not, they will use the load/store-exclusive instructions
+that are present in the base ARMv8.0 ISA.
+
+This option is only applicable when compiling for the base ARMv8.0
+instruction set.  If using a later revision, e.g. @option{-march=armv8.1-a}
+or @option{-march=armv8-a+lse}, the ARMv8.1-Atomics instructions will be
+used directly.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (6 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-27 13:08   ` Matthew Malcomson
                     ` (2 more replies)
  2018-09-26  5:04 ` [PATCH, AArch64 01/11] aarch64: Simplify LSE cas generation rth7680
                   ` (5 subsequent siblings)
  13 siblings, 3 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

This pattern will only be used with the __sync functions, because
we do not yet have a bare TImode atomic load.

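For example (a sketch, assuming a suitably aligned 16-byte object), a
__sync compare-and-swap on __int128

    unsigned __int128
    cas16 (unsigned __int128 *ptr, unsigned __int128 cmp,
           unsigned __int128 newval)
    {
      return __sync_val_compare_and_swap (ptr, cmp, newval);
    }

can now expand to CASP when LSE is available, to an LDXP/STXP loop
otherwise, or to the new 16-byte __aa64_cas16_* helpers with
-matomic-ool.
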
	* config/aarch64/aarch64.c (aarch64_gen_compare_reg): Add support
	for NE comparison of TImode values.
	(aarch64_print_operand): Extend %R to handle general registers.
	(aarch64_emit_load_exclusive): Add support for TImode.
	(aarch64_emit_store_exclusive): Likewise.
	(aarch64_atomic_ool_func): Likewise.
	(aarch64_ool_cas_names): Likewise.
	* config/aarch64/atomics.md (@atomic_compare_and_swap<ALLI_TI>):
	Change iterator from ALLI to ALLI_TI.
	(@atomic_compare_and_swap<JUST_TI>): New.
	(@atomic_compare_and_swap<JUST_TI>_lse): New.
	(aarch64_load_exclusive_pair): New.
	(aarch64_store_exclusive_pair): New.
	* config/aarch64/iterators.md (JUST_TI): New.

	* config/aarch64/lse.c (cas): Add support for SIZE == 16.
	* config/aarch64/t-lse (S0, O0): Split out cas.
	(LSE_OBJS): Include $(O0).
---
 gcc/config/aarch64/aarch64-protos.h |  2 +-
 gcc/config/aarch64/aarch64.c        | 72 ++++++++++++++++++-----
 libgcc/config/aarch64/lse.c         | 48 ++++++++++-----
 gcc/config/aarch64/atomics.md       | 91 +++++++++++++++++++++++++++--
 gcc/config/aarch64/iterators.md     |  3 +
 libgcc/config/aarch64/t-lse         | 10 +++-
 6 files changed, 189 insertions(+), 37 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index c7b96b12bbe..f735c4e5ad8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -626,7 +626,7 @@ bool aarch64_high_bits_all_ones_p (HOST_WIDE_INT);
 
 struct atomic_ool_names
 {
-    const char *str[4][4];
+    const char *str[5][4];
 };
 
 rtx aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 78b30d68884..eca47784730 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1606,10 +1606,33 @@ emit_set_insn (rtx x, rtx y)
 rtx
 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 {
-  machine_mode mode = SELECT_CC_MODE (code, x, y);
-  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+  machine_mode cmp_mode = GET_MODE (x);
+  machine_mode cc_mode;
+  rtx cc_reg;
 
-  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+  if (cmp_mode == E_TImode)
+    {
+      gcc_assert (code == NE);
+
+      cc_mode = E_CCmode;
+      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+
+      rtx x_lo = operand_subword (x, 0, 0, TImode);
+      rtx y_lo = operand_subword (y, 0, 0, TImode);
+      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
+
+      rtx x_hi = operand_subword (x, 1, 0, TImode);
+      rtx y_hi = operand_subword (y, 1, 0, TImode);
+      emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
+			     gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
+			     GEN_INT (AARCH64_EQ)));
+    }
+  else
+    {
+      cc_mode = SELECT_CC_MODE (code, x, y);
+      cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+      emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
+    }
   return cc_reg;
 }
 
@@ -6689,7 +6712,7 @@ sizetochar (int size)
      'S/T/U/V':		Print a FP/SIMD register name for a register list.
 			The register printed is the FP/SIMD register name
 			of X + 0/1/2/3 for S/T/U/V.
-     'R':		Print a scalar FP/SIMD register name + 1.
+     'R':		Print a scalar Integer/FP/SIMD register name + 1.
      'X':		Print bottom 16 bits of integer constant in hex.
      'w/x':		Print a general register name or the zero register
 			(32-bit or 64-bit).
@@ -6881,12 +6904,13 @@ aarch64_print_operand (FILE *f, rtx x, int code)
       break;
 
     case 'R':
-      if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
-	{
-	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
-	  return;
-	}
-      asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+      if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
+	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+      else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
+	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
+      else
+	output_operand_lossage ("incompatible register operand for '%%%c'",
+				code);
       break;
 
     case 'X':
@@ -14139,16 +14163,26 @@ static void
 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
 			     rtx mem, rtx model_rtx)
 {
-  emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
+  if (mode == E_TImode)
+    emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
+						gen_highpart (DImode, rval),
+						mem, model_rtx));
+  else
+    emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
 }
 
 /* Emit store exclusive.  */
 
 static void
 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
-			      rtx rval, rtx mem, rtx model_rtx)
+			      rtx mem, rtx val, rtx model_rtx)
 {
-  emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
+  if (mode == E_TImode)
+    emit_insn (gen_aarch64_store_exclusive_pair
+	       (bval, mem, operand_subword (val, 0, 0, TImode),
+		operand_subword (val, 1, 0, TImode), model_rtx));
+  else
+    emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, val, model_rtx));
 }
 
 /* Mark the previous jump instruction as unlikely.  */
@@ -14160,7 +14194,7 @@ aarch64_emit_unlikely_jump (rtx insn)
   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
 }
 
-/* We store the names of the various atomic helpers in a 4x4 array.
+/* We store the names of the various atomic helpers in a 5x4 array.
    Return the libcall function given MODE, MODEL and NAMES.  */
 
 rtx
@@ -14184,6 +14218,9 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
     case E_DImode:
       mode_idx = 3;
       break;
+    case E_TImode:
+      mode_idx = 4;
+      break;
     default:
       gcc_unreachable ();
     }
@@ -14218,9 +14255,11 @@ aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
     "__aa64_" #B #N "_rel", \
     "__aa64_" #B #N "_acq_rel" }
 
-#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8)
+#define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
+		 { NULL, NULL, NULL, NULL }
+#define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
 
-static const atomic_ool_names aarch64_ool_cas_names = { { DEF4(cas) } };
+static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
@@ -14243,6 +14282,7 @@ const atomic_ool_names aarch64_ool_steor_names = { { DEF4(eor) } };
 
 #undef DEF0
 #undef DEF4
+#undef DEF5
 
 /* Expand a compare and swap pattern.  */
 
diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
index 20f4bde741f..8f84820fa78 100644
--- a/libgcc/config/aarch64/lse.c
+++ b/libgcc/config/aarch64/lse.c
@@ -89,6 +89,7 @@ asm(".arch armv8-a+lse");
 #elif SIZE == 4 || SIZE == 8
 # define S     ""
 # define MASK  ""
+#elif SIZE == 16
 #else
 # error
 #endif
@@ -96,9 +97,11 @@ asm(".arch armv8-a+lse");
 #if SIZE < 8
 # define T  unsigned int
 # define W  "w"
-#else
+#elif SIZE == 8
 # define T  unsigned long long
 # define W  ""
+#else
+# define T  unsigned __int128
 #endif
 
 #if MODEL == 1
@@ -136,19 +139,38 @@ T NAME(cas)(T cmp, T new, T *ptr)
   unsigned tmp;
 
   if (have_atomics)
-    __asm__("cas" A L S " %"W"0, %"W"2, %1"
-            : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+    {
+#if SIZE == 16
+      __asm__("casp" A L " %0, %R0, %2, %R2, %1"
+              : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+#else
+      __asm__("cas" A L S " %"W"0, %"W"2, %1"
+              : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+#endif
+    }
   else
-    __asm__(
-	"0: "
-	"ld" A "xr"S" %"W"0, %1\n\t"
-	"cmp %"W"0, %"W"4" MASK "\n\t"
-	"bne 1f\n\t"
-	"st" L "xr"S" %w2, %"W"3, %1\n\t"
-	"cbnz %w2, 0b\n"
-	"1:"
-	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
-
+    {
+#if SIZE == 16
+      __asm__("0: "
+	      "ld" A "xp %0, %R0, %1\n\t"
+	      "cmp %0, %4\n\t"
+	      "ccmp %R0, %R4, #0, eq\n\t"
+	      "bne 1f\n\t"
+	      "st" L "xp %w2, %3, %R3, %1\n\t"
+	      "cbnz %w2, 0b\n"
+	      "1:"
+	      : "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
+#else
+      __asm__("0: "
+	      "ld" A "xr"S" %"W"0, %1\n\t"
+	      "cmp %"W"0, %"W"4" MASK "\n\t"
+	      "bne 1f\n\t"
+	      "st" L "xr"S" %w2, %"W"3, %1\n\t"
+	      "cbnz %w2, 0b\n"
+	      "1:"
+	      : "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
+#endif
+    }
   return old;
 }
 #endif
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 470c85d6ac3..568e4c831d1 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -22,10 +22,10 @@
 
 (define_expand "@atomic_compare_and_swap<mode>"
   [(match_operand:SI 0 "register_operand" "")			;; bool out
-   (match_operand:ALLI 1 "register_operand" "")			;; val out
-   (match_operand:ALLI 2 "aarch64_sync_memory_operand" "")	;; memory
-   (match_operand:ALLI 3 "nonmemory_operand" "")		;; expected
-   (match_operand:ALLI 4 "aarch64_reg_or_zero" "")		;; desired
+   (match_operand:ALLI_TI 1 "register_operand" "")		;; val out
+   (match_operand:ALLI_TI 2 "aarch64_sync_memory_operand" "")	;; memory
+   (match_operand:ALLI_TI 3 "nonmemory_operand" "")		;; expected
+   (match_operand:ALLI_TI 4 "aarch64_reg_or_zero" "")		;; desired
    (match_operand:SI 5 "const_int_operand")			;; is_weak
    (match_operand:SI 6 "const_int_operand")			;; mod_s
    (match_operand:SI 7 "const_int_operand")]			;; mod_f
@@ -88,6 +88,30 @@
   }
 )
 
+(define_insn_and_split "@aarch64_compare_and_swap<mode>"
+  [(set (reg:CC CC_REGNUM)					;; bool out
+    (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
+   (set (match_operand:JUST_TI 0 "register_operand" "=&r")	;; val out
+    (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+   (set (match_dup 1)
+    (unspec_volatile:JUST_TI
+      [(match_operand:JUST_TI 2 "register_operand" "r")		;; expect
+       (match_operand:JUST_TI 3 "aarch64_reg_or_zero" "rZ")	;; desired
+       (match_operand:SI 4 "const_int_operand")			;; is_weak
+       (match_operand:SI 5 "const_int_operand")			;; mod_s
+       (match_operand:SI 6 "const_int_operand")]		;; mod_f
+      UNSPECV_ATOMIC_CMPSW))
+   (clobber (match_scratch:SI 7 "=&r"))]
+  ""
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_compare_and_swap (operands);
+    DONE;
+  }
+)
+
 (define_insn "@aarch64_compare_and_swap<mode>_lse"
   [(set (match_operand:SI 0 "register_operand" "=r")		;; val out
     (zero_extend:SI
@@ -133,6 +157,28 @@
     return "casal<atomic_sfx>\t%<w>0, %<w>3, %1";
 })
 
+(define_insn "@aarch64_compare_and_swap<mode>_lse"
+  [(set (match_operand:JUST_TI 0 "register_operand" "=r")	;; val out
+    (match_operand:JUST_TI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+   (set (match_dup 1)
+    (unspec_volatile:JUST_TI
+      [(match_operand:JUST_TI 2 "register_operand" "0")		;; expect
+       (match_operand:JUST_TI 3 "register_operand" "r")		;; desired
+       (match_operand:SI 4 "const_int_operand")]		;; mod_s
+      UNSPECV_ATOMIC_CMPSW))]
+  "TARGET_LSE"
+{
+  enum memmodel model = memmodel_from_int (INTVAL (operands[4]));
+  if (is_mm_relaxed (model))
+    return "casp\t%0, %R0, %3, %R3, %1";
+  else if (is_mm_acquire (model) || is_mm_consume (model))
+    return "caspa\t%0, %R0, %3, %R3, %1";
+  else if (is_mm_release (model))
+    return "caspl\t%0, %R0, %3, %R3, %1";
+  else
+    return "caspal\t%0, %R0, %3, %R3, %1";
+})
+
 (define_expand "atomic_exchange<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "")
@@ -650,6 +696,24 @@
   }
 )
 
+(define_insn "aarch64_load_exclusive_pair"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(unspec_volatile:DI
+	  [(match_operand:TI 2 "aarch64_sync_memory_operand" "Q")
+	   (match_operand:SI 3 "const_int_operand")]
+	  UNSPECV_LX))
+   (set (match_operand:DI 1 "register_operand" "=r")
+	(unspec_volatile:DI [(match_dup 2) (match_dup 3)] UNSPECV_LX))]
+  ""
+  {
+    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_release (model))
+      return "ldxp\t%0, %1, %2";
+    else
+      return "ldaxp\t%0, %1, %2";
+  }
+)
+
 (define_insn "@aarch64_store_exclusive<mode>"
   [(set (match_operand:SI 0 "register_operand" "=&r")
     (unspec_volatile:SI [(const_int 0)] UNSPECV_SX))
@@ -668,6 +732,25 @@
   }
 )
 
+(define_insn "aarch64_store_exclusive_pair"
+  [(set (match_operand:SI 0 "register_operand" "=&r")
+	(unspec_volatile:SI [(const_int 0)] UNSPECV_SX))
+   (set (match_operand:TI 1 "aarch64_sync_memory_operand" "=Q")
+	(unspec_volatile:TI
+	  [(match_operand:DI 2 "aarch64_reg_or_zero" "rZ")
+	   (match_operand:DI 3 "aarch64_reg_or_zero" "rZ")
+	   (match_operand:SI 4 "const_int_operand")]
+	  UNSPECV_SX))]
+  ""
+  {
+    enum memmodel model = memmodel_from_int (INTVAL (operands[4]));
+    if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire (model))
+      return "stxp\t%w0, %x2, %x3, %1";
+    else
+      return "stlxp\t%w0, %x2, %x3, %1";
+  }
+)
+
 (define_expand "mem_thread_fence"
   [(match_operand:SI 0 "const_int_operand" "")]
   ""
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 524e4e6929b..dd26bdbbc6b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -29,6 +29,9 @@
 ;; Iterator for HI, SI, DI, some instructions can only work on these modes.
 (define_mode_iterator GPI_I16 [(HI "AARCH64_ISA_F16") SI DI])
 
+;; "Iterator" for just TI -- features like @pattern only work with iterators.
+(define_mode_iterator JUST_TI [TI])
+
 ;; Iterator for QI and HI modes
 (define_mode_iterator SHORT [QI HI])
 
diff --git a/libgcc/config/aarch64/t-lse b/libgcc/config/aarch64/t-lse
index e862b0c2448..534ff6efea8 100644
--- a/libgcc/config/aarch64/t-lse
+++ b/libgcc/config/aarch64/t-lse
@@ -18,15 +18,19 @@
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.
 
-# CAS, Swap, Load-and-operate have 4 sizes and 4 memory models
-S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), cas swp ldadd ldclr ldeor ldset))
+# Compare-and-swap has 5 sizes and 4 memory models.
+S0 := $(foreach s, 1 2 4 8 16, $(addsuffix _$(s), cas))
+O0 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S0)))
+
+# Swap, Load-and-operate have 4 sizes and 4 memory models
+S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), swp ldadd ldclr ldeor ldset))
 O1 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S1)))
 
 # Store-and-operate has 4 sizes but only 2 memory models (relaxed, release).
 S2 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), stadd stclr steor stset))
 O2 := $(foreach m, 1 3, $(addsuffix _$(m)$(objext), $(S2)))
 
-LSE_OBJS := $(O1) $(O2)
+LSE_OBJS := $(O0) $(O1) $(O2)
 
 libgcc-objects += $(LSE_OBJS) have_atomic$(objext)
 
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 00/11] LSE atomics out-of-line
@ 2018-09-26  5:04 rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 04/11] aarch64: Improve atomic-op lse generation rth7680
                   ` (13 more replies)
  0 siblings, 14 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

ARMv8.1 adds a (mandatory) Atomics extension, also known as the
Large System Extension.  Deploying this extension at the OS level
has proved challenging.

The following is the result of a conversation between myself,
Alex Graf of SuSE, and Ramana Radhakrishnan of ARM, at last week's
Linaro Connect in Vancouver.

The current state of the world is that one could distribute two
different copies of a given shared library and place the LSE-enabled
version in /lib64/atomics/ and it will be selected over the /lib64/
version by ld.so when HWCAP_ATOMICS is present.

Alex's main concern with this is that (1) he doesn't want to
distribute two copies of every library, or determine what a
reasonable subset would be, and (2) this solution does not work
for executables, e.g. mysql.

Ramana's main concern was to avoid the overhead of an indirect jump,
especially how that would interact with branch prediction (or the
lack of it) on the smallest implementations.

Therefore, I've created small out-of-line helpers that are directly
linked into every library or executable that requires them.  There
will be two direct branches, both of which will be well-predicted.

In the process, I discovered a number of places within the code
where the existing implementation could be improved.  In particular:

 - the LSE patterns didn't use predicates or constraints that
   match the actual instructions, requiring unnecessary splitting.

 - the non-LSE compare-and-swap can use an extending compare to
   avoid requiring the input to have been previously extended.

 - TImode compare-and-swap was missing entirely.  This brings
   aarch64 to parity with x86_64 wrt __sync_val_compare_and_swap.

There is a final patch that enables the new option by default.
I am not necessarily expecting that one to be merged upstream;
rather, the intent is for the operating system distribution to
decide what the default should be.
It might be that this should be a configure option, so as to
make that OS choice easier, but I've just now thought of that.  ;-)

I'm going to have to rely on Alex and/or Ramana to perform
testing on a system that supports LSE.


r~


Richard Henderson (11):
  aarch64: Simplify LSE cas generation
  aarch64: Improve cas generation
  aarch64: Improve swp generation
  aarch64: Improve atomic-op lse generation
  aarch64: Emit LSE st<op> instructions
  Add visibility to libfunc constructors
  Link static libgcc after shared libgcc for -shared-libgcc
  aarch64: Add out-of-line functions for LSE atomics
  aarch64: Implement -matomic-ool
  aarch64: Implement TImode compare-and-swap
  Enable -matomic-ool by default

 gcc/config/aarch64/aarch64-protos.h           |  20 +-
 gcc/optabs-libfuncs.h                         |   2 +
 gcc/common/config/aarch64/aarch64-common.c    |   6 +-
 gcc/config/aarch64/aarch64.c                  | 480 ++++++--------
 gcc/gcc.c                                     |   9 +-
 gcc/optabs-libfuncs.c                         |  26 +-
 .../atomic-comp-swap-release-acquire.c        |   2 +-
 .../gcc.target/aarch64/atomic-inst-ldadd.c    |  18 +-
 .../gcc.target/aarch64/atomic-inst-ldlogic.c  |  54 +-
 .../gcc.target/aarch64/atomic-op-acq_rel.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-acquire.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-char.c       |   2 +-
 .../gcc.target/aarch64/atomic-op-consume.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-imm.c        |   2 +-
 .../gcc.target/aarch64/atomic-op-int.c        |   2 +-
 .../gcc.target/aarch64/atomic-op-long.c       |   2 +-
 .../gcc.target/aarch64/atomic-op-relaxed.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-release.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-seq_cst.c    |   2 +-
 .../gcc.target/aarch64/atomic-op-short.c      |   2 +-
 .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |   2 +-
 .../atomic_cmp_exchange_zero_strong_1.c       |   2 +-
 .../gcc.target/aarch64/sync-comp-swap.c       |   2 +-
 .../gcc.target/aarch64/sync-op-acquire.c      |   2 +-
 .../gcc.target/aarch64/sync-op-full.c         |   2 +-
 libgcc/config/aarch64/lse.c                   | 280 ++++++++
 gcc/config/aarch64/aarch64.opt                |   4 +
 gcc/config/aarch64/atomics.md                 | 608 ++++++++++--------
 gcc/config/aarch64/iterators.md               |   8 +-
 gcc/config/aarch64/predicates.md              |  12 +
 gcc/doc/invoke.texi                           |  14 +-
 libgcc/config.host                            |   4 +
 libgcc/config/aarch64/t-lse                   |  48 ++
 33 files changed, 1050 insertions(+), 577 deletions(-)
 create mode 100644 libgcc/config/aarch64/lse.c
 create mode 100644 libgcc/config/aarch64/t-lse

-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (8 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 01/11] aarch64: Simplify LSE cas generation rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  9:01   ` Florian Weimer
  2018-09-28 16:29   ` Ramana Radhakrishnan
  2018-09-26  7:40 ` [PATCH, AArch64 02/11] aarch64: Improve cas generation rth7680
                   ` (3 subsequent siblings)
  13 siblings, 2 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

This is the libgcc part of the interface -- providing the functions.
Rationale is provided at the top of libgcc/config/aarch64/lse.c.

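Each function is built as its own object file, selected with
-DL_<op> -DSIZE=<n> -DMODEL=<m> (see t-lse).  For instance, cas_4_2.o
contains only the acquire form of the 4-byte compare-and-swap; expanding
the NAME/T macros by hand, its signature works out to

    unsigned int __aa64_cas4_acq (unsigned int cmp, unsigned int new,
                                  unsigned int *ptr);
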
	* config/aarch64/lse.c: New file.
	* config/aarch64/t-lse: New file.
	* config.host: Add t-lse to all aarch64 tuples.
---
 libgcc/config/aarch64/lse.c | 258 ++++++++++++++++++++++++++++++++++++
 libgcc/config.host          |   4 +
 libgcc/config/aarch64/t-lse |  44 ++++++
 3 files changed, 306 insertions(+)
 create mode 100644 libgcc/config/aarch64/lse.c
 create mode 100644 libgcc/config/aarch64/t-lse

diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
new file mode 100644
index 00000000000..20f4bde741f
--- /dev/null
+++ b/libgcc/config/aarch64/lse.c
@@ -0,0 +1,258 @@
+/* Out-of-line LSE atomics for AArch64 architecture.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   Contributed by Linaro Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/*
+ * The problem that we are trying to solve is operating system deployment
+ * of ARMv8.1-Atomics, also known as Large System Extensions (LSE).
+ *
+ * There are a number of potential solutions for this problem which have
+ * been proposed and rejected for various reasons.  To recap:
+ *
+ * (1) Multiple builds.  The dynamic linker will examine /lib64/atomics/
+ * if HWCAP_ATOMICS is set, allowing entire libraries to be overwritten.
+ * However, not all Linux distributions are happy with multiple builds,
+ * and anyway it has no effect on main applications.
+ *
+ * (2) IFUNC.  We could put these functions into libgcc_s.so, and have
+ * a single copy of each function for all DSOs.  However, ARM is concerned
+ * that the branch-to-indirect-branch that is implied by using a PLT,
+ * as required by IFUNC, is too much overhead for smaller cpus.
+ *
+ * (3) Statically predicted direct branches.  This is the approach that
+ * is taken here.  These functions are linked into every DSO that uses them.
+ * All of the symbols are hidden, so that the functions are called via a
+ * direct branch.  The choice of LSE vs non-LSE is done via one byte load
+ * followed by a well-predicted direct branch.  The functions are compiled
+ * separately to minimize code size.
+ */
+
+/* Define or declare the symbol gating the LSE implementations.  */
+#ifndef L_have_atomics
+extern
+#endif
+_Bool __aa64_have_atomics __attribute__((visibility("hidden"), nocommon));
+
+/* The branch controlled by this test should be easily predicted, in that
+   it will, after constructors, always branch the same way.  The expectation
+   is that systems that implement ARMv8.1-Atomics are "beefier" than those
+   that omit the extension.  By arranging for the fall-through path to use
+   load-store-exclusive insns, we aid the branch predictor of the
+   smallest cpus.  */
+#define have_atomics  __builtin_expect(__aa64_have_atomics, 0)
+
+#ifdef L_have_atomics
+/* Disable initialization of __aa64_have_atomics during bootstrap.  */
+# ifndef inhibit_libc
+#  include <sys/auxv.h>
+
+static void __attribute__((constructor))
+init_have_atomics(void)
+{
+  unsigned long hwcap = getauxval(AT_HWCAP);
+  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
+}
+# endif /* inhibit_libc */
+#else
+
+/* Tell the assembler to accept LSE instructions.  */
+asm(".arch armv8-a+lse");
+
+/* Turn size and memory model defines into mnemonic fragments.  */
+#if SIZE == 1
+# define S     "b"
+# define MASK  ", uxtb"
+#elif SIZE == 2
+# define S     "h"
+# define MASK  ", uxth"
+#elif SIZE == 4 || SIZE == 8
+# define S     ""
+# define MASK  ""
+#else
+# error
+#endif
+
+#if SIZE < 8
+# define T  unsigned int
+# define W  "w"
+#else
+# define T  unsigned long long
+# define W  ""
+#endif
+
+#if MODEL == 1
+# define SUFF  _relax
+# define A     ""
+# define L     ""
+#elif MODEL == 2
+# define SUFF  _acq
+# define A     "a"
+# define L     ""
+#elif MODEL == 3
+# define SUFF  _rel
+# define A     ""
+# define L     "l"
+#elif MODEL == 4
+# define SUFF  _acq_rel
+# define A     "a"
+# define L     "l"
+#else
+# error
+#endif
+
+#define NAME2(B, S, X)  __aa64_ ## B ## S ## X
+#define NAME1(B, S, X)  NAME2(B, S, X)
+#define NAME(BASE)	NAME1(BASE, SIZE, SUFF)
+
+#define str1(S)  #S
+#define str(S)   str1(S)
+
+#ifdef L_cas
+T NAME(cas)(T cmp, T new, T *ptr) __attribute__((visibility("hidden")));
+T NAME(cas)(T cmp, T new, T *ptr)
+{
+  T old;
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__("cas" A L S " %"W"0, %"W"2, %1"
+            : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
+  else
+    __asm__(
+	"0: "
+	"ld" A "xr"S" %"W"0, %1\n\t"
+	"cmp %"W"0, %"W"4" MASK "\n\t"
+	"bne 1f\n\t"
+	"st" L "xr"S" %w2, %"W"3, %1\n\t"
+	"cbnz %w2, 0b\n"
+	"1:"
+	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
+
+  return old;
+}
+#endif
+
+#ifdef L_swp
+T NAME(swp)(T new, T *ptr) __attribute__((visibility("hidden")));
+T NAME(swp)(T new, T *ptr)
+{
+  T old;
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__("swp" A L S " %"W"2, %"W"0, %1"
+            : "=r"(old), "+m"(*ptr) : "r"(new));
+  else
+    __asm__(
+	"0: "
+	"ld" A "xr"S" %"W"0, %1\n\t"
+	"st" L "xr"S" %w2, %"W"3, %1\n\t"
+	"cbnz %w2, 0b\n"
+	"1:"
+	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new));
+
+  return old;
+}
+#endif
+
+#if defined(L_ldadd) || defined(L_ldclr) \
+    || defined(L_ldeor) || defined(L_ldset)
+
+#ifdef L_ldadd
+#define LDOP  ldadd
+#define OP    add
+#elif defined(L_ldclr)
+#define LDOP  ldclr
+#define OP    bic
+#elif defined(L_ldeor)
+#define LDOP  ldeor
+#define OP    eor
+#elif defined(L_ldset)
+#define LDOP  ldset
+#define OP    orr
+#else
+#error
+#endif
+
+T NAME(LDOP)(T val, T *ptr) __attribute__((visibility("hidden")));
+T NAME(LDOP)(T val, T *ptr)
+{
+  T old;
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__(str(LDOP) A L S " %"W"2, %"W"0, %1"
+            : "=r"(old), "+m"(*ptr) : "r"(val));
+  else
+    __asm__(
+	"0: "
+	"ld" A "xr"S" %"W"0, %1\n\t"
+	str(OP) " %"W"2, %"W"0, %"W"3\n\t"
+	"st" L "xr"S" %w2, %"W"2, %1\n\t"
+	"cbnz %w2, 0b\n"
+	"1:"
+	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(val));
+
+  return old;
+}
+#endif
+
+#if defined(L_stadd) || defined(L_stclr) \
+    || defined(L_steor) || defined(L_stset)
+
+#ifdef L_stadd
+#define STOP  stadd
+#define OP    add
+#elif defined(L_stclr)
+#define STOP  stclr
+#define OP    bic
+#elif defined(L_steor)
+#define STOP  steor
+#define OP    eor
+#elif defined(L_stset)
+#define STOP  stset
+#define OP    orr
+#else
+#error
+#endif
+
+void NAME(STOP)(T val, T *ptr) __attribute__((visibility("hidden")));
+void NAME(STOP)(T val, T *ptr)
+{
+  unsigned tmp;
+
+  if (have_atomics)
+    __asm__(str(STOP) L S " %"W"1, %0" : "+m"(*ptr) : "r"(val));
+  else
+    __asm__(
+	"0: "
+	"ldxr"S" %"W"1, %0\n\t"
+	str(OP) " %"W"1, %"W"1, %"W"2\n\t"
+	"st" L "xr"S" %w1, %"W"1, %0\n\t"
+	"cbnz %w1, 0b\n"
+	"1:"
+	: "+m"(*ptr), "=&r"(tmp) : "r"(val));
+}
+#endif
+#endif /* L_have_atomics */
diff --git a/libgcc/config.host b/libgcc/config.host
index 029f6569caf..2c4a05d69c5 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -340,23 +340,27 @@ aarch64*-*-elf | aarch64*-*-rtems*)
 	extra_parts="$extra_parts crtbegin.o crtend.o crti.o crtn.o"
 	extra_parts="$extra_parts crtfastmath.o"
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
 	md_unwind_header=aarch64/aarch64-unwind.h
 	;;
 aarch64*-*-freebsd*)
 	extra_parts="$extra_parts crtfastmath.o"
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
 	md_unwind_header=aarch64/freebsd-unwind.h
 	;;
 aarch64*-*-fuchsia*)
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp"
 	;;
 aarch64*-*-linux*)
 	extra_parts="$extra_parts crtfastmath.o"
 	md_unwind_header=aarch64/linux-unwind.h
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+	tmake_file="${tmake_file} ${cpu_type}/t-lse"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
 	;;
 alpha*-*-linux*)
diff --git a/libgcc/config/aarch64/t-lse b/libgcc/config/aarch64/t-lse
new file mode 100644
index 00000000000..e862b0c2448
--- /dev/null
+++ b/libgcc/config/aarch64/t-lse
@@ -0,0 +1,44 @@
+# Out-of-line LSE atomics for AArch64 architecture.
+# Copyright (C) 2018 Free Software Foundation, Inc.
+# Contributed by Linaro Ltd.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# <http://www.gnu.org/licenses/>.
+
+# CAS, Swap, Load-and-operate have 4 sizes and 4 memory models
+S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), cas swp ldadd ldclr ldeor ldset))
+O1 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S1)))
+
+# Store-and-operate has 4 sizes but only 2 memory models (relaxed, release).
+S2 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), stadd stclr steor stset))
+O2 := $(foreach m, 1 3, $(addsuffix _$(m)$(objext), $(S2)))
+
+LSE_OBJS := $(O1) $(O2)
+
+libgcc-objects += $(LSE_OBJS) have_atomic$(objext)
+
+empty      =
+space      = $(empty) $(empty)
+PAT_SPLIT  = $(subst _,$(space),$(*F))
+PAT_BASE   = $(word 1,$(PAT_SPLIT))
+PAT_N      = $(word 2,$(PAT_SPLIT))
+PAT_M      = $(word 3,$(PAT_SPLIT))
+
+have_atomic$(objext): $(srcdir)/config/aarch64/lse.c
+	$(gcc_compile) -DL_have_atomics -c $<
+
+$(LSE_OBJS): $(srcdir)/config/aarch64/lse.c
+	$(gcc_compile) -DL_$(PAT_BASE) -DSIZE=$(PAT_N) -DMODEL=$(PAT_M) -c $<
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 06/11] Add visibility to libfunc constructors
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (2 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 11/11] Enable -matomic-ool by default rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 03/11] aarch64: Improve swp generation rth7680
                   ` (9 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

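The _visibility variants let a target create libcall symbols that bind
locally, so calls to them need not go through a PLT.  The AArch64
out-of-line atomics in the following patches use the new entry point
along these lines (sketch):

    rtx sym = init_one_libfunc_visibility ("__aa64_cas4_acq",
                                           VISIBILITY_HIDDEN);
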
	* optabs-libfuncs.c (build_libfunc_function_visibility):
	New, split out from...
	(build_libfunc_function): ... here.
	(init_one_libfunc_visibility): New, split out from ...
	(init_one_libfunc): ... here.
---
 gcc/optabs-libfuncs.h |  2 ++
 gcc/optabs-libfuncs.c | 26 ++++++++++++++++++++------
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/gcc/optabs-libfuncs.h b/gcc/optabs-libfuncs.h
index 0669ea1fdd7..cf39da36887 100644
--- a/gcc/optabs-libfuncs.h
+++ b/gcc/optabs-libfuncs.h
@@ -63,7 +63,9 @@ void gen_satfract_conv_libfunc (convert_optab, const char *,
 void gen_satfractuns_conv_libfunc (convert_optab, const char *,
 				   machine_mode, machine_mode);
 
+tree build_libfunc_function_visibility (const char *, symbol_visibility);
 tree build_libfunc_function (const char *);
+rtx init_one_libfunc_visibility (const char *, symbol_visibility);
 rtx init_one_libfunc (const char *);
 rtx set_user_assembler_libfunc (const char *, const char *);
 
diff --git a/gcc/optabs-libfuncs.c b/gcc/optabs-libfuncs.c
index bd0df8baa37..73a28e9ca7a 100644
--- a/gcc/optabs-libfuncs.c
+++ b/gcc/optabs-libfuncs.c
@@ -719,10 +719,10 @@ struct libfunc_decl_hasher : ggc_ptr_hash<tree_node>
 /* A table of previously-created libfuncs, hashed by name.  */
 static GTY (()) hash_table<libfunc_decl_hasher> *libfunc_decls;
 
-/* Build a decl for a libfunc named NAME.  */
+/* Build a decl for a libfunc named NAME with visibility VIS.  */
 
 tree
-build_libfunc_function (const char *name)
+build_libfunc_function_visibility (const char *name, symbol_visibility vis)
 {
   /* ??? We don't have any type information; pretend this is "int foo ()".  */
   tree decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
@@ -731,7 +731,7 @@ build_libfunc_function (const char *name)
   DECL_EXTERNAL (decl) = 1;
   TREE_PUBLIC (decl) = 1;
   DECL_ARTIFICIAL (decl) = 1;
-  DECL_VISIBILITY (decl) = VISIBILITY_DEFAULT;
+  DECL_VISIBILITY (decl) = vis;
   DECL_VISIBILITY_SPECIFIED (decl) = 1;
   gcc_assert (DECL_ASSEMBLER_NAME (decl));
 
@@ -742,11 +742,19 @@ build_libfunc_function (const char *name)
   return decl;
 }
 
+/* Build a decl for a libfunc named NAME.  */
+
+tree
+build_libfunc_function (const char *name)
+{
+  return build_libfunc_function_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Return a libfunc for NAME, creating one if we don't already have one.
-   The returned rtx is a SYMBOL_REF.  */
+   The decl is given visibility VIS.  The returned rtx is a SYMBOL_REF.  */
 
 rtx
-init_one_libfunc (const char *name)
+init_one_libfunc_visibility (const char *name, symbol_visibility vis)
 {
   tree id, decl;
   hashval_t hash;
@@ -763,12 +771,18 @@ init_one_libfunc (const char *name)
     {
       /* Create a new decl, so that it can be passed to
 	 targetm.encode_section_info.  */
-      decl = build_libfunc_function (name);
+      decl = build_libfunc_function_visibility (name, vis);
       *slot = decl;
     }
   return XEXP (DECL_RTL (decl), 0);
 }
 
+rtx
+init_one_libfunc (const char *name)
+{
+  return init_one_libfunc_visibility (name, VISIBILITY_DEFAULT);
+}
+
 /* Adjust the assembler name of libfunc NAME to ASMSPEC.  */
 
 rtx
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 04/11] aarch64: Improve atomic-op lse generation
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 09/11] aarch64: Implement -matomic-ool rth7680
                   ` (12 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

Fix constraints; avoid unnecessary split.  Drop the use of the atomic_op
iterator in favor of the ATOMIC_LDOP iterator; this is simpler and more
logical for ldclr aka bic.
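
To make the "ldclr aka bic" mapping concrete, here is a small illustration;
it is not part of the patch, and the assembly in the comment is only a
sketch of what the new expansion aims for on an LSE (-march=armv8.1-a)
target:

/* LDCLR atomically computes *mem & ~value, so atomic AND is expressed
   by inverting the operand first.  */
#include <stdint.h>

uint32_t
fetch_and_example (uint32_t *mem, uint32_t val)
{
  return __atomic_fetch_and (mem, val, __ATOMIC_RELAXED);
  /* Expected to assemble to roughly:
       mvn    w1, w1
       ldclr  w1, w0, [x0]  */
}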

	* config/aarch64/aarch64.c (aarch64_emit_bic): Remove.
	(aarch64_atomic_ldop_supported_p): Remove.
	(aarch64_gen_atomic_ldop): Remove.
	* config/aarch64/atomic.md (atomic_<atomic_optab><ALLI>):
	Fully expand LSE operations here.
	(atomic_fetch_<atomic_optab><ALLI>): Likewise.
	(atomic_<atomic_optab>_fetch<ALLI>): Likewise.
	(aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse): Drop atomic_op iterator
	and use ATOMIC_LDOP instead; use register_operand for the input;
	drop the split and emit insns directly.
	(aarch64_atomic_fetch_<ATOMIC_LDOP><ALLI>_lse): Likewise.
	(aarch64_atomic_<atomic_op>_fetch<ALLI>_lse): Remove.
	(@aarch64_atomic_load<ATOMIC_LDOP><ALLI>): Remove.
---
 gcc/config/aarch64/aarch64-protos.h |   2 -
 gcc/config/aarch64/aarch64.c        | 176 -------------------------
 gcc/config/aarch64/atomics.md       | 197 +++++++++++++++-------------
 gcc/config/aarch64/iterators.md     |   5 +-
 4 files changed, 108 insertions(+), 272 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3d045cf43be..1d2f8487d1a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -563,8 +563,6 @@ rtx aarch64_load_tp (rtx);
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
 
-bool aarch64_atomic_ldop_supported_p (enum rtx_code);
-void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
 void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_gen_adjusted_ldpstp (rtx *, bool, scalar_mode, RTX_CODE);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5e9a85be44c..1e00fdc801c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14224,32 +14224,6 @@ aarch64_expand_compare_and_swap (rtx operands[])
   emit_insn (gen_rtx_SET (bval, x));
 }
 
-/* Test whether the target supports using a atomic load-operate instruction.
-   CODE is the operation and AFTER is TRUE if the data in memory after the
-   operation should be returned and FALSE if the data before the operation
-   should be returned.  Returns FALSE if the operation isn't supported by the
-   architecture.  */
-
-bool
-aarch64_atomic_ldop_supported_p (enum rtx_code code)
-{
-  if (!TARGET_LSE)
-    return false;
-
-  switch (code)
-    {
-    case SET:
-    case AND:
-    case IOR:
-    case XOR:
-    case MINUS:
-    case PLUS:
-      return true;
-    default:
-      return false;
-    }
-}
-
 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
    sequence implementing an atomic operation.  */
 
@@ -14382,156 +14356,6 @@ aarch64_split_compare_and_swap (rtx operands[])
     aarch64_emit_post_barrier (model);
 }
 
-/* Emit a BIC instruction.  */
-
-static void
-aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
-{
-  rtx shift_rtx = GEN_INT (shift);
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-
-  switch (mode)
-    {
-    case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
-    case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
-    default:
-      gcc_unreachable ();
-    }
-
-  emit_insn (gen (dst, s2, shift_rtx, s1));
-}
-
-/* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
-   location to store the data read from memory.  OUT_RESULT is the location to
-   store the result of the operation.  MEM is the memory location to read and
-   modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
-   operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
-   be NULL.  */
-
-void
-aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
-			 rtx mem, rtx value, rtx model_rtx)
-{
-  machine_mode mode = GET_MODE (mem);
-  machine_mode wmode = (mode == DImode ? DImode : SImode);
-  const bool short_mode = (mode < SImode);
-  int ldop_code;
-  rtx src;
-  rtx x;
-
-  if (out_data)
-    out_data = gen_lowpart (mode, out_data);
-
-  if (out_result)
-    out_result = gen_lowpart (mode, out_result);
-
-  /* Make sure the value is in a register, putting it into a destination
-     register if it needs to be manipulated.  */
-  if (!register_operand (value, mode)
-      || code == AND || code == MINUS)
-    {
-      src = out_result ? out_result : out_data;
-      emit_move_insn (src, gen_lowpart (mode, value));
-    }
-  else
-    src = value;
-  gcc_assert (register_operand (src, mode));
-
-  /* Preprocess the data for the operation as necessary.  If the operation is
-     a SET then emit a swap instruction and finish.  */
-  switch (code)
-    {
-    case MINUS:
-      /* Negate the value and treat it as a PLUS.  */
-      {
-	rtx neg_src;
-
-	/* Resize the value if necessary.  */
-	if (short_mode)
-	  src = gen_lowpart (wmode, src);
-
-	neg_src = gen_rtx_NEG (wmode, src);
-	emit_insn (gen_rtx_SET (src, neg_src));
-
-	if (short_mode)
-	  src = gen_lowpart (mode, src);
-      }
-      /* Fall-through.  */
-    case PLUS:
-      ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
-      break;
-
-    case IOR:
-      ldop_code = UNSPECV_ATOMIC_LDOP_OR;
-      break;
-
-    case XOR:
-      ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
-      break;
-
-    case AND:
-      {
-	rtx not_src;
-
-	/* Resize the value if necessary.  */
-	if (short_mode)
-	  src = gen_lowpart (wmode, src);
-
-	not_src = gen_rtx_NOT (wmode, src);
-	emit_insn (gen_rtx_SET (src, not_src));
-
-	if (short_mode)
-	  src = gen_lowpart (mode, src);
-      }
-      ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
-      break;
-
-    default:
-      /* The operation can't be done with atomic instructions.  */
-      gcc_unreachable ();
-    }
-
-  emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
-				      out_data, mem, src, model_rtx));
-
-  /* If necessary, calculate the data in memory after the update by redoing the
-     operation from values in registers.  */
-  if (!out_result)
-    return;
-
-  if (short_mode)
-    {
-      src = gen_lowpart (wmode, src);
-      out_data = gen_lowpart (wmode, out_data);
-      out_result = gen_lowpart (wmode, out_result);
-    }
-
-  x = NULL_RTX;
-
-  switch (code)
-    {
-    case MINUS:
-    case PLUS:
-      x = gen_rtx_PLUS (wmode, out_data, src);
-      break;
-    case IOR:
-      x = gen_rtx_IOR (wmode, out_data, src);
-      break;
-    case XOR:
-      x = gen_rtx_XOR (wmode, out_data, src);
-      break;
-    case AND:
-      aarch64_emit_bic (wmode, out_result, out_data, src, 0);
-      return;
-    default:
-      gcc_unreachable ();
-    }
-
-  emit_set_insn (out_result, x);
-
-  return;
-}
-
 /* Split an atomic operation.  */
 
 void
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 63384f9f99c..05ac6232fe5 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -207,13 +207,37 @@
     rtx (*gen) (rtx, rtx, rtx);
 
     /* Use an atomic load-operate instruction when possible.  */
-    if (aarch64_atomic_ldop_supported_p (<CODE>))
-      gen = gen_aarch64_atomic_<atomic_optab><mode>_lse;
+    if (TARGET_LSE)
+      {
+	switch (<CODE>)
+	  {
+	  case MINUS:
+	    operands[1] = expand_simple_unop (<MODE>mode, NEG, operands[1],
+					      NULL, 1);
+	    /* fallthru */
+	  case PLUS:
+	    gen = gen_aarch64_atomic_add<mode>_lse;
+	    break;
+	  case IOR:
+	    gen = gen_aarch64_atomic_ior<mode>_lse;
+	    break;
+	  case XOR:
+	    gen = gen_aarch64_atomic_xor<mode>_lse;
+	    break;
+	  case AND:
+	    operands[1] = expand_simple_unop (<MODE>mode, NOT, operands[1],
+					      NULL, 1);
+	    gen = gen_aarch64_atomic_bic<mode>_lse;
+	    break;
+	  default:
+	    gcc_unreachable ();
+	  }
+	operands[1] = force_reg (<MODE>mode, operands[1]);
+      }
     else
       gen = gen_aarch64_atomic_<atomic_optab><mode>;
 
     emit_insn (gen (operands[0], operands[1], operands[2]));
-
     DONE;
   }
 )
@@ -239,22 +263,25 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_<atomic_optab><mode>_lse"
+(define_insn "aarch64_atomic_<atomic_ldoptab><mode>_lse"
   [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
-    (unspec_volatile:ALLI
-      [(atomic_op:ALLI (match_dup 0)
-	(match_operand:ALLI 1 "<atomic_op_operand>" "r<const_atomic>"))
-       (match_operand:SI 2 "const_int_operand")]
-      UNSPECV_ATOMIC_OP))
+	(unspec_volatile:ALLI
+	  [(match_dup 0)
+	   (match_operand:ALLI 1 "register_operand" "r")
+	   (match_operand:SI 2 "const_int_operand")]
+      ATOMIC_LDOP))
    (clobber (match_scratch:ALLI 3 "=&r"))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-    aarch64_gen_atomic_ldop (<CODE>, operands[3], NULL, operands[0],
-			     operands[1], operands[2]);
-    DONE;
+   enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+   if (is_mm_relaxed (model))
+     return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0";
+   else if (is_mm_release (model))
+     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0";
+   else if (is_mm_acquire (model) || is_mm_consume (model))
+     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0";
+   else
+     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, %<w>3, %0";
   }
 )
 
@@ -280,7 +307,7 @@
   }
 )
 
-;; Load-operate-store, returning the updated memory data.
+;; Load-operate-store, returning the original memory data.
 
 (define_expand "atomic_fetch_<atomic_optab><mode>"
  [(match_operand:ALLI 0 "register_operand" "")
@@ -293,13 +320,37 @@
   rtx (*gen) (rtx, rtx, rtx, rtx);
 
   /* Use an atomic load-operate instruction when possible.  */
-  if (aarch64_atomic_ldop_supported_p (<CODE>))
-    gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>_lse;
+  if (TARGET_LSE)
+    {
+      switch (<CODE>)
+        {
+	case MINUS:
+	  operands[2] = expand_simple_unop (<MODE>mode, NEG, operands[2],
+					    NULL, 1);
+	  /* fallthru */
+	case PLUS:
+	  gen = gen_aarch64_atomic_fetch_add<mode>_lse;
+	  break;
+	case IOR:
+	  gen = gen_aarch64_atomic_fetch_ior<mode>_lse;
+	  break;
+	case XOR:
+	  gen = gen_aarch64_atomic_fetch_xor<mode>_lse;
+	  break;
+	case AND:
+	  operands[2] = expand_simple_unop (<MODE>mode, NOT, operands[2],
+					    NULL, 1);
+	  gen = gen_aarch64_atomic_fetch_bic<mode>_lse;
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+    }
   else
     gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>;
 
   emit_insn (gen (operands[0], operands[1], operands[2], operands[3]));
-
   DONE;
 })
 
@@ -326,23 +377,26 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_fetch_<atomic_optab><mode>_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
-    (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
+(define_insn "aarch64_atomic_fetch_<atomic_ldoptab><mode>_lse"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
+	(match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
    (set (match_dup 1)
-    (unspec_volatile:ALLI
-      [(atomic_op:ALLI (match_dup 1)
-	(match_operand:ALLI 2 "<atomic_op_operand>" "r<const_atomic>"))
-       (match_operand:SI 3 "const_int_operand")]
-      UNSPECV_ATOMIC_LDOP))]
+	(unspec_volatile:ALLI
+	  [(match_dup 1)
+	   (match_operand:ALLI 2 "register_operand" "r")
+	   (match_operand:SI 3 "const_int_operand")]
+	  ATOMIC_LDOP))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-    aarch64_gen_atomic_ldop (<CODE>, operands[0], NULL, operands[1],
-			     operands[2], operands[3]);
-    DONE;
+   enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+   if (is_mm_relaxed (model))
+     return "ld<atomic_ldop><atomic_sfx>\t%<w>2, %<w>0, %1";
+   else if (is_mm_acquire (model) || is_mm_consume (model))
+     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>2, %<w>0, %1";
+   else if (is_mm_release (model))
+     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>2, %<w>0, %1";
+   else
+     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>2, %<w>0, %1";
   }
 )
 
@@ -370,7 +424,7 @@
   }
 )
 
-;; Load-operate-store, returning the original memory data.
+;; Load-operate-store, returning the updated memory data.
 
 (define_expand "atomic_<atomic_optab>_fetch<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
@@ -380,17 +434,23 @@
   (match_operand:SI 3 "const_int_operand")]
  ""
 {
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-  rtx value = operands[2];
-
-  /* Use an atomic load-operate instruction when possible.  */
-  if (aarch64_atomic_ldop_supported_p (<CODE>))
-    gen = gen_aarch64_atomic_<atomic_optab>_fetch<mode>_lse;
+  /* Use an atomic load-operate instruction when possible.  In this case
+     we will re-compute the result from the original mem value. */
+  if (TARGET_LSE)
+    {
+      rtx tmp = gen_reg_rtx (<MODE>mode);
+      operands[2] = force_reg (<MODE>mode, operands[2]);
+      emit_insn (gen_atomic_fetch_<atomic_optab><mode>
+                 (tmp, operands[1], operands[2], operands[3]));
+      tmp = expand_simple_binop (<MODE>mode, <CODE>, tmp, operands[2],
+				 operands[0], 1, OPTAB_WIDEN);
+      emit_move_insn (operands[0], tmp);
+    }
   else
-    gen = gen_aarch64_atomic_<atomic_optab>_fetch<mode>;
-
-  emit_insn (gen (operands[0], operands[1], value, operands[3]));
-
+    {
+      emit_insn (gen_aarch64_atomic_<atomic_optab>_fetch<mode>
+                 (operands[0], operands[1], operands[2], operands[3]));
+    }
   DONE;
 })
 
@@ -417,29 +477,6 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_<atomic_optab>_fetch<mode>_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
-    (atomic_op:ALLI
-     (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")
-     (match_operand:ALLI 2 "<atomic_op_operand>" "r<const_atomic>")))
-   (set (match_dup 1)
-    (unspec_volatile:ALLI
-      [(match_dup 1)
-       (match_dup 2)
-       (match_operand:SI 3 "const_int_operand")]
-      UNSPECV_ATOMIC_LDOP))
-     (clobber (match_scratch:ALLI 4 "=&r"))]
-  "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-  {
-    aarch64_gen_atomic_ldop (<CODE>, operands[4], operands[0], operands[1],
-			     operands[2], operands[3]);
-    DONE;
-  }
-)
-
 (define_insn_and_split "atomic_nand_fetch<mode>"
   [(set (match_operand:ALLI 0 "register_operand" "=&r")
     (not:ALLI
@@ -585,29 +622,3 @@
       return "dmb\\tish";
   }
 )
-
-;; ARMv8.1-A LSE instructions.
-
-;; Atomic load-op: Load data, operate, store result, keep data.
-
-(define_insn "@aarch64_atomic_load<atomic_ldop><mode>"
- [(set (match_operand:ALLI 0 "register_operand" "=r")
-   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
-  (set (match_dup 1)
-   (unspec_volatile:ALLI
-    [(match_dup 1)
-     (match_operand:ALLI 2 "register_operand")
-     (match_operand:SI 3 "const_int_operand")]
-    ATOMIC_LDOP))]
- "TARGET_LSE && reload_completed"
- {
-   enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-   if (is_mm_relaxed (model))
-     return "ld<atomic_ldop><atomic_sfx>\t%<w>2, %<w>0, %1";
-   else if (is_mm_acquire (model) || is_mm_consume (model))
-     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>2, %<w>0, %1";
-   else if (is_mm_release (model))
-     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>2, %<w>0, %1";
-   else
-     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>2, %<w>0, %1";
- })
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index a43956054e8..524e4e6929b 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -503,7 +503,6 @@
     UNSPECV_ATOMIC_CAS		; Represent an atomic CAS.
     UNSPECV_ATOMIC_SWP		; Represent an atomic SWP.
     UNSPECV_ATOMIC_OP		; Represent an atomic operation.
-    UNSPECV_ATOMIC_LDOP		; Represent an atomic load-operation
     UNSPECV_ATOMIC_LDOP_OR	; Represent an atomic load-or
     UNSPECV_ATOMIC_LDOP_BIC	; Represent an atomic load-bic
     UNSPECV_ATOMIC_LDOP_XOR	; Represent an atomic load-xor
@@ -1591,6 +1590,10 @@
  [(UNSPECV_ATOMIC_LDOP_OR "set") (UNSPECV_ATOMIC_LDOP_BIC "clr")
   (UNSPECV_ATOMIC_LDOP_XOR "eor") (UNSPECV_ATOMIC_LDOP_PLUS "add")])
 
+(define_int_attr atomic_ldoptab
+ [(UNSPECV_ATOMIC_LDOP_OR "ior") (UNSPECV_ATOMIC_LDOP_BIC "bic")
+  (UNSPECV_ATOMIC_LDOP_XOR "xor") (UNSPECV_ATOMIC_LDOP_PLUS "add")])
+
 ;; -------------------------------------------------------------------
 ;; Int Iterators Attributes.
 ;; -------------------------------------------------------------------
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 03/11] aarch64: Improve swp generation
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (3 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 06/11] Add visibility to libfunc constructors rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 05/11] aarch64: Emit LSE st<op> instructions rth7680
                   ` (8 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

Allow zero as an input; fix constraints; avoid unnecessary split.
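
As a small illustration of the "zero as an input" case (not part of the
patch; the assembly in the comment is only what the new rZ constraint is
expected to allow):

#include <stdint.h>

uint32_t
exchange_zero (uint32_t *mem)
{
  /* With aarch64_reg_or_zero, the constant 0 can feed the swp directly
     as wzr instead of being forced into a scratch register first.  */
  return __atomic_exchange_n (mem, 0, __ATOMIC_SEQ_CST);
  /* Expected to assemble to roughly:  swpal wzr, w0, [x0]  */
}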

	* config/aarch64/aarch64.c (aarch64_emit_atomic_swap): Remove.
	(aarch64_gen_atomic_ldop): Don't call it.
	* config/aarch64/atomics.md (atomic_exchange<ALLI>):
	Use aarch64_reg_or_zero.
	(aarch64_atomic_exchange<ALLI>): Likewise.
	(aarch64_atomic_exchange<ALLI>_lse): Remove split; remove & from
	operand 0; use aarch64_reg_or_zero for input; merge ...
	(@aarch64_atomic_swp<ALLI>): ... this and remove.
---
 gcc/config/aarch64/aarch64.c  | 13 ----------
 gcc/config/aarch64/atomics.md | 49 +++++++++++------------------------
 2 files changed, 15 insertions(+), 47 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index c0f2d296342..5e9a85be44c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14401,15 +14401,6 @@ aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
   emit_insn (gen (dst, s2, shift_rtx, s1));
 }
 
-/* Emit an atomic swap.  */
-
-static void
-aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
-			  rtx mem, rtx model)
-{
-  emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
-}
-
 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
    location to store the data read from memory.  OUT_RESULT is the location to
    store the result of the operation.  MEM is the memory location to read and
@@ -14450,10 +14441,6 @@ aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
      a SET then emit a swap instruction and finish.  */
   switch (code)
     {
-    case SET:
-      aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
-      return;
-
     case MINUS:
       /* Negate the value and treat it as a PLUS.  */
       {
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index c00a18675b4..63384f9f99c 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -136,7 +136,7 @@
 (define_expand "atomic_exchange<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "")
-  (match_operand:ALLI 2 "register_operand" "")
+  (match_operand:ALLI 2 "aarch64_reg_or_zero" "")
   (match_operand:SI 3 "const_int_operand" "")]
   ""
   {
@@ -156,10 +156,10 @@
 
 (define_insn_and_split "aarch64_atomic_exchange<mode>"
   [(set (match_operand:ALLI 0 "register_operand" "=&r")		;; output
-    (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q")) ;; memory
+    (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))	;; memory
    (set (match_dup 1)
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 2 "register_operand" "r")	;; input
+      [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")	;; input
        (match_operand:SI 3 "const_int_operand" "")]		;; model
       UNSPECV_ATOMIC_EXCHG))
    (clobber (reg:CC CC_REGNUM))
@@ -175,22 +175,25 @@
   }
 )
 
-(define_insn_and_split "aarch64_atomic_exchange<mode>_lse"
-  [(set (match_operand:ALLI 0 "register_operand" "=&r")
+(define_insn "aarch64_atomic_exchange<mode>_lse"
+  [(set (match_operand:ALLI 0 "register_operand" "=r")
     (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
    (set (match_dup 1)
     (unspec_volatile:ALLI
-      [(match_operand:ALLI 2 "register_operand" "r")
+      [(match_operand:ALLI 2 "aarch64_reg_or_zero" "rZ")
        (match_operand:SI 3 "const_int_operand" "")]
       UNSPECV_ATOMIC_EXCHG))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
   {
-    aarch64_gen_atomic_ldop (SET, operands[0], NULL, operands[1],
-			     operands[2], operands[3]);
-    DONE;
+    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+    if (is_mm_relaxed (model))
+      return "swp<atomic_sfx>\t%<w>2, %<w>0, %1";
+    else if (is_mm_acquire (model) || is_mm_consume (model))
+      return "swpa<atomic_sfx>\t%<w>2, %<w>0, %1";
+    else if (is_mm_release (model))
+      return "swpl<atomic_sfx>\t%<w>2, %<w>0, %1";
+    else
+      return "swpal<atomic_sfx>\t%<w>2, %<w>0, %1";
   }
 )
 
@@ -585,28 +588,6 @@
 
 ;; ARMv8.1-A LSE instructions.
 
-;; Atomic swap with memory.
-(define_insn "@aarch64_atomic_swp<mode>"
- [(set (match_operand:ALLI 0 "register_operand" "+&r")
-   (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
-  (set (match_dup 1)
-   (unspec_volatile:ALLI
-    [(match_operand:ALLI 2 "register_operand" "r")
-     (match_operand:SI 3 "const_int_operand" "")]
-    UNSPECV_ATOMIC_SWP))]
-  "TARGET_LSE && reload_completed"
-  {
-    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-    if (is_mm_relaxed (model))
-      return "swp<atomic_sfx>\t%<w>2, %<w>0, %1";
-    else if (is_mm_acquire (model) || is_mm_consume (model))
-      return "swpa<atomic_sfx>\t%<w>2, %<w>0, %1";
-    else if (is_mm_release (model))
-      return "swpl<atomic_sfx>\t%<w>2, %<w>0, %1";
-    else
-      return "swpal<atomic_sfx>\t%<w>2, %<w>0, %1";
-  })
-
 ;; Atomic load-op: Load data, operate, store result, keep data.
 
 (define_insn "@aarch64_atomic_load<atomic_ldop><mode>"
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (5 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 05/11] aarch64: Emit LSE st<op> instructions rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26 16:55   ` Joseph Myers
  2018-09-26  5:04 ` [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap rth7680
                   ` (6 subsequent siblings)
  13 siblings, 1 reply; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

We are about to introduce symbols to libgcc.a that will
not be present in libgcc_s.so.  Most symbols will be
resolved from the shared library first, and only the new
symbols will be pulled from the static library.
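
In spec terms, the %{shared-libgcc:...} alternatives now emit shared_name
followed by static_name (typically -lgcc_s then -lgcc) rather than
shared_name alone, so a reference to one of the new out-of-line helpers
that is absent from libgcc_s.so still resolves from the static archive.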

	* gcc.c (init_gcc_specs): Include static_name after shared_name.
---
 gcc/gcc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/gcc/gcc.c b/gcc/gcc.c
index 264204d7b37..4a7ca691122 100644
--- a/gcc/gcc.c
+++ b/gcc/gcc.c
@@ -1676,9 +1676,8 @@ init_gcc_specs (struct obstack *obstack, const char *shared_name,
 		static_name, " " LD_AS_NEEDED_OPTION " ",
 		shared_name, " " LD_NO_AS_NEEDED_OPTION
 		"}"
-		"%{shared-libgcc:",
-		shared_name, "%{!shared: ", static_name, "}"
-		"}}"
+		"%{shared-libgcc:", shared_name, " ", static_name, "}"
+		"}"
 #else
   buf = concat ("%{static|static-libgcc:", static_name, " ", eh_name, "}"
 		"%{!static:%{!static-libgcc:"
@@ -1688,11 +1687,11 @@ init_gcc_specs (struct obstack *obstack, const char *shared_name,
 		"}"
 #ifdef LINK_EH_SPEC
 		"%{shared:"
-		"%{shared-libgcc:", shared_name, "}"
+		"%{shared-libgcc:", shared_name, " ", static_name, "}"
 		"%{!shared-libgcc:", static_name, "}"
 		"}"
 #else
-		"%{shared:", shared_name, "}"
+		"%{shared:", shared_name, " ", static_name, "}"
 #endif
 #endif
 		"}}", NULL);
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 11/11] Enable -matomic-ool by default
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 04/11] aarch64: Improve atomic-op lse generation rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 09/11] aarch64: Implement -matomic-ool rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 06/11] Add visibility to libfunc constructors rth7680
                   ` (10 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

Do Not Merge Upstream.
This is for agraf and his testing within SLES.
---
 gcc/common/config/aarch64/aarch64-common.c | 6 ++++--
 gcc/config/aarch64/aarch64.c               | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
index 292fb818705..3bd1312a3f8 100644
--- a/gcc/common/config/aarch64/aarch64-common.c
+++ b/gcc/common/config/aarch64/aarch64-common.c
@@ -31,9 +31,11 @@
 #include "flags.h"
 #include "diagnostic.h"
 
-#ifdef  TARGET_BIG_ENDIAN_DEFAULT
 #undef  TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
+#ifdef  TARGET_BIG_ENDIAN_DEFAULT
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END | MASK_ATOMIC_OOL)
+#else
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_ATOMIC_OOL)
 #endif
 
 #undef  TARGET_HANDLE_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index eca47784730..d2aa9bad5a4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -17611,9 +17611,11 @@ aarch64_run_selftests (void)
 #undef TARGET_C_MODE_FOR_SUFFIX
 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
 
-#ifdef TARGET_BIG_ENDIAN_DEFAULT
 #undef  TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
+#ifdef  TARGET_BIG_ENDIAN_DEFAULT
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END | MASK_ATOMIC_OOL)
+#else
+#define TARGET_DEFAULT_TARGET_FLAGS (MASK_ATOMIC_OOL)
 #endif
 
 #undef TARGET_CLASS_MAX_NREGS
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 01/11] aarch64: Simplify LSE cas generation
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (7 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics rth7680
                   ` (4 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

The cas insn is a single insn, and if expanded properly need not
be split after reload.  Use the proper inputs for the insn.
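
For illustration (not part of the patch), the kind of code affected, with
the intended expansion sketched in a comment:

#include <stdint.h>

_Bool
cas_example (uint32_t *mem, uint32_t expected, uint32_t desired)
{
  return __atomic_compare_exchange_n (mem, &expected, desired, 0,
                                      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  /* With LSE this is expected to expand to a single casal plus a
     compare of the loaded value against the expected value; the
     compare is emitted at expansion time so that it can be deleted
     when the boolean result is unused.  */
}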

	* config/aarch64/aarch64.c (aarch64_expand_compare_and_swap):
	Force value into a register for TARGET_LSE; avoid register
	overlap between rval and oldval, in case the compare is needed;
	emit the compare during initial expansion so that it may be
	deleted if unused.
	(aarch64_gen_atomic_cas): Remove.
	* config/aarch64/atomics.md (@aarch64_compare_and_swap<SHORT>_lse):
	Remove & from operand 0; use matching constraint for operand 2;
	remove is_weak and mod_f operands as unused.  Drop the split
	and merge with...
	(@aarch64_atomic_cas<SHORT>): ... this pattern's output; remove.
	(@aarch64_compare_and_swap<GPI>_lse): Similarly.
	(@aarch64_atomic_cas<GPI>): Similarly.
---
 gcc/config/aarch64/aarch64-protos.h |   1 -
 gcc/config/aarch64/aarch64.c        |  41 +++-------
 gcc/config/aarch64/atomics.md       | 117 ++++++++--------------------
 3 files changed, 41 insertions(+), 118 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index caf1d2041f0..3d045cf43be 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -562,7 +562,6 @@ rtx aarch64_load_tp (rtx);
 
 void aarch64_expand_compare_and_swap (rtx op[]);
 void aarch64_split_compare_and_swap (rtx op[]);
-void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx);
 
 bool aarch64_atomic_ldop_supported_p (enum rtx_code);
 void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 12f7dfe9a75..a0ba358c2f1 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14174,7 +14174,7 @@ aarch64_expand_compare_and_swap (rtx operands[])
     case E_SImode:
     case E_DImode:
       /* Force the value into a register if needed.  */
-      if (!aarch64_plus_operand (oldval, mode))
+      if (TARGET_LSE || !aarch64_plus_operand (oldval, mode))
 	oldval = force_reg (cmp_mode, oldval);
       break;
 
@@ -14183,16 +14183,20 @@ aarch64_expand_compare_and_swap (rtx operands[])
     }
 
   if (TARGET_LSE)
-    emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
-						 newval, is_weak, mod_s,
-						 mod_f));
+    {
+      if (reg_overlap_mentioned_p (rval, oldval))
+        rval = gen_reg_rtx (cmp_mode);
+      emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
+						   newval, mod_s));
+      aarch64_gen_compare_reg (EQ, rval, oldval);
+    }
   else
     emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
 					     is_weak, mod_s, mod_f));
 
-
   if (mode == QImode || mode == HImode)
-    emit_move_insn (operands[1], gen_lowpart (mode, rval));
+    rval = gen_lowpart (mode, rval);
+  emit_move_insn (operands[1], rval);
 
   x = gen_rtx_REG (CCmode, CC_REGNUM);
   x = gen_rtx_EQ (SImode, x, const0_rtx);
@@ -14242,31 +14246,6 @@ aarch64_emit_post_barrier (enum memmodel model)
     }
 }
 
-/* Emit an atomic compare-and-swap operation.  RVAL is the destination register
-   for the data in memory.  EXPECTED is the value expected to be in memory.
-   DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
-   is the memory ordering to use.  */
-
-void
-aarch64_gen_atomic_cas (rtx rval, rtx mem,
-			rtx expected, rtx desired,
-			rtx model)
-{
-  machine_mode mode;
-
-  mode = GET_MODE (mem);
-
-  /* Move the expected value into the CAS destination register.  */
-  emit_insn (gen_rtx_SET (rval, expected));
-
-  /* Emit the CAS.  */
-  emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
-
-  /* Compare the expected value with the value loaded by the CAS, to establish
-     whether the swap was made.  */
-  aarch64_gen_compare_reg (EQ, rval, expected);
-}
-
 /* Split a compare and swap pattern.  */
 
 void
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index bba8e9e9c8e..9f00dd3c68e 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -85,56 +85,50 @@
   }
 )
 
-(define_insn_and_split "@aarch64_compare_and_swap<mode>_lse"
-  [(set (reg:CC CC_REGNUM)					;; bool out
-    (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
-   (set (match_operand:SI 0 "register_operand" "=&r")		;; val out
+(define_insn "@aarch64_compare_and_swap<mode>_lse"
+  [(set (match_operand:SI 0 "register_operand" "=r")		;; val out
     (zero_extend:SI
-      (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
+     (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
    (set (match_dup 1)
     (unspec_volatile:SHORT
-      [(match_operand:SI 2 "aarch64_plus_operand" "rI")	;; expected
+      [(match_operand:SI 2 "register_operand" "0")		;; expected
        (match_operand:SHORT 3 "aarch64_reg_or_zero" "rZ")	;; desired
-       (match_operand:SI 4 "const_int_operand")		;; is_weak
-       (match_operand:SI 5 "const_int_operand")		;; mod_s
-       (match_operand:SI 6 "const_int_operand")]	;; mod_f
+       (match_operand:SI 4 "const_int_operand")]		;; mod_s
       UNSPECV_ATOMIC_CMPSW))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-  {
-    aarch64_gen_atomic_cas (operands[0], operands[1],
-			    operands[2], operands[3],
-			    operands[5]);
-    DONE;
-  }
-)
+{
+  enum memmodel model = memmodel_from_int (INTVAL (operands[4]));
+  if (is_mm_relaxed (model))
+    return "cas<atomic_sfx>\t%<w>0, %<w>3, %1";
+  else if (is_mm_acquire (model) || is_mm_consume (model))
+    return "casa<atomic_sfx>\t%<w>0, %<w>3, %1";
+  else if (is_mm_release (model))
+    return "casl<atomic_sfx>\t%<w>0, %<w>3, %1";
+  else
+    return "casal<atomic_sfx>\t%<w>0, %<w>3, %1";
+})
 
-(define_insn_and_split "@aarch64_compare_and_swap<mode>_lse"
-  [(set (reg:CC CC_REGNUM)					;; bool out
-    (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
-   (set (match_operand:GPI 0 "register_operand" "=&r")		;; val out
+(define_insn "@aarch64_compare_and_swap<mode>_lse"
+  [(set (match_operand:GPI 0 "register_operand" "=r")		;; val out
     (match_operand:GPI 1 "aarch64_sync_memory_operand" "+Q"))   ;; memory
    (set (match_dup 1)
     (unspec_volatile:GPI
-      [(match_operand:GPI 2 "aarch64_plus_operand" "rI")	;; expect
+      [(match_operand:GPI 2 "register_operand" "0")		;; expect
        (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")		;; desired
-       (match_operand:SI 4 "const_int_operand")			;; is_weak
-       (match_operand:SI 5 "const_int_operand")			;; mod_s
-       (match_operand:SI 6 "const_int_operand")]		;; mod_f
+       (match_operand:SI 4 "const_int_operand")]		;; mod_s
       UNSPECV_ATOMIC_CMPSW))]
   "TARGET_LSE"
-  "#"
-  "&& reload_completed"
-  [(const_int 0)]
-  {
-    aarch64_gen_atomic_cas (operands[0], operands[1],
-			    operands[2], operands[3],
-			    operands[5]);
-    DONE;
-  }
-)
+{
+  enum memmodel model = memmodel_from_int (INTVAL (operands[4]));
+  if (is_mm_relaxed (model))
+    return "cas<atomic_sfx>\t%<w>0, %<w>3, %1";
+  else if (is_mm_acquire (model) || is_mm_consume (model))
+    return "casa<atomic_sfx>\t%<w>0, %<w>3, %1";
+  else if (is_mm_release (model))
+    return "casl<atomic_sfx>\t%<w>0, %<w>3, %1";
+  else
+    return "casal<atomic_sfx>\t%<w>0, %<w>3, %1";
+})
 
 (define_expand "atomic_exchange<mode>"
  [(match_operand:ALLI 0 "register_operand" "")
@@ -610,55 +604,6 @@
       return "swpal<atomic_sfx>\t%<w>2, %<w>0, %1";
   })
 
-;; Atomic compare-and-swap: HI and smaller modes.
-
-(define_insn "@aarch64_atomic_cas<mode>"
- [(set (match_operand:SI 0 "register_operand" "+&r")		  ;; out
-   (zero_extend:SI
-    (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q")))  ;; memory.
-  (set (match_dup 1)
-   (unspec_volatile:SHORT
-    [(match_dup 0)
-     (match_operand:SHORT 2 "aarch64_reg_or_zero" "rZ")	;; value.
-     (match_operand:SI 3 "const_int_operand" "")]	;; model.
-    UNSPECV_ATOMIC_CAS))]
- "TARGET_LSE && reload_completed"
-{
-  enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-  if (is_mm_relaxed (model))
-    return "cas<atomic_sfx>\t%<w>0, %<w>2, %1";
-  else if (is_mm_acquire (model) || is_mm_consume (model))
-    return "casa<atomic_sfx>\t%<w>0, %<w>2, %1";
-  else if (is_mm_release (model))
-    return "casl<atomic_sfx>\t%<w>0, %<w>2, %1";
-  else
-    return "casal<atomic_sfx>\t%<w>0, %<w>2, %1";
-})
-
-;; Atomic compare-and-swap: SI and larger modes.
-
-(define_insn "@aarch64_atomic_cas<mode>"
- [(set (match_operand:GPI 0 "register_operand" "+&r")	      ;; out
-   (match_operand:GPI 1 "aarch64_sync_memory_operand" "+Q"))  ;; memory.
-  (set (match_dup 1)
-   (unspec_volatile:GPI
-    [(match_dup 0)
-     (match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")	;; value.
-     (match_operand:SI 3 "const_int_operand" "")]	;; model.
-    UNSPECV_ATOMIC_CAS))]
-  "TARGET_LSE && reload_completed"
-{
-    enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
-    if (is_mm_relaxed (model))
-      return "cas<atomic_sfx>\t%<w>0, %<w>2, %1";
-    else if (is_mm_acquire (model) || is_mm_consume (model))
-      return "casa<atomic_sfx>\t%<w>0, %<w>2, %1";
-    else if (is_mm_release (model))
-      return "casl<atomic_sfx>\t%<w>0, %<w>2, %1";
-    else
-      return "casal<atomic_sfx>\t%<w>0, %<w>2, %1";
-})
-
 ;; Atomic load-op: Load data, operate, store result, keep data.
 
 (define_insn "@aarch64_atomic_load<atomic_ldop><mode>"
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 05/11] aarch64: Emit LSE st<op> instructions
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (4 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 03/11] aarch64: Improve swp generation rth7680
@ 2018-09-26  5:04 ` rth7680
  2018-09-26  5:04 ` [PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc rth7680
                   ` (7 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  5:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

When the result of an operation is not used, we can ignore the
result by storing to XZR.  For two of the memory models, using
XZR with LD<op> has a preferred assembler alias, ST<op>.
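
A small illustration (not part of the patch) of the two cases described
above; the assembly in the comments is only the expected output:

#include <stdint.h>

void
add_store_only (uint32_t *mem, uint32_t val)
{
  __atomic_fetch_add (mem, val, __ATOMIC_RELAXED);
  /* Result unused, relaxed model: expected to become
       stadd w1, [x0]  */
}

void
add_store_only_acquire (uint32_t *mem, uint32_t val)
{
  __atomic_fetch_add (mem, val, __ATOMIC_ACQUIRE);
  /* Result unused, acquire model: no ST<op> alias exists, so expected
     to become
       ldadda w1, wzr, [x0]  */
}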

	* config/aarch64/atomics.md (aarch64_atomic_<ATOMIC_LDOP><ALLI>_lse):
	Use ST<op> for relaxed and release models; load to XZR otherwise;
	remove the now unnecessary scratch register.

	* gcc.target/aarch64/atomic-inst-ldadd.c: Expect stadd{,l}.
	* gcc.target/aarch64/atomic-inst-ldlogic.c: Similarly.
---
 .../gcc.target/aarch64/atomic-inst-ldadd.c    | 18 ++++---
 .../gcc.target/aarch64/atomic-inst-ldlogic.c  | 54 ++++++++++++-------
 gcc/config/aarch64/atomics.md                 | 15 +++---
 3 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
index 4b2282c6861..db2206186b4 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
@@ -67,20 +67,26 @@ TEST (add_load_notreturn, ADD_LOAD_NORETURN)
 TEST (sub_load, SUB_LOAD)
 TEST (sub_load_notreturn, SUB_LOAD_NORETURN)
 
-/* { dg-final { scan-assembler-times "ldaddb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddab\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlb\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlb\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalb\t" 32} } */
+/* { dg-final { scan-assembler-times "staddb\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlb\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldaddh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddah\t" 32} } */
-/* { dg-final { scan-assembler-times "ldaddlh\t" 16} } */
+/* { dg-final { scan-assembler-times "ldaddlh\t" 8} } */
 /* { dg-final { scan-assembler-times "ldaddalh\t" 32} } */
+/* { dg-final { scan-assembler-times "staddh\t" 8} } */
+/* { dg-final { scan-assembler-times "staddlh\t" 8} } */
 
-/* { dg-final { scan-assembler-times "ldadd\t" 32} } */
+/* { dg-final { scan-assembler-times "ldadd\t" 16} } */
 /* { dg-final { scan-assembler-times "ldadda\t" 64} } */
-/* { dg-final { scan-assembler-times "ldaddl\t" 32} } */
+/* { dg-final { scan-assembler-times "ldaddl\t" 16} } */
 /* { dg-final { scan-assembler-times "ldaddal\t" 64} } */
+/* { dg-final { scan-assembler-times "stadd\t" 16} } */
+/* { dg-final { scan-assembler-times "staddl\t" 16} } */
 
 /* { dg-final { scan-assembler-not "ldaxr\t" } } */
 /* { dg-final { scan-assembler-not "stlxr\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
index 4879d52b9b4..b8a53e0a676 100644
--- a/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
+++ b/gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
@@ -101,54 +101,72 @@ TEST (xor_load_notreturn, XOR_LOAD_NORETURN)
 
 /* Load-OR.  */
 
-/* { dg-final { scan-assembler-times "ldsetb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalb\t" 16} } */
+/* { dg-final { scan-assembler-times "stsetb\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldseth\t" 8} } */
+/* { dg-final { scan-assembler-times "ldseth\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldsetlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldsetlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldsetalh\t" 16} } */
+/* { dg-final { scan-assembler-times "stseth\t" 4} } */
+/* { dg-final { scan-assembler-times "stsetlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldset\t" 16} } */
+/* { dg-final { scan-assembler-times "ldset\t" 8} } */
 /* { dg-final { scan-assembler-times "ldseta\t" 32} } */
-/* { dg-final { scan-assembler-times "ldsetl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldsetl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldsetal\t" 32} } */
+/* { dg-final { scan-assembler-times "stset\t" 8} } */
+/* { dg-final { scan-assembler-times "stsetl\t" 8} } */
 
 /* Load-AND.  */
 
-/* { dg-final { scan-assembler-times "ldclrb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclrab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclralb\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrb\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldclrh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclrah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldclrlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldclrlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldclralh\t" 16} } */
+/* { dg-final { scan-assembler-times "stclrh\t" 4} } */
+/* { dg-final { scan-assembler-times "stclrlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldclr\t" 16} */
+/* { dg-final { scan-assembler-times "ldclr\t" 8} */
 /* { dg-final { scan-assembler-times "ldclra\t" 32} } */
-/* { dg-final { scan-assembler-times "ldclrl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldclrl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldclral\t" 32} } */
+/* { dg-final { scan-assembler-times "stclr\t" 8} */
+/* { dg-final { scan-assembler-times "stclrl\t" 8} } */
 
 /* Load-XOR.  */
 
-/* { dg-final { scan-assembler-times "ldeorb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeorab\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlb\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlb\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeoralb\t" 16} } */
+/* { dg-final { scan-assembler-times "steorb\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlb\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldeorh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeorah\t" 16} } */
-/* { dg-final { scan-assembler-times "ldeorlh\t" 8} } */
+/* { dg-final { scan-assembler-times "ldeorlh\t" 4} } */
 /* { dg-final { scan-assembler-times "ldeoralh\t" 16} } */
+/* { dg-final { scan-assembler-times "steorh\t" 4} } */
+/* { dg-final { scan-assembler-times "steorlh\t" 4} } */
 
-/* { dg-final { scan-assembler-times "ldeor\t" 16} */
+/* { dg-final { scan-assembler-times "ldeor\t" 8} */
 /* { dg-final { scan-assembler-times "ldeora\t" 32} } */
-/* { dg-final { scan-assembler-times "ldeorl\t" 16} } */
+/* { dg-final { scan-assembler-times "ldeorl\t" 8} } */
 /* { dg-final { scan-assembler-times "ldeoral\t" 32} } */
+/* { dg-final { scan-assembler-times "steor\t" 8} */
+/* { dg-final { scan-assembler-times "steorl\t" 8} } */
 
 /* { dg-final { scan-assembler-not "ldaxr\t" } } */
 /* { dg-final { scan-assembler-not "stlxr\t" } } */
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 05ac6232fe5..f74521f885d 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -269,19 +269,22 @@
 	  [(match_dup 0)
 	   (match_operand:ALLI 1 "register_operand" "r")
 	   (match_operand:SI 2 "const_int_operand")]
-      ATOMIC_LDOP))
-   (clobber (match_scratch:ALLI 3 "=&r"))]
+      ATOMIC_LDOP))]
   "TARGET_LSE"
   {
    enum memmodel model = memmodel_from_int (INTVAL (operands[2]));
+   /* Note that while ST<op> is an alias for LD<op> with the second
+      operand as XZR, the assembler only defines them for the RELAXED
+      and REL models.  But there's nothing that prevents us from explicitly
+      using XZR with LD<op> for the ACQ and ACQ_REL models.  */
    if (is_mm_relaxed (model))
-     return "ld<atomic_ldop><atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "st<atomic_ldop><atomic_sfx>\t%<w>1, %0";
    else if (is_mm_release (model))
-     return "ld<atomic_ldop>l<atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "st<atomic_ldop>l<atomic_sfx>\t%<w>1, %0";
    else if (is_mm_acquire (model) || is_mm_consume (model))
-     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "ld<atomic_ldop>a<atomic_sfx>\t%<w>1, <w>zr, %0";
    else
-     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, %<w>3, %0";
+     return "ld<atomic_ldop>al<atomic_sfx>\t%<w>1, <w>zr, %0";
   }
 )
 
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH, AArch64 02/11] aarch64: Improve cas generation
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (9 preceding siblings ...)
  2018-09-26  5:04 ` [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics rth7680
@ 2018-09-26  7:40 ` rth7680
  2018-09-26  9:22 ` [PATCH, AArch64 00/11] LSE atomics out-of-line Florian Weimer
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 31+ messages in thread
From: rth7680 @ 2018-09-26  7:40 UTC (permalink / raw)
  To: gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

From: Richard Henderson <richard.henderson@linaro.org>

Do not zero-extend the input to the cas for subword operations;
instead, use the appropriate zero-extending compare insns.
Correct the predicates and constraints for immediate expected operand.
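
An illustrative subword case (not part of the patch):

#include <stdint.h>

_Bool
cas16_imm (uint16_t *mem, uint16_t desired)
{
  uint16_t expected = 0x123;
  /* The expected value no longer has to be widened up front: the
     comparison of the loaded value uses a zero-extending compare, and
     a small constant expected value can be matched directly by the new
     aarch64_plushi_operand predicate for HImode.  */
  return __atomic_compare_exchange_n (mem, &expected, desired, 0,
                                      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}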

	* config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze): New.
	(aarch64_split_compare_and_swap): Use it.
	(aarch64_expand_compare_and_swap): Likewise.  Remove convert_modes;
	test oldval against the proper predicate.
	* config/aarch64/atomics.md (@atomic_compare_and_swap<ALLI>):
	Use nonmemory_operand for expected.
	(cas_short_expected_pred): New.
	(@aarch64_compare_and_swap<SHORT>): Use it; use "rn" not "rI" to match.
	(@aarch64_compare_and_swap<GPI>): Use "rn" not "rI" for expected.
	* config/aarch64/predicates.md (aarch64_plushi_immediate): New.
	(aarch64_plushi_operand): New.
---
 gcc/config/aarch64/aarch64.c     | 85 ++++++++++++++++++++------------
 gcc/config/aarch64/atomics.md    | 21 ++++----
 gcc/config/aarch64/predicates.md | 12 +++++
 3 files changed, 77 insertions(+), 41 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a0ba358c2f1..c0f2d296342 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1613,6 +1613,33 @@ aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
   return cc_reg;
 }
 
+/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
+
+static rtx
+aarch64_gen_compare_reg_maybe_ze(RTX_CODE code, rtx x, rtx y,
+                                 machine_mode y_mode)
+{
+  if (y_mode == E_QImode || y_mode == E_HImode)
+    {
+      if (CONST_INT_P (y))
+	y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+      else
+	{
+	  rtx t, cc_reg;
+	  machine_mode cc_mode;
+
+	  t = gen_rtx_ZERO_EXTEND (SImode, y);
+	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
+	  cc_mode = CC_SWPmode;
+	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+	  emit_set_insn (cc_reg, t);
+	  return cc_reg;
+	}
+    }
+
+  return aarch64_gen_compare_reg (code, x, y);
+}
+
 /* Build the SYMBOL_REF for __tls_get_addr.  */
 
 static GTY(()) rtx tls_get_addr_libfunc;
@@ -14138,8 +14165,8 @@ aarch64_emit_unlikely_jump (rtx insn)
 void
 aarch64_expand_compare_and_swap (rtx operands[])
 {
-  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
-  machine_mode mode, cmp_mode;
+  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
+  machine_mode mode, r_mode;
 
   bval = operands[0];
   rval = operands[1];
@@ -14150,56 +14177,50 @@ aarch64_expand_compare_and_swap (rtx operands[])
   mod_s = operands[6];
   mod_f = operands[7];
   mode = GET_MODE (mem);
-  cmp_mode = mode;
 
   /* Normally the succ memory model must be stronger than fail, but in the
      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
-
   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
 
-  switch (mode)
+  r_mode = mode;
+  if (mode == QImode || mode == HImode)
     {
-    case E_QImode:
-    case E_HImode:
-      /* For short modes, we're going to perform the comparison in SImode,
-	 so do the zero-extension now.  */
-      cmp_mode = SImode;
-      rval = gen_reg_rtx (SImode);
-      oldval = convert_modes (SImode, mode, oldval, true);
-      /* Fall through.  */
-
-    case E_SImode:
-    case E_DImode:
-      /* Force the value into a register if needed.  */
-      if (TARGET_LSE || !aarch64_plus_operand (oldval, mode))
-	oldval = force_reg (cmp_mode, oldval);
-      break;
-
-    default:
-      gcc_unreachable ();
+      r_mode = SImode;
+      rval = gen_reg_rtx (r_mode);
     }
 
   if (TARGET_LSE)
     {
+      /* Oldval always requires a register.  We also must not clobber
+         oldval when writing to rval, so that we can compare afterward.  */
+      oldval = force_reg (mode, oldval);
       if (reg_overlap_mentioned_p (rval, oldval))
-        rval = gen_reg_rtx (cmp_mode);
+        rval = gen_reg_rtx (r_mode);
+
       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
 						   newval, mod_s));
-      aarch64_gen_compare_reg (EQ, rval, oldval);
+      cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
     }
   else
-    emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
-					     is_weak, mod_s, mod_f));
+    {
+      /* The oldval predicate varies by mode.  Test it and force to reg.  */
+      insn_code code = code_for_aarch64_compare_and_swap (mode);
+      if (!insn_data[code].operand[2].predicate (oldval, mode))
+	oldval = force_reg (mode, oldval);
 
-  if (mode == QImode || mode == HImode)
+      emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
+				 is_weak, mod_s, mod_f));
+      cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+    }
+
+  if (r_mode != mode)
     rval = gen_lowpart (mode, rval);
   emit_move_insn (operands[1], rval);
 
-  x = gen_rtx_REG (CCmode, CC_REGNUM);
-  x = gen_rtx_EQ (SImode, x, const0_rtx);
+  x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
   emit_insn (gen_rtx_SET (bval, x));
 }
 
@@ -14314,10 +14335,10 @@ aarch64_split_compare_and_swap (rtx operands[])
     }
   else
     {
-      cond = aarch64_gen_compare_reg (NE, rval, oldval);
+      cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
-				 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
     }
 
diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 9f00dd3c68e..c00a18675b4 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -24,8 +24,8 @@
   [(match_operand:SI 0 "register_operand" "")			;; bool out
    (match_operand:ALLI 1 "register_operand" "")			;; val out
    (match_operand:ALLI 2 "aarch64_sync_memory_operand" "")	;; memory
-   (match_operand:ALLI 3 "general_operand" "")			;; expected
-   (match_operand:ALLI 4 "aarch64_reg_or_zero" "")			;; desired
+   (match_operand:ALLI 3 "nonmemory_operand" "")		;; expected
+   (match_operand:ALLI 4 "aarch64_reg_or_zero" "")		;; desired
    (match_operand:SI 5 "const_int_operand")			;; is_weak
    (match_operand:SI 6 "const_int_operand")			;; mod_s
    (match_operand:SI 7 "const_int_operand")]			;; mod_f
@@ -36,19 +36,22 @@
   }
 )
 
+(define_mode_attr cas_short_expected_pred
+  [(QI "aarch64_reg_or_imm") (HI "aarch64_plushi_operand")])
+
 (define_insn_and_split "@aarch64_compare_and_swap<mode>"
   [(set (reg:CC CC_REGNUM)					;; bool out
     (unspec_volatile:CC [(const_int 0)] UNSPECV_ATOMIC_CMPSW))
-   (set (match_operand:SI 0 "register_operand" "=&r")	   ;; val out
+   (set (match_operand:SI 0 "register_operand" "=&r")		;; val out
     (zero_extend:SI
       (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
    (set (match_dup 1)
     (unspec_volatile:SHORT
-      [(match_operand:SI 2 "aarch64_plus_operand" "rI")	;; expected
+      [(match_operand:SHORT 2 "<cas_short_expected_pred>" "rn")	;; expected
        (match_operand:SHORT 3 "aarch64_reg_or_zero" "rZ")	;; desired
-       (match_operand:SI 4 "const_int_operand")		;; is_weak
-       (match_operand:SI 5 "const_int_operand")		;; mod_s
-       (match_operand:SI 6 "const_int_operand")]	;; mod_f
+       (match_operand:SI 4 "const_int_operand")			;; is_weak
+       (match_operand:SI 5 "const_int_operand")			;; mod_s
+       (match_operand:SI 6 "const_int_operand")]		;; mod_f
       UNSPECV_ATOMIC_CMPSW))
    (clobber (match_scratch:SI 7 "=&r"))]
   ""
@@ -68,7 +71,7 @@
     (match_operand:GPI 1 "aarch64_sync_memory_operand" "+Q"))   ;; memory
    (set (match_dup 1)
     (unspec_volatile:GPI
-      [(match_operand:GPI 2 "aarch64_plus_operand" "rI")	;; expect
+      [(match_operand:GPI 2 "aarch64_plus_operand" "rn")	;; expect
        (match_operand:GPI 3 "aarch64_reg_or_zero" "rZ")		;; desired
        (match_operand:SI 4 "const_int_operand")			;; is_weak
        (match_operand:SI 5 "const_int_operand")			;; mod_s
@@ -91,7 +94,7 @@
      (match_operand:SHORT 1 "aarch64_sync_memory_operand" "+Q"))) ;; memory
    (set (match_dup 1)
     (unspec_volatile:SHORT
-      [(match_operand:SI 2 "register_operand" "0")		;; expected
+      [(match_operand:SHORT 2 "register_operand" "0")		;; expected
        (match_operand:SHORT 3 "aarch64_reg_or_zero" "rZ")	;; desired
        (match_operand:SI 4 "const_int_operand")]		;; mod_s
       UNSPECV_ATOMIC_CMPSW))]
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 5b08b03c586..4c75eff3e5a 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -114,6 +114,18 @@
   (ior (match_operand 0 "register_operand")
        (match_operand 0 "aarch64_plus_immediate")))
 
+(define_predicate "aarch64_plushi_immediate"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT val = INTVAL (op);
+  /* The HImode value must be zero-extendable to an SImode plus_operand.  */
+  return ((val & 0xfff) == val || sext_hwi (val & 0xf000, 16) == val);
+})
+
+(define_predicate "aarch64_plushi_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "aarch64_plushi_immediate")))
+
 (define_predicate "aarch64_pluslong_immediate"
   (and (match_code "const_int")
        (match_test "(INTVAL (op) < 0xffffff && INTVAL (op) > -0xffffff)")))
-- 
2.17.1

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics
  2018-09-26  5:04 ` [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics rth7680
@ 2018-09-26  9:01   ` Florian Weimer
  2018-09-26 14:33     ` Richard Henderson
  2018-09-28 16:29   ` Ramana Radhakrishnan
  1 sibling, 1 reply; 31+ messages in thread
From: Florian Weimer @ 2018-09-26  9:01 UTC (permalink / raw)
  To: rth7680; +Cc: gcc-patches, ramana.radhakrishnan, agraf, matz, Richard Henderson

* rth:

> diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
> new file mode 100644
> index 00000000000..20f4bde741f
> --- /dev/null
> +++ b/libgcc/config/aarch64/lse.c

> +static void __attribute__((constructor))
> +init_have_atomics(void)
> +{
> +  unsigned long hwcap = getauxval(AT_HWCAP);
> +  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
> +}

Is there an expectation that it is possible to use the atomics in IFUNC
resolvers?  Then this needs an explanation why it is safe to run with
the other kind of atomics until the initialization of
__aa64_have_atomics has happened.

(GNU style requires a space before a parenthesis, at least in a function
call or function declarator.)
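
For illustration, the quoted constructor with that spacing applied (a
purely cosmetic sketch, assuming HWCAP_ATOMICS is available from
<sys/auxv.h>):

#include <sys/auxv.h>

extern _Bool __aa64_have_atomics;

static void __attribute__ ((constructor))
init_have_atomics (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
}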

Thanks,
Florian

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 00/11] LSE atomics out-of-line
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (10 preceding siblings ...)
  2018-09-26  7:40 ` [PATCH, AArch64 02/11] aarch64: Improve cas generation rth7680
@ 2018-09-26  9:22 ` Florian Weimer
  2018-09-26 13:05   ` Michael Matz
  2018-09-27 13:08 ` Ramana Radhakrishnan
  2019-02-04 11:14 ` __libc_single_threaded variable for optimizing std::shared_ptr (was: [PATCH, AArch64 00/11] LSE atomics out-of-line) Florian Weimer
  13 siblings, 1 reply; 31+ messages in thread
From: Florian Weimer @ 2018-09-26  9:22 UTC (permalink / raw)
  To: rth7680; +Cc: gcc-patches, ramana.radhakrishnan, agraf, matz, Richard Henderson

* rth:

> Therefore, I've created small out-of-line helpers that are directly
> linked into every library or executable that requires them.  There
> will be two direct branches, both of which will be well-predicted.

This seems reasonable to me, considering the trade-offs.

If the indirect function call overhead is deemed too large, the only
other feasible option I see from a distribution point of view is to drop
support for the previous architecture version without LSE.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 00/11] LSE atomics out-of-line
  2018-09-26  9:22 ` [PATCH, AArch64 00/11] LSE atomics out-of-line Florian Weimer
@ 2018-09-26 13:05   ` Michael Matz
  0 siblings, 0 replies; 31+ messages in thread
From: Michael Matz @ 2018-09-26 13:05 UTC (permalink / raw)
  To: Florian Weimer
  Cc: rth7680, gcc-patches, ramana.radhakrishnan, agraf, Richard Henderson

Hi,

On Wed, 26 Sep 2018, Florian Weimer wrote:

> > Therefore, I've created small out-of-line helpers that are directly
> > linked into every library or executable that requires them.  There
> > will be two direct branches, both of which will be well-predicted.
> 
> This seems reasonable to me, considering the trade-offs.
> 
> If the indirect function call overhead is deemed too large,

With IFUNCs there's the concern that it's not really a feasible 
solution for all cases: you'd have to specialize each and every function 
containing atomic accesses.  That's difficult to do by hand and 
potentially explodes size when done automatically.

> the only other feasible option I see from a distribution point of view 
> is to drop support for the previous architecture version without LSE.

Agreed.  So thanks rth for that :)


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics
  2018-09-26  9:01   ` Florian Weimer
@ 2018-09-26 14:33     ` Richard Henderson
  2018-09-26 14:36       ` Florian Weimer
  0 siblings, 1 reply; 31+ messages in thread
From: Richard Henderson @ 2018-09-26 14:33 UTC (permalink / raw)
  To: Florian Weimer, rth7680
  Cc: gcc-patches, ramana.radhakrishnan, agraf, matz, Richard Henderson

On 9/26/18 1:59 AM, Florian Weimer wrote:
> * rth:
> 
>> diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
>> new file mode 100644
>> index 00000000000..20f4bde741f
>> --- /dev/null
>> +++ b/libgcc/config/aarch64/lse.c
> 
>> +static void __attribute__((constructor))
>> +init_have_atomics(void)
>> +{
>> +  unsigned long hwcap = getauxval(AT_HWCAP);
>> +  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
>> +}
> 
> Is there an expectation that it is possible to use the atomics in IFUNC
> resolvers?  Then this needs an explanation why it is safe to run with
> the other kind of atomics until the initialization of
> __aa64_have_atomics has happened.

Yes.  The explanation is simple, in that the !have_atomics path is also
atomic.  It will simply use the slower load/store-exclusive path.

Perhaps, despite the official ARMv8.1-Atomics name, LSE was in fact a better
choice for a name after all, as its lack does not imply a lack of atomicity.
And a comment, to be sure.

> (GNU style requires a space before a parenthesis, at least in a function
> call or function declarator.)

Yes, of course.  It's no longer automatic for my fingers and eyes.


r~

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics
  2018-09-26 14:33     ` Richard Henderson
@ 2018-09-26 14:36       ` Florian Weimer
  2018-09-26 14:37         ` Richard Henderson
  0 siblings, 1 reply; 31+ messages in thread
From: Florian Weimer @ 2018-09-26 14:36 UTC (permalink / raw)
  To: Richard Henderson
  Cc: rth7680, gcc-patches, ramana.radhakrishnan, agraf, matz,
	Richard Henderson

* Richard Henderson:

> On 9/26/18 1:59 AM, Florian Weimer wrote:
>> * rth:
>> 
>>> diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
>>> new file mode 100644
>>> index 00000000000..20f4bde741f
>>> --- /dev/null
>>> +++ b/libgcc/config/aarch64/lse.c
>> 
>>> +static void __attribute__((constructor))
>>> +init_have_atomics(void)
>>> +{
>>> +  unsigned long hwcap = getauxval(AT_HWCAP);
>>> +  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
>>> +}
>> 
>> Is there an expectation that it is possible to use the atomics in IFUNC
>> resolvers?  Then this needs an explanation why it is safe to run with
>> the other kind of atomics until the initialization of
>> __aa64_have_atomics has happened.
>
> Yes.  The explanation is simple, in that the !have_atomics path is
> also atomic.  It will simply use the slower load/store-exclusive path.
>
> Perhaps, despite the official ARMv8.1-Atomics name, LSE was in fact a
> better choice for a name after all, as its lack does not imply a lack
> of atomicity.  And a comment, to be sure.

That's not what I meant.  I'm curious if LSE and non-LSE atomics on the
same location will still result in the expected memory ordering.  If
they don't, then this requires *some* explanation why this is okay.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics
  2018-09-26 14:36       ` Florian Weimer
@ 2018-09-26 14:37         ` Richard Henderson
  0 siblings, 0 replies; 31+ messages in thread
From: Richard Henderson @ 2018-09-26 14:37 UTC (permalink / raw)
  To: Florian Weimer, Richard Henderson
  Cc: gcc-patches, ramana.radhakrishnan, agraf, matz, Richard Henderson

On 9/26/18 7:33 AM, Florian Weimer wrote:
>>> That's not what I meant.  I'm curious if LSE and non-LSE atomics on the
>>> same location will still result in the expected memory ordering.  If
>>> they don't, then this requires *some* explanation why this is okay.
>>>
>>> Thanks,
>>> Florian

Yes, they interoperate just fine.


r~

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc
  2018-09-26  5:04 ` [PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc rth7680
@ 2018-09-26 16:55   ` Joseph Myers
  2018-09-26 16:57     ` Richard Henderson
  0 siblings, 1 reply; 31+ messages in thread
From: Joseph Myers @ 2018-09-26 16:55 UTC (permalink / raw)
  To: rth7680; +Cc: gcc-patches, ramana.radhakrishnan, agraf, matz, Richard Henderson

On Tue, 25 Sep 2018, rth7680@gmail.com wrote:

> From: Richard Henderson <richard.henderson@linaro.org>
> 
> We are about to introduce symbols to libgcc.a that will
> not be present in libgcc_s.so.  Most symbols will be
> resolved from the shared library first, and only the new
> symbols will be pulled from the static library.

Configurations with this property should be using libgcc's 
config/t-slibgcc-libgcc to make libgcc_s.so into a linker script.  Unless 
you have a configuration needing this but not supporting linker scripts, I 
wouldn't expect driver changes to be needed.

The linker script looks like:

/* GNU ld script
   Use the shared library, but some functions are only in
   the static library.  */
GROUP ( libgcc_s.so.1 -lgcc )

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc
  2018-09-26 16:55   ` Joseph Myers
@ 2018-09-26 16:57     ` Richard Henderson
  0 siblings, 0 replies; 31+ messages in thread
From: Richard Henderson @ 2018-09-26 16:57 UTC (permalink / raw)
  To: Joseph Myers, rth7680; +Cc: gcc-patches, ramana.radhakrishnan, agraf, matz

On 9/26/18 9:49 AM, Joseph Myers wrote:
> On Tue, 25 Sep 2018, rth7680@gmail.com wrote:
> 
>> From: Richard Henderson <richard.henderson@linaro.org>
>>
>> We are about to introduce symbols to libgcc.a that will
>> not be present in libgcc_s.so.  Most symbols will be
>> resolved from the shared library first, and only the new
>> symbols will be pulled from the static library.
> 
> Configurations with this property should be using libgcc's 
> config/t-slibgcc-libgcc to make libgcc_s.so into a linker script.  Unless 
> you have a configuration needing this but not supporting linker scripts, I 
> wouldn't expect driver changes to be needed.
> 
> The linker script looks like:
> 
> /* GNU ld script
>    Use the shared library, but some functions are only in
>    the static library.  */
> GROUP ( libgcc_s.so.1 -lgcc )

Ah.  I had been surprised that we didn't already support this
form of linking, and missed this config fragment.  Will fix.


r~

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 00/11] LSE atomics out-of-line
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (11 preceding siblings ...)
  2018-09-26  9:22 ` [PATCH, AArch64 00/11] LSE atomics out-of-line Florian Weimer
@ 2018-09-27 13:08 ` Ramana Radhakrishnan
  2018-09-27 15:19   ` Alexander Graf
  2018-09-27 16:51   ` Richard Henderson
  2019-02-04 11:14 ` __libc_single_threaded variable for optimizing std::shared_ptr (was: [PATCH, AArch64 00/11] LSE atomics out-of-line) Florian Weimer
  13 siblings, 2 replies; 31+ messages in thread
From: Ramana Radhakrishnan @ 2018-09-27 13:08 UTC (permalink / raw)
  To: rth7680, gcc-patches; +Cc: agraf, matz, Richard Henderson, nd

On 26/09/2018 06:03, rth7680@gmail.com wrote:
> From: Richard Henderson <richard.henderson@linaro.org>
> 
> ARMv8.1 adds a (mandatory) Atomics extension, also known as the
> Large System Extension.  Deploying this extension at the OS level
> has proved challenging.
> 
> The following is the result of a conversation between myself,
> Alex Graf of SuSE, and Ramana Radhakrishnan of ARM, at last week's
> Linaro Connect in Vancouver.
> 
> The current state of the world is that one could distribute two
> different copies of a given shared library and place the LSE-enabled
> version in /lib64/atomics/ and it will be selected over the /lib64/
> version by ld.so when HWCAP_ATOMICS is present.
> 
> Alex's main concern with this is that (1) he doesn't want to
> distribute two copies of every library, or determine what a
> resonable subset would be and (2) this solution does not work
> for executables, e.g. mysql.
> 
> Ramana's main concern was to avoid the overhead of an indirect jump,
> especially in how that would affect the (non-)branch-prediction of
> the smallest implementations.
> 
> Therefore, I've created small out-of-line helpers that are directly
> linked into every library or executable that requires them.  There
> will be two direct branches, both of which will be well-predicted.
> 
> In the process, I discovered a number of places within the code
> where the existing implementation could be improved.  In particular:
> 
>   - the LSE patterns didn't use predicates or constraints that
>     match the actual instructions, requiring unnecessary splitting.
> 
>   - the non-LSE compare-and-swap can use an extending compare to
>     avoid requiring the input to have been previously extended.
> 
>   - TImode compare-and-swap was missing entirely.  This brings
>     aarch64 to parity with x86_64 wrt __sync_val_compare_and_swap.
> 
> There is a final patch that enables the new option by default.
> I am not necessarily expecting this to be merged upstream, but
> for the operating system to decide what the default should be.
> It might be that this should be a configure option, so as to
> make that OS choice easier, but I've just now thought of that.  ;-)
> 
> I'm going to have to rely on Alex and/or Ramana to perform
> testing on a system that supports LSE.
> 

Thanks for this patchset -

I'll give this a whirl in the next couple of days but don't expect 
results until Monday or so.

I do have an additional concern that I forgot to mention in Vancouver -

Thanks Wilco for reminding me that this now replaces a bunch of inline
instructions with effectively a library call, therefore clobbering a
whole bunch of caller-saved registers.

In which case I see 2 options.

-  maybe we should consider a private interface and restrict the 
registers that these files are compiled with to minimise the number of 
caller saved registers we trash.

- Alternatively we should consider an option to inline these at O2 or O3 
as we may just be trading the performance improvements we get with using 
the lse atomics for additional stacking and unstacking of caller saved 
registers in the main functions...

But anyway while we discuss that we'll have a look at testing and 
benchmarking this.


regards
Ramana

> 
> r~
> 
> 
> Richard Henderson (11):
>    aarch64: Simplify LSE cas generation
>    aarch64: Improve cas generation
>    aarch64: Improve swp generation
>    aarch64: Improve atomic-op lse generation
>    aarch64: Emit LSE st<op> instructions
>    Add visibility to libfunc constructors
>    Link static libgcc after shared libgcc for -shared-libgcc
>    aarch64: Add out-of-line functions for LSE atomics
>    aarch64: Implement -matomic-ool
>    aarch64: Implement TImode compare-and-swap
>    Enable -matomic-ool by default
> 
>   gcc/config/aarch64/aarch64-protos.h           |  20 +-
>   gcc/optabs-libfuncs.h                         |   2 +
>   gcc/common/config/aarch64/aarch64-common.c    |   6 +-
>   gcc/config/aarch64/aarch64.c                  | 480 ++++++--------
>   gcc/gcc.c                                     |   9 +-
>   gcc/optabs-libfuncs.c                         |  26 +-
>   .../atomic-comp-swap-release-acquire.c        |   2 +-
>   .../gcc.target/aarch64/atomic-inst-ldadd.c    |  18 +-
>   .../gcc.target/aarch64/atomic-inst-ldlogic.c  |  54 +-
>   .../gcc.target/aarch64/atomic-op-acq_rel.c    |   2 +-
>   .../gcc.target/aarch64/atomic-op-acquire.c    |   2 +-
>   .../gcc.target/aarch64/atomic-op-char.c       |   2 +-
>   .../gcc.target/aarch64/atomic-op-consume.c    |   2 +-
>   .../gcc.target/aarch64/atomic-op-imm.c        |   2 +-
>   .../gcc.target/aarch64/atomic-op-int.c        |   2 +-
>   .../gcc.target/aarch64/atomic-op-long.c       |   2 +-
>   .../gcc.target/aarch64/atomic-op-relaxed.c    |   2 +-
>   .../gcc.target/aarch64/atomic-op-release.c    |   2 +-
>   .../gcc.target/aarch64/atomic-op-seq_cst.c    |   2 +-
>   .../gcc.target/aarch64/atomic-op-short.c      |   2 +-
>   .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |   2 +-
>   .../atomic_cmp_exchange_zero_strong_1.c       |   2 +-
>   .../gcc.target/aarch64/sync-comp-swap.c       |   2 +-
>   .../gcc.target/aarch64/sync-op-acquire.c      |   2 +-
>   .../gcc.target/aarch64/sync-op-full.c         |   2 +-
>   libgcc/config/aarch64/lse.c                   | 280 ++++++++
>   gcc/config/aarch64/aarch64.opt                |   4 +
>   gcc/config/aarch64/atomics.md                 | 608 ++++++++++--------
>   gcc/config/aarch64/iterators.md               |   8 +-
>   gcc/config/aarch64/predicates.md              |  12 +
>   gcc/doc/invoke.texi                           |  14 +-
>   libgcc/config.host                            |   4 +
>   libgcc/config/aarch64/t-lse                   |  48 ++
>   33 files changed, 1050 insertions(+), 577 deletions(-)
>   create mode 100644 libgcc/config/aarch64/lse.c
>   create mode 100644 libgcc/config/aarch64/t-lse
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap
  2018-09-26  5:04 ` [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap rth7680
@ 2018-09-27 13:08   ` Matthew Malcomson
       [not found]   ` <3460dd10-4d9a-1def-3f9b-5f7a1afe5906@arm.com>
  2018-10-01 13:51   ` Matthew Malcomson
  2 siblings, 0 replies; 31+ messages in thread
From: Matthew Malcomson @ 2018-09-27 13:08 UTC (permalink / raw)
  To: rth7680, gcc-patches
  Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson, nd

Hi Richard,

I don't have any comment on the overall aim of the patch series, but in
this particular patch implementing casp it looks like you don't ensure
that the register pairs for casp are even-odd.

This is the restriction in the Arm Arm decode for casp variants as
  if Rs<0> == '1' then UnallocatedEncoding();
  if Rt<0> == '1' then UnallocatedEncoding();

Example testcase where this trips up:


#include <stdatomic.h>
#include <stdbool.h>

bool
foo (__int128 *ptr)
{
   register __int128 a __asm ("x3") = 0;
   __int128 b = 0;
   // This statement ensures "a" is indeed put into x3,x4
   asm ("" : "=r" (a) : "0" (a));
   return __sync_bool_compare_and_swap (ptr, b, a);
}



$ build-aarch64-none-elf/install/bin/aarch64-none-elf-gcc \
$   -matomic-ool \
$   -march=armv8.4-a \
$   -O3 \
$   -o test \
$   -c ~/test.c
/tmp/cc0dw2At.s: Assembler messages:
/tmp/cc0dw2At.s:15: Error: reg pair must start from even reg at operand 
1 -- `caspal x1,x2,x3,x4,[x0]'
gnu-work [14:01:14] $

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 00/11] LSE atomics out-of-line
  2018-09-27 13:08 ` Ramana Radhakrishnan
@ 2018-09-27 15:19   ` Alexander Graf
  2018-09-27 16:51   ` Richard Henderson
  1 sibling, 0 replies; 31+ messages in thread
From: Alexander Graf @ 2018-09-27 15:19 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: rth7680, gcc-patches, matz, Richard Henderson, nd, will.deacon



> Am 27.09.2018 um 15:07 schrieb Ramana Radhakrishnan <ramana.radhakrishnan@arm.com>:
> 
>> On 26/09/2018 06:03, rth7680@gmail.com wrote:
>> From: Richard Henderson <richard.henderson@linaro.org>
>> ARMv8.1 adds a (mandatory) Atomics extension, also known as the
>> Large System Extension.  Deploying this extension at the OS level
>> has proved challenging.
>> The following is the result of a conversation between myself,
>> Alex Graf of SuSE, and Ramana Radhakrishnan of ARM, at last week's
>> Linaro Connect in Vancouver.
>> The current state of the world is that one could distribute two
>> different copies of a given shared library and place the LSE-enabled
>> version in /lib64/atomics/ and it will be selected over the /lib64/
>> version by ld.so when HWCAP_ATOMICS is present.
>> Alex's main concern with this is that (1) he doesn't want to
>> distribute two copies of every library, or determine what a
>> reasonable subset would be and (2) this solution does not work
>> for executables, e.g. mysql.
>> Ramana's main concern was to avoid the overhead of an indirect jump,
>> especially in how that would affect the (non-)branch-prediction of
>> the smallest implementations.
>> Therefore, I've created small out-of-line helpers that are directly
>> linked into every library or executable that requires them.  There
>> will be two direct branches, both of which will be well-predicted.
>> In the process, I discovered a number of places within the code
>> where the existing implementation could be improved.  In particular:
>>  - the LSE patterns didn't use predicates or constraints that
>>    match the actual instructions, requiring unnecessary splitting.
>>  - the non-LSE compare-and-swap can use an extending compare to
>>    avoid requiring the input to have been previously extended.
>>  - TImode compare-and-swap was missing entirely.  This brings
>>    aarch64 to parity with x86_64 wrt __sync_val_compare_and_swap.
>> There is a final patch that enables the new option by default.
>> I am not necessarily expecting this to be merged upstream, but
>> for the operating system to decide what the default should be.
>> It might be that this should be a configure option, so as to
>> make that OS choice easier, but I've just now thought of that.  ;-)
>> I'm going to have to rely on Alex and/or Ramana to perform
>> testing on a system that supports LSE.
> 
> Thanks for this patchset -
> 
> I'll give this a whirl in the next couple of days but don't expect results until Monday or so.
> 
> I do have an additional concern that I forgot to mention in Vancouver -
> 
> Thanks Wilco for reminding me that this now replaces a bunch of inline instructions with effectively a library call therefore clobbering a whole bunch of caller saved registers.
> 
> In which case I see 2 options.
> 
> -  maybe we should consider a private interface and restrict the registers that these files are compiled with to minimise the number of caller saved registers we trash.
> 
> - Alternatively we should consider an option to inline these at O2 or O3 as we may just be trading the performance improvements we get with using the lse atomics

I talked to Will Deacon about lse atomics today a bit. Apparently, a key benefit that you get from using them is guaranteed forward progress when compared to an exclusives loop.

So IMHO even a tiny slowdown might be better than not progressing.

Another concern he brought up was that, due to the additional conditional code, a cmpxchg loop may become bigger and so converge more slowly than a native implementation, or not at all. I assume we can identify those cases later and solve them with ifuncs in the target code, though.


Alex

> for additional stacking and unstacking of caller saved registers in the main functions...
> 
> But anyway while we discuss that we'll have a look at testing and benchmarking this.
> 
> 
> regards
> Ramana
> 
>> r~
>> Richard Henderson (11):
>>   aarch64: Simplify LSE cas generation
>>   aarch64: Improve cas generation
>>   aarch64: Improve swp generation
>>   aarch64: Improve atomic-op lse generation
>>   aarch64: Emit LSE st<op> instructions
>>   Add visibility to libfunc constructors
>>   Link static libgcc after shared libgcc for -shared-libgcc
>>   aarch64: Add out-of-line functions for LSE atomics
>>   aarch64: Implement -matomic-ool
>>   aarch64: Implement TImode compare-and-swap
>>   Enable -matomic-ool by default
>>  gcc/config/aarch64/aarch64-protos.h           |  20 +-
>>  gcc/optabs-libfuncs.h                         |   2 +
>>  gcc/common/config/aarch64/aarch64-common.c    |   6 +-
>>  gcc/config/aarch64/aarch64.c                  | 480 ++++++--------
>>  gcc/gcc.c                                     |   9 +-
>>  gcc/optabs-libfuncs.c                         |  26 +-
>>  .../atomic-comp-swap-release-acquire.c        |   2 +-
>>  .../gcc.target/aarch64/atomic-inst-ldadd.c    |  18 +-
>>  .../gcc.target/aarch64/atomic-inst-ldlogic.c  |  54 +-
>>  .../gcc.target/aarch64/atomic-op-acq_rel.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-acquire.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-char.c       |   2 +-
>>  .../gcc.target/aarch64/atomic-op-consume.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-imm.c        |   2 +-
>>  .../gcc.target/aarch64/atomic-op-int.c        |   2 +-
>>  .../gcc.target/aarch64/atomic-op-long.c       |   2 +-
>>  .../gcc.target/aarch64/atomic-op-relaxed.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-release.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-seq_cst.c    |   2 +-
>>  .../gcc.target/aarch64/atomic-op-short.c      |   2 +-
>>  .../aarch64/atomic_cmp_exchange_zero_reg_1.c  |   2 +-
>>  .../atomic_cmp_exchange_zero_strong_1.c       |   2 +-
>>  .../gcc.target/aarch64/sync-comp-swap.c       |   2 +-
>>  .../gcc.target/aarch64/sync-op-acquire.c      |   2 +-
>>  .../gcc.target/aarch64/sync-op-full.c         |   2 +-
>>  libgcc/config/aarch64/lse.c                   | 280 ++++++++
>>  gcc/config/aarch64/aarch64.opt                |   4 +
>>  gcc/config/aarch64/atomics.md                 | 608 ++++++++++--------
>>  gcc/config/aarch64/iterators.md               |   8 +-
>>  gcc/config/aarch64/predicates.md              |  12 +
>>  gcc/doc/invoke.texi                           |  14 +-
>>  libgcc/config.host                            |   4 +
>>  libgcc/config/aarch64/t-lse                   |  48 ++
>>  33 files changed, 1050 insertions(+), 577 deletions(-)
>>  create mode 100644 libgcc/config/aarch64/lse.c
>>  create mode 100644 libgcc/config/aarch64/t-lse
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap
       [not found]   ` <3460dd10-4d9a-1def-3f9b-5f7a1afe5906@arm.com>
@ 2018-09-27 16:39     ` Richard Henderson
  2018-09-27 17:07       ` Matthew Malcomson
  0 siblings, 1 reply; 31+ messages in thread
From: Richard Henderson @ 2018-09-27 16:39 UTC (permalink / raw)
  To: Matthew Malcomson, rth7680, gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz

On 9/27/18 6:04 AM, Matthew Malcomson wrote:
> Hi Richard,
> 
> 
> On 26/09/18 06:03, rth7680@gmail.com wrote:
>> From: Richard Henderson <richard.henderson@linaro.org>
>>
>> This pattern will only be used with the __sync functions, because
>> we do not yet have a bare TImode atomic load.
>>
>>
> I don't have any comment on the overall aim of the patch series, but in
> this particular
> patch it looks like you don't ensure the register pairs for casp are
> even-odd.
> 
> This is the restriction in the Arm Arm decode for casp variants as
>  if Rs<0> == '1' then UnallocatedEncoding();
>  if Rt<0> == '1' then UnallocatedEncoding();

Oops.  I missed this bit when reading the docs.  Thanks.
I'll incorporate your even register class patch into the next round.


r~

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 00/11] LSE atomics out-of-line
  2018-09-27 13:08 ` Ramana Radhakrishnan
  2018-09-27 15:19   ` Alexander Graf
@ 2018-09-27 16:51   ` Richard Henderson
  2018-09-28  8:48     ` Ramana Radhakrishnan
  1 sibling, 1 reply; 31+ messages in thread
From: Richard Henderson @ 2018-09-27 16:51 UTC (permalink / raw)
  To: Ramana Radhakrishnan, rth7680, gcc-patches; +Cc: agraf, matz, nd

On 9/27/18 6:07 AM, Ramana Radhakrishnan wrote:
> I do have an additional concern that I forgot to mention in Vancouver -
> 
> Thanks Wilco for reminding me that this now replaces a bunch of inline
> instructions with effectively a library call therefore clobbering a whole bunch
> of caller saved registers.

We did talk about this in Vancouver, including perhaps providing a private
interface.  At the time you brushed it off and asked why I couldn't just write
the helpers in C.

I guess we can talk about a private interface after we see what the total
overhead is with it as it is.


r~

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap
  2018-09-27 16:39     ` Richard Henderson
@ 2018-09-27 17:07       ` Matthew Malcomson
  0 siblings, 0 replies; 31+ messages in thread
From: Matthew Malcomson @ 2018-09-27 17:07 UTC (permalink / raw)
  To: Richard Henderson, rth7680, gcc-patches
  Cc: ramana.radhakrishnan, agraf, matz, nd

On 27/09/18 17:32, Richard Henderson wrote:
> On 9/27/18 6:04 AM, Matthew Malcomson wrote:
>> Hi Richard,
>>
>>
>> On 26/09/18 06:03, rth7680@gmail.com wrote:
>>> From: Richard Henderson <richard.henderson@linaro.org>
>>>
>>> This pattern will only be used with the __sync functions, because
>>> we do not yet have a bare TImode atomic load.
>>>
>>>
>> I don't have any comment on the overall aim of the patch series, but in
>> this particular
>> patch it looks like you doesn't ensure the register pairs for casp are
>> even-odd.
>>
>> This is the restriction in the Arm Arm decode for casp variants as
>>   if Rs<0> == '1' then UnallocatedEncoding();
>>   if Rt<0> == '1' then UnallocatedEncoding();
> Oops.  I missed this bit when reading the docs.  Thanks.
> I'll incorporate your even register class patch into the next round.
>
>
> r~

Just a heads up on that register class patch (because its use is not
very intuitive):

To allow any valid register pair combination between operands you need
to have all combinations of the two constraints in your pattern's
alternatives.

So e.g. the "out" operand might have constraints like "Uep,Uex,Uep,Uex" 
while the "desired" operand would have "Uep,Uep,Uex,Uex".

[It's ugly, but the best of the options I found].

M

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 00/11] LSE atomics out-of-line
  2018-09-27 16:51   ` Richard Henderson
@ 2018-09-28  8:48     ` Ramana Radhakrishnan
  0 siblings, 0 replies; 31+ messages in thread
From: Ramana Radhakrishnan @ 2018-09-28  8:48 UTC (permalink / raw)
  To: Richard Henderson, Ramana Radhakrishnan, rth7680, gcc-patches
  Cc: agraf, matz, nd

On 27/09/2018 17:40, Richard Henderson wrote:
> On 9/27/18 6:07 AM, Ramana Radhakrishnan wrote:
>> I do have an additional concern that I forgot to mention in Vancouver -
>>
>> Thanks Wilco for reminding me that this now replaces a bunch of inline
>> instructions with effectively a library call therefore clobbering a whole bunch
>> of caller saved registers.
> 
> We did talk about this in Vancouver, including perhaps providing a private
> interface.  At the time you brushed it off and asked why I couldn't just write
> the helpers in C.
> 

My apologies, yes you are right, we did talk about it.

> I guess we can talk about a private interface after we see what the total
> overhead is with it as it is.

Indeed.

Ramana

> 
> 
> r~
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics
  2018-09-26  5:04 ` [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics rth7680
  2018-09-26  9:01   ` Florian Weimer
@ 2018-09-28 16:29   ` Ramana Radhakrishnan
  1 sibling, 0 replies; 31+ messages in thread
From: Ramana Radhakrishnan @ 2018-09-28 16:29 UTC (permalink / raw)
  To: rth7680, gcc-patches; +Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson

On 26/09/2018 06:03, rth7680@gmail.com wrote:
> From: Richard Henderson <richard.henderson@linaro.org>
> 
> This is the libgcc part of the interface -- providing the functions.
> Rationale is provided at the top of libgcc/config/aarch64/lse.c.
> 
> 	* config/aarch64/lse.c: New file.
> 	* config/aarch64/t-lse: New file.
> 	* config.host: Add t-lse to all aarch64 tuples.
> ---
>   libgcc/config/aarch64/lse.c | 258 ++++++++++++++++++++++++++++++++++++
>   libgcc/config.host          |   4 +
>   libgcc/config/aarch64/t-lse |  44 ++++++
>   3 files changed, 306 insertions(+)
>   create mode 100644 libgcc/config/aarch64/lse.c
>   create mode 100644 libgcc/config/aarch64/t-lse
> 
> diff --git a/libgcc/config/aarch64/lse.c b/libgcc/config/aarch64/lse.c
> new file mode 100644
> index 00000000000..20f4bde741f
> --- /dev/null
> +++ b/libgcc/config/aarch64/lse.c
> @@ -0,0 +1,258 @@
> +/* Out-of-line LSE atomics for AArch64 architecture.
> +   Copyright (C) 2018 Free Software Foundation, Inc.
> +   Contributed by Linaro Ltd.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +Under Section 7 of GPL version 3, you are granted additional
> +permissions described in the GCC Runtime Library Exception, version
> +3.1, as published by the Free Software Foundation.
> +
> +You should have received a copy of the GNU General Public License and
> +a copy of the GCC Runtime Library Exception along with this program;
> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +<http://www.gnu.org/licenses/>.  */
> +
> +/*
> + * The problem that we are trying to solve is operating system deployment
> + * of ARMv8.1-Atomics, also known as Large System Extensions (LSE).
> + *
> + * There are a number of potential solutions for this problem which have
> + * been proposed and rejected for various reasons.  To recap:
> + *
> + * (1) Multiple builds.  The dynamic linker will examine /lib64/atomics/
> + * if HWCAP_ATOMICS is set, allowing entire libraries to be overwritten.
> + * However, not all Linux distributions are happy with multiple builds,
> + * and anyway it has no effect on main applications.
> + *
> + * (2) IFUNC.  We could put these functions into libgcc_s.so, and have
> + * a single copy of each function for all DSOs.  However, ARM is concerned
> + * that the branch-to-indirect-branch that is implied by using a PLT,
> + * as required by IFUNC, is too much overhead for smaller cpus.
> + *
> + * (3) Statically predicted direct branches.  This is the approach that
> + * is taken here.  These functions are linked into every DSO that uses them.
> + * All of the symbols are hidden, so that the functions are called via a
> + * direct branch.  The choice of LSE vs non-LSE is done via one byte load
> + * followed by a well-predicted direct branch.  The functions are compiled
> + * separately to minimize code size.
> + */
> +
> +/* Define or declare the symbol gating the LSE implementations.  */
> +#ifndef L_have_atomics
> +extern
> +#endif
> +_Bool __aa64_have_atomics __attribute__((visibility("hidden"), nocommon));


This needs to be able to build against glibc versions that do not have 
HWCAP_ATOMICS available in the headers.

Thus initialize it to 0?

> +
> +/* The branch controlled by this test should be easily predicted, in that
> +   it will, after constructors, always branch the same way.  The expectation
> +   is that systems that implement ARMv8.1-Atomics are "beefier" than those
> +   that omit the extension.  By arranging for the fall-through path to use
> +   load-store-exclusive insns, we aid the branch predictor of the
> +   smallest cpus.  */
> +#define have_atomics  __builtin_expect(__aa64_have_atomics, 0)
> +
> +#ifdef L_have_atomics
> +/* Disable initialization of __aa64_have_atomics during bootstrap.  */
> +# ifndef inhibit_libc
> +#  include <sys/auxv.h>
> +


> +static void __attribute__((constructor))
> +init_have_atomics(void)
> +{
> +  unsigned long hwcap = getauxval(AT_HWCAP);
> +  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
> +}

And then have the constructor run only when HWCAP_ATOMICS is defined?

i.e.

#ifdef HWCAP_ATOMICS

//constructor

#endif
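
Putting the two suggestions together, a minimal sketch of what the guarded
initialization could look like (this is only an illustration, not the
actual patch):

#include <sys/auxv.h>

/* Zero-initialize so the LL/SC path is used when the headers do not
   provide HWCAP_ATOMICS or before the constructor has run.  */
_Bool __aa64_have_atomics __attribute__ ((visibility ("hidden"), nocommon)) = 0;

#ifdef HWCAP_ATOMICS
static void __attribute__ ((constructor))
init_have_atomics (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  __aa64_have_atomics = (hwcap & HWCAP_ATOMICS) != 0;
}
#endif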



> +# endif /* inhibit_libc */
> +#else
> +
> +/* Tell the assembler to accept LSE instructions.  */
> +asm(".arch armv8-a+lse");

Thankfully I think this is now well supported enough in most places that
we don't need probe tests for this .. :)

So the tests that I'm running now will be good enough for an armv8-a run.
I need to track down a machine internally with the right sort of headers
for a v8.1-A run; that's not going to happen till next week.

regards
Ramana

> +
> +/* Turn size and memory model defines into mnemonic fragments.  */
> +#if SIZE == 1
> +# define S     "b"
> +# define MASK  ", uxtb"
> +#elif SIZE == 2
> +# define S     "h"
> +# define MASK  ", uxth"
> +#elif SIZE == 4 || SIZE == 8
> +# define S     ""
> +# define MASK  ""
> +#else
> +# error
> +#endif
> +
> +#if SIZE < 8
> +# define T  unsigned int
> +# define W  "w"
> +#else
> +# define T  unsigned long long
> +# define W  ""
> +#endif
> +
> +#if MODEL == 1
> +# define SUFF  _relax
> +# define A     ""
> +# define L     ""
> +#elif MODEL == 2
> +# define SUFF  _acq
> +# define A     "a"
> +# define L     ""
> +#elif MODEL == 3
> +# define SUFF  _rel
> +# define A     ""
> +# define L     "l"
> +#elif MODEL == 4
> +# define SUFF  _acq_rel
> +# define A     "a"
> +# define L     "l"
> +#else
> +# error
> +#endif
> +
> +#define NAME2(B, S, X)  __aa64_ ## B ## S ## X
> +#define NAME1(B, S, X)  NAME2(B, S, X)
> +#define NAME(BASE)	NAME1(BASE, SIZE, SUFF)
> +
> +#define str1(S)  #S
> +#define str(S)   str1(S)
> +
> +#ifdef L_cas
> +T NAME(cas)(T cmp, T new, T *ptr) __attribute__((visibility("hidden")));
> +T NAME(cas)(T cmp, T new, T *ptr)
> +{
> +  T old;
> +  unsigned tmp;
> +
> +  if (have_atomics)
> +    __asm__("cas" A L S " %"W"0, %"W"2, %1"
> +            : "=r"(old), "+m"(*ptr) : "r"(new), "0"(cmp));
> +  else
> +    __asm__(
> +	"0: "
> +	"ld" A "xr"S" %"W"0, %1\n\t"
> +	"cmp %"W"0, %"W"4" MASK "\n\t"
> +	"bne 1f\n\t"
> +	"st" L "xr"S" %w2, %"W"3, %1\n\t"
> +	"cbnz %w2, 0b\n"
> +	"1:"
> +	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new), "r"(cmp));
> +
> +  return old;
> +}
> +#endif
> +
> +#ifdef L_swp
> +T NAME(swp)(T new, T *ptr) __attribute__((visibility("hidden")));
> +T NAME(swp)(T new, T *ptr)
> +{
> +  T old;
> +  unsigned tmp;
> +
> +  if (have_atomics)
> +    __asm__("swp" A L S " %"W"2, %"W"0, %1"
> +            : "=r"(old), "+m"(*ptr) : "r"(new));
> +  else
> +    __asm__(
> +	"0: "
> +	"ld" A "xr"S" %"W"0, %1\n\t"
> +	"st" L "xr"S" %w2, %"W"3, %1\n\t"
> +	"cbnz %w2, 0b\n"
> +	"1:"
> +	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(new));
> +
> +  return old;
> +}
> +#endif
> +
> +#if defined(L_ldadd) || defined(L_ldclr) \
> +    || defined(L_ldeor) || defined(L_ldset)
> +
> +#ifdef L_ldadd
> +#define LDOP  ldadd
> +#define OP    add
> +#elif defined(L_ldclr)
> +#define LDOP  ldclr
> +#define OP    bic
> +#elif defined(L_ldeor)
> +#define LDOP  ldeor
> +#define OP    eor
> +#elif defined(L_ldset)
> +#define LDOP  ldset
> +#define OP    orr
> +#else
> +#error
> +#endif
> +
> +T NAME(LDOP)(T val, T *ptr) __attribute__((visibility("hidden")));
> +T NAME(LDOP)(T val, T *ptr)
> +{
> +  T old;
> +  unsigned tmp;
> +
> +  if (have_atomics)
> +    __asm__(str(LDOP) A L S " %"W"2, %"W"0, %1"
> +            : "=r"(old), "+m"(*ptr) : "r"(val));
> +  else
> +    __asm__(
> +	"0: "
> +	"ld" A "xr"S" %"W"0, %1\n\t"
> +	str(OP) " %"W"2, %"W"0, %"W"3\n\t"
> +	"st" L "xr"S" %w2, %"W"2, %1\n\t"
> +	"cbnz %w2, 0b\n"
> +	"1:"
> +	: "=&r"(old), "+m"(*ptr), "=&r"(tmp) : "r"(val));
> +
> +  return old;
> +}
> +#endif
> +
> +#if defined(L_stadd) || defined(L_stclr) \
> +    || defined(L_steor) || defined(L_stset)
> +
> +#ifdef L_stadd
> +#define STOP  stadd
> +#define OP    add
> +#elif defined(L_stclr)
> +#define STOP  stclr
> +#define OP    bic
> +#elif defined(L_steor)
> +#define STOP  steor
> +#define OP    eor
> +#elif defined(L_stset)
> +#define STOP  stset
> +#define OP    orr
> +#else
> +#error
> +#endif
> +
> +void NAME(STOP)(T val, T *ptr) __attribute__((visibility("hidden")));
> +void NAME(STOP)(T val, T *ptr)
> +{
> +  unsigned tmp;
> +
> +  if (have_atomics)
> +    __asm__(str(STOP) L S " %"W"1, %0" : "+m"(*ptr) : "r"(val));
> +  else
> +    __asm__(
> +	"0: "
> +	"ldxr"S" %"W"1, %0\n\t"
> +	str(OP) " %"W"1, %"W"1, %"W"2\n\t"
> +	"st" L "xr"S" %w1, %"W"1, %0\n\t"
> +	"cbnz %w1, 0b\n"
> +	"1:"
> +	: "+m"(*ptr), "=&r"(tmp) : "r"(val));
> +}
> +#endif
> +#endif /* L_have_atomics */
> diff --git a/libgcc/config.host b/libgcc/config.host
> index 029f6569caf..2c4a05d69c5 100644
> --- a/libgcc/config.host
> +++ b/libgcc/config.host
> @@ -340,23 +340,27 @@ aarch64*-*-elf | aarch64*-*-rtems*)
>   	extra_parts="$extra_parts crtbegin.o crtend.o crti.o crtn.o"
>   	extra_parts="$extra_parts crtfastmath.o"
>   	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
> +	tmake_file="${tmake_file} ${cpu_type}/t-lse"
>   	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
>   	md_unwind_header=aarch64/aarch64-unwind.h
>   	;;
>   aarch64*-*-freebsd*)
>   	extra_parts="$extra_parts crtfastmath.o"
>   	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
> +	tmake_file="${tmake_file} ${cpu_type}/t-lse"
>   	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
>   	md_unwind_header=aarch64/freebsd-unwind.h
>   	;;
>   aarch64*-*-fuchsia*)
>   	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
> +	tmake_file="${tmake_file} ${cpu_type}/t-lse"
>   	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp"
>   	;;
>   aarch64*-*-linux*)
>   	extra_parts="$extra_parts crtfastmath.o"
>   	md_unwind_header=aarch64/linux-unwind.h
>   	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
> +	tmake_file="${tmake_file} ${cpu_type}/t-lse"
>   	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
>   	;;
>   alpha*-*-linux*)
> diff --git a/libgcc/config/aarch64/t-lse b/libgcc/config/aarch64/t-lse
> new file mode 100644
> index 00000000000..e862b0c2448
> --- /dev/null
> +++ b/libgcc/config/aarch64/t-lse
> @@ -0,0 +1,44 @@
> +# Out-of-line LSE atomics for AArch64 architecture.
> +# Copyright (C) 2018 Free Software Foundation, Inc.
> +# Contributed by Linaro Ltd.
> +#
> +# This file is part of GCC.
> +#
> +# GCC is free software; you can redistribute it and/or modify it
> +# under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 3, or (at your option)
> +# any later version.
> +#
> +# GCC is distributed in the hope that it will be useful, but
> +# WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with GCC; see the file COPYING3.  If not see
> +# <http://www.gnu.org/licenses/>.
> +
> +# CAS, Swap, Load-and-operate have 4 sizes and 4 memory models
> +S1 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), cas swp ldadd ldclr ldeor ldset))
> +O1 := $(foreach m, 1 2 3 4, $(addsuffix _$(m)$(objext), $(S1)))
> +
> +# Store-and-operate has 4 sizes but only 2 memory models (relaxed, release).
> +S2 := $(foreach s, 1 2 4 8, $(addsuffix _$(s), stadd stclr steor stset))
> +O2 := $(foreach m, 1 3, $(addsuffix _$(m)$(objext), $(S2)))
> +
> +LSE_OBJS := $(O1) $(O2)
> +
> +libgcc-objects += $(LSE_OBJS) have_atomic$(objext)
> +
> +empty      =
> +space      = $(empty) $(empty)
> +PAT_SPLIT  = $(subst _,$(space),$(*F))
> +PAT_BASE   = $(word 1,$(PAT_SPLIT))
> +PAT_N      = $(word 2,$(PAT_SPLIT))
> +PAT_M      = $(word 3,$(PAT_SPLIT))
> +
> +have_atomic$(objext): $(srcdir)/config/aarch64/lse.c
> +	$(gcc_compile) -DL_have_atomics -c $<
> +
> +$(LSE_OBJS): $(srcdir)/config/aarch64/lse.c
> +	$(gcc_compile) -DL_$(PAT_BASE) -DSIZE=$(PAT_N) -DMODEL=$(PAT_M) -c $<
> 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap
  2018-09-26  5:04 ` [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap rth7680
  2018-09-27 13:08   ` Matthew Malcomson
       [not found]   ` <3460dd10-4d9a-1def-3f9b-5f7a1afe5906@arm.com>
@ 2018-10-01 13:51   ` Matthew Malcomson
  2 siblings, 0 replies; 31+ messages in thread
From: Matthew Malcomson @ 2018-10-01 13:51 UTC (permalink / raw)
  To: rth7680, gcc-patches
  Cc: ramana.radhakrishnan, agraf, matz, Richard Henderson, nd

Hi Richard,

On 26/09/18 06:03, rth7680@gmail.com wrote:
> From: Richard Henderson <richard.henderson@linaro.org>
>
> This pattern will only be used with the __sync functions, because
> we do not yet have a bare TImode atomic load.

Does this mean that the libatomic `defined(atomic_compare_exchange_n)`
checks would return false for 16-byte sizes?
(the acinclude.m4 file checks for __atomic_compare_exchange_n)

You would know better than I, but if that's the case it seems that the
atomic_{load,store}_16 implementations in libatomic would still use the
locking ABI, and e.g. atomic_load_16 could be interrupted by a concurrent
CASP instruction, producing an incorrect value.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* __libc_single_threaded variable for optimizing std::shared_ptr (was: [PATCH, AArch64 00/11] LSE atomics out-of-line)
  2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
                   ` (12 preceding siblings ...)
  2018-09-27 13:08 ` Ramana Radhakrishnan
@ 2019-02-04 11:14 ` Florian Weimer
  2019-02-04 12:15   ` Jonathan Wakely
  13 siblings, 1 reply; 31+ messages in thread
From: Florian Weimer @ 2019-02-04 11:14 UTC (permalink / raw)
  To: rth7680
  Cc: gcc-patches, ramana.radhakrishnan, agraf, matz,
	Richard Henderson, libstdc++

* Richard Henderson:

> Therefore, I've created small out-of-line helpers that are directly
> linked into every library or executable that requires them.  There
> will be two direct branches, both of which will be well-predicted.

This work inspired me to put together something that provides a similar
hidden variable, comparable to __aa64_have_atomics, to libc_nonshared.a
in glibc:

  <https://sourceware.org/ml/libc-alpha/2019-02/msg00073.html>

I hope it can eventually be used to dynamically optimize the use of
atomics in the std::shared_ptr implementation in libstdc++.

For a generic optimization of all atomics, this is not suitable because
even a single-threaded process can have MAP_SHARED mappings and will
have to use atomics there.
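
As a rough sketch of the kind of gating this could enable for a
process-private reference count (illustrative only; the variable's exact
name and type are whatever the glibc proposal ends up defining):

#include <stdatomic.h>
#include <stdbool.h>

extern bool __libc_single_threaded;   /* assumed shape of the proposed variable */

/* Bump a process-private reference count, skipping the atomic RMW when
   the process is known to be single-threaded.  Counters in MAP_SHARED
   memory must not take this shortcut, for the reason given above.  */
static inline void
ref_increment (atomic_long *count)
{
  if (__libc_single_threaded)
    {
      long v = atomic_load_explicit (count, memory_order_relaxed);
      atomic_store_explicit (count, v + 1, memory_order_relaxed);
    }
  else
    atomic_fetch_add_explicit (count, 1, memory_order_relaxed);
}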

Thanks,
Florian

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: __libc_single_threaded variable for optimizing std::shared_ptr (was: [PATCH, AArch64 00/11] LSE atomics out-of-line)
  2019-02-04 11:14 ` __libc_single_threaded variable for optimizing std::shared_ptr (was: [PATCH, AArch64 00/11] LSE atomics out-of-line) Florian Weimer
@ 2019-02-04 12:15   ` Jonathan Wakely
  0 siblings, 0 replies; 31+ messages in thread
From: Jonathan Wakely @ 2019-02-04 12:15 UTC (permalink / raw)
  To: Florian Weimer
  Cc: rth7680, gcc-patches, ramana.radhakrishnan, agraf, matz,
	Richard Henderson, libstdc++

On 04/02/19 12:13 +0100, Florian Weimer wrote:
>* Richard Henderson:
>
>> Therefore, I've created small out-of-line helpers that are directly
>> linked into every library or executable that requires them.  There
>> will be two direct branches, both of which will be well-predicted.
>
>This work inspired me to put together something that provides a similar
>hidden variable, comparable to __aa64_have_atomics, to libc_nonshared.a
>in glibc:
>
>  <https://sourceware.org/ml/libc-alpha/2019-02/msg00073.html>
>
>I hope it can eventually be used to dynamically optimize the use of
>atomics in the std::shared_ptr implementation in libstdc++.

This makes me very happy. Thanks, Florian!

>For a generic optimization of all atomics, this is not suitable because
>even a single-threaded process can have MAP_SHARED mappings and will
>have to use atomics there.
>
>Thanks,
>Florian

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2019-02-04 12:15 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-26  5:04 [PATCH, AArch64 00/11] LSE atomics out-of-line rth7680
2018-09-26  5:04 ` [PATCH, AArch64 04/11] aarch64: Improve atomic-op lse generation rth7680
2018-09-26  5:04 ` [PATCH, AArch64 09/11] aarch64: Implement -matomic-ool rth7680
2018-09-26  5:04 ` [PATCH, AArch64 11/11] Enable -matomic-ool by default rth7680
2018-09-26  5:04 ` [PATCH, AArch64 06/11] Add visibility to libfunc constructors rth7680
2018-09-26  5:04 ` [PATCH, AArch64 03/11] aarch64: Improve swp generation rth7680
2018-09-26  5:04 ` [PATCH, AArch64 05/11] aarch64: Emit LSE st<op> instructions rth7680
2018-09-26  5:04 ` [PATCH, AArch64 07/11] Link static libgcc after shared libgcc for -shared-libgcc rth7680
2018-09-26 16:55   ` Joseph Myers
2018-09-26 16:57     ` Richard Henderson
2018-09-26  5:04 ` [PATCH, AArch64 10/11] aarch64: Implement TImode compare-and-swap rth7680
2018-09-27 13:08   ` Matthew Malcomson
     [not found]   ` <3460dd10-4d9a-1def-3f9b-5f7a1afe5906@arm.com>
2018-09-27 16:39     ` Richard Henderson
2018-09-27 17:07       ` Matthew Malcomson
2018-10-01 13:51   ` Matthew Malcomson
2018-09-26  5:04 ` [PATCH, AArch64 01/11] aarch64: Simplify LSE cas generation rth7680
2018-09-26  5:04 ` [PATCH, AArch64 08/11] aarch64: Add out-of-line functions for LSE atomics rth7680
2018-09-26  9:01   ` Florian Weimer
2018-09-26 14:33     ` Richard Henderson
2018-09-26 14:36       ` Florian Weimer
2018-09-26 14:37         ` Richard Henderson
2018-09-28 16:29   ` Ramana Radhakrishnan
2018-09-26  7:40 ` [PATCH, AArch64 02/11] aarch64: Improve cas generation rth7680
2018-09-26  9:22 ` [PATCH, AArch64 00/11] LSE atomics out-of-line Florian Weimer
2018-09-26 13:05   ` Michael Matz
2018-09-27 13:08 ` Ramana Radhakrishnan
2018-09-27 15:19   ` Alexander Graf
2018-09-27 16:51   ` Richard Henderson
2018-09-28  8:48     ` Ramana Radhakrishnan
2019-02-04 11:14 ` __libc_single_threaded variable for optimizing std::shared_ptr (was: [PATCH, AArch64 00/11] LSE atomics out-of-line) Florian Weimer
2019-02-04 12:15   ` Jonathan Wakely
