public inbox for gcc-patches@gcc.gnu.org
* [CFT] s390: Convert from sync to atomic optabs
From: Richard Henderson @ 2012-07-29 21:32 UTC
  To: uweigand, Andreas.Krebbel; +Cc: gcc-patches

Tested only as far as cross-compile.  I had a browse through
objdump of libatomic for a brief sanity check.

Can you please test on real hw and report back?


r~
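
For context, the two builtin families the optabs serve (a minimal
sketch, not part of the patch): the old sync optabs back the __sync_*
builtins, while the atomic optabs also carry explicit memory-model
arguments.

  /* Served by the old sync_compare_and_swap<mode> expander.  */
  int old_style (int *p, int oldv, int newv)
  {
    return __sync_bool_compare_and_swap (p, oldv, newv);
  }

  /* Served by the new atomic_compare_and_swap<mode> expander.  */
  int new_style (int *p, int *expected, int newv)
  {
    return __atomic_compare_exchange_n (p, expected, newv, /*weak=*/0,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  }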

---
 gcc/config/s390/s390-protos.h |    3 +-
 gcc/config/s390/s390.c        |   90 +++++-----
 gcc/config/s390/s390.md       |  373 ++++++++++++++++++++++++++++-------------
 3 files changed, 308 insertions(+), 158 deletions(-)

diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index 4f1eb42..79673d6 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -85,7 +85,8 @@ extern void s390_expand_setmem (rtx, rtx, rtx);
 extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx);
 extern bool s390_expand_addcc (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
 extern bool s390_expand_insv (rtx, rtx, rtx, rtx);
-extern void s390_expand_cs_hqi (enum machine_mode, rtx, rtx, rtx, rtx);
+extern void s390_expand_cs_hqi (enum machine_mode, rtx, rtx, rtx,
+				rtx, rtx, bool);
 extern void s390_expand_atomic (enum machine_mode, enum rtx_code,
 				rtx, rtx, rtx, bool);
 extern rtx s390_return_addr_rtx (int, rtx);
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index f72f49f..0685c6c 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -896,10 +896,12 @@ s390_emit_compare (enum rtx_code code, rtx op0, rtx op1)
    conditional branch testing the result.  */
 
 static rtx
-s390_emit_compare_and_swap (enum rtx_code code, rtx old, rtx mem, rtx cmp, rtx new_rtx)
+s390_emit_compare_and_swap (enum rtx_code code, rtx old, rtx mem,
+			    rtx cmp, rtx new_rtx)
 {
-  emit_insn (gen_sync_compare_and_swapsi (old, mem, cmp, new_rtx));
-  return s390_emit_compare (code, gen_rtx_REG (CCZ1mode, CC_REGNUM), const0_rtx);
+  emit_insn (gen_atomic_compare_and_swapsi_internal (old, mem, cmp, new_rtx));
+  return s390_emit_compare (code, gen_rtx_REG (CCZ1mode, CC_REGNUM),
+			    const0_rtx);
 }
 
 /* Emit a jump instruction to TARGET.  If COND is NULL_RTX, emit an
@@ -4720,79 +4722,79 @@ init_alignment_context (struct alignment_context *ac, rtx mem,
 }
 
 /* Expand an atomic compare and swap operation for HImode and QImode.  MEM is
-   the memory location, CMP the old value to compare MEM with and NEW_RTX the value
-   to set if CMP == MEM.
-   CMP is never in memory for compare_and_swap_cc because
-   expand_bool_compare_and_swap puts it into a register for later compare.  */
+   the memory location, CMP the old value to compare MEM with and NEW_RTX the
+   value to set if CMP == MEM.  */
 
 void
-s390_expand_cs_hqi (enum machine_mode mode, rtx target, rtx mem, rtx cmp, rtx new_rtx)
+s390_expand_cs_hqi (enum machine_mode mode, rtx btarget, rtx vtarget, rtx mem,
+		    rtx cmp, rtx new_rtx, bool is_weak)
 {
   struct alignment_context ac;
   rtx cmpv, newv, val, resv, cc;
   rtx res = gen_reg_rtx (SImode);
-  rtx csloop = gen_label_rtx ();
-  rtx csend = gen_label_rtx ();
+  rtx csloop = NULL, csend = NULL;
 
-  gcc_assert (register_operand (target, VOIDmode));
+  gcc_assert (register_operand (vtarget, VOIDmode));
   gcc_assert (MEM_P (mem));
 
   init_alignment_context (&ac, mem, mode);
 
   /* Shift the values to the correct bit positions.  */
-  if (!(ac.aligned && MEM_P (cmp)))
-    cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
-  if (!(ac.aligned && MEM_P (new_rtx)))
-    new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);
+  cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
+  new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);
 
   /* Load full word.  Subsequent loads are performed by CS.  */
   val = expand_simple_binop (SImode, AND, ac.memsi, ac.modemaski,
 			     NULL_RTX, 1, OPTAB_DIRECT);
 
   /* Start CS loop.  */
-  emit_label (csloop);
+  if (!is_weak)
+    {
+      /* Begin assuming success.  */
+      emit_move_insn (btarget, const1_rtx);
+
+      csloop = gen_label_rtx ();
+      csend = gen_label_rtx ();
+      emit_label (csloop);
+    }
+
   /* val = "<mem>00..0<mem>"
    * cmp = "00..0<cmp>00..0"
    * new = "00..0<new>00..0"
    */
 
   /* Patch cmp and new with val at correct position.  */
-  if (ac.aligned && MEM_P (cmp))
+  cmpv = force_reg (SImode, expand_simple_binop (SImode, IOR, cmp, val,
+						 NULL_RTX, 1, OPTAB_DIRECT));
+  newv = force_reg (SImode, expand_simple_binop (SImode, IOR, new_rtx, val,
+						 NULL_RTX, 1, OPTAB_DIRECT));
+
+  if (is_weak)
     {
-      cmpv = force_reg (SImode, val);
-      store_bit_field (cmpv, GET_MODE_BITSIZE (mode), 0,
-		       0, 0, SImode, cmp);
+      cc = s390_emit_compare_and_swap (NE, res, ac.memsi, cmpv, newv);
+      emit_insn (gen_cstorecc4 (btarget, cc, XEXP (cc, 0), XEXP (cc, 1)));
     }
   else
-    cmpv = force_reg (SImode, expand_simple_binop (SImode, IOR, cmp, val,
-						   NULL_RTX, 1, OPTAB_DIRECT));
-  if (ac.aligned && MEM_P (new_rtx))
     {
-      newv = force_reg (SImode, val);
-      store_bit_field (newv, GET_MODE_BITSIZE (mode), 0,
-		       0, 0, SImode, new_rtx);
-    }
-  else
-    newv = force_reg (SImode, expand_simple_binop (SImode, IOR, new_rtx, val,
-						   NULL_RTX, 1, OPTAB_DIRECT));
+      /* Jump to end if we're done (likely?).  */
+      cc = s390_emit_compare_and_swap (EQ, res, ac.memsi, cmpv, newv);
+      s390_emit_jump (csend, cc);
 
-  /* Jump to end if we're done (likely?).  */
-  s390_emit_jump (csend, s390_emit_compare_and_swap (EQ, res, ac.memsi,
-						     cmpv, newv));
+      /* Check for changes outside mode, and loop internally if so.  */
+      resv = expand_simple_binop (SImode, AND, res, ac.modemaski,
+			          NULL_RTX, 1, OPTAB_DIRECT);
+      cc = s390_emit_compare (NE, resv, val);
+      emit_move_insn (val, resv);
+      s390_emit_jump (csloop, cc);
 
-  /* Check for changes outside mode.  */
-  resv = expand_simple_binop (SImode, AND, res, ac.modemaski,
-			      NULL_RTX, 1, OPTAB_DIRECT);
-  cc = s390_emit_compare (NE, resv, val);
-  emit_move_insn (val, resv);
-  /* Loop internal if so.  */
-  s390_emit_jump (csloop, cc);
-
-  emit_label (csend);
+      /* Failed.  */
+      emit_move_insn (btarget, const0_rtx);
+      emit_label (csend);
+    }
 
   /* Return the correct part of the bitfield.  */
-  convert_move (target, expand_simple_binop (SImode, LSHIFTRT, res, ac.shift,
-					     NULL_RTX, 1, OPTAB_DIRECT), 1);
+  convert_move (vtarget, expand_simple_binop (SImode, LSHIFTRT, res, ac.shift,
+					      NULL_RTX, 1, OPTAB_DIRECT), 1);
 }
 
 /* Expand an atomic operation CODE of mode MODE.  MEM is the memory location
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 096f266..f74d0b2 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -84,6 +84,7 @@
 
    ; Atomic Support
    UNSPEC_MB
+   UNSPEC_MOVA
 
    ; TLS relocation specifiers
    UNSPEC_TLSGD
@@ -349,21 +350,19 @@
 (define_mode_iterator DD_DF [DF DD])
 (define_mode_iterator TD_TF [TF TD])
 
-;; This mode iterator allows 31-bit and 64-bit TDSI patterns to be generated
-;; from the same template.
-(define_mode_iterator TDSI [(TI "TARGET_64BIT") DI SI])
-
 ;; These mode iterators allow 31-bit and 64-bit GPR patterns to be generated
 ;; from the same template.
 (define_mode_iterator GPR [(DI "TARGET_ZARCH") SI])
+(define_mode_iterator DGPR [(TI "TARGET_ZARCH") DI SI])
 (define_mode_iterator DSI [DI SI])
+(define_mode_iterator TDI [TI DI])
 
 ;; These mode iterators allow :P to be used for patterns that operate on
 ;; pointer-sized quantities.  Exactly one of the two alternatives will match.
 (define_mode_iterator P [(DI "TARGET_64BIT") (SI "!TARGET_64BIT")])
 
-;; These macros refer to the actual word_mode of the configuration. This is equal
-;; to Pmode except on 31-bit machines in zarch mode.
+;; These macros refer to the actual word_mode of the configuration.
+;; This is equal to Pmode except on 31-bit machines in zarch mode.
 (define_mode_iterator DW [(TI "TARGET_ZARCH") (DI "!TARGET_ZARCH")])
 (define_mode_iterator W  [(DI "TARGET_ZARCH") (SI "!TARGET_ZARCH")])
 
@@ -379,6 +378,7 @@
 ;; same template.
 (define_mode_iterator INT [(DI "TARGET_ZARCH") SI HI QI])
 (define_mode_iterator INTALL [TI DI SI HI QI])
+(define_mode_iterator DINT [(TI "TARGET_ZARCH") DI SI HI QI])
 
 ;; This iterator allows some 'ashift' and 'lshiftrt' pattern to be defined from
 ;; the same template.
@@ -487,6 +487,9 @@
 ;; and "cds" in DImode.
 (define_mode_attr tg [(TI "g") (DI "")])
 
+;; In TDI templates, a string like "c<d>sg".
+(define_mode_attr td [(TI "d") (DI "")])
+
 ;; In GPR templates, a string like "c<gf>dbr" will expand to "cgdbr" in DImode
 ;; and "cfdbr" in SImode.
 (define_mode_attr gf [(DI "g") (SI "f")])
@@ -8739,164 +8742,308 @@
 ;;
 
 ;
-; memory barrier pattern.
+; memory barrier patterns.
 ;
 
-(define_expand "memory_barrier"
-  [(set (match_dup 0)
-	(unspec:BLK [(match_dup 0)] UNSPEC_MB))]
+(define_expand "mem_signal_fence"
+  [(match_operand:SI 0 "const_int_operand")]		;; model
   ""
 {
-  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
-  MEM_VOLATILE_P (operands[0]) = 1;
+  /* The s390 memory model is strong enough not to require any
+     barrier in order to synchronize a thread with itself.  */
+  DONE;
+})
+
+(define_expand "mem_thread_fence"
+  [(match_operand:SI 0 "const_int_operand")]		;; model
+  ""
+{
+  /* Unless this is a SEQ_CST fence, the s390 memory model is strong
+     enough not to require barriers of any kind.  */
+  if (INTVAL (operands[0]) == MEMMODEL_SEQ_CST)
+    {
+      rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+      MEM_VOLATILE_P (mem) = 1;
+      emit_insn (gen_mem_thread_fence_1 (mem));
+    }
+  DONE;
 })
 
-(define_insn "*memory_barrier"
+; Although bcr is superscalar on Z10, this variant will never
+; become part of an execution group.
+(define_insn "mem_thread_fence_1"
   [(set (match_operand:BLK 0 "" "")
 	(unspec:BLK [(match_dup 0)] UNSPEC_MB))]
   ""
   "bcr\t15,0"
   [(set_attr "op_type" "RR")])
 
-; Although bcr is superscalar on Z10, this variant will never become part of
-; an execution group.
+;
+; atomic load/store operations
+;
+
+; Atomic loads need not examine the memory model at all.
+(define_expand "atomic_load<mode>"
+  [(match_operand:DINT 0 "register_operand")	;; output
+   (match_operand:DINT 1 "memory_operand")	;; memory
+   (match_operand:SI 2 "const_int_operand")]	;; model
+  ""
+{
+  if (<MODE>mode == TImode)
+    emit_insn (gen_atomic_loadti_1 (operands[0], operands[1]));
+  else if (<MODE>mode == DImode && !TARGET_ZARCH)
+    emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1]));
+  else
+    emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
+(define_insn "atomic_loaddi_1"
+  [(set (match_operand:DI 0 "register_operand" "=f,f")
+	(unspec:DI [(match_operand:DI 1 "memory_operand" "R,m")]
+		   UNSPEC_MOVA))]
+  "!TARGET_ZARCH"
+  "@
+   ld %0,%1
+   ldy %0,%1"
+  [(set_attr "op_type" "RX,RXY")
+   (set_attr "type" "floaddf")])
+
+(define_insn "atomic_loadti_1"
+  [(set (match_operand:TI 0 "register_operand" "=r")
+	(unspec:TI [(match_operand:TI 1 "memory_operand" "m")]
+		   UNSPEC_MOVA))]
+  "TARGET_ZARCH"
+  "lpq %0,%1"
+  [(set_attr "op_type" "RXY")])
+
+; Atomic stores must(?) enforce sequential consistency.
+(define_expand "atomic_store<mode>"
+  [(match_operand:DINT 0 "memory_operand")	;; memory
+   (match_operand:DINT 1 "register_operand")	;; input
+   (match_operand:SI 2 "const_int_operand")]	;; model
+  ""
+{
+  enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+
+  if (<MODE>mode == TImode)
+    emit_insn (gen_atomic_storeti_1 (operands[0], operands[1]));
+  else if (<MODE>mode == DImode && !TARGET_ZARCH)
+    emit_insn (gen_atomic_storedi_1 (operands[0], operands[1]));
+  else
+    emit_move_insn (operands[0], operands[1]);
+  if (model == MEMMODEL_SEQ_CST)
+    emit_insn (gen_mem_thread_fence (operands[2]));
+  DONE;
+})
+
+(define_insn "atomic_storedi_1"
+  [(set (match_operand:DI 0 "memory_operand" "=R,m")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "f,f")]
+		   UNSPEC_MOVA))]
+  "!TARGET_ZARCH"
+  "@
+   std %1,%0
+   stdy %1,%0"
+  [(set_attr "op_type" "RX,RXY")
+   (set_attr "type" "fstoredf")])
+
+(define_insn "atomic_storeti_1"
+  [(set (match_operand:TI 0 "memory_operand" "=m")
+	(unspec:TI [(match_operand:TI 1 "register_operand" "r")]
+		   UNSPEC_MOVA))]
+  "TARGET_ZARCH"
+  "stpq %1,%0"
+  [(set_attr "op_type" "RXY")])
 
 ;
 ; compare and swap patterns.
 ;
 
-(define_expand "sync_compare_and_swap<mode>"
-  [(parallel
-    [(set (match_operand:TDSI 0 "register_operand" "")
-	  (match_operand:TDSI 1 "memory_operand" ""))
-     (set (match_dup 1)
-	  (unspec_volatile:TDSI
-	    [(match_dup 1)
-	     (match_operand:TDSI 2 "register_operand" "")
-	     (match_operand:TDSI 3 "register_operand" "")]
-	    UNSPECV_CAS))
-     (set (reg:CCZ1 CC_REGNUM)
-	  (compare:CCZ1 (match_dup 1) (match_dup 2)))])]
-  "")
+(define_expand "atomic_compare_and_swap<mode>"
+  [(match_operand:SI 0 "register_operand")	;; bool success output
+   (match_operand:DINT 1 "register_operand")	;; oldval output
+   (match_operand:DINT 2 "s_operand")		;; memory
+   (match_operand:DINT 3 "register_operand")	;; expected input
+   (match_operand:DINT 4 "register_operand")	;; newval input
+   (match_operand:SI 5 "const_int_operand")	;; is_weak
+   (match_operand:SI 6 "const_int_operand")	;; success model
+   (match_operand:SI 7 "const_int_operand")]	;; failure model
+  ""
+{
+  if (<MODE>mode == QImode || <MODE>mode == HImode)
+    s390_expand_cs_hqi (<MODE>mode, operands[0], operands[1], operands[2],
+			operands[3], operands[4], INTVAL (operands[5]));
+  else
+    {
+      rtx cc, cmp;
 
-(define_expand "sync_compare_and_swap<mode>"
+      emit_insn (gen_atomic_compare_and_swap<mode>_internal
+	         (operands[1], operands[2], operands[3], operands[4]));
+
+      cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
+      cmp = gen_rtx_NE (SImode, cc, const0_rtx);
+      emit_insn (gen_cstorecc4 (operands[0], cmp, cc, const0_rtx));
+    }
+  DONE;
+})
+
+(define_expand "atomic_compare_and_swap<mode>_internal"
   [(parallel
-    [(set (match_operand:HQI 0 "register_operand" "")
-	  (match_operand:HQI 1 "memory_operand" ""))
-     (set (match_dup 1)
-	  (unspec_volatile:HQI
-	    [(match_dup 1)
-	     (match_operand:HQI 2 "general_operand" "")
-	     (match_operand:HQI 3 "general_operand" "")]
-	    UNSPECV_CAS))
-     (clobber (reg:CC CC_REGNUM))])]
-  ""
-  "s390_expand_cs_hqi (<MODE>mode, operands[0], operands[1],
-		       operands[2], operands[3]); DONE;")
+     [(set (match_operand:DGPR 0 "register_operand")
+	   (match_operand:DGPR 1 "s_operand"))
+      (set (match_dup 1)
+	   (unspec_volatile:DGPR
+	     [(match_dup 1)
+	      (match_operand:DGPR 2 "register_operand")
+	      (match_operand:DGPR 3 "register_operand")]
+	     UNSPECV_CAS))
+      (set (reg:CCZ1 CC_REGNUM)
+	   (compare:CCZ1 (match_dup 1) (match_dup 2)))])]
+  "")
 
-; cds, cdsg
-(define_insn "*sync_compare_and_swap<mode>"
-  [(set (match_operand:DW 0 "register_operand" "=r")
-	(match_operand:DW 1 "memory_operand" "+Q"))
+; cdsg, csg
+(define_insn "*atomic_compare_and_swap<mode>_1"
+  [(set (match_operand:TDI 0 "register_operand" "=r")
+	(match_operand:TDI 1 "s_operand" "+QS"))
    (set (match_dup 1)
-	(unspec_volatile:DW
+	(unspec_volatile:TDI
 	  [(match_dup 1)
-	   (match_operand:DW 2 "register_operand" "0")
-	   (match_operand:DW 3 "register_operand" "r")]
+	   (match_operand:TDI 2 "register_operand" "0")
+	   (match_operand:TDI 3 "register_operand" "r")]
 	  UNSPECV_CAS))
    (set (reg:CCZ1 CC_REGNUM)
 	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
-  ""
-  "cds<tg>\t%0,%3,%S1"
-  [(set_attr "op_type" "RS<TE>")
+  "TARGET_ZARCH"
+  "c<td>sg\t%0,%3,%S1"
+  [(set_attr "op_type" "RXY")
    (set_attr "type"   "sem")])
 
-; cs, csg
-(define_insn "*sync_compare_and_swap<mode>"
-  [(set (match_operand:GPR 0 "register_operand" "=r")
-	(match_operand:GPR 1 "memory_operand" "+Q"))
+; cds, cdsy
+(define_insn "*atomic_compare_and_swapdi_2"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(match_operand:DI 1 "s_operand" "+Q,S"))
    (set (match_dup 1)
-	(unspec_volatile:GPR
+	(unspec_volatile:DI
 	  [(match_dup 1)
-	   (match_operand:GPR 2 "register_operand" "0")
-	   (match_operand:GPR 3 "register_operand" "r")]
+	   (match_operand:DI 2 "register_operand" "0,0")
+	   (match_operand:DI 3 "register_operand" "r,r")]
+	  UNSPECV_CAS))
+   (set (reg:CCZ1 CC_REGNUM)
+	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
+  "!TARGET_ZARCH"
+  "@
+   cds\t%0,%3,%S1
+   cdsy\t%0,%3,%S1"
+  [(set_attr "op_type" "RX,RXY")
+   (set_attr "type" "sem")])
+
+; cs, csy
+(define_insn "*atomic_compare_and_swapsi_3"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(match_operand:SI 1 "s_operand" "+Q,S"))
+   (set (match_dup 1)
+	(unspec_volatile:SI
+	  [(match_dup 1)
+	   (match_operand:SI 2 "register_operand" "0,0")
+	   (match_operand:SI 3 "register_operand" "r,r")]
 	  UNSPECV_CAS))
    (set (reg:CCZ1 CC_REGNUM)
 	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
   ""
-  "cs<g>\t%0,%3,%S1"
-  [(set_attr "op_type" "RS<E>")
+  "@
+   cs\t%0,%3,%S1
+   csy\t%0,%3,%S1"
+  [(set_attr "op_type" "RX,RXY")
    (set_attr "type"   "sem")])
 
-
 ;
 ; Other atomic instruction patterns.
 ;
 
-(define_expand "sync_lock_test_and_set<mode>"
-  [(match_operand:HQI 0 "register_operand")
-   (match_operand:HQI 1 "memory_operand")
-   (match_operand:HQI 2 "general_operand")]
-  ""
-  "s390_expand_atomic (<MODE>mode, SET, operands[0], operands[1],
-		       operands[2], false); DONE;")
-
 ; z196 load and add, xor, or and and instructions
 
-; lan, lang, lao, laog, lax, laxg, laa, laag
-(define_insn "sync_<atomic><mode>"
-  [(parallel
-    [(set (match_operand:GPR 0 "memory_operand" "+QS")
-	  (unspec_volatile:GPR
-	   [(ATOMIC_Z196:GPR (match_dup 0)
-			     (match_operand:GPR 1 "general_operand" "d"))]
-	   UNSPECV_ATOMIC_OP))
-     (clobber (match_scratch:GPR 2 "=d"))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_expand "atomic_fetch_<atomic><mode>"
+  [(match_operand:GPR 0 "register_operand")		;; val out
+   (ATOMIC_Z196:GPR
+     (match_operand:GPR 1 "s_operand")			;; memory
+     (match_operand:GPR 2 "register_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   "TARGET_Z196"
-  "la<noxa><g>\t%2,%1,%0")
+{
+  emit_insn (gen_atomic_fetch_<atomic><mode>_iaf
+	     (operands[0], operands[1], operands[2]));
+  DONE;
+})
 
 ; lan, lang, lao, laog, lax, laxg, laa, laag
-(define_insn "sync_old_<atomic><mode>"
-  [(parallel
-    [(set (match_operand:GPR 0 "register_operand" "=d")
-	  (match_operand:GPR 1 "memory_operand"   "+QS"))
-     (set (match_dup 1)
-	  (unspec_volatile:GPR
-	   [(ATOMIC_Z196:GPR (match_dup 1)
-			     (match_operand:GPR 2 "general_operand" "d"))]
-	   UNSPECV_ATOMIC_OP))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_insn "atomic_fetch_<atomic><mode>_iaf"
+  [(set (match_operand:GPR 0 "register_operand" "=d")
+	(match_operand:GPR 1 "s_operand" "+S"))
+   (set (match_dup 1)
+	(unspec_volatile:GPR
+	 [(ATOMIC_Z196:GPR (match_dup 1)
+			   (match_operand:GPR 2 "general_operand" "d"))]
+	 UNSPECV_ATOMIC_OP))
+   (clobber (reg:CC CC_REGNUM))]
   "TARGET_Z196"
-  "la<noxa><g>\t%0,%2,%1")
+  "la<noxa><g>\t%0,%2,%1"
+  [(set_attr "op_type" "RXY")
+   (set_attr "type" "sem")])
 
+;; For SImode and larger, the optabs.c code will do just fine in
+;; expanding a compare-and-swap loop.  For QI/HImode, we can do
+;; better by expanding our own loop.
 
-(define_expand "sync_<atomic><mode>"
-  [(set (match_operand:HQI 0 "memory_operand")
-	(ATOMIC:HQI (match_dup 0)
-		    (match_operand:HQI 1 "general_operand")))]
+(define_expand "atomic_<atomic><mode>"
+  [(ATOMIC:HQI
+     (match_operand:HQI 0 "s_operand")			;; memory
+     (match_operand:HQI 1 "general_operand"))		;; val in
+   (match_operand:SI 2 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, NULL_RTX, operands[0],
-		       operands[1], false); DONE;")
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, NULL_RTX, operands[0],
+		       operands[1], false);
+  DONE;
+})
 
-(define_expand "sync_old_<atomic><mode>"
-  [(set (match_operand:HQI 0 "register_operand")
-	(match_operand:HQI 1 "memory_operand"))
-   (set (match_dup 1)
-	(ATOMIC:HQI (match_dup 1)
-		    (match_operand:HQI 2 "general_operand")))]
+(define_expand "atomic_fetch_<atomic><mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (ATOMIC:HQI
+     (match_operand:HQI 1 "s_operand")			;; memory
+     (match_operand:HQI 2 "general_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
-		       operands[2], false); DONE;")
-
-(define_expand "sync_new_<atomic><mode>"
-  [(set (match_operand:HQI 0 "register_operand")
-	(ATOMIC:HQI (match_operand:HQI 1 "memory_operand")
-		    (match_operand:HQI 2 "general_operand")))
-   (set (match_dup 1) (ATOMIC:HQI (match_dup 1) (match_dup 2)))]
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
+		      operands[2], false);
+  DONE;
+})
+
+(define_expand "atomic_<atomic>_fetch<mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (ATOMIC:HQI
+     (match_operand:HQI 1 "s_operand")			;; memory
+     (match_operand:HQI 2 "general_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
+  ""
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
+		      operands[2], true);
+  DONE;
+})
+
+(define_expand "atomic_exchange<mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (match_operand:HQI 1 "s_operand")			;; memory
+   (match_operand:HQI 2 "general_operand")		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
-		       operands[2], true); DONE;")
+{
+  s390_expand_atomic (<MODE>mode, SET, operands[0], operands[1],
+		      operands[2], false);
+  DONE;
+})
 
 ;;
 ;;- Miscellaneous instructions.
-- 
1.7.7.6


* Re: [CFT] s390: Convert from sync to atomic optabs
From: Ulrich Weigand @ 2012-07-30 14:19 UTC
  To: Richard Henderson; +Cc: Andreas.Krebbel, gcc-patches

Richard Henderson wrote:

> Tested only as far as cross-compile.  I had a browse through
> objdump of libatomic for a brief sanity check.
> 
> Can you please test on real hw and report back?

I'll run a test, but a couple of things I noticed:


>    /* Shift the values to the correct bit positions.  */
> -  if (!(ac.aligned && MEM_P (cmp)))
> -    cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
> -  if (!(ac.aligned && MEM_P (new_rtx)))
> -    new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);
> +  cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
> +  new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);

This seems to disable use of ICM / STCM to perform byte or
aligned halfword access.  Why is this necessary?  Those operations
are supposed to provide the required operand consistency ...

> +(define_insn "atomic_loaddi_1"
> +  [(set (match_operand:DI 0 "register_operand" "=f,f")
> +	(unspec:DI [(match_operand:DI 1 "memory_operand" "R,m")]
> +		   UNSPEC_MOVA))]
> +  "!TARGET_ZARCH"
> +  "@
> +   ld %0,%1
> +   ldy %0,%1"
> +  [(set_attr "op_type" "RX,RXY")
> +   (set_attr "type" "floaddf")])

This seems to force DImode accesses through floating-point
registers, which is quite inefficient.  Why not allow LM/STM?
Those are supposed to provide doubleword consistency if the
operand is sufficiently aligned ...


[ From the Principles of Operations, section Block-Concurrent
  References:

  The instructions LOAD MULTIPLE (LM), LOAD MULTIPLE
  DISJOINT, LOAD MULTIPLE HIGH, STORE
  MULTIPLE (STM), and STORE MULTIPLE HIGH,
  when the operand or operands start on a word
  boundary; the instructions LOAD MULTIPLE (LMG)
  and STORE MULTIPLE (STMG), when the operand
  starts on a doubleword boundary; and the instructions
  COMPARE LOGICAL (CLC), COMPARE LOGICAL
  CHARACTERS UNDER MASK, INSERT
  CHARACTERS UNDER MASK, LOAD CONTROL
  (LCTLG), STORE CHARACTERS UNDER MASK,
  and STORE CONTROL (STCTG) access their storage
  operands in a left-to-right direction, and all bytes
  accessed within each doubleword appear to be
  accessed concurrently as observed by other CPUs.  ]
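
As a concrete sketch of the case at hand (assuming the operand is
doubleword aligned, per the excerpt above): a relaxed 64-bit atomic
load on a 31-bit target, which the patch as posted routes through an
FPR via ld/ldy, but which LM could implement directly in a GPR pair.

  #include <stdint.h>

  uint64_t
  load64 (const uint64_t *p)
  {
    return __atomic_load_n (p, __ATOMIC_RELAXED);
  }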


Otherwise the patch looks good to me.

Thanks,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com


* Re: [CFT] s390: Convert from sync to atomic optabs
From: Richard Henderson @ 2012-07-30 15:12 UTC
  To: Ulrich Weigand; +Cc: Andreas.Krebbel, gcc-patches

On 2012-07-30 07:09, Ulrich Weigand wrote:
> Richard Henderson wrote:
> 
>> Tested only as far as cross-compile.  I had a browse through
>> objdump of libatomic for a brief sanity check.
>>
>> Can you please test on real hw and report back?
> 
> I'll run a test, but a couple of things I noticed:
> 
> 
>>    /* Shift the values to the correct bit positions.  */
>> -  if (!(ac.aligned && MEM_P (cmp)))
>> -    cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
>> -  if (!(ac.aligned && MEM_P (new_rtx)))
>> -    new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);
>> +  cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
>> +  new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);
> 
> This seems to disable use of ICM / STCM to perform byte or
> aligned halfword access.  Why is this necessary?  Those operations
> are supposed to provide the required operand consistency ...

Because MEM_P for cmp and new_rtx are always false.  The expander
always requests register_operand for those.  I suppose I could back
out merging those cases into the macro.

I presume a good test case to examine for ICM is with such an operand
coming from a global.  What about STCM?  I don't see the output from
sync_compare_and_swap ever being allowed in memory...

> This seems to force DImode accesses through floating-point
> registers, which is quite inefficient.  Why not allow LM/STM?
> Those are supposed to provide doubleword consistency if the
> operand is sufficiently aligned ...

... because I only looked at the definition of LM which itself
doesn't mention consistency, and the definition of LPQ which talks
about LM not being suitable for quadword consistency, and came to
the wrong conclusion.

So now, looking at movdi_31, I see two problems that prevent just
using a "normal" move for the atomic_load/store_di: the o/d and d/b
alternatives which are split.  Is there some specific goodness that
those alternatives provide that is not had by reloading into the
Q/S memory patterns?


r~


* Re: [CFT] s390: Convert from sync to atomic optabs
From: Ulrich Weigand @ 2012-07-30 15:51 UTC
  To: Richard Henderson; +Cc: Andreas.Krebbel, gcc-patches

Richard Henderson wrote:
> On 2012-07-30 07:09, Ulrich Weigand wrote:
> > This seems to disable use of ICM / STCM to perform byte or
> > aligned halfword access.  Why is this necessary?  Those operations
> > are supposed to provide the required operand consistency ...
> 
> Because MEM_P for cmp and new_rtx are always false.  The expander
> always requests register_operand for those.  I suppose I could back
> out merging those cases into the macro.

Right, that's one of the reasons why we had two separate macros
for sync_compare_and_swap ...

> I presume a good test case to examine for ICM is with such an operand
> coming from a global.  What about STCM?  I don't see the output from
> sync_compare_and_swap ever being allowed in memory...

Actually, it's only ICM that is of interest here; it should get used when
either the comparison value or the "new" value come from a memory location,
e.g. a global.  Sorry, I was confused about STCM ...
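
A hypothetical test of that shape, with the "new" value coming from a
global in memory, so that it could be merged into the CS loop word via
insert-under-mask rather than a separate load/shift/or sequence:

  unsigned short expected, replacement;

  int
  cas_short (unsigned short *p)
  {
    return __atomic_compare_exchange_n (p, &expected, replacement, 0,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  }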

> > This seems to force DImode accesses through floating-point
> > registers, which is quite inefficient.  Why not allow LM/STM?
> > Those are supposed to provide doubleword consistency if the
> > operand is sufficiently aligned ...
> 
> ... because I only looked at the definition of LM which itself
> doesn't mention consistency, and the definition of LPQ which talks
> about LM not being suitable for quadword consistency, and came to
> the wrong conclusion.
> 
> So now, looking at movdi_31, I see two problems that prevent just
> using a "normal" move for the atomic_load/store_di: the o/d and d/b
> alternatives which are split.  Is there some specific goodness that
> those alternatives provide that is not had by reloading into the
> Q/S memory patterns?

Well, they are there as splitters because reload assumes all moves
are handled somewhere, either by the mov pattern or else via a
secondary reload.  I've implemented all moves that *can* be
implemented without an extra register via splitters on the
mov pattern, and only those that absolute require the extra
register via secondary reload ...

Given that, it's probably best to use a separate instruction for
the DImode atomic moves after all, but allow GPRs using LM/STM.
(Only for Q/S constraint type addresses.  For those instructions,
we have to reload the address instead of performing two moves.)

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com


* Re: [CFT] s390: Convert from sync to atomic optabs
From: Richard Henderson @ 2012-07-30 18:53 UTC
  To: Ulrich Weigand; +Cc: Andreas.Krebbel, gcc-patches

On 07/30/2012 08:40 AM, Ulrich Weigand wrote:
>> > I presume a good test case to examine for ICM is with such an operand
>> > coming from a global.  What about STCM?  I don't see the output from
>> > sync_compare_and_swap ever being allowed in memory...
> Actually, it's only ICM that is of interest here; it should get used when
> either the comparison value or the "new" value come from a memory location,
> e.g. a global.  Sorry, I was confused about STCM ...
> 

Well... it turns out to be just about impossible to get this to trigger.

With optimization on, the middle-end decides to promote the parameters
to registers immediately, which means that we never see a MEM in the
expander.  With optimization off, we don't propagate enough alignment
info so we never see ac.aligned = true.

It does look like we could relax the MEM_P requirement for Z10, so that
we use the register-based insv (RISBG).  I'll give that a go...
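
Illustrative C for the register-based form (insn choice naturally
depends on -march and operand shapes): a bitfield insert between
registers, which risbg can do in one instruction on z10:

  unsigned int
  insert_bits (unsigned int dst, unsigned int src)
  {
    /* Insert the low 8 bits of src into bits 8..15 of dst.  */
    return (dst & ~0xff00u) | ((src & 0xffu) << 8);
  }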


r~


* [PATCH 0/2] Convert s390 to atomic optabs, v2
From: Richard Henderson @ 2012-07-30 22:33 UTC
  To: uweigand; +Cc: gcc-patches, rguenther

The atomic_load/storedi_1 patterns are fixed to use LM, STM.

I've had a go at generating better code in the HQImode CAS
loop for aligned memory, but I don't know that I'd call it
the most efficient thing ever.  Some of this is due to 
deficiencies in other parts of the compiler (including the
s390 backend):

  (1) MEM_ALIGN can't pass down full align+ofs data that we had
      during cfgexpand.  This means the opportunities for using
      the "aligned" path are less than they ought to be.

  (2) In get_pointer_alignment (used by get_builtin_sync_mem),
      we don't consider an ADDR_EXPR to return the full alignment
      that the type is due.  I'm sure this is to work around some
      other sort of usage via the <string.h> builtins, but it's
      less-than-handy in this case.

      I wonder if in get_builtin_sync_mem we ought to be using
      get_object_alignment (build_fold_indirect_ref (addr)) instead?

      Consider

	struct S { int x; unsigned short y; } g_s;
	unsigned short o, n;
	void good() {
	  __atomic_compare_exchange_n (&g_s.y, &o, n, 0, 0, 0);
	}
	void bad(struct S *p_s) {
	  __atomic_compare_exchange_n (&p_s->y, &o, n, 0, 0, 0);
	}

      where GOOD produces the aligned MEM that we need, and BAD doesn't.

  (3) Support for IC, and ICM via the insv pattern is lacking.
      I've added a tiny bit of support here, in the form of using
      the existing strict_low_part patterns, but most definitely we
      could do better.

  (4) The *sethighpartsi and *sethighpartdi_64 patterns ought to be
      more different.  As is, we can't insert into bits 48-56 of a
      DImode quantity, because we don't generate ICM for DImode,
      only ICMH.

  (5) Missing support for RISBGZ in the form of an extv/z expander.
      The existing *extv/z splitters probably ought to be conditionalized
      on !Z10.

  (6) The strict_low_part patterns should allow registers for at
      least Z10.  The SImode strict_low_part can use LR everywhere.

  (7) RISBGZ could be used for a 3-address constant lshrsi3 before
      srlk is available.

For the GOOD function above, and this patch set, for -O3 -march=z10:

        larl    %r3,s+4
        lhrl    %r0,o
        lhi     %r2,1
        l       %r1,0(%r3)
        nilh    %r1,0
.L2:
        lr      %r5,%r1
        larl    %r12,n
        lr      %r4,%r1
        risbg   %r4,%r0,32,47,16
        icm     %r5,3,0(%r12)
        cs      %r4,%r5,0(%r3)
        je      .L3
        lr      %r5,%r4
        nilh    %r5,0
        cr      %r5,%r1
        lr      %r1,%r5
        jne     .L2
        lhi     %r2,0
.L3:
        srl     %r4,16
        sthrl   %r4,o

Odd things:

   * O is forced into a register before reaching the expander, so we
     get the RISBG for that.  N is left in memory and so we commit
     to using ICM for that.  Further, because of how strict_low_part
     is implemented we're committed to leaving that in memory.

   * We don't optimize the loop and hoist the LARL of N outside the loop.

   * Given that we're having to zap the mask in %r1 for the second
     compare anyway, I wonder if RISBG is really beneficial over OR.
     Is RISBG (or ICM for that matter) any faster (or even smaller)?


r~


Richard Henderson (2):
  s390: Reorg s390_expand_insv
  s390: Convert from sync to atomic optabs

 gcc/config/s390/s390-protos.h |    3 +-
 gcc/config/s390/s390.c        |  270 ++++++++++++++++++----------
 gcc/config/s390/s390.md       |  401 +++++++++++++++++++++++++++++------------
 3 files changed, 465 insertions(+), 209 deletions(-)

-- 
1.7.7.6


* [PATCH 1/2] s390: Reorg s390_expand_insv
From: Richard Henderson @ 2012-07-30 22:33 UTC
  To: uweigand; +Cc: gcc-patches

Try RISBG last, after other mechanisms have failed; don't require
operands in registers for it but force them there instead.  Try a
limited form of ICM.
---
 gcc/config/s390/s390.c |  129 ++++++++++++++++++++++++++++++-----------------
 1 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index f72f49f..240fb7e 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -4538,48 +4538,70 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
 {
   int bitsize = INTVAL (op1);
   int bitpos = INTVAL (op2);
+  enum machine_mode mode = GET_MODE (dest);
+  enum machine_mode smode = smallest_mode_for_size (bitsize, MODE_INT);
+  rtx op, clobber;
 
-  /* On z10 we can use the risbg instruction to implement insv.  */
-  if (TARGET_Z10
-      && ((GET_MODE (dest) == DImode && GET_MODE (src) == DImode)
-	  || (GET_MODE (dest) == SImode && GET_MODE (src) == SImode)))
+  /* Generate INSERT IMMEDIATE (IILL et al).  */
+  /* (set (ze (reg)) (const_int)).  */
+  if (TARGET_ZARCH
+      && register_operand (dest, word_mode)
+      && (bitpos % 16) == 0
+      && (bitsize % 16) == 0
+      && const_int_operand (src, VOIDmode))
     {
-      rtx op;
-      rtx clobber;
+      HOST_WIDE_INT val = INTVAL (src);
+      int regpos = bitpos + bitsize;
 
-      op = gen_rtx_SET (GET_MODE(src),
-			gen_rtx_ZERO_EXTRACT (GET_MODE (dest), dest, op1, op2),
-			src);
-      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM));
-      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clobber)));
+      while (regpos > bitpos)
+	{
+	  enum machine_mode putmode;
+	  int putsize;
+
+	  if (TARGET_EXTIMM && (regpos % 32 == 0) && (regpos >= bitpos + 32))
+	    putmode = SImode;
+	  else
+	    putmode = HImode;
 
+	  putsize = GET_MODE_BITSIZE (putmode);
+	  regpos -= putsize;
+	  emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest,
+						GEN_INT (putsize),
+						GEN_INT (regpos)),
+			  gen_int_mode (val, putmode));
+	  val >>= putsize;
+	}
+      gcc_assert (regpos == bitpos);
       return true;
     }
 
-  /* We need byte alignment.  */
-  if (bitsize % BITS_PER_UNIT)
-    return false;
-
+  /* Generate STORE CHARACTERS UNDER MASK (STCM et al).  */
   if (bitpos == 0
-      && memory_operand (dest, VOIDmode)
+      && (bitsize % BITS_PER_UNIT) == 0
+      && MEM_P (dest)
       && (register_operand (src, word_mode)
 	  || const_int_operand (src, VOIDmode)))
     {
       /* Emit standard pattern if possible.  */
-      enum machine_mode mode = smallest_mode_for_size (bitsize, MODE_INT);
-      if (GET_MODE_BITSIZE (mode) == bitsize)
-	emit_move_insn (adjust_address (dest, mode, 0), gen_lowpart (mode, src));
+      if (GET_MODE_BITSIZE (smode) == bitsize)
+	{
+	  emit_move_insn (adjust_address (dest, smode, 0),
+			  gen_lowpart (smode, src));
+	  return true;
+	}
 
       /* (set (ze (mem)) (const_int)).  */
       else if (const_int_operand (src, VOIDmode))
 	{
 	  int size = bitsize / BITS_PER_UNIT;
-	  rtx src_mem = adjust_address (force_const_mem (word_mode, src), BLKmode,
+	  rtx src_mem = adjust_address (force_const_mem (word_mode, src),
+					BLKmode,
 					GET_MODE_SIZE (word_mode) - size);
 
 	  dest = adjust_address (dest, BLKmode, 0);
 	  set_mem_size (dest, size);
 	  s390_expand_movmem (dest, src_mem, GEN_INT (size));
+	  return true;
 	}
 
       /* (set (ze (mem)) (reg)).  */
@@ -4602,42 +4624,55 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
 			      gen_rtx_LSHIFTRT (word_mode, src, GEN_INT
 						(GET_MODE_BITSIZE (SImode))));
 	    }
+	  return true;
 	}
-      else
-	return false;
+    }
 
-      return true;
+  /* Generate INSERT CHARACTERS UNDER MASK (IC, ICM et al).  */
+  if ((bitpos % BITS_PER_UNIT) == 0
+      && (bitsize % BITS_PER_UNIT) == 0
+      && (bitpos & 32) == ((bitpos + bitsize - 1) & 32)
+      && MEM_P (src)
+      && (mode == DImode || mode == SImode)
+      && register_operand (dest, mode))
+    {
+      /* Emit a strict_low_part pattern if possible.  */
+      if (bitpos == 0 && GET_MODE_BITSIZE (smode) == bitsize)
+	{
+	  op = gen_rtx_STRICT_LOW_PART (VOIDmode, gen_lowpart (smode, dest));
+	  op = gen_rtx_SET (VOIDmode, op, gen_lowpart (smode, src));
+	  clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM));
+	  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clobber)));
+	  return true;
+	}
+
+      /* ??? There are more powerful versions of ICM that are not
+	 completely represented in the md file.  */
     }
 
-  /* (set (ze (reg)) (const_int)).  */
-  if (TARGET_ZARCH
-      && register_operand (dest, word_mode)
-      && (bitpos % 16) == 0
-      && (bitsize % 16) == 0
-      && const_int_operand (src, VOIDmode))
+  /* For z10, generate ROTATE THEN INSERT SELECTED BITS (RISBG et al).  */
+  if (TARGET_Z10 && (mode == DImode || mode == SImode))
     {
-      HOST_WIDE_INT val = INTVAL (src);
-      int regpos = bitpos + bitsize;
+      enum machine_mode mode_s = GET_MODE (src);
 
-      while (regpos > bitpos)
+      if (mode_s == VOIDmode)
 	{
-	  enum machine_mode putmode;
-	  int putsize;
+	  /* Assume const_int etc already in the proper mode.  */
+	  src = force_reg (mode, src);
+	}
+      else if (mode_s != mode)
+	{
+	  gcc_assert (GET_MODE_BITSIZE (mode_s) >= bitsize);
+	  src = force_reg (mode_s, src);
+	  src = gen_lowpart (mode, src);
+	}
 
-	  if (TARGET_EXTIMM && (regpos % 32 == 0) && (regpos >= bitpos + 32))
-	    putmode = SImode;
-	  else
-	    putmode = HImode;
+      op = gen_rtx_SET (mode,
+			gen_rtx_ZERO_EXTRACT (mode, dest, op1, op2),
+			src);
+      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM));
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clobber)));
 
-	  putsize = GET_MODE_BITSIZE (putmode);
-	  regpos -= putsize;
-	  emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest,
-						GEN_INT (putsize),
-						GEN_INT (regpos)),
-			  gen_int_mode (val, putmode));
-	  val >>= putsize;
-	}
-      gcc_assert (regpos == bitpos);
       return true;
     }
 
-- 
1.7.7.6

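A sketch of the memory-source case the new IC/ICM path aims at
(illustrative only; the insn actually chosen depends on alignment and
target flags): merging a byte loaded from memory into the low bits of
a register, i.e. a strict_low_part QImode move:

  unsigned int
  merge_low_byte (unsigned int r, const unsigned char *p)
  {
    return (r & ~0xffu) | *p;
  }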

* [PATCH 2/2] s390: Convert from sync to atomic optabs
From: Richard Henderson @ 2012-07-30 22:36 UTC
  To: uweigand; +Cc: gcc-patches

Split out s390_two_part_insv from s390_expand_cs_hqi to try
harder to use bit insertion instructions in the CAS loop.
---
 gcc/config/s390/s390-protos.h |    3 +-
 gcc/config/s390/s390.c        |  141 ++++++++++-----
 gcc/config/s390/s390.md       |  401 +++++++++++++++++++++++++++++------------
 3 files changed, 383 insertions(+), 162 deletions(-)

diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index 4f1eb42..79673d6 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -85,7 +85,8 @@ extern void s390_expand_setmem (rtx, rtx, rtx);
 extern bool s390_expand_cmpmem (rtx, rtx, rtx, rtx);
 extern bool s390_expand_addcc (enum rtx_code, rtx, rtx, rtx, rtx, rtx);
 extern bool s390_expand_insv (rtx, rtx, rtx, rtx);
-extern void s390_expand_cs_hqi (enum machine_mode, rtx, rtx, rtx, rtx);
+extern void s390_expand_cs_hqi (enum machine_mode, rtx, rtx, rtx,
+				rtx, rtx, bool);
 extern void s390_expand_atomic (enum machine_mode, enum rtx_code,
 				rtx, rtx, rtx, bool);
 extern rtx s390_return_addr_rtx (int, rtx);
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 240fb7e..1006281 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -896,10 +896,12 @@ s390_emit_compare (enum rtx_code code, rtx op0, rtx op1)
    conditional branch testing the result.  */
 
 static rtx
-s390_emit_compare_and_swap (enum rtx_code code, rtx old, rtx mem, rtx cmp, rtx new_rtx)
+s390_emit_compare_and_swap (enum rtx_code code, rtx old, rtx mem,
+			    rtx cmp, rtx new_rtx)
 {
-  emit_insn (gen_sync_compare_and_swapsi (old, mem, cmp, new_rtx));
-  return s390_emit_compare (code, gen_rtx_REG (CCZ1mode, CC_REGNUM), const0_rtx);
+  emit_insn (gen_atomic_compare_and_swapsi_internal (old, mem, cmp, new_rtx));
+  return s390_emit_compare (code, gen_rtx_REG (CCZ1mode, CC_REGNUM),
+			    const0_rtx);
 }
 
 /* Emit a jump instruction to TARGET.  If COND is NULL_RTX, emit an
@@ -4754,80 +4756,123 @@ init_alignment_context (struct alignment_context *ac, rtx mem,
   ac->modemaski = expand_simple_unop (SImode, NOT, ac->modemask, NULL_RTX, 1);
 }
 
+/* A subroutine of s390_expand_cs_hqi.  Insert INS into VAL.  If possible,
+   use a single insv insn into SEQ2.  Otherwise, put prep insns in SEQ1 and
+   perform the merge in SEQ2.  */
+
+static rtx
+s390_two_part_insv (struct alignment_context *ac, rtx *seq1, rtx *seq2,
+		    enum machine_mode mode, rtx val, rtx ins)
+{
+  rtx tmp;
+
+  if (ac->aligned)
+    {
+      start_sequence ();
+      tmp = copy_to_mode_reg (SImode, val);
+      if (s390_expand_insv (tmp, GEN_INT (GET_MODE_BITSIZE (mode)),
+			    const0_rtx, ins))
+	{
+	  *seq1 = NULL;
+	  *seq2 = get_insns ();
+	  end_sequence ();
+	  return tmp;
+	}
+      end_sequence ();
+    }
+
+  /* Failed to use insv.  Generate a two part shift and mask.  */
+  start_sequence ();
+  tmp = s390_expand_mask_and_shift (ins, mode, ac->shift);
+  *seq1 = get_insns ();
+  end_sequence ();
+
+  start_sequence ();
+  tmp = expand_simple_binop (SImode, IOR, tmp, val, NULL_RTX, 1, OPTAB_DIRECT);
+  *seq2 = get_insns ();
+  end_sequence ();
+
+  return tmp;
+}
+
 /* Expand an atomic compare and swap operation for HImode and QImode.  MEM is
-   the memory location, CMP the old value to compare MEM with and NEW_RTX the value
-   to set if CMP == MEM.
-   CMP is never in memory for compare_and_swap_cc because
-   expand_bool_compare_and_swap puts it into a register for later compare.  */
+   the memory location, CMP the old value to compare MEM with and NEW_RTX the
+   value to set if CMP == MEM.  */
 
 void
-s390_expand_cs_hqi (enum machine_mode mode, rtx target, rtx mem, rtx cmp, rtx new_rtx)
+s390_expand_cs_hqi (enum machine_mode mode, rtx btarget, rtx vtarget, rtx mem,
+		    rtx cmp, rtx new_rtx, bool is_weak)
 {
   struct alignment_context ac;
-  rtx cmpv, newv, val, resv, cc;
+  rtx cmpv, newv, val, resv, cc, seq0, seq1, seq2, seq3;
   rtx res = gen_reg_rtx (SImode);
-  rtx csloop = gen_label_rtx ();
-  rtx csend = gen_label_rtx ();
+  rtx csloop = NULL, csend = NULL;
 
-  gcc_assert (register_operand (target, VOIDmode));
+  gcc_assert (register_operand (vtarget, VOIDmode));
   gcc_assert (MEM_P (mem));
 
   init_alignment_context (&ac, mem, mode);
 
-  /* Shift the values to the correct bit positions.  */
-  if (!(ac.aligned && MEM_P (cmp)))
-    cmp = s390_expand_mask_and_shift (cmp, mode, ac.shift);
-  if (!(ac.aligned && MEM_P (new_rtx)))
-    new_rtx = s390_expand_mask_and_shift (new_rtx, mode, ac.shift);
-
   /* Load full word.  Subsequent loads are performed by CS.  */
   val = expand_simple_binop (SImode, AND, ac.memsi, ac.modemaski,
 			     NULL_RTX, 1, OPTAB_DIRECT);
 
+  /* Prepare insertions of cmp and new_rtx into the loaded value.  When
+     possible, we try to use insv to make this happen efficiently.  If
+     that fails we'll generate code both inside and outside the loop.  */
+  cmpv = s390_two_part_insv (&ac, &seq0, &seq2, mode, val, cmp);
+  newv = s390_two_part_insv (&ac, &seq1, &seq3, mode, val, new_rtx);
+
+  if (seq0)
+    emit_insn (seq0);
+  if (seq1)
+    emit_insn (seq1);
+
   /* Start CS loop.  */
-  emit_label (csloop);
+  if (!is_weak)
+    {
+      /* Begin assuming success.  */
+      emit_move_insn (btarget, const1_rtx);
+
+      csloop = gen_label_rtx ();
+      csend = gen_label_rtx ();
+      emit_label (csloop);
+    }
+
   /* val = "<mem>00..0<mem>"
    * cmp = "00..0<cmp>00..0"
    * new = "00..0<new>00..0"
    */
 
-  /* Patch cmp and new with val at correct position.  */
-  if (ac.aligned && MEM_P (cmp))
+  emit_insn (seq2);
+  emit_insn (seq3);
+
+  if (is_weak)
     {
-      cmpv = force_reg (SImode, val);
-      store_bit_field (cmpv, GET_MODE_BITSIZE (mode), 0,
-		       0, 0, SImode, cmp);
+      cc = s390_emit_compare_and_swap (NE, res, ac.memsi, cmpv, newv);
+      emit_insn (gen_cstorecc4 (btarget, cc, XEXP (cc, 0), XEXP (cc, 1)));
     }
   else
-    cmpv = force_reg (SImode, expand_simple_binop (SImode, IOR, cmp, val,
-						   NULL_RTX, 1, OPTAB_DIRECT));
-  if (ac.aligned && MEM_P (new_rtx))
     {
-      newv = force_reg (SImode, val);
-      store_bit_field (newv, GET_MODE_BITSIZE (mode), 0,
-		       0, 0, SImode, new_rtx);
-    }
-  else
-    newv = force_reg (SImode, expand_simple_binop (SImode, IOR, new_rtx, val,
-						   NULL_RTX, 1, OPTAB_DIRECT));
-
-  /* Jump to end if we're done (likely?).  */
-  s390_emit_jump (csend, s390_emit_compare_and_swap (EQ, res, ac.memsi,
-						     cmpv, newv));
+      /* Jump to end if we're done (likely?).  */
+      cc = s390_emit_compare_and_swap (EQ, res, ac.memsi, cmpv, newv);
+      s390_emit_jump (csend, cc);
 
-  /* Check for changes outside mode.  */
-  resv = expand_simple_binop (SImode, AND, res, ac.modemaski,
-			      NULL_RTX, 1, OPTAB_DIRECT);
-  cc = s390_emit_compare (NE, resv, val);
-  emit_move_insn (val, resv);
-  /* Loop internal if so.  */
-  s390_emit_jump (csloop, cc);
+      /* Check for changes outside mode, and loop internally if so.  */
+      resv = expand_simple_binop (SImode, AND, res, ac.modemaski,
+			          NULL_RTX, 1, OPTAB_DIRECT);
+      cc = s390_emit_compare (NE, resv, val);
+      emit_move_insn (val, resv);
+      s390_emit_jump (csloop, cc);
 
-  emit_label (csend);
+      /* Failed.  */
+      emit_move_insn (btarget, const0_rtx);
+      emit_label (csend);
+    }
 
   /* Return the correct part of the bitfield.  */
-  convert_move (target, expand_simple_binop (SImode, LSHIFTRT, res, ac.shift,
-					     NULL_RTX, 1, OPTAB_DIRECT), 1);
+  convert_move (vtarget, expand_simple_binop (SImode, LSHIFTRT, res, ac.shift,
+					      NULL_RTX, 1, OPTAB_DIRECT), 1);
 }
 
 /* Expand an atomic operation CODE of mode MODE.  MEM is the memory location
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 096f266..3314006 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -84,6 +84,7 @@
 
    ; Atomic Support
    UNSPEC_MB
+   UNSPEC_MOVA
 
    ; TLS relocation specifiers
    UNSPEC_TLSGD
@@ -349,21 +350,19 @@
 (define_mode_iterator DD_DF [DF DD])
 (define_mode_iterator TD_TF [TF TD])
 
-;; This mode iterator allows 31-bit and 64-bit TDSI patterns to be generated
-;; from the same template.
-(define_mode_iterator TDSI [(TI "TARGET_64BIT") DI SI])
-
 ;; These mode iterators allow 31-bit and 64-bit GPR patterns to be generated
 ;; from the same template.
 (define_mode_iterator GPR [(DI "TARGET_ZARCH") SI])
+(define_mode_iterator DGPR [(TI "TARGET_ZARCH") DI SI])
 (define_mode_iterator DSI [DI SI])
+(define_mode_iterator TDI [TI DI])
 
 ;; These mode iterators allow :P to be used for patterns that operate on
 ;; pointer-sized quantities.  Exactly one of the two alternatives will match.
 (define_mode_iterator P [(DI "TARGET_64BIT") (SI "!TARGET_64BIT")])
 
-;; These macros refer to the actual word_mode of the configuration. This is equal
-;; to Pmode except on 31-bit machines in zarch mode.
+;; These macros refer to the actual word_mode of the configuration.
+;; This is equal to Pmode except on 31-bit machines in zarch mode.
 (define_mode_iterator DW [(TI "TARGET_ZARCH") (DI "!TARGET_ZARCH")])
 (define_mode_iterator W  [(DI "TARGET_ZARCH") (SI "!TARGET_ZARCH")])
 
@@ -379,6 +378,7 @@
 ;; same template.
 (define_mode_iterator INT [(DI "TARGET_ZARCH") SI HI QI])
 (define_mode_iterator INTALL [TI DI SI HI QI])
+(define_mode_iterator DINT [(TI "TARGET_ZARCH") DI SI HI QI])
 
 ;; This iterator allows some 'ashift' and 'lshiftrt' pattern to be defined from
 ;; the same template.
@@ -487,6 +487,9 @@
 ;; and "cds" in DImode.
 (define_mode_attr tg [(TI "g") (DI "")])
 
+;; In TDI templates, a string like "c<d>sg".
+(define_mode_attr td [(TI "d") (DI "")])
+
 ;; In GPR templates, a string like "c<gf>dbr" will expand to "cgdbr" in DImode
 ;; and "cfdbr" in SImode.
 (define_mode_attr gf [(DI "g") (SI "f")])
@@ -8739,164 +8742,336 @@
 ;;
 
 ;
-; memory barrier pattern.
+; memory barrier patterns.
 ;
 
-(define_expand "memory_barrier"
-  [(set (match_dup 0)
-	(unspec:BLK [(match_dup 0)] UNSPEC_MB))]
+(define_expand "mem_signal_fence"
+  [(match_operand:SI 0 "const_int_operand")]		;; model
+  ""
+{
+  /* The s390 memory model is strong enough not to require any
+     barrier in order to synchronize a thread with itself.  */
+  DONE;
+})
+
+(define_expand "mem_thread_fence"
+  [(match_operand:SI 0 "const_int_operand")]		;; model
   ""
 {
-  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
-  MEM_VOLATILE_P (operands[0]) = 1;
+  /* Unless this is a SEQ_CST fence, the s390 memory model is strong
+     enough not to require barriers of any kind.  */
+  if (INTVAL (operands[0]) == MEMMODEL_SEQ_CST)
+    {
+      rtx mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+      MEM_VOLATILE_P (mem) = 1;
+      emit_insn (gen_mem_thread_fence_1 (mem));
+    }
+  DONE;
 })
 
-(define_insn "*memory_barrier"
+; Although bcr is superscalar on Z10, this variant will never
+; become part of an execution group.
+(define_insn "mem_thread_fence_1"
   [(set (match_operand:BLK 0 "" "")
 	(unspec:BLK [(match_dup 0)] UNSPEC_MB))]
   ""
   "bcr\t15,0"
   [(set_attr "op_type" "RR")])
 
-; Although bcr is superscalar on Z10, this variant will never become part of
-; an execution group.
+;
+; atomic load/store operations
+;
+
+; Atomic loads need not examine the memory model at all.
+(define_expand "atomic_load<mode>"
+  [(match_operand:DINT 0 "register_operand")	;; output
+   (match_operand:DINT 1 "memory_operand")	;; memory
+   (match_operand:SI 2 "const_int_operand")]	;; model
+  ""
+{
+  if (<MODE>mode == TImode)
+    emit_insn (gen_atomic_loadti_1 (operands[0], operands[1]));
+  else if (<MODE>mode == DImode && !TARGET_ZARCH)
+    {
+      if (!s_operand (operands[1], VOIDmode))
+	{
+	  rtx a = copy_to_reg (XEXP (operands[1], 0));
+	  operands[1] = replace_equiv_address (operands[1], a);
+        }
+      emit_insn (gen_atomic_loaddi_1 (operands[0], operands[1]));
+    }
+  else
+    emit_move_insn (operands[0], operands[1]);
+  DONE;
+})
+
+; Different from movdi_31 in that we have no splitters.
+(define_insn "atomic_loaddi_1"
+  [(set (match_operand:DI 0 "register_operand" "=d,d,!*f,!*f")
+	(unspec:DI [(match_operand:DI 1 "s_operand" "Q,S,Q,m")]
+		   UNSPEC_MOVA))]
+  "!TARGET_ZARCH"
+  "@
+   lm\t%0,%M0,%S1
+   lmy\t%0,%M0,%S1
+   ld\t%0,%1
+   ldy\t%0,%1"
+  [(set_attr "op_type" "RS,RSY,RS,RSY")
+   (set_attr "type" "lm,lm,floaddf,floaddf")])
+
+(define_insn "atomic_loadti_1"
+  [(set (match_operand:TI 0 "register_operand" "=r")
+	(unspec:TI [(match_operand:TI 1 "memory_operand" "m")]
+		   UNSPEC_MOVA))]
+  "TARGET_ZARCH"
+  "lpq\t%0,%1"
+  [(set_attr "op_type" "RXY")])
+
+; Atomic stores must(?) enforce sequential consistency.
+(define_expand "atomic_store<mode>"
+  [(match_operand:DINT 0 "memory_operand")	;; memory
+   (match_operand:DINT 1 "register_operand")	;; input
+   (match_operand:SI 2 "const_int_operand")]	;; model
+  ""
+{
+  enum memmodel model = (enum memmodel) INTVAL (operands[2]);
+
+  if (<MODE>mode == TImode)
+    emit_insn (gen_atomic_storeti_1 (operands[0], operands[1]));
+  else if (<MODE>mode == DImode && !TARGET_ZARCH)
+    {
+      if (!s_operand (operands[0], VOIDmode))
+	{
+	  rtx a = copy_to_reg (XEXP (operands[0], 0));
+	  operands[0] = replace_equiv_address (operands[0], a);
+        }
+      emit_insn (gen_atomic_storedi_1 (operands[0], operands[1]));
+    }
+  else
+    emit_move_insn (operands[0], operands[1]);
+  if (model == MEMMODEL_SEQ_CST)
+    emit_insn (gen_mem_thread_fence (operands[2]));
+  DONE;
+})
+
+; Different from movdi_31 in that we have no splitters.
+(define_insn "atomic_storedi_1"
+  [(set (match_operand:DI 0 "s_operand" "=Q,S,Q,m")
+	(unspec:DI [(match_operand:DI 1 "register_operand" "d,d,!*f,!*f")]
+		   UNSPEC_MOVA))]
+  "!TARGET_ZARCH"
+  "@
+   stm\t%1,%N1,%S0
+   stmy\t%1,%N1,%S0
+   std\t%1,%0
+   stdy\t%1,%0
+  [(set_attr "op_type" "RS,RSY,RS,RSY")
+   (set_attr "type" "stm,stm,fstoredf,fstoredf")])
+
+(define_insn "atomic_storeti_1"
+  [(set (match_operand:TI 0 "memory_operand" "=m")
+	(unspec:TI [(match_operand:TI 1 "register_operand" "r")]
+		   UNSPEC_MOVA))]
+  "TARGET_ZARCH"
+  "stpq\t%1,%0"
+  [(set_attr "op_type" "RXY")])
 
 ;
 ; compare and swap patterns.
 ;
 
-(define_expand "sync_compare_and_swap<mode>"
-  [(parallel
-    [(set (match_operand:TDSI 0 "register_operand" "")
-	  (match_operand:TDSI 1 "memory_operand" ""))
-     (set (match_dup 1)
-	  (unspec_volatile:TDSI
-	    [(match_dup 1)
-	     (match_operand:TDSI 2 "register_operand" "")
-	     (match_operand:TDSI 3 "register_operand" "")]
-	    UNSPECV_CAS))
-     (set (reg:CCZ1 CC_REGNUM)
-	  (compare:CCZ1 (match_dup 1) (match_dup 2)))])]
-  "")
+(define_expand "atomic_compare_and_swap<mode>"
+  [(match_operand:SI 0 "register_operand")	;; bool success output
+   (match_operand:DGPR 1 "register_operand")	;; oldval output
+   (match_operand:DGPR 2 "s_operand")		;; memory
+   (match_operand:DGPR 3 "register_operand")	;; expected input
+   (match_operand:DGPR 4 "register_operand")	;; newval input
+   (match_operand:SI 5 "const_int_operand")	;; is_weak
+   (match_operand:SI 6 "const_int_operand")	;; success model
+   (match_operand:SI 7 "const_int_operand")]	;; failure model
+  ""
+{
+  rtx cc, cmp;
+  emit_insn (gen_atomic_compare_and_swap<mode>_internal
+	     (operands[1], operands[2], operands[3], operands[4]));
+  cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
+  cmp = gen_rtx_NE (SImode, cc, const0_rtx);
+  emit_insn (gen_cstorecc4 (operands[0], cmp, cc, const0_rtx));
+  DONE;
+})
 
-(define_expand "sync_compare_and_swap<mode>"
-  [(parallel
-    [(set (match_operand:HQI 0 "register_operand" "")
-	  (match_operand:HQI 1 "memory_operand" ""))
-     (set (match_dup 1)
-	  (unspec_volatile:HQI
-	    [(match_dup 1)
-	     (match_operand:HQI 2 "general_operand" "")
-	     (match_operand:HQI 3 "general_operand" "")]
-	    UNSPECV_CAS))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_expand "atomic_compare_and_swap<mode>"
+  [(match_operand:SI 0 "register_operand")	;; bool success output
+   (match_operand:HQI 1 "register_operand")	;; oldval output
+   (match_operand:HQI 2 "s_operand")		;; memory
+   (match_operand:HQI 3 "general_operand")	;; expected input
+   (match_operand:HQI 4 "general_operand")	;; newval input
+   (match_operand:SI 5 "const_int_operand")	;; is_weak
+   (match_operand:SI 6 "const_int_operand")	;; success model
+   (match_operand:SI 7 "const_int_operand")]	;; failure model
   ""
-  "s390_expand_cs_hqi (<MODE>mode, operands[0], operands[1],
-		       operands[2], operands[3]); DONE;")
+{
+  s390_expand_cs_hqi (<MODE>mode, operands[0], operands[1], operands[2],
+		      operands[3], operands[4], INTVAL (operands[5]));
+  DONE;
+})
 
-; cds, cdsg
-(define_insn "*sync_compare_and_swap<mode>"
-  [(set (match_operand:DW 0 "register_operand" "=r")
-	(match_operand:DW 1 "memory_operand" "+Q"))
+(define_expand "atomic_compare_and_swap<mode>_internal"
+  [(parallel
+     [(set (match_operand:DGPR 0 "register_operand")
+	   (match_operand:DGPR 1 "s_operand"))
+      (set (match_dup 1)
+	   (unspec_volatile:DGPR
+	     [(match_dup 1)
+	      (match_operand:DGPR 2 "register_operand")
+	      (match_operand:DGPR 3 "register_operand")]
+	     UNSPECV_CAS))
+      (set (reg:CCZ1 CC_REGNUM)
+	   (compare:CCZ1 (match_dup 1) (match_dup 2)))])]
+  "")
+
+; cdsg, csg
+(define_insn "*atomic_compare_and_swap<mode>_1"
+  [(set (match_operand:TDI 0 "register_operand" "=r")
+	(match_operand:TDI 1 "s_operand" "+QS"))
    (set (match_dup 1)
-	(unspec_volatile:DW
+	(unspec_volatile:TDI
 	  [(match_dup 1)
-	   (match_operand:DW 2 "register_operand" "0")
-	   (match_operand:DW 3 "register_operand" "r")]
+	   (match_operand:TDI 2 "register_operand" "0")
+	   (match_operand:TDI 3 "register_operand" "r")]
 	  UNSPECV_CAS))
    (set (reg:CCZ1 CC_REGNUM)
 	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
-  ""
-  "cds<tg>\t%0,%3,%S1"
-  [(set_attr "op_type" "RS<TE>")
+  "TARGET_ZARCH"
+  "c<td>sg\t%0,%3,%S1"
+  [(set_attr "op_type" "RXY")
    (set_attr "type"   "sem")])
 
-; cs, csg
-(define_insn "*sync_compare_and_swap<mode>"
-  [(set (match_operand:GPR 0 "register_operand" "=r")
-	(match_operand:GPR 1 "memory_operand" "+Q"))
+; cds, cdsy
+(define_insn "*atomic_compare_and_swapdi_2"
+  [(set (match_operand:DI 0 "register_operand" "=r,r")
+	(match_operand:DI 1 "s_operand" "+Q,S"))
    (set (match_dup 1)
-	(unspec_volatile:GPR
+	(unspec_volatile:DI
 	  [(match_dup 1)
-	   (match_operand:GPR 2 "register_operand" "0")
-	   (match_operand:GPR 3 "register_operand" "r")]
+	   (match_operand:DI 2 "register_operand" "0,0")
+	   (match_operand:DI 3 "register_operand" "r,r")]
+	  UNSPECV_CAS))
+   (set (reg:CCZ1 CC_REGNUM)
+	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
+  "!TARGET_ZARCH"
+  "@
+   cds\t%0,%3,%S1
+   cdsy\t%0,%3,%S1"
+  [(set_attr "op_type" "RX,RXY")
+   (set_attr "type" "sem")])
+
+; cs, csy
+(define_insn "*atomic_compare_and_swapsi_3"
+  [(set (match_operand:SI 0 "register_operand" "=r,r")
+	(match_operand:SI 1 "s_operand" "+Q,S"))
+   (set (match_dup 1)
+	(unspec_volatile:SI
+	  [(match_dup 1)
+	   (match_operand:SI 2 "register_operand" "0,0")
+	   (match_operand:SI 3 "register_operand" "r,r")]
 	  UNSPECV_CAS))
    (set (reg:CCZ1 CC_REGNUM)
 	(compare:CCZ1 (match_dup 1) (match_dup 2)))]
   ""
-  "cs<g>\t%0,%3,%S1"
-  [(set_attr "op_type" "RS<E>")
+  "@
+   cs\t%0,%3,%S1
+   csy\t%0,%3,%S1"
+  [(set_attr "op_type" "RX,RXY")
    (set_attr "type"   "sem")])
 
-
 ;
 ; Other atomic instruction patterns.
 ;
 
-(define_expand "sync_lock_test_and_set<mode>"
-  [(match_operand:HQI 0 "register_operand")
-   (match_operand:HQI 1 "memory_operand")
-   (match_operand:HQI 2 "general_operand")]
-  ""
-  "s390_expand_atomic (<MODE>mode, SET, operands[0], operands[1],
-		       operands[2], false); DONE;")
-
 ; z196 load and add, xor, or and and instructions
 
-; lan, lang, lao, laog, lax, laxg, laa, laag
-(define_insn "sync_<atomic><mode>"
-  [(parallel
-    [(set (match_operand:GPR 0 "memory_operand" "+QS")
-	  (unspec_volatile:GPR
-	   [(ATOMIC_Z196:GPR (match_dup 0)
-			     (match_operand:GPR 1 "general_operand" "d"))]
-	   UNSPECV_ATOMIC_OP))
-     (clobber (match_scratch:GPR 2 "=d"))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_expand "atomic_fetch_<atomic><mode>"
+  [(match_operand:GPR 0 "register_operand")		;; val out
+   (ATOMIC_Z196:GPR
+     (match_operand:GPR 1 "s_operand")			;; memory
+     (match_operand:GPR 2 "register_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   "TARGET_Z196"
-  "la<noxa><g>\t%2,%1,%0")
+{
+  emit_insn (gen_atomic_fetch_<atomic><mode>_iaf
+	     (operands[0], operands[1], operands[2]));
+  DONE;
+})
 
 ; lan, lang, lao, laog, lax, laxg, laa, laag
-(define_insn "sync_old_<atomic><mode>"
-  [(parallel
-    [(set (match_operand:GPR 0 "register_operand" "=d")
-	  (match_operand:GPR 1 "memory_operand"   "+QS"))
-     (set (match_dup 1)
-	  (unspec_volatile:GPR
-	   [(ATOMIC_Z196:GPR (match_dup 1)
-			     (match_operand:GPR 2 "general_operand" "d"))]
-	   UNSPECV_ATOMIC_OP))
-     (clobber (reg:CC CC_REGNUM))])]
+(define_insn "atomic_fetch_<atomic><mode>_iaf"
+  [(set (match_operand:GPR 0 "register_operand" "=d")
+	(match_operand:GPR 1 "s_operand" "+S"))
+   (set (match_dup 1)
+	(unspec_volatile:GPR
+	 [(ATOMIC_Z196:GPR (match_dup 1)
+			   (match_operand:GPR 2 "general_operand" "d"))]
+	 UNSPECV_ATOMIC_OP))
+   (clobber (reg:CC CC_REGNUM))]
   "TARGET_Z196"
-  "la<noxa><g>\t%0,%2,%1")
+  "la<noxa><g>\t%0,%2,%1"
+  [(set_attr "op_type" "RXY")
+   (set_attr "type" "sem")])
 
+;; For SImode and larger, the optabs.c code will do just fine in
+;; expanding a compare-and-swap loop.  For QI/HImode, we can do
+;; better by expanding our own loop.
 
-(define_expand "sync_<atomic><mode>"
-  [(set (match_operand:HQI 0 "memory_operand")
-	(ATOMIC:HQI (match_dup 0)
-		    (match_operand:HQI 1 "general_operand")))]
+(define_expand "atomic_<atomic><mode>"
+  [(ATOMIC:HQI
+     (match_operand:HQI 0 "s_operand")			;; memory
+     (match_operand:HQI 1 "general_operand"))		;; val in
+   (match_operand:SI 2 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, NULL_RTX, operands[0],
-		       operands[1], false); DONE;")
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, NULL_RTX, operands[0],
+		       operands[1], false);
+  DONE;
+})
 
-(define_expand "sync_old_<atomic><mode>"
-  [(set (match_operand:HQI 0 "register_operand")
-	(match_operand:HQI 1 "memory_operand"))
-   (set (match_dup 1)
-	(ATOMIC:HQI (match_dup 1)
-		    (match_operand:HQI 2 "general_operand")))]
+(define_expand "atomic_fetch_<atomic><mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (ATOMIC:HQI
+     (match_operand:HQI 1 "s_operand")			;; memory
+     (match_operand:HQI 2 "general_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
-		       operands[2], false); DONE;")
-
-(define_expand "sync_new_<atomic><mode>"
-  [(set (match_operand:HQI 0 "register_operand")
-	(ATOMIC:HQI (match_operand:HQI 1 "memory_operand")
-		    (match_operand:HQI 2 "general_operand")))
-   (set (match_dup 1) (ATOMIC:HQI (match_dup 1) (match_dup 2)))]
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
+		      operands[2], false);
+  DONE;
+})
+
+(define_expand "atomic_<atomic>_fetch<mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (ATOMIC:HQI
+     (match_operand:HQI 1 "s_operand")			;; memory
+     (match_operand:HQI 2 "general_operand"))		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
   ""
-  "s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
-		       operands[2], true); DONE;")
+{
+  s390_expand_atomic (<MODE>mode, <CODE>, operands[0], operands[1],
+		      operands[2], true);
+  DONE;
+})
+
+(define_expand "atomic_exchange<mode>"
+  [(match_operand:HQI 0 "register_operand")		;; val out
+   (match_operand:HQI 1 "s_operand")			;; memory
+   (match_operand:HQI 2 "general_operand")		;; val in
+   (match_operand:SI 3 "const_int_operand")]		;; model
+  ""
+{
+  s390_expand_atomic (<MODE>mode, SET, operands[0], operands[1],
+		      operands[2], false);
+  DONE;
+})
 
 ;;
 ;;- Miscellaneous instructions.
-- 
1.7.7.6


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-07-30 22:33         ` [PATCH 0/2] Convert s390 to atomic optabs, v2 Richard Henderson
  2012-07-30 22:33           ` [PATCH 1/2] s390: Reorg s390_expand_insv Richard Henderson
  2012-07-30 22:36           ` [PATCH 2/2] s390: Convert from sync to atomic optabs Richard Henderson
@ 2012-07-31  9:11           ` Richard Guenther
  2012-07-31 15:27             ` Andrew MacLeod
  2012-07-31 16:07             ` Richard Henderson
  2012-07-31 18:36           ` Ulrich Weigand
  3 siblings, 2 replies; 29+ messages in thread
From: Richard Guenther @ 2012-07-31  9:11 UTC (permalink / raw)
  To: Richard Henderson; +Cc: uweigand, gcc-patches

On Mon, 30 Jul 2012, Richard Henderson wrote:

> The atomic_load/storedi_1 patterns are fixed to use LM, STM.
> 
> I've had a go at generating better code in the HQImode CAS
> loop for aligned memory, but I don't know that I'd call it
> the most efficient thing ever.  Some of this is due to 
> deficiencies in other parts of the compiler (including the
> s390 backend):
> 
>   (1) MEM_ALIGN can't pass down full align+ofs data that we had
>       during cfgexpand.  This means the opportunities for using
>       the "aligned" path are less than they ought to be.
> 
>   (2) In get_pointer_alignment (used by get_builtin_sync_mem),
>       we don't consider an ADDR_EXPR to return the full alignment
>       that the type is due.  I'm sure this is to work around some
>       other sort of usage via the <string.h> builtins, but it's
>       less-than-handy in this case.
> 
>       I wonder if in get_builtin_sync_mem we ought to be using
>       get_object_alignment (build_fold_indirect_ref (addr)) instead?
> 
>       Consider
> 
> 	struct S { int x; unsigned short y; } g_s;
> 	unsigned short o, n;
> 	void good() {
> 	  __builtin_compare_exchange (&g_s.y, &o, n, 0, 0, 0);
> 	}
> 	void bad(S *p_s) {
> 	  __builtin_compare_exchange (&p_s->y, &o, n, 0, 0, 0);
> 	}
> 
>       where GOOD produces the aligned MEM that we need, and BAD doesn't.

You cannot generally use get_object_alignment here.  Once we have
an address in the middle-end we treat it as generic pointer, happily
not caring about the actual pointer types.  But it seems we do

  /* The alignment needs to be at least according to that of the mode.  */
  set_mem_align (mem, MAX (GET_MODE_ALIGNMENT (mode),
                           get_pointer_alignment (loc)));

anyway?  What do we expect __builtin_compare_exchange to do for
unaligned inputs?  Like

typedef int uint __attribute__((aligned((8))));
unsigned short o, n;
void very_bad (uint *p) {
  __builtin_compare_exchange (p, &o, n, 0, 0, 0);
}

?  Doesn't the above set a wrong alignment?

Back to using get_object_alignment - you cannot blindly use
build_fold_indirect_ref at least, you could use get_object_alignment
on the operand of an ADDR_EXPR address but I am sure we can construct
a testcase where that would give a wrong answer, too (maybe not
easily without violating the C standards rule that pointers have
to be aligned according to their type ... but nobody in practice
follows this and the middle-end does not require this either).
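
For instance something along these lines (a sketch, not an actual
testcase):

struct S { int x; unsigned short y; };
char buf[16];
unsigned short o, n;

void ugly (void)
{
  /* Misaligned by construction - invalid C per the standard, but
     the middle-end has to cope with it anyway.  */
  struct S *s = (struct S *) (buf + 1);
  __builtin_compare_exchange (&s->y, &o, n, 0, 0, 0);
}

Here looking at the operand of the ADDR_EXPR would claim at least the
2-byte alignment of y's type, which is simply wrong for this object.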

Thus, the bad news is that it's hard for the middle-end to
recover alignment of a memory access that is represented as
a builtin function call that takes addresses as parameters
(which also makes them address-taken and thus possibly aliased).
Didn't Andrew have some patches to introduce a GIMPLE_ATOMIC
eventually side-stepping this issue (maybe that used addresses, too)?

Richard.

>   (3) Support for IC, and ICM via the insv pattern is lacking.
>       I've added a tiny bit of support here, in the form of using
>       the existing strict_low_part patterns, but most definitely we
>       could do better.
> 
>   (4) The *sethighpartsi and *sethighpartdi_64 patterns ought to be
>       more different.  As is, we can't insert into bits 48-56 of a
>       DImode quantity, because we don't generate ICM for DImode,
>       only ICMH.
> 
>   (5) Missing support for RISBGZ in the form of an extv/z expander.
>       The existing *extv/z splitters probably ought to be conditionalized
>       on !Z10.
> 
>   (6) The strict_low_part patterns should allow registers for at
>       least Z10.  The SImode strict_low_part can use LR everywhere.
> 
>   (7) RISBGZ could be used for a 3-address constant lshrsi3 before
>       srlk is available.
> 
> For the GOOD function above, and this patch set, for -O3 -march=z10:
> 
>         larl    %r3,s+4
>         lhrl    %r0,o
>         lhi     %r2,1
>         l       %r1,0(%r3)
>         nilh    %r1,0
> .L2:
>         lr      %r5,%r1
>         larl    %r12,n
>         lr      %r4,%r1
>         risbg   %r4,%r0,32,47,16
>         icm     %r5,3,0(%r12)
>         cs      %r4,%r5,0(%r3)
>         je      .L3
>         lr      %r5,%r4
>         nilh    %r5,0
>         cr      %r5,%r1
>         lr      %r1,%r5
>         jne     .L2
>         lhi     %r2,0
> .L3:
>         srl     %r4,16
>         sthrl   %r4,o
> 
> Odd things:
> 
>    * O is forced into a register before reaching the expander, so we
>      get the RISBG for that.  N is left in a memory and so we commit
>      to using ICM for that.  Further, because of how strict_low_part
>      is implemented we're committed to leaving that in memory.
> 
>    * We don't optimize the loop and hoist the LARL of N outside the loop.
> 
>    * Given that we're having to zap the mask in %r1 for the second
>      compare anyway, I wonder if RISBG is really beneficial over OR.
>      Is RISBG (or ICM for that matter) any faster (or even smaller)?
> 
> 
> r~
> 
> 
> Richard Henderson (2):
>   s390: Reorg s390_expand_insv
>   s390: Convert from sync to atomic optabs
> 
>  gcc/config/s390/s390-protos.h |    3 +-
>  gcc/config/s390/s390.c        |  270 ++++++++++++++++++----------
>  gcc/config/s390/s390.md       |  401 +++++++++++++++++++++++++++++------------
>  3 files changed, 465 insertions(+), 209 deletions(-)
> 
> 

-- 
Richard Guenther <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-07-31  9:11           ` [PATCH 0/2] Convert s390 to atomic optabs, v2 Richard Guenther
@ 2012-07-31 15:27             ` Andrew MacLeod
  2012-07-31 16:07             ` Richard Henderson
  1 sibling, 0 replies; 29+ messages in thread
From: Andrew MacLeod @ 2012-07-31 15:27 UTC (permalink / raw)
  To: Richard Guenther; +Cc: Richard Henderson, uweigand, gcc-patches

On 07/31/2012 05:09 AM, Richard Guenther wrote:
> On
> Thus, the bad news is that it's hard for the middle-end to
> recover alignment of a memory access that is represented as
> a builtin function call that takes addresses as parameters
> (which also makes them address-taken and thus possibly aliased).
> Didn't Andrew have some patches to introduce a GIMPLE_ATOMIC
> eventually side-stepping this issue (maybe that used addresses, too)?
>
yes, but I'm not sure I'm going to be able to get gimple atomic in for
4.8...  I'm trying to make sure the C11 stuff gets in, then it's back to
gimple atomic...  you never know, though...  I'm just concerned about
introducing it that late in the cycle...  I'll reconsider the plan in
the next week or two...  maybe I can get gimple atomic in first, then
the C11 stuff, and put on a mega-push in August...


Andrew


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-07-31  9:11           ` [PATCH 0/2] Convert s390 to atomic optabs, v2 Richard Guenther
  2012-07-31 15:27             ` Andrew MacLeod
@ 2012-07-31 16:07             ` Richard Henderson
  2012-08-01  8:41               ` Richard Guenther
  1 sibling, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2012-07-31 16:07 UTC (permalink / raw)
  To: Richard Guenther; +Cc: uweigand, gcc-patches

On 2012-07-31 02:09, Richard Guenther wrote:
> What do we expect __builtin_compare_exchange to do for
> unaligned inputs?

At the moment we expect it to SIGBUS, as a rule.

We'd *like* to defer to the library routine for unaligned,
but we don't do that yet.

Too bad about not being able to query addresses/ssa_names
for their alignment; I thought we could do that already.


r~


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-07-30 22:33         ` [PATCH 0/2] Convert s390 to atomic optabs, v2 Richard Henderson
                             ` (2 preceding siblings ...)
  2012-07-31  9:11           ` [PATCH 0/2] Convert s390 to atomic optabs, v2 Richard Guenther
@ 2012-07-31 18:36           ` Ulrich Weigand
  2012-07-31 19:54             ` Richard Henderson
  2012-08-01 23:23             ` Richard Henderson
  3 siblings, 2 replies; 29+ messages in thread
From: Ulrich Weigand @ 2012-07-31 18:36 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches, rguenther

Richard Henderson wrote:

> I've had a go at generating better code in the HQImode CAS
> loop for aligned memory, but I don't know that I'd call it
> the most efficient thing ever.

Thanks for having a look at this!

>   (3) Support for IC, and ICM via the insv pattern is lacking.
>       I've added a tiny bit of support here, in the form of using
>       the existing strict_low_part patterns, but most definitely we
>       could do better.

This doesn't look correct:
+      /* Emit a strict_low_part pattern if possible.  */
+      if (bitpos == 0 && GET_MODE_BITSIZE (smode) == bitsize)

With bitpos == 0 we need to insert into the *high* part, not
the low part on a big-endian platform.  This probably causes
this incorrect code below:
         icm     %r5,3,0(%r12)
We'd need icm mask 12, not 3, to load into the two upper bytes.

[ This is also probably causing the testing failures I'm seeing
with the patch as-is.  I haven't looked into them in detail yet.  ]

>   (4) The *sethighpartsi and *sethighpartdi_64 patterns ought to be
>       more different.  As is, we can't insert into bits 48-56 of a
>       DImode quantity, because we don't generate ICM for DImode,
>       only ICMH.
> 
>   (5) Missing support for RISBGZ in the form of an extv/z expander.
>       The existing *extv/z splitters probably ought to be conditionalized
>       on !Z10.
> 
>   (6) The strict_low_part patterns should allow registers for at
>       least Z10.  The SImode strict_low_part can use LR everywhere.
> 
>   (7) RISBGZ could be used for a 3-address constant lshrsi3 before
>       srlk is available.

Good points, agreed with all of that.  None of that ought to be
a prerequisite for the atomic patch, of course ...

>    * Given that we're having to zap the mask in %r1 for the second
>      compare anyway, I wonder if RISBG is really beneficial over OR.
>      Is RISBG (or ICM for that matter) any faster (or even smaller)?

Just a plain OR is preferable to a RISBG.  I guess the point of the
RISBG is that you can avoid the extra shift ...  Now, if that shift
can be moved ahead of the loop, that may not be all that big of a
win.  On the other hand, these loops hopefully don't loop very often
if we don't have a lot of contention ...
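
In other words (hand-written sketch, not compiler output):

        sll     %r0,16              # hoistable ahead of the loop
        ...
        or      %r4,%r0             # in the loop

versus a single

        risbg   %r4,%r0,32,47,16    # shift + insert, in the loop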

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-07-31 18:36           ` Ulrich Weigand
@ 2012-07-31 19:54             ` Richard Henderson
  2012-08-01 23:23             ` Richard Henderson
  1 sibling, 0 replies; 29+ messages in thread
From: Richard Henderson @ 2012-07-31 19:54 UTC (permalink / raw)
  To: Ulrich Weigand; +Cc: gcc-patches, rguenther

On 2012-07-31 11:17, Ulrich Weigand wrote:
> This doesn't look correct:
> +      /* Emit a strict_low_part pattern if possible.  */
> +      if (bitpos == 0 && GET_MODE_BITSIZE (smode) == bitsize)
> 
> With bitpos == 0 we need to insert into the *high* part, not
> the low part on a big-endian platform.  This probably causes
> this incorrect code below:
>          icm     %r5,3,0(%r12)

Oops, yes.  I forget about bits-big-endian.

> Just a plain OR is preferable to a RISBG.  I guess the point of the
> RISBG is that you can avoid the extra shift ...  Now, if that shift
> can be moved ahead of the loop, that may not be all that big of a
> win.  On the other hand, these loops hopefully don't loop very often
> if we don't have a lot of contention ...

Indeed.  So it's mostly about minimizing size.

I guess with RISBG available we can always implement the insert with an input in memory in two insns -- full addressing mode on the load plus the risbg to shift and insert.

If we use ICM, we might get away with 1 insn, but might need a second to reload the address into an s_operand.
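
Concretely (a sketch; registers and bit positions arbitrary):

        l       %r5,0(%r1,%r2)            # full addressing mode
        risbg   %r4,%r5,32,47,16          # shift and insert

versus

        icm     %r5,3,0(%r2)              # one insn, base+displacement only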


r~


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-07-31 16:07             ` Richard Henderson
@ 2012-08-01  8:41               ` Richard Guenther
  2012-08-01 15:59                 ` Richard Henderson
  0 siblings, 1 reply; 29+ messages in thread
From: Richard Guenther @ 2012-08-01  8:41 UTC (permalink / raw)
  To: Richard Henderson; +Cc: uweigand, gcc-patches

On Tue, 31 Jul 2012, Richard Henderson wrote:

> On 2012-07-31 02:09, Richard Guenther wrote:
> > What do we expect __builtin_compare_exchange to do for
> > unaligned inputs?
> 
> At the moment we expect it to SIGBUS, as a rule.
> 
> We'd *like* to defer to the library routine for unaligned,
> but we don't do that yet.

I see.  So your issue is that you don't get the knowledge
that the address is even more aligned than required by the
builtin.

> Too bad about not being able to query addresses/ssa_names
> for their alignment; I thought we could do that already.

We can - it's just that we cannot rely on type information for addresses,
mainly because people write non-conforming C code all the time
(and thus we settled on their side for middle-end semantics).
On x86_64 it's common to do

int foo (int *p)
{
   if ((uintptr_t)p & 3)
     return 0;
   return *p;
}

and if we'd use type information for 'p' then we'd optimize away
the alignment test ...

So we only use type information when seeing an actual memory
reference where we make sure to keep alignment info correct
(which we don't bother to do for addresses).

Richard.


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-08-01  8:41               ` Richard Guenther
@ 2012-08-01 15:59                 ` Richard Henderson
  2012-08-01 17:14                   ` Richard Guenther
  0 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2012-08-01 15:59 UTC (permalink / raw)
  To: Richard Guenther; +Cc: uweigand, gcc-patches

On 08/01/2012 01:40 AM, Richard Guenther wrote:
> I see.  So your issue is that you don't get the knowledge
> that the address is even more aligned than required by the
> builtin.

Yes.  Very helpful for quite a few targets that only have word-sized atomic operations, and we emulate char/short via bit-fiddling.  That's where MEM_ALIGN as an align+ofs pair would come in doubly helpful...
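
The bit-fiddling in question looks roughly like so -- a C sketch of
the word-based emulation, little-endian and with the memory-order
handling omitted, not the actual optabs code:

#include <stdint.h>

static int
cas16_via_word (uint16_t *p, uint16_t *expected, uint16_t desired)
{
  /* Assumes P is 2-aligned, so it cannot straddle the word.  */
  uint32_t *wp = (uint32_t *) ((uintptr_t) p & ~(uintptr_t) 3);
  unsigned int shift = ((uintptr_t) p & 3) * 8;
  uint32_t mask = (uint32_t) 0xffff << shift;
  uint32_t old = *wp, prev, new_word;

  for (;;)
    {
      if ((uint16_t) ((old & mask) >> shift) != *expected)
        {
          *expected = (old & mask) >> shift;
          return 0;
        }
      new_word = (old & ~mask) | ((uint32_t) desired << shift);
      prev = __sync_val_compare_and_swap (wp, old, new_word);
      if (prev == old)
        return 1;
      old = prev;   /* only the other bytes changed; retry */
    }
}

With a full align+ofs pair the shift and mask become compile-time
constants and the containing word is known to be aligned, which is
exactly the payoff.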

> So we only use type information when seeing an actual memory
> reference where we make sure to keep alignment info correct
> (which we don't bother to do for addresses).

How hard would it be to include (some) builtins in "actual memory reference"?  Since it seems likely at this point that gimple_atomic will make it in for 4.8?


r~


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-08-01 15:59                 ` Richard Henderson
@ 2012-08-01 17:14                   ` Richard Guenther
  2012-08-01 19:42                     ` Richard Henderson
  0 siblings, 1 reply; 29+ messages in thread
From: Richard Guenther @ 2012-08-01 17:14 UTC (permalink / raw)
  To: Richard Henderson; +Cc: uweigand, gcc-patches

On Wed, 1 Aug 2012, Richard Henderson wrote:

> On 08/01/2012 01:40 AM, Richard Guenther wrote:
> > I see.  So your issue is that you don't get the knowledge
> > that the address is even more aligned than required by the
> > builtin.
> 
> Yes.  Very helpful for quite a few targets that only have word-sized atomic operations, and we emulate char/short via bit-fiddling.  That's where MEM_ALIGN as an align+ofs pair would come in doubly helpful...
> 
> > So we only use type information when seeing an actual memory
> > reference where we make sure to keep alignment info correct
> > (which we don't bother to do for addresses).
> 
> How hard would it be to include (some) builtins in "actual memory reference"?  Since it seems likely at this point that gimple_atomic will make it in for 4.8?

Actually it would not help you at all.  As far as I understand,
the testcase is equivalent from an alignment perspective to

 struct S { int x; unsigned short y; } g_s;
 void bad (S *p_s)
 {
   short *p = (short *)&p_s->y;
   *(short *)p = 0;
 }

so the builtin is a memory access to a short.  We cannot derive
any alignment for p_s from this alone unless we change the way
the middle-end constrains pointer type usage (which in turn
means that pointer conversions cannot be dropped on the floor
like we do now).

If you said

  p_s->y = 0;

then we can exploit the fact that you dereference p_s and derive
bigger alignment.  But I don't see how we can massage the
builtin to preserve such form.  Well, put in a memory reference
in the argument, __builtin_compare_exchange (p_s->y, ...), but
that falls foul of GIMPLE requirements to use a temporary for
register type function arguments, which we may be able to
overcome with some special flags.
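
That is, the gimplified form today is necessarily (a sketch of the
dumps):

  _1 = &p_s->y;
  __builtin_compare_exchange (_1, &o, n, 0, 0, 0);

so the dereference structure that carries the alignment is already
gone by the time we look at the call.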

Richard.


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-08-01 17:14                   ` Richard Guenther
@ 2012-08-01 19:42                     ` Richard Henderson
  0 siblings, 0 replies; 29+ messages in thread
From: Richard Henderson @ 2012-08-01 19:42 UTC (permalink / raw)
  To: Richard Guenther; +Cc: uweigand, gcc-patches

On 2012-08-01 10:14, Richard Guenther wrote:
> If you said
> 
>   p_s->y = 0;
> 
> then we can exploit the fact that you dereference p_s and derive
> bigger alignment.  But I don't see how we can massage the
> builtin to preserve such form.  Well, put in a memory reference
> in the argument, __builtin_compare_exchange (p_s->y, ...), but
> that fails foul of GIMPLE requirements to use a temporary for
> register type function arguments, which we may be able to
> overcome with some special flags.

Ah, right.  Well, we'll just have to leave it for gimple_atomic then.


r~


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-07-31 18:36           ` Ulrich Weigand
  2012-07-31 19:54             ` Richard Henderson
@ 2012-08-01 23:23             ` Richard Henderson
  2012-08-03 12:20               ` Ulrich Weigand
  2012-08-06 16:44               ` Ulrich Weigand
  1 sibling, 2 replies; 29+ messages in thread
From: Richard Henderson @ 2012-08-01 23:23 UTC (permalink / raw)
  To: Ulrich Weigand; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 119 bytes --]

Please try this as a followup to the previous two patches.
That should clean up the mistake with the insv change.


r~

[-- Attachment #2: commit-6b07a31 --]
[-- Type: text/plain, Size: 5377 bytes --]

commit 6b07a31943bcbca2a4f6fae707cf3d7ae283d4dc
Author: Richard Henderson <rth@redhat.com>
Date:   Wed Aug 1 16:10:37 2012 -0700

    fixup insv

diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 8259e2b..35c7fb5 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -4551,7 +4551,8 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
   int bitsize = INTVAL (op1);
   int bitpos = INTVAL (op2);
   enum machine_mode mode = GET_MODE (dest);
-  enum machine_mode smode = smallest_mode_for_size (bitsize, MODE_INT);
+  enum machine_mode smode;
+  int smode_bsize, mode_bsize;
   rtx op, clobber;
 
   /* Generate INSERT IMMEDIATE (IILL et al).  */
@@ -4587,6 +4588,10 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
       return true;
     }
 
+  smode = smallest_mode_for_size (bitsize, MODE_INT);
+  smode_bsize = GET_MODE_BITSIZE (smode);
+  mode_bsize = GET_MODE_BITSIZE (mode);
+
   /* Generate STORE CHARACTERS UNDER MASK (STCM et al).  */
   if (bitpos == 0
       && (bitsize % BITS_PER_UNIT) == 0
@@ -4595,7 +4600,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
 	  || const_int_operand (src, VOIDmode)))
     {
       /* Emit standard pattern if possible.  */
-      if (GET_MODE_BITSIZE (smode) == bitsize)
+      if (smode_bsize == bitsize)
 	{
 	  emit_move_insn (adjust_address (dest, smode, 0),
 			  gen_lowpart (smode, src));
@@ -4608,7 +4613,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
 	  int size = bitsize / BITS_PER_UNIT;
 	  rtx src_mem = adjust_address (force_const_mem (word_mode, src),
 					BLKmode,
-					GET_MODE_SIZE (word_mode) - size);
+					UNITS_PER_WORD - size);
 
 	  dest = adjust_address (dest, BLKmode, 0);
 	  set_mem_size (dest, size);
@@ -4619,22 +4624,22 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
       /* (set (ze (mem)) (reg)).  */
       else if (register_operand (src, word_mode))
 	{
-	  if (bitsize <= GET_MODE_BITSIZE (SImode))
+	  if (bitsize <= 32)
 	    emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest, op1,
 						  const0_rtx), src);
 	  else
 	    {
 	      /* Emit st,stcmh sequence.  */
-	      int stcmh_width = bitsize - GET_MODE_BITSIZE (SImode);
+	      int stcmh_width = bitsize - 32;
 	      int size = stcmh_width / BITS_PER_UNIT;
 
 	      emit_move_insn (adjust_address (dest, SImode, size),
 			      gen_lowpart (SImode, src));
 	      set_mem_size (dest, size);
-	      emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest, GEN_INT
-						    (stcmh_width), const0_rtx),
-			      gen_rtx_LSHIFTRT (word_mode, src, GEN_INT
-						(GET_MODE_BITSIZE (SImode))));
+	      emit_move_insn (gen_rtx_ZERO_EXTRACT (word_mode, dest,
+						    GEN_INT (stcmh_width),
+						    const0_rtx),
+			      gen_rtx_LSHIFTRT (word_mode, src, GEN_INT (32)));
 	    }
 	  return true;
 	}
@@ -4649,7 +4654,7 @@ s390_expand_insv (rtx dest, rtx op1, rtx op2, rtx src)
       && register_operand (dest, mode))
     {
       /* Emit a strict_low_part pattern if possible.  */
-      if (bitpos == 0 && GET_MODE_BITSIZE (smode) == bitsize)
+      if (smode_bsize == bitsize && bitpos == mode_bsize - smode_bsize)
 	{
 	  op = gen_rtx_STRICT_LOW_PART (VOIDmode, gen_lowpart (smode, dest));
 	  op = gen_rtx_SET (VOIDmode, op, gen_lowpart (smode, src));
@@ -4728,7 +4733,12 @@ init_alignment_context (struct alignment_context *ac, rtx mem,
   ac->aligned = (MEM_ALIGN (mem) >= GET_MODE_BITSIZE (SImode));
 
   if (ac->aligned)
-    ac->memsi = adjust_address (mem, SImode, 0); /* Memory is aligned.  */
+    {
+      ac->memsi = adjust_address (mem, SImode, 0); /* Memory is aligned.  */
+      ac->shift = const0_rtx;
+      ac->modemask = GEN_INT (GET_MODE_MASK (mode));
+      ac->modemaski = GEN_INT (~GET_MODE_MASK (mode));
+    }
   else
     {
       /* Alignment is unknown.  */
@@ -4755,15 +4765,17 @@ init_alignment_context (struct alignment_context *ac, rtx mem,
       ac->shift = expand_simple_binop (SImode, MINUS, ac->shift, byteoffset,
 				      NULL_RTX, 1, OPTAB_DIRECT);
 
+      /* Shift is the byte count, but we need the bitcount.  */
+      ac->shift = expand_simple_binop (SImode, ASHIFT, ac->shift, GEN_INT (3),
+				       NULL_RTX, 1, OPTAB_DIRECT);
+
+      /* Calculate masks.  */
+      ac->modemask = expand_simple_binop (SImode, ASHIFT,
+					  GEN_INT (GET_MODE_MASK (mode)),
+					  ac->shift, NULL_RTX, 1, OPTAB_DIRECT);
+      ac->modemaski = expand_simple_unop (SImode, NOT, ac->modemask,
+					  NULL_RTX, 1);
     }
-  /* Shift is the byte count, but we need the bitcount.  */
-  ac->shift = expand_simple_binop (SImode, MULT, ac->shift, GEN_INT (BITS_PER_UNIT),
-				  NULL_RTX, 1, OPTAB_DIRECT);
-  /* Calculate masks.  */
-  ac->modemask = expand_simple_binop (SImode, ASHIFT,
-				     GEN_INT (GET_MODE_MASK (mode)), ac->shift,
-				     NULL_RTX, 1, OPTAB_DIRECT);
-  ac->modemaski = expand_simple_unop (SImode, NOT, ac->modemask, NULL_RTX, 1);
 }
 
 /* A subroutine of s390_expand_cs_hqi.  Insert INS into VAL.  If possible,
@@ -4781,7 +4793,7 @@ s390_two_part_insv (struct alignment_context *ac, rtx *seq1, rtx *seq2,
       start_sequence ();
       tmp = copy_to_mode_reg (SImode, val);
       if (s390_expand_insv (tmp, GEN_INT (GET_MODE_BITSIZE (mode)),
-			    const0_rtx, ins))
+			    GEN_INT (32 - GET_MODE_BITSIZE (mode)), ins))
 	{
 	  *seq1 = NULL;
 	  *seq2 = get_insns ();


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-08-01 23:23             ` Richard Henderson
@ 2012-08-03 12:20               ` Ulrich Weigand
  2012-08-03 14:21                 ` Ulrich Weigand
  2012-08-06 16:44               ` Ulrich Weigand
  1 sibling, 1 reply; 29+ messages in thread
From: Ulrich Weigand @ 2012-08-03 12:20 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches

Richard Henderson wrote:

> Please try this as a followup to the previous two patches.
> That should clean up the mistake with the insv change.

Just a quick heads-up that something still must be broken;
I get extra test suite failures:

FAIL: gcc.dg/atomic-compare-exchange-1.c execution test
FAIL: gcc.dg/atomic-compare-exchange-2.c execution test
FAIL: gcc.dg/atomic-compare-exchange-3.c execution test
WARNING: program timed out.
FAIL: gcc.dg/atomic-op-3.c execution test
FAIL: gcc.dg/ia64-sync-2.c execution test
FAIL: gcc.dg/ia64-sync-3.c execution test
FAIL: gcc.dg/sync-3.c execution test
FAIL: gcc.dg/simulate-thread/atomic-other-int.c  -O0 -g  thread simulation test
FAIL: gcc.dg/simulate-thread/atomic-other-int.c  -O2 -g  thread simulation test
FAIL: gcc.dg/simulate-thread/atomic-other-int.c  -O3 -g  thread simulation test
FAIL: gcc.dg/simulate-thread/atomic-other-short.c  -O3 -g  thread simulation test
FAIL: libatomic.c/atomic-compare-exchange-3.c execution test
WARNING: program timed out.
FAIL: libatomic.c/atomic-op-3.c execution test
FAIL: libatomic.c/generic-2.c execution test

and just about all libgomp tests and many libjava tests seem
to hang and time out ...

I'll have a look what's going on here.

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-08-03 12:20               ` Ulrich Weigand
@ 2012-08-03 14:21                 ` Ulrich Weigand
  0 siblings, 0 replies; 29+ messages in thread
From: Ulrich Weigand @ 2012-08-03 14:21 UTC (permalink / raw)
  To: rth; +Cc: gcc-patches

I wrote:
> Just a quick heads-up that something still must be broken;
> I get extra test suite failures:
> 
> FAIL: gcc.dg/atomic-compare-exchange-1.c execution test
> FAIL: gcc.dg/atomic-compare-exchange-2.c execution test
> FAIL: gcc.dg/atomic-compare-exchange-3.c execution test
> WARNING: program timed out.
> FAIL: gcc.dg/atomic-op-3.c execution test
> FAIL: gcc.dg/ia64-sync-2.c execution test
> FAIL: gcc.dg/ia64-sync-3.c execution test
> FAIL: gcc.dg/sync-3.c execution test
> FAIL: gcc.dg/simulate-thread/atomic-other-int.c  -O0 -g  thread simulation test
> FAIL: gcc.dg/simulate-thread/atomic-other-int.c  -O2 -g  thread simulation test
> FAIL: gcc.dg/simulate-thread/atomic-other-int.c  -O3 -g  thread simulation test
> FAIL: gcc.dg/simulate-thread/atomic-other-short.c  -O3 -g  thread simulation test
> FAIL: libatomic.c/atomic-compare-exchange-3.c execution test
> WARNING: program timed out.
> FAIL: libatomic.c/atomic-op-3.c execution test
> FAIL: libatomic.c/generic-2.c execution test
> 
> and just about all libgomp tests and many libjava tests seem
> to hang and time out ...
> 
> I'll have a look what's going on here.

Richard Henderson wrote:
> +(define_expand "atomic_compare_and_swap<mode>"
> +  [(match_operand:SI 0 "register_operand")	;; bool success output
> +   (match_operand:DGPR 1 "register_operand")	;; oldval output
> +   (match_operand:DGPR 2 "s_operand")		;; memory
> +   (match_operand:DGPR 3 "register_operand")	;; expected input
> +   (match_operand:DGPR 4 "register_operand")	;; newval input
> +   (match_operand:SI 5 "const_int_operand")	;; is_weak
> +   (match_operand:SI 6 "const_int_operand")	;; success model
> +   (match_operand:SI 7 "const_int_operand")]	;; failure model
> +  ""
> +{
> +  rtx cc, cmp;
> +  emit_insn (gen_atomic_compare_and_swap<mode>_internal
> +	     (operands[1], operands[2], operands[3], operands[4]));
> +  cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
> +  cmp = gen_rtx_NE (SImode, cc, const0_rtx);
> +  emit_insn (gen_cstorecc4 (operands[0], cmp, cc, const0_rtx));

This needs to be an EQ instead of NE comparison here ...

> +  if (is_weak)
>      {
> -      cmpv = force_reg (SImode, val);
> -      store_bit_field (cmpv, GET_MODE_BITSIZE (mode), 0,
> -		       0, 0, SImode, cmp);
> +      cc = s390_emit_compare_and_swap (NE, res, ac.memsi, cmpv, newv);
> +      emit_insn (gen_cstorecc4 (btarget, cc, XEXP (cc, 0), XEXP (cc, 1)));
>      }

... and here.

This fixes the main atomic test failures I was seeing.  I've restarted
the full bootstrap / regression test now ...

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com


* Re: [PATCH 0/2] Convert s390 to atomic optabs, v2
  2012-08-01 23:23             ` Richard Henderson
  2012-08-03 12:20               ` Ulrich Weigand
@ 2012-08-06 16:44               ` Ulrich Weigand
  1 sibling, 0 replies; 29+ messages in thread
From: Ulrich Weigand @ 2012-08-06 16:44 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches

Richard Henderson wrote:


While the first set of changes looks good to me, I don't understand those:

> @@ -4728,7 +4733,12 @@ init_alignment_context (struct alignment_context *ac, rtx mem,
>    ac->aligned = (MEM_ALIGN (mem) >= GET_MODE_BITSIZE (SImode));
>  
>    if (ac->aligned)
> -    ac->memsi = adjust_address (mem, SImode, 0); /* Memory is aligned.  */
> +    {
> +      ac->memsi = adjust_address (mem, SImode, 0); /* Memory is aligned.  */
> +      ac->shift = const0_rtx;
> +      ac->modemask = GEN_INT (GET_MODE_MASK (mode));
> +      ac->modemaski = GEN_INT (~GET_MODE_MASK (mode));
> +    }
>    else
>      {
>        /* Alignment is unknown.  */
> @@ -4755,15 +4765,17 @@ init_alignment_context (struct alignment_context *ac, rtx mem,
>        ac->shift = expand_simple_binop (SImode, MINUS, ac->shift, byteoffset,
>  				      NULL_RTX, 1, OPTAB_DIRECT);
>  
> +      /* Shift is the byte count, but we need the bitcount.  */
> +      ac->shift = expand_simple_binop (SImode, ASHIFT, ac->shift, GEN_INT (3),
> +				       NULL_RTX, 1, OPTAB_DIRECT);
> +
> +      /* Calculate masks.  */
> +      ac->modemask = expand_simple_binop (SImode, ASHIFT,
> +					  GEN_INT (GET_MODE_MASK (mode)),
> +					  ac->shift, NULL_RTX, 1, OPTAB_DIRECT);
> +      ac->modemaski = expand_simple_unop (SImode, NOT, ac->modemask,
> +					  NULL_RTX, 1);
>      }
> -  /* Shift is the byte count, but we need the bitcount.  */
> -  ac->shift = expand_simple_binop (SImode, MULT, ac->shift, GEN_INT (BITS_PER_UNIT),
> -				  NULL_RTX, 1, OPTAB_DIRECT);
> -  /* Calculate masks.  */
> -  ac->modemask = expand_simple_binop (SImode, ASHIFT,
> -				     GEN_INT (GET_MODE_MASK (mode)), ac->shift,
> -				     NULL_RTX, 1, OPTAB_DIRECT);
> -  ac->modemaski = expand_simple_unop (SImode, NOT, ac->modemask, NULL_RTX, 1);
>  }
>  
>  /* A subroutine of s390_expand_cs_hqi.  Insert INS into VAL.  If possible,
> @@ -4781,7 +4793,7 @@ s390_two_part_insv (struct alignment_context *ac, rtx *seq1, rtx *seq2,
>        start_sequence ();
>        tmp = copy_to_mode_reg (SImode, val);
>        if (s390_expand_insv (tmp, GEN_INT (GET_MODE_BITSIZE (mode)),
> -			    const0_rtx, ins))
> +			    GEN_INT (32 - GET_MODE_BITSIZE (mode)), ins))
>  	{
>  	  *seq1 = NULL;
>  	  *seq2 = get_insns ();

"aligned" accesses do involve the *most significant* part of the word
(on a big-endian machine), which means ac->shift has to be set to
modesize (outer) - modesize (inner), and expand_insv needs to be
called with bitpos 0 (due to bits-big-endian).
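
(For HImode inside an SImode word that means ac->shift = 32 - 16 = 16,
while the insv targets bitpos 0, i.e. bits 0-15, the most significant
halfword.)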

When reverting this part of your patch (and together with the EQ/NE fix
pointed out here: http://gcc.gnu.org/ml/gcc-patches/2012-08/msg00170.html),
I can complete a bootstrap/testing cycle without regressions.

(There's still code being generated that looks a bit inefficient, but that's
a different story.)

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com


* Re: [PATCH 2/2] s390: Convert from sync to atomic optabs
  2012-07-30 22:36           ` [PATCH 2/2] s390: Convert from sync to atomic optabs Richard Henderson
@ 2012-08-06 18:34             ` Ulrich Weigand
  2012-08-06 18:51               ` Richard Henderson
  2012-08-06 22:40               ` s390: Avoid CAS boolean output inefficiency Richard Henderson
  0 siblings, 2 replies; 29+ messages in thread
From: Ulrich Weigand @ 2012-08-06 18:34 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches

Richard Henderson wrote:


Some more comments on this patch.

> +; Different from movdi_31 in that we have no splitters.
> +(define_insn "atomic_loaddi_1"
> +  [(set (match_operand:DI 0 "register_operand" "=d,d,!*f,!*f")
> +	(unspec:DI [(match_operand:DI 1 "s_operand" "Q,S,Q,m")]

The constraint line doesn't look right.  ld(y) accept base+index,
so they should get R and T constraints, respectively.

In fact, I wouldn't use "s_operand" as a predicate, since this
excludes valid addresses for those cases, and since in general
I've found it to be preferable to have reload fix up addresses.
So I'd just use "memory_operand" here, and actually in all the
other patterns as well ...  (This also simplifies the expander.)

> +		   UNSPEC_MOVA))]
> +  "!TARGET_ZARCH"
> +  "@
> +   lm\t%0,%M0,%S1
> +   lmy\t%0,%M0,%S1
> +   ld\t%0,%1
> +   ldy\t%0,%1"
> +  [(set_attr "op_type" "RS,RSY,RS,RSY")
> +   (set_attr "type" "lm,lm,floaddf,floaddf")])
> +
> +(define_insn "atomic_loadti_1"
> +  [(set (match_operand:TI 0 "register_operand" "=r")
> +	(unspec:TI [(match_operand:TI 1 "memory_operand" "m")]

"m" is wrong here (and basically everywhere), since it includes
SYMBOL_REFs for lrl etc. on z10.  For lpq we need "RT".

> +  "lpq\t%0,%1"
> +  [(set_attr "op_type" "RXY")])

LPQ and STPQ probably should get a "type" of "other".  This may not
model the pipeline exactly, but it's better than just assuming
"integer" by default.  These instructions are special (and slow).

> +(define_expand "atomic_compare_and_swap<mode>"
> +  [(match_operand:SI 0 "register_operand")	;; bool success output
> +   (match_operand:DGPR 1 "register_operand")	;; oldval output
> +   (match_operand:DGPR 2 "s_operand")		;; memory

memory_operand probably better again.

> +   (match_operand:DGPR 3 "register_operand")	;; expected input
> +   (match_operand:DGPR 4 "register_operand")	;; newval input
> +   (match_operand:SI 5 "const_int_operand")	;; is_weak
> +   (match_operand:SI 6 "const_int_operand")	;; success model
> +   (match_operand:SI 7 "const_int_operand")]	;; failure model
> +  ""
> +{
> +  rtx cc, cmp;
> +  emit_insn (gen_atomic_compare_and_swap<mode>_internal
> +	     (operands[1], operands[2], operands[3], operands[4]));
> +  cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
> +  cmp = gen_rtx_NE (SImode, cc, const0_rtx);

As already pointed out, this needs to be EQ.

> -  ""
> -  "cds<tg>\t%0,%3,%S1"
> -  [(set_attr "op_type" "RS<TE>")
> +  "TARGET_ZARCH"
> +  "c<td>sg\t%0,%3,%S1"
> +  [(set_attr "op_type" "RXY")

Should be "RSY"

> +  "@
> +   cds\t%0,%3,%S1
> +   cdsy\t%0,%3,%S1"
> +  [(set_attr "op_type" "RX,RXY")

Should be "RS, RSY"

> +  "@
> +   cs\t%0,%3,%S1
> +   csy\t%0,%3,%S1"
> +  [(set_attr "op_type" "RX,RXY")

Likewise.

> +(define_insn "atomic_fetch_<atomic><mode>_iaf"
> +  [(set (match_operand:GPR 0 "register_operand" "=d")
> +	(match_operand:GPR 1 "s_operand" "+S"))
> +   (set (match_dup 1)
> +	(unspec_volatile:GPR
> +	 [(ATOMIC_Z196:GPR (match_dup 1)
> +			   (match_operand:GPR 2 "general_operand" "d"))]
> +	 UNSPECV_ATOMIC_OP))
> +   (clobber (reg:CC CC_REGNUM))]
>    "TARGET_Z196"
> -  "la<noxa><g>\t%0,%2,%1")
> +  "la<noxa><g>\t%0,%2,%1"
> +  [(set_attr "op_type" "RXY")

Again RSY.



There is one particular inefficiency I have noticed.  This code:

  if (!__atomic_compare_exchange_n (&v, &expected, max, 0 , 0, 0))
    abort ();

from atomic-compare-exchange-3.c gets compiled into:

        l       %r3,0(%r2)
        larl    %r1,v
        cs      %r3,%r4,0(%r1)
        ipm     %r1
        sra     %r1,28
        st      %r3,0(%r2)
        ltr     %r1,%r1
        jne     .L3

which is extremely inefficient; it converts the condition code into
an integer using the slow ipm, sra sequence, just so that it can
convert the integer back into a condition code via ltr and branch
on it ...

Now, with the old code this sequence used to be optimized away by
combine and we'd just have the cs followed by a branch.  This is
not done any more because we have the store to "expected" in between.

This is because of this code in combine:

      /* Make sure that the value that is to be substituted for the register
         does not use any registers whose values alter in between.  However,
         If the insns are adjacent, a use can't cross a set even though we
         think it might (this can happen for a sequence of insns each setting
         the same destination; last_set of that register might point to
         a NOTE).  If INSN has a REG_EQUIV note, the register is always
         equivalent to the memory so the substitution is valid even if there
         are intervening stores.  Also, don't move a volatile asm or
         UNSPEC_VOLATILE across any other insns.  */
      || (! all_adjacent
          && (((!MEM_P (src)
                || ! find_reg_note (insn, REG_EQUIV, src))
               && use_crosses_set_p (src, DF_INSN_LUID (insn)))
              || (GET_CODE (src) == ASM_OPERANDS && MEM_VOLATILE_P (src))
              || GET_CODE (src) == UNSPEC_VOLATILE))

Note that we have exactly the case mentioned, where a series of instructions
to be combined all set the same (CC) register.  If they're all adjacent,
this still gets optimized away -- but they no longer are due to the store.

Is there a way of structuring the atomic optabs/expander so that the store
gets done either before or after all the CC operations?


Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com


* Re: [PATCH 2/2] s390: Convert from sync to atomic optabs
  2012-08-06 18:34             ` Ulrich Weigand
@ 2012-08-06 18:51               ` Richard Henderson
  2012-08-06 19:45                 ` Richard Henderson
  2012-08-06 22:40               ` s390: Avoid CAS boolean output inefficiency Richard Henderson
  1 sibling, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2012-08-06 18:51 UTC (permalink / raw)
  To: Ulrich Weigand; +Cc: gcc-patches

On 08/06/2012 11:34 AM, Ulrich Weigand wrote:
> Richard Henderson wrote:
> 
> 
> Some more comments on this patch.
> 
>> +; Different from movdi_31 in that we have no splitters.
>> +(define_insn "atomic_loaddi_1"
>> +  [(set (match_operand:DI 0 "register_operand" "=d,d,!*f,!*f")
>> +	(unspec:DI [(match_operand:DI 1 "s_operand" "Q,S,Q,m")]
> 
> The constraint line doesn't look right.  ld(y) accept base+index,
> so they should get R and T constraints, respectively.

Yes, but since I'd limited to s_operand, R seemed weird.

> In fact, I wouldn't use "s_operand" as a predicate, since this
> excludes valid addresses for those cases, and since in general
> I've found it to be preferable to have reload fix up addresses.
> So I'd just use "memory_operand" here, and actually in all the
> other patterns as well ...  (This also simplifes the expander.)

In the first patch I did, I had memory_operand and QS, but that
ran into reload failures.  I assumed I'd just made a mistake.

I'll see if I can replicate this for your debugging enjoyment...


> "m" is wrong here (and basically everywhere), since it includes
> SYMBOL_REFs for lrl etc. on z10.  For lpq we need "RT".

Ah right.

> LPQ and STPQ probably should get a "type" of "other".  This may not
> model the pipeline exactly, but it's better than just assuming
> "integer" by default.  These instructions are special (and slow).

Noted.

> There is one particular inefficiency I have noticed.  This code:
> 
>   if (!__atomic_compare_exchange_n (&v, &expected, max, 0 , 0, 0))
>     abort ();
> 
> from atomic-compare-exchange-3.c gets compiled into:
> 
>         l       %r3,0(%r2)
>         larl    %r1,v
>         cs      %r3,%r4,0(%r1)
>         ipm     %r1
>         sra     %r1,28
>         st      %r3,0(%r2)
>         ltr     %r1,%r1
>         jne     .L3
> 
> which is extremely inefficient; it converts the condition code into
> an integer using the slow ipm, sra sequence, just so that it can
> convert the integer back into a condition code via ltr and branch
> on it ...
...
> Is there a way of structuring the atomic optabs/expander so that the store
> gets done either before or after all the CC operations?

No.  But I'm a bit confused since a store doesn't affect the flags,
so I'm not sure why the EQ can't be combined with the branch.  I'll
give that another look.

All that said, one of the things that MacLeod's gimple_atomic will
achieve is allowing two register outputs, which (for the common case)
will eliminate the store entirely.


r~


* Re: [PATCH 2/2] s390: Convert from sync to atomic optabs
  2012-08-06 18:51               ` Richard Henderson
@ 2012-08-06 19:45                 ` Richard Henderson
  0 siblings, 0 replies; 29+ messages in thread
From: Richard Henderson @ 2012-08-06 19:45 UTC (permalink / raw)
  To: Ulrich Weigand; +Cc: gcc-patches

On 08/06/2012 11:50 AM, Richard Henderson wrote:
> In the first patch I did, I had memory_operand and QS, but that
> ran into reload failures.  I assumed I'd just made a mistake.
> 
> I'll see if I can replicate this for your debugging enjoyment...

I think I had written =S instead of =QS, which of course fails for old 31-bit.
I can't reproduce the problem now...


r~


* s390: Avoid CAS boolean output inefficiency
  2012-08-06 18:34             ` Ulrich Weigand
  2012-08-06 18:51               ` Richard Henderson
@ 2012-08-06 22:40               ` Richard Henderson
  2012-08-07 17:02                 ` Ulrich Weigand
  2012-08-09 16:55                 ` Eric Botcazou
  1 sibling, 2 replies; 29+ messages in thread
From: Richard Henderson @ 2012-08-06 22:40 UTC (permalink / raw)
  To: Ulrich Weigand; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2618 bytes --]

On 08/06/2012 11:34 AM, Ulrich Weigand wrote:
> There is one particular inefficiency I have noticed.  This code:
> 
>   if (!__atomic_compare_exchange_n (&v, &expected, max, 0 , 0, 0))
>     abort ();
> 
> from atomic-compare-exchange-3.c gets compiled into:
> 
>         l       %r3,0(%r2)
>         larl    %r1,v
>         cs      %r3,%r4,0(%r1)
>         ipm     %r1
>         sra     %r1,28
>         st      %r3,0(%r2)
>         ltr     %r1,%r1
>         jne     .L3
> 
> which is extremely inefficient; it converts the condition code into
> an integer using the slow ipm, sra sequence, just so that it can
> convert the integer back into a condition code via ltr and branch
> on it ...

This was caused (or perhaps abetted by) the representation of EQ
as NE ^ 1.  With the subsequent truncation and zero-extend, I
think combine reached its insn limit of 3 before seeing everything
it needed to see.

I'm able to fix this problem by representing EQ as EQ before reload.
For extimm targets this results in identical code; for older targets
it requires avoidance of the constant pool, i.e. LHI+XR instead of X.

        l       %r2,0(%r3)
        larl    %r1,v
        cs      %r2,%r5,0(%r1)
        st      %r2,0(%r3)
        jne     .L3

That fixed, we see the second CAS in that file:

        .loc 1 27 0
        cs      %r2,%r2,0(%r1)
        ipm     %r5
        sll     %r5,28
        lhi     %r0,1
        xr      %r5,%r0
        st      %r2,0(%r3)
        ltr     %r5,%r5
        je      .L20

This happens because CSE notices the cbranch vs 0, and sets r116
to zero along the path to

     32   if (!__atomic_compare_exchange_n (&v, &expected, 0, STRONG,
                                            __ATOMIC_RELEASE, __ATOMIC_ACQUIRE))

at which point CSE decides that it would be cheaper to "re-use"
the zero already in r116 instead of load another constant 0 here.
After that, combine is ham-strung because r116 is not dead.

I'm not quite sure of the best way to fix this, since rtx_costs already
has all constants cost 0.  CSE ought not believe that r116 is better
than a plain constant.  CSE also shouldn't be extending the life of
pseudos this way.

A short-term possibility is to have the CAS insns accept general_operand,
so that the 0 gets merged.  With reload inheritance and post-reload cse,
that might produce code that is "good enough".  Certainly it's effective
for the atomic-compare-exchange-3.c testcase.  I'm less than happy with
that, since the non-optimization of CAS depends on following code that
is totally unrelated.

This patch ought to be independent of any other patch so far.


r~

[-- Attachment #2: z --]
[-- Type: text/plain, Size: 2305 bytes --]

diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 0e43e51..bed6b79 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -5325,12 +5325,15 @@
 	    (match_operand 3 "const0_operand")]))
      (clobber (reg:CC CC_REGNUM))])]
   ""
-  "emit_insn (gen_sne (operands[0], operands[2]));
-   if (GET_CODE (operands[1]) == EQ)
-     emit_insn (gen_xorsi3 (operands[0], operands[0], const1_rtx));
-   DONE;")
+{
+  if (!TARGET_EXTIMM && GET_CODE (operands[1]) == EQ)
+    {
+      emit_insn (gen_seq_neimm (operands[0], operands[2]));
+      DONE;
+    }
+})
 
-(define_insn_and_split "sne"
+(define_insn_and_split "*sne"
   [(set (match_operand:SI 0 "register_operand" "=d")
 	(ne:SI (match_operand:CCZ1 1 "register_operand" "0")
 	       (const_int 0)))
@@ -5342,6 +5345,48 @@
     [(set (match_dup 0) (ashiftrt:SI (match_dup 0) (const_int 28)))
      (clobber (reg:CC CC_REGNUM))])])
 
+(define_insn_and_split "*seq"
+  [(set (match_operand:SI 0 "register_operand" "=d")
+	(eq:SI (match_operand:CCZ1 1 "register_operand" "0")
+	       (const_int 0)))
+   (clobber (reg:CC CC_REGNUM))]
+  "TARGET_EXTIMM"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  emit_insn (gen_lshrsi3 (op0, op0, GEN_INT (28)));
+  emit_insn (gen_xorsi3 (op0, op0, const1_rtx));
+  DONE;
+})
+
+;; ??? Ideally we'd USE a const1_rtx, properly reloaded, but that makes
+;; things more difficult for combine (which can only insert clobbers).
+;; But perhaps it would be better still to have simply used a branch around
+;; constant load instead of beginning with the IPM?
+;;
+;; What about LOCR for Z196?  That's a more general question about cstore
+;; being decomposed into movcc...
+
+(define_insn_and_split "seq_neimm"
+  [(set (match_operand:SI 0 "register_operand" "=d")
+	(eq:SI (match_operand:CCZ1 1 "register_operand" "0")
+	       (const_int 0)))
+   (clobber (match_scratch:SI 2 "=&d"))
+   (clobber (reg:CC CC_REGNUM))]
+  "!TARGET_EXTIMM"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rtx op0 = operands[0];
+  rtx op2 = operands[2];
+  emit_insn (gen_ashlsi3 (op0, op0, GEN_INT (28)));
+  emit_move_insn (op2, const1_rtx);
+  emit_insn (gen_xorsi3 (op0, op0, op2));
+  DONE;
+})
 
 ;;
 ;; - Conditional move instructions (introduced with z196)

* Re: s390: Avoid CAS boolean output inefficiency
  2012-08-06 22:40               ` s390: Avoid CAS boolean output inefficiency Richard Henderson
@ 2012-08-07 17:02                 ` Ulrich Weigand
  2012-08-07 22:13                   ` Richard Henderson
  2012-08-09 16:55                 ` Eric Botcazou
  1 sibling, 1 reply; 29+ messages in thread
From: Ulrich Weigand @ 2012-08-07 17:02 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches

Richard Henderson wrote:
> On 08/06/2012 11:34 AM, Ulrich Weigand wrote:
> > There is one particular inefficiency I have noticed.  This code:
> > 
> >   if (!__atomic_compare_exchange_n (&v, &expected, max, 0 , 0, 0))
> >     abort ();
> > 
> > from atomic-compare-exchange-3.c gets compiled into:
> > 
> >         l       %r3,0(%r2)
> >         larl    %r1,v
> >         cs      %r3,%r4,0(%r1)
> >         ipm     %r1
> >         sra     %r1,28
> >         st      %r3,0(%r2)
> >         ltr     %r1,%r1
> >         jne     .L3
> > 
> > which is extremely inefficient; it converts the condition code into
> > an integer using the slow ipm, sra sequence, just so that it can
> > convert the integer back into a condition code via ltr and branch
> > on it ...
> 
> This was caused (or perhaps abetted by) the representation of EQ
> as NE ^ 1.  With the subsequent truncation and zero-extend, I
> think combine reached its insn limit of 3 before seeing everything
> it needed to see.

Yes, combine isn't able to handle everything in one go, but it does
find an intermediate combination.  With the old __sync_compare_and_swap,
it is then able to optimize everything away in a second step; the only
reason this no longer works is the intervening store.
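
Schematically (a hand-written sketch with made-up pseudos, not a
compiler dump), the expanded stream looks like:

  ;; cs: sets (reg:SI 62) and (reg:CCZ1 33) in one parallel (elided)
  (set (reg:SI 63) (ne:SI (reg:CCZ1 33) (const_int 0)))   ;; ipm; sra
  (set (mem:SI (reg:SI 64)) (reg:SI 62))                  ;; st to *expect
  (set (pc) (if_then_else (ne (reg:SI 63) (const_int 0))  ;; ltr; jne
                          (label_ref 42)
                          (pc)))

where the st of the CAS output to *expect sits between the CC consumer
and the branch, and combine cannot see past it.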

The following patch changes the builtin expander to pass a MEM oldval
as-is to the back-end expander, so that the back-end can move the
store to before the CC operation.  With that patch I'm also seeing
all the IPMs disappear.

[ Well, all except for this one:

> This happens because CSE notices the cbranch vs 0, and records that
> r116 is zero along the path to
> 
>      if (!__atomic_compare_exchange_n (&v, &expected, 0, STRONG,
>                                        __ATOMIC_RELEASE, __ATOMIC_ACQUIRE))
> 
> at which point CSE decides that it would be cheaper to "re-use"
> the zero already in r116 instead of loading another constant 0 here.
> After that, combine is hamstrung because r116 is not dead.

As you point out, this does indeed fix this problem as well:

> A short-term possibility is to have the CAS insns accept general_operand,
> so that the 0 gets merged.  
]


What do you think about this solution?  It has the advantage that
we still get the same xor code if we actually do need the ipm ...


Bye,
Ulrich


diff -ur gcc/builtins.c gcc.new/builtins.c
--- gcc/builtins.c	2012-08-07 16:04:45.054348099 +0200
+++ gcc.new/builtins.c	2012-08-07 15:44:01.304349225 +0200
@@ -5376,6 +5376,7 @@
 
   expect = expand_normal (CALL_EXPR_ARG (exp, 1));
   expect = convert_memory_address (Pmode, expect);
+  expect = gen_rtx_MEM (mode, expect);
   desired = expand_expr_force_mode (CALL_EXPR_ARG (exp, 2), mode);
 
   weak = CALL_EXPR_ARG (exp, 3);
@@ -5383,14 +5384,15 @@
   if (host_integerp (weak, 0) && tree_low_cst (weak, 0) != 0)
     is_weak = true;
 
-  oldval = copy_to_reg (gen_rtx_MEM (mode, expect));
-
+  oldval = expect;
   if (!expand_atomic_compare_and_swap ((target == const0_rtx ? NULL : &target),
 				       &oldval, mem, oldval, desired,
 				       is_weak, success, failure))
     return NULL_RTX;
 
-  emit_move_insn (gen_rtx_MEM (mode, expect), oldval);
+  if (oldval != expect)
+    emit_move_insn (expect, oldval);
+
   return target;
 }
 
diff -ur gcc/config/s390/s390.md gcc.new/config/s390/s390.md
--- gcc/config/s390/s390.md	2012-08-07 16:04:54.204348621 +0200
+++ gcc.new/config/s390/s390.md	2012-08-07 16:00:21.934348628 +0200
@@ -8870,7 +8870,7 @@
 
 (define_expand "atomic_compare_and_swap<mode>"
   [(match_operand:SI 0 "register_operand")	;; bool success output
-   (match_operand:DGPR 1 "register_operand")	;; oldval output
+   (match_operand:DGPR 1 "nonimmediate_operand");; oldval output
    (match_operand:DGPR 2 "memory_operand")	;; memory
    (match_operand:DGPR 3 "register_operand")	;; expected intput
    (match_operand:DGPR 4 "register_operand")	;; newval intput
@@ -8879,9 +8879,17 @@
    (match_operand:SI 7 "const_int_operand")]	;; failure model
   ""
 {
-  rtx cc, cmp;
+  rtx cc, cmp, output = operands[1];
+
+  if (!register_operand (output, <MODE>mode))
+    output = gen_reg_rtx (<MODE>mode);
+
   emit_insn (gen_atomic_compare_and_swap<mode>_internal
-	     (operands[1], operands[2], operands[3], operands[4]));
+	     (output, operands[2], operands[3], operands[4]));
+
+  if (output != operands[1])
+    emit_move_insn (operands[1], output);
+
   cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
   cmp = gen_rtx_EQ (SImode, cc, const0_rtx);
   emit_insn (gen_cstorecc4 (operands[0], cmp, cc, const0_rtx));


-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com

* Re: s390: Avoid CAS boolean output inefficiency
  2012-08-07 17:02                 ` Ulrich Weigand
@ 2012-08-07 22:13                   ` Richard Henderson
  2012-08-08 18:05                     ` Ulrich Weigand
  0 siblings, 1 reply; 29+ messages in thread
From: Richard Henderson @ 2012-08-07 22:13 UTC (permalink / raw)
  To: Ulrich Weigand; +Cc: gcc-patches

On 08/07/2012 10:02 AM, Ulrich Weigand wrote:
> The following patch changes the builtin expander to pass a MEM oldval
> as-is to the back-end expander, so that the back-end can move the
> store to before the CC operation.  With that patch I'm also seeing
> all the IPMs disappear.
...
> What do you think about this solution?  It has the advantage that
> we still get the same xor code if we actually do need the ipm ...

I'm ok with that patch.


r~

* Re: s390: Avoid CAS boolean output inefficiency
  2012-08-07 22:13                   ` Richard Henderson
@ 2012-08-08 18:05                     ` Ulrich Weigand
  0 siblings, 0 replies; 29+ messages in thread
From: Ulrich Weigand @ 2012-08-08 18:05 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches

Richard Henderson wrote:
> On 08/07/2012 10:02 AM, Ulrich Weigand wrote:
> > The following patch changes the builtin expander to pass a MEM oldval
> > as-is to the back-end expander, so that the back-end can move the
> > store to before the CC operation.  With that patch I'm also seeing
> > all the IPMs disappear.
> ...
> > What do you think about this solution?  It has the advantage that
> > we still get the same xor code if we actually do need the ipm ...
> 
> I'm ok with that patch.

Thanks!  I've checked in the following version.
Tested on s390x-ibm-linux with no regressions.

Bye,
Ulrich

ChangeLog:

	* builtins.c (expand_builtin_atomic_compare_exchange): Pass old
	value operand as MEM to expand_atomic_compare_and_swap.

	* config/s390/s390.md ("atomic_compare_and_swap<mode>"): Accept
	nonimmediate_operand for old value; generate load and store if
	needed.
	* config/s390/s390.c (s390_expand_cs_hqi): Accept any operand
	as vtarget.

Index: gcc/builtins.c
===================================================================
*** gcc/builtins.c	(revision 190226)
--- gcc/builtins.c	(working copy)
*************** expand_builtin_atomic_compare_exchange (
*** 5376,5381 ****
--- 5376,5382 ----
  
    expect = expand_normal (CALL_EXPR_ARG (exp, 1));
    expect = convert_memory_address (Pmode, expect);
+   expect = gen_rtx_MEM (mode, expect);
    desired = expand_expr_force_mode (CALL_EXPR_ARG (exp, 2), mode);
  
    weak = CALL_EXPR_ARG (exp, 3);
*************** expand_builtin_atomic_compare_exchange (
*** 5383,5396 ****
    if (host_integerp (weak, 0) && tree_low_cst (weak, 0) != 0)
      is_weak = true;
  
!   oldval = copy_to_reg (gen_rtx_MEM (mode, expect));
! 
    if (!expand_atomic_compare_and_swap ((target == const0_rtx ? NULL : &target),
  				       &oldval, mem, oldval, desired,
  				       is_weak, success, failure))
      return NULL_RTX;
  
!   emit_move_insn (gen_rtx_MEM (mode, expect), oldval);
    return target;
  }
  
--- 5384,5398 ----
    if (host_integerp (weak, 0) && tree_low_cst (weak, 0) != 0)
      is_weak = true;
  
!   oldval = expect;
    if (!expand_atomic_compare_and_swap ((target == const0_rtx ? NULL : &target),
  				       &oldval, mem, oldval, desired,
  				       is_weak, success, failure))
      return NULL_RTX;
  
!   if (oldval != expect)
!     emit_move_insn (expect, oldval);
! 
    return target;
  }
  
Index: gcc/config/s390/s390.c
===================================================================
*** gcc/config/s390/s390.c	(revision 190226)
--- gcc/config/s390/s390.c	(working copy)
*************** s390_expand_cs_hqi (enum machine_mode mo
*** 4825,4831 ****
    rtx res = gen_reg_rtx (SImode);
    rtx csloop = NULL, csend = NULL;
  
-   gcc_assert (register_operand (vtarget, VOIDmode));
    gcc_assert (MEM_P (mem));
  
    init_alignment_context (&ac, mem, mode);
--- 4825,4830 ----
Index: gcc/config/s390/s390.md
===================================================================
*** gcc/config/s390/s390.md	(revision 190226)
--- gcc/config/s390/s390.md	(working copy)
***************
*** 8870,8876 ****
  
  (define_expand "atomic_compare_and_swap<mode>"
    [(match_operand:SI 0 "register_operand")	;; bool success output
!    (match_operand:DGPR 1 "register_operand")	;; oldval output
     (match_operand:DGPR 2 "memory_operand")	;; memory
     (match_operand:DGPR 3 "register_operand")	;; expected intput
     (match_operand:DGPR 4 "register_operand")	;; newval intput
--- 8870,8876 ----
  
  (define_expand "atomic_compare_and_swap<mode>"
    [(match_operand:SI 0 "register_operand")	;; bool success output
!    (match_operand:DGPR 1 "nonimmediate_operand");; oldval output
     (match_operand:DGPR 2 "memory_operand")	;; memory
     (match_operand:DGPR 3 "register_operand")	;; expected intput
     (match_operand:DGPR 4 "register_operand")	;; newval intput
***************
*** 8879,8887 ****
     (match_operand:SI 7 "const_int_operand")]	;; failure model
    ""
  {
!   rtx cc, cmp;
    emit_insn (gen_atomic_compare_and_swap<mode>_internal
! 	     (operands[1], operands[2], operands[3], operands[4]));
    cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
    cmp = gen_rtx_EQ (SImode, cc, const0_rtx);
    emit_insn (gen_cstorecc4 (operands[0], cmp, cc, const0_rtx));
--- 8879,8900 ----
     (match_operand:SI 7 "const_int_operand")]	;; failure model
    ""
  {
!   rtx cc, cmp, output = operands[1];
! 
!   if (!register_operand (output, <MODE>mode))
!     output = gen_reg_rtx (<MODE>mode);
! 
    emit_insn (gen_atomic_compare_and_swap<mode>_internal
! 	     (output, operands[2], operands[3], operands[4]));
! 
!   /* We deliberately accept non-register operands in the predicate
!      to ensure the write back to the output operand happens *before*
!      the store-flags code below.  This makes it easier for combine
!      to merge the store-flags code with a potential test-and-branch
!      pattern following (immediately!) afterwards.  */
!   if (output != operands[1])
!     emit_move_insn (operands[1], output);
! 
    cc = gen_rtx_REG (CCZ1mode, CC_REGNUM);
    cmp = gen_rtx_EQ (SImode, cc, const0_rtx);
    emit_insn (gen_cstorecc4 (operands[0], cmp, cc, const0_rtx));
***************
*** 8890,8896 ****
  
  (define_expand "atomic_compare_and_swap<mode>"
    [(match_operand:SI 0 "register_operand")	;; bool success output
!    (match_operand:HQI 1 "register_operand")	;; oldval output
     (match_operand:HQI 2 "memory_operand")	;; memory
     (match_operand:HQI 3 "general_operand")	;; expected intput
     (match_operand:HQI 4 "general_operand")	;; newval intput
--- 8903,8909 ----
  
  (define_expand "atomic_compare_and_swap<mode>"
    [(match_operand:SI 0 "register_operand")	;; bool success output
!    (match_operand:HQI 1 "nonimmediate_operand")	;; oldval output
     (match_operand:HQI 2 "memory_operand")	;; memory
     (match_operand:HQI 3 "general_operand")	;; expected intput
     (match_operand:HQI 4 "general_operand")	;; newval intput

-- 
  Dr. Ulrich Weigand
  GNU Toolchain for Linux on System z and Cell BE
  Ulrich.Weigand@de.ibm.com

* Re: s390: Avoid CAS boolean output inefficiency
  2012-08-06 22:40               ` s390: Avoid CAS boolean output inefficiency Richard Henderson
  2012-08-07 17:02                 ` Ulrich Weigand
@ 2012-08-09 16:55                 ` Eric Botcazou
  1 sibling, 0 replies; 29+ messages in thread
From: Eric Botcazou @ 2012-08-09 16:55 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches, Ulrich Weigand

> This was caused (or perhaps abetted by) the representation of EQ
> as NE ^ 1.  With the subsequent truncation and zero-extend, I
> think combine reached its insn limit of 3 before seeing everything
> it needed to see.

This can be 4 now, if you tweak the initial heuristic.

-- 
Eric Botcazou

Thread overview: 29+ messages
2012-07-29 21:32 [CFT] s390: Convert from sync to atomic optabs Richard Henderson
2012-07-30 14:19 ` Ulrich Weigand
2012-07-30 15:12   ` Richard Henderson
2012-07-30 15:51     ` Ulrich Weigand
2012-07-30 18:53       ` Richard Henderson
2012-07-30 22:33         ` [PATCH 0/2] Convert s390 to atomic optabs, v2 Richard Henderson
2012-07-30 22:33           ` [PATCH 1/2] s390: Reorg s390_expand_insv Richard Henderson
2012-07-30 22:36           ` [PATCH 2/2] s390: Convert from sync to atomic optabs Richard Henderson
2012-08-06 18:34             ` Ulrich Weigand
2012-08-06 18:51               ` Richard Henderson
2012-08-06 19:45                 ` Richard Henderson
2012-08-06 22:40               ` s390: Avoid CAS boolean output inefficiency Richard Henderson
2012-08-07 17:02                 ` Ulrich Weigand
2012-08-07 22:13                   ` Richard Henderson
2012-08-08 18:05                     ` Ulrich Weigand
2012-08-09 16:55                 ` Eric Botcazou
2012-07-31  9:11           ` [PATCH 0/2] Convert s390 to atomic optabs, v2 Richard Guenther
2012-07-31 15:27             ` Andrew MacLeod
2012-07-31 16:07             ` Richard Henderson
2012-08-01  8:41               ` Richard Guenther
2012-08-01 15:59                 ` Richard Henderson
2012-08-01 17:14                   ` Richard Guenther
2012-08-01 19:42                     ` Richard Henderson
2012-07-31 18:36           ` Ulrich Weigand
2012-07-31 19:54             ` Richard Henderson
2012-08-01 23:23             ` Richard Henderson
2012-08-03 12:20               ` Ulrich Weigand
2012-08-03 14:21                 ` Ulrich Weigand
2012-08-06 16:44               ` Ulrich Weigand
