public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][Aarch64] Add support for overflow add and sub operations
@ 2017-05-19  6:27 Michael Collison
  2017-05-19 11:00 ` Christophe Lyon
  0 siblings, 1 reply; 8+ messages in thread
From: Michael Collison @ 2017-05-19  6:27 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd

[-- Attachment #1: Type: text/plain, Size: 3144 bytes --]

Hi,

This patch improves code generation for builtin arithmetic overflow operations for the aarch64 backend. For example, for a simple test case such as:

int
f (int x, int y, int *ovf)
{
  int res;
  *ovf = __builtin_sadd_overflow (x, y, &res);
  return res;
}

Current trunk at -O2 generates

f:
	mov	w3, w0
	mov	w4, 0
	add	w0, w0, w1
	tbnz	w1, #31, .L4
	cmp	w0, w3
	blt	.L3
.L2:
	str	w4, [x2]
	ret
	.p2align 3
.L4:
	cmp	w0, w3
	ble	.L2
.L3:
	mov	w4, 1
	b	.L2


With the patch this now generates:

f:
	adds	w0, w0, w1
	cset	w1, vs
	str	w1, [x2]
	ret


Original patch from Richard Henderson:

https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01903.html


Okay for trunk?

2017-05-17  Michael Collison  <michael.collison@arm.com>
	    Richard Henderson <rth@redhat.com>

	* config/aarch64/aarch64-modes.def (CC_V): New.
	* config/aarch64/aarch64-protos.h
	(aarch64_add_128bit_scratch_regs): Declare.
	(aarch64_subv_128bit_scratch_regs): Declare.
	(aarch64_expand_subvti): Declare.
	(aarch64_gen_unlikely_cbranch): Declare.
	* config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
	for signed overflow using CC_Vmode.
	(aarch64_get_condition_code_1): Handle CC_Vmode.
	(aarch64_gen_unlikely_cbranch): New function.
	(aarch64_add_128bit_scratch_regs): New function.
	(aarch64_subv_128bit_scratch_regs): New function.
	(aarch64_expand_subvti): New function.
	* config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
	(addti3): Create simpler code if low part is already known to be 0.
	(addvti4, uaddvti4): New.
	(*add<GPI>3_compareC_cconly_imm): New.
	(*add<GPI>3_compareC_cconly): New.
	(*add<GPI>3_compareC_imm): New.
	(*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
	handle constants within this pattern.
	(*add<GPI>3_compareV_cconly_imm): New.
	(*add<GPI>3_compareV_cconly): New.
	(*add<GPI>3_compareV_imm): New.
	(add<GPI>3_compareV): New.
	(add<GPI>3_carryinC, add<GPI>3_carryinV): New.
	(*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
	(*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
	(subv<GPI>4, usubv<GPI>4): New.
	(subti3): Handle op1 zero.
	(subvti4, usubvti4): New.
	(*sub<GPI>3_compare1_imm): New.
	(sub<GPI>3_carryinCV): New.
	(*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
	(*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
	* testsuite/gcc.target/aarch64/builtin_sadd_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_saddl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_saddll.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_uadd_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_uaddl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_uaddll.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_ssub_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_ssubl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_ssubll.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_usub_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_usubl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_usubll.c: New testcase.

[-- Attachment #2: PR6308.patch --]
[-- Type: application/octet-stream, Size: 29469 bytes --]

diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 45f7a44..244e490 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
 CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
 CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
 CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
+CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
 
 /* Half-precision floating point for __fp16.  */
 FLOAT_MODE (HF, 2, 0);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f55d4ba..f38b2b8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -388,6 +388,18 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 bool aarch64_return_address_signing_enabled (void);
 void aarch64_save_restore_target_globals (tree);
+void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				      rtx *low_in1, rtx *low_in2,
+				      rtx *high_dest, rtx *high_in1,
+				      rtx *high_in2);
+void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				       rtx *low_in1, rtx *low_in2,
+				       rtx *high_dest, rtx *high_in1,
+				       rtx *high_in2);
+void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
+			    rtx low_in2, rtx high_dest, rtx high_in1,
+			    rtx high_in2);
+
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
@@ -412,6 +424,8 @@ bool aarch64_float_const_representable_p (rtx);
 
 #if defined (RTX_CODE)
 
+void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
+				   rtx label_ref);
 bool aarch64_legitimate_address_p (machine_mode, rtx, RTX_CODE, bool);
 machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
 rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f343d92..71a651c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4716,6 +4716,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
       && GET_CODE (y) == ZERO_EXTEND)
     return CC_Cmode;
 
+  /* A test for signed overflow.  */
+  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
+      && code == NE
+      && GET_CODE (x) == PLUS
+      && GET_CODE (y) == SIGN_EXTEND)
+    return CC_Vmode;
+
   /* For everything else, return CCmode.  */
   return CCmode;
 }
@@ -4822,6 +4829,15 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
 	}
       break;
 
+    case CC_Vmode:
+      switch (comp_code)
+	{
+	case NE: return AARCH64_VS;
+	case EQ: return AARCH64_VC;
+	default: return -1;
+	}
+      break;
+
     default:
       return -1;
     }
@@ -13630,6 +13646,88 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
   return true;
 }
 
+/* Generate RTL for a conditional branch with rtx comparison CODE in
+   mode CC_MODE.  The destination of the unlikely conditional branch
+   is LABEL_REF.  */
+
+void
+aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
+			      rtx label_ref)
+{
+  rtx x;
+  x = gen_rtx_fmt_ee (code, VOIDmode,
+		      gen_rtx_REG (cc_mode, CC_REGNUM),
+		      const0_rtx);
+
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
+			    pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+}
+
+void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				      rtx *low_in1, rtx *low_in2,
+				      rtx *high_dest, rtx *high_in1,
+				      rtx *high_in2)
+{
+  *low_dest = gen_reg_rtx (DImode);
+  *low_in1 = gen_lowpart (DImode, op1);
+  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *high_dest = gen_reg_rtx (DImode);
+  *high_in1 = gen_highpart (DImode, op1);
+  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+}
+
+void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				       rtx *low_in1, rtx *low_in2,
+				       rtx *high_dest, rtx *high_in1,
+				       rtx *high_in2)
+{
+  *low_dest = gen_reg_rtx (DImode);
+  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *high_dest = gen_reg_rtx (DImode);
+  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+
+}
+
+void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
+			    rtx low_in2, rtx high_dest, rtx high_in1,
+			    rtx high_in2)
+{
+  if (low_in2 == const0_rtx)
+    {
+      low_dest = low_in1;
+      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
+				      force_reg (DImode, high_in2)));
+    }
+  else
+    {
+      if (CONST_INT_P (low_in2))
+	{
+	  low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
+	  high_in2 = force_reg (DImode, high_in2);
+	  emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
+	}
+      else
+	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
+      emit_insn (gen_subdi3_carryinCV (high_dest,
+				       force_reg (DImode, high_in1),
+				       high_in2));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
+  emit_move_insn (gen_highpart (DImode, op0), high_dest);
+
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a693a3b..3976ecb 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1711,25 +1711,123 @@
   }
 )
 
+(define_expand "addv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "uaddv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+
+  DONE;
+})
+
+
 (define_expand "addti3"
   [(set (match_operand:TI 0 "register_operand" "")
 	(plus:TI (match_operand:TI 1 "register_operand" "")
-		 (match_operand:TI 2 "register_operand" "")))]
+		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0,l1,l2,h0,h1,h2;
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      if (!aarch64_pluslong_operand (h2, DImode))
+	h2 = force_reg (DImode, h2);
+      emit_insn (gen_adddi3 (h0, h1, h2));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
   DONE;
 })
 
+(define_expand "addvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+  DONE;
+})
+
+(define_expand "uaddvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+  DONE;
+ })
+
 (define_insn "add<mode>3_compare0"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
@@ -1828,10 +1926,70 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+;; Note that since we're sign-extending, match the immediate in GPI
+;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
+(define_insn "*add<mode>3_compareV_cconly_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
+	    (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "@
+  cmn\\t%<w>0, %<w>1
+  cmp\\t%<w>0, #%n1"
+  [(set_attr "type" "alus_imm")]
+)
+
+(define_insn "*add<mode>3_compareV_cconly"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "cmn\\t%<w>0, %<w>1"
+  [(set_attr "type" "alus_sreg")]
+)
+
+(define_insn "*add<mode>3_compareV_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r,r"))
+	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI>
+	    (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+   ""
+   "@
+   adds\\t%<w>0, %<w>1, %<w>2
+   subs\\t%<w>0, %<w>1, #%n2"
+  [(set_attr "type" "alus_imm,alus_imm")]
+)
+
+(define_insn "add<mode>3_compareV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+  ""
+  "adds\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "alus_sreg")]
+)
+
 (define_insn "*adds_shift_imm_<mode>"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
-	 (plus:GPI (ASHIFT:GPI 
+	 (plus:GPI (ASHIFT:GPI
 		    (match_operand:GPI 1 "register_operand" "r")
 		    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
 		   (match_operand:GPI 3 "register_operand" "r"))
@@ -2187,6 +2345,138 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "add<mode>3_carryinC"
+  [(parallel
+     [(set (match_dup 3)
+	   (ne:CC_C
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 4)
+		 (zero_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (zero_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (zero_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 5) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 5) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
+  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinC_zero"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinC"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_expand "add<mode>3_carryinV"
+  [(parallel
+     [(set (reg:CC_V CC_REGNUM)
+	   (ne:CC_V
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 3)
+		 (sign_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (sign_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 4) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 4) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
+  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinV_zero"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*add_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(plus:GPI (and:GPI
@@ -2283,22 +2573,86 @@
    (set_attr "simd" "*,yes")]
 )
 
+(define_expand "subv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "usubv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
+
+  DONE;
+})
+
 (define_expand "subti3"
   [(set (match_operand:TI 0 "register_operand" "")
-	(minus:TI (match_operand:TI 1 "register_operand" "")
+	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
 		  (match_operand:TI 2 "register_operand" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx l2 = gen_lowpart (DImode, operands[2]);
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_highpart_offset (DImode, TImode));
+  rtx h2 = gen_highpart (DImode, operands[2]);
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  emit_insn (gen_subdi3_compare1 (l0, l1, l2));
+  emit_insn (gen_subdi3_carryin (h0, h1, h2));
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+  DONE;
+})
+
+(define_expand "subvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
+				    &l0, &l1, &l2, &h0, &h1, &h2);
+  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+  DONE;
+})
+
+(define_expand "usubvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
+				    &l0, &l1, &l2, &h0, &h1, &h2);
+  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
+
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
   DONE;
 })
 
@@ -2327,6 +2681,22 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+(define_insn "*sub<mode>3_compare1_imm"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
+	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
+  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
+  "@
+  subs\\t%<w>0, %<w>1, %<w>2
+  adds\\t%<w>0, %<w>1, %<w>3"
+  [(set_attr "type" "alus_imm")]
+)
+
 (define_insn "sub<mode>3_compare1"
   [(set (reg:CC CC_REGNUM)
 	(compare:CC
@@ -2554,6 +2924,85 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "sub<mode>3_carryinCV"
+  [(parallel
+     [(set (reg:CC CC_REGNUM)
+	   (compare:CC
+	     (sign_extend:<DWI>
+	       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
+	     (plus:<DWI>
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r"))
+	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
+      (set (match_operand:GPI 0 "register_operand" "=r")
+	   (minus:GPI
+	     (minus:GPI (match_dup 1) (match_dup 2))
+	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
+   ""
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r"))
+	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (neg:GPI (match_dup 1))
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, %<w>1"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 2 "register_operand" "r"))
+	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (minus:GPI (match_dup 1) (match_dup 2))
+	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*sub_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(minus:GPI (match_operand:GPI 4 "register_operand" "rk")
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
new file mode 100644
index 0000000..6d84bb6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+__int128 overflow_add (__int128 x, __int128 y)
+{
+  __int128 r;
+
+  int ovr = __builtin_add_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+/* { dg-final { scan-assembler "adcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
new file mode 100644
index 0000000..9768a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long overflow_add (long x, long y)
+{
+  long r;
+
+  int ovr = __builtin_saddl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
new file mode 100644
index 0000000..126a526
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long long overflow_add (long long x, long long y)
+{
+  long long r;
+
+  int ovr = __builtin_saddll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
new file mode 100644
index 0000000..c1261e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+__int128 overflow_sub (__int128 x, __int128 y)
+{
+  __int128 r;
+
+  int ovr = __builtin_sub_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+/* { dg-final { scan-assembler "sbcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
new file mode 100644
index 0000000..1040464
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long overflow_sub (long x, long y)
+{
+  long r;
+
+  int ovr = __builtin_ssubl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
new file mode 100644
index 0000000..a03df88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long long overflow_sub (long long x, long long y)
+{
+  long long r;
+
+  int ovr = __builtin_ssubll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
new file mode 100644
index 0000000..8c7c998
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned __int128 overflow_add (unsigned __int128 x, unsigned __int128 y)
+{
+  unsigned __int128 r;
+
+  int ovr = __builtin_add_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+/* { dg-final { scan-assembler "adcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
new file mode 100644
index 0000000..e325591
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long overflow_add (unsigned long x, unsigned long y)
+{
+  unsigned long r;
+
+  int ovr = __builtin_uaddl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
new file mode 100644
index 0000000..5f42886
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long long overflow_add (unsigned long long x, unsigned long long y)
+{
+  unsigned long long r;
+
+  int ovr = __builtin_uaddll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
new file mode 100644
index 0000000..a84f4a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned __int128 overflow_sub (unsigned __int128 x, unsigned __int128 y)
+{
+  unsigned __int128 r;
+
+  int ovr = __builtin_sub_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+/* { dg-final { scan-assembler "sbcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
new file mode 100644
index 0000000..ed033da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long overflow_sub (unsigned long x, unsigned long y)
+{
+  unsigned long r;
+
+  int ovr = __builtin_usubl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
new file mode 100644
index 0000000..a742f0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long long overflow_sub (unsigned long long x, unsigned long long y)
+{
+  unsigned long long r;
+
+  int ovr = __builtin_usubll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+
-- 
1.9.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][Aarch64] Add support for overflow add and sub operations
  2017-05-19  6:27 [PATCH][Aarch64] Add support for overflow add and sub operations Michael Collison
@ 2017-05-19 11:00 ` Christophe Lyon
  2017-05-19 21:42   ` Michael Collison
  0 siblings, 1 reply; 8+ messages in thread
From: Christophe Lyon @ 2017-05-19 11:00 UTC (permalink / raw)
  To: Michael Collison; +Cc: gcc-patches, nd

Hi Michael,


On 19 May 2017 at 07:12, Michael Collison <Michael.Collison@arm.com> wrote:
> Hi,
>
> This patch improves code generations for builtin arithmetic overflow operations for the aarch64 backend. As an example for a simple test case such as:
>
> Sure for a simple test case such as:
>
> int
> f (int x, int y, int *ovf)
> {
>   int res;
>   *ovf = __builtin_sadd_overflow (x, y, &res);
>   return res;
> }
>
> Current trunk at -O2 generates
>
> f:
>         mov     w3, w0
>         mov     w4, 0
>         add     w0, w0, w1
>         tbnz    w1, #31, .L4
>         cmp     w0, w3
>         blt     .L3
> .L2:
>         str     w4, [x2]
>         ret
>         .p2align 3
> .L4:
>         cmp     w0, w3
>         ble     .L2
> .L3:
>         mov     w4, 1
>         b       .L2
>
>
> With the patch this now generates:
>
> f:
>         adds    w0, w0, w1
>         cset    w1, vs
>         str     w1, [x2]
>         ret
>
>
> Original patch from Richard Henderson:
>
> https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01903.html
>
>
> Okay for trunk?
>
> 2017-05-17  Michael Collison  <michael.collison@arm.com>
>             Richard Henderson <rth@redhat.com>
>
>         * config/aarch64/aarch64-modes.def (CC_V): New.
>         * config/aarch64/aarch64-protos.h
>         (aarch64_add_128bit_scratch_regs): Declare.
>         (aarch64_subv_128bit_scratch_regs): Declare.
>         (aarch64_expand_subvti): Declare.
>         (aarch64_gen_unlikely_cbranch): Declare.
>         * config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
>         for signed overflow using CC_Vmode.
>         (aarch64_get_condition_code_1): Handle CC_Vmode.
>         (aarch64_gen_unlikely_cbranch): New function.
>         (aarch64_add_128bit_scratch_regs): New function.
>         (aarch64_subv_128bit_scratch_regs): New function.
>         (aarch64_expand_subvti): New function.
>         * config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
>         (addti3): Create simpler code if low part is already known to be 0.
>         (addvti4, uaddvti4): New.
>         (*add<GPI>3_compareC_cconly_imm): New.
>         (*add<GPI>3_compareC_cconly): New.
>         (*add<GPI>3_compareC_imm): New.
>         (*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
>         handle constants within this pattern.
>         (*add<GPI>3_compareV_cconly_imm): New.
>         (*add<GPI>3_compareV_cconly): New.
>         (*add<GPI>3_compareV_imm): New.
>         (add<GPI>3_compareV): New.
>         (add<GPI>3_carryinC, add<GPI>3_carryinV): New.
>         (*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
>         (*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
>         (subv<GPI>4, usubv<GPI>4): New.
>         (subti3): Handle op1 zero.
>         (subvti4, usubvti4): New.
>         (*sub<GPI>3_compare1_imm): New.
>         (sub<GPI>3_carryinCV): New.
>         (*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
>         (*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
>         * testsuite/gcc.target/aarch64/builtin_sadd_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_saddl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_saddll.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_uadd_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_uaddl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_uaddll.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_ssub_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_ssubl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_ssubll.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_usub_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_usubl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_usubll.c: New testcase.

I've tried your patch, and 2 of the new tests FAIL:
    gcc.target/aarch64/builtin_sadd_128.c scan-assembler addcs
    gcc.target/aarch64/builtin_uadd_128.c scan-assembler addcs

Am I missing something?

Thanks,

Christophe

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH][Aarch64] Add support for overflow add and sub operations
  2017-05-19 11:00 ` Christophe Lyon
@ 2017-05-19 21:42   ` Michael Collison
  2017-07-05  9:38     ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 8+ messages in thread
From: Michael Collison @ 2017-05-19 21:42 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches, nd

[-- Attachment #1: Type: text/plain, Size: 4671 bytes --]

Christophe,

I had a typo in the two test cases: "addcs" should have been "adcs". I caught this previously but submitted the previous patch incorrectly. Updated patch attached.

Okay for trunk?

-----Original Message-----
From: Christophe Lyon [mailto:christophe.lyon@linaro.org] 
Sent: Friday, May 19, 2017 3:59 AM
To: Michael Collison <Michael.Collison@arm.com>
Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub operations

Hi Michael,


On 19 May 2017 at 07:12, Michael Collison <Michael.Collison@arm.com> wrote:
> Hi,
>
> This patch improves code generations for builtin arithmetic overflow operations for the aarch64 backend. As an example for a simple test case such as:
>
> Sure for a simple test case such as:
>
> int
> f (int x, int y, int *ovf)
> {
>   int res;
>   *ovf = __builtin_sadd_overflow (x, y, &res);
>   return res;
> }
>
> Current trunk at -O2 generates
>
> f:
>         mov     w3, w0
>         mov     w4, 0
>         add     w0, w0, w1
>         tbnz    w1, #31, .L4
>         cmp     w0, w3
>         blt     .L3
> .L2:
>         str     w4, [x2]
>         ret
>         .p2align 3
> .L4:
>         cmp     w0, w3
>         ble     .L2
> .L3:
>         mov     w4, 1
>         b       .L2
>
>
> With the patch this now generates:
>
> f:
>         adds    w0, w0, w1
>         cset    w1, vs
>         str     w1, [x2]
>         ret
>
>
> Original patch from Richard Henderson:
>
> https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01903.html
>
>
> Okay for trunk?
>
> 2017-05-17  Michael Collison  <michael.collison@arm.com>
>             Richard Henderson <rth@redhat.com>
>
>         * config/aarch64/aarch64-modes.def (CC_V): New.
>         * config/aarch64/aarch64-protos.h
>         (aarch64_add_128bit_scratch_regs): Declare.
>         (aarch64_subv_128bit_scratch_regs): Declare.
>         (aarch64_expand_subvti): Declare.
>         (aarch64_gen_unlikely_cbranch): Declare.
>         * config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
>         for signed overflow using CC_Vmode.
>         (aarch64_get_condition_code_1): Handle CC_Vmode.
>         (aarch64_gen_unlikely_cbranch): New function.
>         (aarch64_add_128bit_scratch_regs): New function.
>         (aarch64_subv_128bit_scratch_regs): New function.
>         (aarch64_expand_subvti): New function.
>         * config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
>         (addti3): Create simpler code if low part is already known to be 0.
>         (addvti4, uaddvti4): New.
>         (*add<GPI>3_compareC_cconly_imm): New.
>         (*add<GPI>3_compareC_cconly): New.
>         (*add<GPI>3_compareC_imm): New.
>         (*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
>         handle constants within this pattern.
>         (*add<GPI>3_compareV_cconly_imm): New.
>         (*add<GPI>3_compareV_cconly): New.
>         (*add<GPI>3_compareV_imm): New.
>         (add<GPI>3_compareV): New.
>         (add<GPI>3_carryinC, add<GPI>3_carryinV): New.
>         (*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
>         (*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
>         (subv<GPI>4, usubv<GPI>4): New.
>         (subti3): Handle op1 zero.
>         (subvti4, usubvti4): New.
>         (*sub<GPI>3_compare1_imm): New.
>         (sub<GPI>3_carryinCV): New.
>         (*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
>         (*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
>         * testsuite/gcc.target/aarch64/builtin_sadd_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_saddl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_saddll.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_uadd_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_uaddl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_uaddll.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_ssub_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_ssubl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_ssubll.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_usub_128.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_usubl.c: New testcase.
>         * testsuite/gcc.target/aarch64/builtin_usubll.c: New testcase.

I've tried your patch, and 2 of the new tests FAIL:
    gcc.target/aarch64/builtin_sadd_128.c scan-assembler addcs
    gcc.target/aarch64/builtin_uadd_128.c scan-assembler addcs

Am I missing something?

Thanks,

Christophe

[-- Attachment #2: pr6308v2.patch --]
[-- Type: application/octet-stream, Size: 29467 bytes --]

diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 45f7a44..244e490 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
 CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
 CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
 CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
+CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
 
 /* Half-precision floating point for __fp16.  */
 FLOAT_MODE (HF, 2, 0);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f55d4ba..f38b2b8 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -388,6 +388,18 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 bool aarch64_return_address_signing_enabled (void);
 void aarch64_save_restore_target_globals (tree);
+void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				      rtx *low_in1, rtx *low_in2,
+				      rtx *high_dest, rtx *high_in1,
+				      rtx *high_in2);
+void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				       rtx *low_in1, rtx *low_in2,
+				       rtx *high_dest, rtx *high_in1,
+				       rtx *high_in2);
+void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
+			    rtx low_in2, rtx high_dest, rtx high_in1,
+			    rtx high_in2);
+
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
@@ -412,6 +424,8 @@ bool aarch64_float_const_representable_p (rtx);
 
 #if defined (RTX_CODE)
 
+void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
+				   rtx label_ref);
 bool aarch64_legitimate_address_p (machine_mode, rtx, RTX_CODE, bool);
 machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
 rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index f343d92..71a651c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4716,6 +4716,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
       && GET_CODE (y) == ZERO_EXTEND)
     return CC_Cmode;
 
+  /* A test for signed overflow.  */
+  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
+      && code == NE
+      && GET_CODE (x) == PLUS
+      && GET_CODE (y) == SIGN_EXTEND)
+    return CC_Vmode;
+
   /* For everything else, return CCmode.  */
   return CCmode;
 }
@@ -4822,6 +4829,15 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
 	}
       break;
 
+    case CC_Vmode:
+      switch (comp_code)
+	{
+	case NE: return AARCH64_VS;
+	case EQ: return AARCH64_VC;
+	default: return -1;
+	}
+      break;
+
     default:
       return -1;
     }
@@ -13630,6 +13646,88 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
   return true;
 }
 
+/* Generate RTL for a conditional branch with rtx comparison CODE in
+   mode CC_MODE.  The destination of the unlikely conditional branch
+   is LABEL_REF.  */
+
+void
+aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
+			      rtx label_ref)
+{
+  rtx x;
+  x = gen_rtx_fmt_ee (code, VOIDmode,
+		      gen_rtx_REG (cc_mode, CC_REGNUM),
+		      const0_rtx);
+
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
+			    pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+}
+
+void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				      rtx *low_in1, rtx *low_in2,
+				      rtx *high_dest, rtx *high_in1,
+				      rtx *high_in2)
+{
+  *low_dest = gen_reg_rtx (DImode);
+  *low_in1 = gen_lowpart (DImode, op1);
+  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *high_dest = gen_reg_rtx (DImode);
+  *high_in1 = gen_highpart (DImode, op1);
+  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+}
+
+void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				       rtx *low_in1, rtx *low_in2,
+				       rtx *high_dest, rtx *high_in1,
+				       rtx *high_in2)
+{
+  *low_dest = gen_reg_rtx (DImode);
+  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *high_dest = gen_reg_rtx (DImode);
+  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+
+}
+
+void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
+			    rtx low_in2, rtx high_dest, rtx high_in1,
+			    rtx high_in2)
+{
+  if (low_in2 == const0_rtx)
+    {
+      low_dest = low_in1;
+      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
+				      force_reg (DImode, high_in2)));
+    }
+  else
+    {
+      if (CONST_INT_P (low_in2))
+	{
+	  low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
+	  high_in2 = force_reg (DImode, high_in2);
+	  emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
+	}
+      else
+	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
+      emit_insn (gen_subdi3_carryinCV (high_dest,
+				       force_reg (DImode, high_in1),
+				       high_in2));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
+  emit_move_insn (gen_highpart (DImode, op0), high_dest);
+
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index a693a3b..3976ecb 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1711,25 +1711,123 @@
   }
 )
 
+(define_expand "addv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "uaddv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+
+  DONE;
+})
+
+
 (define_expand "addti3"
   [(set (match_operand:TI 0 "register_operand" "")
 	(plus:TI (match_operand:TI 1 "register_operand" "")
-		 (match_operand:TI 2 "register_operand" "")))]
+		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0,l1,l2,h0,h1,h2;
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      if (!aarch64_pluslong_operand (h2, DImode))
+	h2 = force_reg (DImode, h2);
+      emit_insn (gen_adddi3 (h0, h1, h2));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
   DONE;
 })
 
+(define_expand "addvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+  DONE;
+})
+
+(define_expand "uaddvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+  DONE;
+ })
+
 (define_insn "add<mode>3_compare0"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
@@ -1828,10 +1926,70 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+;; Note that since we're sign-extending, match the immediate in GPI
+;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
+(define_insn "*add<mode>3_compareV_cconly_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
+	    (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "@
+  cmn\\t%<w>0, %<w>1
+  cmp\\t%<w>0, #%n1"
+  [(set_attr "type" "alus_imm")]
+)
+
+(define_insn "*add<mode>3_compareV_cconly"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "cmn\\t%<w>0, %<w>1"
+  [(set_attr "type" "alus_sreg")]
+)
+
+(define_insn "*add<mode>3_compareV_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r,r"))
+	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI>
+	    (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+   ""
+   "@
+   adds\\t%<w>0, %<w>1, %<w>2
+   subs\\t%<w>0, %<w>1, #%n2"
+  [(set_attr "type" "alus_imm,alus_imm")]
+)
+
+(define_insn "add<mode>3_compareV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+  ""
+  "adds\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "alus_sreg")]
+)
+
 (define_insn "*adds_shift_imm_<mode>"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
-	 (plus:GPI (ASHIFT:GPI 
+	 (plus:GPI (ASHIFT:GPI
 		    (match_operand:GPI 1 "register_operand" "r")
 		    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
 		   (match_operand:GPI 3 "register_operand" "r"))
@@ -2187,6 +2345,138 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "add<mode>3_carryinC"
+  [(parallel
+     [(set (match_dup 3)
+	   (ne:CC_C
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 4)
+		 (zero_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (zero_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (zero_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 5) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 5) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
+  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinC_zero"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinC"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_expand "add<mode>3_carryinV"
+  [(parallel
+     [(set (reg:CC_V CC_REGNUM)
+	   (ne:CC_V
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 3)
+		 (sign_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (sign_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 4) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 4) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
+  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinV_zero"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*add_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(plus:GPI (and:GPI
@@ -2283,22 +2573,86 @@
    (set_attr "simd" "*,yes")]
 )
 
+(define_expand "subv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+(define_expand "usubv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
+
+  DONE;
+})
+
 (define_expand "subti3"
   [(set (match_operand:TI 0 "register_operand" "")
-	(minus:TI (match_operand:TI 1 "register_operand" "")
+	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
 		  (match_operand:TI 2 "register_operand" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx l2 = gen_lowpart (DImode, operands[2]);
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_highpart_offset (DImode, TImode));
+  rtx h2 = gen_highpart (DImode, operands[2]);
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  emit_insn (gen_subdi3_compare1 (l0, l1, l2));
+  emit_insn (gen_subdi3_carryin (h0, h1, h2));
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+  DONE;
+})
+
+(define_expand "subvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
+				    &l0, &l1, &l2, &h0, &h1, &h2);
+  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+  DONE;
+})
+
+(define_expand "usubvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
+				    &l0, &l1, &l2, &h0, &h1, &h2);
+  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
+
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
   DONE;
 })
 
@@ -2327,6 +2681,22 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+(define_insn "*sub<mode>3_compare1_imm"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
+	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
+  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
+  "@
+  subs\\t%<w>0, %<w>1, %<w>2
+  adds\\t%<w>0, %<w>1, %<w>3"
+  [(set_attr "type" "alus_imm")]
+)
+
 (define_insn "sub<mode>3_compare1"
   [(set (reg:CC CC_REGNUM)
 	(compare:CC
@@ -2554,6 +2924,85 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "sub<mode>3_carryinCV"
+  [(parallel
+     [(set (reg:CC CC_REGNUM)
+	   (compare:CC
+	     (sign_extend:<DWI>
+	       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
+	     (plus:<DWI>
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r"))
+	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
+      (set (match_operand:GPI 0 "register_operand" "=r")
+	   (minus:GPI
+	     (minus:GPI (match_dup 1) (match_dup 2))
+	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
+   ""
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r"))
+	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (neg:GPI (match_dup 1))
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, %<w>1"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 2 "register_operand" "r"))
+	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (minus:GPI (match_dup 1) (match_dup 2))
+	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*sub_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(minus:GPI (match_operand:GPI 4 "register_operand" "rk")
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
new file mode 100644
index 0000000..0b31500
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+__int128 overflow_add (__int128 x, __int128 y)
+{
+  __int128 r;
+
+  int ovr = __builtin_add_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+/* { dg-final { scan-assembler "adcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
new file mode 100644
index 0000000..9768a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long overflow_add (long x, long y)
+{
+  long r;
+
+  int ovr = __builtin_saddl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
new file mode 100644
index 0000000..126a526
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long long overflow_add (long long x, long long y)
+{
+  long long r;
+
+  int ovr = __builtin_saddll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
new file mode 100644
index 0000000..c1261e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+__int128 overflow_sub (__int128 x, __int128 y)
+{
+  __int128 r;
+
+  int ovr = __builtin_sub_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+/* { dg-final { scan-assembler "sbcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
new file mode 100644
index 0000000..1040464
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long overflow_sub (long x, long y)
+{
+  long r;
+
+  int ovr = __builtin_ssubl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
new file mode 100644
index 0000000..a03df88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long long overflow_sub (long long x, long long y)
+{
+  long long r;
+
+  int ovr = __builtin_ssubll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
new file mode 100644
index 0000000..c573c2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned __int128 overflow_add (unsigned __int128 x, unsigned __int128 y)
+{
+  unsigned __int128 r;
+
+  int ovr = __builtin_add_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+/* { dg-final { scan-assembler "adcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
new file mode 100644
index 0000000..e325591
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long overflow_add (unsigned long x, unsigned long y)
+{
+  unsigned long r;
+
+  int ovr = __builtin_uaddl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
new file mode 100644
index 0000000..5f42886
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long long overflow_add (unsigned long long x, unsigned long long y)
+{
+  unsigned long long r;
+
+  int ovr = __builtin_uaddll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
new file mode 100644
index 0000000..a84f4a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned __int128 overflow_sub (unsigned __int128 x, unsigned __int128 y)
+{
+  unsigned __int128 r;
+
+  int ovr = __builtin_sub_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+/* { dg-final { scan-assembler "sbcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
new file mode 100644
index 0000000..ed033da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long overflow_sub (unsigned long x, unsigned long y)
+{
+  unsigned long r;
+
+  int ovr = __builtin_usubl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
new file mode 100644
index 0000000..a742f0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long long overflow_sub (unsigned long long x, unsigned long long y)
+{
+  unsigned long long r;
+
+  int ovr = __builtin_usubll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+
-- 
1.9.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][Aarch64] Add support for overflow add and sub operations
  2017-05-19 21:42   ` Michael Collison
@ 2017-07-05  9:38     ` Richard Earnshaw (lists)
  2017-07-06  7:29       ` Michael Collison
  2017-08-01  6:33       ` Michael Collison
  0 siblings, 2 replies; 8+ messages in thread
From: Richard Earnshaw (lists) @ 2017-07-05  9:38 UTC (permalink / raw)
  To: Michael Collison, Christophe Lyon; +Cc: gcc-patches, nd

On 19/05/17 22:11, Michael Collison wrote:
> Christophe,
> 
> I had a typo in the two test cases: "addcs" should have been "adcs". I caught this previously but submitted the previous patch incorrectly. Updated patch attached.
> 
> Okay for trunk?
> 

Apologies for the delay responding, I've been procrastinating over this
one.   In part it's due to the size of the patch with very little
top-level description of what's the motivation and overall approach to
the problem.

It would really help review if this could be split into multiple patches
with a description of what each stage achieves.

Anyway, there are a couple of obvious formatting issues to deal with
first, before we get into the details of the patch.

> -----Original Message-----
> From: Christophe Lyon [mailto:christophe.lyon@linaro.org] 
> Sent: Friday, May 19, 2017 3:59 AM
> To: Michael Collison <Michael.Collison@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub operations
> 
> Hi Michael,
> 
> 
> On 19 May 2017 at 07:12, Michael Collison <Michael.Collison@arm.com> wrote:
>> Hi,
>>
>> This patch improves code generation for builtin arithmetic overflow operations for the aarch64 backend. As an example, for a simple test case such as:
>>
>> int
>> f (int x, int y, int *ovf)
>> {
>>   int res;
>>   *ovf = __builtin_sadd_overflow (x, y, &res);
>>   return res;
>> }
>>
>> Current trunk at -O2 generates
>>
>> f:
>>         mov     w3, w0
>>         mov     w4, 0
>>         add     w0, w0, w1
>>         tbnz    w1, #31, .L4
>>         cmp     w0, w3
>>         blt     .L3
>> .L2:
>>         str     w4, [x2]
>>         ret
>>         .p2align 3
>> .L4:
>>         cmp     w0, w3
>>         ble     .L2
>> .L3:
>>         mov     w4, 1
>>         b       .L2
>>
>>
>> With the patch this now generates:
>>
>> f:
>>         adds    w0, w0, w1
>>         cset    w1, vs
>>         str     w1, [x2]
>>         ret
>>
>>
>> Original patch from Richard Henderson:
>>
>> https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01903.html
>>
>>
>> Okay for trunk?
>>
>> 2017-05-17  Michael Collison  <michael.collison@arm.com>
>>             Richard Henderson <rth@redhat.com>
>>
>>         * config/aarch64/aarch64-modes.def (CC_V): New.
>>         * config/aarch64/aarch64-protos.h
>>         (aarch64_add_128bit_scratch_regs): Declare.
>>         (aarch64_subv_128bit_scratch_regs): Declare.
>>         (aarch64_expand_subvti): Declare.
>>         (aarch64_gen_unlikely_cbranch): Declare.
>>         * config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
>>         for signed overflow using CC_Vmode.
>>         (aarch64_get_condition_code_1): Handle CC_Vmode.
>>         (aarch64_gen_unlikely_cbranch): New function.
>>         (aarch64_add_128bit_scratch_regs): New function.
>>         (aarch64_subv_128bit_scratch_regs): New function.
>>         (aarch64_expand_subvti): New function.
>>         * config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
>>         (addti3): Create simpler code if low part is already known to be 0.
>>         (addvti4, uaddvti4): New.
>>         (*add<GPI>3_compareC_cconly_imm): New.
>>         (*add<GPI>3_compareC_cconly): New.
>>         (*add<GPI>3_compareC_imm): New.
>>         (*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
>>         handle constants within this pattern.
>>         (*add<GPI>3_compareV_cconly_imm): New.
>>         (*add<GPI>3_compareV_cconly): New.
>>         (*add<GPI>3_compareV_imm): New.
>>         (add<GPI>3_compareV): New.
>>         (add<GPI>3_carryinC, add<GPI>3_carryinV): New.
>>         (*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
>>         (*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
>>         (subv<GPI>4, usubv<GPI>4): New.
>>         (subti3): Handle op1 zero.
>>         (subvti4, usubvti4): New.
>>         (*sub<GPI>3_compare1_imm): New.
>>         (sub<GPI>3_carryinCV): New.
>>         (*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
>>         (*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
>>         * testsuite/gcc.target/aarch64/builtin_sadd_128.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_saddl.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_saddll.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_uadd_128.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_uaddl.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_uaddll.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_ssub_128.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_ssubl.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_ssubll.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_usub_128.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_usubl.c: New testcase.
>>         * testsuite/gcc.target/aarch64/builtin_usubll.c: New testcase.
> 
> I've tried your patch, and 2 of the new tests FAIL:
>     gcc.target/aarch64/builtin_sadd_128.c scan-assembler addcs
>     gcc.target/aarch64/builtin_uadd_128.c scan-assembler addcs
> 
> Am I missing something?
> 
> Thanks,
> 
> Christophe
> 
> 
> pr6308v2.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
> index 45f7a44..244e490 100644
> --- a/gcc/config/aarch64/aarch64-modes.def
> +++ b/gcc/config/aarch64/aarch64-modes.def
> @@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
>  CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
>  CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
>  CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
> +CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
>  
>  /* Half-precision floating point for __fp16.  */
>  FLOAT_MODE (HF, 2, 0);
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index f55d4ba..f38b2b8 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -388,6 +388,18 @@ void aarch64_relayout_simd_types (void);
>  void aarch64_reset_previous_fndecl (void);
>  bool aarch64_return_address_signing_enabled (void);
>  void aarch64_save_restore_target_globals (tree);
> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
> +				      rtx *low_in1, rtx *low_in2,
> +				      rtx *high_dest, rtx *high_in1,
> +				      rtx *high_in2);
> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
> +				       rtx *low_in1, rtx *low_in2,
> +				       rtx *high_dest, rtx *high_in1,
> +				       rtx *high_in2);
> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
> +			    rtx low_in2, rtx high_dest, rtx high_in1,
> +			    rtx high_in2);
> +

It's a little bit inconsistent, but the general style in
aarch64-protos.h is not to include parameter names in prototypes, just
their types.

>  
>  /* Initialize builtins for SIMD intrinsics.  */
>  void init_aarch64_simd_builtins (void);
> @@ -412,6 +424,8 @@ bool aarch64_float_const_representable_p (rtx);
>  
>  #if defined (RTX_CODE)
>  
> +void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
> +				   rtx label_ref);
>  bool aarch64_legitimate_address_p (machine_mode, rtx, RTX_CODE, bool);
>  machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
>  rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index f343d92..71a651c 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -4716,6 +4716,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
>        && GET_CODE (y) == ZERO_EXTEND)
>      return CC_Cmode;
>  
> +  /* A test for signed overflow.  */
> +  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
> +      && code == NE
> +      && GET_CODE (x) == PLUS
> +      && GET_CODE (y) == SIGN_EXTEND)
> +    return CC_Vmode;
> +
>    /* For everything else, return CCmode.  */
>    return CCmode;
>  }
> @@ -4822,6 +4829,15 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
>  	}
>        break;
>  
> +    case CC_Vmode:
> +      switch (comp_code)
> +	{
> +	case NE: return AARCH64_VS;
> +	case EQ: return AARCH64_VC;
> +	default: return -1;
> +	}
> +      break;
> +
>      default:
>        return -1;
>      }
> @@ -13630,6 +13646,88 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
>    return true;
>  }
>  
> +/* Generate RTL for a conditional branch with rtx comparison CODE in
> +   mode CC_MODE.  The destination of the unlikely conditional branch
> +   is LABEL_REF.  */
> +
> +void
> +aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
> +			      rtx label_ref)
> +{
> +  rtx x;
> +  x = gen_rtx_fmt_ee (code, VOIDmode,
> +		      gen_rtx_REG (cc_mode, CC_REGNUM),
> +		      const0_rtx);
> +
> +  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> +			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
> +			    pc_rtx);
> +  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
> +}
> +
> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,

Function names must start in column 1, with the return type on the
preceding line.  All functions should have a top-level comment
describing what they do (their contract with the caller).

> +				      rtx *low_in1, rtx *low_in2,
> +				      rtx *high_dest, rtx *high_in1,
> +				      rtx *high_in2)
> +{
> +  *low_dest = gen_reg_rtx (DImode);
> +  *low_in1 = gen_lowpart (DImode, op1);
> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *high_dest = gen_reg_rtx (DImode);
> +  *high_in1 = gen_highpart (DImode, op1);
> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				   subreg_highpart_offset (DImode, TImode));
> +}
> +
> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,

Same here.

> +				       rtx *low_in1, rtx *low_in2,
> +				       rtx *high_dest, rtx *high_in1,
> +				       rtx *high_in2)
> +{
> +  *low_dest = gen_reg_rtx (DImode);
> +  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *high_dest = gen_reg_rtx (DImode);
> +  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
> +				   subreg_highpart_offset (DImode, TImode));
> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				   subreg_highpart_offset (DImode, TImode));
> +
> +}
> +
> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
And here.

> +			    rtx low_in2, rtx high_dest, rtx high_in1,
> +			    rtx high_in2)
> +{
> +  if (low_in2 == const0_rtx)
> +    {
> +      low_dest = low_in1;
> +      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
> +				      force_reg (DImode, high_in2)));
> +    }
> +  else
> +    {
> +      if (CONST_INT_P (low_in2))
> +	{
> +	  low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
> +	  high_in2 = force_reg (DImode, high_in2);
> +	  emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
> +	}
> +      else
> +	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
> +      emit_insn (gen_subdi3_carryinCV (high_dest,
> +				       force_reg (DImode, high_in1),
> +				       high_in2));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
> +  emit_move_insn (gen_highpart (DImode, op0), high_dest);
> +
> +}
> +
>  /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
>  
>  static unsigned HOST_WIDE_INT
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index a693a3b..3976ecb 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1711,25 +1711,123 @@
>    }
>  )
>  
> +(define_expand "addv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "register_operand")
> +   (match_operand:GPI 2 "register_operand")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +(define_expand "uaddv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "register_operand")
> +   (match_operand:GPI 2 "register_operand")
> +   (match_operand 3 "")]

With no rtl in the expand to describe this pattern, it really should
have a top-level comment explaining the arguments (reference to the
manual is probably OK in this case).

> +  ""
> +{
> +  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +
>  (define_expand "addti3"
>    [(set (match_operand:TI 0 "register_operand" "")
>  	(plus:TI (match_operand:TI 1 "register_operand" "")
> -		 (match_operand:TI 2 "register_operand" "")))]
> +		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
>    ""
>  {
> -  rtx low = gen_reg_rtx (DImode);
> -  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
> -				  gen_lowpart (DImode, operands[2])));
> +  rtx l0,l1,l2,h0,h1,h2;
>  
> -  rtx high = gen_reg_rtx (DImode);
> -  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
> -				 gen_highpart (DImode, operands[2])));
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      if (!aarch64_pluslong_operand (h2, DImode))
> +	h2 = force_reg (DImode, h2);
> +      emit_insn (gen_adddi3 (h0, h1, h2));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
> +  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
>  
> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
>    DONE;
>  })
>  
> +(define_expand "addvti4"
> +  [(match_operand:TI 0 "register_operand" "")
> +   (match_operand:TI 1 "register_operand" "")
> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
> +   (match_operand 3 "")]

Same here.

> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
> +  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +  DONE;
> +})
> +
> +(define_expand "uaddvti4"
> +  [(match_operand:TI 0 "register_operand" "")
> +   (match_operand:TI 1 "register_operand" "")
> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
> +  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
> +  DONE;
> + })
> +
>  (define_insn "add<mode>3_compare0"
>    [(set (reg:CC_NZ CC_REGNUM)
>  	(compare:CC_NZ
> @@ -1828,10 +1926,70 @@
>    [(set_attr "type" "alus_sreg")]
>  )
>  
> +;; Note that since we're sign-extending, match the immediate in GPI
> +;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
> +(define_insn "*add<mode>3_compareV_cconly_imm"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
> +	    (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
> +  ""
> +  "@
> +  cmn\\t%<w>0, %<w>1
> +  cmp\\t%<w>0, #%n1"
> +  [(set_attr "type" "alus_imm")]
> +)
> +
> +(define_insn "*add<mode>3_compareV_cconly"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V

Use of ne is wrong here.  The condition register should be set to the
result of a compare rtl construct.  The same applies elsewhere within
this patch.  NE is then used on the result of the comparison.  The mode
of the compare then indicates what might or might not be valid in the
way the comparison is finally constructed.

Note that this issue may go back to the earlier patches that this is
based on, but those are equally incorrect and wil need fixing as well at
some point.  We shouldn't perpetuate the issue.

> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
> +  ""
> +  "cmn\\t%<w>0, %<w>1"
> +  [(set_attr "type" "alus_sreg")]
> +)
> +
> +(define_insn "*add<mode>3_compareV_imm"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 1 "register_operand" "r,r"))
> +	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI (match_dup 1) (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
> +	(plus:GPI (match_dup 1) (match_dup 2)))]
> +   ""
> +   "@
> +   adds\\t%<w>0, %<w>1, %<w>2
> +   subs\\t%<w>0, %<w>1, #%n2"
> +  [(set_attr "type" "alus_imm,alus_imm")]
> +)
> +
> +(define_insn "add<mode>3_compareV"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
> +	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(plus:GPI (match_dup 1) (match_dup 2)))]
> +  ""
> +  "adds\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "alus_sreg")]
> +)
> +
>  (define_insn "*adds_shift_imm_<mode>"
>    [(set (reg:CC_NZ CC_REGNUM)
>  	(compare:CC_NZ
> -	 (plus:GPI (ASHIFT:GPI 
> +	 (plus:GPI (ASHIFT:GPI
>  		    (match_operand:GPI 1 "register_operand" "r")
>  		    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
>  		   (match_operand:GPI 3 "register_operand" "r"))
> @@ -2187,6 +2345,138 @@
>    [(set_attr "type" "adc_reg")]
>  )
>  
> +(define_expand "add<mode>3_carryinC"
> +  [(parallel
> +     [(set (match_dup 3)
> +	   (ne:CC_C
> +	     (plus:<DWI>
> +	       (plus:<DWI>
> +		 (match_dup 4)
> +		 (zero_extend:<DWI>
> +		   (match_operand:GPI 1 "register_operand" "r")))
> +	       (zero_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r")))
> +	   (zero_extend:<DWI>
> +	     (plus:GPI
> +	       (plus:GPI (match_dup 5) (match_dup 1))
> +	       (match_dup 2)))))
> +      (set (match_operand:GPI 0 "register_operand")
> +	   (plus:GPI
> +	     (plus:GPI (match_dup 5) (match_dup 1))
> +	     (match_dup 2)))])]
> +   ""
> +{
> +  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> +  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
> +  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
> +})
> +
> +(define_insn "*add<mode>3_carryinC_zero"
> +  [(set (reg:CC_C CC_REGNUM)
> +	(ne:CC_C
> +	  (plus:<DWI>
> +	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
> +	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (zero_extend:<DWI>
> +	    (plus:GPI
> +	      (match_operand:GPI 3 "aarch64_carry_operation" "")
> +	      (match_dup 1)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI (match_dup 3) (match_dup 1)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*add<mode>3_carryinC"
> +  [(set (reg:CC_C CC_REGNUM)
> +	(ne:CC_C
> +	  (plus:<DWI>
> +	    (plus:<DWI>
> +	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
> +	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (zero_extend:<DWI>
> +	    (plus:GPI
> +	      (plus:GPI
> +		(match_operand:GPI 4 "aarch64_carry_operation" "")
> +		(match_dup 1))
> +	      (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI
> +	  (plus:GPI (match_dup 4) (match_dup 1))
> +	  (match_dup 2)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_expand "add<mode>3_carryinV"
> +  [(parallel
> +     [(set (reg:CC_V CC_REGNUM)
> +	   (ne:CC_V
> +	     (plus:<DWI>
> +	       (plus:<DWI>
> +		 (match_dup 3)
> +		 (sign_extend:<DWI>
> +		   (match_operand:GPI 1 "register_operand" "r")))
> +	       (sign_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r")))
> +	   (sign_extend:<DWI>
> +	     (plus:GPI
> +	       (plus:GPI (match_dup 4) (match_dup 1))
> +	       (match_dup 2)))))
> +      (set (match_operand:GPI 0 "register_operand")
> +	   (plus:GPI
> +	     (plus:GPI (match_dup 4) (match_dup 1))
> +	     (match_dup 2)))])]
> +   ""
> +{
> +  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> +  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
> +  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
> +})
> +
> +(define_insn "*add<mode>3_carryinV_zero"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI
> +	      (match_operand:GPI 3 "aarch64_carry_operation" "")
> +	      (match_dup 1)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI (match_dup 3) (match_dup 1)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*add<mode>3_carryinV"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (plus:<DWI>
> +	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
> +	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI
> +	      (plus:GPI
> +		(match_operand:GPI 4 "aarch64_carry_operation" "")
> +		(match_dup 1))
> +	      (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI
> +	  (plus:GPI (match_dup 4) (match_dup 1))
> +	  (match_dup 2)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
>  (define_insn "*add_uxt<mode>_shift2"
>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>  	(plus:GPI (and:GPI
> @@ -2283,22 +2573,86 @@
>     (set_attr "simd" "*,yes")]
>  )
>  
> +(define_expand "subv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +(define_expand "usubv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
> +
> +  DONE;
> +})
> +
>  (define_expand "subti3"
>    [(set (match_operand:TI 0 "register_operand" "")
> -	(minus:TI (match_operand:TI 1 "register_operand" "")
> +	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
>  		  (match_operand:TI 2 "register_operand" "")))]
>    ""
>  {
> -  rtx low = gen_reg_rtx (DImode);
> -  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
> -				  gen_lowpart (DImode, operands[2])));
> +  rtx l0 = gen_reg_rtx (DImode);
> +  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
> +				subreg_lowpart_offset (DImode, TImode));
> +  rtx l2 = gen_lowpart (DImode, operands[2]);
> +  rtx h0 = gen_reg_rtx (DImode);
> +  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
> +				subreg_highpart_offset (DImode, TImode));
> +  rtx h2 = gen_highpart (DImode, operands[2]);
>  
> -  rtx high = gen_reg_rtx (DImode);
> -  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
> -				 gen_highpart (DImode, operands[2])));
> +  emit_insn (gen_subdi3_compare1 (l0, l1, l2));
> +  emit_insn (gen_subdi3_carryin (h0, h1, h2));
>  
> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
> +  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +  DONE;
> +})
> +
> +(define_expand "subvti4"
> +  [(match_operand:TI 0 "register_operand")
> +   (match_operand:TI 1 "aarch64_reg_or_zero")
> +   (match_operand:TI 2 "aarch64_reg_or_imm")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
> +				    &l0, &l1, &l2, &h0, &h1, &h2);
> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +  DONE;
> +})
> +
> +(define_expand "usubvti4"
> +  [(match_operand:TI 0 "register_operand")
> +   (match_operand:TI 1 "aarch64_reg_or_zero")
> +   (match_operand:TI 2 "aarch64_reg_or_imm")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
> +				    &l0, &l1, &l2, &h0, &h1, &h2);
> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
> +
> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
>    DONE;
>  })
>  
> @@ -2327,6 +2681,22 @@
>    [(set_attr "type" "alus_sreg")]
>  )
>  
> +(define_insn "*sub<mode>3_compare1_imm"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
> +	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
> +	(plus:GPI
> +	  (match_dup 1)
> +	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
> +  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
> +  "@
> +  subs\\t%<w>0, %<w>1, %<w>2
> +  adds\\t%<w>0, %<w>1, %<w>3"
> +  [(set_attr "type" "alus_imm")]
> +)
> +
>  (define_insn "sub<mode>3_compare1"
>    [(set (reg:CC CC_REGNUM)
>  	(compare:CC
> @@ -2554,6 +2924,85 @@
>    [(set_attr "type" "adc_reg")]
>  )
>  
> +(define_expand "sub<mode>3_carryinCV"
> +  [(parallel
> +     [(set (reg:CC CC_REGNUM)
> +	   (compare:CC
> +	     (sign_extend:<DWI>
> +	       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
> +	     (plus:<DWI>
> +	       (sign_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r"))
> +	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
> +      (set (match_operand:GPI 0 "register_operand" "=r")
> +	   (minus:GPI
> +	     (minus:GPI (match_dup 1) (match_dup 2))
> +	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
> +   ""
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z1_z2"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (const_int 0)
> +	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, <w>zr, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z1"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (const_int 0)
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 1 "register_operand" "r"))
> +	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (neg:GPI (match_dup 1))
> +	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, <w>zr, %<w>1"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z2"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (sign_extend:<DWI>
> +	    (match_operand:GPI 1 "register_operand" "r"))
> +	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (match_dup 1)
> +	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (sign_extend:<DWI>
> +	    (match_operand:GPI 1 "register_operand" "r"))
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 2 "register_operand" "r"))
> +	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (minus:GPI (match_dup 1) (match_dup 2))
> +	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
>  (define_insn "*sub_uxt<mode>_shift2"
>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>  	(minus:GPI (match_operand:GPI 4 "register_operand" "rk")
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
> new file mode 100644
> index 0000000..0b31500
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +__int128 overflow_add (__int128 x, __int128 y)
> +{
> +  __int128 r;
> +
> +  int ovr = __builtin_add_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +/* { dg-final { scan-assembler "adcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
> new file mode 100644
> index 0000000..9768a98
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long overflow_add (long x, long y)
> +{
> +  long r;
> +
> +  int ovr = __builtin_saddl_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
> new file mode 100644
> index 0000000..126a526
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long long overflow_add (long long x, long long y)
> +{
> +  long long r;
> +
> +  int ovr = __builtin_saddll_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
> new file mode 100644
> index 0000000..c1261e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +__int128 overflow_sub (__int128 x, __int128 y)
> +{
> +  __int128 r;
> +
> +  int ovr = __builtin_sub_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +/* { dg-final { scan-assembler "sbcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
> new file mode 100644
> index 0000000..1040464
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long overflow_sub (long x, long y)
> +{
> +  long r;
> +
> +  int ovr = __builtin_ssubl_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
> new file mode 100644
> index 0000000..a03df88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long long overflow_sub (long long x, long long y)
> +{
> +  long long r;
> +
> +  int ovr = __builtin_ssubll_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
> new file mode 100644
> index 0000000..c573c2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned __int128 overflow_add (unsigned __int128 x, unsigned __int128 y)
> +{
> +  unsigned __int128 r;
> +
> +  int ovr = __builtin_add_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +/* { dg-final { scan-assembler "adcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
> new file mode 100644
> index 0000000..e325591
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long overflow_add (unsigned long x, unsigned long y)
> +{
> +  unsigned long r;
> +
> +  int ovr = __builtin_uaddl_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
> new file mode 100644
> index 0000000..5f42886
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long long overflow_add (unsigned long long x, unsigned long long y)
> +{
> +  unsigned long long r;
> +
> +  int ovr = __builtin_uaddll_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
> new file mode 100644
> index 0000000..a84f4a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned __int128 overflow_sub (unsigned __int128 x, unsigned __int128 y)
> +{
> +  unsigned __int128 r;
> +
> +  int ovr = __builtin_sub_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +/* { dg-final { scan-assembler "sbcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
> new file mode 100644
> index 0000000..ed033da
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long overflow_sub (unsigned long x, unsigned long y)
> +{
> +  unsigned long r;
> +
> +  int ovr = __builtin_usubl_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
> new file mode 100644
> index 0000000..a742f0c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long long overflow_sub (unsigned long long x, unsigned long long y)
> +{
> +  unsigned long long r;
> +
> +  int ovr = __builtin_usubll_overflow (x, y, &r);
> +  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH][Aarch64] Add support for overflow add and sub operations
  2017-07-05  9:38     ` Richard Earnshaw (lists)
@ 2017-07-06  7:29       ` Michael Collison
  2017-07-06  8:22         ` Richard Earnshaw (lists)
  2017-08-01  6:33       ` Michael Collison
  1 sibling, 1 reply; 8+ messages in thread
From: Michael Collison @ 2017-07-06  7:29 UTC (permalink / raw)
  To: Richard Earnshaw, Christophe Lyon; +Cc: gcc-patches, nd

Richard,

Can you explain "Use of ne is wrong here.  The condition register should be set to the result of a compare rtl construct.  The same applies elsewhere within this patch.  NE is then used on the result of the comparison.  The mode of the compare then indicates what might or might not be valid in the way the comparison is finally constructed."?

Why is "ne" wrong? I don't doubt you are correct, but I see nothing in the internals manual that forbids it. I want to understand what issues this exposes.

As you indicate I used this idiom in the arm port when I added the overflow operations there as well. Additionally other targets seem to use the comparison operators this way (i386 for the umulv).

Regards,

Michael Collison

-----Original Message-----
From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com] 
Sent: Wednesday, July 5, 2017 2:38 AM
To: Michael Collison <Michael.Collison@arm.com>; Christophe Lyon <christophe.lyon@linaro.org>
Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub operations

On 19/05/17 22:11, Michael Collison wrote:
> Christophe,
> 
> I had a typo in the two test cases: "addcs" should have been "adcs". I caught this previously but submitted the previous patch incorrectly. Updated patch attached.
> 
> Okay for trunk?
> 

Apologies for the delay responding, I've been procrastinating over this
one.   In part it's due to the size of the patch with very little
top-level description of the motivation and overall approach to the problem.

It would really help review if this could be split into multiple patches with a description of what each stage achieves.

Anyway, there are a couple of obvious formatting issues to deal with first, before we get into the details of the patch.

> -----Original Message-----
> From: Christophe Lyon [mailto:christophe.lyon@linaro.org]
> Sent: Friday, May 19, 2017 3:59 AM
> To: Michael Collison <Michael.Collison@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub 
> operations
> 
> Hi Michael,
> 
> 
> On 19 May 2017 at 07:12, Michael Collison <Michael.Collison@arm.com> wrote:
>> Hi,
>>
>> This patch improves code generations for builtin arithmetic overflow operations for the aarch64 backend. As an example for a simple test case such as:
>>
>> Sure for a simple test case such as:
>>
>> int
>> f (int x, int y, int *ovf)
>> {
>>   int res;
>>   *ovf = __builtin_sadd_overflow (x, y, &res);
>>   return res;
>> }
>>
>> Current trunk at -O2 generates
>>
>> f:
>>         mov     w3, w0
>>         mov     w4, 0
>>         add     w0, w0, w1
>>         tbnz    w1, #31, .L4
>>         cmp     w0, w3
>>         blt     .L3
>> .L2:
>>         str     w4, [x2]
>>         ret
>>         .p2align 3
>> .L4:
>>         cmp     w0, w3
>>         ble     .L2
>> .L3:
>>         mov     w4, 1
>>         b       .L2
>>
>>
>> With the patch this now generates:
>>
>> f:
>>         adds    w0, w0, w1
>>         cset    w1, vs
>>         str     w1, [x2]
>>         ret
>>
>>
>> Original patch from Richard Henderson:
>>
>> https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01903.html
>>
>>
>> Okay for trunk?
>>
>> 2017-05-17  Michael Collison  <michael.collison@arm.com>
>>             Richard Henderson <rth@redhat.com>
>>
>>         * config/aarch64/aarch64-modes.def (CC_V): New.
>>         * config/aarch64/aarch64-protos.h
>>         (aarch64_add_128bit_scratch_regs): Declare
>>         (aarch64_add_128bit_scratch_regs): Declare.
>>         (aarch64_expand_subvti): Declare.
>>         (aarch64_gen_unlikely_cbranch): Declare
>>         * config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
>>         for signed overflow using CC_Vmode.
>>         (aarch64_get_condition_code_1): Handle CC_Vmode.
>>         (aarch64_gen_unlikely_cbranch): New function.
>>         (aarch64_add_128bit_scratch_regs): New function.
>>         (aarch64_subv_128bit_scratch_regs): New function.
>>         (aarch64_expand_subvti): New function.
>>         * config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
>>         (addti3): Create simpler code if low part is already known to be 0.
>>         (addvti4, uaddvti4): New.
>>         (*add<GPI>3_compareC_cconly_imm): New.
>>         (*add<GPI>3_compareC_cconly): New.
>>         (*add<GPI>3_compareC_imm): New.
>>         (*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
>>         handle constants within this pattern.
>>         (*add<GPI>3_compareV_cconly_imm): New.
>>         (*add<GPI>3_compareV_cconly): New.
>>         (*add<GPI>3_compareV_imm): New.
>>         (add<GPI>3_compareV): New.
>>         (add<GPI>3_carryinC, add<GPI>3_carryinV): New.
>>         (*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
>>         (*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
>>         (subv<GPI>4, usubv<GPI>4): New.
>>         (subti): Handle op1 zero.
>>         (subvti4, usubvti4): New.
>>         (*sub<GPI>3_compare1_imm): New.
>>         (sub<GPI>3_carryinCV): New.
>>         (*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
>>         (*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
>>         * testsuite/gcc.target/arm/builtin_sadd_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_saddl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_saddll.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_uadd_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_uaddl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_uaddll.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_ssub_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_ssubl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_ssubll.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_usub_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_usubl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_usubll.c: New testcase.
> 
> I've tried your patch, and 2 of the new tests FAIL:
>     gcc.target/aarch64/builtin_sadd_128.c scan-assembler addcs
>     gcc.target/aarch64/builtin_uadd_128.c scan-assembler addcs
> 
> Am I missing something?
> 
> Thanks,
> 
> Christophe
> 
> 
> pr6308v2.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-modes.def 
> b/gcc/config/aarch64/aarch64-modes.def
> index 45f7a44..244e490 100644
> --- a/gcc/config/aarch64/aarch64-modes.def
> +++ b/gcc/config/aarch64/aarch64-modes.def
> @@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
>  CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
>  CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
>  CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
> +CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
>  
>  /* Half-precision floating point for __fp16.  */  FLOAT_MODE (HF, 2, 
> 0); diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index f55d4ba..f38b2b8 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -388,6 +388,18 @@ void aarch64_relayout_simd_types (void);  void 
> aarch64_reset_previous_fndecl (void);  bool 
> aarch64_return_address_signing_enabled (void);  void 
> aarch64_save_restore_target_globals (tree);
> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
> +				      rtx *low_in1, rtx *low_in2,
> +				      rtx *high_dest, rtx *high_in1,
> +				      rtx *high_in2);
> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
> +				       rtx *low_in1, rtx *low_in2,
> +				       rtx *high_dest, rtx *high_in1,
> +				       rtx *high_in2);
> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
> +			    rtx low_in2, rtx high_dest, rtx high_in1,
> +			    rtx high_in2);
> +

It's a little bit inconsistent, but the general style in aarch64-protos.h is not to include parameter names in prototypes, just their types.

>  
>  /* Initialize builtins for SIMD intrinsics.  */  void 
> init_aarch64_simd_builtins (void); @@ -412,6 +424,8 @@ bool 
> aarch64_float_const_representable_p (rtx);
>  
>  #if defined (RTX_CODE)
>  
> +void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
> +				   rtx label_ref);
>  bool aarch64_legitimate_address_p (machine_mode, rtx, RTX_CODE, 
> bool);  machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);  rtx 
> aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); diff --git 
> a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 
> f343d92..71a651c 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -4716,6 +4716,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
>        && GET_CODE (y) == ZERO_EXTEND)
>      return CC_Cmode;
>  
> +  /* A test for signed overflow.  */
> +  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
> +      && code == NE
> +      && GET_CODE (x) == PLUS
> +      && GET_CODE (y) == SIGN_EXTEND)
> +    return CC_Vmode;
> +
>    /* For everything else, return CCmode.  */
>    return CCmode;
>  }
> @@ -4822,6 +4829,15 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
>  	}
>        break;
>  
> +    case CC_Vmode:
> +      switch (comp_code)
> +	{
> +	case NE: return AARCH64_VS;
> +	case EQ: return AARCH64_VC;
> +	default: return -1;
> +	}
> +      break;
> +
>      default:
>        return -1;
>      }
> @@ -13630,6 +13646,88 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
>    return true;
>  }
>  
> +/* Generate RTL for a conditional branch with rtx comparison CODE in
> +   mode CC_MODE.  The destination of the unlikely conditional branch
> +   is LABEL_REF.  */
> +
> +void
> +aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
> +			      rtx label_ref)
> +{
> +  rtx x;
> +  x = gen_rtx_fmt_ee (code, VOIDmode,
> +		      gen_rtx_REG (cc_mode, CC_REGNUM),
> +		      const0_rtx);
> +
> +  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> +			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
> +			    pc_rtx);
> +  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); }
> +
> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx 
> +*low_dest,

Function names must start in column 1, with the return type on the preceding line.  All functions should have a top-level comment describing what they do (their contract with the caller).

> +				      rtx *low_in1, rtx *low_in2,
> +				      rtx *high_dest, rtx *high_in1,
> +				      rtx *high_in2)
> +{
> +  *low_dest = gen_reg_rtx (DImode);
> +  *low_in1 = gen_lowpart (DImode, op1);
> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *high_dest = gen_reg_rtx (DImode);
> +  *high_in1 = gen_highpart (DImode, op1);
> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				   subreg_highpart_offset (DImode, TImode)); }
> +
> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx 
> +*low_dest,

Same here.

> +				       rtx *low_in1, rtx *low_in2,
> +				       rtx *high_dest, rtx *high_in1,
> +				       rtx *high_in2)
> +{
> +  *low_dest = gen_reg_rtx (DImode);
> +  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *high_dest = gen_reg_rtx (DImode);
> +  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
> +				   subreg_highpart_offset (DImode, TImode));
> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				   subreg_highpart_offset (DImode, TImode));
> +
> +}
> +
> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
And here.

> +			    rtx low_in2, rtx high_dest, rtx high_in1,
> +			    rtx high_in2)
> +{
> +  if (low_in2 == const0_rtx)
> +    {
> +      low_dest = low_in1;
> +      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
> +				      force_reg (DImode, high_in2)));
> +    }
> +  else
> +    {
> +      if (CONST_INT_P (low_in2))
> +	{
> +	  low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
> +	  high_in2 = force_reg (DImode, high_in2);
> +	  emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
> +	}
> +      else
> +	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
> +      emit_insn (gen_subdi3_carryinCV (high_dest,
> +				       force_reg (DImode, high_in1),
> +				       high_in2));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, op0), low_dest);  
> + emit_move_insn (gen_highpart (DImode, op0), high_dest);
> +
> +}
> +
>  /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
>  
>  static unsigned HOST_WIDE_INT
> diff --git a/gcc/config/aarch64/aarch64.md 
> b/gcc/config/aarch64/aarch64.md index a693a3b..3976ecb 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1711,25 +1711,123 @@
>    }
>  )
>  
> +(define_expand "addv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "register_operand")
> +   (match_operand:GPI 2 "register_operand")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +(define_expand "uaddv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "register_operand")
> +   (match_operand:GPI 2 "register_operand")
> +   (match_operand 3 "")]

With no rtl in the expand to describe this pattern, it really should have a top-level comment explaining the arguments (reference to the manual is probably OK in this case).

> +  ""
> +{
> +  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +
>  (define_expand "addti3"
>    [(set (match_operand:TI 0 "register_operand" "")
>  	(plus:TI (match_operand:TI 1 "register_operand" "")
> -		 (match_operand:TI 2 "register_operand" "")))]
> +		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
>    ""
>  {
> -  rtx low = gen_reg_rtx (DImode);
> -  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
> -				  gen_lowpart (DImode, operands[2])));
> +  rtx l0,l1,l2,h0,h1,h2;
>  
> -  rtx high = gen_reg_rtx (DImode);
> -  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
> -				 gen_highpart (DImode, operands[2])));
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      if (!aarch64_pluslong_operand (h2, DImode))
> +	h2 = force_reg (DImode, h2);
> +      emit_insn (gen_adddi3 (h0, h1, h2));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
>  
> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
>    DONE;
>  })
>  
> +(define_expand "addvti4"
> +  [(match_operand:TI 0 "register_operand" "")
> +   (match_operand:TI 1 "register_operand" "")
> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
> +   (match_operand 3 "")]

Same here.

> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +  DONE;
> +})
> +
> +(define_expand "uaddvti4"
> +  [(match_operand:TI 0 "register_operand" "")
> +   (match_operand:TI 1 "register_operand" "")
> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);  DONE;
> + })
> +
>  (define_insn "add<mode>3_compare0"
>    [(set (reg:CC_NZ CC_REGNUM)
>  	(compare:CC_NZ
> @@ -1828,10 +1926,70 @@
>    [(set_attr "type" "alus_sreg")]
>  )
>  
> +;; Note that since we're sign-extending, match the immediate in GPI 
> +;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
> +(define_insn "*add<mode>3_compareV_cconly_imm"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
> +	    (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
> +  ""
> +  "@
> +  cmn\\t%<w>0, %<w>1
> +  cmp\\t%<w>0, #%n1"
> +  [(set_attr "type" "alus_imm")]
> +)
> +
> +(define_insn "*add<mode>3_compareV_cconly"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V

Use of ne is wrong here.  The condition register should be set to the result of a compare rtl construct.  The same applies elsewhere within this patch.  NE is then used on the result of the comparison.  The mode of the compare then indicates what might or might not be valid in the way the comparison is finally constructed.

Note that this issue may go back to the earlier patches that this is based on, but those are equally incorrect and will need fixing as well at some point.  We shouldn't perpetuate the issue.

> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
> +  ""
> +  "cmn\\t%<w>0, %<w>1"
> +  [(set_attr "type" "alus_sreg")]
> +)
> +
> +(define_insn "*add<mode>3_compareV_imm"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 1 "register_operand" "r,r"))
> +	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI (match_dup 1) (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
> +	(plus:GPI (match_dup 1) (match_dup 2)))]
> +   ""
> +   "@
> +   adds\\t%<w>0, %<w>1, %<w>2
> +   subs\\t%<w>0, %<w>1, #%n2"
> +  [(set_attr "type" "alus_imm,alus_imm")]
> +)
> +
> +(define_insn "add<mode>3_compareV"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
> +	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(plus:GPI (match_dup 1) (match_dup 2)))]
> +  ""
> +  "adds\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "alus_sreg")]
> +)
> +
>  (define_insn "*adds_shift_imm_<mode>"
>    [(set (reg:CC_NZ CC_REGNUM)
>  	(compare:CC_NZ
> -	 (plus:GPI (ASHIFT:GPI 
> +	 (plus:GPI (ASHIFT:GPI
>  		    (match_operand:GPI 1 "register_operand" "r")
>  		    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
>  		   (match_operand:GPI 3 "register_operand" "r")) @@ -2187,6 
> +2345,138 @@
>    [(set_attr "type" "adc_reg")]
>  )
>  
> +(define_expand "add<mode>3_carryinC"
> +  [(parallel
> +     [(set (match_dup 3)
> +	   (ne:CC_C
> +	     (plus:<DWI>
> +	       (plus:<DWI>
> +		 (match_dup 4)
> +		 (zero_extend:<DWI>
> +		   (match_operand:GPI 1 "register_operand" "r")))
> +	       (zero_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r")))
> +	   (zero_extend:<DWI>
> +	     (plus:GPI
> +	       (plus:GPI (match_dup 5) (match_dup 1))
> +	       (match_dup 2)))))
> +      (set (match_operand:GPI 0 "register_operand")
> +	   (plus:GPI
> +	     (plus:GPI (match_dup 5) (match_dup 1))
> +	     (match_dup 2)))])]
> +   ""
> +{
> +  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> +  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
> +  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
> +})
> +
> +(define_insn "*add<mode>3_carryinC_zero"
> +  [(set (reg:CC_C CC_REGNUM)
> +	(ne:CC_C
> +	  (plus:<DWI>
> +	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
> +	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (zero_extend:<DWI>
> +	    (plus:GPI
> +	      (match_operand:GPI 3 "aarch64_carry_operation" "")
> +	      (match_dup 1)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI (match_dup 3) (match_dup 1)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*add<mode>3_carryinC"
> +  [(set (reg:CC_C CC_REGNUM)
> +	(ne:CC_C
> +	  (plus:<DWI>
> +	    (plus:<DWI>
> +	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
> +	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (zero_extend:<DWI>
> +	    (plus:GPI
> +	      (plus:GPI
> +		(match_operand:GPI 4 "aarch64_carry_operation" "")
> +		(match_dup 1))
> +	      (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI
> +	  (plus:GPI (match_dup 4) (match_dup 1))
> +	  (match_dup 2)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_expand "add<mode>3_carryinV"
> +  [(parallel
> +     [(set (reg:CC_V CC_REGNUM)
> +	   (ne:CC_V
> +	     (plus:<DWI>
> +	       (plus:<DWI>
> +		 (match_dup 3)
> +		 (sign_extend:<DWI>
> +		   (match_operand:GPI 1 "register_operand" "r")))
> +	       (sign_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r")))
> +	   (sign_extend:<DWI>
> +	     (plus:GPI
> +	       (plus:GPI (match_dup 4) (match_dup 1))
> +	       (match_dup 2)))))
> +      (set (match_operand:GPI 0 "register_operand")
> +	   (plus:GPI
> +	     (plus:GPI (match_dup 4) (match_dup 1))
> +	     (match_dup 2)))])]
> +   ""
> +{
> +  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> +  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
> +  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
> +})
> +
> +(define_insn "*add<mode>3_carryinV_zero"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI
> +	      (match_operand:GPI 3 "aarch64_carry_operation" "")
> +	      (match_dup 1)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI (match_dup 3) (match_dup 1)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*add<mode>3_carryinV"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (plus:<DWI>
> +	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
> +	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI
> +	      (plus:GPI
> +		(match_operand:GPI 4 "aarch64_carry_operation" "")
> +		(match_dup 1))
> +	      (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI
> +	  (plus:GPI (match_dup 4) (match_dup 1))
> +	  (match_dup 2)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
>  (define_insn "*add_uxt<mode>_shift2"
>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>  	(plus:GPI (and:GPI
> @@ -2283,22 +2573,86 @@
>     (set_attr "simd" "*,yes")]
>  )
>  
> +(define_expand "subv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +(define_expand "usubv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
> +
> +  DONE;
> +})
> +
>  (define_expand "subti3"
>    [(set (match_operand:TI 0 "register_operand" "")
> -	(minus:TI (match_operand:TI 1 "register_operand" "")
> +	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
>  		  (match_operand:TI 2 "register_operand" "")))]
>    ""
>  {
> -  rtx low = gen_reg_rtx (DImode);
> -  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
> -				  gen_lowpart (DImode, operands[2])));
> +  rtx l0 = gen_reg_rtx (DImode);
> +  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
> +				subreg_lowpart_offset (DImode, TImode));
> +  rtx l2 = gen_lowpart (DImode, operands[2]);
> +  rtx h0 = gen_reg_rtx (DImode);
> +  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
> +				subreg_highpart_offset (DImode, TImode));
> +  rtx h2 = gen_highpart (DImode, operands[2]);
>  
> -  rtx high = gen_reg_rtx (DImode);
> -  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
> -				 gen_highpart (DImode, operands[2])));
> +  emit_insn (gen_subdi3_compare1 (l0, l1, l2));  emit_insn 
> + (gen_subdi3_carryin (h0, h1, h2));
>  
> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
> +  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +  DONE;
> +})
> +
> +(define_expand "subvti4"
> +  [(match_operand:TI 0 "register_operand")
> +   (match_operand:TI 1 "aarch64_reg_or_zero")
> +   (match_operand:TI 2 "aarch64_reg_or_imm")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
> +				    &l0, &l1, &l2, &h0, &h1, &h2);
> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +  DONE;
> +})
> +
> +(define_expand "usubvti4"
> +  [(match_operand:TI 0 "register_operand")
> +   (match_operand:TI 1 "aarch64_reg_or_zero")
> +   (match_operand:TI 2 "aarch64_reg_or_imm")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
> +				    &l0, &l1, &l2, &h0, &h1, &h2);
> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
> +
> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
>    DONE;
>  })
>  
> @@ -2327,6 +2681,22 @@
>    [(set_attr "type" "alus_sreg")]
>  )
>  
> +(define_insn "*sub<mode>3_compare1_imm"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
> +	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
> +	(plus:GPI
> +	  (match_dup 1)
> +	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
> +  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
> +  "@
> +  subs\\t%<w>0, %<w>1, %<w>2
> +  adds\\t%<w>0, %<w>1, %<w>3"
> +  [(set_attr "type" "alus_imm")]
> +)
> +
>  (define_insn "sub<mode>3_compare1"
>    [(set (reg:CC CC_REGNUM)
>  	(compare:CC
> @@ -2554,6 +2924,85 @@
>    [(set_attr "type" "adc_reg")]
>  )
>  
> +(define_expand "sub<mode>3_carryinCV"
> +  [(parallel
> +     [(set (reg:CC CC_REGNUM)
> +	   (compare:CC
> +	     (sign_extend:<DWI>
> +	       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
> +	     (plus:<DWI>
> +	       (sign_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r"))
> +	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
> +      (set (match_operand:GPI 0 "register_operand" "=r")
> +	   (minus:GPI
> +	     (minus:GPI (match_dup 1) (match_dup 2))
> +	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
> +   ""
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z1_z2"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (const_int 0)
> +	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, <w>zr, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z1"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (const_int 0)
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 1 "register_operand" "r"))
> +	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (neg:GPI (match_dup 1))
> +	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, <w>zr, %<w>1"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z2"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (sign_extend:<DWI>
> +	    (match_operand:GPI 1 "register_operand" "r"))
> +	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (match_dup 1)
> +	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (sign_extend:<DWI>
> +	    (match_operand:GPI 1 "register_operand" "r"))
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 2 "register_operand" "r"))
> +	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (minus:GPI (match_dup 1) (match_dup 2))
> +	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
>  (define_insn "*sub_uxt<mode>_shift2"
>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>  	(minus:GPI (match_operand:GPI 4 "register_operand" "rk") diff --git 
> a/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
> new file mode 100644
> index 0000000..0b31500
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +__int128 overflow_add (__int128 x, __int128 y) {
> +  __int128 r;
> +
> +  int ovr = __builtin_add_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +/* { dg-final { scan-assembler "adcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
> new file mode 100644
> index 0000000..9768a98
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long overflow_add (long x, long y)
> +{
> +  long r;
> +
> +  int ovr = __builtin_saddl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
> new file mode 100644
> index 0000000..126a526
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long long overflow_add (long long x, long long y) {
> +  long long r;
> +
> +  int ovr = __builtin_saddll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
> new file mode 100644
> index 0000000..c1261e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +__int128 overflow_sub (__int128 x, __int128 y) {
> +  __int128 r;
> +
> +  int ovr = __builtin_sub_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +/* { dg-final { scan-assembler "sbcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
> new file mode 100644
> index 0000000..1040464
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long overflow_sub (long x, long y)
> +{
> +  long r;
> +
> +  int ovr = __builtin_ssubl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
> new file mode 100644
> index 0000000..a03df88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long long overflow_sub (long long x, long long y) {
> +  long long r;
> +
> +  int ovr = __builtin_ssubll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
> new file mode 100644
> index 0000000..c573c2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned __int128 overflow_add (unsigned __int128 x, unsigned 
> +__int128 y) {
> +  unsigned __int128 r;
> +
> +  int ovr = __builtin_add_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +/* { dg-final { scan-assembler "adcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
> new file mode 100644
> index 0000000..e325591
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long overflow_add (unsigned long x, unsigned long y) {
> +  unsigned long r;
> +
> +  int ovr = __builtin_uaddl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
> new file mode 100644
> index 0000000..5f42886
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long long overflow_add (unsigned long long x, unsigned long 
> +long y) {
> +  unsigned long long r;
> +
> +  int ovr = __builtin_uaddll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
> new file mode 100644
> index 0000000..a84f4a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned __int128 overflow_sub (unsigned __int128 x, unsigned 
> +__int128 y) {
> +  unsigned __int128 r;
> +
> +  int ovr = __builtin_sub_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +/* { dg-final { scan-assembler "sbcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
> new file mode 100644
> index 0000000..ed033da
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long overflow_sub (unsigned long x, unsigned long y) {
> +  unsigned long r;
> +
> +  int ovr = __builtin_usubl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
> new file mode 100644
> index 0000000..a742f0c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long long overflow_sub (unsigned long long x, unsigned long 
> +long y) {
> +  unsigned long long r;
> +
> +  int ovr = __builtin_usubll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][Aarch64] Add support for overflow add and sub operations
  2017-07-06  7:29       ` Michael Collison
@ 2017-07-06  8:22         ` Richard Earnshaw (lists)
  0 siblings, 0 replies; 8+ messages in thread
From: Richard Earnshaw (lists) @ 2017-07-06  8:22 UTC (permalink / raw)
  To: Michael Collison; +Cc: gcc-patches

On 06/07/17 08:29, Michael Collison wrote:
> Richard,
> 
> Can you explain "Use of ne is wrong here.  The condition register should
> be set to the result of a compare rtl construct.  The same applies
> elsewhere within this patch.  NE is then used on the result of the
> comparison.  The mode of the compare then indicates what might or might
> not be valid in the way the comparison is finally constructed."?
> 
> Why is "ne" wrong? I don't doubt you are correct, but I see nothing in
> the internals manual that forbids it. I want to understand what issues
> this exposes.
> 

Because the idiomatic form on a machine with a flags register is

CCreg:mode = COMPARE:mode (A, B)

which is then used with

<cond-op> (CCreg:mode, 0)

where cond-op is NE, EQ, GE, ... as appropriate.


> As you indicate I used this idiom in the arm port when I added the
> overflow operations there as well. Additionally other targets seem to
> use the comparison operators this way (i386 for the umulv).

Some targets really have boolean predicate operations that set results
explicitly in GP registers as the truth of A < B, etc.  On those
machines using

 pred-reg = cond-op (A, B)

makes sense, but not on ARM or AArch64.

R.

> 
> Regards,
> 
> Michael Collison
> 
> -----Original Message-----
> From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com]
> Sent: Wednesday, July 5, 2017 2:38 AM
> To: Michael Collison <Michael.Collison@arm.com>; Christophe Lyon
> <christophe.lyon@linaro.org>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub
> operations
> 
> On 19/05/17 22:11, Michael Collison wrote:
>> Christophe,
>> 
>> I had a typo in the two test cases: "addcs" should have been "adcs". I caught this previously but submitted the previous patch incorrectly. Updated patch attached.
>> 
>> Okay for trunk?
>> 
> 
> Apologies for the delay responding, I've been procrastinating over this
> one.   In part it's due to the size of the patch with very little
> top-level description of what's the motivation and overall approach to
> the problem.
> 
> It would really help review if this could be split into multiple patches
> with a description of what each stage achieves.
> 
> Anyway, there are a couple of obvious formatting issues to deal with
> first, before we get into the details of the patch.
> 
>> -----Original Message-----
>> From: Christophe Lyon [mailto:christophe.lyon@linaro.org]
>> Sent: Friday, May 19, 2017 3:59 AM
>> To: Michael Collison <Michael.Collison@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
>> Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub 
>> operations
>> 
>> Hi Michael,
>> 
>> 
>> On 19 May 2017 at 07:12, Michael Collison <Michael.Collison@arm.com> wrote:
>>> Hi,
>>>
>>> This patch improves code generations for builtin arithmetic overflow operations for the aarch64 backend. As an example for a simple test case such as:
>>>
>>> Sure for a simple test case such as:
>>>
>>> int
>>> f (int x, int y, int *ovf)
>>> {
>>>   int res;
>>>   *ovf = __builtin_sadd_overflow (x, y, &res);
>>>   return res;
>>> }
>>>
>>> Current trunk at -O2 generates
>>>
>>> f:
>>>         mov     w3, w0
>>>         mov     w4, 0
>>>         add     w0, w0, w1
>>>         tbnz    w1, #31, .L4
>>>         cmp     w0, w3
>>>         blt     .L3
>>> .L2:
>>>         str     w4, [x2]
>>>         ret
>>>         .p2align 3
>>> .L4:
>>>         cmp     w0, w3
>>>         ble     .L2
>>> .L3:
>>>         mov     w4, 1
>>>         b       .L2
>>>
>>>
>>> With the patch this now generates:
>>>
>>> f:
>>>         adds    w0, w0, w1
>>>         cset    w1, vs
>>>         str     w1, [x2]
>>>         ret
>>>
>>>
>>> Original patch from Richard Henderson:
>>>
>>> https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01903.html
>>>
>>>
>>> Okay for trunk?
>>>
>>> 2017-05-17  Michael Collison  <michael.collison@arm.com>
>>>             Richard Henderson <rth@redhat.com>
>>>
>>>         * config/aarch64/aarch64-modes.def (CC_V): New.
>>>         * config/aarch64/aarch64-protos.h
>>>         (aarch64_add_128bit_scratch_regs): Declare
>>>         (aarch64_add_128bit_scratch_regs): Declare.
>>>         (aarch64_expand_subvti): Declare.
>>>         (aarch64_gen_unlikely_cbranch): Declare
>>>         * config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
>>>         for signed overflow using CC_Vmode.
>>>         (aarch64_get_condition_code_1): Handle CC_Vmode.
>>>         (aarch64_gen_unlikely_cbranch): New function.
>>>         (aarch64_add_128bit_scratch_regs): New function.
>>>         (aarch64_subv_128bit_scratch_regs): New function.
>>>         (aarch64_expand_subvti): New function.
>>>         * config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
>>>         (addti3): Create simpler code if low part is already known to be 0.
>>>         (addvti4, uaddvti4): New.
>>>         (*add<GPI>3_compareC_cconly_imm): New.
>>>         (*add<GPI>3_compareC_cconly): New.
>>>         (*add<GPI>3_compareC_imm): New.
>>>         (*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
>>>         handle constants within this pattern.
>>>         (*add<GPI>3_compareV_cconly_imm): New.
>>>         (*add<GPI>3_compareV_cconly): New.
>>>         (*add<GPI>3_compareV_imm): New.
>>>         (add<GPI>3_compareV): New.
>>>         (add<GPI>3_carryinC, add<GPI>3_carryinV): New.
>>>         (*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
>>>         (*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
>>>         (subv<GPI>4, usubv<GPI>4): New.
>>>         (subti): Handle op1 zero.
>>>         (subvti4, usub4ti4): New.
>>>         (*sub<GPI>3_compare1_imm): New.
>>>         (sub<GPI>3_carryinCV): New.
>>>         (*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
>>>         (*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
>>>         * testsuite/gcc.target/arm/builtin_sadd_128.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_saddl.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_saddll.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_uadd_128.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_uaddl.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_uaddll.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_ssub_128.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_ssubl.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_ssubll.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_usub_128.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_usubl.c: New testcase.
>>>         * testsuite/gcc.target/arm/builtin_usubll.c: New testcase.
>> 
>> I've tried your patch, and 2 of the new tests FAIL:
>>     gcc.target/aarch64/builtin_sadd_128.c scan-assembler addcs
>>     gcc.target/aarch64/builtin_uadd_128.c scan-assembler addcs
>> 
>> Am I missing something?
>> 
>> Thanks,
>> 
>> Christophe
>> 
>> 
>> pr6308v2.patch
>> 
>> 
>> diff --git a/gcc/config/aarch64/aarch64-modes.def 
>> b/gcc/config/aarch64/aarch64-modes.def
>> index 45f7a44..244e490 100644
>> --- a/gcc/config/aarch64/aarch64-modes.def
>> +++ b/gcc/config/aarch64/aarch64-modes.def
>> @@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
>>  CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
>>  CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
>>  CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
>> +CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
>>  
>>  /* Half-precision floating point for __fp16.  */  FLOAT_MODE (HF, 2, 
>> 0); diff --git a/gcc/config/aarch64/aarch64-protos.h 
>> b/gcc/config/aarch64/aarch64-protos.h
>> index f55d4ba..f38b2b8 100644
>> --- a/gcc/config/aarch64/aarch64-protos.h
>> +++ b/gcc/config/aarch64/aarch64-protos.h
>> @@ -388,6 +388,18 @@ void aarch64_relayout_simd_types (void);  void 
>> aarch64_reset_previous_fndecl (void);  bool 
>> aarch64_return_address_signing_enabled (void);  void 
>> aarch64_save_restore_target_globals (tree);
>> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
>> +                                   rtx *low_in1, rtx *low_in2,
>> +                                   rtx *high_dest, rtx *high_in1,
>> +                                   rtx *high_in2);
>> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
>> +                                    rtx *low_in1, rtx *low_in2,
>> +                                    rtx *high_dest, rtx *high_in1,
>> +                                    rtx *high_in2);
>> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
>> +                         rtx low_in2, rtx high_dest, rtx high_in1,
>> +                         rtx high_in2);
>> +
> 
> It's a little bit inconsistent, but the general style in
> aarch64-protos.h is not to include parameter names in prototypes, just
> their types.
> 
>>  
>>  /* Initialize builtins for SIMD intrinsics.  */  void 
>> init_aarch64_simd_builtins (void); @@ -412,6 +424,8 @@ bool 
>> aarch64_float_const_representable_p (rtx);
>>  
>>  #if defined (RTX_CODE)
>>  
>> +void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
>> +                                rtx label_ref);
>>  bool aarch64_legitimate_address_p (machine_mode, rtx, RTX_CODE, 
>> bool);  machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);  rtx 
>> aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); diff --git 
>> a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 
>> f343d92..71a651c 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -4716,6 +4716,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
>>        && GET_CODE (y) == ZERO_EXTEND)
>>      return CC_Cmode;
>>  
>> +  /* A test for signed overflow.  */
>> +  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
>> +      && code == NE
>> +      && GET_CODE (x) == PLUS
>> +      && GET_CODE (y) == SIGN_EXTEND)
>> +    return CC_Vmode;
>> +
>>    /* For everything else, return CCmode.  */
>>    return CCmode;
>>  }
>> @@ -4822,6 +4829,15 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
>>        }
>>        break;
>>  
>> +    case CC_Vmode:
>> +      switch (comp_code)
>> +     {
>> +     case NE: return AARCH64_VS;
>> +     case EQ: return AARCH64_VC;
>> +     default: return -1;
>> +     }
>> +      break;
>> +
>>      default:
>>        return -1;
>>      }
>> @@ -13630,6 +13646,88 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
>>    return true;
>>  }
>>  
>> +/* Generate RTL for a conditional branch with rtx comparison CODE in
>> +   mode CC_MODE.  The destination of the unlikely conditional branch
>> +   is LABEL_REF.  */
>> +
>> +void
>> +aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
>> +                           rtx label_ref)
>> +{
>> +  rtx x;
>> +  x = gen_rtx_fmt_ee (code, VOIDmode,
>> +                   gen_rtx_REG (cc_mode, CC_REGNUM),
>> +                   const0_rtx);
>> +
>> +  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
>> +                         gen_rtx_LABEL_REF (VOIDmode, label_ref),
>> +                         pc_rtx);
>> +  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); }
>> +
>> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx 
>> +*low_dest,
> 
> Function names must start in column 1, with the return type on the
> preceding line.  All functions should have a top-level comment
> describing what they do (their contract with the caller).
> 
>> +                                   rtx *low_in1, rtx *low_in2,
>> +                                   rtx *high_dest, rtx *high_in1,
>> +                                   rtx *high_in2)
>> +{
>> +  *low_dest = gen_reg_rtx (DImode);
>> +  *low_in1 = gen_lowpart (DImode, op1);
>> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
>> +                               subreg_lowpart_offset (DImode, TImode));
>> +  *high_dest = gen_reg_rtx (DImode);
>> +  *high_in1 = gen_highpart (DImode, op1);
>> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
>> +                                subreg_highpart_offset (DImode, TImode)); }
>> +
>> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx 
>> +*low_dest,
> 
> Same here.
> 
>> +                                    rtx *low_in1, rtx *low_in2,
>> +                                    rtx *high_dest, rtx *high_in1,
>> +                                    rtx *high_in2)
>> +{
>> +  *low_dest = gen_reg_rtx (DImode);
>> +  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
>> +                               subreg_lowpart_offset (DImode, TImode));
>> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
>> +                               subreg_lowpart_offset (DImode, TImode));
>> +  *high_dest = gen_reg_rtx (DImode);
>> +  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
>> +                                subreg_highpart_offset (DImode, TImode));
>> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
>> +                                subreg_highpart_offset (DImode, TImode));
>> +
>> +}
>> +
>> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
> And here.
> 
>> +                         rtx low_in2, rtx high_dest, rtx high_in1,
>> +                         rtx high_in2)
>> +{
>> +  if (low_in2 == const0_rtx)
>> +    {
>> +      low_dest = low_in1;
>> +      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
>> +                                   force_reg (DImode, high_in2)));
>> +    }
>> +  else
>> +    {
>> +      if (CONST_INT_P (low_in2))
>> +     {
>> +       low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
>> +       high_in2 = force_reg (DImode, high_in2);
>> +       emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
>> +     }
>> +      else
>> +     emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
>> +      emit_insn (gen_subdi3_carryinCV (high_dest,
>> +                                    force_reg (DImode, high_in1),
>> +                                    high_in2));
>> +    }
>> +
>> +  emit_move_insn (gen_lowpart (DImode, op0), low_dest);  
>> + emit_move_insn (gen_highpart (DImode, op0), high_dest);
>> +
>> +}
>> +
>>  /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
>>  
>>  static unsigned HOST_WIDE_INT
>> diff --git a/gcc/config/aarch64/aarch64.md 
>> b/gcc/config/aarch64/aarch64.md index a693a3b..3976ecb 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -1711,25 +1711,123 @@
>>    }
>>  )
>>  
>> +(define_expand "addv<mode>4"
>> +  [(match_operand:GPI 0 "register_operand")
>> +   (match_operand:GPI 1 "register_operand")
>> +   (match_operand:GPI 2 "register_operand")
>> +   (match_operand 3 "")]
>> +  ""
>> +{
>> +  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], 
>> +operands[2]));
>> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
>> +
>> +  DONE;
>> +})
>> +
>> +(define_expand "uaddv<mode>4"
>> +  [(match_operand:GPI 0 "register_operand")
>> +   (match_operand:GPI 1 "register_operand")
>> +   (match_operand:GPI 2 "register_operand")
>> +   (match_operand 3 "")]
> 
> With no rtl in the expand to describe this pattern, it really should
> have a top-level comment explaining the arguments (reference to the
> manual is probably OK in this case).
> 
>> +  ""
>> +{
>> +  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], 
>> +operands[2]));
>> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
>> +
>> +  DONE;
>> +})
>> +
>> +
>>  (define_expand "addti3"
>>    [(set (match_operand:TI 0 "register_operand" "")
>>        (plus:TI (match_operand:TI 1 "register_operand" "")
>> -              (match_operand:TI 2 "register_operand" "")))]
>> +              (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
>>    ""
>>  {
>> -  rtx low = gen_reg_rtx (DImode);
>> -  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
>> -                               gen_lowpart (DImode, operands[2])));
>> +  rtx l0,l1,l2,h0,h1,h2;
>>  
>> -  rtx high = gen_reg_rtx (DImode);
>> -  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
>> -                              gen_highpart (DImode, operands[2])));
>> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
>> +                                &l0, &l1, &l2, &h0, &h1, &h2);
>> +
>> +  if (l2 == const0_rtx)
>> +    {
>> +      l0 = l1;
>> +      if (!aarch64_pluslong_operand (h2, DImode))
>> +     h2 = force_reg (DImode, h2);
>> +      emit_insn (gen_adddi3 (h0, h1, h2));
>> +    }
>> +  else
>> +    {
>> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
>> +      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
>> +    }
>> +
>> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
>> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
>>  
>> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
>> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
>>    DONE;
>>  })
>>  
>> +(define_expand "addvti4"
>> +  [(match_operand:TI 0 "register_operand" "")
>> +   (match_operand:TI 1 "register_operand" "")
>> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
>> +   (match_operand 3 "")]
> 
> Same here.
> 
>> +  ""
>> +{
>> +  rtx l0,l1,l2,h0,h1,h2;
>> +
>> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
>> +                                &l0, &l1, &l2, &h0, &h1, &h2);
>> +
>> +  if (l2 == const0_rtx)
>> +    {
>> +      l0 = l1;
>> +      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
>> +    }
>> +  else
>> +    {
>> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
>> +      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
>> +    }
>> +
>> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
>> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
>> +
>> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
>> +  DONE;
>> +})
>> +
>> +(define_expand "uaddvti4"
>> +  [(match_operand:TI 0 "register_operand" "")
>> +   (match_operand:TI 1 "register_operand" "")
>> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
>> +   (match_operand 3 "")]
>> +  ""
>> +{
>> +  rtx l0,l1,l2,h0,h1,h2;
>> +
>> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
>> +                                &l0, &l1, &l2, &h0, &h1, &h2);
>> +
>> +  if (l2 == const0_rtx)
>> +    {
>> +      l0 = l1;
>> +      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
>> +    }
>> +  else
>> +    {
>> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
>> +      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
>> +    }
>> +
>> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
>> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
>> +
>> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);  DONE;
>> + })
>> +
>>  (define_insn "add<mode>3_compare0"
>>    [(set (reg:CC_NZ CC_REGNUM)
>>        (compare:CC_NZ
>> @@ -1828,10 +1926,70 @@
>>    [(set_attr "type" "alus_sreg")]
>>  )
>>  
>> +;; Note that since we're sign-extending, match the immediate in GPI 
>> +;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
>> +(define_insn "*add<mode>3_compareV_cconly_imm"
>> +  [(set (reg:CC_V CC_REGNUM)
>> +     (ne:CC_V
>> +       (plus:<DWI>
>> +         (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
>> +         (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
>> +       (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
>> +  ""
>> +  "@
>> +  cmn\\t%<w>0, %<w>1
>> +  cmp\\t%<w>0, #%n1"
>> +  [(set_attr "type" "alus_imm")]
>> +)
>> +
>> +(define_insn "*add<mode>3_compareV_cconly"
>> +  [(set (reg:CC_V CC_REGNUM)
>> +     (ne:CC_V
> 
> Use of ne is wrong here.  The condition register should be set to the
> result of a compare rtl construct.  The same applies elsewhere within
> this patch.  NE is then used on the result of the comparison.  The mode
> of the compare then indicates what might or might not be valid in the
> way the comparison is finally constructed.
> 
> Note that this issue may go back to the earlier patches that this is
> based on, but those are equally incorrect and will need fixing as well at
> some point.  We shouldn't perpetuate the issue.
> 
>> +       (plus:<DWI>
>> +         (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
>> +         (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
>> +       (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
>> +  ""
>> +  "cmn\\t%<w>0, %<w>1"
>> +  [(set_attr "type" "alus_sreg")]
>> +)
>> +
>> +(define_insn "*add<mode>3_compareV_imm"
>> +  [(set (reg:CC_V CC_REGNUM)
>> +     (ne:CC_V
>> +       (plus:<DWI>
>> +         (sign_extend:<DWI>
>> +           (match_operand:GPI 1 "register_operand" "r,r"))
>> +         (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
>> +       (sign_extend:<DWI>
>> +         (plus:GPI (match_dup 1) (match_dup 2)))))
>> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
>> +     (plus:GPI (match_dup 1) (match_dup 2)))]
>> +   ""
>> +   "@
>> +   adds\\t%<w>0, %<w>1, %<w>2
>> +   subs\\t%<w>0, %<w>1, #%n2"
>> +  [(set_attr "type" "alus_imm,alus_imm")]
>> +)
>> +
>> +(define_insn "add<mode>3_compareV"
>> +  [(set (reg:CC_V CC_REGNUM)
>> +     (ne:CC_V
>> +       (plus:<DWI>
>> +         (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
>> +         (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
>> +       (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
>> +   (set (match_operand:GPI 0 "register_operand" "=r")
>> +     (plus:GPI (match_dup 1) (match_dup 2)))]
>> +  ""
>> +  "adds\\t%<w>0, %<w>1, %<w>2"
>> +  [(set_attr "type" "alus_sreg")]
>> +)
>> +
>>  (define_insn "*adds_shift_imm_<mode>"
>>    [(set (reg:CC_NZ CC_REGNUM)
>>        (compare:CC_NZ
>> -      (plus:GPI (ASHIFT:GPI 
>> +      (plus:GPI (ASHIFT:GPI
>>                    (match_operand:GPI 1 "register_operand" "r")
>>                    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
>>                   (match_operand:GPI 3 "register_operand" "r")) @@ -2187,6 
>> +2345,138 @@
>>    [(set_attr "type" "adc_reg")]
>>  )
>>  
>> +(define_expand "add<mode>3_carryinC"
>> +  [(parallel
>> +     [(set (match_dup 3)
>> +        (ne:CC_C
>> +          (plus:<DWI>
>> +            (plus:<DWI>
>> +              (match_dup 4)
>> +              (zero_extend:<DWI>
>> +                (match_operand:GPI 1 "register_operand" "r")))
>> +            (zero_extend:<DWI>
>> +              (match_operand:GPI 2 "register_operand" "r")))
>> +        (zero_extend:<DWI>
>> +          (plus:GPI
>> +            (plus:GPI (match_dup 5) (match_dup 1))
>> +            (match_dup 2)))))
>> +      (set (match_operand:GPI 0 "register_operand")
>> +        (plus:GPI
>> +          (plus:GPI (match_dup 5) (match_dup 1))
>> +          (match_dup 2)))])]
>> +   ""
>> +{
>> +  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
>> +  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
>> +  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
>> +})
>> +
>> +(define_insn "*add<mode>3_carryinC_zero"
>> +  [(set (reg:CC_C CC_REGNUM)
>> +     (ne:CC_C
>> +       (plus:<DWI>
>> +         (match_operand:<DWI> 2 "aarch64_carry_operation" "")
>> +         (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
>> +       (zero_extend:<DWI>
>> +         (plus:GPI
>> +           (match_operand:GPI 3 "aarch64_carry_operation" "")
>> +           (match_dup 1)))))
>> +   (set (match_operand:GPI 0 "register_operand")
>> +     (plus:GPI (match_dup 3) (match_dup 1)))]
>> +   ""
>> +   "adcs\\t%<w>0, %<w>1, <w>zr"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>> +(define_insn "*add<mode>3_carryinC"
>> +  [(set (reg:CC_C CC_REGNUM)
>> +     (ne:CC_C
>> +       (plus:<DWI>
>> +         (plus:<DWI>
>> +           (match_operand:<DWI> 3 "aarch64_carry_operation" "")
>> +           (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
>> +         (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
>> +       (zero_extend:<DWI>
>> +         (plus:GPI
>> +           (plus:GPI
>> +             (match_operand:GPI 4 "aarch64_carry_operation" "")
>> +             (match_dup 1))
>> +           (match_dup 2)))))
>> +   (set (match_operand:GPI 0 "register_operand")
>> +     (plus:GPI
>> +       (plus:GPI (match_dup 4) (match_dup 1))
>> +       (match_dup 2)))]
>> +   ""
>> +   "adcs\\t%<w>0, %<w>1, %<w>2"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>> +(define_expand "add<mode>3_carryinV"
>> +  [(parallel
>> +     [(set (reg:CC_V CC_REGNUM)
>> +        (ne:CC_V
>> +          (plus:<DWI>
>> +            (plus:<DWI>
>> +              (match_dup 3)
>> +              (sign_extend:<DWI>
>> +                (match_operand:GPI 1 "register_operand" "r")))
>> +            (sign_extend:<DWI>
>> +              (match_operand:GPI 2 "register_operand" "r")))
>> +        (sign_extend:<DWI>
>> +          (plus:GPI
>> +            (plus:GPI (match_dup 4) (match_dup 1))
>> +            (match_dup 2)))))
>> +      (set (match_operand:GPI 0 "register_operand")
>> +        (plus:GPI
>> +          (plus:GPI (match_dup 4) (match_dup 1))
>> +          (match_dup 2)))])]
>> +   ""
>> +{
>> +  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
>> +  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
>> +  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
>> +})
>> +
>> +(define_insn "*add<mode>3_carryinV_zero"
>> +  [(set (reg:CC_V CC_REGNUM)
>> +     (ne:CC_V
>> +       (plus:<DWI>
>> +         (match_operand:<DWI> 2 "aarch64_carry_operation" "")
>> +         (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
>> +       (sign_extend:<DWI>
>> +         (plus:GPI
>> +           (match_operand:GPI 3 "aarch64_carry_operation" "")
>> +           (match_dup 1)))))
>> +   (set (match_operand:GPI 0 "register_operand")
>> +     (plus:GPI (match_dup 3) (match_dup 1)))]
>> +   ""
>> +   "adcs\\t%<w>0, %<w>1, <w>zr"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>> +(define_insn "*add<mode>3_carryinV"
>> +  [(set (reg:CC_V CC_REGNUM)
>> +     (ne:CC_V
>> +       (plus:<DWI>
>> +         (plus:<DWI>
>> +           (match_operand:<DWI> 3 "aarch64_carry_operation" "")
>> +           (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
>> +         (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
>> +       (sign_extend:<DWI>
>> +         (plus:GPI
>> +           (plus:GPI
>> +             (match_operand:GPI 4 "aarch64_carry_operation" "")
>> +             (match_dup 1))
>> +           (match_dup 2)))))
>> +   (set (match_operand:GPI 0 "register_operand")
>> +     (plus:GPI
>> +       (plus:GPI (match_dup 4) (match_dup 1))
>> +       (match_dup 2)))]
>> +   ""
>> +   "adcs\\t%<w>0, %<w>1, %<w>2"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>>  (define_insn "*add_uxt<mode>_shift2"
>>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>>        (plus:GPI (and:GPI
>> @@ -2283,22 +2573,86 @@
>>     (set_attr "simd" "*,yes")]
>>  )
>>  
>> +(define_expand "subv<mode>4"
>> +  [(match_operand:GPI 0 "register_operand")
>> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
>> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
>> +   (match_operand 3 "")]
>> +  ""
>> +{
>> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], 
>> +operands[2]));
>> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
>> +
>> +  DONE;
>> +})
>> +
>> +(define_expand "usubv<mode>4"
>> +  [(match_operand:GPI 0 "register_operand")
>> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
>> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
>> +   (match_operand 3 "")]
>> +  ""
>> +{
>> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], 
>> +operands[2]));
>> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
>> +
>> +  DONE;
>> +})
>> +
>>  (define_expand "subti3"
>>    [(set (match_operand:TI 0 "register_operand" "")
>> -     (minus:TI (match_operand:TI 1 "register_operand" "")
>> +     (minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
>>                  (match_operand:TI 2 "register_operand" "")))]
>>    ""
>>  {
>> -  rtx low = gen_reg_rtx (DImode);
>> -  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
>> -                               gen_lowpart (DImode, operands[2])));
>> +  rtx l0 = gen_reg_rtx (DImode);
>> +  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
>> +                             subreg_lowpart_offset (DImode, TImode));
>> +  rtx l2 = gen_lowpart (DImode, operands[2]);
>> +  rtx h0 = gen_reg_rtx (DImode);
>> +  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
>> +                             subreg_highpart_offset (DImode, TImode));
>> +  rtx h2 = gen_highpart (DImode, operands[2]);
>>  
>> -  rtx high = gen_reg_rtx (DImode);
>> -  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
>> -                              gen_highpart (DImode, operands[2])));
>> +  emit_insn (gen_subdi3_compare1 (l0, l1, l2));  emit_insn 
>> + (gen_subdi3_carryin (h0, h1, h2));
>>  
>> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
>> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
>> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
>> +  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
>> +  DONE;
>> +})
>> +
>> +(define_expand "subvti4"
>> +  [(match_operand:TI 0 "register_operand")
>> +   (match_operand:TI 1 "aarch64_reg_or_zero")
>> +   (match_operand:TI 2 "aarch64_reg_or_imm")
>> +   (match_operand 3 "")]
>> +  ""
>> +{
>> +  rtx l0,l1,l2,h0,h1,h2;
>> +
>> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
>> +                                 &l0, &l1, &l2, &h0, &h1, &h2);
>> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
>> +
>> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
>> +  DONE;
>> +})
>> +
>> +(define_expand "usubvti4"
>> +  [(match_operand:TI 0 "register_operand")
>> +   (match_operand:TI 1 "aarch64_reg_or_zero")
>> +   (match_operand:TI 2 "aarch64_reg_or_imm")
>> +   (match_operand 3 "")]
>> +  ""
>> +{
>> +  rtx l0,l1,l2,h0,h1,h2;
>> +
>> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
>> +                                 &l0, &l1, &l2, &h0, &h1, &h2);
>> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
>> +
>> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
>>    DONE;
>>  })
>>  
>> @@ -2327,6 +2681,22 @@
>>    [(set_attr "type" "alus_sreg")]
>>  )
>>  
>> +(define_insn "*sub<mode>3_compare1_imm"
>> +  [(set (reg:CC CC_REGNUM)
>> +     (compare:CC
>> +       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
>> +       (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
>> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
>> +     (plus:GPI
>> +       (match_dup 1)
>> +       (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
>> +  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
>> +  "@
>> +  subs\\t%<w>0, %<w>1, %<w>2
>> +  adds\\t%<w>0, %<w>1, %<w>3"
>> +  [(set_attr "type" "alus_imm")]
>> +)
>> +
>>  (define_insn "sub<mode>3_compare1"
>>    [(set (reg:CC CC_REGNUM)
>>        (compare:CC
>> @@ -2554,6 +2924,85 @@
>>    [(set_attr "type" "adc_reg")]
>>  )
>>  
>> +(define_expand "sub<mode>3_carryinCV"
>> +  [(parallel
>> +     [(set (reg:CC CC_REGNUM)
>> +        (compare:CC
>> +          (sign_extend:<DWI>
>> +            (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
>> +          (plus:<DWI>
>> +            (sign_extend:<DWI>
>> +              (match_operand:GPI 2 "register_operand" "r"))
>> +            (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
>> +      (set (match_operand:GPI 0 "register_operand" "=r")
>> +        (minus:GPI
>> +          (minus:GPI (match_dup 1) (match_dup 2))
>> +          (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
>> +   ""
>> +)
>> +
>> +(define_insn "*sub<mode>3_carryinCV_z1_z2"
>> +  [(set (reg:CC CC_REGNUM)
>> +     (compare:CC
>> +       (const_int 0)
>> +       (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
>> +   (set (match_operand:GPI 0 "register_operand" "=r")
>> +     (neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
>> +   ""
>> +   "sbcs\\t%<w>0, <w>zr, <w>zr"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>> +(define_insn "*sub<mode>3_carryinCV_z1"
>> +  [(set (reg:CC CC_REGNUM)
>> +     (compare:CC
>> +       (const_int 0)
>> +       (plus:<DWI>
>> +         (sign_extend:<DWI>
>> +           (match_operand:GPI 1 "register_operand" "r"))
>> +         (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
>> +   (set (match_operand:GPI 0 "register_operand" "=r")
>> +     (minus:GPI
>> +       (neg:GPI (match_dup 1))
>> +       (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
>> +   ""
>> +   "sbcs\\t%<w>0, <w>zr, %<w>1"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>> +(define_insn "*sub<mode>3_carryinCV_z2"
>> +  [(set (reg:CC CC_REGNUM)
>> +     (compare:CC
>> +       (sign_extend:<DWI>
>> +         (match_operand:GPI 1 "register_operand" "r"))
>> +       (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
>> +   (set (match_operand:GPI 0 "register_operand" "=r")
>> +     (minus:GPI
>> +       (match_dup 1)
>> +       (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
>> +   ""
>> +   "sbcs\\t%<w>0, %<w>1, <w>zr"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>> +(define_insn "*sub<mode>3_carryinCV"
>> +  [(set (reg:CC CC_REGNUM)
>> +     (compare:CC
>> +       (sign_extend:<DWI>
>> +         (match_operand:GPI 1 "register_operand" "r"))
>> +       (plus:<DWI>
>> +         (sign_extend:<DWI>
>> +           (match_operand:GPI 2 "register_operand" "r"))
>> +         (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
>> +   (set (match_operand:GPI 0 "register_operand" "=r")
>> +     (minus:GPI
>> +       (minus:GPI (match_dup 1) (match_dup 2))
>> +       (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
>> +   ""
>> +   "sbcs\\t%<w>0, %<w>1, %<w>2"
>> +  [(set_attr "type" "adc_reg")]
>> +)
>> +
>>  (define_insn "*sub_uxt<mode>_shift2"
>>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>>        (minus:GPI (match_operand:GPI 4 "register_operand" "rk") diff --git 
>> a/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
>> new file mode 100644
>> index 0000000..0b31500
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +__int128 overflow_add (__int128 x, __int128 y) {
>> +  __int128 r;
>> +
>> +  int ovr = __builtin_add_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "adds" } } */
>> +/* { dg-final { scan-assembler "adcs" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
>> new file mode 100644
>> index 0000000..9768a98
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
>> @@ -0,0 +1,17 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +long overflow_add (long x, long y)
>> +{
>> +  long r;
>> +
>> +  int ovr = __builtin_saddl_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "adds" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
>> new file mode 100644
>> index 0000000..126a526
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +long long overflow_add (long long x, long long y) {
>> +  long long r;
>> +
>> +  int ovr = __builtin_saddll_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "adds" } } */
>> +
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
>> new file mode 100644
>> index 0000000..c1261e3
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +__int128 overflow_sub (__int128 x, __int128 y) {
>> +  __int128 r;
>> +
>> +  int ovr = __builtin_sub_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "subs" } } */
>> +/* { dg-final { scan-assembler "sbcs" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
>> new file mode 100644
>> index 0000000..1040464
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
>> @@ -0,0 +1,17 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +long overflow_sub (long x, long y)
>> +{
>> +  long r;
>> +
>> +  int ovr = __builtin_ssubl_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "subs" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
>> new file mode 100644
>> index 0000000..a03df88
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +long long overflow_sub (long long x, long long y) {
>> +  long long r;
>> +
>> +  int ovr = __builtin_ssubll_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "subs" } } */
>> +
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
>> new file mode 100644
>> index 0000000..c573c2a
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +unsigned __int128 overflow_add (unsigned __int128 x, unsigned 
>> +__int128 y) {
>> +  unsigned __int128 r;
>> +
>> +  int ovr = __builtin_add_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "adds" } } */
>> +/* { dg-final { scan-assembler "adcs" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
>> new file mode 100644
>> index 0000000..e325591
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
>> @@ -0,0 +1,17 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +unsigned long overflow_add (unsigned long x, unsigned long y) {
>> +  unsigned long r;
>> +
>> +  int ovr = __builtin_uaddl_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "adds" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
>> new file mode 100644
>> index 0000000..5f42886
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +unsigned long long overflow_add (unsigned long long x, unsigned long 
>> +long y) {
>> +  unsigned long long r;
>> +
>> +  int ovr = __builtin_uaddll_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "adds" } } */
>> +
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
>> new file mode 100644
>> index 0000000..a84f4a4
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +unsigned __int128 overflow_sub (unsigned __int128 x, unsigned 
>> +__int128 y) {
>> +  unsigned __int128 r;
>> +
>> +  int ovr = __builtin_sub_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "subs" } } */
>> +/* { dg-final { scan-assembler "sbcs" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
>> new file mode 100644
>> index 0000000..ed033da
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
>> @@ -0,0 +1,17 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +unsigned long overflow_sub (unsigned long x, unsigned long y) {
>> +  unsigned long r;
>> +
>> +  int ovr = __builtin_usubl_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "subs" } } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c 
>> b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
>> new file mode 100644
>> index 0000000..a742f0c
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
>> @@ -0,0 +1,18 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" }  */
>> +
>> +extern void overflow_handler ();
>> +
>> +unsigned long long overflow_sub (unsigned long long x, unsigned long 
>> +long y) {
>> +  unsigned long long r;
>> +
>> +  int ovr = __builtin_usubll_overflow (x, y, &r);  if (ovr)
>> +    overflow_handler ();
>> +
>> +  return r;
>> +}
>> +
>> +/* { dg-final { scan-assembler "subs" } } */
>> +
>> 
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH][Aarch64] Add support for overflow add and sub operations
  2017-07-05  9:38     ` Richard Earnshaw (lists)
  2017-07-06  7:29       ` Michael Collison
@ 2017-08-01  6:33       ` Michael Collison
  1 sibling, 0 replies; 8+ messages in thread
From: Michael Collison @ 2017-08-01  6:33 UTC (permalink / raw)
  To: Richard Earnshaw, Christophe Lyon; +Cc: gcc-patches, nd

[-- Attachment #1: Type: text/plain, Size: 42133 bytes --]

Updated the patch per Richard's comments in particular the issues relating to use of NE: " Use of ne is wrong here.  The condition register should be set to the result of a compare rtl construct.  The same applies elsewhere within this patch.  NE is then used on the result of the comparison.  The mode of the compare then indicates what might or might not be valid in the way the comparison is finally constructed."

Okay for trunk?

2017-08-01  Michael Collison  <michael.collison@arm.com>
	    Richard Henderson <rth@redhat.com>

	* config/aarch64/aarch64-modes.def (CC_V): New.
	* config/aarch64/aarch64-protos.h
	(aarch64_add_128bit_scratch_regs): Declare.
	(aarch64_subv_128bit_scratch_regs): Declare.
	(aarch64_expand_subvti): Declare.
	(aarch64_gen_unlikely_cbranch): Declare.
	* config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
	for signed overflow using CC_Vmode.
	(aarch64_get_condition_code_1): Handle CC_Vmode.
	(aarch64_gen_unlikely_cbranch): New function.
	(aarch64_add_128bit_scratch_regs): New function.
	(aarch64_subv_128bit_scratch_regs): New function.
	(aarch64_expand_subvti): New function.
	* config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
	(addti3): Create simpler code if low part is already known to be 0.
	(addvti4, uaddvti4): New.
	(*add<GPI>3_compareC_cconly_imm): New.
	(*add<GPI>3_compareC_cconly): New.
	(*add<GPI>3_compareC_imm): New.
	(*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
	handle constants within this pattern.
	(*add<GPI>3_compareV_cconly_imm): New.
	(*add<GPI>3_compareV_cconly): New.
	(*add<GPI>3_compareV_imm): New.
	(add<GPI>3_compareV): New.
	(add<GPI>3_carryinC, add<GPI>3_carryinV): New.
	(*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
	(*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
	(subv<GPI>4, usubv<GPI>4): New.
	(subti): Handle op1 zero.
	(subvti4, usubvti4): New.
	(*sub<GPI>3_compare1_imm): New.
	(sub<GPI>3_carryinCV): New.
	(*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
	(*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
	* testsuite/gcc.target/aarch64/builtin_sadd_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_saddl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_saddll.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_uadd_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_uaddl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_uaddll.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_ssub_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_ssubl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_ssubll.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_usub_128.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_usubl.c: New testcase.
	* testsuite/gcc.target/aarch64/builtin_usubll.c: New testcase.

-----Original Message-----
From: Richard Earnshaw (lists) [mailto:Richard.Earnshaw@arm.com] 
Sent: Wednesday, July 5, 2017 2:38 AM
To: Michael Collison <Michael.Collison@arm.com>; Christophe Lyon <christophe.lyon@linaro.org>
Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub operations

On 19/05/17 22:11, Michael Collison wrote:
> Christophe,
> 
> I had a typo in the two test cases: "addcs" should have been "adcs". I caught this previously but submitted the previous patch incorrectly. Updated patch attached.
> 
> Okay for trunk?
> 

Apologies for the delay responding, I've been procrastinating over this
one.   In part it's due to the size of the patch with very little
top-level description of what's the motivation and overall approach to the problem.

It would really help review if this could be split into multiple patches with a description of what each stage achieves.

Anyway, there are a couple of obvious formatting issues to deal with first, before we get into the details of the patch.

> -----Original Message-----
> From: Christophe Lyon [mailto:christophe.lyon@linaro.org]
> Sent: Friday, May 19, 2017 3:59 AM
> To: Michael Collison <Michael.Collison@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH][Aarch64] Add support for overflow add and sub 
> operations
> 
> Hi Michael,
> 
> 
> On 19 May 2017 at 07:12, Michael Collison <Michael.Collison@arm.com> wrote:
>> Hi,
>>
>> This patch improves code generations for builtin arithmetic overflow operations for the aarch64 backend. As an example for a simple test case such as:
>>
>> Sure for a simple test case such as:
>>
>> int
>> f (int x, int y, int *ovf)
>> {
>>   int res;
>>   *ovf = __builtin_sadd_overflow (x, y, &res);
>>   return res;
>> }
>>
>> Current trunk at -O2 generates
>>
>> f:
>>         mov     w3, w0
>>         mov     w4, 0
>>         add     w0, w0, w1
>>         tbnz    w1, #31, .L4
>>         cmp     w0, w3
>>         blt     .L3
>> .L2:
>>         str     w4, [x2]
>>         ret
>>         .p2align 3
>> .L4:
>>         cmp     w0, w3
>>         ble     .L2
>> .L3:
>>         mov     w4, 1
>>         b       .L2
>>
>>
>> With the patch this now generates:
>>
>> f:
>>         adds    w0, w0, w1
>>         cset    w1, vs
>>         str     w1, [x2]
>>         ret
>>
>>
>> Original patch from Richard Henderson:
>>
>> https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01903.html
>>
>>
>> Okay for trunk?
>>
>> 2017-05-17  Michael Collison  <michael.collison@arm.com>
>>             Richard Henderson <rth@redhat.com>
>>
>>         * config/aarch64/aarch64-modes.def (CC_V): New.
>>         * config/aarch64/aarch64-protos.h
>>         (aarch64_add_128bit_scratch_regs): Declare
>>         (aarch64_add_128bit_scratch_regs): Declare.
>>         (aarch64_expand_subvti): Declare.
>>         (aarch64_gen_unlikely_cbranch): Declare
>>         * config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
>>         for signed overflow using CC_Vmode.
>>         (aarch64_get_condition_code_1): Handle CC_Vmode.
>>         (aarch64_gen_unlikely_cbranch): New function.
>>         (aarch64_add_128bit_scratch_regs): New function.
>>         (aarch64_subv_128bit_scratch_regs): New function.
>>         (aarch64_expand_subvti): New function.
>>         * config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
>>         (addti3): Create simpler code if low part is already known to be 0.
>>         (addvti4, uaddvti4): New.
>>         (*add<GPI>3_compareC_cconly_imm): New.
>>         (*add<GPI>3_compareC_cconly): New.
>>         (*add<GPI>3_compareC_imm): New.
>>         (*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
>>         handle constants within this pattern.
>>         (*add<GPI>3_compareV_cconly_imm): New.
>>         (*add<GPI>3_compareV_cconly): New.
>>         (*add<GPI>3_compareV_imm): New.
>>         (add<GPI>3_compareV): New.
>>         (add<GPI>3_carryinC, add<GPI>3_carryinV): New.
>>         (*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
>>         (*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
>>         (subv<GPI>4, usubv<GPI>4): New.
>>         (subti): Handle op1 zero.
>>         (subvti4, usub4ti4): New.
>>         (*sub<GPI>3_compare1_imm): New.
>>         (sub<GPI>3_carryinCV): New.
>>         (*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
>>         (*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.
>>         * testsuite/gcc.target/arm/builtin_sadd_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_saddl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_saddll.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_uadd_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_uaddl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_uaddll.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_ssub_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_ssubl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_ssubll.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_usub_128.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_usubl.c: New testcase.
>>         * testsuite/gcc.target/arm/builtin_usubll.c: New testcase.
> 
> I've tried your patch, and 2 of the new tests FAIL:
>     gcc.target/aarch64/builtin_sadd_128.c scan-assembler addcs
>     gcc.target/aarch64/builtin_uadd_128.c scan-assembler addcs
> 
> Am I missing something?
> 
> Thanks,
> 
> Christophe
> 
> 
> pr6308v2.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64-modes.def 
> b/gcc/config/aarch64/aarch64-modes.def
> index 45f7a44..244e490 100644
> --- a/gcc/config/aarch64/aarch64-modes.def
> +++ b/gcc/config/aarch64/aarch64-modes.def
> @@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
>  CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
>  CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
>  CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
> +CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
>  
>  /* Half-precision floating point for __fp16.  */  FLOAT_MODE (HF, 2, 
> 0); diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index f55d4ba..f38b2b8 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -388,6 +388,18 @@ void aarch64_relayout_simd_types (void);  void 
> aarch64_reset_previous_fndecl (void);  bool 
> aarch64_return_address_signing_enabled (void);  void 
> aarch64_save_restore_target_globals (tree);
> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
> +				      rtx *low_in1, rtx *low_in2,
> +				      rtx *high_dest, rtx *high_in1,
> +				      rtx *high_in2);
> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
> +				       rtx *low_in1, rtx *low_in2,
> +				       rtx *high_dest, rtx *high_in1,
> +				       rtx *high_in2);
> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
> +			    rtx low_in2, rtx high_dest, rtx high_in1,
> +			    rtx high_in2);
> +

It's a little bit inconsistent, but the general style in aarch64-protos.h is not to include parameter names in prototypes, just their types.

>  
>  /* Initialize builtins for SIMD intrinsics.  */  void 
> init_aarch64_simd_builtins (void); @@ -412,6 +424,8 @@ bool 
> aarch64_float_const_representable_p (rtx);
>  
>  #if defined (RTX_CODE)
>  
> +void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
> +				   rtx label_ref);
>  bool aarch64_legitimate_address_p (machine_mode, rtx, RTX_CODE, 
> bool);  machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);  rtx 
> aarch64_gen_compare_reg (RTX_CODE, rtx, rtx); diff --git 
> a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 
> f343d92..71a651c 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -4716,6 +4716,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
>        && GET_CODE (y) == ZERO_EXTEND)
>      return CC_Cmode;
>  
> +  /* A test for signed overflow.  */
> +  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
> +      && code == NE
> +      && GET_CODE (x) == PLUS
> +      && GET_CODE (y) == SIGN_EXTEND)
> +    return CC_Vmode;
> +
>    /* For everything else, return CCmode.  */
>    return CCmode;
>  }
> @@ -4822,6 +4829,15 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
>  	}
>        break;
>  
> +    case CC_Vmode:
> +      switch (comp_code)
> +	{
> +	case NE: return AARCH64_VS;
> +	case EQ: return AARCH64_VC;
> +	default: return -1;
> +	}
> +      break;
> +
>      default:
>        return -1;
>      }
> @@ -13630,6 +13646,88 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
>    return true;
>  }
>  
> +/* Generate RTL for a conditional branch with rtx comparison CODE in
> +   mode CC_MODE.  The destination of the unlikely conditional branch
> +   is LABEL_REF.  */
> +
> +void
> +aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
> +			      rtx label_ref)
> +{
> +  rtx x;
> +  x = gen_rtx_fmt_ee (code, VOIDmode,
> +		      gen_rtx_REG (cc_mode, CC_REGNUM),
> +		      const0_rtx);
> +
> +  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
> +			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
> +			    pc_rtx);
> +  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); }
> +
> +void aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx 
> +*low_dest,

Function names must start in column 1, with the return type on the preceding line.  All functions should have a top-level comment describing what they do (their contract with the caller).

> +				      rtx *low_in1, rtx *low_in2,
> +				      rtx *high_dest, rtx *high_in1,
> +				      rtx *high_in2)
> +{
> +  *low_dest = gen_reg_rtx (DImode);
> +  *low_in1 = gen_lowpart (DImode, op1);
> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *high_dest = gen_reg_rtx (DImode);
> +  *high_in1 = gen_highpart (DImode, op1);
> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				   subreg_highpart_offset (DImode, TImode)); }
> +
> +void aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx 
> +*low_dest,

Same here.

> +				       rtx *low_in1, rtx *low_in2,
> +				       rtx *high_dest, rtx *high_in1,
> +				       rtx *high_in2)
> +{
> +  *low_dest = gen_reg_rtx (DImode);
> +  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				  subreg_lowpart_offset (DImode, TImode));
> +  *high_dest = gen_reg_rtx (DImode);
> +  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
> +				   subreg_highpart_offset (DImode, TImode));
> +  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
> +				   subreg_highpart_offset (DImode, TImode));
> +
> +}
> +
> +void aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
And here.

> +			    rtx low_in2, rtx high_dest, rtx high_in1,
> +			    rtx high_in2)
> +{
> +  if (low_in2 == const0_rtx)
> +    {
> +      low_dest = low_in1;
> +      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
> +				      force_reg (DImode, high_in2)));
> +    }
> +  else
> +    {
> +      if (CONST_INT_P (low_in2))
> +	{
> +	  low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
> +	  high_in2 = force_reg (DImode, high_in2);
> +	  emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
> +	}
> +      else
> +	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
> +      emit_insn (gen_subdi3_carryinCV (high_dest,
> +				       force_reg (DImode, high_in1),
> +				       high_in2));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, op0), low_dest);  
> + emit_move_insn (gen_highpart (DImode, op0), high_dest);
> +
> +}
> +
>  /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
>  
>  static unsigned HOST_WIDE_INT
> diff --git a/gcc/config/aarch64/aarch64.md 
> b/gcc/config/aarch64/aarch64.md index a693a3b..3976ecb 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1711,25 +1711,123 @@
>    }
>  )
>  
> +(define_expand "addv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "register_operand")
> +   (match_operand:GPI 2 "register_operand")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +(define_expand "uaddv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "register_operand")
> +   (match_operand:GPI 2 "register_operand")
> +   (match_operand 3 "")]

With no rtl in the expand to describe this pattern, it really should have a top-level comment explaining the arguments (reference to the manual is probably OK in this case).

> +  ""
> +{
> +  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +
>  (define_expand "addti3"
>    [(set (match_operand:TI 0 "register_operand" "")
>  	(plus:TI (match_operand:TI 1 "register_operand" "")
> -		 (match_operand:TI 2 "register_operand" "")))]
> +		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
>    ""
>  {
> -  rtx low = gen_reg_rtx (DImode);
> -  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
> -				  gen_lowpart (DImode, operands[2])));
> +  rtx l0,l1,l2,h0,h1,h2;
>  
> -  rtx high = gen_reg_rtx (DImode);
> -  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
> -				 gen_highpart (DImode, operands[2])));
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      if (!aarch64_pluslong_operand (h2, DImode))
> +	h2 = force_reg (DImode, h2);
> +      emit_insn (gen_adddi3 (h0, h1, h2));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
>  
> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
>    DONE;
>  })
>  
> +(define_expand "addvti4"
> +  [(match_operand:TI 0 "register_operand" "")
> +   (match_operand:TI 1 "register_operand" "")
> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
> +   (match_operand 3 "")]

Same here.

> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +  DONE;
> +})
> +
> +(define_expand "uaddvti4"
> +  [(match_operand:TI 0 "register_operand" "")
> +   (match_operand:TI 1 "register_operand" "")
> +   (match_operand:TI 2 "aarch64_reg_or_imm" "")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
> +				   &l0, &l1, &l2, &h0, &h1, &h2);
> +
> +  if (l2 == const0_rtx)
> +    {
> +      l0 = l1;
> +      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
> +    }
> +  else
> +    {
> +      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
> +      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
> +    }
> +
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);  
> + emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);  DONE;
> + })
> +
>  (define_insn "add<mode>3_compare0"
>    [(set (reg:CC_NZ CC_REGNUM)
>  	(compare:CC_NZ
> @@ -1828,10 +1926,70 @@
>    [(set_attr "type" "alus_sreg")]
>  )
>  
> +;; Note that since we're sign-extending, match the immediate in GPI 
> +;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
> +(define_insn "*add<mode>3_compareV_cconly_imm"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
> +	    (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
> +  ""
> +  "@
> +  cmn\\t%<w>0, %<w>1
> +  cmp\\t%<w>0, #%n1"
> +  [(set_attr "type" "alus_imm")]
> +)
> +
> +(define_insn "*add<mode>3_compareV_cconly"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V

Use of ne is wrong here.  The condition register should be set to the result of a compare rtl construct.  The same applies elsewhere within this patch.  NE is then used on the result of the comparison.  The mode of the compare then indicates what might or might not be valid in the way the comparison is finally constructed.

Note that this issue may go back to the earlier patches that this is based on, but those are equally incorrect and wil need fixing as well at some point.  We shouldn't prepetuate the issue.

> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
> +  ""
> +  "cmn\\t%<w>0, %<w>1"
> +  [(set_attr "type" "alus_sreg")]
> +)
> +
> +(define_insn "*add<mode>3_compareV_imm"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 1 "register_operand" "r,r"))
> +	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI (match_dup 1) (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
> +	(plus:GPI (match_dup 1) (match_dup 2)))]
> +   ""
> +   "@
> +   adds\\t%<w>0, %<w>1, %<w>2
> +   subs\\t%<w>0, %<w>1, #%n2"
> +  [(set_attr "type" "alus_imm,alus_imm")]
> +)
> +
> +(define_insn "add<mode>3_compareV"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
> +	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(plus:GPI (match_dup 1) (match_dup 2)))]
> +  ""
> +  "adds\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "alus_sreg")]
> +)
> +
>  (define_insn "*adds_shift_imm_<mode>"
>    [(set (reg:CC_NZ CC_REGNUM)
>  	(compare:CC_NZ
> -	 (plus:GPI (ASHIFT:GPI 
> +	 (plus:GPI (ASHIFT:GPI
>  		    (match_operand:GPI 1 "register_operand" "r")
>  		    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
>  		   (match_operand:GPI 3 "register_operand" "r")) @@ -2187,6 
> +2345,138 @@
>    [(set_attr "type" "adc_reg")]
>  )
>  
> +(define_expand "add<mode>3_carryinC"
> +  [(parallel
> +     [(set (match_dup 3)
> +	   (ne:CC_C
> +	     (plus:<DWI>
> +	       (plus:<DWI>
> +		 (match_dup 4)
> +		 (zero_extend:<DWI>
> +		   (match_operand:GPI 1 "register_operand" "r")))
> +	       (zero_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r")))
> +	   (zero_extend:<DWI>
> +	     (plus:GPI
> +	       (plus:GPI (match_dup 5) (match_dup 1))
> +	       (match_dup 2)))))
> +      (set (match_operand:GPI 0 "register_operand")
> +	   (plus:GPI
> +	     (plus:GPI (match_dup 5) (match_dup 1))
> +	     (match_dup 2)))])]
> +   ""
> +{
> +  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> +  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
> +  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
> +})
> +
> +(define_insn "*add<mode>3_carryinC_zero"
> +  [(set (reg:CC_C CC_REGNUM)
> +	(ne:CC_C
> +	  (plus:<DWI>
> +	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
> +	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (zero_extend:<DWI>
> +	    (plus:GPI
> +	      (match_operand:GPI 3 "aarch64_carry_operation" "")
> +	      (match_dup 1)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI (match_dup 3) (match_dup 1)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*add<mode>3_carryinC"
> +  [(set (reg:CC_C CC_REGNUM)
> +	(ne:CC_C
> +	  (plus:<DWI>
> +	    (plus:<DWI>
> +	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
> +	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (zero_extend:<DWI>
> +	    (plus:GPI
> +	      (plus:GPI
> +		(match_operand:GPI 4 "aarch64_carry_operation" "")
> +		(match_dup 1))
> +	      (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI
> +	  (plus:GPI (match_dup 4) (match_dup 1))
> +	  (match_dup 2)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_expand "add<mode>3_carryinV"
> +  [(parallel
> +     [(set (reg:CC_V CC_REGNUM)
> +	   (ne:CC_V
> +	     (plus:<DWI>
> +	       (plus:<DWI>
> +		 (match_dup 3)
> +		 (sign_extend:<DWI>
> +		   (match_operand:GPI 1 "register_operand" "r")))
> +	       (sign_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r")))
> +	   (sign_extend:<DWI>
> +	     (plus:GPI
> +	       (plus:GPI (match_dup 4) (match_dup 1))
> +	       (match_dup 2)))))
> +      (set (match_operand:GPI 0 "register_operand")
> +	   (plus:GPI
> +	     (plus:GPI (match_dup 4) (match_dup 1))
> +	     (match_dup 2)))])]
> +   ""
> +{
> +  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> +  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
> +  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
> +})
> +
> +(define_insn "*add<mode>3_carryinV_zero"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
> +	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI
> +	      (match_operand:GPI 3 "aarch64_carry_operation" "")
> +	      (match_dup 1)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI (match_dup 3) (match_dup 1)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*add<mode>3_carryinV"
> +  [(set (reg:CC_V CC_REGNUM)
> +	(ne:CC_V
> +	  (plus:<DWI>
> +	    (plus:<DWI>
> +	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
> +	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
> +	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
> +	  (sign_extend:<DWI>
> +	    (plus:GPI
> +	      (plus:GPI
> +		(match_operand:GPI 4 "aarch64_carry_operation" "")
> +		(match_dup 1))
> +	      (match_dup 2)))))
> +   (set (match_operand:GPI 0 "register_operand")
> +	(plus:GPI
> +	  (plus:GPI (match_dup 4) (match_dup 1))
> +	  (match_dup 2)))]
> +   ""
> +   "adcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
>  (define_insn "*add_uxt<mode>_shift2"
>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>  	(plus:GPI (and:GPI
> @@ -2283,22 +2573,86 @@
>     (set_attr "simd" "*,yes")]
>  )
>  
> +(define_expand "subv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +
> +  DONE;
> +})
> +
> +(define_expand "usubv<mode>4"
> +  [(match_operand:GPI 0 "register_operand")
> +   (match_operand:GPI 1 "aarch64_reg_or_zero")
> +   (match_operand:GPI 2 "aarch64_reg_or_zero")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], 
> +operands[2]));
> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
> +
> +  DONE;
> +})
> +
>  (define_expand "subti3"
>    [(set (match_operand:TI 0 "register_operand" "")
> -	(minus:TI (match_operand:TI 1 "register_operand" "")
> +	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
>  		  (match_operand:TI 2 "register_operand" "")))]
>    ""
>  {
> -  rtx low = gen_reg_rtx (DImode);
> -  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
> -				  gen_lowpart (DImode, operands[2])));
> +  rtx l0 = gen_reg_rtx (DImode);
> +  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
> +				subreg_lowpart_offset (DImode, TImode));
> +  rtx l2 = gen_lowpart (DImode, operands[2]);
> +  rtx h0 = gen_reg_rtx (DImode);
> +  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
> +				subreg_highpart_offset (DImode, TImode));
> +  rtx h2 = gen_highpart (DImode, operands[2]);
>  
> -  rtx high = gen_reg_rtx (DImode);
> -  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
> -				 gen_highpart (DImode, operands[2])));
> +  emit_insn (gen_subdi3_compare1 (l0, l1, l2));  emit_insn 
> + (gen_subdi3_carryin (h0, h1, h2));
>  
> -  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
> -  emit_move_insn (gen_highpart (DImode, operands[0]), high);
> +  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
> +  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
> +  DONE;
> +})
> +
> +(define_expand "subvti4"
> +  [(match_operand:TI 0 "register_operand")
> +   (match_operand:TI 1 "aarch64_reg_or_zero")
> +   (match_operand:TI 2 "aarch64_reg_or_imm")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
> +				    &l0, &l1, &l2, &h0, &h1, &h2);
> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
> +
> +  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
> +  DONE;
> +})
> +
> +(define_expand "usubvti4"
> +  [(match_operand:TI 0 "register_operand")
> +   (match_operand:TI 1 "aarch64_reg_or_zero")
> +   (match_operand:TI 2 "aarch64_reg_or_imm")
> +   (match_operand 3 "")]
> +  ""
> +{
> +  rtx l0,l1,l2,h0,h1,h2;
> +
> +  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
> +				    &l0, &l1, &l2, &h0, &h1, &h2);
> +  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
> +
> +  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
>    DONE;
>  })
>  
> @@ -2327,6 +2681,22 @@
>    [(set_attr "type" "alus_sreg")]
>  )
>  
> +(define_insn "*sub<mode>3_compare1_imm"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
> +	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
> +   (set (match_operand:GPI 0 "register_operand" "=r,r")
> +	(plus:GPI
> +	  (match_dup 1)
> +	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
> +  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
> +  "@
> +  subs\\t%<w>0, %<w>1, %<w>2
> +  adds\\t%<w>0, %<w>1, %<w>3"
> +  [(set_attr "type" "alus_imm")]
> +)
> +
>  (define_insn "sub<mode>3_compare1"
>    [(set (reg:CC CC_REGNUM)
>  	(compare:CC
> @@ -2554,6 +2924,85 @@
>    [(set_attr "type" "adc_reg")]
>  )
>  
> +(define_expand "sub<mode>3_carryinCV"
> +  [(parallel
> +     [(set (reg:CC CC_REGNUM)
> +	   (compare:CC
> +	     (sign_extend:<DWI>
> +	       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
> +	     (plus:<DWI>
> +	       (sign_extend:<DWI>
> +		 (match_operand:GPI 2 "register_operand" "r"))
> +	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
> +      (set (match_operand:GPI 0 "register_operand" "=r")
> +	   (minus:GPI
> +	     (minus:GPI (match_dup 1) (match_dup 2))
> +	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
> +   ""
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z1_z2"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (const_int 0)
> +	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, <w>zr, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z1"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (const_int 0)
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 1 "register_operand" "r"))
> +	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (neg:GPI (match_dup 1))
> +	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, <w>zr, %<w>1"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV_z2"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (sign_extend:<DWI>
> +	    (match_operand:GPI 1 "register_operand" "r"))
> +	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (match_dup 1)
> +	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, %<w>1, <w>zr"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
> +(define_insn "*sub<mode>3_carryinCV"
> +  [(set (reg:CC CC_REGNUM)
> +	(compare:CC
> +	  (sign_extend:<DWI>
> +	    (match_operand:GPI 1 "register_operand" "r"))
> +	  (plus:<DWI>
> +	    (sign_extend:<DWI>
> +	      (match_operand:GPI 2 "register_operand" "r"))
> +	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +	(minus:GPI
> +	  (minus:GPI (match_dup 1) (match_dup 2))
> +	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
> +   ""
> +   "sbcs\\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "adc_reg")]
> +)
> +
>  (define_insn "*sub_uxt<mode>_shift2"
>    [(set (match_operand:GPI 0 "register_operand" "=rk")
>  	(minus:GPI (match_operand:GPI 4 "register_operand" "rk") diff --git 
> a/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
> new file mode 100644
> index 0000000..0b31500
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +__int128 overflow_add (__int128 x, __int128 y) {
> +  __int128 r;
> +
> +  int ovr = __builtin_add_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +/* { dg-final { scan-assembler "adcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
> new file mode 100644
> index 0000000..9768a98
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long overflow_add (long x, long y)
> +{
> +  long r;
> +
> +  int ovr = __builtin_saddl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
> new file mode 100644
> index 0000000..126a526
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long long overflow_add (long long x, long long y) {
> +  long long r;
> +
> +  int ovr = __builtin_saddll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
> new file mode 100644
> index 0000000..c1261e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +__int128 overflow_sub (__int128 x, __int128 y) {
> +  __int128 r;
> +
> +  int ovr = __builtin_sub_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +/* { dg-final { scan-assembler "sbcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
> new file mode 100644
> index 0000000..1040464
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long overflow_sub (long x, long y)
> +{
> +  long r;
> +
> +  int ovr = __builtin_ssubl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
> new file mode 100644
> index 0000000..a03df88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +long long overflow_sub (long long x, long long y) {
> +  long long r;
> +
> +  int ovr = __builtin_ssubll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
> new file mode 100644
> index 0000000..c573c2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned __int128 overflow_add (unsigned __int128 x, unsigned 
> +__int128 y) {
> +  unsigned __int128 r;
> +
> +  int ovr = __builtin_add_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +/* { dg-final { scan-assembler "adcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
> new file mode 100644
> index 0000000..e325591
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long overflow_add (unsigned long x, unsigned long y) {
> +  unsigned long r;
> +
> +  int ovr = __builtin_uaddl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
> new file mode 100644
> index 0000000..5f42886
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long long overflow_add (unsigned long long x, unsigned long 
> +long y) {
> +  unsigned long long r;
> +
> +  int ovr = __builtin_uaddll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "adds" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
> new file mode 100644
> index 0000000..a84f4a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned __int128 overflow_sub (unsigned __int128 x, unsigned 
> +__int128 y) {
> +  unsigned __int128 r;
> +
> +  int ovr = __builtin_sub_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +/* { dg-final { scan-assembler "sbcs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
> new file mode 100644
> index 0000000..ed033da
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long overflow_sub (unsigned long x, unsigned long y) {
> +  unsigned long r;
> +
> +  int ovr = __builtin_usubl_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c 
> b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
> new file mode 100644
> index 0000000..a742f0c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" }  */
> +
> +extern void overflow_handler ();
> +
> +unsigned long long overflow_sub (unsigned long long x, unsigned long 
> +long y) {
> +  unsigned long long r;
> +
> +  int ovr = __builtin_usubll_overflow (x, y, &r);  if (ovr)
> +    overflow_handler ();
> +
> +  return r;
> +}
> +
> +/* { dg-final { scan-assembler "subs" } } */
> +
> 


[-- Attachment #2: gnutools-6308-v4.patch --]
[-- Type: application/octet-stream, Size: 30944 bytes --]

diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 45f7a44..244e490 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
 CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
 CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
 CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
+CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
 
 /* Half-precision floating point for __fp16.  */
 FLOAT_MODE (HF, 2, 0);
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e397ff4..b67d4d1 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -407,6 +407,16 @@ void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 bool aarch64_return_address_signing_enabled (void);
 void aarch64_save_restore_target_globals (tree);
+void aarch64_add_128bit_scratch_regs (rtx, rtx, rtx *,
+				      rtx *, rtx *,
+				      rtx *, rtx *,
+				      rtx *);
+void aarch64_subv_128bit_scratch_regs (rtx, rtx, rtx *,
+				       rtx *, rtx *,
+				       rtx *, rtx *, rtx *);
+void aarch64_expand_subvti (rtx, rtx, rtx,
+			    rtx, rtx, rtx, rtx);
+
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
@@ -431,6 +441,8 @@ bool aarch64_float_const_representable_p (rtx);
 
 #if defined (RTX_CODE)
 
+void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode,
+				   rtx label_ref);
 bool aarch64_legitimate_address_p (machine_mode, rtx, RTX_CODE, bool);
 machine_mode aarch64_select_cc_mode (RTX_CODE, rtx, rtx);
 rtx aarch64_gen_compare_reg (RTX_CODE, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ef1b5a8..035543e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4882,6 +4882,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
       && GET_CODE (y) == ZERO_EXTEND)
     return CC_Cmode;
 
+  /* A test for signed overflow.  */
+  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
+      && code == NE
+      && GET_CODE (x) == PLUS
+      && GET_CODE (y) == SIGN_EXTEND)
+    return CC_Vmode;
+
   /* For everything else, return CCmode.  */
   return CCmode;
 }
@@ -4988,6 +4995,15 @@ aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
 	}
       break;
 
+    case CC_Vmode:
+      switch (comp_code)
+	{
+	case NE: return AARCH64_VS;
+	case EQ: return AARCH64_VC;
+	default: return -1;
+	}
+      break;
+
     default:
       return -1;
     }
@@ -13969,6 +13985,97 @@ aarch64_split_dimode_const_store (rtx dst, rtx src)
   return true;
 }
 
+/* Generate RTL for a conditional branch with rtx comparison CODE in
+   mode CC_MODE.  The destination of the unlikely conditional branch
+   is LABEL_REF.  */
+
+void
+aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
+			      rtx label_ref)
+{
+  rtx x;
+  x = gen_rtx_fmt_ee (code, VOIDmode,
+		      gen_rtx_REG (cc_mode, CC_REGNUM),
+		      const0_rtx);
+
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
+			    pc_rtx);
+  aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
+}
+
+/* Generate DImode scratch registers for 128-bit (TImode) addition.  */
+
+void
+aarch64_add_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				 rtx *low_in1, rtx *low_in2,
+				 rtx *high_dest, rtx *high_in1,
+				 rtx *high_in2)
+{
+  *low_dest = gen_reg_rtx (DImode);
+  *low_in1 = gen_lowpart (DImode, op1);
+  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *high_dest = gen_reg_rtx (DImode);
+  *high_in1 = gen_highpart (DImode, op1);
+  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+}
+
+/* Generate DImode scratch registers for 128-bit (TImode) subtraction.  */
+
+void
+aarch64_subv_128bit_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
+				  rtx *low_in1, rtx *low_in2,
+				  rtx *high_dest, rtx *high_in1,
+				  rtx *high_in2)
+{
+  *low_dest = gen_reg_rtx (DImode);
+  *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				  subreg_lowpart_offset (DImode, TImode));
+  *high_dest = gen_reg_rtx (DImode);
+  *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+  *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
+				   subreg_highpart_offset (DImode, TImode));
+
+}
+
+/* Generate RTL for 128-bit (TImode) subtraction.  */
+
+void
+aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
+		       rtx low_in2, rtx high_dest, rtx high_in1,
+		       rtx high_in2)
+{
+  if (low_in2 == const0_rtx)
+    {
+      low_dest = low_in1;
+      emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
+				      force_reg (DImode, high_in2)));
+    }
+  else
+    {
+      if (CONST_INT_P (low_in2))
+	{
+	  low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
+	  high_in2 = force_reg (DImode, high_in2);
+	  emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
+	}
+      else
+	emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
+      emit_insn (gen_subdi3_carryinCV (high_dest,
+				       force_reg (DImode, high_in1),
+				       high_in2));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, op0), low_dest);
+  emit_move_insn (gen_highpart (DImode, op0), high_dest);
+
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index e6e7e64..fda7a74 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1606,25 +1606,135 @@
   }
 )
 
+;; This pattern is used to implement the built-in function implementing signed
+;; integer addition with overflow checking for SImode and DImode
+
+(define_expand "addv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+;; This pattern is used to implement the built-in functions implementing unsigned
+;; integer addition with overflow checking for SImode and DImode
+
+(define_expand "uaddv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+
+  DONE;
+})
+
+
 (define_expand "addti3"
   [(set (match_operand:TI 0 "register_operand" "")
 	(plus:TI (match_operand:TI 1 "register_operand" "")
-		 (match_operand:TI 2 "register_operand" "")))]
+		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0,l1,l2,h0,h1,h2;
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      if (!aarch64_pluslong_operand (h2, DImode))
+	h2 = force_reg (DImode, h2);
+      emit_insn (gen_adddi3 (h0, h1, h2));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
   DONE;
 })
 
+;; This pattern is used to implement the built-in function implementing signed
+;; integer addition with overflow checking for TImode
+
+(define_expand "addvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+  DONE;
+})
+
+;; This pattern is used to implement the built-in function implementing unsigned
+;; integer addition with overflow checking for TImode
+
+(define_expand "uaddvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_add_128bit_scratch_regs (operands[1], operands[2],
+				   &l0, &l1, &l2, &h0, &h1, &h2);
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Cmode, operands[3]);
+  DONE;
+ })
+
 (define_insn "add<mode>3_compare0"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
@@ -1710,7 +1820,7 @@
 
 (define_insn "add<mode>3_compareC"
   [(set (reg:CC_C CC_REGNUM)
-	(ne:CC_C
+	(compare:CC_C
 	  (plus:<DWI>
 	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
 	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
@@ -1723,10 +1833,70 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+;; Note that since we're sign-extending, match the immediate in GPI
+;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
+(define_insn "*add<mode>3_compareV_cconly_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
+	    (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "@
+  cmn\\t%<w>0, %<w>1
+  cmp\\t%<w>0, #%n1"
+  [(set_attr "type" "alus_imm")]
+)
+
+(define_insn "*add<mode>3_compareV_cconly"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "cmn\\t%<w>0, %<w>1"
+  [(set_attr "type" "alus_sreg")]
+)
+
+(define_insn "*add<mode>3_compareV_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r,r"))
+	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI>
+	    (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+   ""
+   "@
+   adds\\t%<w>0, %<w>1, %<w>2
+   subs\\t%<w>0, %<w>1, #%n2"
+  [(set_attr "type" "alus_imm,alus_imm")]
+)
+
+(define_insn "add<mode>3_compareV"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+  ""
+  "adds\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "alus_sreg")]
+)
+
 (define_insn "*adds_shift_imm_<mode>"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
-	 (plus:GPI (ASHIFT:GPI 
+	 (plus:GPI (ASHIFT:GPI
 		    (match_operand:GPI 1 "register_operand" "r")
 		    (match_operand:QI 2 "aarch64_shift_imm_<mode>" "n"))
 		   (match_operand:GPI 3 "register_operand" "r"))
@@ -2082,6 +2252,138 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "add<mode>3_carryinC"
+  [(parallel
+     [(set (match_dup 3)
+	   (compare:CC_C
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 4)
+		 (zero_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (zero_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (zero_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 5) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 5) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
+  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinC_zero"
+  [(set (reg:CC_C CC_REGNUM)
+	(compare:CC_C
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinC"
+  [(set (reg:CC_C CC_REGNUM)
+	(compare:CC_C
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_expand "add<mode>3_carryinV"
+  [(parallel
+     [(set (reg:CC_V CC_REGNUM)
+	   (compare:CC_V
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 3)
+		 (sign_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (sign_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 4) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 4) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
+  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinV_zero"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinV"
+  [(set (reg:CC_V CC_REGNUM)
+	(compare:CC_V
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*add_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(plus:GPI (and:GPI
@@ -2178,22 +2480,98 @@
    (set_attr "simd" "*,yes")]
 )
 
+;; This pattern is used to implement the built-in function implementing signed
+;; integer subtraction with overflow checking for SImode and DImode
+
+(define_expand "subv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+
+  DONE;
+})
+
+;; This pattern is used to implement the built-in function implementing unsigned
+;; integer subtraction with overflow checking for SImode and DImode
+
+(define_expand "usubv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
+
+  DONE;
+})
+
 (define_expand "subti3"
   [(set (match_operand:TI 0 "register_operand" "")
-	(minus:TI (match_operand:TI 1 "register_operand" "")
+	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
 		  (match_operand:TI 2 "register_operand" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx l2 = gen_lowpart (DImode, operands[2]);
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_highpart_offset (DImode, TImode));
+  rtx h2 = gen_highpart (DImode, operands[2]);
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  emit_insn (gen_subdi3_compare1 (l0, l1, l2));
+  emit_insn (gen_subdi3_carryin (h0, h1, h2));
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+  DONE;
+})
+
+;; This pattern is used to implement the built-in function implementing signed
+;; integer subtraction with overflow checking for TImode
+
+(define_expand "subvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
+				    &l0, &l1, &l2, &h0, &h1, &h2);
+  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
+
+  aarch64_gen_unlikely_cbranch (NE, CC_Vmode, operands[3]);
+  DONE;
+})
+
+;; This pattern is used to implement the built-in function implementing unsigned
+;; integer subtraction with overflow checking for TImode
+
+(define_expand "usubvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0,l1,l2,h0,h1,h2;
+
+  aarch64_subv_128bit_scratch_regs (operands[1], operands[2],
+				    &l0, &l1, &l2, &h0, &h1, &h2);
+  aarch64_expand_subvti (operands[0], l0, l1, l2, h0, h1, h2);
+
+  aarch64_gen_unlikely_cbranch (LTU, CCmode, operands[3]);
   DONE;
 })
 
@@ -2222,6 +2600,22 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+(define_insn "*sub<mode>3_compare1_imm"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
+	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
+  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
+  "@
+  subs\\t%<w>0, %<w>1, #%n3
+  adds\\t%<w>0, %<w>1, %3"
+  [(set_attr "type" "alus_imm")]
+)
+
 (define_insn "sub<mode>3_compare1"
   [(set (reg:CC CC_REGNUM)
 	(compare:CC
@@ -2498,6 +2892,85 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "sub<mode>3_carryinCV"
+  [(parallel
+     [(set (reg:CC CC_REGNUM)
+	   (compare:CC
+	     (sign_extend:<DWI>
+	       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
+	     (plus:<DWI>
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r"))
+	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
+      (set (match_operand:GPI 0 "register_operand" "=r")
+	   (minus:GPI
+	     (minus:GPI (match_dup 1) (match_dup 2))
+	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
+   ""
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r"))
+	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (neg:GPI (match_dup 1))
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, %<w>1"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 2 "register_operand" "r"))
+	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (minus:GPI (match_dup 1) (match_dup 2))
+	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*sub_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(minus:GPI (match_operand:GPI 4 "register_operand" "rk")
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
new file mode 100644
index 0000000..0b31500
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_sadd_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+__int128 overflow_add (__int128 x, __int128 y)
+{
+  __int128 r;
+
+  int ovr = __builtin_add_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+/* { dg-final { scan-assembler "adcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
new file mode 100644
index 0000000..9768a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long overflow_add (long x, long y)
+{
+  long r;
+
+  int ovr = __builtin_saddl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
new file mode 100644
index 0000000..126a526
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_saddll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long long overflow_add (long long x, long long y)
+{
+  long long r;
+
+  int ovr = __builtin_saddll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
new file mode 100644
index 0000000..c1261e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssub_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+__int128 overflow_sub (__int128 x, __int128 y)
+{
+  __int128 r;
+
+  int ovr = __builtin_sub_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+/* { dg-final { scan-assembler "sbcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
new file mode 100644
index 0000000..1040464
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long overflow_sub (long x, long y)
+{
+  long r;
+
+  int ovr = __builtin_ssubl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
new file mode 100644
index 0000000..a03df88
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_ssubll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+long long overflow_sub (long long x, long long y)
+{
+  long long r;
+
+  int ovr = __builtin_ssubll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
new file mode 100644
index 0000000..c573c2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uadd_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned __int128 overflow_add (unsigned __int128 x, unsigned __int128 y)
+{
+  unsigned __int128 r;
+
+  int ovr = __builtin_add_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+/* { dg-final { scan-assembler "adcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
new file mode 100644
index 0000000..e325591
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long overflow_add (unsigned long x, unsigned long y)
+{
+  unsigned long r;
+
+  int ovr = __builtin_uaddl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
new file mode 100644
index 0000000..5f42886
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_uaddll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long long overflow_add (unsigned long long x, unsigned long long y)
+{
+  unsigned long long r;
+
+  int ovr = __builtin_uaddll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "adds" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
new file mode 100644
index 0000000..a84f4a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usub_128.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned __int128 overflow_sub (unsigned __int128 x, unsigned __int128 y)
+{
+  unsigned __int128 r;
+
+  int ovr = __builtin_sub_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+/* { dg-final { scan-assembler "sbcs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
new file mode 100644
index 0000000..ed033da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubl.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long overflow_sub (unsigned long x, unsigned long y)
+{
+  unsigned long r;
+
+  int ovr = __builtin_usubl_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
new file mode 100644
index 0000000..a742f0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/builtin_usubll.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" }  */
+
+extern void overflow_handler ();
+
+unsigned long long overflow_sub (unsigned long long x, unsigned long long y)
+{
+  unsigned long long r;
+
+  int ovr = __builtin_usubll_overflow (x, y, &r);
+  if (ovr)
+    overflow_handler ();
+
+  return r;
+}
+
+/* { dg-final { scan-assembler "subs" } } */
+
-- 
1.9.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH][Aarch64] Add support for overflow add and sub operations
@ 2016-11-30 23:06 Michael Collison
  0 siblings, 0 replies; 8+ messages in thread
From: Michael Collison @ 2016-11-30 23:06 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rth, James Greenhalgh

[-- Attachment #1: Type: text/plain, Size: 1915 bytes --]

Hi,

This patch improves code generation for builtin arithmetic overflow operations for the aarch64 backend. As an example, for a simple test case such as:

int
f (int x, int y, int *ovf)
{
  int res;
  *ovf = __builtin_sadd_overflow (x, y, &res);
  return res;
}

Current trunk at -O2 generates

f:
	mov	w3, w0
	mov	w4, 0
	add	w0, w0, w1
	tbnz	w1, #31, .L4
	cmp	w0, w3
	blt	.L3
.L2:
	str	w4, [x2]
	ret
	.p2align 3
.L4:
	cmp	w0, w3
	ble	.L2
.L3:
	mov	w4, 1
	b	.L2


With the patch this now generates:

f:
	adds	w0, w0, w1
	cset	w1, vs
	str	w1, [x2]
	ret

Tested on aarch64-linux-gnu with no regressions. Okay for trunk?


2016-11-30  Michael Collison  <michael.collison@arm.com>
	    Richard Henderson <rth@redhat.com>

	* config/aarch64/aarch64-modes.def (CC_V): New.
	* config/aarch64/aarch64.c (aarch64_select_cc_mode): Test
	for signed overflow using CC_Vmode.
	(aarch64_get_condition_code_1): Handle CC_Vmode.
	* config/aarch64/aarch64.md (addv<GPI>4, uaddv<GPI>4): New.
	(addti3): Create simpler code if low part is already known to be 0.
	(addvti4, uaddvti4): New.
	(*add<GPI>3_compareC_cconly_imm): New.
	(*add<GPI>3_compareC_cconly): New.
	(*add<GPI>3_compareC_imm): New.
	(*add<GPI>3_compareC): Rename from add<GPI>3_compare1; do not
	handle constants within this pattern.
	(*add<GPI>3_compareV_cconly_imm): New.
	(*add<GPI>3_compareV_cconly): New.
	(*add<GPI>3_compareV_imm): New.
	(add<GPI>3_compareV): New.
	(add<GPI>3_carryinC, add<GPI>3_carryinV): New.
	(*add<GPI>3_carryinC_zero, *add<GPI>3_carryinV_zero): New.
	(*add<GPI>3_carryinC, *add<GPI>3_carryinV): New.
	(subv<GPI>4, usubv<GPI>4): New.
	(subti3): Handle op1 zero.
	(subvti4, usubvti4): New.
	(*sub<GPI>3_compare1_imm): New.
	(sub<GPI>3_carryinCV): New.
	(*sub<GPI>3_carryinCV_z1_z2, *sub<GPI>3_carryinCV_z1): New.
	(*sub<GPI>3_carryinCV_z2, *sub<GPI>3_carryinCV): New.

[-- Attachment #2: rth_overflow_ipreview1.patch --]
[-- Type: application/octet-stream, Size: 21909 bytes --]

diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index de8227f..71c2069 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -24,6 +24,7 @@ CC_MODE (CC_SWP);
 CC_MODE (CC_NZ);    /* Only N and Z bits of condition flags are valid.  */
 CC_MODE (CC_Z);     /* Only Z bit of condition flags is valid.  */
 CC_MODE (CC_C);     /* Only C bit of condition flags is valid.  */
+CC_MODE (CC_V);     /* Only V bit of condition flags is valid.  */
 
 /* Half-precision floating point for __fp16.  */
 FLOAT_MODE (HF, 2, 0);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6078b16..e020d24 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -4324,6 +4324,13 @@ aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
       && GET_CODE (y) == ZERO_EXTEND)
     return CC_Cmode;
 
+  /* A test for signed overflow.  */
+  if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
+      && code == NE
+      && GET_CODE (x) == PLUS
+      && GET_CODE (y) == SIGN_EXTEND)
+    return CC_Vmode;
+
   /* For everything else, return CCmode.  */
   return CCmode;
 }
@@ -4430,6 +4437,15 @@ aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
 	}
       break;
 
+    case CC_Vmode:
+      switch (comp_code)
+	{
+	case NE: return AARCH64_VS;
+	case EQ: return AARCH64_VC;
+	default: return -1;
+	}
+      break;
+
     default:
       return -1;
       break;
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6afaf90..a074341 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1720,25 +1720,155 @@
   }
 )
 
+(define_expand "addv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareV (operands[0], operands[1], operands[2]));
+
+  rtx x;
+  x = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_Vmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
+  DONE;
+})
+
+(define_expand "uaddv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "register_operand")
+   (match_operand:GPI 2 "register_operand")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_add<mode>3_compareC (operands[0], operands[1], operands[2]));
+
+  rtx x;
+  x = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_Cmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
+  DONE;
+})
+
+
 (define_expand "addti3"
   [(set (match_operand:TI 0 "register_operand" "")
 	(plus:TI (match_operand:TI 1 "register_operand" "")
-		 (match_operand:TI 2 "register_operand" "")))]
+		 (match_operand:TI 2 "aarch64_reg_or_imm" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_compareC (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = gen_lowpart (DImode, operands[1]);
+  rtx l2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = gen_highpart (DImode, operands[1]);
+  rtx h2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_highpart_offset (DImode, TImode));
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_adddi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      if (!aarch64_pluslong_operand (h2, DImode))
+	h2 = force_reg (DImode, h2);
+      emit_insn (gen_adddi3 (h0, h1, h2));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryin (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
   DONE;
 })
 
+(define_expand "addvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = gen_lowpart (DImode, operands[1]);
+  rtx l2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = gen_highpart (DImode, operands[1]);
+  rtx h2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_highpart_offset (DImode, TImode));
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareV (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinV (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  rtx x;
+  x = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_Vmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
+  DONE;
+})
+
+(define_expand "uaddvti4"
+  [(match_operand:TI 0 "register_operand" "")
+   (match_operand:TI 1 "register_operand" "")
+   (match_operand:TI 2 "aarch64_reg_or_imm" "")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = gen_lowpart (DImode, operands[1]);
+  rtx l2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = gen_highpart (DImode, operands[1]);
+  rtx h2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_highpart_offset (DImode, TImode));
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_adddi3_compareC (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      emit_insn (gen_adddi3_compareC (l0, l1, force_reg (DImode, l2)));
+      emit_insn (gen_adddi3_carryinC (h0, h1, force_reg (DImode, h2)));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  rtx x;
+  x = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_Cmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
+   DONE;
+ })
+
 (define_insn "add<mode>3_compare0"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
@@ -1837,6 +1967,66 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+;; Note that since we're sign-extending, match the immediate in GPI
+;; rather than in DWI.  Since CONST_INT is modeless, this works fine.
+(define_insn "*add<mode>3_compareV_cconly_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r,r"))
+	    (match_operand:GPI 1 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "@
+  cmn\\t%<w>0, %<w>1
+  cmp\\t%<w>0, #%n1"
+  [(set_attr "type" "alus_imm")]
+)
+
+(define_insn "*add<mode>3_compareV_cconly"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 0 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 0) (match_dup 1)))))]
+  ""
+  "cmn\\t%<w>0, %<w>1"
+  [(set_attr "type" "alus_sreg")]
+)
+
+(define_insn "*add<mode>3_compareV_imm"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r,r"))
+	    (match_operand:GPI 2 "aarch64_plus_immediate" "I,J"))
+	  (sign_extend:<DWI>
+	    (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+   ""
+   "@
+   adds\\t%<w>0, %<w>1, %<w>2
+   subs\\t%<w>0, %<w>1, #%n2"
+  [(set_attr "type" "alus_imm,alus_imm")]
+)
+
+(define_insn "add<mode>3_compareV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r"))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI> (plus:GPI (match_dup 1) (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(plus:GPI (match_dup 1) (match_dup 2)))]
+  ""
+  "adds\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "alus_sreg")]
+ )
+
 (define_insn "*adds_shift_imm_<mode>"
   [(set (reg:CC_NZ CC_REGNUM)
 	(compare:CC_NZ
@@ -2196,6 +2386,138 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "add<mode>3_carryinC"
+  [(parallel
+     [(set (match_dup 3)
+	   (ne:CC_C
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 4)
+		 (zero_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (zero_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (zero_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 5) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 5) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  operands[3] = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[4] = gen_rtx_NE (<DWI>mode, operands[3], const0_rtx);
+  operands[5] = gen_rtx_NE (<MODE>mode, operands[3], const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinC_zero"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinC"
+  [(set (reg:CC_C CC_REGNUM)
+	(ne:CC_C
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (zero_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (zero_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (zero_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_expand "add<mode>3_carryinV"
+  [(parallel
+     [(set (reg:CC_V CC_REGNUM)
+	   (ne:CC_V
+	     (plus:<DWI>
+	       (plus:<DWI>
+		 (match_dup 3)
+		 (sign_extend:<DWI>
+		   (match_operand:GPI 1 "register_operand" "r")))
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r")))
+	   (sign_extend:<DWI>
+	     (plus:GPI
+	       (plus:GPI (match_dup 4) (match_dup 1))
+	       (match_dup 2)))))
+      (set (match_operand:GPI 0 "register_operand")
+	   (plus:GPI
+	     (plus:GPI (match_dup 4) (match_dup 1))
+	     (match_dup 2)))])]
+   ""
+{
+  rtx cc = gen_rtx_REG (CC_Cmode, CC_REGNUM);
+  operands[3] = gen_rtx_NE (<DWI>mode, cc, const0_rtx);
+  operands[4] = gen_rtx_NE (<MODE>mode, cc, const0_rtx);
+})
+
+(define_insn "*add<mode>3_carryinV_zero"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (match_operand:<DWI> 2 "aarch64_carry_operation" "")
+	    (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (match_operand:GPI 3 "aarch64_carry_operation" "")
+	      (match_dup 1)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI (match_dup 3) (match_dup 1)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*add<mode>3_carryinV"
+  [(set (reg:CC_V CC_REGNUM)
+	(ne:CC_V
+	  (plus:<DWI>
+	    (plus:<DWI>
+	      (match_operand:<DWI> 3 "aarch64_carry_operation" "")
+	      (sign_extend:<DWI> (match_operand:GPI 1 "register_operand" "r")))
+	    (sign_extend:<DWI> (match_operand:GPI 2 "register_operand" "r")))
+	  (sign_extend:<DWI>
+	    (plus:GPI
+	      (plus:GPI
+		(match_operand:GPI 4 "aarch64_carry_operation" "")
+		(match_dup 1))
+	      (match_dup 2)))))
+   (set (match_operand:GPI 0 "register_operand")
+	(plus:GPI
+	  (plus:GPI (match_dup 4) (match_dup 1))
+	  (match_dup 2)))]
+   ""
+   "adcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*add_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(plus:GPI (and:GPI
@@ -2292,22 +2614,158 @@
    (set_attr "simd" "*,yes")]
 )
 
+(define_expand "subv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+
+  rtx x;
+  x = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_Vmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
+  DONE;
+})
+
+(define_expand "usubv<mode>4"
+  [(match_operand:GPI 0 "register_operand")
+   (match_operand:GPI 1 "aarch64_reg_or_zero")
+   (match_operand:GPI 2 "aarch64_reg_or_zero")
+   (match_operand 3 "")]
+  ""
+{
+  emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1], operands[2]));
+
+  rtx x;
+  x = gen_rtx_LTU (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
+  DONE;
+})
+
 (define_expand "subti3"
   [(set (match_operand:TI 0 "register_operand" "")
-	(minus:TI (match_operand:TI 1 "register_operand" "")
+	(minus:TI (match_operand:TI 1 "aarch64_reg_or_zero" "")
 		  (match_operand:TI 2 "register_operand" "")))]
   ""
 {
-  rtx low = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_compare1 (low, gen_lowpart (DImode, operands[1]),
-				  gen_lowpart (DImode, operands[2])));
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx l2 = gen_lowpart (DImode, operands[2]);
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_highpart_offset (DImode, TImode));
+  rtx h2 = gen_highpart (DImode, operands[2]);
 
-  rtx high = gen_reg_rtx (DImode);
-  emit_insn (gen_subdi3_carryin (high, gen_highpart (DImode, operands[1]),
-				 gen_highpart (DImode, operands[2])));
+  emit_insn (gen_subdi3_compare1 (l0, l1, l2));
+  emit_insn (gen_subdi3_carryin (h0, h1, h2));
 
-  emit_move_insn (gen_lowpart (DImode, operands[0]), low);
-  emit_move_insn (gen_highpart (DImode, operands[0]), high);
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+  DONE;
+})
+
+(define_expand "subvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx l2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_highpart_offset (DImode, TImode));
+  rtx h2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_highpart_offset (DImode, TImode));
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_subdi3_compare1 (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      if (CONST_INT_P (l2))
+	{
+	  l2 = force_reg (DImode, GEN_INT (-UINTVAL (l2)));
+	  h2 = force_reg (DImode, h2);
+	  emit_insn (gen_adddi3_compareC (l0, l1, l2));
+	}
+      else
+	emit_insn (gen_subdi3_compare1 (l0, l1, l2));
+      emit_insn (gen_subdi3_carryinCV (h0, force_reg (DImode, h1), h2));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  rtx x;
+  x = gen_rtx_NE (VOIDmode, gen_rtx_REG (CC_Vmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
+  DONE;
+})
+
+(define_expand "usubvti4"
+  [(match_operand:TI 0 "register_operand")
+   (match_operand:TI 1 "aarch64_reg_or_zero")
+   (match_operand:TI 2 "aarch64_reg_or_imm")
+   (match_operand 3 "")]
+  ""
+{
+  rtx l0 = gen_reg_rtx (DImode);
+  rtx l1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx l2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_lowpart_offset (DImode, TImode));
+  rtx h0 = gen_reg_rtx (DImode);
+  rtx h1 = simplify_gen_subreg (DImode, operands[1], TImode,
+				subreg_highpart_offset (DImode, TImode));
+  rtx h2 = simplify_gen_subreg (DImode, operands[2], TImode,
+				subreg_highpart_offset (DImode, TImode));
+
+  if (l2 == const0_rtx)
+    {
+      l0 = l1;
+      emit_insn (gen_subdi3_compare1 (h0, h1, force_reg (DImode, h2)));
+    }
+  else
+    {
+      if (CONST_INT_P (l2))
+	{
+	  l2 = force_reg (DImode, GEN_INT (-UINTVAL (l2)));
+	  h2 = force_reg (DImode, h2);
+	  emit_insn (gen_adddi3_compareC (l0, l1, l2));
+	}
+      else
+	emit_insn (gen_subdi3_compare1 (l0, l1, l2));
+      emit_insn (gen_subdi3_carryinCV (h0, force_reg (DImode, h1), h2));
+    }
+
+  emit_move_insn (gen_lowpart (DImode, operands[0]), l0);
+  emit_move_insn (gen_highpart (DImode, operands[0]), h0);
+
+  rtx x;
+  x = gen_rtx_LTU (VOIDmode, gen_rtx_REG (CCmode, CC_REGNUM), const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+			    gen_rtx_LABEL_REF (VOIDmode, operands[3]),
+			    pc_rtx);
+  emit_jump_insn (gen_rtx_SET (pc_rtx, x));
   DONE;
 })
 
@@ -2336,6 +2794,22 @@
   [(set_attr "type" "alus_sreg")]
 )
 
+(define_insn "*sub<mode>3_compare1_imm"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ,rZ")
+	  (match_operand:GPI 2 "aarch64_plus_immediate" "I,J")))
+   (set (match_operand:GPI 0 "register_operand" "=r,r")
+	(plus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_plus_immediate" "J,I")))]
+  "UINTVAL (operands[2]) == -UINTVAL (operands[3])"
+  "@
+  subs\\t%<w>0, %<w>1, %<w>2
+  adds\\t%<w>0, %<w>1, %<w>3"
+  [(set_attr "type" "alus_imm")]
+)
+
 (define_insn "sub<mode>3_compare1"
   [(set (reg:CC CC_REGNUM)
 	(compare:CC
@@ -2563,6 +3037,85 @@
   [(set_attr "type" "adc_reg")]
 )
 
+(define_expand "sub<mode>3_carryinCV"
+  [(parallel
+     [(set (reg:CC CC_REGNUM)
+	   (compare:CC
+	     (sign_extend:<DWI>
+	       (match_operand:GPI 1 "aarch64_reg_or_zero" "rZ"))
+	     (plus:<DWI>
+	       (sign_extend:<DWI>
+		 (match_operand:GPI 2 "register_operand" "r"))
+	       (ltu:<DWI> (reg:CC CC_REGNUM) (const_int 0)))))
+      (set (match_operand:GPI 0 "register_operand" "=r")
+	   (minus:GPI
+	     (minus:GPI (match_dup 1) (match_dup 2))
+	     (ltu:GPI (reg:CC CC_REGNUM) (const_int 0))))])]
+   ""
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(neg:GPI (match_operand:GPI 1 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z1"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (const_int 0)
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 1 "register_operand" "r"))
+	    (match_operand:<DWI> 2 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (neg:GPI (match_dup 1))
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, <w>zr, %<w>1"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV_z2"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (match_operand:<DWI> 2 "aarch64_borrow_operation" "")))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (match_dup 1)
+	  (match_operand:GPI 3 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, <w>zr"
+  [(set_attr "type" "adc_reg")]
+)
+
+(define_insn "*sub<mode>3_carryinCV"
+  [(set (reg:CC CC_REGNUM)
+	(compare:CC
+	  (sign_extend:<DWI>
+	    (match_operand:GPI 1 "register_operand" "r"))
+	  (plus:<DWI>
+	    (sign_extend:<DWI>
+	      (match_operand:GPI 2 "register_operand" "r"))
+	    (match_operand:<DWI> 3 "aarch64_borrow_operation" ""))))
+   (set (match_operand:GPI 0 "register_operand" "=r")
+	(minus:GPI
+	  (minus:GPI (match_dup 1) (match_dup 2))
+	  (match_operand:GPI 4 "aarch64_borrow_operation" "")))]
+   ""
+   "sbcs\\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "adc_reg")]
+)
+
 (define_insn "*sub_uxt<mode>_shift2"
   [(set (match_operand:GPI 0 "register_operand" "=rk")
 	(minus:GPI (match_operand:GPI 4 "register_operand" "rk")
-- 
1.9.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2017-08-01  6:33 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-19  6:27 [PATCH][Aarch64] Add support for overflow add and sub operations Michael Collison
2017-05-19 11:00 ` Christophe Lyon
2017-05-19 21:42   ` Michael Collison
2017-07-05  9:38     ` Richard Earnshaw (lists)
2017-07-06  7:29       ` Michael Collison
2017-07-06  8:22         ` Richard Earnshaw (lists)
2017-08-01  6:33       ` Michael Collison
  -- strict thread matches above, loose matches on Subject: below --
2016-11-30 23:06 Michael Collison

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).