public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Overflow-trapping integer arithmetic routines7code
@ 2020-10-05 16:49 Stefan Kanthak
  2020-11-10 15:29 ` Jeff Law
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Kanthak @ 2020-10-05 16:49 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 424 bytes --]

The implementation of the functions __absv?i2(), __addv?i3() etc. for
trapping integer overflow provided in libgcc2.c is rather bad.
Same for __cmp?i2() and __ucmp?i2()

At least for AMD64 and i386 processors GCC creates awful to horrible
code for them: see <https://skanthak.homepage.t-online.de/gcc.html>
for some examples as well as the expected assembly.

The attached diff/patch provides better implementations.

Stefan

[-- Attachment #2: libgcc2.diff --]
[-- Type: application/octet-stream, Size: 10106 bytes --]

--- -/libgcc/libgcc2.c
+++ +/libgcc/libgcc2.c
@@ -75,9 +75,9 @@
 Wtype
 __addvSI3 (Wtype a, Wtype b)
 {
-  const Wtype w = (UWtype) a + (UWtype) b;
+  Wtype w;
 
-  if (b >= 0 ? w < a : w > a)
+  if (__builtin_add_overflow(a, b, &w))
     abort ();
 
   return w;
@@ -86,9 +86,9 @@
 SItype
 __addvsi3 (SItype a, SItype b)
 {
-  const SItype w = (USItype) a + (USItype) b;
+  SItype w;
 
-  if (b >= 0 ? w < a : w > a)
+  if (__builtin_add_overflow(a, b, &w))
     abort ();
 
   return w;
@@ -100,9 +100,9 @@
 DWtype
 __addvDI3 (DWtype a, DWtype b)
 {
-  const DWtype w = (UDWtype) a + (UDWtype) b;
+  DWtype w;
 
-  if (b >= 0 ? w < a : w > a)
+  if (__builtin_add_overflow(a, b, &w))
     abort ();
 
   return w;
@@ -113,9 +113,9 @@
 Wtype
 __subvSI3 (Wtype a, Wtype b)
 {
-  const Wtype w = (UWtype) a - (UWtype) b;
+  Wtype w;
 
-  if (b >= 0 ? w > a : w < a)
+  if (__builtin_sub_overflow(a, b, &w))
     abort ();
 
   return w;
@@ -124,9 +124,9 @@
 SItype
 __subvsi3 (SItype a, SItype b)
 {
-  const SItype w = (USItype) a - (USItype) b;
+  SItype w;
 
-  if (b >= 0 ? w > a : w < a)
+  if (__builtin_sub_overflow(a, b, &w))
     abort ();
 
   return w;
@@ -138,9 +138,9 @@
 DWtype
 __subvDI3 (DWtype a, DWtype b)
 {
-  const DWtype w = (UDWtype) a - (UDWtype) b;
+  DWtype w;
 
-  if (b >= 0 ? w > a : w < a)
+  if (__builtin_sub_overflow(a, b, &w))
     abort ();
 
   return w;
@@ -151,22 +151,20 @@
 Wtype
 __mulvSI3 (Wtype a, Wtype b)
 {
-  const DWtype w = (DWtype) a * (DWtype) b;
+  Wtype w;
 
-  if ((Wtype) (w >> W_TYPE_SIZE) != (Wtype) w >> (W_TYPE_SIZE - 1))
+  if (__builtin_mul_overflow(a, b, &w))
     abort ();
 
   return w;
 }
 #ifdef COMPAT_SIMODE_TRAPPING_ARITHMETIC
-#undef WORD_SIZE
-#define WORD_SIZE (sizeof (SItype) * __CHAR_BIT__)
 SItype
 __mulvsi3 (SItype a, SItype b)
 {
-  const DItype w = (DItype) a * (DItype) b;
+  DItype w;
 
-  if ((SItype) (w >> WORD_SIZE) != (SItype) w >> (WORD_SIZE-1))
+  if (__builtin_mul_overflow(a, b, &w))
     abort ();
 
   return w;
@@ -178,23 +176,23 @@
 Wtype
 __negvSI2 (Wtype a)
 {
-  const Wtype w = -(UWtype) a;
+  Wtype w;
 
-  if (a >= 0 ? w > 0 : w < 0)
+  if (__builtin_sub_overflow(0, a, &w))
     abort ();
 
-   return w;
+  return w;
 }
 #ifdef COMPAT_SIMODE_TRAPPING_ARITHMETIC
 SItype
 __negvsi2 (SItype a)
 {
-  const SItype w = -(USItype) a;
+  SItype w;
 
-  if (a >= 0 ? w > 0 : w < 0)
+  if (__builtin_sub_overflow(0, a, &w))
     abort ();
 
-   return w;
+  return w;
 }
 #endif /* COMPAT_SIMODE_TRAPPING_ARITHMETIC */
 #endif
@@ -203,9 +201,9 @@
 DWtype
 __negvDI2 (DWtype a)
 {
-  const DWtype w = -(UDWtype) a;
+  DWtype w;
 
-  if (a >= 0 ? w > 0 : w < 0)
+  if (__builtin_sub_overflow(0, a, &w))
     abort ();
 
   return w;
@@ -216,37 +214,43 @@
 Wtype
 __absvSI2 (Wtype a)
 {
-  Wtype w = a;
+  const Wtype v = 0 - (a < 0);
+#ifdef LIBGCC2_BAD_CODE
+  Wtype w = a ^ v;
 
-  if (a < 0)
-#ifdef L_negvsi2
-    w = __negvSI2 (a);
+  if (__builtin_sub_overflow(v, w, &w))
+    abort ();
+
+  return w;
 #else
-    w = -(UWtype) a;
+  Wtype w;
 
-  if (w < 0)
+  if (__builtin_add_overflow(a, v, &w))
     abort ();
-#endif
 
-   return w;
+  return v ^ w;
+#endif
 }
 #ifdef COMPAT_SIMODE_TRAPPING_ARITHMETIC
 SItype
 __absvsi2 (SItype a)
 {
-  SItype w = a;
+  const SItype v = 0 - (a < 0);
+#ifdef LIBGCC2_BAD_CODE
+  SItype w = a ^ v;
 
-  if (a < 0)
-#ifdef L_negvsi2
-    w = __negvsi2 (a);
+  if (__builtin_sub_overflow(v, w, &w))
+    abort ();
+
+  return w;
 #else
-    w = -(USItype) a;
+  SItype w;
 
-  if (w < 0)
+  if (__builtin_add_overflow(a, v, &w))
     abort ();
-#endif
 
-   return w;
+  return v ^ w;
+#endif
 }
 #endif /* COMPAT_SIMODE_TRAPPING_ARITHMETIC */
 #endif
@@ -255,144 +259,50 @@
 DWtype
 __absvDI2 (DWtype a)
 {
-  DWtype w = a;
+  const DWtype v = 0 - (a < 0);
+#ifdef LIBGCC2_BAD_CODE
+  DWtype w = a ^ v;
 
-  if (a < 0)
-#ifdef L_negvdi2
-    w = __negvDI2 (a);
+  if (__builtin_sub_overflow(v, w, &w))
+    abort ();
+
+  return w;
 #else
-    w = -(UDWtype) a;
+  DWtype w;
 
-  if (w < 0)
+  if (__builtin_add_overflow(a, v, &w))
     abort ();
-#endif
 
-  return w;
+  return v ^ w;
+#endif
 }
 #endif
 \f
 #ifdef L_mulvdi3
 DWtype
-__mulvDI3 (DWtype u, DWtype v)
+__mulvDI3 (DWtype a, DWtype b)
 {
-  /* The unchecked multiplication needs 3 Wtype x Wtype multiplications,
-     but the checked multiplication needs only two.  */
-  const DWunion uu = {.ll = u};
-  const DWunion vv = {.ll = v};
+#ifdef LIBGCC2_BAD_CODE
+  DWtype w;
 
-  if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1))
-    {
-      /* u fits in a single Wtype.  */
-      if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
-	{
-	  /* v fits in a single Wtype as well.  */
-	  /* A single multiplication.  No overflow risk.  */
-	  return (DWtype) uu.s.low * (DWtype) vv.s.low;
-	}
-      else
-	{
-	  /* Two multiplications.  */
-	  DWunion w0 = {.ll = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.low};
-	  DWunion w1 = {.ll = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.high};
+  if (__builtin_mul_overflow(a, b, &w))
+    abort ();
 
-	  if (vv.s.high < 0)
-	    w1.s.high -= uu.s.low;
-	  if (uu.s.low < 0)
-	    w1.ll -= vv.ll;
-	  w1.ll += (UWtype) w0.s.high;
-	  if (__builtin_expect (w1.s.high == w1.s.low >> (W_TYPE_SIZE - 1), 1))
-	    {
-	      w0.s.high = w1.s.low;
-	      return w0.ll;
-	    }
-	}
-    }
-  else
-    {
-      if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
-	{
-	  /* v fits into a single Wtype.  */
-	  /* Two multiplications.  */
-	  DWunion w0 = {.ll = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.low};
-	  DWunion w1 = {.ll = (UDWtype) (UWtype) uu.s.high
-			* (UDWtype) (UWtype) vv.s.low};
+  return w;
+#else
+  DWtype t;
+  const DWunion u = {.ll = a};
+  const DWunion v = {.ll = b};
+  DWunion w = {.ll = __umulsidi3 (u.s.low, v.s.low)};
 
-	  if (uu.s.high < 0)
-	    w1.s.high -= vv.s.low;
-	  if (vv.s.low < 0)
-	    w1.ll -= uu.ll;
-	  w1.ll += (UWtype) w0.s.high;
-	  if (__builtin_expect (w1.s.high == w1.s.low >> (W_TYPE_SIZE - 1), 1))
-	    {
-	      w0.s.high = w1.s.low;
-	      return w0.ll;
-	    }
-	}
-      else
-	{
-	  /* A few sign checks and a single multiplication.  */
-	  if (uu.s.high >= 0)
-	    {
-	      if (vv.s.high >= 0)
-		{
-		  if (uu.s.high == 0 && vv.s.high == 0)
-		    {
-		      const DWtype w = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.low;
-		      if (__builtin_expect (w >= 0, 1))
-			return w;
-		    }
-		}
-	      else
-		{
-		  if (uu.s.high == 0 && vv.s.high == (Wtype) -1)
-		    {
-		      DWunion ww = {.ll = (UDWtype) (UWtype) uu.s.low
-				    * (UDWtype) (UWtype) vv.s.low};
+  if (__builtin_mul_overflow(u.s.low, v.s.high, &t)
+   || __builtin_add_overflow(t, w.s.high, &w.s.high)
+   || __builtin_mul_overflow(u.s.high, v.s.low, &t)
+   || __builtin_add_overflow(t, w.s.high, &w.s.high))
+    abort ();
 
-		      ww.s.high -= uu.s.low;
-		      if (__builtin_expect (ww.s.high < 0, 1))
-			return ww.ll;
-		    }
-		}
-	    }
-	  else
-	    {
-	      if (vv.s.high >= 0)
-		{
-		  if (uu.s.high == (Wtype) -1 && vv.s.high == 0)
-		    {
-		      DWunion ww = {.ll = (UDWtype) (UWtype) uu.s.low
-				    * (UDWtype) (UWtype) vv.s.low};
-
-		      ww.s.high -= vv.s.low;
-		      if (__builtin_expect (ww.s.high < 0, 1))
-			return ww.ll;
-		    }
-		}
-	      else
-		{
-		  if ((uu.s.high & vv.s.high) == (Wtype) -1
-		      && (uu.s.low | vv.s.low) != 0)
-		    {
-		      DWunion ww = {.ll = (UDWtype) (UWtype) uu.s.low
-				    * (UDWtype) (UWtype) vv.s.low};
-
-		      ww.s.high -= uu.s.low;
-		      ww.s.high -= vv.s.low;
-		      if (__builtin_expect (ww.s.high >= 0, 1))
-			return ww.ll;
-		    }
-		}
-	    }
-	}
-    }
-
-  /* Overflow.  */
-  abort ();
+  return w.ll;
+#endif
 }
 #endif
 \f
@@ -953,7 +863,7 @@
      aligns the divisor under the dividend and then perform number of
      test-subtract iterations which shift the dividend left. Number of
      iterations is k + 1 where k is the number of bit positions the
-     divisor must be shifted left  to align it under the dividend.
+     divisor must be shifted left to align it under the dividend.
      quotient bits can be saved in the rightmost positions of the dividend
      as it shifts left on each test-subtract iteration. */
 
@@ -965,7 +875,7 @@
       k = lz1 - lz2;
       y = (y << k);
 
-      /* Dividend can exceed 2 ^ (width − 1) − 1 but still be less than the
+      /* Dividend can exceed 2 ^ (width - 1) - 1 but still be less than the
 	 aligned divisor. Normal iteration can drops the high order bit
 	 of the dividend. Therefore, first test-subtract iteration is a
 	 special case, saving its quotient bit in a separate location and
@@ -1325,37 +1235,15 @@
 cmp_return_type
 __cmpdi2 (DWtype a, DWtype b)
 {
-  const DWunion au = {.ll = a};
-  const DWunion bu = {.ll = b};
-
-  if (au.s.high < bu.s.high)
-    return 0;
-  else if (au.s.high > bu.s.high)
-    return 2;
-  if ((UWtype) au.s.low < (UWtype) bu.s.low)
-    return 0;
-  else if ((UWtype) au.s.low > (UWtype) bu.s.low)
-    return 2;
-  return 1;
+  return (a > b) - (a < b) + 1;
 }
 #endif
 
 #ifdef L_ucmpdi2
 cmp_return_type
-__ucmpdi2 (DWtype a, DWtype b)
+__ucmpdi2 (UDWtype a, UDWtype b)
 {
-  const DWunion au = {.ll = a};
-  const DWunion bu = {.ll = b};
-
-  if ((UWtype) au.s.high < (UWtype) bu.s.high)
-    return 0;
-  else if ((UWtype) au.s.high > (UWtype) bu.s.high)
-    return 2;
-  if ((UWtype) au.s.low < (UWtype) bu.s.low)
-    return 0;
-  else if ((UWtype) au.s.low > (UWtype) bu.s.low)
-    return 2;
-  return 1;
+  return (a > b) - (a < b) + 1;
 }
 #endif
 \f

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-10-05 16:49 [PATCH] Overflow-trapping integer arithmetic routines7code Stefan Kanthak
@ 2020-11-10 15:29 ` Jeff Law
  2020-11-10 15:57   ` Jakub Jelinek
  2020-11-10 17:21   ` Stefan Kanthak
  0 siblings, 2 replies; 9+ messages in thread
From: Jeff Law @ 2020-11-10 15:29 UTC (permalink / raw)
  To: Stefan Kanthak, gcc-patches


On 10/5/20 10:49 AM, Stefan Kanthak wrote:
> The implementation of the functions __absv?i2(), __addv?i3() etc. for
> trapping integer overflow provided in libgcc2.c is rather bad.
> Same for __cmp?i2() and __ucmp?i2()
>
> At least for AMD64 and i386 processors GCC creates awful to horrible
> code for them: see <https://skanthak.homepage.t-online.de/gcc.html>
> for some examples as well as the expected assembly.
>
> The attached diff/patch provides better implementations.

So you didn't indicate how this was tested.   The addition, subtraction,
negation and mulvsi3 are reasonable.  The mulvsi3 change introduced a
failure on m32r, but I'm highly confident it's an pre-existing simulator
bug.


But the abs changes and the other other multiply changes have undesired
LIBGCC2_BAD_CODE ifdefs that you need to clean up before we could accept
those changes.


What I don't understand is why the [u]cmpdi/cmpdi2 changes don't cause
infinite recursion.   I would expect the comparisons you're using to
compute the return value to themselves trigger calls to [u]cmpdi2.  I
can speculate that the expanders are open-coding the comparison, but in
that case I'd expect the libgcc2 routines to never be used, so
optimizing them is pointless :-)   But I put them through my tester and
couldn't see any evidence that they're causing problems though and
reviewed the generated code on m32r since it was convenient to do so.


Also note the ucmpdi changes require updating the prototype in
libgcc2.h, otherwise you get build failures building gcc itself.  I
fixed that minor omission.


So with all that in mind, I installed everything except the bits which
have the LIBGCC2_BAD_CODE ifdefs after testing on the various crosses. 
If you could remove the ifdefs on the abs/mult changes and resubmit them
it'd be appreciated.

Jeff


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-11-10 15:29 ` Jeff Law
@ 2020-11-10 15:57   ` Jakub Jelinek
  2020-11-10 16:05     ` Jeff Law
  2020-11-10 17:21   ` Stefan Kanthak
  1 sibling, 1 reply; 9+ messages in thread
From: Jakub Jelinek @ 2020-11-10 15:57 UTC (permalink / raw)
  To: Jeff Law; +Cc: Stefan Kanthak, gcc-patches

On Tue, Nov 10, 2020 at 08:29:58AM -0700, Jeff Law via Gcc-patches wrote:
> On 10/5/20 10:49 AM, Stefan Kanthak wrote:
> > The implementation of the functions __absv?i2(), __addv?i3() etc. for
> > trapping integer overflow provided in libgcc2.c is rather bad.
> > Same for __cmp?i2() and __ucmp?i2()
> >
> > At least for AMD64 and i386 processors GCC creates awful to horrible
> > code for them: see <https://skanthak.homepage.t-online.de/gcc.html>
> > for some examples as well as the expected assembly.
> >
> > The attached diff/patch provides better implementations.
> 
> So you didn't indicate how this was tested.   The addition, subtraction,
> negation and mulvsi3 are reasonable.  The mulvsi3 change introduced a
> failure on m32r, but I'm highly confident it's an pre-existing simulator
> bug.

The formatting is wrong too (e.g. no space before ( when calling functions).

> What I don't understand is why the [u]cmpdi/cmpdi2 changes don't cause
> infinite recursion.   I would expect the comparisons you're using to
> compute the return value to themselves trigger calls to [u]cmpdi2.  I
> can speculate that the expanders are open-coding the comparison, but in
> that case I'd expect the libgcc2 routines to never be used, so
> optimizing them is pointless :-)

Yeah, I don't really understand these changes either.

	Jakub


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-11-10 15:57   ` Jakub Jelinek
@ 2020-11-10 16:05     ` Jeff Law
  0 siblings, 0 replies; 9+ messages in thread
From: Jeff Law @ 2020-11-10 16:05 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Stefan Kanthak, gcc-patches


On 11/10/20 8:57 AM, Jakub Jelinek wrote:
> On Tue, Nov 10, 2020 at 08:29:58AM -0700, Jeff Law via Gcc-patches wrote:
>> On 10/5/20 10:49 AM, Stefan Kanthak wrote:
>>> The implementation of the functions __absv?i2(), __addv?i3() etc. for
>>> trapping integer overflow provided in libgcc2.c is rather bad.
>>> Same for __cmp?i2() and __ucmp?i2()
>>>
>>> At least for AMD64 and i386 processors GCC creates awful to horrible
>>> code for them: see <https://skanthak.homepage.t-online.de/gcc.html>
>>> for some examples as well as the expected assembly.
>>>
>>> The attached diff/patch provides better implementations.
>> So you didn't indicate how this was tested.   The addition, subtraction,
>> negation and mulvsi3 are reasonable.  The mulvsi3 change introduced a
>> failure on m32r, but I'm highly confident it's an pre-existing simulator
>> bug.
> The formatting is wrong too (e.g. no space before ( when calling functions).

Ugh.  I fixed that in one tree, but not in the tree I committed from. 
I'll adjust momentarily.

jeff


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-11-10 15:29 ` Jeff Law
  2020-11-10 15:57   ` Jakub Jelinek
@ 2020-11-10 17:21   ` Stefan Kanthak
  2020-11-25  5:48     ` Jeff Law
  1 sibling, 1 reply; 9+ messages in thread
From: Stefan Kanthak @ 2020-11-10 17:21 UTC (permalink / raw)
  To: gcc-patches, Jeff Law

[-- Attachment #1: Type: text/plain, Size: 3369 bytes --]

Jeff Law <law@redhat.com> wrote:

> On 10/5/20 10:49 AM, Stefan Kanthak wrote:
>> The implementation of the functions __absv?i2(), __addv?i3() etc. for
>> trapping integer overflow provided in libgcc2.c is rather bad.
>> Same for __cmp?i2() and __ucmp?i2()
>>
>> At least for AMD64 and i386 processors GCC creates awful to horrible
>> code for them: see <https://skanthak.homepage.t-online.de/gcc.html>
>> for some examples as well as the expected assembly.
>>
>> The attached diff/patch provides better implementations.
> 
> So you didn't indicate how this was tested.

I inspected the assembly generated for i386 and AMD64 and found it to be
correct. I also ran both the current and the patched functions in parallel
and compared their results, without detecting any deviation (except for
the greater speed of the patched functions).

> The addition, subtraction, negation and mulvsi3 are reasonable. The
> mulvsi3 change introduced a failure on m32r, but I'm highly confident
> it's an pre-existing simulator bug.

I only have AMD64 processors, so my tests are limited to these platforms.

> But the abs changes and the other other multiply changes have undesired
> LIBGCC2_BAD_CODE ifdefs that you need to clean up before we could accept
> those changes.

I included these alternative #ifdef'ed implementations only to demonstrate
that GCC generates bad^Wnot the best code there, especially for __mulvDI3(),
which I implemented using __umulsidi3() plus 2 __builtin_mul_overflow() and
2 __builtin_add_overflow() for Wtype instead of just a single
__builtin_mul_overflow() for DWtype. 

> What I don't understand is why the [u]cmpdi/cmpdi2 changes don't cause
> infinite recursion. I would expect the comparisons you're using to
> compute the return value to themselves trigger calls to [u]cmpdi2.

Why should the compiler generate calls to [u]cmpdi2() there?
It doesn't generate calls to the missing functions [u]adddi2() or
[u]subdi2() for addition and subtraction of DWtypes either!

JFTR: this also holds for the __ashlDI3(),  __ashrDI3() and __lshrDI3()
      functions, which should better be written as

__ashldi3 (DWtype u, shift_count_type b)
{
  return u << b;
}

__ashrdi3 (DWtype u, shift_count_type b)
{
  return u >> b;
}

__lshrdi3 (UDWtype u, shift_count_type b)
{
  return u >> b;
}

> I can speculate that the expanders are open-coding the comparison, but
> in that case I'd expect the libgcc2 routines to never be used, so
> optimizing them is pointless :-)

Both are documented, so users might call them directly.
Besides that, their source smells -- nobody with a sane mind would NOT
write (a > b) - (a < b) + 1, what every self-respecting compiler/optimiser
should translate into branch-free code.

> But I put them through my tester and couldn't see any evidence that
> they're causing problems though and reviewed the generated code on m32r
> since it was convenient to do so.
> 
> 
> Also note the ucmpdi changes require updating the prototype in libgcc2.h,
> otherwise you get build failures building gcc itself. I fixed that minor
> omission.

Argh, I missed that indeed.

> So with all that in mind, I installed everything except the bits which
> have the LIBGCC2_BAD_CODE ifdefs after testing on the various crosses. 
> If you could remove the ifdefs on the abs/mult changes and resubmit them
> it'd be appreciated.

Done.

regards
Stefan

[-- Attachment #2: libgcc2.patch --]
[-- Type: application/octet-stream, Size: 9602 bytes --]

--- -/libgcc/libgcc2.c
+++ +/libgcc/libgcc2.c
@@ -75,9 +75,9 @@
 Wtype
 __addvSI3 (Wtype a, Wtype b)
 {
-  const Wtype w = (UWtype) a + (UWtype) b;
+  Wtype w;
 
-  if (b >= 0 ? w < a : w > a)
+  if (__builtin_add_overflow (a, b, &w))
     abort ();
 
   return w;
@@ -86,9 +86,9 @@
 SItype
 __addvsi3 (SItype a, SItype b)
 {
-  const SItype w = (USItype) a + (USItype) b;
+  SItype w;
 
-  if (b >= 0 ? w < a : w > a)
+  if (__builtin_add_overflow (a, b, &w))
     abort ();
 
   return w;
@@ -100,9 +100,9 @@
 DWtype
 __addvDI3 (DWtype a, DWtype b)
 {
-  const DWtype w = (UDWtype) a + (UDWtype) b;
+  DWtype w;
 
-  if (b >= 0 ? w < a : w > a)
+  if (__builtin_add_overflow (a, b, &w))
     abort ();
 
   return w;
@@ -113,9 +113,9 @@
 Wtype
 __subvSI3 (Wtype a, Wtype b)
 {
-  const Wtype w = (UWtype) a - (UWtype) b;
+  Wtype w;
 
-  if (b >= 0 ? w > a : w < a)
+  if (__builtin_sub_overflow (a, b, &w))
     abort ();
 
   return w;
@@ -124,9 +124,9 @@
 SItype
 __subvsi3 (SItype a, SItype b)
 {
-  const SItype w = (USItype) a - (USItype) b;
+  SItype w;
 
-  if (b >= 0 ? w > a : w < a)
+  if (__builtin_sub_overflow (a, b, &w))
     abort ();
 
   return w;
@@ -138,9 +138,9 @@
 DWtype
 __subvDI3 (DWtype a, DWtype b)
 {
-  const DWtype w = (UDWtype) a - (UDWtype) b;
+  DWtype w;
 
-  if (b >= 0 ? w > a : w < a)
+  if (__builtin_sub_overflow (a, b, &w))
     abort ();
 
   return w;
@@ -151,22 +151,20 @@
 Wtype
 __mulvSI3 (Wtype a, Wtype b)
 {
-  const DWtype w = (DWtype) a * (DWtype) b;
+  Wtype w;
 
-  if ((Wtype) (w >> W_TYPE_SIZE) != (Wtype) w >> (W_TYPE_SIZE - 1))
+  if (__builtin_mul_overflow (a, b, &w))
     abort ();
 
   return w;
 }
 #ifdef COMPAT_SIMODE_TRAPPING_ARITHMETIC
-#undef WORD_SIZE
-#define WORD_SIZE (sizeof (SItype) * __CHAR_BIT__)
 SItype
 __mulvsi3 (SItype a, SItype b)
 {
-  const DItype w = (DItype) a * (DItype) b;
+  DItype w;
 
-  if ((SItype) (w >> WORD_SIZE) != (SItype) w >> (WORD_SIZE-1))
+  if (__builtin_mul_overflow (a, b, &w))
     abort ();
 
   return w;
@@ -178,23 +176,23 @@
 Wtype
 __negvSI2 (Wtype a)
 {
-  const Wtype w = -(UWtype) a;
+  Wtype w;
 
-  if (a >= 0 ? w > 0 : w < 0)
+  if (__builtin_sub_overflow (0, a, &w))
     abort ();
 
-   return w;
+  return w;
 }
 #ifdef COMPAT_SIMODE_TRAPPING_ARITHMETIC
 SItype
 __negvsi2 (SItype a)
 {
-  const SItype w = -(USItype) a;
+  SItype w;
 
-  if (a >= 0 ? w > 0 : w < 0)
+  if (__builtin_sub_overflow (0, a, &w))
     abort ();
 
-   return w;
+  return w;
 }
 #endif /* COMPAT_SIMODE_TRAPPING_ARITHMETIC */
 #endif
@@ -203,9 +201,9 @@
 DWtype
 __negvDI2 (DWtype a)
 {
-  const DWtype w = -(UDWtype) a;
+  DWtype w;
 
-  if (a >= 0 ? w > 0 : w < 0)
+  if (__builtin_sub_overflow (0, a, &w))
     abort ();
 
   return w;
@@ -216,37 +214,25 @@
 Wtype
 __absvSI2 (Wtype a)
 {
-  Wtype w = a;
+  const Wtype v = 0 - (a < 0);
+  Wtype w;
 
-  if (a < 0)
-#ifdef L_negvsi2
-    w = __negvSI2 (a);
-#else
-    w = -(UWtype) a;
-
-  if (w < 0)
+  if (__builtin_add_overflow (a, v, &w))
     abort ();
-#endif
 
-   return w;
+  return v ^ w;
 }
 #ifdef COMPAT_SIMODE_TRAPPING_ARITHMETIC
 SItype
 __absvsi2 (SItype a)
 {
-  SItype w = a;
+  const SItype v = 0 - (a < 0);
+  SItype w;
 
-  if (a < 0)
-#ifdef L_negvsi2
-    w = __negvsi2 (a);
-#else
-    w = -(USItype) a;
-
-  if (w < 0)
+  if (__builtin_add_overflow (a, v, &w))
     abort ();
-#endif
 
-   return w;
+  return v ^ w;
 }
 #endif /* COMPAT_SIMODE_TRAPPING_ARITHMETIC */
 #endif
@@ -255,144 +241,32 @@
 DWtype
 __absvDI2 (DWtype a)
 {
-  DWtype w = a;
+  const DWtype v = 0 - (a < 0);
+  DWtype w;
 
-  if (a < 0)
-#ifdef L_negvdi2
-    w = __negvDI2 (a);
-#else
-    w = -(UDWtype) a;
-
-  if (w < 0)
+  if (__builtin_add_overflow (a, v, &w))
     abort ();
-#endif
 
-  return w;
+  return v ^ w;
 }
 #endif
 \f
 #ifdef L_mulvdi3
 DWtype
-__mulvDI3 (DWtype u, DWtype v)
+__mulvDI3 (DWtype a, DWtype b)
 {
-  /* The unchecked multiplication needs 3 Wtype x Wtype multiplications,
-     but the checked multiplication needs only two.  */
-  const DWunion uu = {.ll = u};
-  const DWunion vv = {.ll = v};
+  DWtype t;
+  const DWunion u = {.ll = a};
+  const DWunion v = {.ll = b};
+  DWunion w = {.ll = __umulsidi3 (u.s.low, v.s.low)};
 
-  if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1))
-    {
-      /* u fits in a single Wtype.  */
-      if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
-	{
-	  /* v fits in a single Wtype as well.  */
-	  /* A single multiplication.  No overflow risk.  */
-	  return (DWtype) uu.s.low * (DWtype) vv.s.low;
-	}
-      else
-	{
-	  /* Two multiplications.  */
-	  DWunion w0 = {.ll = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.low};
-	  DWunion w1 = {.ll = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.high};
+  if (__builtin_mul_overflow (u.s.low, v.s.high, &t)
+   || __builtin_add_overflow (t, w.s.high, &w.s.high)
+   || __builtin_mul_overflow (u.s.high, v.s.low, &t)
+   || __builtin_add_overflow (t, w.s.high, &w.s.high))
+    abort ();
 
-	  if (vv.s.high < 0)
-	    w1.s.high -= uu.s.low;
-	  if (uu.s.low < 0)
-	    w1.ll -= vv.ll;
-	  w1.ll += (UWtype) w0.s.high;
-	  if (__builtin_expect (w1.s.high == w1.s.low >> (W_TYPE_SIZE - 1), 1))
-	    {
-	      w0.s.high = w1.s.low;
-	      return w0.ll;
-	    }
-	}
-    }
-  else
-    {
-      if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
-	{
-	  /* v fits into a single Wtype.  */
-	  /* Two multiplications.  */
-	  DWunion w0 = {.ll = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.low};
-	  DWunion w1 = {.ll = (UDWtype) (UWtype) uu.s.high
-			* (UDWtype) (UWtype) vv.s.low};
-
-	  if (uu.s.high < 0)
-	    w1.s.high -= vv.s.low;
-	  if (vv.s.low < 0)
-	    w1.ll -= uu.ll;
-	  w1.ll += (UWtype) w0.s.high;
-	  if (__builtin_expect (w1.s.high == w1.s.low >> (W_TYPE_SIZE - 1), 1))
-	    {
-	      w0.s.high = w1.s.low;
-	      return w0.ll;
-	    }
-	}
-      else
-	{
-	  /* A few sign checks and a single multiplication.  */
-	  if (uu.s.high >= 0)
-	    {
-	      if (vv.s.high >= 0)
-		{
-		  if (uu.s.high == 0 && vv.s.high == 0)
-		    {
-		      const DWtype w = (UDWtype) (UWtype) uu.s.low
-			* (UDWtype) (UWtype) vv.s.low;
-		      if (__builtin_expect (w >= 0, 1))
-			return w;
-		    }
-		}
-	      else
-		{
-		  if (uu.s.high == 0 && vv.s.high == (Wtype) -1)
-		    {
-		      DWunion ww = {.ll = (UDWtype) (UWtype) uu.s.low
-				    * (UDWtype) (UWtype) vv.s.low};
-
-		      ww.s.high -= uu.s.low;
-		      if (__builtin_expect (ww.s.high < 0, 1))
-			return ww.ll;
-		    }
-		}
-	    }
-	  else
-	    {
-	      if (vv.s.high >= 0)
-		{
-		  if (uu.s.high == (Wtype) -1 && vv.s.high == 0)
-		    {
-		      DWunion ww = {.ll = (UDWtype) (UWtype) uu.s.low
-				    * (UDWtype) (UWtype) vv.s.low};
-
-		      ww.s.high -= vv.s.low;
-		      if (__builtin_expect (ww.s.high < 0, 1))
-			return ww.ll;
-		    }
-		}
-	      else
-		{
-		  if ((uu.s.high & vv.s.high) == (Wtype) -1
-		      && (uu.s.low | vv.s.low) != 0)
-		    {
-		      DWunion ww = {.ll = (UDWtype) (UWtype) uu.s.low
-				    * (UDWtype) (UWtype) vv.s.low};
-
-		      ww.s.high -= uu.s.low;
-		      ww.s.high -= vv.s.low;
-		      if (__builtin_expect (ww.s.high >= 0, 1))
-			return ww.ll;
-		    }
-		}
-	    }
-	}
-    }
-
-  /* Overflow.  */
-  abort ();
+  return w.ll;
 }
 #endif
 \f
@@ -953,7 +827,7 @@
      aligns the divisor under the dividend and then perform number of
      test-subtract iterations which shift the dividend left. Number of
      iterations is k + 1 where k is the number of bit positions the
-     divisor must be shifted left  to align it under the dividend.
+     divisor must be shifted left to align it under the dividend.
      quotient bits can be saved in the rightmost positions of the dividend
      as it shifts left on each test-subtract iteration. */
 
@@ -965,7 +839,7 @@
       k = lz1 - lz2;
       y = (y << k);
 
-      /* Dividend can exceed 2 ^ (width − 1) − 1 but still be less than the
+      /* Dividend can exceed 2 ^ (width - 1) - 1 but still be less than the
 	 aligned divisor. Normal iteration can drops the high order bit
 	 of the dividend. Therefore, first test-subtract iteration is a
 	 special case, saving its quotient bit in a separate location and
@@ -1325,37 +1199,15 @@
 cmp_return_type
 __cmpdi2 (DWtype a, DWtype b)
 {
-  const DWunion au = {.ll = a};
-  const DWunion bu = {.ll = b};
-
-  if (au.s.high < bu.s.high)
-    return 0;
-  else if (au.s.high > bu.s.high)
-    return 2;
-  if ((UWtype) au.s.low < (UWtype) bu.s.low)
-    return 0;
-  else if ((UWtype) au.s.low > (UWtype) bu.s.low)
-    return 2;
-  return 1;
+  return (a > b) - (a < b) + 1;
 }
 #endif
 
 #ifdef L_ucmpdi2
 cmp_return_type
-__ucmpdi2 (DWtype a, DWtype b)
+__ucmpdi2 (UDWtype a, UDWtype b)
 {
-  const DWunion au = {.ll = a};
-  const DWunion bu = {.ll = b};
-
-  if ((UWtype) au.s.high < (UWtype) bu.s.high)
-    return 0;
-  else if ((UWtype) au.s.high > (UWtype) bu.s.high)
-    return 2;
-  if ((UWtype) au.s.low < (UWtype) bu.s.low)
-    return 0;
-  else if ((UWtype) au.s.low > (UWtype) bu.s.low)
-    return 2;
-  return 1;
+  return (a > b) - (a < b) + 1;
 }
 #endif
 \f

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-11-10 17:21   ` Stefan Kanthak
@ 2020-11-25  5:48     ` Jeff Law
  2020-11-25 13:18       ` Stefan Kanthak
  0 siblings, 1 reply; 9+ messages in thread
From: Jeff Law @ 2020-11-25  5:48 UTC (permalink / raw)
  To: Stefan Kanthak, gcc-patches



On 11/10/20 10:21 AM, Stefan Kanthak wrote:

>> So with all that in mind, I installed everything except the bits which
>> have the LIBGCC2_BAD_CODE ifdefs after testing on the various crosses. 
>> If you could remove the ifdefs on the abs/mult changes and resubmit them
>> it'd be appreciated.
> Done.
THanks.  I'm doing some testing on the abs changes right now.  They look
pretty reasonable, though they will tend to generate worse code on
targets that don't handle overflow arithmetic and testing all that well.

H8 would be an example that generates more efficient code with the
original implementation.  But I don't think the H8 is terribly
representative of embedded ports these days, though it's likely to get
better at precisely these kinds of scenarios in the near future as
certain modernization patches continue landing.

Something like visium is much better for evaluation as it's got a modern
structure which includes reasonable exposure of carry/overflow.  Not
surprisingly the visium port generates better code with your improved
abs routines.





> libgcc2.patch
>
> --- -/libgcc/libgcc2.c
> +++ +/libgcc/libgcc2.c
>  \f
>  #ifdef L_mulvdi3
>  DWtype
> -__mulvDI3 (DWtype u, DWtype v)
> +__mulvDI3 (DWtype a, DWtype b)
[ ... ]
You've essentially collapsed the function into:

 DWtype
__mulvDI3 (DWtype a, DWtype b)
{
  DWtype t;
  const DWunion u = {.ll = a};
  const DWunion v = {.ll = b};
  DWunion w = {.ll = __umulsidi3 (u.s.low, v.s.low)};

  if (__builtin_mul_overflow (u.s.low, v.s.high, &t)
   || __builtin_add_overflow (t, w.s.high, &w.s.high)
   || __builtin_mul_overflow (u.s.high, v.s.low, &t)
   || __builtin_add_overflow (t, w.s.high, &w.s.high))
    abort ();

  return w.ll
}

The first thing to note is that I believe that using "DWtype" for "t" is
going to inhibit the overflow check for the __builtin_mul_overflow calls
that store their result into "t".  Your inputs are both 32bits and the
output is 64 bits.   Multiplying 2 32bit numbers will not overflow a
64bit result.  As a result those overflow checks are actually removed in
the code we generate for mulvDI3, defeating the entire purpose of using
mulvdi3 instead of muldi3.

Second while the result of multiplying u.s.high with v.s.high can never
change the result in w.ll, it does affect the overall overflow status
that we need to compute.

I think (but have not yet verified) that to fix these problems we need
to change the type of "t" to SItype and add a check like:

  if (u.s.high && v.s.high)
    abort ();

Also note that your approach always does 3 multiplies, which can be very
expensive on some architectures.  The existing version in libgcc2.c will
often just do one or two multiplies.  So while your implementation looks
a lot simpler, I suspect its often much slower.  And on targets without
32bit multiplication support, it's probably horribly bad.

My inclination is to leave the overflow checking double-word multiplier
as-is.  Though I guess you could keep the same structure as the existing
implementation which tries to avoid unnecessary multiplies and still use
the __builtin_{add,mul}_overflow to simplify the code a bit less
aggressively.




Jeff


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-11-25  5:48     ` Jeff Law
@ 2020-11-25 13:18       ` Stefan Kanthak
  2020-11-25 18:11         ` Jeff Law
  0 siblings, 1 reply; 9+ messages in thread
From: Stefan Kanthak @ 2020-11-25 13:18 UTC (permalink / raw)
  To: Jeff Law, gcc-patches

Jeff Law <law@redhat.com> wrote:

> On 11/10/20 10:21 AM, Stefan Kanthak wrote:
>
>>> So with all that in mind, I installed everything except the bits which
>>> have the LIBGCC2_BAD_CODE ifdefs after testing on the various crosses.
>>> If you could remove the ifdefs on the abs/mult changes and resubmit them
>>> it'd be appreciated.
>> Done.
> THanks. I'm doing some testing on the abs changes right now. They look
> pretty reasonable, though they will tend to generate worse code on
> targets that don't handle overflow arithmetic and testing all that well.

OTOH the changes yield better code on targets which have a proper
overflow handling, and may benefit from eventual improvements in the
compiler/code generator itself on all targets.

> H8 would be an example that generates more efficient code with the
> original implementation. But I don't think the H8 is terribly
> representative of embedded ports these days, though it's likely to get
> better at precisely these kinds of scenarios in the near future as
> certain modernization patches continue landing.
>
> Something like visium is much better for evaluation as it's got a modern
> structure which includes reasonable exposure of carry/overflow. Not
> surprisingly the visium port generates better code with your improved
> abs routines.
>
>
>
>
>
>> libgcc2.patch
>>
>> --- -/libgcc/libgcc2.c
>> +++ +/libgcc/libgcc2.c
>>
>>  #ifdef L_mulvdi3
>>  DWtype
>> -__mulvDI3 (DWtype u, DWtype v)
>> +__mulvDI3 (DWtype a, DWtype b)
> [ ... ]
> You've essentially collapsed the function into:
>
> DWtype
> __mulvDI3 (DWtype a, DWtype b)
> {
>  DWtype t;
>  const DWunion u = {.ll = a};
>  const DWunion v = {.ll = b};
>  DWunion w = {.ll = __umulsidi3 (u.s.low, v.s.low)};
>
>  if (__builtin_mul_overflow (u.s.low, v.s.high, &t)
>   || __builtin_add_overflow (t, w.s.high, &w.s.high)
>   || __builtin_mul_overflow (u.s.high, v.s.low, &t)
>   || __builtin_add_overflow (t, w.s.high, &w.s.high))
>    abort ();
>
>  return w.ll
> }
>
> The first thing to note is that I believe that using "DWtype" for "t" is
> going to inhibit the overflow check for the __builtin_mul_overflow calls
> that store their result into "t". Your inputs are both 32bits and the
> output is 64 bits. Multiplying 2 32bit numbers will not overflow a
> 64bit result. As a result those overflow checks are actually removed in
> the code we generate for mulvDI3, defeating the entire purpose of using
> mulvdi3 instead of muldi3.

ARGH, my fault(s).
I wanted to show an alternative, but less efficient implementation which
I guarded with the #ifdef, but garbled the main implementation.

> Second while the result of multiplying u.s.high with v.s.high can never
> change the result in w.ll, it does affect the overall overflow status
> that we need to compute.
>
> I think (but have not yet verified) that to fix these problems we need
> to change the type of "t" to SItype and add a check like:
>
> if (u.s.high && v.s.high)
> abort ();

Correct in both cases!
See <https://skanthak.homepage.t-online.de/gcc.html#case17> for my
original code.
I rewrote it to minimise the changes to the current implementation and
introduced 2 silly errors on the way.

> Also note that your approach always does 3 multiplies, which can be very
> expensive on some architectures. The existing version in libgcc2.c will
> often just do one or two multiplies. So while your implementation looks
> a lot simpler, I suspect its often much slower. And on targets without
> 32bit multiplication support, it's probably horribly bad.

All (current) processors I know have super-scalar architecture and a
hardware multiplier, they'll execute the 3 multiplies in parallel.
In my tests on i386 and AMD64 (Core2, Skylake, Ryzen/EPYC), the code
generated for <https://skanthak.homepage.t-online.de/gcc.html#case17>
as well as the (almost) branch-free code shown below and in
<https://skanthak.homepage.t-online.de/gcc.html#case13> runs 10% to 25%
faster than the __mulvDI3 routine from libgcc: the many conditional
branches of your current implementation impair performance more than 3
multiplies!

> My inclination is to leave the overflow checking double-word multiplier
> as-is.

See but <https://gcc.gnu.org/pipermail/gcc/2020-October/234048.html> ff.

> Though I guess you could keep the same structure as the existing
> implementation which tries to avoid unnecessary multiplies and still use
> the __builtin_{add,mul}_overflow to simplify the code a bit less
> aggressively.

Tertium datur: take a look at the __udivmodDI4 routine.
It has separate code paths for targets without hardware divider, and
also for targets where the hardware divider needs a normalized dividend.
I therefore propose to add separate code paths for targets with and
without hardware multiplier for the __mulvDI3 routine too, guarded by a
preprocessor macro which tells whether a target has a hardware multiplier.

regards
Stefan

--- umulvdi3.s ---
# Copyright © 2004-2020, Stefan Kanthak <stefan.kanthak@nexgo.de>

        .arch   generic32
        .code32
        .intel_syntax noprefix
        .text
                                # [esp+16] = high dword of multiplier
                                # [esp+12] = low dword of multiplier
                                # [esp+8] = high dword of multiplicand
                                # [esp+4] = low dword of multiplicand
__umulvdi3:
        push    ebx
        xor     ebx, ebx        # ebx = 0

        mov     ecx, [esp+12]   # ecx = high dword of multiplicand
        cmp     ebx, ecx
        sbb     edx, edx        # edx = (high dword of multiplicand == 0)  ? 0 : -1
                                #     = (multiplicand < 2**32) ? 0 : -1
        mov     eax, [esp+20]   # eax = high dword of multiplier
        cmp     ebx, eax
        sbb     ebx, ebx        # ebx = (high dword of multiplier == 0) ? 0 : -1
                                #     = (multiplier < 2**32) ? 0 : -1
        and     ebx, edx        # ebx = (multiplicand < 2**32)
                                #     & (multiplier < 2**32) ? 0 : -1
                                #     = (product < 2**64) ? 0 : -1

        mov     edx, [esp+8]    # edx = low dword of multiplicand
        mul     edx             # edx:eax = high dword of multiplier
                                #         * low dword of multiplicand
        adc     ebx, ebx        # ebx = (product < 2**64) ? 0 : *

        xchg    ecx, eax        # ecx = high dword of multiplier
                                #     * low dword of multiplicand,
                                # eax = high dword of multiplicand
        mov     edx, [esp+16]   # edx = low dword of multiplier
        mul     edx             # edx:eax = high dword of multiplicand
                                #         * low dword of multiplier
        adc     ebx, ebx        # ebx = (product < 2**64) ? 0 : *

        add     ecx, eax        # ecx = high dword of multiplier
                                #     * low dword of multiplicand
                                #     + high dword of multiplicand
                                #     * low dword of multiplier
#       adc     ebx, ebx        # ebx = (product < 2**64) ? 0 : *

        mov     eax, [esp+8]    # eax = low dword of multiplicand
        mov     edx, [esp+16]   # edx = low dword of multiplier
        mul     edx             # edx:eax = low dword of multiplicand
                                #         * low dword of multiplier
        add     edx, ecx        # edx:eax = product % 2**64
        adc     ebx, ebx        # ebx = (product < 2**64) ? 0 : *
        jnz     .Loverflow      # product >= 2**64?

        pop     ebx
        ret

.Loverflow:
        ud2

        .size   __umulvdi3, .-__umulvdi3
        .type   __umulvdi3, @function
        .global __umulvdi3
        .end
--- EOF ---

--- mulvdi3.s ---
# Copyright © 2004-2020, Stefan Kanthak <stefan.kanthak@nexgo.de>

        .arch   generic32
        .code32
        .intel_syntax noprefix
        .text
                                # [esp+16] = high dword of multiplier
                                # [esp+12] = low dword of multiplier
                                # [esp+8] = high dword of multiplicand
                                # [esp+4] = low dword of multiplicand
__mulvdi3:
        push    ebx
        mov     eax, [esp+20]   # eax = high dword of multiplier
        mov     ecx, [esp+16]   # ecx = low dword of multiplier
        cdq                     # edx = (multiplier < 0) ? -1 : 0
        mov     ebx, edx        # ebx = (multiplier < 0) ? -1 : 0
        xor     ecx, edx
        xor     eax, edx        # eax:ecx = (multiplier < 0) ? ~multiplier : multiplier
        sub     ecx, edx
        sbb     eax, edx        # eax:ecx = (multiplier < 0) ? -multiplier : multiplier
                                #         = |multiplier|
        mov     [esp+16], ecx
        mov     [esp+20], eax   # multiplier = |multiplier|

        mov     eax, [esp+12]   # eax = high dword of multiplicand
        mov     ecx, [esp+8]    # ecx = low dword of multiplicand
        cdq                     # edx = (multiplicand < 0) ? -1 : 0
        xor     ebx, edx        # ebx = (multiplier < 0)
                                #     ^ (multiplicand < 0) ? -1 : 0
                                #     = (product < 0) ? -1 : 0
        xor     ecx, edx
        xor     eax, edx        # eax:ecx = (multiplicand < 0) ? ~multiplicand : multiplicand
        sub     ecx, edx
        sbb     eax, edx        # eax:ecx = (multiplicand < 0) ? -multiplicand : multiplicand
                                #         = |multiplicand|
        mov     [esp+8], ecx
        mov     [esp+12], eax   # multiplicand = |multiplicand|

        push    ebx             # [esp] = (product < 0) ? -1 : 0

        xor     edx, edx        # edx = 0
#       mov     eax, [esp+16]   # eax = high dword of |multiplicand|
        cmp     edx, eax
        sbb     ebx, ebx        # ebx = (high dword of |multiplicand| == 0) ? 0 : -1
                                #     = (|multiplicand| < 2**32) ? 0 : -1
        mov     ecx, [esp+24]   # ecx = high dword of |multiplier|
        cmp     edx, ecx
        sbb     edx, edx        # edx = (high dword of |multiplier| == 0) ? 0 : -1
                                #     = (|multiplier| < 2**32) ? 0 : -1
        and     ebx, edx        # ebx = (|multiplicand| < 2**32)
                                #     & (|multiplier| < 2**32) ? 0 : -1
                                #     = (|product| < 2**64) ? 0 : -1
.ifdef INTO
        shr     ebx, 1
        into
        mov     ebx, [esp+20]   # ebx = low dword of |multiplier|
        mul     ebx             # edx:eax = high dword of |multiplicand|
                                #         * low dword of |multiplier|
        into
        xchg    ecx, eax        # ecx = high dword of |multiplicand|
                                #     * low dword of |multiplier|,
                                # eax = high dword of |multiplier|
        mov     edx, [esp+12]   # edx = low dword of |multiplicand|
        mul     edx             # edx:eax = high dword of |multiplier|
                                #         * low dword of |multiplicand|
        into
        add     ecx, eax        # ecx = high dword of |multiplicand|
                                #     * low dword of |multiplier|
                                #     + high dword of |multiplier|
                                #     * low dword of |multiplicand|
#       sbb     eax, eax        # eax = (|product| < 2**64) ? 0 : -1
#       shr     eax, 1
#       into
        mov     eax, [esp+12]   # eax = low dword of |multiplicand|
        mul     ebx             # edx:eax = low dword of |multiplicand|
                                #         * low dword of |multiplier|
        add     edx, ecx        # edx:eax = |product % 2**64|
                                #         = |product| % 2**64
        sbb     ecx, ecx        # ecx = (|product| < 2**64) ? 0 : -1
        shr     ecx, 1
        into
        pop     ebx             # ebx = (product < 0) ? -1 : 0
        add     eax, ebx
        adc     edx, ebx        # edx:eax = (product < 0) ? ~product % 2**64 : product % 2**64
        mov     ecx, edx
        shr     ecx, 1
        into
        xor     eax, ebx
        xor     edx, ebx        # edx:eax = product % 2**64
        pop     ebx
        ret
.else
        mov     edx, [esp+20]   # edx = low dword of |multiplier|
        mul     edx             # edx:eax = high dword of |multiplicand|
                                #         * low dword of |multiplier|
        adc     ebx, ebx        # ebx = (|product| < 2**64) ? 0 : *

        xchg    ecx, eax        # ecx = high dword of |multiplicand|
                                #     * low dword of |multiplier|,
                                # eax = high dword of |multiplier|
        mov     edx, [esp+12]   # edx = low dword of |multiplicand|
        mul     edx             # edx:eax = high dword of |multiplier|
                                #         * low dword of |multiplicand|
        adc     ebx, ebx        # ebx = (|product| < 2**64) ? 0 : *

        add     ecx, eax        # ecx = high dword of |multiplicand|
                                #     * low dword of |multiplier|
                                #     + high dword of |multiplier|
                                #     * low dword of |multiplicand|
#       adc     ebx, ebx        # ebx = (|product| < 2**64) ? 0 : *

        mov     eax, [esp+12]   # eax = low dword of |multiplicand|
        mov     edx, [esp+20]   # edx = low dword of |multiplier|
        mul     edx             # edx:eax = low dword of |multiplicand|
                                #         * low dword of |multiplier|
        add     edx, ecx        # edx:eax = |product % 2**64|
                                #         = |product| % 2**64
        adc     ebx, ebx        # ebx = (|product| < 2**64) ? 0 : *

        pop     ecx             # ecx = (product < 0) ? -1 : 0
        xor     eax, ecx
        xor     edx, ecx        # edx:eax = (product < 0) ? product % 2**64 - 1 : product % 2**64
        sub     eax, ecx
        sbb     edx, ecx        # edx:eax = product % 2**64

        xor     ecx, edx        # ecx = (multiplicand < 0)
                                #     ^ (multiplier < 0)
                                #     ^ (product < 0) ? negative : positive
        add     ecx, ecx
        adc     ebx, ebx        # ebx = (-2**63 <= product < 2**63) ? 0 : *
        jnz     .Loverflow      # product < -2**63?
                                # product >= 2**63?
        pop     ebx
        ret

.Loverflow:
        ud2
.endif # INTO
        .size   __mulvdi3, .-__mulvdi3
        .type   __mulvdi3, @function
        .global __mulvdi3
        .end
--- EOF ---


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-11-25 13:18       ` Stefan Kanthak
@ 2020-11-25 18:11         ` Jeff Law
  2020-12-07 16:45           ` Stefan Kanthak
  0 siblings, 1 reply; 9+ messages in thread
From: Jeff Law @ 2020-11-25 18:11 UTC (permalink / raw)
  To: Stefan Kanthak, gcc-patches



On 11/25/20 6:18 AM, Stefan Kanthak wrote:
> Jeff Law <law@redhat.com> wrote:
>
>> On 11/10/20 10:21 AM, Stefan Kanthak wrote:
>>
>>>> So with all that in mind, I installed everything except the bits which
>>>> have the LIBGCC2_BAD_CODE ifdefs after testing on the various crosses.
>>>> If you could remove the ifdefs on the abs/mult changes and resubmit them
>>>> it'd be appreciated.
>>> Done.
>> THanks. I'm doing some testing on the abs changes right now. They look
>> pretty reasonable, though they will tend to generate worse code on
>> targets that don't handle overflow arithmetic and testing all that well.
> OTOH the changes yield better code on targets which have a proper
> overflow handling, and may benefit from eventual improvements in the
> compiler/code generator itself on all targets.
I mentioned it mostly because I wanted others to be aware that there are
targets where the abs changes may generate slightly worse code and that
resolution is (IMHO) mostly a matter of improving overflow handling in
the target.  These issues are small enough that I don't think they
should hinder the abs changes moving forward.


>
>> Also note that your approach always does 3 multiplies, which can be very
>> expensive on some architectures. The existing version in libgcc2.c will
>> often just do one or two multiplies. So while your implementation looks
>> a lot simpler, I suspect its often much slower. And on targets without
>> 32bit multiplication support, it's probably horribly bad.
> All (current) processors I know have super-scalar architecture and a
> hardware multiplier, they'll execute the 3 multiplies in parallel.
> In my tests on i386 and AMD64 (Core2, Skylake, Ryzen/EPYC), the code
> generated for <https://skanthak.homepage.t-online.de/gcc.html#case17>
> as well as the (almost) branch-free code shown below and in
> <https://skanthak.homepage.t-online.de/gcc.html#case13> runs 10% to 25%
> faster than the __mulvDI3 routine from libgcc: the many conditional
> branches of your current implementation impair performance more than 3
> multiplies!
All the world is not an x86.  GCC supports over 30 distinct processor
types, many of which target the embedded world.   Those chips often have
limited multiply capabilities and they're often quite slow with
no/minimal pipelining and no superscalar or out of order capabilities.

THe fact that it runs faster on x86 is good, but we have to think in a
more broad fashion.  As it stands right now I'm not going to put the
multiply changes in.  If you wanted to rework them so they're less
costly on the embedded targets, then that would be helpful.


>
>> My inclination is to leave the overflow checking double-word multiplier
>> as-is.
> See but <https://gcc.gnu.org/pipermail/gcc/2020-October/234048.html> ff.
Already read and considered it. 
>
>> Though I guess you could keep the same structure as the existing
>> implementation which tries to avoid unnecessary multiplies and still use
>> the __builtin_{add,mul}_overflow to simplify the code a bit less
>> aggressively.
> Tertium datur: take a look at the __udivmodDI4 routine.
> It has separate code paths for targets without hardware divider, and
> also for targets where the hardware divider needs a normalized dividend.
> I therefore propose to add separate code paths for targets with and
> without hardware multiplier for the __mulvDI3 routine too, guarded by a
> preprocessor macro which tells whether a target has a hardware multiplier.
I don't think there is a way to indicate that there's a hardware
multipler available (or what capabilities it might have -- some might
just have a 16x16 multiplier with or without widening variants, it can
depend on precisely what revision of the chip you're targeting -- which
can change based on compielr flags) and I would oppose a change that
adds something like TARGET_HAS_NO_HW_DIVIDE.  That's a wart and one I
would oppose spreading further.

Instead keep the tests that detect the special cases that don't need as
many multiplies and use the overflow builtins within that implementation
framework.  In cases where we can use the operands directly, that's
helpful as going through the struct/union likely leads to unnecessary
register shuffling.

jeff


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Overflow-trapping integer arithmetic routines7code
  2020-11-25 18:11         ` Jeff Law
@ 2020-12-07 16:45           ` Stefan Kanthak
  0 siblings, 0 replies; 9+ messages in thread
From: Stefan Kanthak @ 2020-12-07 16:45 UTC (permalink / raw)
  To: gcc-patches, Jeff Law

Jeff Law wrote Wednesday, November 25, 2020 7:11 PM:

> On 11/25/20 6:18 AM, Stefan Kanthak wrote:
>> Jeff Law <law@redhat.com> wrote:

[...]

>>> My inclination is to leave the overflow checking double-word multiplier
>>> as-is.
>> See but <https://gcc.gnu.org/pipermail/gcc/2020-October/234048.html> ff.
> Already read and considered it. 
>>
>>> Though I guess you could keep the same structure as the existing
>>> implementation which tries to avoid unnecessary multiplies and still use
>>> the __builtin_{add,mul}_overflow to simplify the code a bit less
>>> aggressively.

Only relying on GCC to generate efficient code for __builtin_mul_overflow,
this should be either

DWtype
__mulvDI3 (DWtype u, DWtype v)
{
    DWtype w;
    if (__builtin_mul_overflow (u, v, &w))
        abort ();
    return w;
}

or

DWtype
__mulvDI3 (DWtype u, DWtype v)
{
    UDWtype w, s = 0 - (u < 0), t = 0 - (v < 0);
    if (__builtin_mul_overflow ((u ^ s) - s, (v ^ t) - t, &w))
        abort ();
    s ^= t;
    w = (w ^ s) - s;
    if ((DWtype) (w ^ s) < 0)
        abort ();
    return w;
}

if the latter produces better machine code (which it does at least for
i386 and AMD64).

> Instead keep the tests that detect the special cases that don't need as
> many multiplies and use the overflow builtins within that implementation
> framework. In cases where we can use the operands directly, that's
> helpful as going through the struct/union likely leads to unnecessary
> register shuffling.

regards
Stefan

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-12-07 16:50 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-05 16:49 [PATCH] Overflow-trapping integer arithmetic routines7code Stefan Kanthak
2020-11-10 15:29 ` Jeff Law
2020-11-10 15:57   ` Jakub Jelinek
2020-11-10 16:05     ` Jeff Law
2020-11-10 17:21   ` Stefan Kanthak
2020-11-25  5:48     ` Jeff Law
2020-11-25 13:18       ` Stefan Kanthak
2020-11-25 18:11         ` Jeff Law
2020-12-07 16:45           ` Stefan Kanthak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).