[PATCH] Sparc longlong.h enhancements.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] Sparc longlong.h enhancements.
@ 2012-05-30 22:14 David Miller
  2012-05-31 13:11 ` Eric Botcazou
  0 siblings, 1 reply; 3+ messages in thread
From: David Miller @ 2012-05-30 22:14 UTC (permalink / raw)
  To: gcc-patches; +Cc: ebotcazou


Eric, while looking at soft-fp code generated in glibc I noticed that
for v9 on 32-bit we end up doing software multiplies and divides :-/

I also noticed that the two-limb addition and subtraction could be
done using a branchless sequence on 64-bit.

Any objections?

libgcc/

	* longlong.h [SPARC] (umul_ppmm, udiv_qrnnd): Use hardware integer multiply
	and divide instructions on 32-bit when V9.
	(add_ssaaaa, sub_ddmmss): Convert to branchless code on 64-bit.

diff --git a/libgcc/longlong.h b/libgcc/longlong.h
index 4fa9d46..626f199 100644
--- a/libgcc/longlong.h
+++ b/libgcc/longlong.h
@@ -1127,6 +1127,29 @@ UDItype __umulsidi3 (USItype, USItype);
 	     "rJ" ((USItype) (al)),					\
 	     "rI" ((USItype) (bl))					\
 	   __CLOBBER_CC)
+#if defined (__sparc_v9__)
+#define umul_ppmm(w1, w0, u, v) \
+  do {									\
+    register USItype __g1 asm ("g1");					\
+    __asm__ ("umul\t%2,%3,%1\n\t"					\
+	     "srlx\t%1, 32, %0"						\
+	     : "=r" ((USItype) (w1)),					\
+	       "=r" (__g1)						\
+	     : "r" ((USItype) (u)),					\
+	       "r" ((USItype) (v)));					\
+    (w0) = __g1;							\
+  } while (0)
+#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
+  __asm__ ("mov\t%2,%%y\n\t"						\
+	   "udiv\t%3,%4,%0\n\t"						\
+	   "umul\t%0,%4,%1\n\t"						\
+	   "sub\t%3,%1,%1"						\
+	   : "=&r" ((USItype) (__q)),					\
+	     "=&r" ((USItype) (__r))					\
+	   : "r" ((USItype) (__n1)),					\
+	     "r" ((USItype) (__n0)),					\
+	     "r" ((USItype) (__d)))
+#else
 #if defined (__sparc_v8__)
 #define umul_ppmm(w1, w0, u, v) \
   __asm__ ("umul %2,%3,%1;rd %%y,%0"					\
@@ -1292,37 +1315,46 @@ UDItype __umulsidi3 (USItype, USItype);
 #define UDIV_TIME (3+7*32)	/* 7 instructions/iteration. 32 iterations.  */
 #endif /* __sparclite__ */
 #endif /* __sparc_v8__ */
+#endif /* __sparc_v9__ */
 #endif /* sparc32 */
 
 #if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
     && W_TYPE_SIZE == 64
 #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
-  __asm__ ("addcc %r4,%5,%1\n\t"					\
-   	   "add %r2,%3,%0\n\t"						\
-   	   "bcs,a,pn %%xcc, 1f\n\t"					\
-   	   "add %0, 1, %0\n"						\
-	   "1:"								\
+  do {									\
+    UDItype __carry = 0;						\
+    __asm__ ("addcc\t%r5,%6,%1\n\t"					\
+	     "add\t%r3,%4,%0\n\t"					\
+	     "movcs\t%%xcc, 1, %2\n\t"					\
+             "add\t%0, %2, %0"						\
 	   : "=r" ((UDItype)(sh)),				      	\
-	     "=&r" ((UDItype)(sl))				      	\
+	     "=&r" ((UDItype)(sl)),				      	\
+	     "=&r" (__carry)				      		\
 	   : "%rJ" ((UDItype)(ah)),				     	\
 	     "rI" ((UDItype)(bh)),				      	\
 	     "%rJ" ((UDItype)(al)),				     	\
-	     "rI" ((UDItype)(bl))				       	\
-	   __CLOBBER_CC)
+	     "rI" ((UDItype)(bl)),				       	\
+	     "2" (__carry)				       		\
+	   __CLOBBER_CC);						\
+  } while (0)
 
-#define sub_ddmmss(sh, sl, ah, al, bh, bl) 				\
-  __asm__ ("subcc %r4,%5,%1\n\t"					\
-   	   "sub %r2,%3,%0\n\t"						\
-   	   "bcs,a,pn %%xcc, 1f\n\t"					\
-   	   "sub %0, 1, %0\n\t"						\
-	   "1:"								\
+#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
+  do {									\
+    UDItype __carry = 0;						\
+    __asm__ ("subcc\t%r5,%6,%1\n\t"					\
+	     "sub\t%r3,%4,%0\n\t"					\
+	     "movcs\t%%xcc, 1, %2\n\t"					\
+             "add\t%0, %2, %0"						\
 	   : "=r" ((UDItype)(sh)),				      	\
-	     "=&r" ((UDItype)(sl))				      	\
-	   : "rJ" ((UDItype)(ah)),				     	\
+	     "=&r" ((UDItype)(sl)),				      	\
+	     "=&r" (__carry)				      		\
+	   : "%rJ" ((UDItype)(ah)),				     	\
 	     "rI" ((UDItype)(bh)),				      	\
-	     "rJ" ((UDItype)(al)),				     	\
-	     "rI" ((UDItype)(bl))				       	\
-	   __CLOBBER_CC)
+	     "%rJ" ((UDItype)(al)),				     	\
+	     "rI" ((UDItype)(bl)),				       	\
+	     "2" (__carry)				       		\
+	   __CLOBBER_CC);						\
+  } while (0)
 
 #define umul_ppmm(wh, wl, u, v)						\
   do {									\

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] Sparc longlong.h enhancements.
  2012-05-30 22:14 [PATCH] Sparc longlong.h enhancements David Miller
@ 2012-05-31 13:11 ` Eric Botcazou
  2012-05-31 19:50   ` David Miller
  0 siblings, 1 reply; 3+ messages in thread
From: Eric Botcazou @ 2012-05-31 13:11 UTC (permalink / raw)
  To: David Miller; +Cc: gcc-patches

> Eric, while looking at soft-fp code generated in glibc I noticed that
> for v9 on 32-bit we end up doing software multiplies and divides :-/
>
> I also noticed that the two-limb addition and subtraction could be
> done using a branchless sequence on 64-bit.
>
> Any objections?

None in principle, but...

>  #if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
>      && W_TYPE_SIZE == 64
>  #define add_ssaaaa(sh, sl, ah, al, bh, bl)				\
> -  __asm__ ("addcc %r4,%5,%1\n\t"					\
> -   	   "add %r2,%3,%0\n\t"						\
> -   	   "bcs,a,pn %%xcc, 1f\n\t"					\
> -   	   "add %0, 1, %0\n"						\
> -	   "1:"								\
> +  do {									\
> +    UDItype __carry = 0;						\
> +    __asm__ ("addcc\t%r5,%6,%1\n\t"					\
> +	     "add\t%r3,%4,%0\n\t"					\
> +	     "movcs\t%%xcc, 1, %2\n\t"					\
> +             "add\t%0, %2, %0"						\
>
>  	   : "=r" ((UDItype)(sh)),				      	\
>
> -	     "=&r" ((UDItype)(sl))				      	\
> +	     "=&r" ((UDItype)(sl)),				      	\
> +	     "=&r" (__carry)				      		\
>
>  	   : "%rJ" ((UDItype)(ah)),				     	\
>
>  	     "rI" ((UDItype)(bh)),				      	\
>  	     "%rJ" ((UDItype)(al)),				     	\
> -	     "rI" ((UDItype)(bl))				       	\
> -	   __CLOBBER_CC)
> +	     "rI" ((UDItype)(bl)),				       	\
> +	     "2" (__carry)				       		\
> +	   __CLOBBER_CC);						\
> +  } while (0)

If __carry is used as both source and destination for %2, why not use a single 
operand with the + modifier?

> -#define sub_ddmmss(sh, sl, ah, al, bh, bl) 				\
> -  __asm__ ("subcc %r4,%5,%1\n\t"					\
> -   	   "sub %r2,%3,%0\n\t"						\
> -   	   "bcs,a,pn %%xcc, 1f\n\t"					\
> -   	   "sub %0, 1, %0\n\t"						\
> -	   "1:"								\
> +#define sub_ddmmss(sh, sl, ah, al, bh, bl)				\
> +  do {									\
> +    UDItype __carry = 0;						\
> +    __asm__ ("subcc\t%r5,%6,%1\n\t"					\
> +	     "sub\t%r3,%4,%0\n\t"					\
> +	     "movcs\t%%xcc, 1, %2\n\t"					\
> +             "add\t%0, %2, %0"						\
>
>  	   : "=r" ((UDItype)(sh)),				      	\
>
> -	     "=&r" ((UDItype)(sl))				      	\
> -	   : "rJ" ((UDItype)(ah)),				     	\
> +	     "=&r" ((UDItype)(sl)),				      	\
> +	     "=&r" (__carry)				      		\
> +	   : "%rJ" ((UDItype)(ah)),				     	\
>  	     "rI" ((UDItype)(bh)),				      	\
> -	     "rJ" ((UDItype)(al)),				     	\
> -	     "rI" ((UDItype)(bl))				       	\
> -	   __CLOBBER_CC)
> +	     "%rJ" ((UDItype)(al)),				     	\
> +	     "rI" ((UDItype)(bl)),				       	\
> +	     "2" (__carry)				       		\
> +	   __CLOBBER_CC);						\
> +  } while (0)

Likewise.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] Sparc longlong.h enhancements.
  2012-05-31 13:11 ` Eric Botcazou
@ 2012-05-31 19:50   ` David Miller
  0 siblings, 0 replies; 3+ messages in thread
From: David Miller @ 2012-05-31 19:50 UTC (permalink / raw)
  To: ebotcazou; +Cc: gcc-patches

From: Eric Botcazou <ebotcazou@adacore.com>
Date: Thu, 31 May 2012 15:06:41 +0200

>> +  do {									\
>> +    UDItype __carry = 0;						\
>> +    __asm__ ("addcc\t%r5,%6,%1\n\t"					\
>> +	     "add\t%r3,%4,%0\n\t"					\
>> +	     "movcs\t%%xcc, 1, %2\n\t"					\
>> +             "add\t%0, %2, %0"						\
>>
>>  	   : "=r" ((UDItype)(sh)),				      	\
>>
>> -	     "=&r" ((UDItype)(sl))				      	\
>> +	     "=&r" ((UDItype)(sl)),				      	\
>> +	     "=&r" (__carry)				      		\
>>
>>  	   : "%rJ" ((UDItype)(ah)),				     	\
>>
>>  	     "rI" ((UDItype)(bh)),				      	\
>>  	     "%rJ" ((UDItype)(al)),				     	\
>> -	     "rI" ((UDItype)(bl))				       	\
>> -	   __CLOBBER_CC)
>> +	     "rI" ((UDItype)(bl)),				       	\
>> +	     "2" (__carry)				       		\
>> +	   __CLOBBER_CC);						\
>> +  } while (0)
> 
> If __carry is used as both source and destination for %2, why not use a single 
> operand with the + modifier?

Makes sense, I'll make that change and test it, thanks Eric.

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2012-05-31 19:50 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-30 22:14 [PATCH] Sparc longlong.h enhancements David Miller
2012-05-31 13:11 ` Eric Botcazou
2012-05-31 19:50   ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).