* [PATCH] Sparc longlong.h enhancements.
@ 2012-05-30 22:14 David Miller
2012-05-31 13:11 ` Eric Botcazou
0 siblings, 1 reply; 3+ messages in thread
From: David Miller @ 2012-05-30 22:14 UTC (permalink / raw)
To: gcc-patches; +Cc: ebotcazou
Eric, while looking at soft-fp code generated in glibc I noticed that
for v9 on 32-bit we end up doing software multiplies and divides :-/
I also noticed that the two-limb addition and subtraction could be
done using a branchless sequence on 64-bit.
Any objections?
libgcc/
* longlong.h [SPARC] (umul_ppmm, udiv_qrnnd): Use hardware integer multiply
and divide instructions on 32-bit when V9.
(add_ssaaaa, sub_ddmmss): Convert to branchless code on 64-bit.
diff --git a/libgcc/longlong.h b/libgcc/longlong.h
index 4fa9d46..626f199 100644
--- a/libgcc/longlong.h
+++ b/libgcc/longlong.h
@@ -1127,6 +1127,29 @@ UDItype __umulsidi3 (USItype, USItype);
"rJ" ((USItype) (al)), \
"rI" ((USItype) (bl)) \
__CLOBBER_CC)
+#if defined (__sparc_v9__)
+#define umul_ppmm(w1, w0, u, v) \
+ do { \
+ register USItype __g1 asm ("g1"); \
+ __asm__ ("umul\t%2,%3,%1\n\t" \
+ "srlx\t%1, 32, %0" \
+ : "=r" ((USItype) (w1)), \
+ "=r" (__g1) \
+ : "r" ((USItype) (u)), \
+ "r" ((USItype) (v))); \
+ (w0) = __g1; \
+ } while (0)
+#define udiv_qrnnd(__q, __r, __n1, __n0, __d) \
+ __asm__ ("mov\t%2,%%y\n\t" \
+ "udiv\t%3,%4,%0\n\t" \
+ "umul\t%0,%4,%1\n\t" \
+ "sub\t%3,%1,%1" \
+ : "=&r" ((USItype) (__q)), \
+ "=&r" ((USItype) (__r)) \
+ : "r" ((USItype) (__n1)), \
+ "r" ((USItype) (__n0)), \
+ "r" ((USItype) (__d)))
+#else
#if defined (__sparc_v8__)
#define umul_ppmm(w1, w0, u, v) \
__asm__ ("umul %2,%3,%1;rd %%y,%0" \
@@ -1292,37 +1315,46 @@ UDItype __umulsidi3 (USItype, USItype);
#define UDIV_TIME (3+7*32) /* 7 instructions/iteration. 32 iterations. */
#endif /* __sparclite__ */
#endif /* __sparc_v8__ */
+#endif /* __sparc_v9__ */
#endif /* sparc32 */
#if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
&& W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
- __asm__ ("addcc %r4,%5,%1\n\t" \
- "add %r2,%3,%0\n\t" \
- "bcs,a,pn %%xcc, 1f\n\t" \
- "add %0, 1, %0\n" \
- "1:" \
+ do { \
+ UDItype __carry = 0; \
+ __asm__ ("addcc\t%r5,%6,%1\n\t" \
+ "add\t%r3,%4,%0\n\t" \
+ "movcs\t%%xcc, 1, %2\n\t" \
+ "add\t%0, %2, %0" \
: "=r" ((UDItype)(sh)), \
- "=&r" ((UDItype)(sl)) \
+ "=&r" ((UDItype)(sl)), \
+ "=&r" (__carry) \
: "%rJ" ((UDItype)(ah)), \
"rI" ((UDItype)(bh)), \
"%rJ" ((UDItype)(al)), \
- "rI" ((UDItype)(bl)) \
- __CLOBBER_CC)
+ "rI" ((UDItype)(bl)), \
+ "2" (__carry) \
+ __CLOBBER_CC); \
+ } while (0)
-#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
- __asm__ ("subcc %r4,%5,%1\n\t" \
- "sub %r2,%3,%0\n\t" \
- "bcs,a,pn %%xcc, 1f\n\t" \
- "sub %0, 1, %0\n\t" \
- "1:" \
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+ do { \
+ UDItype __carry = 0; \
+ __asm__ ("subcc\t%r5,%6,%1\n\t" \
+ "sub\t%r3,%4,%0\n\t" \
+ "movcs\t%%xcc, 1, %2\n\t" \
+ "add\t%0, %2, %0" \
: "=r" ((UDItype)(sh)), \
- "=&r" ((UDItype)(sl)) \
- : "rJ" ((UDItype)(ah)), \
+ "=&r" ((UDItype)(sl)), \
+ "=&r" (__carry) \
+ : "%rJ" ((UDItype)(ah)), \
"rI" ((UDItype)(bh)), \
- "rJ" ((UDItype)(al)), \
- "rI" ((UDItype)(bl)) \
- __CLOBBER_CC)
+ "%rJ" ((UDItype)(al)), \
+ "rI" ((UDItype)(bl)), \
+ "2" (__carry) \
+ __CLOBBER_CC); \
+ } while (0)
#define umul_ppmm(wh, wl, u, v) \
do { \
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] Sparc longlong.h enhancements.
2012-05-30 22:14 [PATCH] Sparc longlong.h enhancements David Miller
@ 2012-05-31 13:11 ` Eric Botcazou
2012-05-31 19:50 ` David Miller
0 siblings, 1 reply; 3+ messages in thread
From: Eric Botcazou @ 2012-05-31 13:11 UTC (permalink / raw)
To: David Miller; +Cc: gcc-patches
> Eric, while looking at soft-fp code generated in glibc I noticed that
> for v9 on 32-bit we end up doing software multiplies and divides :-/
>
> I also noticed that the two-limb addition and subtraction could be
> done using a branchless sequence on 64-bit.
>
> Any objections?
None in principle, but...
> #if ((defined (__sparc__) && defined (__arch64__)) || defined (__sparcv9)) \
> && W_TYPE_SIZE == 64
> #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
> - __asm__ ("addcc %r4,%5,%1\n\t" \
> - "add %r2,%3,%0\n\t" \
> - "bcs,a,pn %%xcc, 1f\n\t" \
> - "add %0, 1, %0\n" \
> - "1:" \
> + do { \
> + UDItype __carry = 0; \
> + __asm__ ("addcc\t%r5,%6,%1\n\t" \
> + "add\t%r3,%4,%0\n\t" \
> + "movcs\t%%xcc, 1, %2\n\t" \
> + "add\t%0, %2, %0" \
> : "=r" ((UDItype)(sh)), \
> - "=&r" ((UDItype)(sl)) \
> + "=&r" ((UDItype)(sl)), \
> + "=&r" (__carry) \
> : "%rJ" ((UDItype)(ah)), \
> "rI" ((UDItype)(bh)), \
> "%rJ" ((UDItype)(al)), \
> - "rI" ((UDItype)(bl)) \
> - __CLOBBER_CC)
> + "rI" ((UDItype)(bl)), \
> + "2" (__carry) \
> + __CLOBBER_CC); \
> + } while (0)
If __carry is used as both source and destination for %2, why not use a single
operand with the + modifier?
> -#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
> - __asm__ ("subcc %r4,%5,%1\n\t" \
> - "sub %r2,%3,%0\n\t" \
> - "bcs,a,pn %%xcc, 1f\n\t" \
> - "sub %0, 1, %0\n\t" \
> - "1:" \
> +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
> + do { \
> + UDItype __carry = 0; \
> + __asm__ ("subcc\t%r5,%6,%1\n\t" \
> + "sub\t%r3,%4,%0\n\t" \
> + "movcs\t%%xcc, 1, %2\n\t" \
> + "add\t%0, %2, %0" \
> : "=r" ((UDItype)(sh)), \
> - "=&r" ((UDItype)(sl)) \
> - : "rJ" ((UDItype)(ah)), \
> + "=&r" ((UDItype)(sl)), \
> + "=&r" (__carry) \
> + : "%rJ" ((UDItype)(ah)), \
> "rI" ((UDItype)(bh)), \
> - "rJ" ((UDItype)(al)), \
> - "rI" ((UDItype)(bl)) \
> - __CLOBBER_CC)
> + "%rJ" ((UDItype)(al)), \
> + "rI" ((UDItype)(bl)), \
> + "2" (__carry) \
> + __CLOBBER_CC); \
> + } while (0)
Likewise.
--
Eric Botcazou
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] Sparc longlong.h enhancements.
2012-05-31 13:11 ` Eric Botcazou
@ 2012-05-31 19:50 ` David Miller
0 siblings, 0 replies; 3+ messages in thread
From: David Miller @ 2012-05-31 19:50 UTC (permalink / raw)
To: ebotcazou; +Cc: gcc-patches
From: Eric Botcazou <ebotcazou@adacore.com>
Date: Thu, 31 May 2012 15:06:41 +0200
>> + do { \
>> + UDItype __carry = 0; \
>> + __asm__ ("addcc\t%r5,%6,%1\n\t" \
>> + "add\t%r3,%4,%0\n\t" \
>> + "movcs\t%%xcc, 1, %2\n\t" \
>> + "add\t%0, %2, %0" \
>> : "=r" ((UDItype)(sh)), \
>> - "=&r" ((UDItype)(sl)) \
>> + "=&r" ((UDItype)(sl)), \
>> + "=&r" (__carry) \
>> : "%rJ" ((UDItype)(ah)), \
>> "rI" ((UDItype)(bh)), \
>> "%rJ" ((UDItype)(al)), \
>> - "rI" ((UDItype)(bl)) \
>> - __CLOBBER_CC)
>> + "rI" ((UDItype)(bl)), \
>> + "2" (__carry) \
>> + __CLOBBER_CC); \
>> + } while (0)
>
> If __carry is used as both source and destination for %2, why not use a single
> operand with the + modifier?
Makes sense, I'll make that change and test it, thanks Eric.
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2012-05-31 19:50 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-30 22:14 [PATCH] Sparc longlong.h enhancements David Miller
2012-05-31 13:11 ` Eric Botcazou
2012-05-31 19:50 ` David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).