From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 25826 invoked by alias); 19 Jul 2004 10:19:47 -0000 Mailing-List: contact libc-hacker-help@sources.redhat.com; run by ezmlm Precedence: bulk List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-hacker-owner@sources.redhat.com Received: (qmail 25802 invoked from network); 19 Jul 2004 10:19:46 -0000 Received: from unknown (HELO sunsite.ms.mff.cuni.cz) (195.113.15.26) by sourceware.org with SMTP; 19 Jul 2004 10:19:46 -0000 Received: from sunsite.ms.mff.cuni.cz (sunsite.mff.cuni.cz [127.0.0.1]) by sunsite.ms.mff.cuni.cz (8.12.8/8.12.8) with ESMTP id i6J83s3j017844; Mon, 19 Jul 2004 10:03:54 +0200 Received: (from jakub@localhost) by sunsite.ms.mff.cuni.cz (8.12.8/8.12.8/Submit) id i6J83r2p017842; Mon, 19 Jul 2004 10:03:53 +0200 Date: Mon, 19 Jul 2004 10:19:00 -0000 From: Jakub Jelinek To: Ulrich Drepper Cc: Glibc hackers Subject: Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2) Message-ID: <20040719080353.GL30497@sunsite.ms.mff.cuni.cz> Reply-To: Jakub Jelinek References: <20040714160203.GQ5191@sunsite.ms.mff.cuni.cz> <20040715093555.GA9816@sunsite.ms.mff.cuni.cz> <40F80B44.9020305@redhat.com> Mime-Version: 1.0 Content-Type: text/plain; charset=iso-8859-1 Content-Disposition: inline Content-Transfer-Encoding: 8bit In-Reply-To: <40F80B44.9020305@redhat.com> User-Agent: Mutt/1.4.1i X-SW-Source: 2004-07/txt/msg00038.txt.bz2 On Fri, Jul 16, 2004 at 10:07:16AM -0700, Ulrich Drepper wrote: > Jakub Jelinek wrote: > > > -4: fldl MO(one) // 1 : x > > +4: fld1 // 1 : x > > I deliberately avoided the use of fld1 since all my measurements showed > it is slower. Please some some measurements of your own. As far as I > remember the only time when it paid off to use fld1 is when the fld was > the only memory operation and therefore it could be avoided to load the > PIC register. If there are no other memory references, it is surely noticeably faster, otherwise I see it the same speed (but the instruction is shorter). With following proglet I get 10 cycles in both cases on Opteron, 42 on P3, 92 on P4. AFAIK GCC uses fld1 and fldz in the code it generates in all cases. #include #include extern long double foo1 (void); extern long double foo2 (void); asm (".balign 16; foo1: .section .rodata; .balign 16; mzero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80; .previous; fldz; faddl mzero; fxam; fnstsw; ret;"); asm (".balign 16; foo2: .section .rodata; .balign 16; zero: .double 0.0; .previous; fldl zero; faddl mzero; fxam; fnstsw; ret;"); int main (void) { unsigned long long st, en, m; int i; m = -1ULL; for (i = 0; i < 1000; ++i) { asm volatile ("rdtsc" : "=A" (st)); foo1 (); asm volatile ("rdtsc" : "=A" (en)); en -= st; if (en < m) m = en; } printf ("%lld\n", m); m = -1ULL; for (i = 0; i < 1000; ++i) { asm volatile ("rdtsc" : "=A" (st)); foo2 (); asm volatile ("rdtsc" : "=A" (en)); en -= st; if (en < m) m = en; } printf ("%lld\n", m); return 0; } Anyway, below is a patch variant without fldz and fld1, those changes weren't fixing anything and thus can be a separate change which can be argued about independently. 2004-07-19 Jakub Jelinek [BZ #258] * math/libm-test.inc (max_value, min_value): New variables. (initialize): Initialize them. (pow_test): Add a couple of new tests. * sysdeps/i386/fpu/e_powf.S (__ieee754_powf): Don't generate invalid exception if |y| >= 1U<<31. * sysdeps/i386/fpu/e_pow.S (__ieee754_pow): Don't generate invalid exception if |y| >= 1L<<63. * sysdeps/i386/fpu/e_powl.S (__ieee754_powl): Likewise. If y*log2(x) overflows to +-inf, return still +inf/+0 instead of NaN. * sysdeps/x86_64/fpu/e_powl.S (__ieee754_powl): Likewise. --- libc/math/libm-test.inc.jj 2004-02-13 12:28:07.000000000 +0100 +++ libc/math/libm-test.inc 2004-07-15 12:34:17.395663150 +0200 @@ -169,7 +169,7 @@ static int output_points; /* Should the static int ignore_max_ulp; /* Should we ignore max_ulp? */ static FLOAT minus_zero, plus_zero; -static FLOAT plus_infty, minus_infty, nan_value; +static FLOAT plus_infty, minus_infty, nan_value, max_value, min_value; static FLOAT max_error, real_max_error, imag_max_error; @@ -3593,6 +3593,28 @@ pow_test (void) TEST_ff_f (pow, -1, plus_infty, 1); TEST_ff_f (pow, 1, minus_infty, 1); TEST_ff_f (pow, -1, minus_infty, 1); + TEST_ff_f (pow, 1, 1, 1); + TEST_ff_f (pow, 1, -1, 1); + TEST_ff_f (pow, 1, 1.25, 1); + TEST_ff_f (pow, 1, -1.25, 1); + TEST_ff_f (pow, 1, 0x1p62L, 1); + TEST_ff_f (pow, 1, 0x1p63L, 1); + TEST_ff_f (pow, 1, 0x1p64L, 1); + TEST_ff_f (pow, 1, 0x1p72L, 1); + + /* pow (x, +-0) == 1. */ + TEST_ff_f (pow, plus_infty, 0, 1); + TEST_ff_f (pow, plus_infty, minus_zero, 1); + TEST_ff_f (pow, minus_infty, 0, 1); + TEST_ff_f (pow, minus_infty, minus_zero, 1); + TEST_ff_f (pow, 32.75L, 0, 1); + TEST_ff_f (pow, 32.75L, minus_zero, 1); + TEST_ff_f (pow, -32.75L, 0, 1); + TEST_ff_f (pow, -32.75L, minus_zero, 1); + TEST_ff_f (pow, 0x1p72L, 0, 1); + TEST_ff_f (pow, 0x1p72L, minus_zero, 1); + TEST_ff_f (pow, 0x1p-72L, 0, 1); + TEST_ff_f (pow, 0x1p-72L, minus_zero, 1); TEST_ff_f (pow, -0.1L, 1.1L, nan_value, INVALID_EXCEPTION); TEST_ff_f (pow, -0.1L, -1.1L, nan_value, INVALID_EXCEPTION); @@ -3609,6 +3631,10 @@ pow_test (void) TEST_ff_f (pow, minus_zero, -2, plus_infty, DIVIDE_BY_ZERO_EXCEPTION); TEST_ff_f (pow, minus_zero, -11.1L, plus_infty, DIVIDE_BY_ZERO_EXCEPTION); + TEST_ff_f (pow, 0x1p72L, 0x1p72L, plus_infty); + TEST_ff_f (pow, 10, -0x1p72L, 0); + TEST_ff_f (pow, max_value, max_value, plus_infty); + TEST_ff_f (pow, 10, -max_value, 0); TEST_ff_f (pow, 0, 1, 0); TEST_ff_f (pow, 0, 11, 0); @@ -3623,6 +3649,8 @@ pow_test (void) TEST_ff_f (pow, minus_zero, 2, 0); TEST_ff_f (pow, minus_zero, 11.1L, 0); + TEST_ff_f (pow, 0, plus_infty, 0); + TEST_ff_f (pow, minus_zero, plus_infty, 0); #ifndef TEST_INLINE /* pow (x, +inf) == +inf for |x| > 1. */ @@ -3667,6 +3695,11 @@ pow_test (void) /* pow (-0, y) == +0 for y > 0 and not an odd integer. */ TEST_ff_f (pow, minus_zero, 4, 0.0); + TEST_ff_f (pow, 16, 0.25L, 2); + TEST_ff_f (pow, 0x1p64L, 0.125L, 256); + TEST_ff_f (pow, 2, 4, 16); + TEST_ff_f (pow, 256, 8, 0x1p64L); + TEST_ff_f (pow, 0.75L, 1.25L, 0.697953644326574699205914060237425566L); #if defined TEST_DOUBLE || defined TEST_LDOUBLE @@ -4312,12 +4345,18 @@ initialize (void) HUGE_VALL, HUGE_VAL, HUGE_VALF); minus_infty = CHOOSE (-HUGE_VALL, -HUGE_VAL, -HUGE_VALF, -HUGE_VALL, -HUGE_VAL, -HUGE_VALF); + max_value = CHOOSE (LDBL_MAX, DBL_MAX, FLT_MAX, + LDBL_MAX, DBL_MAX, FLT_MAX); + min_value = CHOOSE (LDBL_MIN, DBL_MIN, FLT_MIN, + LDBL_MIN, DBL_MIN, FLT_MIN); (void) &plus_zero; (void) &nan_value; (void) &minus_zero; (void) &plus_infty; (void) &minus_infty; + (void) &max_value; + (void) &min_value; /* Clear all exceptions. From now on we must not get random exceptions. */ feclearexcept (FE_ALL_EXCEPT); --- libc/sysdeps/i386/fpu/e_powf.S.jj 2001-07-06 06:55:53.000000000 +0200 +++ libc/sysdeps/i386/fpu/e_powf.S 2004-07-15 13:23:15.977438871 +0200 @@ -1,5 +1,5 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996, 1997, 1999, 2001 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 1999, 2001, 2004 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1996. @@ -48,6 +48,9 @@ one: .double 1.0 ASM_TYPE_DIRECTIVE(limit,@object) limit: .double 0.29 ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p31,@object) +p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41 + ASM_SIZE_DIRECTIVE(p31) #ifdef PIC #define MO(op) op##@GOTOFF(%ecx) @@ -96,6 +99,14 @@ ENTRY(__ieee754_powf) fxch // y : x + /* fistpl raises invalid exception for |y| >= 1L<<31. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p31) // y : x + fnstsw + sahf + jnc 2f + /* First see whether `y' is a natural number. In this case we can use a more precise algorithm. */ fld %st // y : y : x --- libc/sysdeps/i386/fpu/e_powl.S.jj 2001-07-06 06:55:53.000000000 +0200 +++ libc/sysdeps/i386/fpu/e_powl.S 2004-07-15 12:56:07.678735978 +0200 @@ -1,5 +1,6 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1996. @@ -48,6 +49,9 @@ one: .double 1.0 ASM_TYPE_DIRECTIVE(limit,@object) limit: .double 0.29 ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p63,@object) +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) #ifdef PIC #define MO(op) op##@GOTOFF(%ecx) @@ -96,6 +100,14 @@ ENTRY(__ieee754_powl) fxch // y : x + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p63) // y : x + fnstsw + sahf + jnc 2f + /* First see whether `y' is a natural number. In this case we can use a more precise algorithm. */ fld %st // y : y : x @@ -161,6 +173,11 @@ ENTRY(__ieee754_powl) 7: fyl2x // log2(x) : y 8: fmul %st(1) // y*log2(x) : y + fxam + fnstsw + andb $0x45, %ah + cmpb $0x05, %ah // is y*log2(x) == ±inf ? + je 28f fst %st(1) // y*log2(x) : y*log2(x) frndint // int(y*log2(x)) : y*log2(x) fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) @@ -172,6 +189,12 @@ ENTRY(__ieee754_powl) fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x)) ret +28: fstp %st(1) // y*log2(x) + fldl MO(one) // 1 : y*log2(x) + fscale // 2^(y*log2(x)) : y*log2(x) + addl $8, %esp + fstp %st(1) // 2^(y*log2(x)) + ret // pow(x,±0) = 1 .align ALIGNARG(4) --- libc/sysdeps/i386/fpu/e_pow.S.jj 2001-07-06 06:55:53.000000000 +0200 +++ libc/sysdeps/i386/fpu/e_pow.S 2004-07-15 13:05:41.601640612 +0200 @@ -1,5 +1,6 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 + Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1996. @@ -48,6 +49,9 @@ one: .double 1.0 ASM_TYPE_DIRECTIVE(limit,@object) limit: .double 0.29 ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p63,@object) +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) #ifdef PIC #define MO(op) op##@GOTOFF(%ecx) @@ -96,6 +100,14 @@ ENTRY(__ieee754_pow) fxch // y : x + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p63) // y : x + fnstsw + sahf + jnc 2f + /* First see whether `y' is a natural number. In this case we can use a more precise algorithm. */ fld %st // y : y : x --- libc/sysdeps/x86_64/fpu/e_powl.S.jj 2001-09-19 12:24:08.000000000 +0200 +++ libc/sysdeps/x86_64/fpu/e_powl.S 2004-07-14 20:02:06.000000000 +0200 @@ -1,5 +1,5 @@ /* ix87 specific implementation of pow function. - Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc. + Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper , 1996. @@ -48,6 +48,10 @@ one: .double 1.0 ASM_TYPE_DIRECTIVE(limit,@object) limit: .double 0.29 ASM_SIZE_DIRECTIVE(limit) + ASM_TYPE_DIRECTIVE(p63,@object) +p63: + .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) #ifdef PIC #define MO(op) op##(%rip) @@ -87,6 +91,14 @@ ENTRY(__ieee754_powl) fxch // y : x + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fldl MO(p63) // 1L<<63 : y : x + fld %st(1) // y : 1L<<63 : y : x + fabs // |y| : 1L<<63 : y : x + fcomip %st(1), %st // 1L<<63 : y : x + fstp %st(0) // y : x + jnc 2f + /* First see whether `y' is a natural number. In this case we can use a more precise algorithm. */ fld %st // y : y : x @@ -148,6 +160,11 @@ ENTRY(__ieee754_powl) 7: fyl2x // log2(x) : y 8: fmul %st(1) // y*log2(x) : y + fxam + fnstsw + andb $0x45, %ah + cmpb $0x05, %ah // is y*log2(x) == ±inf ? + je 28f fst %st(1) // y*log2(x) : y*log2(x) frndint // int(y*log2(x)) : y*log2(x) fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) @@ -158,6 +175,11 @@ ENTRY(__ieee754_powl) fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x)) ret +28: fstp %st(1) // y*log2(x) + fldl MO(one) // 1 : y*log2(x) + fscale // 2^(y*log2(x)) : y*log2(x) + fstp %st(1) // 2^(y*log2(x)) + ret // pow(x,±0) = 1 .align ALIGNARG(4) Jakub