[PATCH] Fix powl on x86

public inbox for libc-hacker@sourceware.org
 help / color / mirror / Atom feed

* [PATCH] Fix powl on x86_64 [BZ #258]
@ 2004-07-14 18:17 Jakub Jelinek
  2004-07-15 11:51 ` [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2) Jakub Jelinek
  0 siblings, 1 reply; 5+ messages in thread
From: Jakub Jelinek @ 2004-07-14 18:17 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Glibc hackers

Hi!

i386 will need similar treatment.
Not sure if fld*/f{,u}comi{,p}/j? is faster than fcompl/fnstw/testing %ah/j?
or vice versa, maybe the comparisons could be coded more efficiently.

2004-07-14  Jakub Jelinek  <jakub@redhat.com>

	[BZ #258]
	* math/libm-test.inc (max_value, min_value): New variables.
	(initialize): Initialize them.
	(pow_test): Add a couple of new tests.
	* sysdeps/x86_64/fpu/e_powl.S (__ieee754_powl): Don't generate
	invalid exception if |y| is >= 1L<<63.  If y*log2(x) overflows
	to +-inf, return still +inf/+0 instead of NaN.  Replace
	fldl MO(zero) with fldz and fldl MO(one) with fld1.

--- libc/math/libm-test.inc.jj	2004-02-13 12:28:07.000000000 +0100
+++ libc/math/libm-test.inc	2004-07-14 20:09:24.475429821 +0200
@@ -169,7 +169,7 @@ static int output_points;	/* Should the 
 static int ignore_max_ulp;	/* Should we ignore max_ulp?  */
 
 static FLOAT minus_zero, plus_zero;
-static FLOAT plus_infty, minus_infty, nan_value;
+static FLOAT plus_infty, minus_infty, nan_value, max_value, min_value;
 
 static FLOAT max_error, real_max_error, imag_max_error;
 
@@ -3593,6 +3593,25 @@ pow_test (void)
   TEST_ff_f (pow, -1, plus_infty, 1);
   TEST_ff_f (pow, 1, minus_infty, 1);
   TEST_ff_f (pow, -1, minus_infty, 1);
+  TEST_ff_f (pow, 1, 1, 1);
+  TEST_ff_f (pow, 1, -1, 1);
+  TEST_ff_f (pow, 1, 1.25, 1);
+  TEST_ff_f (pow, 1, -1.25, 1);
+  TEST_ff_f (pow, 1, 0x1p72L, 1);
+
+  /* pow (x, +-0) == 1.  */
+  TEST_ff_f (pow, plus_infty, 0, 1);
+  TEST_ff_f (pow, plus_infty, minus_zero, 1);
+  TEST_ff_f (pow, minus_infty, 0, 1);
+  TEST_ff_f (pow, minus_infty, minus_zero, 1);
+  TEST_ff_f (pow, 32.75L, 0, 1);
+  TEST_ff_f (pow, 32.75L, minus_zero, 1);
+  TEST_ff_f (pow, -32.75L, 0, 1);
+  TEST_ff_f (pow, -32.75L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p72L, 0, 1);
+  TEST_ff_f (pow, 0x1p72L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p-72L, 0, 1);
+  TEST_ff_f (pow, 0x1p-72L, minus_zero, 1);
 
   TEST_ff_f (pow, -0.1L, 1.1L, nan_value, INVALID_EXCEPTION);
   TEST_ff_f (pow, -0.1L, -1.1L, nan_value, INVALID_EXCEPTION);
@@ -3609,6 +3628,10 @@ pow_test (void)
   TEST_ff_f (pow, minus_zero, -2, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
   TEST_ff_f (pow, minus_zero, -11.1L, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
 
+  TEST_ff_f (pow, 0x1p72L, 0x1p72L, plus_infty);
+  TEST_ff_f (pow, 10, -0x1p72L, 0);
+  TEST_ff_f (pow, max_value, max_value, plus_infty);
+  TEST_ff_f (pow, 10, -max_value, 0);
 
   TEST_ff_f (pow, 0, 1, 0);
   TEST_ff_f (pow, 0, 11, 0);
@@ -3623,6 +3646,8 @@ pow_test (void)
 
   TEST_ff_f (pow, minus_zero, 2, 0);
   TEST_ff_f (pow, minus_zero, 11.1L, 0);
+  TEST_ff_f (pow, 0, plus_infty, 0);
+  TEST_ff_f (pow, minus_zero, plus_infty, 0);
 
 #ifndef TEST_INLINE
   /* pow (x, +inf) == +inf for |x| > 1.  */
@@ -3667,6 +3692,11 @@ pow_test (void)
   /* pow (-0, y) == +0 for y > 0 and not an odd integer.  */
   TEST_ff_f (pow, minus_zero, 4, 0.0);
 
+  TEST_ff_f (pow, 16, 0.25L, 2);
+  TEST_ff_f (pow, 0x1p64L, 0.125L, 256);
+  TEST_ff_f (pow, 2, 4, 16);
+  TEST_ff_f (pow, 256, 8, 0x1p64L);
+
   TEST_ff_f (pow, 0.75L, 1.25L, 0.697953644326574699205914060237425566L);
 
 #if defined TEST_DOUBLE || defined TEST_LDOUBLE
@@ -4312,12 +4342,18 @@ initialize (void)
 		       HUGE_VALL, HUGE_VAL, HUGE_VALF);
   minus_infty = CHOOSE (-HUGE_VALL, -HUGE_VAL, -HUGE_VALF,
 			-HUGE_VALL, -HUGE_VAL, -HUGE_VALF);
+  max_value = CHOOSE (LDBL_MAX, DBL_MAX, FLT_MAX,
+		      LDBL_MAX, DBL_MAX, FLT_MAX);
+  min_value = CHOOSE (LDBL_MIN, DBL_MIN, FLT_MIN,
+		      LDBL_MIN, DBL_MIN, FLT_MIN);
 
   (void) &plus_zero;
   (void) &nan_value;
   (void) &minus_zero;
   (void) &plus_infty;
   (void) &minus_infty;
+  (void) &max_value;
+  (void) &min_value;
 
   /* Clear all exceptions.  From now on we must not get random exceptions.  */
   feclearexcept (FE_ALL_EXCEPT);
--- libc/sysdeps/x86_64/fpu/e_powl.S.jj	2001-09-19 12:24:08.000000000 +0200
+++ libc/sysdeps/x86_64/fpu/e_powl.S	2004-07-14 20:02:06.717899272 +0200
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +48,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##(%rip)
@@ -87,6 +90,14 @@ ENTRY(__ieee754_powl)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fldl	MO(p63)		// 1L<<63 : y : x
+	fld	%st(1)		// y : 1L<<63 : y : x
+	fabs			// |y| : 1L<<63 : y : x
+	fcomip	%st(1), %st	// 1L<<63 : y : x
+	fstp	%st(0)		// y : x
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -105,7 +116,7 @@ ENTRY(__ieee754_powl)
 	negl	%eax
 	adcl	$0, %edx
 	negl	%edx
-4:	fldl	MO(one)		// 1 : x
+4:	fld1			// 1 : x
 	fxch
 
 6:	shrdl	$1, %edx, %eax
@@ -123,7 +134,7 @@ ENTRY(__ieee754_powl)
 
 	/* y is Â±NAN */
 30:	fldt	8(%rsp)		// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fucomip	%st(1),%st	// x : y
 	je	31f
 	fxch			// y : x
@@ -133,7 +144,7 @@ ENTRY(__ieee754_powl)
 	.align ALIGNARG(4)
 2:	/* y is a real number.  */
 	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fld	%st(1)		// x : 1.0 : x : y
 	fsub	%st(1)		// x-1 : 1.0 : x : y
 	fabs			// |x-1| : 1.0 : x : y
@@ -148,6 +159,11 @@ ENTRY(__ieee754_powl)
 
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x05, %ah      // is y*log2(x) == Â±inf ?
+	je	28f
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
@@ -158,11 +174,16 @@ ENTRY(__ieee754_powl)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	ret
 
+28:	fstp	%st(1)		// y*log2(x)
+	fld1			// 1 : y*log2(x)
+	fscale			// 2^(y*log2(x)) : y*log2(x)
+	fstp	%st(1)		// 2^(y*log2(x))
+	ret
 
 	// pow(x,Â±0) = 1
 	.align ALIGNARG(4)
 11:	fstp	%st(0)		// pop y
-	fldl	MO(one)
+	fld1
 	ret
 
 	// y == Â±inf
@@ -191,7 +212,7 @@ ENTRY(__ieee754_powl)
 	ret
 
 	.align ALIGNARG(4)
-14:	fldl	MO(one)
+14:	fld1
 	ret
 
 	.align ALIGNARG(4)
@@ -275,7 +296,7 @@ ENTRY(__ieee754_powl)
 	jz	27f		// jump if not odd
 	// It's an odd integer.
 	// Raise divide-by-zero exception and get minus infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	fchs
 	ret
@@ -283,7 +304,7 @@ ENTRY(__ieee754_powl)
 25:	fstp	%st(0)
 26:
 27:	// Raise divide-by-zero exception and get infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	ret
 
@@ -310,7 +331,7 @@ ENTRY(__ieee754_powl)
 
 22:	fstp	%st(0)
 23:
-24:	fldl	MO(zero)
+24:	fldz
 	ret
 
 END(__ieee754_powl)

	Jakub

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
  2004-07-14 18:17 [PATCH] Fix powl on x86_64 [BZ #258] Jakub Jelinek
@ 2004-07-15 11:51 ` Jakub Jelinek
  2004-07-16 17:09   ` Ulrich Drepper
  0 siblings, 1 reply; 5+ messages in thread
From: Jakub Jelinek @ 2004-07-15 11:51 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Glibc hackers

On Wed, Jul 14, 2004 at 06:02:03PM +0200, Jakub Jelinek wrote:
> i386 will need similar treatment.
> Not sure if fld*/f{,u}comi{,p}/j? is faster than fcompl/fnstw/testing %ah/j?
> or vice versa, maybe the comparisons could be coded more efficiently.

Here is an updated version which fixes i386 too.
I have done also some benchmarking and e.g. on P3 pow{f,,l} is now from 22
ticks slower to 106 ticks faster depending on input, on x86_64 (Opteron) I saw
slowdowns at most 16 ticks for powl (powf and pow aren't done in assembly,
so there is no change for x86_64), on Opteron -m32 from 10 ticks slower to
29 ticks faster.
But more importantly it is correct.

2004-07-15  Jakub Jelinek  <jakub@redhat.com>

	[BZ #258]
	* math/libm-test.inc (max_value, min_value): New variables.
	(initialize): Initialize them.
	(pow_test): Add a couple of new tests.
	* sysdeps/i386/fpu/e_powf.S (__ieee754_powf): Don't generate invalid
	exception if |y| >= 1U<<31.  Replace fldl MO(zero) with fldz and
	fldl MO(one) with fld1.
	* sysdeps/i386/fpu/e_pow.S (__ieee754_pow): Don't generate invalid
	exception if |y| >= 1L<<63.  Replace fldl MO(zero) with fldz and
	fldl MO(one) with fld1.
	* sysdeps/i386/fpu/e_powl.S (__ieee754_powl): Likewise.
	If y*log2(x) overflows to +-inf, return still +inf/+0 instead of NaN.
	* sysdeps/x86_64/fpu/e_powl.S (__ieee754_powl): Likewise.

--- libc/math/libm-test.inc.jj	2004-02-13 12:28:07.000000000 +0100
+++ libc/math/libm-test.inc	2004-07-15 12:34:17.395663150 +0200
@@ -169,7 +169,7 @@ static int output_points;	/* Should the 
 static int ignore_max_ulp;	/* Should we ignore max_ulp?  */
 
 static FLOAT minus_zero, plus_zero;
-static FLOAT plus_infty, minus_infty, nan_value;
+static FLOAT plus_infty, minus_infty, nan_value, max_value, min_value;
 
 static FLOAT max_error, real_max_error, imag_max_error;
 
@@ -3593,6 +3593,28 @@ pow_test (void)
   TEST_ff_f (pow, -1, plus_infty, 1);
   TEST_ff_f (pow, 1, minus_infty, 1);
   TEST_ff_f (pow, -1, minus_infty, 1);
+  TEST_ff_f (pow, 1, 1, 1);
+  TEST_ff_f (pow, 1, -1, 1);
+  TEST_ff_f (pow, 1, 1.25, 1);
+  TEST_ff_f (pow, 1, -1.25, 1);
+  TEST_ff_f (pow, 1, 0x1p62L, 1);
+  TEST_ff_f (pow, 1, 0x1p63L, 1);
+  TEST_ff_f (pow, 1, 0x1p64L, 1);
+  TEST_ff_f (pow, 1, 0x1p72L, 1);
+
+  /* pow (x, +-0) == 1.  */
+  TEST_ff_f (pow, plus_infty, 0, 1);
+  TEST_ff_f (pow, plus_infty, minus_zero, 1);
+  TEST_ff_f (pow, minus_infty, 0, 1);
+  TEST_ff_f (pow, minus_infty, minus_zero, 1);
+  TEST_ff_f (pow, 32.75L, 0, 1);
+  TEST_ff_f (pow, 32.75L, minus_zero, 1);
+  TEST_ff_f (pow, -32.75L, 0, 1);
+  TEST_ff_f (pow, -32.75L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p72L, 0, 1);
+  TEST_ff_f (pow, 0x1p72L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p-72L, 0, 1);
+  TEST_ff_f (pow, 0x1p-72L, minus_zero, 1);
 
   TEST_ff_f (pow, -0.1L, 1.1L, nan_value, INVALID_EXCEPTION);
   TEST_ff_f (pow, -0.1L, -1.1L, nan_value, INVALID_EXCEPTION);
@@ -3609,6 +3631,10 @@ pow_test (void)
   TEST_ff_f (pow, minus_zero, -2, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
   TEST_ff_f (pow, minus_zero, -11.1L, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
 
+  TEST_ff_f (pow, 0x1p72L, 0x1p72L, plus_infty);
+  TEST_ff_f (pow, 10, -0x1p72L, 0);
+  TEST_ff_f (pow, max_value, max_value, plus_infty);
+  TEST_ff_f (pow, 10, -max_value, 0);
 
   TEST_ff_f (pow, 0, 1, 0);
   TEST_ff_f (pow, 0, 11, 0);
@@ -3623,6 +3649,8 @@ pow_test (void)
 
   TEST_ff_f (pow, minus_zero, 2, 0);
   TEST_ff_f (pow, minus_zero, 11.1L, 0);
+  TEST_ff_f (pow, 0, plus_infty, 0);
+  TEST_ff_f (pow, minus_zero, plus_infty, 0);
 
 #ifndef TEST_INLINE
   /* pow (x, +inf) == +inf for |x| > 1.  */
@@ -3667,6 +3695,11 @@ pow_test (void)
   /* pow (-0, y) == +0 for y > 0 and not an odd integer.  */
   TEST_ff_f (pow, minus_zero, 4, 0.0);
 
+  TEST_ff_f (pow, 16, 0.25L, 2);
+  TEST_ff_f (pow, 0x1p64L, 0.125L, 256);
+  TEST_ff_f (pow, 2, 4, 16);
+  TEST_ff_f (pow, 256, 8, 0x1p64L);
+
   TEST_ff_f (pow, 0.75L, 1.25L, 0.697953644326574699205914060237425566L);
 
 #if defined TEST_DOUBLE || defined TEST_LDOUBLE
@@ -4312,12 +4345,18 @@ initialize (void)
 		       HUGE_VALL, HUGE_VAL, HUGE_VALF);
   minus_infty = CHOOSE (-HUGE_VALL, -HUGE_VAL, -HUGE_VALF,
 			-HUGE_VALL, -HUGE_VAL, -HUGE_VALF);
+  max_value = CHOOSE (LDBL_MAX, DBL_MAX, FLT_MAX,
+		      LDBL_MAX, DBL_MAX, FLT_MAX);
+  min_value = CHOOSE (LDBL_MIN, DBL_MIN, FLT_MIN,
+		      LDBL_MIN, DBL_MIN, FLT_MIN);
 
   (void) &plus_zero;
   (void) &nan_value;
   (void) &minus_zero;
   (void) &plus_infty;
   (void) &minus_infty;
+  (void) &max_value;
+  (void) &min_value;
 
   /* Clear all exceptions.  From now on we must not get random exceptions.  */
   feclearexcept (FE_ALL_EXCEPT);
--- libc/sysdeps/i386/fpu/e_powf.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_powf.S	2004-07-15 13:23:15.977438871 +0200
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1999, 2001, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +48,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p31,@object)
+p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
+	ASM_SIZE_DIRECTIVE(p31)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +99,14 @@ ENTRY(__ieee754_powf)
 
 	fxch			// y : x
 
+	/* fistpl raises invalid exception for |y| >= 1L<<31.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p31)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -113,7 +124,7 @@ ENTRY(__ieee754_powf)
 	jns	4f		// y >= 0, jump
 	fdivrl	MO(one)		// 1/x		(now referred to as x)
 	negl	%edx
-4:	fldl	MO(one)		// 1 : x
+4:	fld1			// 1 : x
 	fxch
 
 6:	shrl	$1, %edx
@@ -129,7 +140,7 @@ ENTRY(__ieee754_powf)
 
 	/* y is Â±NAN */
 30:	flds	4(%esp)		// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fucomp	%st(1)		// x : y
 	fnstsw
 	sahf
@@ -141,7 +152,7 @@ ENTRY(__ieee754_powf)
 	.align ALIGNARG(4)
 2:	/* y is a real number.  */
 	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fld	%st(1)		// x : 1.0 : x : y
 	fsub	%st(1)		// x-1 : 1.0 : x : y
 	fabs			// |x-1| : 1.0 : x : y
@@ -171,7 +182,7 @@ ENTRY(__ieee754_powf)
 	// pow(x,Â±0) = 1
 	.align ALIGNARG(4)
 11:	fstp	%st(0)		// pop y
-	fldl	MO(one)
+	fld1
 	ret
 
 	// y == Â±inf
@@ -195,7 +206,7 @@ ENTRY(__ieee754_powf)
 	ret
 
 	.align ALIGNARG(4)
-14:	fldl	MO(one)
+14:	fld1
 	ret
 
 	.align ALIGNARG(4)
@@ -276,7 +287,7 @@ ENTRY(__ieee754_powf)
 	jbe	27f		// does not fit in mantissa bits
 	// It's an odd integer.
 	// Raise divide-by-zero exception and get minus infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	fchs
 	ret
@@ -284,7 +295,7 @@ ENTRY(__ieee754_powf)
 25:	fstp	%st(0)
 26:	addl	$4, %esp
 27:	// Raise divide-by-zero exception and get infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	ret
 
@@ -314,7 +325,7 @@ ENTRY(__ieee754_powf)
 
 22:	fstp	%st(0)
 23:	addl	$4, %esp	// Don't use pop.
-24:	fldl	MO(zero)
+24:	fldz
 	ret
 
 END(__ieee754_powf)
--- libc/sysdeps/i386/fpu/e_powl.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_powl.S	2004-07-15 12:56:07.678735978 +0200
@@ -1,5 +1,6 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +49,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +100,14 @@ ENTRY(__ieee754_powl)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -116,7 +128,7 @@ ENTRY(__ieee754_powl)
 	negl	%eax
 	adcl	$0, %edx
 	negl	%edx
-4:	fldl	MO(one)		// 1 : x
+4:	fld1			// 1 : x
 	fxch
 
 6:	shrdl	$1, %edx, %eax
@@ -134,7 +146,7 @@ ENTRY(__ieee754_powl)
 
 	/* y is Â±NAN */
 30:	fldt	4(%esp)		// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fucomp	%st(1)		// x : y
 	fnstsw
 	sahf
@@ -146,7 +158,7 @@ ENTRY(__ieee754_powl)
 	.align ALIGNARG(4)
 2:	/* y is a real number.  */
 	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fld	%st(1)		// x : 1.0 : x : y
 	fsub	%st(1)		// x-1 : 1.0 : x : y
 	fabs			// |x-1| : 1.0 : x : y
@@ -161,6 +173,11 @@ ENTRY(__ieee754_powl)
 
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x05, %ah	// is y*log2(x) == Â±inf ?
+	je	28f
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
@@ -172,11 +189,17 @@ ENTRY(__ieee754_powl)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	ret
 
+28:	fstp	%st(1)		// y*log2(x)
+	fld1			// 1 : y*log2(x)
+	fscale			// 2^(y*log2(x)) : y*log2(x)
+	addl	$8, %esp
+	fstp	%st(1)		// 2^(y*log2(x))
+	ret
 
 	// pow(x,Â±0) = 1
 	.align ALIGNARG(4)
 11:	fstp	%st(0)		// pop y
-	fldl	MO(one)
+	fld1
 	ret
 
 	// y == Â±inf
@@ -200,7 +223,7 @@ ENTRY(__ieee754_powl)
 	ret
 
 	.align ALIGNARG(4)
-14:	fldl	MO(one)
+14:	fld1
 	ret
 
 	.align ALIGNARG(4)
@@ -273,7 +296,7 @@ ENTRY(__ieee754_powl)
 	jz	27f		// jump if not odd
 	// It's an odd integer.
 	// Raise divide-by-zero exception and get minus infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	fchs
 	ret
@@ -281,7 +304,7 @@ ENTRY(__ieee754_powl)
 25:	fstp	%st(0)
 26:	addl	$8, %esp
 27:	// Raise divide-by-zero exception and get infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	ret
 
@@ -309,7 +332,7 @@ ENTRY(__ieee754_powl)
 
 22:	fstp	%st(0)
 23:	addl	$8, %esp	// Don't use 2 x pop
-24:	fldl	MO(zero)
+24:	fldz
 	ret
 
 END(__ieee754_powl)
--- libc/sysdeps/i386/fpu/e_pow.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_pow.S	2004-07-15 13:05:41.601640612 +0200
@@ -1,5 +1,6 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +49,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +100,14 @@ ENTRY(__ieee754_pow)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -116,7 +128,7 @@ ENTRY(__ieee754_pow)
 	negl	%eax
 	adcl	$0, %edx
 	negl	%edx
-4:	fldl	MO(one)		// 1 : x
+4:	fld1			// 1 : x
 	fxch
 
 6:	shrdl	$1, %edx, %eax
@@ -134,7 +146,7 @@ ENTRY(__ieee754_pow)
 
 	/* y is Â±NAN */
 30:	fldl	4(%esp)		// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fucomp	%st(1)		// x : y
 	fnstsw
 	sahf
@@ -146,7 +158,7 @@ ENTRY(__ieee754_pow)
 	.align ALIGNARG(4)
 2:	/* y is a real number.  */
 	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fld	%st(1)		// x : 1.0 : x : y
 	fsub	%st(1)		// x-1 : 1.0 : x : y
 	fabs			// |x-1| : 1.0 : x : y
@@ -176,7 +188,7 @@ ENTRY(__ieee754_pow)
 	// pow(x,Â±0) = 1
 	.align ALIGNARG(4)
 11:	fstp	%st(0)		// pop y
-	fldl	MO(one)
+	fld1
 	ret
 
 	// y == Â±inf
@@ -200,7 +212,7 @@ ENTRY(__ieee754_pow)
 	ret
 
 	.align ALIGNARG(4)
-14:	fldl	MO(one)
+14:	fld1
 	ret
 
 	.align ALIGNARG(4)
@@ -283,7 +295,7 @@ ENTRY(__ieee754_pow)
 	jbe	27f		// does not fit in mantissa bits
 	// It's an odd integer.
 	// Raise divide-by-zero exception and get minus infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	fchs
 	ret
@@ -291,7 +303,7 @@ ENTRY(__ieee754_pow)
 25:	fstp	%st(0)
 26:	addl	$8, %esp
 27:	// Raise divide-by-zero exception and get infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	ret
 
@@ -322,7 +334,7 @@ ENTRY(__ieee754_pow)
 
 22:	fstp	%st(0)
 23:	addl	$8, %esp	// Don't use 2 x pop
-24:	fldl	MO(zero)
+24:	fldz
 	ret
 
 END(__ieee754_pow)
--- libc/sysdeps/x86_64/fpu/e_powl.S.jj	2001-09-19 12:24:08.000000000 +0200
+++ libc/sysdeps/x86_64/fpu/e_powl.S	2004-07-14 20:02:06.000000000 +0200
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +48,10 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:
+	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##(%rip)
@@ -87,6 +91,14 @@ ENTRY(__ieee754_powl)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fldl	MO(p63)		// 1L<<63 : y : x
+	fld	%st(1)		// y : 1L<<63 : y : x
+	fabs			// |y| : 1L<<63 : y : x
+	fcomip	%st(1), %st	// 1L<<63 : y : x
+	fstp	%st(0)		// y : x
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -105,7 +117,7 @@ ENTRY(__ieee754_powl)
 	negl	%eax
 	adcl	$0, %edx
 	negl	%edx
-4:	fldl	MO(one)		// 1 : x
+4:	fld1			// 1 : x
 	fxch
 
 6:	shrdl	$1, %edx, %eax
@@ -123,7 +135,7 @@ ENTRY(__ieee754_powl)
 
 	/* y is Â±NAN */
 30:	fldt	8(%rsp)		// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fucomip	%st(1),%st	// x : y
 	je	31f
 	fxch			// y : x
@@ -133,7 +145,7 @@ ENTRY(__ieee754_powl)
 	.align ALIGNARG(4)
 2:	/* y is a real number.  */
 	fxch			// x : y
-	fldl	MO(one)		// 1.0 : x : y
+	fld1			// 1.0 : x : y
 	fld	%st(1)		// x : 1.0 : x : y
 	fsub	%st(1)		// x-1 : 1.0 : x : y
 	fabs			// |x-1| : 1.0 : x : y
@@ -148,6 +160,11 @@ ENTRY(__ieee754_powl)
 
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x05, %ah      // is y*log2(x) == Â±inf ?
+	je	28f
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
@@ -158,11 +175,16 @@ ENTRY(__ieee754_powl)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	ret
 
+28:	fstp	%st(1)		// y*log2(x)
+	fld1			// 1 : y*log2(x)
+	fscale			// 2^(y*log2(x)) : y*log2(x)
+	fstp	%st(1)		// 2^(y*log2(x))
+	ret
 
 	// pow(x,Â±0) = 1
 	.align ALIGNARG(4)
 11:	fstp	%st(0)		// pop y
-	fldl	MO(one)
+	fld1
 	ret
 
 	// y == Â±inf
@@ -191,7 +213,7 @@ ENTRY(__ieee754_powl)
 	ret
 
 	.align ALIGNARG(4)
-14:	fldl	MO(one)
+14:	fld1
 	ret
 
 	.align ALIGNARG(4)
@@ -275,7 +297,7 @@ ENTRY(__ieee754_powl)
 	jz	27f		// jump if not odd
 	// It's an odd integer.
 	// Raise divide-by-zero exception and get minus infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	fchs
 	ret
@@ -283,7 +305,7 @@ ENTRY(__ieee754_powl)
 25:	fstp	%st(0)
 26:
 27:	// Raise divide-by-zero exception and get infinity value.
-	fldl	MO(one)
+	fld1
 	fdivl	MO(zero)
 	ret
 
@@ -310,7 +332,7 @@ ENTRY(__ieee754_powl)
 
 22:	fstp	%st(0)
 23:
-24:	fldl	MO(zero)
+24:	fldz
 	ret
 
 END(__ieee754_powl)

	Jakub

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
  2004-07-15 11:51 ` [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2) Jakub Jelinek
@ 2004-07-16 17:09   ` Ulrich Drepper
  2004-07-19 10:19     ` Jakub Jelinek
  0 siblings, 1 reply; 5+ messages in thread
From: Ulrich Drepper @ 2004-07-16 17:09 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Glibc hackers

Jakub Jelinek wrote:

> -4:	fldl	MO(one)		// 1 : x
> +4:	fld1			// 1 : x

I deliberately avoided the use of fld1 since all my measurements showed
it is slower.  Please some some measurements of your own.  As far as I
remember the only time when it paid off to use fld1 is when the fld was
the only memory operation and therefore it could be avoided to load the
PIC register.

-- 
âž§ Ulrich Drepper âž§ Red Hat, Inc. âž§ 444 Castro St âž§ Mountain View, CA â–

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
  2004-07-16 17:09   ` Ulrich Drepper
@ 2004-07-19 10:19     ` Jakub Jelinek
  2004-07-20  7:07       ` Ulrich Drepper
  0 siblings, 1 reply; 5+ messages in thread
From: Jakub Jelinek @ 2004-07-19 10:19 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: Glibc hackers

On Fri, Jul 16, 2004 at 10:07:16AM -0700, Ulrich Drepper wrote:
> Jakub Jelinek wrote:
> 
> > -4:	fldl	MO(one)		// 1 : x
> > +4:	fld1			// 1 : x
> 
> I deliberately avoided the use of fld1 since all my measurements showed
> it is slower.  Please some some measurements of your own.  As far as I
> remember the only time when it paid off to use fld1 is when the fld was
> the only memory operation and therefore it could be avoided to load the
> PIC register.

If there are no other memory references, it is surely noticeably faster,
otherwise I see it the same speed (but the instruction is shorter).
With following proglet I get 10 cycles in both cases on Opteron,
42 on P3, 92 on P4.  AFAIK GCC uses fld1 and fldz in the code it generates
in all cases.

#include <stdio.h>
#include <math.h>

extern long double foo1 (void);
extern long double foo2 (void);
asm (".balign 16; foo1: .section .rodata; .balign 16; mzero: .byte 0, 0, 0, 0, 0, 0, 0, 0x80; .previous; fldz; faddl mzero; fxam; fnstsw; ret;");
asm (".balign 16; foo2: .section .rodata; .balign 16; zero: .double 0.0; .previous; fldl zero; faddl mzero; fxam; fnstsw; ret;");

int main (void)
{
  unsigned long long st, en, m;
  int i;
  m = -1ULL;
  for (i = 0; i < 1000; ++i)
    {
      asm volatile ("rdtsc" : "=A" (st));
      foo1 ();
      asm volatile ("rdtsc" : "=A" (en));
      en -= st;
      if (en < m) m = en;
    }
  printf ("%lld\n", m);
  m = -1ULL;
  for (i = 0; i < 1000; ++i)
    {
      asm volatile ("rdtsc" : "=A" (st));
      foo2 ();
      asm volatile ("rdtsc" : "=A" (en));
      en -= st;
      if (en < m) m = en;
    }
  printf ("%lld\n", m);
  return 0;
}

Anyway, below is a patch variant without fldz and fld1, those changes
weren't fixing anything and thus can be a separate change which can be
argued about independently.

2004-07-19  Jakub Jelinek  <jakub@redhat.com>

	[BZ #258]
	* math/libm-test.inc (max_value, min_value): New variables.
	(initialize): Initialize them.
	(pow_test): Add a couple of new tests.
	* sysdeps/i386/fpu/e_powf.S (__ieee754_powf): Don't generate invalid
	exception if |y| >= 1U<<31.
	* sysdeps/i386/fpu/e_pow.S (__ieee754_pow): Don't generate invalid
	exception if |y| >= 1L<<63.
	* sysdeps/i386/fpu/e_powl.S (__ieee754_powl): Likewise.
	If y*log2(x) overflows to +-inf, return still +inf/+0 instead of NaN.
	* sysdeps/x86_64/fpu/e_powl.S (__ieee754_powl): Likewise.

--- libc/math/libm-test.inc.jj	2004-02-13 12:28:07.000000000 +0100
+++ libc/math/libm-test.inc	2004-07-15 12:34:17.395663150 +0200
@@ -169,7 +169,7 @@ static int output_points;	/* Should the 
 static int ignore_max_ulp;	/* Should we ignore max_ulp?  */
 
 static FLOAT minus_zero, plus_zero;
-static FLOAT plus_infty, minus_infty, nan_value;
+static FLOAT plus_infty, minus_infty, nan_value, max_value, min_value;
 
 static FLOAT max_error, real_max_error, imag_max_error;
 
@@ -3593,6 +3593,28 @@ pow_test (void)
   TEST_ff_f (pow, -1, plus_infty, 1);
   TEST_ff_f (pow, 1, minus_infty, 1);
   TEST_ff_f (pow, -1, minus_infty, 1);
+  TEST_ff_f (pow, 1, 1, 1);
+  TEST_ff_f (pow, 1, -1, 1);
+  TEST_ff_f (pow, 1, 1.25, 1);
+  TEST_ff_f (pow, 1, -1.25, 1);
+  TEST_ff_f (pow, 1, 0x1p62L, 1);
+  TEST_ff_f (pow, 1, 0x1p63L, 1);
+  TEST_ff_f (pow, 1, 0x1p64L, 1);
+  TEST_ff_f (pow, 1, 0x1p72L, 1);
+
+  /* pow (x, +-0) == 1.  */
+  TEST_ff_f (pow, plus_infty, 0, 1);
+  TEST_ff_f (pow, plus_infty, minus_zero, 1);
+  TEST_ff_f (pow, minus_infty, 0, 1);
+  TEST_ff_f (pow, minus_infty, minus_zero, 1);
+  TEST_ff_f (pow, 32.75L, 0, 1);
+  TEST_ff_f (pow, 32.75L, minus_zero, 1);
+  TEST_ff_f (pow, -32.75L, 0, 1);
+  TEST_ff_f (pow, -32.75L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p72L, 0, 1);
+  TEST_ff_f (pow, 0x1p72L, minus_zero, 1);
+  TEST_ff_f (pow, 0x1p-72L, 0, 1);
+  TEST_ff_f (pow, 0x1p-72L, minus_zero, 1);
 
   TEST_ff_f (pow, -0.1L, 1.1L, nan_value, INVALID_EXCEPTION);
   TEST_ff_f (pow, -0.1L, -1.1L, nan_value, INVALID_EXCEPTION);
@@ -3609,6 +3631,10 @@ pow_test (void)
   TEST_ff_f (pow, minus_zero, -2, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
   TEST_ff_f (pow, minus_zero, -11.1L, plus_infty, DIVIDE_BY_ZERO_EXCEPTION);
 
+  TEST_ff_f (pow, 0x1p72L, 0x1p72L, plus_infty);
+  TEST_ff_f (pow, 10, -0x1p72L, 0);
+  TEST_ff_f (pow, max_value, max_value, plus_infty);
+  TEST_ff_f (pow, 10, -max_value, 0);
 
   TEST_ff_f (pow, 0, 1, 0);
   TEST_ff_f (pow, 0, 11, 0);
@@ -3623,6 +3649,8 @@ pow_test (void)
 
   TEST_ff_f (pow, minus_zero, 2, 0);
   TEST_ff_f (pow, minus_zero, 11.1L, 0);
+  TEST_ff_f (pow, 0, plus_infty, 0);
+  TEST_ff_f (pow, minus_zero, plus_infty, 0);
 
 #ifndef TEST_INLINE
   /* pow (x, +inf) == +inf for |x| > 1.  */
@@ -3667,6 +3695,11 @@ pow_test (void)
   /* pow (-0, y) == +0 for y > 0 and not an odd integer.  */
   TEST_ff_f (pow, minus_zero, 4, 0.0);
 
+  TEST_ff_f (pow, 16, 0.25L, 2);
+  TEST_ff_f (pow, 0x1p64L, 0.125L, 256);
+  TEST_ff_f (pow, 2, 4, 16);
+  TEST_ff_f (pow, 256, 8, 0x1p64L);
+
   TEST_ff_f (pow, 0.75L, 1.25L, 0.697953644326574699205914060237425566L);
 
 #if defined TEST_DOUBLE || defined TEST_LDOUBLE
@@ -4312,12 +4345,18 @@ initialize (void)
 		       HUGE_VALL, HUGE_VAL, HUGE_VALF);
   minus_infty = CHOOSE (-HUGE_VALL, -HUGE_VAL, -HUGE_VALF,
 			-HUGE_VALL, -HUGE_VAL, -HUGE_VALF);
+  max_value = CHOOSE (LDBL_MAX, DBL_MAX, FLT_MAX,
+		      LDBL_MAX, DBL_MAX, FLT_MAX);
+  min_value = CHOOSE (LDBL_MIN, DBL_MIN, FLT_MIN,
+		      LDBL_MIN, DBL_MIN, FLT_MIN);
 
   (void) &plus_zero;
   (void) &nan_value;
   (void) &minus_zero;
   (void) &plus_infty;
   (void) &minus_infty;
+  (void) &max_value;
+  (void) &min_value;
 
   /* Clear all exceptions.  From now on we must not get random exceptions.  */
   feclearexcept (FE_ALL_EXCEPT);
--- libc/sysdeps/i386/fpu/e_powf.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_powf.S	2004-07-15 13:23:15.977438871 +0200
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1999, 2001, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +48,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p31,@object)
+p31:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
+	ASM_SIZE_DIRECTIVE(p31)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +99,14 @@ ENTRY(__ieee754_powf)
 
 	fxch			// y : x
 
+	/* fistpl raises invalid exception for |y| >= 1L<<31.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p31)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
--- libc/sysdeps/i386/fpu/e_powl.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_powl.S	2004-07-15 12:56:07.678735978 +0200
@@ -1,5 +1,6 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +49,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +100,14 @@ ENTRY(__ieee754_powl)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -161,6 +173,11 @@ ENTRY(__ieee754_powl)
 
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x05, %ah	// is y*log2(x) == Â±inf ?
+	je	28f
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
@@ -172,6 +189,12 @@ ENTRY(__ieee754_powl)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	ret
 
+28:	fstp	%st(1)		// y*log2(x)
+	fldl	MO(one)		// 1 : y*log2(x)
+	fscale			// 2^(y*log2(x)) : y*log2(x)
+	addl	$8, %esp
+	fstp	%st(1)		// 2^(y*log2(x))
+	ret
 
 	// pow(x,Â±0) = 1
 	.align ALIGNARG(4)
--- libc/sysdeps/i386/fpu/e_pow.S.jj	2001-07-06 06:55:53.000000000 +0200
+++ libc/sysdeps/i386/fpu/e_pow.S	2004-07-15 13:05:41.601640612 +0200
@@ -1,5 +1,6 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004
+   Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +49,9 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##@GOTOFF(%ecx)
@@ -96,6 +100,14 @@ ENTRY(__ieee754_pow)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fld	%st		// y : y : x
+	fabs			// |y| : y : x
+	fcompl	MO(p63)		// y : x
+	fnstsw
+	sahf
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
--- libc/sysdeps/x86_64/fpu/e_powl.S.jj	2001-09-19 12:24:08.000000000 +0200
+++ libc/sysdeps/x86_64/fpu/e_powl.S	2004-07-14 20:02:06.000000000 +0200
@@ -1,5 +1,5 @@
 /* ix87 specific implementation of pow function.
-   Copyright (C) 1996, 1997, 1998, 1999, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1997, 1998, 1999, 2001, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
 
@@ -48,6 +48,10 @@ one:	.double 1.0
 	ASM_TYPE_DIRECTIVE(limit,@object)
 limit:	.double 0.29
 	ASM_SIZE_DIRECTIVE(limit)
+	ASM_TYPE_DIRECTIVE(p63,@object)
+p63:
+	.byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43
+	ASM_SIZE_DIRECTIVE(p63)
 
 #ifdef PIC
 #define MO(op) op##(%rip)
@@ -87,6 +91,14 @@ ENTRY(__ieee754_powl)
 
 	fxch			// y : x
 
+	/* fistpll raises invalid exception for |y| >= 1L<<63.  */
+	fldl	MO(p63)		// 1L<<63 : y : x
+	fld	%st(1)		// y : 1L<<63 : y : x
+	fabs			// |y| : 1L<<63 : y : x
+	fcomip	%st(1), %st	// 1L<<63 : y : x
+	fstp	%st(0)		// y : x
+	jnc	2f
+
 	/* First see whether `y' is a natural number.  In this case we
 	   can use a more precise algorithm.  */
 	fld	%st		// y : y : x
@@ -148,6 +160,11 @@ ENTRY(__ieee754_powl)
 
 7:	fyl2x			// log2(x) : y
 8:	fmul	%st(1)		// y*log2(x) : y
+	fxam
+	fnstsw
+	andb	$0x45, %ah
+	cmpb	$0x05, %ah      // is y*log2(x) == Â±inf ?
+	je	28f
 	fst	%st(1)		// y*log2(x) : y*log2(x)
 	frndint			// int(y*log2(x)) : y*log2(x)
 	fsubr	%st, %st(1)	// int(y*log2(x)) : fract(y*log2(x))
@@ -158,6 +175,11 @@ ENTRY(__ieee754_powl)
 	fstp	%st(1)		// 2^fract(y*log2(x))*2^int(y*log2(x))
 	ret
 
+28:	fstp	%st(1)		// y*log2(x)
+	fldl	MO(one)		// 1 : y*log2(x)
+	fscale			// 2^(y*log2(x)) : y*log2(x)
+	fstp	%st(1)		// 2^(y*log2(x))
+	ret
 
 	// pow(x,Â±0) = 1
 	.align ALIGNARG(4)


	Jakub

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2)
  2004-07-19 10:19     ` Jakub Jelinek
@ 2004-07-20  7:07       ` Ulrich Drepper
  0 siblings, 0 replies; 5+ messages in thread
From: Ulrich Drepper @ 2004-07-20  7:07 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Glibc hackers

Applied.

And it might very well be that fld1 nowadays is as fast as the memory
load.  When I wrote the code I used an i386+i387.

-- 
âž§ Ulrich Drepper âž§ Red Hat, Inc. âž§ 444 Castro St âž§ Mountain View, CA â–

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2004-07-20  7:07 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-07-14 18:17 [PATCH] Fix powl on x86_64 [BZ #258] Jakub Jelinek
2004-07-15 11:51 ` [PATCH] Fix powl on i386/x86_64 [BZ #258] (take 2) Jakub Jelinek
2004-07-16 17:09   ` Ulrich Drepper
2004-07-19 10:19     ` Jakub Jelinek
2004-07-20  7:07       ` Ulrich Drepper

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).