public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* i386 rounding modes and workaround solution
@ 2001-09-10  7:48 Han-Kwang Nienhuys
  2001-09-10  7:53 ` Jan Hubicka
  2001-09-10  8:24 ` Tim Prince
  0 siblings, 2 replies; 5+ messages in thread
From: Han-Kwang Nienhuys @ 2001-09-10  7:48 UTC (permalink / raw)
  To: gcc

I recently found out that

1. converting float or double to int in gcc or g++ on an x86 platform
  is incredibly slow

2. the functions floor(), ceil() in glibc math.h are incredibly slow as
  well.

Point 1 is documented somewhere on a 'known deficiencies list' for
gcc; I didn't find much about point 2 on the web. This issue has
come up several times on the GNU mailing lists, but without a
practical solution.

The cause is that the program code must change the rounding mode of
the floating-point unit with the FLDCW instruction, which is very
slow. Note that the libc function double rint(double) defined in
<math.h> is supposed to round according to the current rounding mode,
but is - in my version of glibc (2.2.0) - implemented as a function
call instead of an inline function, which is not very efficient either.

In a program for numerical calculations, almost all time was spent in
a code line similar to

  int index = (int) floor(x*inverse_step);

I was able to increase the speed of this program by a factor 4.2 using
the functions defined below.

At the bottom of this message is a header file I wrote that provides
the necessary inline functions for someone who needs to do a lot of
float->int conversions on an x86 platform.

I am not very experienced in assembler, nor do I read this list. Send
any comments directly to my email address.

Han-Kwang Nienhuys
FOM Institute voor Atomic and Molecular Physics, Amsterdam, The Netherlands
http://www.amolf.nl/

================== start of code  ===================

/*********************************************************************
fastround.h

Workaround for slow gcc/g++/glibc float->int conversions on intel x86
written by: Han-Kwang Nienhuys, Sept. 2001 (h.nienhuys@amolf.NOSPAM.nl)
copyright: public domain

The Intel x86 floating-point unit has several different
float->int rounding modes, which are selected with bits in the
fpu control word (not the status word).

Unfortunately, changing the control word with the fldcw instruction
is very slow; with gcc and glibc, this happens with

  floor()    (glibc),
  ceil()     (glibc),
  typecast to int (gcc/g++).

For example, the C code

  int index = (int) floor(x*inverse_step);

generates 4 FLDCW instructions! Each of them destroys the floating-point
pipeline; this line takes about 134 clock cycles on my Intel Pentium III.
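Replacing this line by

  int index = fastf2i(x*inverse_step);

increased the speed of a program (numerical integration with frequent use
of look-up tables) by a factor 4.2. Note that fastf2i() rounds according
to the current rounding mode, so floor mode must be selected beforehand
(e.g. with fpu_set_roundfloor()) to get the same result as floor().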

This header file provides the following inlined functions:

  double fastfround(double x);
  float fastfround_f(float x);
  long double fastfround_l(long double x);
   - round according to current rounding mode

  int fastf2i(double x);
  int fastf2i_f(float x);
  int fastf2i_l(long double x);
   - round according to current rounding mode and convert to int

  void fpu_set_cw(unsigned short int cw);
   - set fpu control word

  unsigned short fpu_get_cw();
   - get current fpu control word

  unsigned short fpu_set_roundtrunc();
   - set to truncate mode, return original control word

  unsigned short fpu_set_roundceil();
   - set to ceiling mode, return original control word

  unsigned short fpu_set_roundfloor();
   - set to floor mode, return original control word

  unsigned short fpu_set_roundnormal();
   - set to normal mode, return original control word

Note: Several functions in glibc2 <math.h> assume that the fpu control
word is NOT changed; always restore the original value before calling them.
The dangerous functions are:  pow2(), expm1(), exp(), pow(), rint().
There may be more of them, so be careful.

Behaviour according to the table below.
For normal mode: n+0.5 rounds to the closest even integer.

             -1.50 -1.00 -0.51 -0.50 -0.49  0.49  0.50  0.51  1.00 1.50
  normal     -2    -1    -1     0     0     0     0     1     1    2
  ceiling    -1    -1     0     0     0     1     1     1     1    2
  floor      -2    -1    -1    -1    -1     0     0     0     1    1
  truncate   -1    -1     0     0     0     0     0     0     1    1

*******************************************************************/
 


/* Include guard: the body of this header is processed at most once per
   translation unit.
   NOTE(review): identifiers that start with an underscore followed by an
   uppercase letter are reserved for the implementation; consider renaming
   the guard macro. */
#ifndef _FASTROUND_H_
#define _FASTROUND_H_

/* The functions below are written in x87 inline assembly, so compilation is
   refused unless the compiler defines __i386__. */
#ifndef __i386__
#error These functions are only available for i386-class machines!
#endif

/* round float to an integral float value, for each floating-point type,
   according to the current rounding mode */

static __inline double
fastfround (double x)
{
  /* Widen to the x87 register format so that a single read-write operand
     on the top of the FPU stack serves as both input and output. */
  long double st0 = x;

  /* frndint rounds ST(0) to an integral value using the rounding mode that
     is currently selected in the FPU control word. */
  __asm__ __volatile__ ("frndint" : "+t" (st0));

  return (double) st0;
}

static __inline float
fastfround_f (float x)
{
  /* Widen to the x87 register format so that a single read-write operand
     on the top of the FPU stack serves as both input and output. */
  long double st0 = x;

  /* frndint rounds ST(0) to an integral value using the rounding mode that
     is currently selected in the FPU control word. */
  __asm__ __volatile__ ("frndint" : "+t" (st0));

  return (float) st0;
}

static __inline long double
fastfround_l (long double x)
{
  /* The argument already has the x87 register type, so it is rounded in
     place on the top of the FPU stack.  frndint honours the rounding mode
     currently selected in the FPU control word. */
  __asm__ __volatile__ ("frndint" : "+t" (x));

  return x;
}

/* round float to int, for each floating-point type, according to the
   current rounding mode */

static __inline int
fastf2i (double x)
{
  int stored;

  /* fistl writes ST(0) to memory as a 32-bit integer without popping the
     FPU stack; the conversion uses the rounding mode currently selected in
     the FPU control word. */
  __asm__ __volatile__ ("fistl %0" : "=m" (stored) : "t" (x));

  return stored;
}

static __inline int
fastf2i_f (float x)
{
  int stored;

  /* fistl writes ST(0) to memory as a 32-bit integer without popping the
     FPU stack; the conversion uses the rounding mode currently selected in
     the FPU control word. */
  __asm__ __volatile__ ("fistl %0" : "=m" (stored) : "t" (x));

  return stored;
}

static __inline int
fastf2i_l (long double x)
{
  int stored;

  /* fistl writes ST(0) to memory as a 32-bit integer without popping the
     FPU stack; the conversion uses the rounding mode currently selected in
     the FPU control word. */
  __asm__ __volatile__ ("fistl %0" : "=m" (stored) : "t" (x));

  return stored;
}

/* Read the current x87 FPU control word (the register that fldcw loads and
   that holds, among other things, the rounding mode).
   Returns the 16-bit control word; the FPU state is not modified. */

static __inline unsigned short int
fpu_get_cw (void)
{
  unsigned short int cw;

  /* fnstcw stores the control word to memory without first checking for
     pending floating-point exceptions. */
  __asm__ __volatile__ ("fnstcw %0" : "=m" (cw));

  return cw;
}

/* Load CW into the x87 FPU control word.  Typically used to restore a value
   previously obtained from fpu_get_cw() or one of the fpu_set_round*()
   functions. */

static __inline void
fpu_set_cw(unsigned short int cw)
{
  /* fldcw takes its operand from memory, hence the "m" constraint. */
  const unsigned short int new_cw = cw;

  __asm__ __volatile__ ("fldcw %0" : : "m" (new_cw));
}

/* Select the rounding mode equivalent to ceil() (round towards +infinity).
   Returns the control word that was in effect before the change, so the
   caller can restore it with fpu_set_cw(). */

static __inline unsigned short int
fpu_set_roundceil (void)
{
  /* Bits 10-11 of the control word select the rounding mode. */
  enum { RC_MASK = 0x0c00, RC_CEIL = 0x0800 };
  const unsigned short int saved = fpu_get_cw();

  /* Replace only the rounding-mode bits; every other bit is kept. */
  fpu_set_cw((unsigned short int) ((saved & ~RC_MASK) | RC_CEIL));
  return saved;
}

/* Select the rounding mode equivalent to floor() (round towards -infinity).
   Returns the control word that was in effect before the change, so the
   caller can restore it with fpu_set_cw(). */

static __inline unsigned short int
fpu_set_roundfloor (void)
{
  /* Bits 10-11 of the control word select the rounding mode. */
  enum { RC_MASK = 0x0c00, RC_FLOOR = 0x0400 };
  const unsigned short int saved = fpu_get_cw();

  /* Replace only the rounding-mode bits; every other bit is kept. */
  fpu_set_cw((unsigned short int) ((saved & ~RC_MASK) | RC_FLOOR));
  return saved;
}

/* Select the rounding mode equivalent to a C (int) cast (round towards
   zero).  Returns the control word that was in effect before the change,
   so the caller can restore it with fpu_set_cw(). */

static __inline unsigned short int
fpu_set_roundtrunc (void)
{
  /* Bits 10-11 of the control word select the rounding mode. */
  enum { RC_MASK = 0x0c00, RC_TRUNC = 0x0c00 };
  const unsigned short int saved = fpu_get_cw();

  /* Replace only the rounding-mode bits; every other bit is kept. */
  fpu_set_cw((unsigned short int) ((saved & ~RC_MASK) | RC_TRUNC));
  return saved;
}

/* Select round-to-nearest (the "normal" mode).  Exact halves go to the
   nearest even integer, so both 0.5 and -0.5 become 0.
   Returns the control word that was in effect before the change, so the
   caller can restore it with fpu_set_cw(). */

static __inline unsigned short int
fpu_set_roundnormal (void)
{
  /* Bits 10-11 of the control word select the rounding mode. */
  enum { RC_MASK = 0x0c00, RC_NEAREST = 0x0000 };
  const unsigned short int saved = fpu_get_cw();

  /* Replace only the rounding-mode bits; every other bit is kept. */
  fpu_set_cw((unsigned short int) ((saved & ~RC_MASK) | RC_NEAREST));
  return saved;
}

#endif

/* this section is a small test of the various settings

#define NX 9
volatile double x[NX] = { -1, -0.51, -0.5, -0.49, 0.0, 0.49, 0.50, 0.51, 1.0 };

void printresults() {
  int i;
  printf("%-12s", "fastround");
  for (i=0; i<NX; ++i)
    printf(" %6.2f", fastfround(x[i]));
  printf("\n");
  printf("%-12s", "fastf2i");
  for (i=0; i<NX; ++i)
    printf(" %6d", fastf2i(x[i]));
  printf("\n");
}


int main()
{
  int i, cworig;
  
  printf("%-12s", "original");
  for (i=0; i<NX; ++i)
    printf(" %6.2f", x[i]);

  printf("\n--std cw--\n");
  printresults();
  cworig = fpu_get_cw();
  printf("cw=%04x\n", cworig);

  printf("\n--ceil--\n");
  fpu_set_roundceil();
  printresults();
  fpu_set_cw(cworig);

  printf("\n--floor--\n");
  fpu_set_roundfloor();
  printresults();
  fpu_set_cw(cworig);

  printf("\n--(int)--\n");
  fpu_set_roundtrunc();
  printresults();
  fpu_set_cw(cworig);

  printf("\n--normal?--\n");
  fpu_set_roundnormal();
  printresults();
  fpu_set_cw(cworig);

  return 0;  
}

*/






   
    

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2001-09-11  8:36 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-09-10  7:48 i386 rounding modes and workaround solution Han-Kwang Nienhuys
2001-09-10  7:53 ` Jan Hubicka
2001-09-10  8:24 ` Tim Prince
2001-09-11  2:13   ` Jan Hubicka
2001-09-11  8:36   ` Han-Kwang Nienhuys

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).