public inbox for newlib@sourceware.org
From: Steven Abner <pheonix.sja@att.net>
To: Newlib <newlib@sourceware.org>
Subject: Re: gcc 11.1.0: printf("%.43f\n", 0x1.52f8a8e32e982p-140): printed value is incorrectly rounded
Date: Sat, 06 Nov 2021 19:25:08 -0400	[thread overview]
Message-ID: <1636241108.8268.0@smtp.mail.att.net> (raw)
In-Reply-To: <1636200365.25270.0@smtp.mail.att.net>

[-- Attachment #1: Type: text/plain, Size: 832 bytes --]


On Sat, Nov 6, 2021 at 8:06 AM, Steven Abner <pheonix.sja@att.net> 
wrote:
>  sample use:
> steven@steven-ryzen:~/Development/Projects$ gcc  newlib_printf_demo.c 
> -o nldemo
> steven@steven-ryzen:~/Development/Projects$ ./nldemo
> 0.0000000000000000000000000000000000000000009
> 3.14159
> 
> Steve

  At least I'll head off the list comments:
Apologies, I hadn't synced my files, so what I sent was an earlier version
lacking the full long double abstraction and full 128-bit mantissas. This
version has easy switching for testing between mantissas, and it also voids
the earlier ldtostr.c, since that was non-synced as well. This version is
supposed to allow running quadruples alongside the long double abstraction.
If this gets serious consideration, I can provide strtold for total
independence from gdtoa. This is a big leap, so TALK amongst yourselves.
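
For reference, the mantissa switch is just the commented defines near the
top of the attached file:

//#undef __LDBL_MANT_DIG__
//#define __LDBL_MANT_DIG__ 113

Uncommenting those (the 113 line, for example) selects the quadruple
(LD_SUBST == octle) paths even on a machine whose long double is the
80-bit type.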

Steve



[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: newlib_printf_demo.c --]
[-- Type: text/x-csrc, Size: 84620 bytes --]

#include <stdarg.h>

/* must define a data model */
#ifndef __LP64__
 #define __LP32__  1
#endif
/* found out 24 Jan 2019 from new computer build */
/* gcc does not define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ */
#ifndef __LITTLE_ENDIAN__
 #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
  #define __LITTLE_ENDIAN__
 #else
  #define __BIG_ENDIAN__
 #endif
#endif

typedef signed char         int8_t;
typedef unsigned char       uint8_t;
typedef short               int16_t;
typedef unsigned short      uint16_t;
typedef int                 int32_t;
typedef unsigned int        uint32_t;
typedef long long           int64_t;
typedef unsigned long long  uint64_t;

#if __SIZEOF_INT128__ == 16
typedef __int128            int128_t;
typedef unsigned __int128   uint128_t;
#endif

typedef union {
#if __SIZEOF_INT128__ == 16
 uint128_t   uo;
#endif
#ifdef __BIG_ENDIAN__
 struct { uint64_t ull1, ull0; } ulls;
 struct { uint32_t ul3, ul2, ul1, ul0; } uls;
 struct { uint16_t us7, us6, us5, us4, us3, us2, us1, us0; } uss;
#else  /* __LITTLE_ENDIAN__ */
 struct { uint64_t ull0, ull1; } ulls;
 struct { uint32_t ul0, ul1, ul2, ul3; } uls;
 struct { uint16_t us0, us1, us2, us3, us4, us5, us6, us7; } uss;
#endif
} octle;
/* defined in as_stdio.h */
#define quadruple octle

/* gcc appears to define only _LP64 and __LP64__ (2019) */
/* only models with 64bit ints */
#if defined (__ILP64__) || defined (__SILP64__)
typedef  int64_t      intmax_t;
typedef uint64_t     uintmax_t;
#elif defined (__LP32__)
typedef  int16_t      intmax_t;
typedef uint16_t     uintmax_t;
#else /* __LP64__ has 32bit int */
typedef  int32_t      intmax_t;
typedef uint32_t     uintmax_t;
#endif

/* these are all register size based */
#if defined (__LP32__) || defined (__ILP32__)
typedef  int32_t      intptr_t;
typedef uint32_t     uintptr_t;
#define __SIZEOF_REGISTER_T__ 4
#define __SIZEOF_SIZE_T__ 4
#else
typedef  int64_t      intptr_t;
typedef uint64_t     uintptr_t;
#define __SIZEOF_REGISTER_T__ 8
#define __SIZEOF_SIZE_T__ 8
#endif

typedef  intptr_t   register_t;
typedef  intptr_t    _intptr_t;
typedef uintptr_t   u_intptr_t;
typedef  intptr_t      ssize_t;   /* Used for a count of bytes or an error indication. */
typedef uintptr_t       size_t;   /* Used for sizes of objects. */
typedef  intptr_t    ptrdiff_t;   /* Used for subtracting two pointers. */
typedef uintptr_t      clock_t;   /* Used for system times in clock ticks or CLOCKS_PER_SEC . */
typedef  intptr_t       time_t;   /* Used for time in seconds. */

#define NULL        ((void *)0)

/* for testing on non double-double machines,
 * replace with the actual type when ready to release */
/* testing of abstract long doubles;
 * other files needing support, to use:
 *  stdio.h - ftold prototype
 *  stdlib.h - strtold, stold prototypes
 *  strtold.c - interface code supporting prototypes */
/* system being tested on has __LDBL_MANT_DIG__ == 64 */
//#undef __LDBL_MANT_DIG__
//#define __LDBL_MANT_DIG__ 113
//#define __LDBL_MANT_DIG__ 106
//#define __LDBL_MANT_DIG__ 53

#if ((__LDBL_MANT_DIG__ == 106) || (__LDBL_MANT_DIG__ == 113))
 #define LD_SUBST  quadruple
#elif (__LDBL_MANT_DIG__ == 53)
 #define LD_SUBST  double
#else
 #define LD_SUBST  long double
#endif


#ifdef __APPLE__
  #define FILE FILE_mac
#elif defined __linux  /* !__APPLE__ */
  typedef struct _IO_FILE _IO_FILE;
  #define FILE _IO_FILE
#else    /* !__APPLE__ !__CYGWIN__ !__linux */
  #error: unknown FILE structure
#endif  /* FILE * */
extern void *malloc(size_t);
extern void free(void *);
extern int puts(const char *);
size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict);
extern FILE* stdout;

  /* in ldtostr.c */
unsigned char *
realtostr(uint32_t [5], unsigned char **, int, int, int, unsigned);
unsigned char *
realtohex(uint32_t [5], unsigned char **, int, int, int, unsigned);
void qtow(quadruple, uint32_t [5]);
void ldtow(long double, uint32_t [5]);
void dtow(double, uint32_t [5]);
  /* in strtold.c for utilities */
double wtod(uint32_t [5]);

  /* guarantee use of this printf */
int phx_printf(const char *format, ...);

/* local defines */
#define ALT_PREFIX   0002
#define PREC_PREFIX  0400

/* locale dependent utf8 loading of decimal point */
/* for newlib demo return ascii */
struct __params {
  unsigned char **s;
  unsigned char *slimit;
  //locale_t ld;
};

static unsigned
__decimalLoad(unsigned char *s, struct __params *params) {
  (void)params;
  *s = '.';
  return (1);
}
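
/* A locale-aware variant is deliberately out of scope for this demo; a sketch,
 * assuming the standard localeconv() from <locale.h>, would copy the
 * decimal-point bytes into the caller's 4-byte slot instead of a literal '.':
 *
 *   const char *dp = localeconv()->decimal_point;
 *   unsigned n = 0;
 *   while (dp[n] != 0 && n < sizeof(uint32_t))  s[n] = (unsigned char)dp[n], n++;
 *   return n;
 */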

#define BUFSTRSIZE     64
unsigned char *
__format(const unsigned char *format, struct __params *params, va_list arg) {

  unsigned char   buffer[BUFSTRSIZE]; /* size of 64bit binary output; no prec,
                                       * field nor alt goes here */
  unsigned char  **s = params->s;
  unsigned char  *slimit = params->slimit;
  unsigned char  *strBuf = buffer, *strPtr, *strEnd, *sPtr = *s;
  uint64_t        val_ull;            /* for any sizeof() > 4 bytes, if
                                       * unsigned is 64bits it will bypass
                                       * val_ul to go here */
  unsigned        val_ul;             /* need unsigned instead of unsigned long
                                       * in case unsigned is 64bits */
  uint32_t        arg_flags;          /* arg_flags needs to be 32bits??? */
  int             arg_prec, arg_field, n;
  unsigned char   ch, sign, arg_size; /* arg_size max value is sizeof(long
                                       * double) if int goes 128bits still
                                       * within char size */
  uintptr_t      *vPtr = NULL;

  --format;
  goto L500;
  do {

    strEnd = strPtr = &strBuf[BUFSTRSIZE];
    arg_flags = 0;
    arg_size  = sizeof(int);
    arg_prec  = arg_field = -1;
    sign = 0;

  next_ch:
    ch = *++format;
  EO_EXTENSIONS:
    if (!ch) break;
    switch (ch) {

        /* Removed Flags for demo */
        /* Removed some Length modifiers for demo */
      case 'l':
        if ((ch = *++format) != 'l') {   /* and ll */
                  arg_size = sizeof(long);         goto EO_EXTENSIONS;  }
                  arg_size = sizeof(long long);    goto next_ch;
      case 'q':   arg_size = sizeof(octle);        goto next_ch;	/* 128bit */
      case 'L':   arg_size = sizeof(long double);  goto next_ch;	/* long double */

        /* Removed some Field-Width / Precision for demo */
      case '.':
        ch = *++format;
        arg_flags |= PREC_PREFIX;

      case '1': case '2': case '3':
      case '4': case '5': case '6':
      case '7': case '8': case '9':
        n = ch - 48;
        while ((unsigned)((ch = *++format) - '0') <= 9)
          n = (n * 10) + (ch - '0');
        if (ch != '$') {
          if (arg_flags & PREC_PREFIX)  arg_prec = n;
          else  arg_field = n;
          arg_flags &= ~PREC_PREFIX;
          goto EO_EXTENSIONS;
        }
        goto next_ch;

        /* Removed all Formats... but real */
      case 'a':  case 'e':  case 'f':  case 'g':
      case 'A':  case 'E':  case 'F':  case 'G':
          /* gcc doesn't permit floats, it converts to doubles */
      {
        unsigned char *(*toStr)(uint32_t [5], unsigned char **,
                                                      int, int, int, unsigned);
        uint32_t w160[5];
        uint32_t dp;  /* 4 byte decimal point param */
        int32_t sch;
        __decimalLoad((unsigned char*)&dp, params);
        if ((ch | 0x20) == 'a')  ch -= 'A', toStr = realtohex;
        else                     ch -= 'E', toStr = realtostr;
        sch = format[-1];
          /* force 128bit long double into quad parsing */
        if (sch == 'q') {
          //sch = (('q' << 8) | ch);
          sch = ((34 << 8) | ch);
          qtow(((vPtr == NULL) ? va_arg(arg, octle) : *(octle *)vPtr), w160);
        } else if (sch == 'L') {
#if   (__LDBL_MANT_DIG__ == 53)
          sch = (( 9 << 8) | ch);
#elif (__LDBL_MANT_DIG__ == 64)
          sch = ((18 << 8) | ch);
#elif (__LDBL_MANT_DIG__ == 106)
          sch = ((25 << 8) | ch);
#elif (__LDBL_MANT_DIG__ == 113)
          sch = ((34 << 8) | ch);
#else
  #error: undefined mantissa digits
#endif
          //sch = (('L' << 8) | ch);
          ldtow(((vPtr == NULL) ? va_arg(arg, long double) : *(long double *)vPtr), w160);
        } else {
          //sch = (('l' << 8) | ch);
          sch = ((9 << 8) | ch);
          dtow(((vPtr == NULL) ? va_arg(arg, double) : *(double *)vPtr), w160);
        }
        arg_flags |= (strBuf != buffer) << 7;
        strPtr = toStr(w160, &strBuf, sch, arg_prec, (int)arg_flags, dp);
      }
        if (strPtr == NULL)  goto error;

        val_ul = (n = (int)(strPtr - strBuf));
        strPtr = strBuf;
        arg_prec = 0;
        arg_flags &= ~ALT_PREFIX;
          /* grouping code removed for demo */
    } /* end switch */
                        /* main output section */
      /* adjust the different output factors */
    if ((arg_prec -= n) < 0) arg_prec = 0;
    if ((arg_field -= (arg_prec + val_ul)) < 0) arg_field = 0;
      /* is there output space? removed for demo */
      /* different formatting removed for demo */
    if (sign != 0) *sPtr++ = sign;
    if (arg_prec)  do *sPtr++ = '0'; while ((--arg_prec));
    if (n & 1)
               *sPtr++ = *strPtr++;
    if (n & 2) {
      *(int16_t *)sPtr = *(int16_t *)strPtr;
                  sPtr += sizeof(int16_t); strPtr += sizeof(int16_t);  }
    if ((n >>= 2))
      do {
      *(int32_t *)sPtr = *(int32_t *)strPtr;
                  sPtr += sizeof(int32_t); strPtr += sizeof(int32_t);
      } while ((--n));
    if (arg_field)  do  *sPtr++ = ' ';  while ((--arg_field));

      /* parsing of format string */
    do {
L500: ch = *++format;
      if ((sPtr >= slimit)) {
        --format;
          /* removed buffer extreme */
        puts("this is demo, don't be cute!");
        goto end;
      }
      *sPtr = ch;
      if (!ch) goto end;
      if (ch == '%')  break;
      sPtr++;
    } while (1);
  } while (1);
  *sPtr = 0;
end:
  if (strBuf != buffer)  free(strBuf);
  return sPtr;
error:
  if (strBuf != buffer)  free(strBuf);
  *--sPtr = 0;
  return (*s - 12);
}

#define FORMAT_LINE_MAX  4096

static int
phx_vfprintf(FILE *stream, const char *format, va_list arg) {
//_vfprintf(FILE *stream, const char *format, va_list arg, locale_t ld) {

  unsigned char  *sPtr, *wrPtr, *rdPtr;
  ssize_t         to_write;
  size_t          u8Sz  = FORMAT_LINE_MAX;
  struct __params params;

    /* make sure the stream is writable; writable check removed for demo */
  if ( (stream == NULL) //|| (!_io_writable(stream))
      || (format == NULL) )  return -1;

    /* create buffer to 'write' into */
  if ((wrPtr = (unsigned char*)malloc(u8Sz)) == NULL)  return -1;

    /* label rdPtr as <format>. If conversion, rdPtr will be allocated
     * and filled with UTF-8 version of <format> */
  rdPtr = (unsigned char*)format;

                        /* UTF-8 <format> input */
  params.s           = &wrPtr;
  params.slimit      = (wrPtr + u8Sz);
  //params.ld          = ld;
  sPtr = __format((const unsigned char *)rdPtr, &params, arg);

                        /* UTF-8 text output */
  if ((to_write = (ssize_t)(sPtr - wrPtr)) > 0)
    to_write = fwrite((const void *)wrPtr, (size_t)1, (size_t)to_write, stream);
  free(wrPtr);
  return (int)to_write;
}

int
phx_printf(const char *format, ...) {

  int     written;
  va_list arg;

  va_start(arg, format);
    written = phx_vfprintf((FILE *)stdout, format, arg);
    //written = _vfprintf((FILE *)stdout, format, arg, NULL);
  va_end(arg);
  return written;
}

/*
 *  realtostr - convert real numbers to a formatted latn numeric string
 *  realtohex - convert real numbers to a formatted hexadecimal string
 *  qtold - convert quadruple-precision number to a system defined long double
 *  ldtod - convert a system defined long double to a double-precision number
 */
        /* C language - string literals */
#ifdef __BIG_ENDIAN__
 #define cconst16_t(a,b)     (uint16_t)(((uint8_t)(a) << 8) | (uint8_t)(b))
 #define cconst32_t(a,b,c,d) (uint32_t)(((uint8_t)(a) << 24) | ((uint8_t)(b) << 16) | ((uint8_t)(c) << 8) | (uint8_t)(d))
#else
 #define cconst16_t(a,b)     (uint16_t)(((uint8_t)(b) << 8) | (uint8_t)(a))
 #define cconst32_t(a,b,c,d) (uint32_t)(((uint8_t)(d) << 24) | ((uint8_t)(c) << 16) | ((uint8_t)(b) << 8) | (uint8_t)(a))
#endif
        /* compiler convenience macro */
#ifndef __GNUC_PREREQ
 #if defined __GNUC__ && defined __GNUC_MINOR__
  #define __GNUC_PREREQ(major,minor) \
    (((__GNUC__ << 16) + __GNUC_MINOR__) >= (((major) << 16) + (minor)))
 #else
  #define __GNUC_PREREQ(major,minor) 0
 #endif
#endif  /* __GNUC_PREREQ */

#ifndef __LDBL_MANT_DIG__
#define __LDBL_MANT_DIG__  LDBL_MANT_DIG
#endif

/* endian break down of uint64_t */
#ifdef __BIG_ENDIAN__
  typedef struct {  uint16_t w3; uint16_t w2; uint16_t w1; uint16_t w0;  } _ull_ws;
  typedef struct {  uint32_t hi; uint32_t lo;  } _ull_ls;
#elif defined (__LITTLE_ENDIAN__)
  typedef struct {  uint16_t w0; uint16_t w1; uint16_t w2; uint16_t w3;  } _ull_ws;
  typedef struct {  uint32_t lo; uint32_t hi;  } _ull_ls;
#else
  #error: undefined endianness
#endif
typedef union {  uint64_t ull;  _ull_ls  uls;  _ull_ws  uss;  } _ull_t;

#ifdef __ppc__
  #define CNTLZW(x,y) \
    do __asm__("cntlzw %0,%1" : "=r" (x) : "r" (y)); while(0)
#elif defined (__i386__)
  #define CNTLZW(x,y) \
    do {  __asm__("bsr %1,%0" : "=r" (x) : "r" (y));  x = 31 - x;  } while(0)
#elif defined (__x86_64__)
  #define CNTLZW(x,y) \
    do {  __asm__("bsr %1,%0" : "=r" (x) : "r" (y));  x = 31 - x;  } while(0)
  #define CNTLZD(x,y) \
    do {  __asm__("bsr %1,%0" : "=r" (x) : "r" (y));  x = 63 - x;  } while(0)
#else
    /* C fallback: about 50% slower, ~31 instructions vs 3 */
  #define CNTLZW(x,y) \
    do {  unsigned t = y;   \
      t |= t >> 1;          \
      t |= t >> 2;          \
      t |= t >> 4;          \
      t |= t >> 8;          \
      t |= t >> 16;         \
      t = t - ((t >> 1) & 0x55555555);                            \
      t = (t & 0x33333333) + ((t >> 2) & 0x33333333);             \
      x  = 32 - (((t + (t >> 4) & 0xF0F0F0F) * 0x1010101) >> 24); \
    } while (0);
#endif

/* gcc needed help doing 32 bit shifts */
#if (defined __i386__) || (defined __x86_64__)
 #define SHRDU32(hi, lo, shift)               \
 do {  __asm__ (                              \
         "mov   %4,%%ecx\n\t"                 \
         "shrdl %%cl,%2,%1\n\t"               \
         "shrl %%cl,%0"                       \
         : "=&r" (hi), "=&r" (lo)             \
         : "0" (hi), "1" (lo), "rm" (shift)   \
         : "%ecx", "cc");           \
 } while(0)
 #define SHRDU32I(hi, lo, shift)              \
 do {  __asm__ (                              \
         "shrdl %4,%2,%1\n\t"                 \
         "shrl %4,%0"                         \
         : "=&r" (hi), "=&r" (lo)             \
         : "0" (hi), "1" (lo), "i" (shift)    \
         : "cc");                   \
 } while(0)
 #define SHLDU32(hi, lo, shift)               \
 do {  __asm__ (                              \
         "mov %4,%%ecx\n\t"                   \
         "shldl %%cl,%3,%0\n\t"               \
         "shll %%cl,%1"                       \
         : "=&r" (hi), "=&r" (lo)             \
         : "0" (hi), "1" (lo), "rm" (shift)  \
         : "%ecx", "cc");           \
 } while(0)
 #define SHLDU32I(hi, lo, shift)              \
 do {  __asm__ (                              \
         "shldl %4,%3,%0\n\t"                 \
         "shll %4,%1"                         \
         : "=&r" (hi), "=&r" (lo)           \
         : "0" (hi), "1" (lo), "i" (shift)  \
         : "cc");                   \
 } while(0)
#else
 #define SHRDU32(hi, lo, shift)               \
  lo = (hi << (32 - shift)) | (lo >> shift);  \
  hi >>= shift
 #define SHRDU32I SHRDU32
 #define SHLDU32(hi, lo, shift)               \
  hi = (lo >> (32 - shift)) | (hi << shift);  \
  lo <<= shift
 #define SHLDU32I SHLDU32
#endif

/* if defined, the code keeps denormal ability; else exponent 0 is treated
 * as zero. Floats are not touched since they may need the range, and
 * accuracy is not a concern when using floats. */
#define LEGACY_DENORMAL
#if (__LDBL_MANT_DIG__ == 106)
  /* required to extend precision <= 10^-292 */
#define LEGACY_DENORMAL
  /* required for double-double translation */
extern int ffs(int);
#endif

/* defined in as_stdio.h, needed for utilities */
#define quadruple octle
/* utilities */
void qtow(quadruple, uint32_t [5]);
void ldtow(LD_SUBST ldbl, uint32_t w160[5]);
void dtow(double, uint32_t [5]);
void ftow(float, uint32_t [5]);
LD_SUBST qtold(quadruple);
double ldtod(LD_SUBST);
double qtod(quadruple);

#define _CARRY  1000000000000000000ULL          /* 0x 0DE0B6B3 A7640000 */

  /* for rounding */
static const uint64_t _fcarry[] = {
  5, 50, 500, 5000, 50000, 500000, 5000000, 50000000, 500000000, 5000000000ULL,
  50000000000ULL, 500000000000ULL, 5000000000000ULL, 50000000000000ULL, 500000000000000ULL,
  5000000000000000ULL, 50000000000000000ULL, 500000000000000000ULL,
  /*18*/
  1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000,
  10000000000ULL, 100000000000ULL, 1000000000000ULL, 10000000000000ULL, 100000000000000ULL,
  1000000000000000ULL, 10000000000000000ULL, 100000000000000000ULL, _CARRY
};
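
/* Layout of the table above: _fcarry[0..17] hold 5*10^k (half-unit rounding
 * increments), _fcarry[18..36] hold 10^k, ending in _CARRY itself.
 * realtostr()'s rounding step adds an entry from the first half
 * (e.g. _fcarry[18 - rpos]), which drops a '5' at digit position rpos
 * (counting from the top) of an 18-digit ull. */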
  /* for cases: nan, inf, hex */
static const char *_const_outStr =
  "0123456789abcdefnanpinfe0x0.00000123456789ABCDEFNANPINFE0X0.0000";

      /*** internal routines ***/
  /* generic */
static uint64_t _divlu2r(uint64_t) __attribute__ ((noinline));
static void _ulltobuffer_nlr(uint64_t, unsigned char *, int);
static int  _first_digit(uint64_t);
  /* ldtostr specific */
static unsigned char *__n_char_sequence(uint32_t [5], unsigned char *, int);
static uint32_t __fractional(_ull_t *, _ull_t);
static uint32_t __fractional_128(_ull_t *, _ull_t);
static uint32_t __positive_lowexp128(_ull_t *, uint32_t [5], unsigned);
static uint32_t __positive_exponent_fp(_ull_t *, uint32_t [5], int);
static uint32_t __negative_exponent_fp(_ull_t *, uint32_t [5], int);

/* from fprintf... needs to be the same */
#define ZERO_PREFIX  0001
#define ALT_PREFIX   0002
#define LEFT_PREFIX  0004
#define SIGN_PREFIX  0010
#define SPC_PREFIX   0020
#define STAR_PREFIX  0040
#define GRP_PREFIX   0100
#define USGN_PREFIX  0200
#define PREC_PREFIX  0400
#define ALLOC_BUF    USGN_PREFIX
#define BUFSTRSIZE     64

/* upper 16 of 32bit 'type' pass the 'unit' max from a
 calculation made prior to entry of x_exponent */
#define UPPER_TYPE  0x20
#define IS_UPPER_TYPE(a)  (!((a) & UPPER_TYPE))
#define TYPE_ZERO   0x40
#define IS_TYPE_ZERO(a)   ((a) & TYPE_ZERO)
#define TYPE_E  0
#define TYPE_F  1
#define TYPE_G  2
#define IS_TYPE_E(a)  (((a) & 3) == 0)
#define IS_TYPE_F(a)  ((a) & TYPE_F)
#define IS_TYPE_G(a)  ((a) & TYPE_G)
#define SURPRESS_NSEQ(a) (((a) & 4) != 0)

/* Setup for information passed, second byte.
 * 5:3 field setup. Mantissa uses 5, Exponent uses 3.
 * 0 => 127 bias
 * 1 => 1023 bias
 * 2 => 16383 bias
 * 3-7 => unused (262143 no setup)
 *  These correspond to '__LDBL_MANT_DIG__' :
 * 0 << 3 => 24 bit mantissa
 * 1 << 3 => 53 bit mantissa
 * 2 << 3 => 64 bit mantissa
 * 3 << 3 => 106 bit mantissa
 * 4 << 3 => 113 bit mantissa
 * 5-31 << 3 => unused
 */
 /* LDBL prefix removed; these label an abstract C type 'long double' */
#define IS_FLT(a)     ((uint8_t)((a) >> 8) ==  0)  /* 000000 */
#define IS_DBL(a)     ((uint8_t)((a) >> 8) ==  9)  /* 001001 */
#define IS_E80(a)     ((uint8_t)((a) >> 8) == 18)  /* 010010 */
#define IS_DBL_DBL(a) ((uint8_t)((a) >> 8) == 25)  /* 011001 */
#define IS_QUAD(a)    ((uint8_t)((a) >> 8) == 34)  /* 100010 */

static uint16_t __mbits[ ] = {  24, 53, 64, 106, 113  };
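
/* Worked example of the packed 'type' word as built in __format() above:
 * "%Le" with an x87 long double (__LDBL_MANT_DIG__ == 64) arrives here as
 *   type = (18 << 8) | ('e' - 'E')  ==  0x1220
 * Second byte 18 == 010010b: bits 2:0 (2) select the 16383 bias and bits 5:3
 * (2) select __mbits[2] == 64, so IS_E80(type) is true.  Low byte 0x20 sets
 * the UPPER_TYPE bit (lower-case output) and (type & 3) == 0 makes
 * IS_TYPE_E() true. */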

#pragma mark *** Support Mathematics ***

static uint64_t
_divlu2r(uint64_t c) {

  _ull_t q;
  uint32_t mConst = 2305843009U;  /* 2^61 / 10^9 */
  uint32_t tConst = 1000000000U;
  q.uls.hi = (uint32_t)(c >> 32);
  q.uls.lo = ((uint32_t)c);
  if (q.uls.hi != 0) {
    uint32_t shift;
    CNTLZW(shift, q.uls.hi);
    SHLDU32(q.uls.hi, q.uls.lo, shift);
    q.uls.hi = (uint32_t)((q.uls.hi * (uint64_t)mConst) >> 32);
      /* shift should be 4 or more, but has been seen slipping to 3.
       * When folded into the line above, gcc adds extra code and slows too much */
    if (shift > 3)  q.uls.hi >>= (shift - 3);
  } else {
    q.uls.hi = (uint32_t)(((q.uls.lo | 1) * (uint64_t)mConst) >> (32 + 29));
  }
  q.uls.lo = (uint32_t)(c - ((uint64_t)q.uls.hi * (uint32_t)tConst));
  if (q.uls.lo >= tConst)
    q.uls.hi += 1, q.uls.lo -= tConst;
  return q.ull;
}
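
/* For the values this is fed (quotient fits 32 bits), the return packs
 * c / 10^9 in the high 32 bits and c % 10^9 in the low 32 bits, computed by
 * reciprocal multiplication rather than a divide.  E.g.
 * _divlu2r(1234567890123456789ULL) gives hi = 1234567890, lo = 123456789. */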

/* best C written that gcc can interpret. Size/Speed -O2 */
/* must accept 18 */
static void
_ulltobuffer_nlr(uint64_t c, unsigned char *buf, int m) {

  _ull_t b;
  int n;

    /* in 4.60, 2.30 fixed point integers */
  c = _divlu2r(c) << 2;
  b.ull = ((uint32_t)(c >> 32)) * 2882303762ULL;
  n = (m <= 9) ? m : 9;
  *buf = (b.uls.hi >> 28) + '0';
  if ((--n) == 0)  return;
  do {
    b.uls.hi = (((b.uls.hi << 4) >> 4) * 10) + (b.uls.lo >> 28);
    b.uls.lo <<= 4;
    *++buf = (b.uls.hi >> 28) + '0';
  } while ((--n));
  if (m > 9) {
    n = 9;
    b.ull = ((uint32_t)c) * 2882303762ULL;
    goto L2;
    do {
      b.uls.hi = (((b.uls.hi << 4) >> 4) * 10) + (b.uls.lo >> 28);
      b.uls.lo <<= 4;
  L2: buf[(n - 8)] = (b.uls.hi >> 28) + '0';
    } while ((++n) < m);
  }
}
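
/* Usage sketch: c is treated as an 18-digit, zero-padded decimal and its
 * leading m digits are written to buf as ASCII, most significant first, with
 * no terminating NUL.  realtostr() pre-multiplies by a power of ten from
 * _fcarry so the first significant digit lands in the top slot, e.g.
 * _ulltobuffer_nlr(123456789012345678ULL, buf, 5) writes "12345". */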


/*#pragma GCC diagnostic ignored "-Wmisleading-indentation"*/
#if __GNUC_PREREQ(4,5)
 #pragma GCC diagnostic ignored "-Wmisleading-indentation"
#endif
/* uses 2 64s as swap loads, this prevents stalls */
/* returns 1 less than the actual digit count, i.e. the power of 10 (log10 value) */
static int
_first_digit(uint64_t a) {

  _ull_t reg;
  const uint64_t *fc;
  uint64_t b;
  reg.ull = a;
  fc = _fcarry;
  if (reg.uls.hi) {
    a = fc[35];
    b = fc[34];
    if (reg.ull >= a)  return 17;  a = fc[33];
    if (reg.ull >= b)  return 16;  b = fc[32];
    if (reg.ull >= a)  return 15;  a = fc[31];
    if (reg.ull >= b)  return 14;  b = fc[30];
    if (reg.ull >= a)  return 13;  a = fc[29];
    if (reg.ull >= b)  return 12;  b = fc[28];
    if (reg.ull >= a)  return 11;
    if (reg.ull >= b)  return 10;
    return 9;
  } else {
    uint32_t c, d;
    c = (uint32_t)fc[27];
    d = (uint32_t)fc[26];
    if (reg.uls.lo >= c)  return 9;  c = (uint32_t)fc[25];
    if (reg.uls.lo >= d)  return 8;  d = (uint32_t)fc[24];
    if (reg.uls.lo >= c)  return 7;  c = (uint32_t)fc[23];
    if (reg.uls.lo >= d)  return 6;
    if (reg.uls.lo >= c)  return 5;
    if (reg.uls.lo >= 10000U)  return 4;
    if (reg.uls.lo >= 1000U)  return 3;
    if (reg.uls.lo >= 100U)  return 2;
    if (reg.uls.lo >= 10U)  return 1;
    return 0;
  }
}
/*#pragma GCC diagnostic pop*/
#if __GNUC_PREREQ(4,5)
 #pragma GCC diagnostic pop
#endif
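
/* E.g. _first_digit(7) == 0, _first_digit(54321) == 4,
 * _first_digit(999999999999999999ULL) == 17: floor(log10) over the
 * 18-digit ull range this is fed. */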

#pragma mark *** Support Subroutines ***

static unsigned char *
__n_char_sequence(uint32_t w160[5], unsigned char *sPtr, int type) {

  const char   *_outvalues;
  unsigned is_zero;

    /* e80 long double type: bit 63 is required set regardless of nan or inf */
    /* if is_zero is <zero>, then it is infinity, else nan */
    /* any info in nan is considered an n-char-sequence */
    /* bit 62 of nan, when 0, with others [61-0] set, is considered a signaling nan.
     * bit 62 of nan, when 1, with others 0, is the special quiet nan; with others
     * [61-0] set, it is considered a quiet signaling nan. */
  is_zero = (w160[2] | w160[1] | w160[0]);
  if (IS_E80(type))
        is_zero |= w160[3] & 0x7FFFFFFF;
  else  is_zero |= w160[3];
  _outvalues = _const_outStr + 16;
  if (IS_UPPER_TYPE(type)) _outvalues += 32;
  if (!is_zero){
    *((uint32_t *)sPtr) = ((int32_t *)_outvalues)[1];
    return (sPtr + 3);
  }

  *((uint32_t *)sPtr) = ((int32_t *)_outvalues)[0];
  sPtr += 3;
  if (SURPRESS_NSEQ(type)) return sPtr;

    /* now output the n-char-sequence */
    /* (0x  hexdigits )
     * e80: 16 nibbles, double: 13 nibbles, float: 5.5 nibbles */
  *((uint16_t *)sPtr) = cconst16_t('(','0');
  sPtr += 2;
  *sPtr = _outvalues[9]; /* 'x' or 'X' */
  sPtr++;
  _outvalues -= 16;
  sPtr[ 0] = _outvalues[(w160[3]  >> 28)];
  sPtr[ 1] = _outvalues[((w160[3] <<  4) >> 28)];
  sPtr[ 2] = _outvalues[((w160[3] <<  8) >> 28)];
  sPtr[ 3] = _outvalues[((w160[3] << 12) >> 28)];
  sPtr[ 4] = _outvalues[((w160[3] << 16) >> 28)];
  sPtr[ 5] = _outvalues[((w160[3] << 20) >> 28)];
  if (IS_FLT(type)) {  sPtr +=  6; goto end_brace;  }
  sPtr[ 6] = _outvalues[((w160[3] << 24) >> 28)];
  sPtr[ 7] = _outvalues[((w160[3] << 28) >> 28)];
  sPtr[ 8] = _outvalues[(w160[2]  >> 28)];
  sPtr[ 9] = _outvalues[((w160[2] <<  4) >> 28)];
  sPtr[10] = _outvalues[((w160[2] <<  8) >> 28)];
  sPtr[11] = _outvalues[((w160[2] << 12) >> 28)];
  sPtr[12] = _outvalues[((w160[2] << 16) >> 28)];
  if (IS_DBL(type)) {  sPtr += 13; goto end_brace;  }
  sPtr[13] = _outvalues[((w160[2] << 20) >> 28)];
  sPtr[14] = _outvalues[((w160[2] << 24) >> 28)];
  sPtr[15] = _outvalues[((w160[2] << 28) >> 28)];
  sPtr += 16;
  if (IS_E80(type))  goto end_brace;
    /* undo w160 formatting of '[0]' for number */
  sPtr[ 0] = _outvalues[(w160[1]  >> 28)];
  sPtr[ 1] = _outvalues[((w160[1] <<  4) >> 28)];
  sPtr[ 2] = _outvalues[((w160[1] <<  8) >> 28)];
  sPtr[ 3] = _outvalues[((w160[1] << 12) >> 28)];
  sPtr[ 4] = _outvalues[((w160[1] << 16) >> 28)];
  sPtr[ 5] = _outvalues[((w160[1] << 20) >> 28)];
  sPtr[ 6] = _outvalues[((w160[1] << 24) >> 28)];
  sPtr[ 7] = _outvalues[((w160[1] << 28) >> 28)];
  sPtr[ 8] = _outvalues[(w160[0]  >> 28)];
  sPtr[ 9] = _outvalues[((w160[0] <<  4) >> 28)];
  if (IS_QUAD(type)) {
    sPtr[10] = _outvalues[((w160[0] <<  8) >> 28)];
    sPtr[11] = _outvalues[((w160[0] << 12) >> 28)];
    sPtr += 12;
  } else {
    sPtr += 10;
  }
end_brace:
  *sPtr = ')';
  return (sPtr + 1);
}

/* converts fractional portion of IEEE754 value to
 * base 10 representation, in 10^18 ull format.
 */
static uint32_t
__fractional(_ull_t *uPtr, _ull_t reg) {

  uint32_t last = 1;
  uint64_t mlConst = 3814697265625ULL;  /* 5^18 (42bits) 0x378 2DACE9D9 */
    /*
  0xFFFFFFFF  FFFFFFFF  start
  0xFFFFC 3FFF FFFFFFFF split
     */
  uPtr[0].ull = (reg.uls.hi >> 14) * mlConst; /* 18 + 46 == 64 */
    /* test added, rather than just '&='. This is a guess that small 'standard'
     * fractions might be used */
  if (((reg.uls.hi &= 0x00003FFF) | reg.uls.lo) != 0) {
      /* the left over 14bits from hi with 4bits from lo (46-18=>28).
  0x00003FFF FFFFFFFF  start
  0x00003FFFF FFFFFFF split
       */
    uint64_t u64;
    last += 1;
    uPtr[0].ull += (u64 = ((uint32_t)(reg.ull >> 28)) * mlConst) >> 18;
      /* grab dropped 18 */
    uint32_t gcc_helper = (uint32_t)u64 & 0x3FFFF;
    uPtr[1].ull = gcc_helper * mlConst;
    if ((reg.uls.lo &= 0xFFFFFFF) != 0) {
        /*
  0x00000000 0FFFFFFF   start
  0x          FFFFC 3FF split
         */
      _ull_t scratch;
      last += 1;
        /* (28-18=>10) */
      uPtr[0].ull += (u64 = (reg.uls.lo >> 10) * mlConst) >> 36;
      uint32_t gcc_helper1 = (uint32_t)u64 & 0x3FFFF;
      scratch.ull  = gcc_helper1 * mlConst;
      uint32_t gcc_helper2 = (((uint32_t)(u64 >> 18)) & 0x3FFFF);
      u64 = ((gcc_helper2 * mlConst) + (scratch.ull >> 18));
      uint32_t gcc_helper3 = (scratch.uls.lo & 0x3FFFF);
      uPtr[2].ull  = gcc_helper3 * mlConst;
      u64 += uPtr[1].ull;
      if (u64 >= _CARRY)  u64 -= _CARRY, uPtr[0].ull += 1ULL;
      uPtr[1].ull = u64;
      if ((reg.uls.lo &= 0x3FF) != 0) {
          /*
  0x00000000 000003FF end of 64bit resolution
           */
        uint32_t u32;
        last += 1;
          // 54*10^18, 64-18=54
        u32 = (uint32_t)((scratch.ull = reg.uls.lo * mlConst) >> 28);
        uPtr[0].ull += (uint64_t)(scratch.uls.hi >> 14);
          // 54*10^18*10^18
          /* first  18 of 46 remainder */
        u64 = ((u32 & 0x3FFFF) * mlConst) + uPtr[1].ull;
          /* second 18 of 46 remainder, scratch.uls.lo &= 0x0FFFFFFF; */
        u32 = scratch.uls.lo & 0x0FFFFFFF;
        scratch.ull = (u32 >> 10) * mlConst;
        u64 += (scratch.ull >> 18);
        uPtr[2].ull += (scratch.uls.lo & 0x3FFFF) * mlConst;
        if (uPtr[2].ull >= _CARRY)  uPtr[2].ull -= _CARRY, u64 += 1ULL;
        uPtr[1].ull = u64;
          /* last   10 of 45 remainder */
        scratch.ull = ((u32 & 0x000003FF) << 8) * mlConst;
        uPtr[1].ull += (uint64_t)(scratch.uls.hi >> 4);
        if (uPtr[1].ull >= _CARRY)  uPtr[1].ull -= _CARRY, uPtr[0].ull += 1ULL;
          /* of scratch2 36 remainder */
        scratch.uls.hi &= 0x0000000F;
        uPtr[2].ull += ((uint32_t)(scratch.ull >> 18)) * mlConst;
        uPtr[2].ull += ((scratch.uls.lo & 0x3FFFF) * mlConst) >> 18;
        if (uPtr[2].ull >= _CARRY)  uPtr[2].ull -= _CARRY, uPtr[1].ull += 1ULL;
          /* of 18 remainder */
        scratch.uls.lo = (uint32_t)((scratch.uls.lo & 0x3FFFF)
                                                           * mlConst) & 0x3FFFF;
        uPtr[3].ull = scratch.uls.lo * mlConst;
  } } }
  return last;
}
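
/* Worked example: reg.ull = 0x8000000000000000 (the fraction 0.5 in 0.64
 * fixed point) gives uPtr[0].ull = (0x80000000 >> 14) * 5^18
 * = 2^17 * 5^18 = 500000000000000000, i.e. ".500000000000000000" in the
 * 10^18-per-ull decimal format, and the function returns 1 (one ull used). */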

/* this is for the second register of the fractional part */
/* expected entry use: __fractional(uPtr, reg2);
 * to avoid initial allocation problems, stack registers are used instead of uPtr;
 * this also avoids re-zeroing the uPtr regs.
 * The uPtr parameter is for adding to reg1's converted fractional, the final result */
static uint32_t
__fractional_128(_ull_t *uPtr, _ull_t reg) {

  uint32_t last;
  _ull_t sreg[15]; /* StackREGisters */
    /* need these cleared, used as flags */
  sreg[2].ull = (sreg[1].ull = 0);
    /* create regs of multiplied by 5^64 (3 max) */
  __fractional(sreg, reg);

    /* creates sr3-sr6, affects uPtr[4] */
  sreg[6].ull = (sreg[5].ull = 0);
  last = 1 + __fractional(&sreg[ 3], sreg[0]);
  if ((sreg[1].ull)) {
    last = 5;  /* creates uPtr[5] value */
    sreg[9].ull = (sreg[10].ull = 0);
      /* creates sr7-sr10 */
    __fractional(&sreg[ 7], sreg[1]);
    sreg[4].ull += sreg[ 7].ull;
    sreg[5].ull += sreg[ 8].ull;
    sreg[6].ull += sreg[ 9].ull;
    sreg[7].ull  = sreg[10].ull;
    if ((sreg[2].ull)) {
      last++;
        /* creates sr11-sr14*/
      __fractional(&sreg[11], sreg[2]);
      sreg[5].ull += sreg[11].ull;
      sreg[6].ull += sreg[12].ull;
      sreg[7].ull += sreg[13].ull;
        /* increase in last */
      uPtr[6].ull  = sreg[14].ull;
    }
    if (sreg[7].ull >= _CARRY)  sreg[6].ull += 1ULL, sreg[7].ull -= _CARRY;
    uPtr[5].ull  = sreg[7].ull;
    if ((uPtr[5].ull))  last++;
    while (sreg[6].ull >= _CARRY)  sreg[5].ull += 1ULL, sreg[6].ull -= _CARRY;
  }
    /* sreg3-sreg8 set for add-in transfer to uPtr */
  uPtr[4].ull  = sreg[6].ull;
  uint32_t n = 3;
  do {
    uPtr[n].ull += sreg[(n+2)].ull;
    while (uPtr[n].ull >= _CARRY)
      uPtr[n].ull -= _CARRY, uPtr[(n-1)].ull += 1ULL;
  } while ((--n));
  return last;
}

/* enters uPtr[0] = 0, assumes inlined */
static uint32_t
__positive_lowexp128(_ull_t *uPtr, uint32_t w160[5], unsigned pw2) {

  _ull_t reg0, reg1;
  reg0.ull = *(uint64_t*)&w160[2];
  reg1.ull = *(uint64_t*)&w160[0];

  uint32_t last = 0, pw10 = 0, unit_18 = 0, hi;
  _ull_t *fractionalPtr = &uPtr[1];

  if (pw2 == 0) {
    uPtr[0].uls.lo = 1;
    goto add_frac;
  }
  if ((pw2 & 63) != 0) {
    uint32_t cl = pw2 & 63;
      /* need room for implicit '1' */
    if (cl < 32) {
      uPtr[0].uls.lo =        (1 << cl) | (reg0.uls.hi >> (32 - cl));
      reg0.uls.hi = (reg0.uls.hi << cl) | (reg0.uls.lo >> (32 - cl));
      reg0.uls.lo = (reg0.uls.lo << cl) | (reg1.uls.hi >> (32 - cl));
      reg1.uls.hi = (reg1.uls.hi << cl) | (reg1.uls.lo >> (32 - cl));
      reg1.uls.lo = (reg1.uls.lo << cl);
    } else if (cl == 32) {
      uPtr[0].uls.hi = 1, uPtr[0].uls.lo = reg0.uls.hi;
         reg0.uls.hi = reg0.uls.lo;
         reg0.uls.lo = reg1.uls.hi;
         reg1.uls.hi = reg1.uls.lo;
         reg1.uls.lo = 0;
    } else {
      cl -= 32;
      uPtr[0].uls.hi =           (1 << cl) | (reg0.uls.hi >> (32 - cl));
      uPtr[0].uls.lo = (reg0.uls.hi << cl) | (reg0.uls.lo >> (32 - cl));
         reg0.uls.hi = (reg0.uls.lo << cl) | (reg1.uls.hi >> (32 - cl));
         reg0.uls.lo = (reg1.uls.hi << cl) | (reg1.uls.lo >> (32 - cl));
         reg1.uls.hi = (reg1.uls.lo << cl);
         reg1.uls.lo = 0;
        /* make sure whole in 10^18 ull format */
      if ((cl > 27) || ((cl == 27) && (uPtr[0].ull >= _CARRY))) {
        uint64_t gcc_helper;
        gcc_helper = uPtr[0].ull - ((hi = uPtr[0].uls.hi >> 28) * _CARRY);
        while (((uint32_t)(gcc_helper >> 32) >> 28))  gcc_helper -= _CARRY, hi++;
        if (gcc_helper  >= _CARRY)  gcc_helper -= _CARRY, hi++;
        uPtr[1].ull = gcc_helper;
        uPtr[0].ull = (uint64_t)hi;
          /* increase base since 2 whole ulls */
        fractionalPtr++, pw10 += 18, unit_18++;
      }
    }
      /* Is rest to be fraction? */
    if ((pw2 & ~63) == 0) {
  add_frac:
      if (reg0.ull != 0)  last = __fractional(fractionalPtr, reg0);
      if (reg1.ull != 0)  last = __fractional_128(fractionalPtr, reg1);
      return ((pw10 << 16) | ((last + unit_18 + 1) << 4) | (last + 1));
    }
  } else {
      /* case: entry == 2^64, 2^128 */
    if (reg0.ull >= _CARRY) {
      reg0.ull -= _CARRY - 446744073709551616ULL;
      hi = reg0.uls.hi >> 28;
      reg0.ull -= (hi += (hi >= 7)) * _CARRY;
      while (reg0.ull >= _CARRY)  hi += 1, reg0.ull -= _CARRY;
      hi++;
    } else {
      reg0.ull += 446744073709551616ULL;
      if ((hi = (reg0.ull >= _CARRY)))  reg0.ull -= _CARRY;
    }
    uPtr[0].uls.lo = hi + 18;
    uPtr[1].ull = reg0.ull;
    fractionalPtr++, pw10 += 18, unit_18++;
      /* Is rest to be fraction? */
    if (pw2 != 128) {
      if (reg1.ull != 0)
        last = __fractional(fractionalPtr, reg1);
      return ((pw10 << 16) | ((last + unit_18 + 1) << 4) | (last + 1));
    }
    reg0.ull = 0;
  }
    /* Multiply by 2^64, found this larger code-wise, but around 13% faster. */
    /* When used in terms of realtostr(), a 3% increase. */
  _ull_t sreg0, sreg1;
  fractionalPtr++, pw10 += 18, unit_18++;
  if (pw10 == 36) {
    uPtr[1].ull = _divlu2r(uPtr[1].ull);
    sreg0.ull = _divlu2r((446744073U * (uint64_t)uPtr[0].uls.lo)
                        + (18U * (uint64_t)uPtr[1].uls.hi));
    sreg1.ull = ((18U * (uint64_t)uPtr[1].uls.lo)
                 + (446744073U * (uint64_t)uPtr[1].uls.hi)
                 + (709551616U * (uint64_t)uPtr[0].uls.lo)
                 + ((uint64_t)sreg0.uls.lo * 1000000000U));
    uPtr[0].uls.lo = (uPtr[0].uss.w0 * 18U) + sreg0.uls.hi;
    sreg0.ull = _divlu2r((709551616U * (uint64_t)uPtr[1].uls.hi)
                        + (446744073U * (uint64_t)uPtr[1].uls.lo));
    uPtr[2].ull = (((uint64_t)uPtr[1].uls.lo * 709551616U)
                   + ((uint64_t)sreg0.uls.lo * 1000000000U));
    sreg1.ull += (uint64_t)sreg0.uls.hi;
    if (uPtr[2].ull >= _CARRY)  sreg1.ull += 1ULL, uPtr[2].ull -= _CARRY;
    if (sreg1.ull >= _CARRY)  uPtr[0].ull += 1ULL, sreg1.ull -= _CARRY;
    uPtr[1].ull = sreg1.ull;
  } else {
    if (uPtr[0].ull > 1000000000ULL) {
      uint64_t gcc_helper0, gcc_helper1;
      uPtr[0].ull = _divlu2r(uPtr[0].ull);
      sreg0.ull = _divlu2r((uint64_t)uPtr[0].uls.hi * 18U);
      gcc_helper0 = ((uint64_t)uPtr[0].uls.lo * 709551616U);
      gcc_helper1 = ((uint64_t)uPtr[0].uls.lo * 18U)
                  + ((uint64_t)uPtr[0].uls.hi * 446744073U);
      sreg1.ull = _divlu2r(((uint64_t)uPtr[0].uls.hi * 709551616U)
                          + ((uint64_t)uPtr[0].uls.lo * 446744073U));
      gcc_helper0 += ((uint64_t)sreg1.uls.lo * 1000000000U);
      sreg1.ull = (((uint64_t)sreg0.uls.lo * 1000000000U)
                   + (uint64_t)sreg1.uls.hi
                   + gcc_helper1);
      if (gcc_helper0 >= _CARRY)  sreg1.ull += 1ULL, gcc_helper0 -= _CARRY;
      uPtr[2].ull = gcc_helper0;
      if (sreg1.ull >= _CARRY)  sreg0.uls.hi += 1, sreg1.ull -= _CARRY;
      if (sreg0.uls.hi != 0) {
        uPtr[0].uls.lo = sreg0.uls.hi, uPtr[0].uls.hi = 0;
        uPtr[1].ull = sreg1.ull;
        fractionalPtr++, pw10 += 18, unit_18++;
      } else {
        uPtr[0].ull = sreg1.ull;
        uPtr[1].ull = uPtr[2].ull;
        uPtr[2].ull = 0;
      }
    } else {
/* wtf? */
      uint32_t a0 = uPtr[0].uls.lo;
      uint64_t gcc_helper0 = (uint64_t)a0 * 18U;
      uint64_t gcc_helper1 = (uint64_t)a0 * 709551616U;
      sreg0.ull = _divlu2r((uint64_t)a0 * 446744073U);
      gcc_helper0 += (uint64_t)sreg0.uls.hi;
      gcc_helper1 += ((uint64_t)sreg0.uls.lo * 1000000000U);
      if (gcc_helper1 >= _CARRY)  gcc_helper1 -= _CARRY, gcc_helper0 += 1ULL;
      uPtr[0].ull = gcc_helper0;
      uPtr[1].ull = gcc_helper1;
    }
  }
    /* deal with fraction part multiply by 2^64 */
  if (reg0.ull != 0) {
      /* add in first fractional when used & 63 */
    hi = reg0.uls.hi >> 28;
    reg0.ull -= (hi += (hi >= 7)) * _CARRY;
    reg0.ull += fractionalPtr[-1].ull;
    while (reg0.ull >= _CARRY)  hi += 1, reg0.ull -= _CARRY;
    fractionalPtr[-1].ull = reg0.ull;
    fractionalPtr[-2].ull += (uint64_t)hi;
    if (reg1.ull != 0)
      last = __fractional(fractionalPtr, reg1);
  } else if (reg1.ull != 0) {
    hi = 0;
    if (reg1.ull >= _CARRY) {
      hi = reg1.uls.hi >> 28;
      reg1.ull -= (hi += (hi >= 7)) * _CARRY;
    }
    reg1.ull += fractionalPtr[-1].ull;
    while (reg1.ull >= _CARRY)  hi += 1, reg1.ull -= _CARRY;
    fractionalPtr[-1].ull = reg1.ull;
    fractionalPtr[-2].ull += (uint64_t)hi;
  }
    /* valid wholes (1st, u18 count) and fractional count */
  return ((pw10 << 16) | ((last + unit_18 + 1) << 4) | (last + 1));
}

static uint32_t
__positive_exponent_fp(_ull_t *uPtr, uint32_t w160[5], int type) {

  uint32_t  pw2;  /* power of 2 */
  uint32_t  unit_18;
  uint32_t  u32;

    /* load 'passed' data */
  pw2 = (w160[4] & 0x7FFF) - 16383;

    /* handle up to/including 129bit mantissa */
  uint32_t  packed_return;
  packed_return = __positive_lowexp128(uPtr, w160, (pw2 < 128 ? pw2 : 128));
  if (pw2 <= 128)  return packed_return;
  pw2 -= 128;

  if ((pw2 & 0xf) != 0) {
    uint32_t m = pw2 & 0xf;
    uint32_t n = 16 - m;
    uPtr[0].uls.lo = ((uPtr[0].uls.lo << m)
                      + (u32 = (((uint32_t)((uPtr[1].uls.hi
                                        * 2475880078ULL) >> 32)) >> (11 + n))));
    _ull_t gcc_helper;
    gcc_helper.ull = (uPtr[1].ull - ((u32 << n) * 15258789062500ULL));
    SHLDU32(gcc_helper.uls.hi, gcc_helper.uls.lo, m);
    gcc_helper.ull += (uint64_t)(u32 = (((uint32_t)((uPtr[2].uls.hi
                                        * 2475880078ULL) >> 32)) >> (11 + n)));
    if (gcc_helper.ull >= _CARRY)
      uPtr[0].uls.lo++, gcc_helper.ull -= _CARRY;
    uPtr[1].ull = gcc_helper.ull;
    uPtr[2].ull = (uPtr[2].ull -= (u32 << n) * 15258789062500ULL) << m;
    if (uPtr[2].ull >= _CARRY)
      uPtr[1].uls.lo++, uPtr[2].ull -= _CARRY;
  }
  unit_18 = 2;
  if ((pw2 >> 4) != 0) {
    uint32_t unit_max = type >> 16;
    pw2 >>= 4;
    do {
      uint64_t *cap = &uPtr[((unit_18 > unit_max) ? unit_max : unit_18)].ull;
      _ull_t work;
      uint64_t *input, *output, next;
      input = output = &uPtr->ull;
      next = input[1];
      if ((work.ull = *input) >= 15258789062500) {
        unit_18++;
        *output =
              (uint64_t)(u32 = (uint32_t)((work.uls.hi * 2475880078ULL) >> 43));
        work.ull -= u32 * 15258789062500ULL;
        output++;
      }
      *output = work.ull << 16;
      ++input;
      do {
        work.ull = next;
        next = *(++input);
        *output += (uint64_t)(u32 = (uint32_t)(((work.uls.hi * 2475880078ULL)
                                                  + (work.uls.hi >> 1)) >> 43));
          /* >= _CARRY is not expected to occur; the test just forces better gcc
           * code, though it probably can occur. approx 2% speed increase using dead code */
        if (*output >= _CARRY)  output[-1] += 1, *output -= _CARRY;
        *(++output) = (work.ull - (u32 * 15258789062500ULL)) << 16;
        if (*output >= _CARRY)  output[-1] += 1, *output -= _CARRY;
      } while (input <= cap);
    } while ((--pw2));
  }
  return (((unit_18 * 18) << 16) | ((unit_18 + 1) << 4));
}

static uint32_t
__negative_exponent_fp(_ull_t *uPtr, uint32_t w160[5], int type) {

  const uint64_t fConst = 15258789062500ULL; /* 0xDE0 B6B3A764 */
  int      pw2, pw10;         /* power of 2, power of 10 */
  uint32_t index;             /* pre-use as flag */
  _ull_t   reg0;

  pw10 = -18;
  pw2  = 16383 - (w160[4] & 0x7FFF);

#ifdef LEGACY_DENORMAL
  pw2 -= (index = ((uint32_t)pw2 == 16383U));
#endif

  reg0.ull = *(uint64_t*)&w160[2];

  if (*(uint64_t*)&w160[0] != 0) {
    _ull_t   reg1;
    reg1.ull = *(uint64_t*)&w160[0];
    __fractional(uPtr, reg0);
    __fractional_128(uPtr, reg1);
#ifdef LEGACY_DENORMAL
    if (index == 0)  uPtr->ull += _CARRY;
#else
    uPtr->ull += _CARRY;
#endif
    index = 8;
  } else if (reg0.ull != 0) {
      /* convert fractional to base 10 */
    __fractional(uPtr, reg0);
#ifdef LEGACY_DENORMAL
      /* 2 cases: denormal, normal */
    if (index == 0)  uPtr->ull += _CARRY;
#else
    uPtr->ull += _CARRY;
#endif
    index = 5;
  } else {
    if (pw2 < 18) {
        /* straight power of 2 */
        /* first 17, [-1, -17]... shift of 10^18,
         * 10^18/18446744073709551616 =
         * 0.0542101086242752217003726400434970855712
         * 0.0542101086242752217003726400434970855712 * 2^64 =
         * 10^18 = 0x232830643 * 2^32 + 2808348672 */
        /* no whole, all fraction */
      uPtr->uls.lo = ((uint32_t)232830643U << (32 - pw2))
                     | ((uint32_t)2808348672U >> pw2);
      uPtr->uls.hi =  (uint32_t)232830643U >> pw2;
      return (uint32_t)((-(18 << 16)) | (1 << 4) | 8);
    }
      /* there are 18 trailing zeroes to _CARRY */
    uPtr[0].ull = _CARRY >> 18;
    pw2 -= 18;
    index = 5;
  }
    /* 5^18 * 2^2 * 2^16 = 10^18 */
    /* each shift into new ull is multiplied by 10^18.
     * or 2^-16 * 10^18 = shift * 5^18 * 2^2 */
  if (pw2 & 0xf) {
      /* partial 2^-16, odd wording to assist gcc */
    uint32_t m = pw2 & 0xf;
    uint32_t shift = (16 - m);
    uint32_t gcc_helper;
    if (index == 8) {
      gcc_helper = (uPtr[7].uls.lo << shift) & 0xFFFF;
      uPtr[7].ull = gcc_helper * fConst;
      SHRDU32(uPtr[6].uls.hi, uPtr[6].uls.lo, m);
      gcc_helper = (uPtr[5].uls.lo << shift) & 0xFFFF;
      uPtr[6].ull += gcc_helper * fConst;
      SHRDU32(uPtr[5].uls.hi, uPtr[5].uls.lo, m);
      gcc_helper = (uPtr[4].uls.lo << shift) & 0xFFFF;
      uPtr[5].ull += gcc_helper * fConst;
      SHRDU32(uPtr[4].uls.hi, uPtr[4].uls.lo, m);
      gcc_helper = (uPtr[3].uls.lo << shift) & 0xFFFF;
      uPtr[4].ull += gcc_helper * fConst;
    } else {
      gcc_helper = (uPtr[3].uls.lo << shift) & 0xFFFF;
      uPtr[4].ull = gcc_helper * fConst;
    }
    SHRDU32(uPtr[3].uls.hi, uPtr[3].uls.lo, m);
    gcc_helper = (uPtr[2].uls.lo << shift) & 0xFFFF;
    uPtr[3].ull += gcc_helper * fConst;
    SHRDU32(uPtr[2].uls.hi, uPtr[2].uls.lo, m);
    gcc_helper = (uPtr[1].uls.lo << shift) & 0xFFFF;
    uPtr[2].ull += gcc_helper * fConst;
    SHRDU32(uPtr[1].uls.hi, uPtr[1].uls.lo, m);
    gcc_helper = (uPtr[0].uls.lo << shift) & 0xFFFF;
    uPtr[1].ull += gcc_helper * fConst;
    SHRDU32(uPtr[0].uls.hi, uPtr[0].uls.lo, m);
    if ((uPtr[0].uls.hi | uPtr[0].uls.lo) == 0) {
      pw10 -= 18;
      uPtr[0].ull = uPtr[1].ull, uPtr[1].ull = uPtr[2].ull;
      uPtr[2].ull = uPtr[3].ull, uPtr[3].ull = uPtr[4].ull;
      if (index == 8) {
        uPtr[4].ull = uPtr[5].ull, uPtr[5].ull = uPtr[6].ull;
        uPtr[6].ull = uPtr[7].ull, uPtr[7].ull = 0;
      } else {
        uPtr[4].ull = 0;
      }
    }
  }
    /* 2^-16, multiples */
  if ((pw2 >> 4) != 0) {
    _ull_t *regPtr, *cap;
    pw2 >>= 4;
      /* min regs required to represent arg_prec */
    cap = uPtr + (unsigned)(type >> 16);
      /* track 10^18 ull expansion */
    int16_t fcnt = 0;
    do {
      uint64_t work;
      uint32_t w0;
      int32_t flag;
      regPtr = uPtr;
      w0 = (uint32_t)regPtr->uss.w0;
      fcnt += (flag = -((regPtr->ull = regPtr->ull >> 16) == 0));
      work = (++regPtr)->ull;
      do {
        uint64_t m0 = w0 * fConst;
        w0 = (uint16_t)work;
        regPtr[flag].ull = m0 + (work >> 16);
        ++regPtr;
      } while ((regPtr <= cap) && ((work = regPtr->ull) != 0));
      (regPtr += flag)->ull = w0 * fConst;
    } while ((--pw2) != 0);
    index = (int)(regPtr - uPtr) + 1;
    pw10 += fcnt * 18;
  }
  return ((pw10 << 16) | (index << 4) | 8);
}

/* enters with 'sSize = strPtr - sPtr;'.
 * s, sPtr could have been malloc'd on entry, but are expected to have been
 * cleared, and their output used, prior to entry. Possibly a previous real
 * allocation, output to 'buffer', FILE or supplied, and not free'd.
 * printf() supplies BUFSTRSIZE on entry, realtostr supplies 45 ulls.
 * BUFSTRSIZE is considered large for normal requests,
 * 45 ulls covers all needs for double reals.
 * all real types generally need 9, [±][0][.](precision)[e±000[0]]
 * IS_TYPE_F adds the exponent to the count, [±][0](exponent)[.](precision)[e±000[0]]
 * IS_TYPE_F has the possibility of grouping:
 * GRP_PREFIX assumes thousands, xxx,xxx,xxx,xxx etc. = 6 per ull
 * can't reuse the precision 'reg's for double duty... 18 bytes / 8 byte ull
 * bufS(string) must come first for free'ing of the allocation
 */
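
/* A minimal standalone usage sketch (not part of the demo's flow; it assumes
 * the caller supplies a BUFSTRSIZE buffer and the latin '.' decimal that
 * __decimalLoad() produces, and that dtow() unpacks a double into w160[5]):
 *
 *   unsigned char stack[BUFSTRSIZE], *s = stack, *end;
 *   uint32_t w160[5], dp = 0;
 *   ((unsigned char *)&dp)[0] = '.';
 *   dtow(3.14159, w160);
 *   end = realtostr(w160, &s, (9 << 8) | ('f' - 'E'), 5, 0, dp);
 *
 * On return the bytes [s, end) should hold "3.14159" (no terminating NUL);
 * the type's second byte 9 marks a 53-bit mantissa / 1023 bias, ('f' - 'E')
 * requests lower-case fixed output, precision 5, no flags.  For very large
 * precisions realtostr may replace *s with a malloc'd buffer the caller must
 * free, per the notes above. */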

#pragma mark *** External Interfaces ***

#if (!__GNUC_PREREQ(4,5))
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif

unsigned char *
realtostr(uint32_t w160[5], unsigned char **s, int type,
          int arg_prec, int arg_flags, unsigned decimal) {

  int exponent, rpos;   /* Rounding POSition */
    /* function used based on exponent, math unit */
  uint32_t (*math_func)(_ull_t *, uint32_t [5], int);
  unsigned char  *sPtr, dblt_sign, *decimalPtr;
    /* set up for precision of 792 digits, 18/ull * 44ull */
    /* malloc if exponent and precision dictate, 44ull = 352 bytes */
    /* reg: static reserved,
     * regs: actual reg use address (stack or allocated) */
  _ull_t reg[45], *regs;
  /* suppress warning due to zero jump; zero runs through the allocation unit
   * in case of large precision requests. */
#if __GNUC_PREREQ(4,5)
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    /* regs: top of stack, ubPtr: bottom of stack, ufPtr: fractional start */
  _ull_t *ubPtr, *ufPtr;
  int d0;   /* Digit0 (first digit) */
#if __GNUC_PREREQ(4,5)
#pragma GCC diagnostic pop
#endif
  unsigned strsz, pw2, packed_return;

                        /*** input defaults ***/
  if (arg_prec < 0)  arg_prec = 6;
    /* CASE 'g':  initial zero(or first digit) is significant digit */
  if ((arg_prec) && (IS_TYPE_G(type)))  --arg_prec;

                        /*** sign ***/
  if ((w160[4] & 0x8000) != 0)  dblt_sign = '-';
  else {
    dblt_sign = 0;
    if ((arg_flags & (SIGN_PREFIX | SPC_PREFIX)) != 0)
      dblt_sign = (arg_flags & SPC_PREFIX) ? ' ' : '+';
  }
                        /*** exponent ***/
    /* Case where bypasses all below code */
  if ((exponent = w160[4] & 0x7FFF) == 0x7FFF) {
      /* inf or nan, nan(n-char-sequence) */
    *(sPtr = *s) = dblt_sign, sPtr += (dblt_sign != 0);
    return __n_char_sequence(w160, sPtr,
                                      (type | ((arg_flags & ALT_PREFIX) << 1)));
  }
    /* Mantissa greater than 64 can use up to 7 ulls. Establish default. */
    /* digit calculations err on the side of overestimating what is needed */
  uint32_t mbit = __mbits[(type >> 11)];
  pw2 = (mbit > 64) ? 7 : 4;
    /* Decide ulls needed (pw2), and math_func, based on exponent. */
  if (exponent < 0x3FFF) {
#ifdef LOW_PRECISION
    if ( ((exponent <= NEG_CUTIN) && (exponent >= NEG_CUTOUT)) &&
                                            (arg_prec < NEG_FULL_IN) ) {
      math_func = __negative_exponent_lp;
    } else
#endif
    {
      math_func = __negative_exponent_fp;
        /* Type 'f' or large precisions must calculate needed ulls. */
      if ( ((((uint32_t)(0x3FFF - exponent) >> 4) != 0) && (arg_prec >= 63)) ||
          (!IS_TYPE_E(type)) ) {
          /* Firstly calculate the number of prefixing zeros (log2). */
        uint32_t dgt;
        dgt = ((uint32_t)(((uint64_t)(0x3FFF - exponent)
                                                    * 2585827973U) >> 32)) >> 1;
          /* With 'f', if arg_prec less than prefixing, return is a string
           * of zeroes. Must set up for here, so we can enter into the
           * allocation section */
        if ( (IS_TYPE_F(type)) && ((unsigned)arg_prec < dgt) ) {
          exponent = 0;
          *(uint64_t*)&w160[2] = (*(uint64_t*)&w160[0] = 0);
        } else {
            /* Calculate significant digits. */
          dgt = mbit - 1 + (0x3FFF - exponent) - dgt;
          if (dgt > (unsigned)arg_prec)  dgt = arg_prec;
            /* adjust default ulls by dividing digits by 18 */
          if (dgt > 38)
            pw2 += (uint32_t)((((uint64_t)(uint32_t)(dgt - 38))
                                                 * (uint32_t)238609295U) >> 32);
    } } }
  } else {
      /* Positive exponents. Up to 2^128 can have fractional ull needs */
    if (exponent >= (0x3FFF + 128))
      pw2 = 2;
#ifdef LOW_PRECISION
      /* Type 'f' precision only applies to < 2^128, greater is just zeroes */
    if ( ( ((exponent >= POS_CUTIN) && (exponent <= POS_CUTOUT)) &&
          (arg_prec < POS_FULL_IN) ) && (!IS_TYPE_F(type)) )
        math_func = __positive_exponent_lp;
    else
#endif
    {
      math_func = __positive_exponent_fp;
        /* Type 'f' requires maximum ulls to cover all significant digits */
      if ( ((exponent >= 0x40c2) && (arg_prec >= 27)) || (IS_TYPE_F(type)) ) {
        uint32_t dgt;
        dgt = ((uint32_t)(((uint64_t)(exponent - 0x3FFF)
                                                    * 2585827973U) >> 32)) >> 1;
          /* 'e' doesn't need full precision */
        if ((IS_TYPE_E(type)) && ((unsigned)arg_prec < dgt))  dgt = arg_prec;
          /* divide by 18 for number ulls needed */
        pw2 = 1 + (uint32_t)(((uint64_t)((uint32_t)dgt) * 252645135U) >> 32);
      }
    }
  }
                        /*** allocation ***/
    /* allocate reg here based on exponent/precision, see above notes */
    /* group allocation, fprintf() to free, returning we allocated thru 's' */
  strsz = arg_prec + 9;
    /* sets loop limits for requested arg_prec */
  type |= pw2 << 16;
    /* type F has special requirements, if positive exponent */
  if ((type << 31) & ~(exponent - 0x3FFF)) {
      /* grouping assumption of thousands: 1,000,000 */
      /* add in exponent: [±][0](exponent)[.](precision)[e±000[0]] */
    uint32_t triplets = (exponent - 0x3FFF);
    strsz += 1
          + (((uint32_t)(((uint64_t)((uint32_t)triplets)
                                                   * 2585827973U) >> 32)) >> 1);
    if ((arg_flags & GRP_PREFIX) != 0)
      strsz += ((uint16_t)21845 * (uint32_t)triplets) >> 16;
  }
  regs = reg;
  if ((pw2 > 44) || (strsz > BUFSTRSIZE)) {
      /* if either static exceeds limit, allocate both */
    strsz = (strsz + ((1 << 6) - 1)) & ~((1 << 6) - 1);
      /* test for non-static string buffer */
    if (arg_flags & ALLOC_BUF)  free(*s);
      /* assignment to sPtr, sPtr is one tested and free'd in fprintf() */
      /* request 1 more than needed to overwrite pass limit.
       * if 7 ulls needed [0-7] actually needed (8) + 1 for excess write */
    if ((sPtr = malloc((size_t)(strsz + ((pw2 + 2) * sizeof(_ull_t))))) == NULL)
      return sPtr;
    *s = sPtr;
    regs = (_ull_t*)(sPtr + strsz);
  }
    /* load sPtr and sign, now that *s is known */
  *(sPtr = *s) = dblt_sign, sPtr += (dblt_sign != 0);
    /* zero regs, no longer part of math units */
  do regs[pw2].uls.hi = 0, regs[pw2].uls.lo = 0; while ((int32_t)(--pw2) >= 0);
    /* zero test now that allocation has occurred. Possible extreme '0' precision. */
  if ((exponent == 0) && ((*(uint64_t*)&w160[2] | *(uint64_t*)&w160[0]) == 0)) {
      /* Bypass math/rounding, but add correct signals */
    if (IS_TYPE_G(type))  type |= TYPE_F;
    type |= TYPE_ZERO;
    goto skip_round;
  }
                        /*** math ***/
  packed_return = (*math_func)(regs, w160, type);
    /* transfer info returned into addresses, exponent */
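    /* Packed layout produced by the math_func routines: bits 31:16 carry a
     * signed power-of-ten adjustment to the exponent, bits 13:4 the number of
     * ulls produced, and bits 3:0 describe the fractional ulls
     * (0 = none, 8 = entirely fractional, else fractional-ull count + 1). */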
  exponent = (d0 = _first_digit(regs->ull));
  exponent += (int32_t)packed_return >> 16;
  ubPtr = regs + ((packed_return >> 4) & 0x3FF);
  packed_return &= 0x0F;
  ufPtr = ((packed_return == 0) ? ubPtr :
          ((packed_return == 0x08) ? regs : ubPtr - (packed_return - 1)));

                        /*** rounding ***/
    /* assign position 1 past display precision */
  rpos = arg_prec + 1;
  if (IS_TYPE_F(type))  {
      /* this excludes positive exponents, they will not jump to skip */
    if (arg_prec < ~exponent)  goto skip_round;
      /* positive exponents must count pre-radix as precision */
    if (~exponent >> 31)
      arg_prec += exponent;
      /* when negative, remove pre-zero count, positive add in whole portion */
    rpos += exponent;
  }
    /* rounding position <= first digit position, has front end zeroes */
  if ((rpos -= d0) > 0) {
    _ull_t *aPtr = regs;
      /* test rounding for second ull or higher, assume not large arg_prec */
    while (((++aPtr) < ubPtr) && (rpos > 18))  rpos -= 18;
      /* verify position has digits to round */
    if (aPtr < ubPtr) {
        /* set last ull, if rpos == 1 then exclude current aPtr */
      ubPtr = aPtr + (rpos != 1);
      if (ufPtr > ubPtr)  ufPtr = ubPtr;
      const uint64_t *fc = _fcarry;
      if ((aPtr->ull += fc[18 - rpos]) >= _CARRY) {
        uint64_t t = aPtr[-1].ull + 1ULL;
          /* carry forced propagation, makes ull == 0 */
        (ubPtr = aPtr)->ull = 0;
        (--aPtr)->ull = t;
        if ((aPtr == regs) && (t >= fc[19 + d0])) {
          exponent++;
            /* type 'f' has increase in precision if positive exponent */
            /* condition of == 18 never occurs */
          arg_prec += (type & ((uint32_t)~exponent >> 31));
          ++d0;
        }
      }
    }
  } else {
      /* first ull rounding */
    const uint64_t *fc = _fcarry;
      /* limit reads of ulls to next register */
    ubPtr = regs + 1;
    if (ufPtr > ubPtr)  ufPtr = ubPtr;
      /* add '5' to rounding location, propagation test for first digit */
    if ((regs->ull += fc[-rpos]) >= fc[19 + d0]) {
      exponent++;
      if ((++d0) == 18) {
          /* rare case. inserted here to avoid altering ulltobuffer for an odd case
           * and testing all cases for this condition. */
          /* occurs @ 1e18, 1e-18
           9.9999950000000026e+17 first precision 6
           9.9999999999999987e+17 last
           9.9999950000000012e-19 first precision 6
           9.9999999999999988e-19 last
           */
          /* type f and negative exponent */
        decimalPtr = (unsigned char*)&decimal;
        if (((uint32_t)exponent >> 31) & type) {
          unsigned j, k;
          *sPtr++ = '0';
          *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
          j = 1 + (decimalPtr[1] != 0);
          sPtr += j, decimalPtr += j;
          if ((j = -exponent - 1)) {
            if (j >= (unsigned)arg_prec) goto L24;
            arg_prec -= j;
            k = j & 3;
            if ((j = (int)((unsigned)j >> 2)))
              do  *(uint32_t*)sPtr = 0x30303030U, sPtr += 4;  while ((--j));
            *(uint32_t*)sPtr = 0x30303030U;
            sPtr += k;
          }
          *sPtr++ = '1';
          if ((arg_prec -= 1) > 0)  goto L245;
          return sPtr;
        }
        *sPtr++ = '1';
        if ( (IS_TYPE_G(type)) && (((unsigned)exponent) <= (unsigned)arg_prec) )
          type |= TYPE_F;
        if ( (arg_prec) && (!(IS_TYPE_G(type)) || (arg_flags & ALT_PREFIX)) ) {
          unsigned dflag = (decimalPtr[1] != 0);
          *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
          sPtr += dflag + 1, decimalPtr += dflag + 1;
          goto L245;
        }
        goto L25;
      }
    }
  }
                    /*** adjust for entered type... g ***/
    /* rounding can adjust exponent, so now can test if conditions met */
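    /* ISO C rule for %g (background): use the %e style when the decimal
     * exponent is less than -4 or not less than the precision; otherwise
     * use the %f style. */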
  if (IS_TYPE_G(type)) {
      /* this should exclude negatives */
    if (((unsigned)exponent) <= (unsigned)arg_prec)  goto L23;
    if (((unsigned)(exponent + 4)) <= 4) {
      arg_prec -= exponent;
L23:  type |= TYPE_F;
    }
  }
                        /*** output section ***/
skip_round:
  decimalPtr = (unsigned char*)&decimal;
    /* type f and negative exponent, <zero> */
  if ((((uint32_t)exponent >> 31) & type) | (IS_TYPE_ZERO(type))) {
    unsigned j, k;
    *sPtr++ = '0';
    *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
    j = 1 + (decimalPtr[1] != 0);
    if (!arg_prec) {
      if (IS_TYPE_E(type))  goto L11;
      if (arg_flags & ALT_PREFIX)  return (sPtr + j);
      return sPtr;
    }
    sPtr += j, decimalPtr += j;
    if ((j = -exponent - 1)) {
      if (j >= (unsigned)arg_prec) goto L24;
      arg_prec -= j;
      k = j & 3;
      if ((j = (int)((unsigned)j >> 2)))
        do  *(uint32_t*)sPtr = 0x30303030U, sPtr += 4;  while ((--j));
      *(uint32_t*)sPtr = 0x30303030U;
      sPtr += k;
    }
  }
  {
    /*      first ull
     If regs == ufPtr, all digits are part of the precision.  d0 does not
     count digits; it returns log10, so log10(1..9) == 0.
     For type 'f' the data starts at sPtr, otherwise at sPtr + size of the
     decimal point.  The conversion writes over the last char of the decimal
     point; that char is then copied to sPtr, and the decimal point is
     rewritten over it.
     (!IS_TYPE_F(type))  stringPtr = sPtr + decimalsz
     Whether the first digit counts toward the precision is determined by
     (decimalPtr == (char*)&decimal).
     A two-byte decimal point is detected by (((char*)&decimal)[1] != 0).
     */

      /* deal with d0 */
      /* _fcarry multiplier based on d0, then can adjust d0 */
    const uint64_t *mConst = &_fcarry[35 - d0];
    if ((d0 += ((regs == ufPtr) & IS_TYPE_F(type))) > arg_prec)  d0 = arg_prec;
    unsigned char *stringPtr = sPtr + 1 + !!(((char*)&decimal)[1]);
    if (IS_TYPE_F(type))  stringPtr = sPtr;
    _ulltobuffer_nlr((regs->ull * *mConst), stringPtr,
                          (int)(d0 + (decimalPtr == (unsigned char*)&decimal)));

      /* type 'e' requires radix after first digit */
    if (!IS_TYPE_F(type)) {
        /* copy first digit and decimal point */
      *(uint16_t*)sPtr = cconst16_t((*stringPtr), (*decimalPtr));
      sPtr += 2;
        /* update and check for 2 byte decimal */
      if (*++decimalPtr != 0)  *sPtr = *decimalPtr, sPtr++, decimalPtr++;
      if (!arg_prec) {
        if (arg_flags & ALT_PREFIX)
              sPtr += (((char*)&decimal)[1] != 0);
        else  sPtr -= 1 + (((char*)&decimal)[1] != 0);
        goto L11;
      }
    }
      /* increment by number of digits and significant when TYPE_F */
    sPtr += d0 + (decimalPtr == (unsigned char*)&decimal);
    if ((arg_prec -= d0) == 0)  goto L25;
  }

  do {
      /* case where has whole/fraction */
    if ((++regs) >= ufPtr) {
      if (decimalPtr == (unsigned char*)&decimal) {
          /* this is also used as the delimiter for the whole part and as the
           * stop point for trailing-zero removal */
        unsigned dflag = (decimalPtr[1] != 0);
        *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
        sPtr += dflag + 1, decimalPtr += dflag + 1;
      }
        /* set to last ull, if not already at */
      if (ufPtr == ubPtr)  break;
      ufPtr = ubPtr;
    }
    d0 = (arg_prec < 18) ? arg_prec : 18;
    _ulltobuffer_nlr(regs->ull, sPtr, d0);
    sPtr += d0;
    if ((arg_prec -= d0) == 0)  goto L25;
  } while (1);
L24:
  if ( (arg_prec) && (!(IS_TYPE_G(type)) || (arg_flags & ALT_PREFIX)) ) {
      /* safe because of added room for exponent in allocation */
    unsigned k;
  L245:
    k = arg_prec & 3;
    arg_prec = (int)((unsigned)arg_prec >> 2);
    *(uint32_t*)sPtr = 0x30303030U;
    if (arg_prec)
      do  sPtr += 4, *(uint32_t*)sPtr = 0x30303030U;  while ((--arg_prec));
    sPtr += k;
  }
L25:
    /* case 'g', remove any trailing <zero>s */
  if (IS_TYPE_E(type))  goto L11;
  if (!(arg_flags & ALT_PREFIX)) {
      /* test only the 'last' byte of the decimal point.
       * '== 0' only occurs if it was written; the decimal point is at most 2 bytes */
    if (*decimalPtr == 0) {
      unsigned char ch;
      ch = *--sPtr;
      if (IS_TYPE_G(type))
        while (ch == '0')  ch = *--sPtr;
        /* decimal point is at max 2 bytes */
      if (ch != (unsigned char)decimalPtr[-1])  ++sPtr;
      else if (&decimalPtr[-1] != (unsigned char*)&decimal) --sPtr;
    }
  } else if (*decimalPtr != 0) {
    unsigned dflag = (decimalPtr[1] != 0);
    *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
    sPtr += dflag + 1, decimalPtr += dflag + 1;
  }
                        /*** exponent output ***/
  if (IS_TYPE_F(type))  return sPtr;
L11:
    /* converts g,G to e,E and adds sign (portable character set) */
  *(uint16_t*)sPtr = cconst16_t(('e' ^ ((type & 0x20) ^ UPPER_TYPE)),
                                ('+' - ((exponent >> 31) << 1)));
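    /* assumes an arithmetic right shift: (exponent >> 31) is 0 or -1, so
     * '+' - ((exponent >> 31) << 1) picks '+' or '-' ('+' + 2 == '-'),
     * and the xor-then-subtract below is a branchless absolute value */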
    /* absolute value */
  exponent = (exponent ^ (exponent >> 31)) - (exponent >> 31);
  if (exponent < 10) {
      /* exponent output requires a 2-digit display */
    *(uint16_t*)(&sPtr[2]) = cconst16_t('0',('0' + exponent));
    return (sPtr + 4);
  }
    /* quadruple, long doubles range [10,4966] */
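    /* The multiply by 2199023989 (~2^41/1000) together with the <<19 and
     * >>32 shifts appears to form exponent/1000 as a Q4.28 fixed-point
     * value; each ((ecd << 4) >> 4) * 10 step below then exposes the next
     * decimal digit in the top four bits.  E.g. exponent == 4932 yields
     * 4, 9, 3, 2 on successive steps. */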
  uint32_t ecd = (uint32_t)((((uint32_t)exponent << 19) * 2199023989ULL) >> 32);
  ++sPtr;
  if (exponent >= 1000)  *++sPtr = (ecd >> 28) + '0';
  ecd = ((ecd << 4) >> 4) * 10;
  if (exponent >= 100)   *++sPtr = (ecd >> 28) + '0';
  sPtr[1] = ((ecd = (((ecd << 4) >> 4) * 10)) >> 28) + '0';
  sPtr[2] = ((((ecd << 4) >> 4) * 10) >> 28) + '0';
  return (sPtr + 3);
}

/* "[±]0xh.hhhhp±d"
 * normalized: h. went with '1.x'
 * denornalize: .h went with '0.x'
 * output precision based on 'g' style
 */
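/* A minimal illustration of the intended shape (assumed, matching standard
 * %a semantics): 1.5 -> "0x1.8p+0", 0.0 -> "0x0p+0"; with the default
 * precision, trailing zero nibbles are trimmed as in the 'g' style. */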
unsigned char *
realtohex(uint32_t w160[5], unsigned char **s, int type,
         int arg_prec, int arg_flags, unsigned decimal) {

  unsigned char *sPtr = *s;
  int exponent;
  octle result;
  uint32_t pcnt;
  int32_t rounding = __mbits[(type >> 11)] >> 2;
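    /* __mbits[] presumably holds the mantissa bit count for the argument's
     * floating type; >> 2 converts it to the number of hex digits after the
     * point.  'rounding' is later reused as a flag: round only when the
     * requested precision is shorter than the full significand. */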

                      /*** input defaults ***/
  char *decimalPtr = (char*)&decimal;
  uint32_t dflag = (decimalPtr[1] != 0);
  if (arg_prec < 0) {
    arg_prec = rounding;
    type |= TYPE_G;
  }
  rounding = (arg_prec < rounding);

    /* determine upper or lower case output */
  const char    *_outvalues = _const_outStr;
  if (IS_UPPER_TYPE(type)) _outvalues += 32;

                      /*** sign && exponent ***/
  exponent = w160[4] & 0x7FFF;
  if ((w160[4] & 0x8000) != 0)  *sPtr++ = '-';
  else {
    if ((arg_flags & (SIGN_PREFIX | SPC_PREFIX)) != 0)
      *sPtr++ =  (arg_flags & SPC_PREFIX) ? ' ' : '+';
  }
  if (exponent == 0x7FFF)
    return __n_char_sequence(w160, sPtr,
                                  (type | ((arg_flags & ALT_PREFIX) << 1)));

                      /*** prefix  0x or 0X ***/
  *((uint16_t *)sPtr) = ((uint16_t *)_outvalues)[12];
  sPtr += 2;
    /* pre-output hidden 'whole' bit */
  *sPtr++ = '1';

                      /*** fractional load ***/
  result = *(octle*)w160;
    /* zero/denormal check */
  if ((exponent -= 16383) == -16383) {
    sPtr[-1] = '0';
    if ((result.ulls.ull1 == 0) && (result.ulls.ull0 == 0)) {
      arg_prec = (exponent = 0);
      if (!(arg_flags & ALT_PREFIX))  goto out_exp;
    }
  }
    /* decimal point */
  *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
  sPtr += dflag + 1, decimalPtr += dflag + 1;
  if (arg_prec == 0) {
    if (!(arg_flags & ALT_PREFIX))
      sPtr -= dflag + 1, decimalPtr -= dflag + 1;
    goto out_exp;
  }

                        /*** rounding ***/
  if (rounding) {
      /* when arg_prec is less than 'type's' nibble count */
    uint32_t rbit = (arg_prec << 2);
    if (rbit >= 64) {
      if (rbit == 64) {
        result.ulls.ull1 += (result.ulls.ull0 > 0x8000000000000000ULL);
        goto r1_check;
      }
      rbit -= 64;
      result.ulls.ull0 +=
          (uint64_t)((result.ulls.ull0 & (0xffffffffffffffffULL >> rbit))
                     > (0x8000000000000000ULL >> rbit)) << (63 - rbit);
      if (result.ulls.ull0 < *(uint64_t*)w160) {
        result.ulls.ull1 += 1ULL;
        goto r1_check;
      }
    } else {
      result.ulls.ull1 +=
          (uint64_t)((result.ulls.ull1 & (0xffffffffffffffffULL >> rbit))
                     > (0x8000000000000000ULL >> rbit)) << (63 - rbit);
    r1_check:
      if (result.ulls.ull1 < *(uint64_t*)&w160[2]) {
        result.ulls.ull1 = 0x8000000000000000ULL;
        exponent++;
      }
    }
  }

                        /*** output ***/
    /* work as 2-64bits */
  pcnt = (arg_prec < 16) ? arg_prec : 16;
  arg_prec -= pcnt;
  do {
      /* rotate into position */
    result.ulls.ull1 = (result.ulls.ull1 >> 60) | (result.ulls.ull1 << 4);
    *sPtr++ = _outvalues[(result.uls.ul2 & 0x0000000F)];
  } while ((--pcnt));
  if ((arg_prec != 0) && (result.ulls.ull0 != 0)) {
    pcnt = (arg_prec < 16) ? arg_prec : 16;
    arg_prec -= pcnt;
    do {
        /* rotate into position */
      result.ulls.ull0 = (result.ulls.ull0 >> 60) | (result.ulls.ull0 << 4);
      *sPtr++ = _outvalues[(result.uls.ul0 & 0x0000000F)];
    } while ((--pcnt));
  }

  if (IS_TYPE_G(type)) {
    unsigned char ch = *--sPtr;
    while (ch == '0')  ch = *--sPtr;
    if (ch != (unsigned char)decimalPtr[-1])  ++sPtr;
    else if (&decimalPtr[-1] != (char*)&decimal) --sPtr;
  } else if (arg_prec) {
    pcnt = arg_prec & 3;
    arg_prec = (int)((unsigned)arg_prec >> 2);
    *(uint32_t*)sPtr = 0x30303030U;
    if (arg_prec)
      do  sPtr += 4, *(uint32_t*)sPtr = 0x30303030U;  while ((--arg_prec));
    sPtr += pcnt;
  }

            /*** exponent output (p±d p[-16494,+16383]) ***/
out_exp:
  *sPtr++ = _outvalues[19];
    /* based on portable character set */
  *sPtr++ = '+' - ((exponent >> 31) << 1);
    /* absolute value */
  exponent = (exponent ^ (exponent >> 31)) - (exponent >> 31);

  if (exponent < 10) {
    *sPtr = exponent + '0';
    return (sPtr + 1);
  }
    /* divide exponent by 10000   3518437209 */
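    /* 3518437209 == ceil(2^45 / 10000); the multiply and >>17 (plus the
     * 1<<14 correction) appear to form exponent/10000 as a Q4.28 fixed-point
     * value, after which the same shift-and-multiply-by-10 digit extraction
     * as in the decimal path applies. */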
  uint32_t ecd = (uint32_t)(((uint64_t)(uint32_t)exponent * 3518437209U) >> 17);
  ecd += 1 << 14;
  if (exponent >= 10000)  *sPtr++ = (ecd >> 28) + '0';
  ecd = ((ecd << 4) >> 4) * 10;
  if (exponent >= 1000)   *sPtr++ = (ecd >> 28) + '0';
  ecd = ((ecd << 4) >> 4) * 10;
  if (exponent >= 100)    *sPtr++ = (ecd >> 28) + '0';
  sPtr[0] = ((ecd = (((ecd << 4) >> 4) * 10)) >> 28) + '0';
  sPtr[1] = ((((ecd << 4) >> 4) * 10) >> 28) + '0';

  return (sPtr + 2);
}

#pragma mark *** Utilities ***

#include <errno.h>
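
/* Layout of the w160[5] working format, as inferred from the converters
 * in this file (an assumption, not stated documentation):
 *   w160[4]    sign in bit 15, 15-bit exponent biased by 16383
 *   w160[3..0] fraction bits (implicit leading '1' removed for normals),
 *              left-justified across 128 bits, w160[3] most significant;
 *              infinities and NaNs use an all-ones exponent field. */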

double
wtod(uint32_t w160[5]) {

  octle   result = *(octle *)w160;
  unsigned char *cw = (unsigned char*)&result.ulls.ull0;

    /* possibility of rounding forcing ff */
  if ((w160[4] & 0x7fff) <= 0x3C00) {
#ifdef LEGACY_DENORMAL
    uint16_t shift = 0x3C00 - (w160[4] & 0x7fff);
    if (shift <= 52) {
        /* when shift == 52 0xfffffffffffff fff >> 0 > 0x8000000000000 000 >> 0
           when shift == 0  0x0000000000000 fff      > 0x0000000000000 800 */
      uint32_t round =
        (((result.ulls.ull1 & (0xffffffffffffffffULL >> (52 - shift)))
         | (result.ulls.ull0 != 0))
                            > (0x8000000000000000ULL >> (52 - shift)));
        /* 52nd bit '1' hidden */
      result.ulls.ull0 = (result.ulls.ull1 >> 1) | 0x8000000000000000ULL;
      if (shift == 52)
        result.ulls.ull0 = (uint64_t)round;
      else
        result.ulls.ull0 = (result.ulls.ull0 >> (shift + 12)) + (uint64_t)round;
    } else {
      if (shift != 0x3C00)
        errno = ERANGE;
      result.ulls.ull0 = 0;
    }
    result.uls.ul1 |= (w160[4] & 0x8000) << 16;
#else
    if ((w160[4] & 0x7fff) != 0x3C00)
      errno = ERANGE;
    result.uls.ul0 = 0;
    result.uls.ul1 = (w160[4] & 0x8000) << 16;
#endif
    return *(double*)cw;
  }
  if ((w160[4] & 0x7fff) > 0x43FE) {
    if ((w160[4] & 0x7fff) >= 0x7FFF) {
        /* nan outputs lowest, drops highest */
      result.uss.us3 = (w160[4] & 0xFFF0) | (result.uss.us3 & 0x000F);
      return *(double*)cw;
    }
  dinf:
    errno = ERANGE;
    result.uls.ul0 = 0;
    result.uls.ul1 = ((w160[4] & 0x8000) << 16) | 0x7ff00000U;
    return *(double*)cw;
  }
  result.ulls.ull0 = (result.ulls.ull1 >> 12)
                      + (((result.uss.us4 & 0x0FFF) | (result.ulls.ull0 != 0)) > 0x800);
  result.uss.us3 += (((w160[4] & 0x7fff) - 16383 + 1023) << 4) | (w160[4] & 0x8000);
  if ((result.uss.us3 & 0x7ff0) == 0x7FF0)  goto dinf;
  return *(double*)cw;
}

#pragma mark ***** real to w160

/* quadruple is IEEE-754 binary128 */
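/* binary128 layout: 1 sign bit, 15 exponent bits (bias 16383),
 * 112 fraction bits with an implicit leading '1' for normals */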
void
qtow(quadruple qdbl, uint32_t w160[5]) {

    /* required strict aliasing conversion */
  octle *iqdbl;
  unsigned char *cqdbl = (unsigned char*)&qdbl;
  iqdbl = (octle*)cqdbl;

  w160[4] = iqdbl->uss.us7;
  *(uint64_t*)&w160[2] = iqdbl->ulls.ull1 << 16;
  *(uint16_t*)&w160[2] = iqdbl->uss.us3;
  *(uint64_t*)&w160[0] = iqdbl->ulls.ull0 << 16;
}

/* abstract long double */
void
ldtow(LD_SUBST ldbl, uint32_t w160[5]) {

    /* required strict aliasing conversion */
  octle *ildbl;
  unsigned char *cldbl = (unsigned char*)&ldbl;
  ildbl = (octle*)cldbl;

  *(uint64_t*)w160 = 0;
#if (__LDBL_MANT_DIG__ == 53)
  dtow((double)ldbl, w160);
#elif (__LDBL_MANT_DIG__ == 64)
  w160[4] = ildbl->uss.us4;
    /* nan/infinity require explicit '1' */
  if ((ildbl->uss.us4 & 0x7fff) != 0x7fff)
    *(uint64_t*)&w160[2] = ildbl->ulls.ull0 << 1;
  else
    *(uint64_t*)&w160[2] = ildbl->ulls.ull0;
#elif (__LDBL_MANT_DIG__ == 113)
  qtow((quadruple)ldbl, w160);
#elif (__LDBL_MANT_DIG__ == 106)
  octle result;
  int16_t e16, e16_2;
  w160[4] = (ildbl->uss.us7 & 0x8000);
  e16 = ((ildbl->uss.us7 & 0x7ff0) >> 4) - 1023 + 16383;
  w160[4] |= e16;
  e16_2 = ((ildbl->uss.us3 & 0x7ff0) >> 4) - 1023 + 16383;
    /* zero, do nothing */
  if (ildbl->ulls.ull1 == 0) {
    *(uint64_t*)&w160[2] = 0;
    return;
  }
  result = *ildbl;
  if (e16 == 0x43ff) {
    if (e16 != e16_2) {
        /* inf */
      result.uss.us7 = 0x8000;
    } else {
        /* nan */
      result.ulls.ull1 = (result.ulls.ull1 << 12)
                          | ((result.uls.ul1 & 0x000fffff) >> 8);
      result.ulls.ull0 <<= 24;
    }
    w160[4] |= 0x7fff;
    *(uint64_t*)&w160[2] = result.ulls.ull1;
    *(uint64_t*)&w160[0] = result.ulls.ull0;
    return;
  }
 #ifdef LEGACY_DENORMAL
  if (e16_2 == 0x3C00) {
    /* denormal, cases:
     * normal/denormal  merge
     * denormal/zero    do nothing
     */
    if (e16 == 0x3C00) {
      if (((result.uss.us7 & 0xf) == 0) && (result.uls.ul2 == 0)) {
        w160[4] &= 0x8000;
        *(uint64_t*)&w160[2] = 0;
        *(uint64_t*)&w160[0] = 0;
        return;
      }
      *(uint64_t*)&w160[2] = result.ulls.ull1 << 12;
      *(uint64_t*)&w160[0] = 0;
      return;
    }
    result.uss.us3 &= 0xf;
    result.ulls.ull0 <<= (53 - (e16 - 0x3C00));
      /* |xxxxxxxxxxxx 0aaaaaaaaaaa aaaaaaaaa|
       *  a's are shifted 1 because of 0 exponent */
    result.uss.us7 = (result.uss.us7 & 0xf) | ((e16 != 0x3C00) << 4);
      /* 12 bits from ull0 all 0s from exponent/sign/bit53 */
    result.ulls.ull1 = (result.ulls.ull1 << 12) | (result.uls.ul1 >> 8);
    result.ulls.ull0 <<= 24;
    *(uint64_t*)&w160[2] = result.ulls.ull1;
    *(uint64_t*)&w160[0] = result.ulls.ull0;
    return;
  }
 #endif
  /* Both doubles are normals */
  result.ulls.ull1 <<= 12;
  result.ulls.ull0 <<= 12;
  /* The first double can carry fewer than 52 mantissa bits if bits were
   * used up normalizing the second double.
   * The gap to the second double is more than likely greater than the
   * contiguous 52-bit mantissa would suggest, because the second double
   * needs its own implicit '1'. */
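  /* General double-double property (background, not specific to this code):
   * the value is the exact sum hi + lo with |lo| no larger than roughly one
   * ulp of hi, so lo's exponent is normally about 52-53 below hi's, and can
   * be arbitrarily lower when the combined mantissa contains runs of zeros. */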
  if ((e16 - 52) <= e16_2) {
      /* pulled bits from first double, add extra 11 bits to fill 1+63 */
    int16_t shift = e16_2 - (e16 - 52);
    result.ulls.ull0 >>= 1;
    result.uss.us3 |= 0x8000;
    result.ulls.ull1 |= (result.ulls.ull0 >> (51 - shift));
      /* clear ull1 bits */
    result.ulls.ull0 = result.ulls.ull0 << (shift + 13);
  } else {
      /* Second double >= 52 bits after first double. */
    int16_t shift = (e16 - 52) - e16_2;
    result.ulls.ull0 >>= 1;
    result.uss.us3 |= 0x8000;
    result.ulls.ull0 >>= shift - 1;
     /* pull top 11/12 bits into ull1 */
    result.ulls.ull1 |= result.ulls.ull0 >> 52;
    result.ulls.ull0 = result.ulls.ull0 << 12;
  }
  *(uint64_t*)&w160[2] = result.ulls.ull1;
  *(uint64_t*)&w160[0] = result.ulls.ull0;
#else
  #error: undefined mantissa digits
#endif
}

/* double is IEEE-754 binary64 */
void
dtow(double dbl, uint32_t w160[5]) {

    /* required strict aliasing conversion */
  uint64_t *idbl;
  unsigned char *cdbl = (unsigned char*)&dbl;
  idbl = (uint64_t*)cdbl;

  *(uint64_t*)w160 = 0;
  w160[4] = (uint32_t)(*idbl >> 52);
  *(uint64_t*)&w160[2] = *idbl << 12;
  if ((w160[4] & 0x7FF) == 0x7FF)
    w160[4] = (w160[4] << 4) | 0x000F;
  else
    w160[4] = ((w160[4] & 0x7FF) + 16383 - 1023) | ((w160[4] & 0x800) << 4);
  if ((w160[4] & 0x7FFF) == 0x3c00) {
      /* conversion of denormals to normals */
    if (*(uint64_t*)&w160[2] != 0) {
      uint32_t shift;
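        /* CNTLZW/SHLDU32 are presumably the count-leading-zeros and
         * 64-bit shift-left-through-a-pair helpers defined earlier in this
         * file; they normalize the denormal fraction while the exponent is
         * reduced by the shift amount. */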
      if (w160[3] == 0) {
        CNTLZW(shift, w160[2]);
        w160[3] = w160[2] << (shift + 1);
        w160[4] -= 32 + shift;
        w160[2] = 0;
      } else {
        CNTLZW(shift, w160[3]);
        if (shift != 31) {
          uint32_t hi = w160[3], lo = w160[2];
          SHLDU32(hi, lo, (shift + 1));
          w160[3] = hi, w160[2] = lo;
        } else
          w160[3] = w160[2], w160[2] = 0;
        w160[4] -= shift;
      }
    } else {
        /* zero */
      w160[4] &= 0x8000;
    }
  }
}

/* float is IEEE-754 binary32 */
/* force float denormals to a normalized form, and zero to a plain zero */
void
ftow(float flt, uint32_t w160[5]) {

    /* required strict aliasing conversion */
  unsigned char *cf = (unsigned char*)&flt;

  *(uint64_t*)w160 = 0;
  *(uint64_t*)&w160[2] = 0;
  w160[4] = (*(uint32_t*)cf) >> 23;

  w160[3] = (*(uint32_t*)cf) << 9;
  if ((w160[4] & 0x0FF) == 0x0FF)
    w160[4] = (w160[4] << 7) | 0x00FF;
  else
    w160[4] = ((w160[4] & 0xFF) + 16383 - 127) | ((w160[4] & 0x100) << 7);
  if ((w160[4] & 0x7FFF) == 0x3f80) {
      /* conversion of denormals to normals */
    if (w160[3] != 0) {
      uint32_t shift;
      CNTLZW(shift, w160[3]);
      w160[3] <<= (shift + 1);
      w160[4] -= shift;
    } else {
        /* zero */
      w160[4] &= 0x8000;
    }
  }
}

#pragma mark ***** real to real

/* convert binary128 to non-exact long double (113,106,64,53,etc) */
LD_SUBST
qtold(quadruple qdbl) {

#if (__LDBL_MANT_DIG__ == 113)
    /* C type 'long double' defined same as quadruple, just cast */
  return (LD_SUBST)qdbl;
#elif (__LDBL_MANT_DIG__ == 53)
    /* binary64 reduces exponent size, and mantissa */
  return (long double)qtod(qdbl);
#elif (__LDBL_MANT_DIG__ == 64)
    /* e80 stores an explicit '1' (integer bit) between exponent and mantissa,
     * along with reducing the mantissa bits */
  octle result = (octle)qdbl;
  unsigned char *cld = (unsigned char*)&result;
    /* NaN uses lowest bits, bit 0 start, for n-char-sequence */
  if ((result.uss.us7 & 0x7fff) != 0x7fff) {
    uint32_t round = ((result.ulls.ull0 & 0x0001ffffffffffff)
                      > 0x0001000000000000);
      /* << 16 >> 1 used to clear 63rd bit to watch for overflow */
    result.ulls.ull0 = (((result.ulls.ull1 << 16) >> 1)
                        | (result.uss.us3 >> 1)) + (uint64_t)round;
      /* add possible 'overflow', could use ull1>>48 if want clean pad bits */
    result.uss.us4 = result.uss.us7 + (result.uss.us3 >> 15);
      /* make sure didn't cause NaN */
    if ((result.uss.us4 & 0x7fff) == 0x7fff)
      errno = ERANGE;
 #ifdef LEGACY_DENORMAL
    result.uss.us3 |= ((result.uss.us4 & 0x7fff) != 0) << 15;
 #else
    result.uss.us3 |= 0x8000;
 #endif
  } else {
    result.uss.us4 = result.uss.us7;
      /* e80s require forced 63rd bit */
    result.uss.us3 |= 0x8000;
  }
  return *(long double*)cld;
#elif (__LDBL_MANT_DIG__ == 106)
  octle result;
  unsigned char *cld = (unsigned char*)&result;
  result = qdbl;
  uint32_t shift;
  int16_t sign = result.uss.us7 & 0x8000;
  int16_t e16 = result.uss.us7 & 0x7FFF;
  if (e16 == 0x7FFF) {
    result.ulls.ull1 = (result.ulls.ull1 << 12) | (result.uss.us3 >> 4);
    result.uss.us3 = sign | (0x7ff0) | (result.uss.us3 & 0xf);
    result.uss.us7 = sign | (0x7ff0) | (result.uss.us7 & 0xf);
  } else if (e16 > 0x43FE) {
    errno = ERANGE;
    result.ulls.ull1 = result.ulls.ull0 = 0;
    result.uss.us3 = (result.uss.us7 = sign | (0x7ff0));
  } else if (e16 > 0x3C00) {
      /* force 106 bits */
    result.ulls.ull0 += (uint64_t)(((result.uls.ul0 & 0x7f) > 0x40) << 7);
    if (result.uls.ul1 < qdbl.uls.ul1)  result.ulls.ull1 += 1ULL;
    result.uss.us0 &= ~0x7f;
      /* set ull1 */
    result.ulls.ull1 = (result.ulls.ull1 << 4) | (result.ulls.ull0 >> 60);
    result.uss.us7 = sign
                   | ((e16 - 0x3FFF + 0x3FF) << 4) | (result.uss.us7 & 0xf);
      /* adjust flush left bits used to create ull0 */
    if ((result.ulls.ull0 <<= 4) == 0) {
      result.uss.us3 = sign;
      return *(quadruple*)cld;
    }
      /* determine the ull0 possibilities:
       * the first double's shift plus the second's is either
       * above 0 or <= 0; the second's shift must account for the shift
       * needed to create the above-0 case's implicit '1' */
    if ((e16 - 52) <= 0x3C00) {
        /* denormalize second double, 13 'cause 0 uses >>1 */
      shift = 0x3C00 - (e16 - 52) + 13;
      if (shift >= 64) {
        result.ulls.ull0 = 0;
        result.uss.us3 = sign;
      } else {
        result.ulls.ull0 >>= shift;
        result.uss.us3 = sign | (result.uss.us3 & 0xf);
      }
      return *(quadruple*)cld;
    }
    if (result.uls.ul1 != 0) {
      CNTLZW(shift, result.uls.ul1);
      shift++;
    } else {
      CNTLZW(shift, result.uls.ul0);
      shift += 33;
    }
    if ((e16 - (52 + shift)) > 0x3C00) {
        /* use shift for drop of implicit '1' and determining exponent */
        /* don't count implicit '1' drop into exponent */
      sign |= ((e16 - 0x3FFF + 0x3FF) - (52 + shift)) << 4;
      result.ulls.ull0 <<= shift;
      result.ulls.ull0 >>= 12;
      result.uss.us3 = sign | (result.uss.us3 & 0xf);
      return *(quadruple*)cld;
    } else {
        /* recalculate shift, must find implicit '1' from ull1 */
      int16_t right_shift;
        /* ffs returns bit index + 1, so bit 0 == 1 */
      if ((right_shift = ffs((int)result.uls.ul2)) == 0) {
        right_shift = ffs((int)result.uls.ul3) + 32;
      }
        /* drop 1 bit from ull1 */
      result.ulls.ull1 &= (0xffffffffffffffffULL << right_shift);
        /* adds 0's prior to right_shift */
      uint32_t round;
      round = ((result.ulls.ull0
                & (0xffffffffffffffffULL >> (64 - (right_shift + 12))))
               > (0x8000000000000000ULL >> (64 - (right_shift + 12))));
      result.ulls.ull0 >>= (right_shift - 1);
      result.ulls.ull0 >>= 12;
      result.ulls.ull0 += (uint64_t)round;
      sign |= ((e16 - (52 - (right_shift - 1))) - 0x3C00) << 4;
      result.uss.us3 = sign | (result.uss.us3 & 0xf);
      return *(quadruple*)cld;
    }
  } else {
    shift = 0x3C00 - e16;
    if (shift == 0) {
      result.ulls.ull1 = (result.ulls.ull1 << 3) | (result.uss.us3 >> 13);
      result.ulls.ull0 <<= 3;
    } else if (shift <= 52) {
        /* 52nd bit '1' hidden */
      result.ulls.ull1 = (result.ulls.ull1 << 15) | 0x8000000000000000ULL
                          | (result.uls.ul1 >> 17);
      uint32_t round =
        (((result.ulls.ull1 & (0x7fffffffffffffffULL >> (64 - (shift + 12))))
         | (result.ulls.ull0 != 0))
                            > (0x4000000000000000ULL >> (64 - (shift + 12))));
      if (shift == 52)
        result.ulls.ull1 = (uint64_t)round;
      else
        result.ulls.ull1 = (result.ulls.ull1 >> (shift + 12)) + (uint64_t)round;
    } else {
      if (shift != 0x3C00)  errno = ERANGE;
      result.ulls.ull1 = 0;
    }
    result.uss.us7 = sign | (result.uss.us7 & 0xf);
    result.ulls.ull0 = 0;
    result.uss.us3 = sign;
  }
  return *(quadruple*)cld;
#else
  #error: undefined mantissa digits
#endif
}

/* convert binary128 to binary64 */
double
qtod(quadruple qdbl) {

  uint32_t w160[5];
  unsigned char *wc = (unsigned char *)w160;
  octle iqdbl = qdbl;
  w160[4] = iqdbl.uss.us7;
  if ((iqdbl.uss.us7 & 0x7fff) != 0x7fff) {
    *(uint64_t*)&wc[8] = (iqdbl.ulls.ull1 << 16) | iqdbl.uss.us3;
    *(uint64_t*)wc = (iqdbl.ulls.ull0 << 16);
  } else {
    *(uint64_t*)&wc[8] = iqdbl.ulls.ull1;
    *(uint64_t*)wc = iqdbl.ulls.ull0;
  }
  return wtod(w160);
}

/* convert abstract long double to binary64 */
double
ldtod(LD_SUBST ldbl) {

#if (__LDBL_MANT_DIG__ == 53)
  unsigned char *wc = (unsigned char*)&ldbl;
  return *(double*)wc;
#elif (__LDBL_MANT_DIG__ == 106)
  unsigned char *wc = (unsigned char*)&ldbl.ulls.ull1;
  return *(double*)wc;
#elif (__LDBL_MANT_DIG__ == 64)
  unsigned char *wc = (unsigned char*)&ldbl;
  octle result = *(octle*)wc;
  if ((result.uss.us4 & 0x7fff) == 0x7fff) {
    result.uss.us3 = (result.uss.us3 & 0x000f) | (result.uss.us4 & 0xfff0);
  } else {
    int16_t sign, e16;
    sign = result.uss.us4 & 0x8000;
    e16  = result.uss.us4 & 0x7fff;
    if (e16 > 0x43FE) {
      errno = ERANGE;
      result.ulls.ull0 = 0;
      result.uss.us3 = sign | (0x7ff0);
    } else if (e16 > 0x3C00) {
        /* reminder: the explicit bit 63 lands at the double's hidden-bit position, 63 - 11 = 52 */
      result.ulls.ull0 = ((result.ulls.ull0 >> 11)
                          + ((result.uss.us0 & 0x07ff) > 0x0400));
      result.uss.us3 = (result.uss.us3 & 0x000f)
                        | sign | ((e16 - 0x3FFF + 0x3FF) << 4);
 #ifdef LEGACY_DENORMAL
    } else if (e16 > 0x3BCC) {
      int16_t shift = 0x3C00 - e16;
      uint32_t round;
      round = ((result.ulls.ull0
                & (0xffffffffffffffffULL >> (64 - (shift + 12))))
               > (0x8000000000000000ULL >> (64 - (shift + 12))));
      result.ulls.ull0 = (result.ulls.ull0 >> (shift + 12)) + (uint64_t)round;
      result.uss.us3 = (result.uss.us3 & 0x000f) | sign;
    } else {
      errno = ERANGE;
      result.ulls.ull0 = 0;
      result.uss.us3 = sign;
 #else
    } else {
      if (e16 != 0x3C00)
        errno = ERANGE;
      result.ulls.ull0 = 0;
      result.uss.us3 = sign;
 #endif
    }
  }
  return *(double*)(wc = (unsigned char*)&result);
#elif (__LDBL_MANT_DIG__ == 113)
  uint32_t w160[5];
  ldtow(ldbl, w160);
  return wtod(w160);
#else
  #error: undefined mantissa digits
#endif
}



int
main(void) {
  phx_printf("%.43f\n", 0x1.52f8a8e32e982p-140);
  phx_printf("%g\n", 355.0/113.0);
  return 0;
}




  reply	other threads:[~2021-11-06 23:25 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CAL9Mx1uMOz2wfqbMpD_xfA=D9JkpFzVz6AR_DKHK34AvrGOP6w@mail.gmail.com>
2021-11-03 21:26 ` Fwd: " Pavel M
2021-11-04 12:24   ` Corinna Vinschen
2021-11-04 16:48     ` Keith Packard
2021-11-04 17:27       ` Dave Nadler
2021-11-04 17:58         ` Keith Packard
2021-11-04 21:28           ` Brian Inglis
2021-11-05  2:54             ` Keith Packard
2021-11-05 16:00               ` Steven Abner
2021-11-06 12:06                 ` Steven Abner
2021-11-06 23:25                   ` Steven Abner [this message]
2021-11-08  9:57               ` Corinna Vinschen
2021-11-04 20:57       ` Corinna Vinschen
2021-11-28  7:43     ` Takashi Yano
2021-11-28 12:38       ` Takashi Yano
2021-11-28 13:16         ` Takashi Yano
2021-11-29 10:56           ` Takashi Yano
2021-11-29 14:24             ` Takashi Yano
2021-11-29 15:55               ` Corinna Vinschen
2021-11-30 10:51                 ` Takashi Yano
2021-11-30 15:09                   ` Corinna Vinschen
2021-11-30 20:38                     ` Takashi Yano
2021-11-30 21:17                       ` Takashi Yano
2021-12-06  9:53                         ` Corinna Vinschen
2021-12-06 10:26                           ` Takashi Yano
2021-12-01 12:05                     ` Takashi Yano
