#include /* must define a data model */ #ifndef __LP64__ #define __LP32__ 1 #endif /* found out 24 Jan 2019 from new computer build */ /* gcc does not define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ */ #ifndef __LITTLE_ENDIAN__ #if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) #define __LITTLE_ENDIAN__ #else #define __BIG_ENDIAN__ #endif #endif typedef signed char int8_t; typedef unsigned char uint8_t; typedef short int16_t; typedef unsigned short uint16_t; typedef int int32_t; typedef unsigned int uint32_t; typedef long long int64_t; typedef unsigned long long uint64_t; #if __SIZEOF_INT128__ == 16 typedef __int128 int128_t; typedef unsigned __int128 uint128_t; #endif typedef union { #if __SIZEOF_INT128__ == 16 uint128_t uo; #endif #ifdef __BIG_ENDIAN__ struct { uint64_t ull1, ull0; } ulls; struct { uint32_t ul3, ul2, ul1, ul0; } uls; struct { uint16_t us7, us6, us5, us4, us3, us2, us1, us0; } uss; #else /* __LITTLE_ENDIAN__ */ struct { uint64_t ull0, ull1; } ulls; struct { uint32_t ul0, ul1, ul2, ul3; } uls; struct { uint16_t us0, us1, us2, us3, us4, us5, us6, us7; } uss; #endif } octle; /* defined in as_stdio.h */ #define quadruple octle /* gcc appears to only defines _LP64 __LP64__ (2019) */ /* only models with 64bit ints */ #if defined (__ILP64__) || defined (__SILP64__) typedef int64_t intmax_t; typedef uint64_t uintmax_t; #elif defined (__LP32__) typedef int16_t intmax_t; typedef uint16_t uintmax_t; #else /* __LP64__ has 32bit int */ typedef int32_t intmax_t; typedef uint32_t uintmax_t; #endif /* these are all register size based */ #if defined (__LP32__) || defined (__ILP32__) typedef int32_t intptr_t; typedef uint32_t uintptr_t; #define __SIZEOF_REGISTER_T__ 4 #define __SIZEOF_SIZE_T__ 4 #else typedef int64_t intptr_t; typedef uint64_t uintptr_t; #define __SIZEOF_REGISTER_T__ 8 #define __SIZEOF_SIZE_T__ 8 #endif typedef intptr_t register_t; typedef intptr_t _intptr_t; typedef uintptr_t u_intptr_t; typedef intptr_t ssize_t; /* Used for a count of bytes or an error indication. */ typedef uintptr_t size_t; /* Used for sizes of objects. */ typedef intptr_t ptrdiff_t; /* Used for subtracting two pointers. */ typedef uintptr_t clock_t; /* Used for system times in clock ticks or CLOCKS_PER_SEC . */ typedef intptr_t time_t; /* Used for time in seconds. 
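 *
 * Note: all of the object-size and count types above (size_t, ssize_t,
 * ptrdiff_t, clock_t, time_t) deliberately track the register width chosen
 * by the data-model test: 4 bytes under __LP32__/__ILP32__, 8 bytes
 * otherwise (see __SIZEOF_REGISTER_T__).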
*/ #define NULL ((void *)0) #ifdef __APPLE__ #define FILE FILE_mac #elif defined __linux /* !__APPLE__ */ typedef struct _IO_FILE _IO_FILE; #define FILE _IO_FILE #else /* !__APPLE__ !__CYGWIN__ !__linux */ #error: unknown FILE structure #endif /* FILE * */ extern void *malloc(size_t); extern void free(void *); extern int puts(const char *); size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict); extern FILE* stdout; /* in ldtostr.c */ extern unsigned char * realtostr(uint32_t [5], unsigned char **, int, int, int, unsigned); extern unsigned char * realtohex(uint32_t [5], unsigned char **, int, int, int, unsigned); /* guarentee use of this printf */ int phx_printf(const char *format, ...); /* local defines */ #define ALT_PREFIX 0002 #define PREC_PREFIX 0400 /* converts to w160 type used by realtostr, * current use as 63-63 with implicit '1"-implicit nil */ static void qtow(quadruple qdbl, uint32_t w160[5]) { /* required strict aliasing conversion */ octle *iqdbl; unsigned char *cqdbl = (unsigned char*)&qdbl; iqdbl = (octle*)cqdbl; /* long double shifted to high bit of uint128_t, with implicit * bit becoming explicit at bit 127 */ w160[4] = iqdbl->uss.us7; *(uint64_t*)&w160[2] = (iqdbl->ulls.ull1 << 15) | (iqdbl->uls.ul1 >> 17); /* lower 64 set same position as upper, upper gets 'implicit '1' */ iqdbl->uss.us3 &= 1; *(uint64_t*)&w160[0] = (iqdbl->ulls.ull0 << 14); /* place implicit bit for all but denormal */ w160[3] |= (w160[4] != 0) << 31; } /* converts to w160 type used by realtostr */ static void ldtow(long double ldbl, uint32_t w160[5]) { /* required strict aliasing conversion */ octle *ildbl; unsigned char *cldbl = (unsigned char*)&ldbl; ildbl = (octle*)cldbl; w160[4] = ildbl->uss.us4; *(uint64_t*)&w160[2] = ildbl->ulls.ull0; *(uint64_t*)w160 = 0; } /* converts to w160 type used by realtostr */ static void dtow(double dbl, uint32_t w160[5]) { /* required strict aliasing conversion */ uint64_t *idbl; unsigned char *cdbl = (unsigned char*)&dbl; idbl = (uint64_t*)cdbl; *(uint64_t*)&w160[2] = *idbl << 11; w160[4] = (uint32_t)(*idbl >> 52); if ((w160[4] & 0x7FF) == 0x7FF) w160[4] = (w160[4] << 4) | 0x000F; else if (((w160[4] & 0x7FF) == 0) && (*(uint64_t*)&w160[2] == 0)) { w160[4] <<= 4; } else { w160[4] = ((w160[4] & 0x7FF) + 16383 - 1023) | ((w160[4] & 0x800) << 4); /* place implicit bit for all but denormal */ w160[3] |= ((w160[4] & 0x7FFF) != 0x3C00) << 31; } *(uint64_t*)w160 = 0; } /* not sure usefulness, gcc's code forces doubles in printf. Not sure of overhead * created by gcc and/or POSIX requirement of double argument. 
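 *
 * For reference, a rough worked example of the w160[] layout that dtow()
 * above should produce on a little-endian build: the double 1.0 has bit
 * pattern 0x3FF0000000000000; rebiasing the exponent from 1023 to 16383
 * gives w160[4] = 0x3FFF, and the mantissa is shifted to the top 64 bits
 * with the implicit bit made explicit, so w160[3] = 0x80000000 and
 * w160[2] = w160[1] = w160[0] = 0, the same shape an x87 extended value uses.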
static void ftow(float flt, uint32_t w160[5])
{
    *(uint64_t*)w160 = 0;
    *(uint64_t*)&w160[2] = 0;
    w160[4] = (*(uint32_t*)&flt) >> 23;
    w160[3] = (*(uint32_t*)&flt) << 9;
    if ((w160[4] & 0xFF) == 0xFF)
        w160[4] = (w160[4] << 8) | 0x00FF;
    else
        w160[4] = ((w160[4] & 0xFF) + 16383 - 127) | ((w160[4] & 0x100) << 7);
} */

/* locale dependent utf8 loading of decimal point */
/* for newlib demo return ascii */
struct __params {
    unsigned char **s;
    unsigned char *slimit;
    //locale_t ld;
};

static unsigned __decimalLoad(unsigned char *s, struct __params *params)
{
    (void)params;
    *s = '.';
    return (1);
}

#define BUFSTRSIZE 64

unsigned char *
__format(const unsigned char *format, struct __params *params, va_list arg)
{
    unsigned char buffer[BUFSTRSIZE];  /* size of 64bit binary output; no prec,
                                        * field nor alt goes here */
    unsigned char **s = params->s;
    unsigned char *slimit = params->slimit;
    unsigned char *strBuf = buffer, *strPtr, *strEnd, *sPtr = *s;
    uint64_t val_ull;                  /* for any sizeof() > 4 bytes; if
                                        * unsigned is 64bits it will bypass
                                        * val_ul to go here */
    unsigned val_ul;                   /* need unsigned instead of unsigned long
                                        * in case unsigned is 64bits */
    uint32_t arg_flags;                /* arg_flags needs to be 32bits??? */
    int arg_prec, arg_field, n;
    unsigned char ch, sign, arg_size;  /* arg_size max value is sizeof(long
                                        * double); if int goes 128bits it is
                                        * still within char size */
    uintptr_t *vPtr = NULL;

    --format;
    goto L500;
    do {
        strEnd = strPtr = &strBuf[BUFSTRSIZE];
        arg_flags = 0;
        arg_size = sizeof(int);
        arg_prec = arg_field = -1;
        sign = 0;
next_ch:
        ch = *++format;
EO_EXTENSIONS:
        if (!ch)
            break;
        switch (ch) {
        /* Removed Flags for demo */
        /* Removed some Length modifiers for demo */
        case 'l':
            if ((ch = *++format) != 'l') {  /* and ll */
                arg_size = sizeof(long);
                goto EO_EXTENSIONS;
            }
            arg_size = sizeof(long long);
            goto next_ch;
        case 'q': arg_size = sizeof(octle); goto next_ch;        /* 128bit */
        case 'L': arg_size = sizeof(long double); goto next_ch;  /* long double */
        /* Removed some Field-Width / Precision for demo */
        case '.':
            ch = *++format;
            arg_flags |= PREC_PREFIX;
        case '1': case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9':
            n = ch - 48;
            while ((unsigned)((ch = *++format) - '0') <= 9)
                n = (n * 10) + (ch - '0');
            if (ch != '$') {
                if (arg_flags & PREC_PREFIX)
                    arg_prec = n;
                else
                    arg_field = n;
                arg_flags &= ~PREC_PREFIX;
                goto EO_EXTENSIONS;
            }
            goto next_ch;
        /* Removed all Formats... but real */
        case 'a': case 'e': case 'f': case 'g':
        case 'A': case 'E': case 'F': case 'G':
            /* gcc doesn't permit floats, it converts to doubles */
            {
                unsigned char *(*toStr)(uint32_t [5], unsigned char **, int, int, int, unsigned);
                uint32_t w160[5];
                uint32_t dp;        /* 4 byte decimal point param */
                int32_t sch;
                __decimalLoad((unsigned char*)&dp, params);
                if ((ch | 0x20) == 'a')
                    ch -= 'A', toStr = realtohex;
                else
                    ch -= 'E', toStr = realtostr;
                sch = format[-1];
                /* force 128bit long double into quad parsing */
                if (sch == 'q') {
#if (__LDBL_MANT_DIG__ != 64)
quad_parse:
#endif
                    sch = (('q' << 8) | ch);
                    qtow(((vPtr == NULL) ? va_arg(arg, octle) : *(octle *)vPtr), w160);
                } else if (sch == 'L') {
#if (__LDBL_MANT_DIG__ != 64)
                    goto quad_parse;
#endif
                    sch = (('L' << 8) | ch);
                    ldtow(((vPtr == NULL) ? va_arg(arg, long double) : *(long double *)vPtr), w160);
                } else {
                    sch = (('l' << 8) | ch);
                    dtow(((vPtr == NULL) ?
                          va_arg(arg, double) : *(double *)vPtr), w160);
                }
                arg_flags |= (strBuf != buffer) << 7;
                strPtr = toStr(w160, &strBuf, sch, arg_prec, (int)arg_flags, dp);
            }
            if (strPtr == NULL)
                goto error;
            val_ul = (n = (int)(strPtr - strBuf));
            strPtr = strBuf;
            arg_prec = 0;
            arg_flags &= ~ALT_PREFIX;
            /* grouping code removed for demo */
        } /* end switch */

        /* main output section */
        /* adjust the different output factors */
        if ((arg_prec -= n) < 0)
            arg_prec = 0;
        if ((arg_field -= (arg_prec + val_ul)) < 0)
            arg_field = 0;
        /* is there output space? removed for demo */
        /* different formatting removed for demo */
        if (sign != 0)
            *sPtr++ = sign;
        if (arg_prec)
            do *sPtr++ = '0'; while ((--arg_prec));
        if (n & 1)
            *sPtr++ = *strPtr++;
        if (n & 2) {
            *(int16_t *)sPtr = *(int16_t *)strPtr;
            sPtr += sizeof(int16_t);
            strPtr += sizeof(int16_t);
        }
        if ((n >>= 2))
            do {
                *(int32_t *)sPtr = *(int32_t *)strPtr;
                sPtr += sizeof(int32_t);
                strPtr += sizeof(int32_t);
            } while ((--n));
        if (arg_field)
            do *sPtr++ = ' '; while ((--arg_field));

        /* parsing of format string */
        do {
L500:
            ch = *++format;
            if ((sPtr >= slimit)) {
                --format;
                /* removed buffer extreme */
                puts("this is demo, don't be cute!");
                goto end;
            }
            *sPtr = ch;
            if (!ch)
                goto end;
            if (ch == '%')
                break;
            sPtr++;
        } while (1);
    } while (1);
    *sPtr = 0;
end:
    if (strBuf != buffer)
        free(strBuf);
    return sPtr;
error:
    if (strBuf != buffer)
        free(strBuf);
    *--sPtr = 0;
    return (*s - 12);
}

#define FORMAT_LINE_MAX 4096

static int phx_vfprintf(FILE *stream, const char *format, va_list arg) {
//_vfprintf(FILE *stream, const char *format, va_list arg, locale_t ld) {
    unsigned char *sPtr, *wrPtr, *rdPtr;
    ssize_t to_write;
    size_t u8Sz = FORMAT_LINE_MAX;
    struct __params params;

    /* make sure writable stream; writable test removed for demo */
    if ( (stream == NULL)
         //|| (!_io_writable(stream))
         || (format == NULL) )
        return -1;
    /* create buffer to 'write' into */
    if ((wrPtr = (unsigned char*)malloc(u8Sz)) == NULL)
        return -1;
    /* label rdPtr as the format. If conversion, rdPtr will be allocated
     * and filled with UTF-8 version of the format */
    rdPtr = (unsigned char*)format;     /* UTF-8 input */
    params.s = &wrPtr;
    params.slimit = (wrPtr + u8Sz);
    //params.ld = ld;
    sPtr = __format((const unsigned char *)rdPtr, &params, arg);
    /* UTF-8 text output */
    if ((to_write = (ssize_t)(sPtr - wrPtr)) > 0)
        to_write = fwrite((const void *)wrPtr, (size_t)1, (size_t)to_write, stream);
    free(wrPtr);
    return (int)to_write;
}

int phx_printf(const char *format, ...)
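/* public entry point of the demo: gathers the variadic arguments into a
 * va_list and forwards them to phx_vfprintf() on stdout */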
{ int written; va_list arg; va_start(arg, format); written = phx_vfprintf((FILE *)stdout, format, arg); //written = _vfprintf((FILE *)stdout, format, arg, NULL); va_end(arg); return written; } /* * realtostr - convert real numbers to formated latn numerical string * realtohex - convert real numbers to formated hexadecimal string * qtold - convert quadruple-precision number to a system defined long double * ldtod - convert a system defined long double to a double-precision number */ /* C langauge - string literals */ #ifdef __BIG_ENDIAN__ #define cconst16_t(a,b) (uint16_t)(((uint8_t)(a) << 8) | (uint8_t)(b)) #define cconst32_t(a,b,c,d) (uint32_t)(((uint8_t)(a) << 24) | ((uint8_t)(b) << 16) | ((uint8_t)(c) << 8) | (uint8_t)(d)) #else #define cconst16_t(a,b) (uint16_t)(((uint8_t)(b) << 8) | (uint8_t)(a)) #define cconst32_t(a,b,c,d) (uint32_t)(((uint8_t)(d) << 24) | ((uint8_t)(c) << 16) | ((uint8_t)(b) << 8) | (uint8_t)(a)) #endif /* compiler convience macro */ #ifndef __GNUC_PREREQ #if defined __GNUC__ && defined __GNUC_MINOR__ #define __GNUC_PREREQ(major,minor) \ (((__GNUC__ << 16) + __GNUC_MINOR__) >= (((major) << 16) + (minor))) #else #define __GNUC_PREREQ(major,minor) 0 #endif #endif /* __GNUC_PREREQ */ #ifndef __LDBL_MANT_DIG__ #define __LDBL_MANT_DIG__ LDBL_MANT_DIG #endif /* endian break down of uint64_t */ #ifdef __BIG_ENDIAN__ typedef struct { uint16_t w3; uint16_t w2; uint16_t w1; uint16_t w0; } _ull_ws; typedef struct { uint32_t hi; uint32_t lo; } _ull_ls; #elif defined (__LITTLE_ENDIAN__) typedef struct { uint16_t w0; uint16_t w1; uint16_t w2; uint16_t w3; } _ull_ws; typedef struct { uint32_t lo; uint32_t hi; } _ull_ls; #else #error: undefined endianness #endif typedef union { uint64_t ull; _ull_ls uls; _ull_ws uss; } _ull_t; #ifdef __ppc__ #define CNTLZW(x,y) \ do __asm__("cntlzw %0,%1" : "=r" (x) : "r" (y)); while(0) #elif defined (__i386__) #define CNTLZW(x,y) \ do { __asm__("bsr %1,%0" : "=r" (x) : "r" (y)); x = 31 - x; } while(0) #elif defined (__x86_64__) #define CNTLZW(x,y) \ do { __asm__("bsr %1,%0" : "=r" (x) : "r" (y)); x = 31 - x; } while(0) #define CNTLZD(x,y) \ do { __asm__("bsr %1,%0" : "=r" (x) : "r" (y)); x = 63 - x; } while(0) #else /* about 50% slower, 3 instructions vs 31 instructions */ #define CNTLZW(x,y) \ do { unsigned t = y; \ t |= t >> 1; \ t |= t >> 2; \ t |= t >> 4; \ t |= t >> 8; \ t |= t >> 16; \ t = t - ((t >> 1) & 0x55555555); \ t = (t & 0x33333333) + ((t >> 2) & 0x33333333); \ x = 32 - (((t + (t >> 4) & 0xF0F0F0F) * 0x1010101) >> 24); \ } while (0); #endif /* used for __fractional128 stack zeroing */ extern void *memset(void *, int, size_t); long double qtold(quadruple); double ldtod(long double); #define _CARRY 1000000000000000000ULL /* 0x 0DE0B6B3 A7640000 */ /* for rounding */ static const uint64_t _fcarry[] = { 5, 50, 500, 5000, 50000, 500000, 5000000, 50000000, 500000000, 5000000000ULL, 50000000000ULL, 500000000000ULL, 5000000000000ULL, 50000000000000ULL, 500000000000000ULL, 5000000000000000ULL, 50000000000000000ULL, 500000000000000000ULL, /*18*/ 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 10000000000ULL, 100000000000ULL, 1000000000000ULL, 10000000000000ULL, 100000000000000ULL, 1000000000000000ULL, 10000000000000000ULL, 100000000000000000ULL, _CARRY }; /* for cases: nan, inf, hex */ static const char *_const_outStr = "0123456789abcdefnanpinfe0x0.00000123456789ABCDEFNANPINFE0X0.0000"; /*** internal routines ***/ /* generic, div() moved to 64_t to address endianess */ static uint64_t divlu2r(uint64_t) __attribute__ 
((noinline)); static void _ulltobuffer_18lr(uint64_t, unsigned char *) __attribute__ ((noinline)); static void _ulltobuffer_nlr(uint64_t, unsigned char *, int) __attribute__ ((noinline)); static int _first_digit(uint64_t); /* ldtostr specific */ static unsigned char *__n_char_sequence(uint32_t [5], unsigned char *, int); static uint32_t __fractional(_ull_t *, _ull_t); static uint32_t __fractional_128(_ull_t *, _ull_t); static uint32_t __positive_exponent_fp(_ull_t *, int); static uint32_t __negative_exponent_fp(_ull_t *, int); /* from fprintf... needs to be the same */ #define ZERO_PREFIX 0001 #define ALT_PREFIX 0002 #define LEFT_PREFIX 0004 #define SIGN_PREFIX 0010 #define SPC_PREFIX 0020 #define STAR_PREFIX 0040 #define GRP_PREFIX 0100 #define USGN_PREFIX 0200 #define PREC_PREFIX 0400 #define ALLOC_BUF USGN_PREFIX #define BUFSTRSIZE 64 /* XXX using upper 16 to pass 'unit' max from calcation prior to entry of x_exponent */ #define UPPER_TYPE 0x20 #define IS_UPPER_TYPE(a) (!((a) & UPPER_TYPE)) #define IS_QUAD(a) ((uint8_t)((a) >> 8) == (uint8_t)'q') #define IS_LDBL(a) ((uint8_t)((a) >> 8) == (uint8_t)'L') #define IS_DBL(a) ((uint8_t)((a) >> 8) == (uint8_t)'l') #define IS_FLT(a) ((uint8_t)((a) >> 8) == 0) #define TYPE_ZERO 0x40 #define IS_TYPE_ZERO(a) ((a) & TYPE_ZERO) #define TYPE_E 0 #define TYPE_F 1 #define TYPE_G 2 #define IS_TYPE_E(a) (((a) & 3) == 0) #define IS_TYPE_F(a) ((a) & TYPE_F) #define IS_TYPE_G(a) ((a) & TYPE_G) #define SURPRESS_NSEQ(a) (((a) & 4) != 0) #pragma mark *** Support Mathematics *** static uint64_t divlu2r(uint64_t c) { _ull_t q; uint32_t mConst = 2305843009U; // 2^61 / 10^9 uint32_t tConst = 1000000000U; q.uls.hi = (uint32_t)(c >> 32); q.uls.lo = ((uint32_t)c); if (q.uls.hi != 0) { uint32_t shift; CNTLZW(shift, q.uls.hi); q.uls.hi = (uint32_t)((((q.uls.hi << shift) | (q.uls.lo >> (32 - shift))) * (uint64_t)mConst) >> 32); if (shift > 3) q.uls.hi >>= (shift - 3); } else { q.uls.hi = (uint32_t)(((q.uls.lo | 1) * (uint64_t)mConst) >> (32 + 29)); } q.uls.lo = (uint32_t)(c - ((uint64_t)q.uls.hi * (uint32_t)tConst)); if (q.uls.lo >= tConst) q.uls.hi += 1, q.uls.lo -= tConst; return q.ull; } static void _ulltobuffer_18lr(uint64_t c, unsigned char *buf) { _ull_t a, b; int n; --buf; /* in 4.60, 2.30 fixed point integers */ b.ull = (uint64_t)(((uint32_t)c) << 2) * 2882303762U; a.ull = (uint64_t)(((uint32_t)(c >> 32)) << 2) * 2882303762U; n = 9; goto L1; do { b.uls.hi = (((b.uls.hi << 4) >> 4) * 10) + (b.uls.lo >> 28); a.uls.hi = (((a.uls.hi << 4) >> 4) * 10) + (a.uls.lo >> 28); b.uls.lo <<= 4; a.uls.lo <<= 4; L1: *++buf = (a.uls.hi >> 28) + '0'; buf[9] = (b.uls.hi >> 28) + '0'; } while ((--n)); } /* went for speed */ static void _ulltobuffer_nlr(uint64_t c, unsigned char *buf, int m) { _ull_t b; int n; --buf; /* in 4.60, 2.30 fixed point integers */ c <<= 2; b.ull = (uint64_t)((uint32_t)(c >> 32)) * 2882303762U; n = (m <= 9) ? 
                                         m : 9;
    m -= 9;
    goto L1;
    do {
        b.uls.hi = (((b.uls.hi << 4) >> 4) * 10) + (b.uls.lo >> 28);
        b.uls.lo <<= 4;
L1:
        *++buf = (b.uls.hi >> 28) + '0';
    } while ((--n));
    if (m > 0) {
        n = m;
        b.ull = (uint64_t)((uint32_t)c) * 2882303762U;
        goto L2;
        do {
            b.uls.hi = (((b.uls.hi << 4) >> 4) * 10) + (b.uls.lo >> 28);
            b.uls.lo <<= 4;
L2:
            *++buf = (b.uls.hi >> 28) + '0';
        } while ((--n));
    }
}

/*#pragma GCC diagnostic ignored "-Wmisleading-indentation"*/
#if __GNUC_PREREQ(4,5)
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
#endif
/* uses 2 64s as swap loads, this prevents stalls */
/* returns 1 less than actual digits, the power of 10, or log10 value */
static int _first_digit(uint64_t a)
{
    _ull_t reg;
    const uint64_t *fc;
    uint64_t b;

    reg.ull = a;
    fc = _fcarry;
    if (reg.uls.hi) {
        a = fc[35]; b = fc[34];
        if (reg.ull >= a) return 17;
        a = fc[33];
        if (reg.ull >= b) return 16;
        b = fc[32];
        if (reg.ull >= a) return 15;
        a = fc[31];
        if (reg.ull >= b) return 14;
        b = fc[30];
        if (reg.ull >= a) return 13;
        a = fc[29];
        if (reg.ull >= b) return 12;
        b = fc[28];
        if (reg.ull >= a) return 11;
        if (reg.ull >= b) return 10;
        return 9;
    } else {
        uint32_t c, d;
        c = (uint32_t)fc[27]; d = (uint32_t)fc[26];
        if (reg.uls.lo >= c) return 9;
        c = (uint32_t)fc[25];
        if (reg.uls.lo >= d) return 8;
        d = (uint32_t)fc[24];
        if (reg.uls.lo >= c) return 7;
        c = (uint32_t)fc[23];
        if (reg.uls.lo >= d) return 6;
        if (reg.uls.lo >= c) return 5;
        if (reg.uls.lo >= 10000U) return 4;
        if (reg.uls.lo >= 1000U) return 3;
        if (reg.uls.lo >= 100U) return 2;
        if (reg.uls.lo >= 10U) return 1;
        return 0;
    }
}
/*#pragma GCC diagnostic pop*/
#if __GNUC_PREREQ(4,5)
#pragma GCC diagnostic pop
#endif

#pragma mark *** Support Subroutines ***

static unsigned char *
__n_char_sequence(uint32_t w160[5], unsigned char *sPtr, int type)
{
    const char *_outvalues;
    unsigned is_zero;
    /* e80 long double type, 63rd bit required set regardless of nan or inf */
    /* if is_zero is zero, then the value is infinity, else nan */
    /* any info in nan is considered n-char-sequence */
    /* bit 62 of nan, when 0, and others [61-0] set, is considered signaling nan.
     * bit 62 of nan, when 1, if others 0, special quiet nan, others [61-0] set,
     * considered quiet signaling nan.
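     *
     * As a rough illustration: for an x87 quiet NaN whose 64-bit mantissa is
     * 0xC000000000000000, the code below should emit "nan(0xc000000000000000)"
     * when the n-char-sequence is not suppressed, and just "nan" when it is
     * (SURPRESS_NSEQ is driven by the ALT_PREFIX flag at the call site).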
*/ is_zero = ((w160[3] & 0x7FFFFFFF) | w160[2] | w160[1] | w160[0]); _outvalues = _const_outStr + 16; if (IS_UPPER_TYPE(type)) _outvalues += 32; if (!is_zero){ *((uint32_t *)sPtr) = ((int32_t *)_outvalues)[1]; return (sPtr + 3); } *((uint32_t *)sPtr) = ((int32_t *)_outvalues)[0]; sPtr += 3; if (SURPRESS_NSEQ(type)) return sPtr; /* for doubles 53bits exposed, shift off hidden */ if (!IS_LDBL(type)) *(uint64_t*)&w160[2] <<= 1; if (IS_QUAD(type)) { w160[2] |= (w160[1] & 0x40000000) >> 30; *(uint64_t*)&w160[0] <<= 2; } /* now output the n-char-sequence */ /* (0x hexdigits ) * e80,16nibbles, doubles,13nibbles, floats,5.5nibbles, */ *((uint16_t *)sPtr) = cconst16_t('(','0'); sPtr += 2; *sPtr = _outvalues[9]; /* 'x' or 'X' */ sPtr++; _outvalues -= 16; sPtr[ 0] = _outvalues[(w160[3] >> 28)]; sPtr[ 1] = _outvalues[((w160[3] << 4) >> 28)]; sPtr[ 2] = _outvalues[((w160[3] << 8) >> 28)]; sPtr[ 3] = _outvalues[((w160[3] << 12) >> 28)]; sPtr[ 4] = _outvalues[((w160[3] << 16) >> 28)]; sPtr[ 5] = _outvalues[((w160[3] << 20) >> 28)]; if (IS_FLT(type)) { sPtr += 6; goto end_brace; } sPtr[ 6] = _outvalues[((w160[3] << 24) >> 28)]; sPtr[ 7] = _outvalues[((w160[3] << 28) >> 28)]; sPtr[ 8] = _outvalues[(w160[2] >> 28)]; sPtr[ 9] = _outvalues[((w160[2] << 4) >> 28)]; sPtr[10] = _outvalues[((w160[2] << 8) >> 28)]; sPtr[11] = _outvalues[((w160[2] << 12) >> 28)]; sPtr[12] = _outvalues[((w160[2] << 16) >> 28)]; if (IS_DBL(type)) { sPtr += 13; goto end_brace; } sPtr[13] = _outvalues[((w160[2] << 20) >> 28)]; sPtr[14] = _outvalues[((w160[2] << 24) >> 28)]; sPtr[15] = _outvalues[((w160[2] << 28) >> 28)]; sPtr += 16; if (IS_LDBL(type)) goto end_brace; /* undo w160 formatting of '[0]' for number */ sPtr[ 0] = _outvalues[(w160[1] >> 28)]; sPtr[ 1] = _outvalues[((w160[1] << 4) >> 28)]; sPtr[ 2] = _outvalues[((w160[1] << 8) >> 28)]; sPtr[ 3] = _outvalues[((w160[1] << 12) >> 28)]; sPtr[ 4] = _outvalues[((w160[1] << 16) >> 28)]; sPtr[ 5] = _outvalues[((w160[1] << 20) >> 28)]; sPtr[ 6] = _outvalues[((w160[1] << 24) >> 28)]; sPtr[ 7] = _outvalues[((w160[1] << 28) >> 28)]; sPtr[ 8] = _outvalues[(w160[0] >> 28)]; sPtr[ 9] = _outvalues[((w160[0] << 4) >> 28)]; sPtr[10] = _outvalues[((w160[0] << 8) >> 28)]; sPtr[11] = _outvalues[((w160[0] << 12) >> 28)]; sPtr += 12; end_brace: *sPtr = ')'; return (sPtr + 1); } /* converts fractional portion of IEEE754 value to * base 10 representation, in 10^18 ull format. */ static uint32_t __fractional(_ull_t *uPtr, _ull_t reg) { uint32_t last = 1; uint64_t mlConst = 3814697265625ULL; /* 5^18 (42bits) 0x378 2DACE9D9 */ /* 0x7FFFFFFF FFFFFFFF start 0x7FFFE 1FFF FFFFFFFF split */ uPtr[0].ull = (reg.uls.hi >> 13) * mlConst; /* 18 + 45 == 63 */ /* test added, rather than just '&='. This is guess that small 'standard' * fractions might be used */ if (((reg.uls.hi &= 0x00001FFF) | reg.uls.lo) != 0) { /* the left over 13bits from hi with 5bits from lo (45-18=>27). 
0x00001FFF FFFFFFFF start 0x00001FFFF8 7FFFFFF split */ uint64_t u64; last += 1; uPtr[0].ull += (u64 = ((uint32_t)(reg.ull >> 27)) * mlConst) >> 18; uPtr[1].ull = ((uint32_t)u64 & 0x3FFFF) * mlConst; if ((reg.uls.lo &= 0x7FFFFFF) != 0) { /* 0x00000000 07FFFFFF start 0x 7FFFE 1FF split */ _ull_t scratch; last += 1; uPtr[0].ull += (u64 = (reg.uls.lo >> 9) * mlConst) >> 36; scratch.ull = (uint32_t)((uint32_t)u64 & 0x3FFFF) * mlConst; uPtr[2].ull = (scratch.uls.lo & 0x3FFFF) * mlConst; u64 = uPtr[1].ull + (scratch.ull >> 18) + (((uint32_t)(u64 >> 18)) & 0x3FFFF) * mlConst; if (u64 >= _CARRY) u64 -= _CARRY, uPtr[0].ull += 1ULL; uPtr[1].ull = u64; if ((reg.uls.lo &= 0x1FF) != 0) { /* 0x00000000 000001FF end of 63bit resolution */ uint32_t u32; last += 1; // 54*10^18 u32 = (uint32_t)((scratch.ull = reg.uls.lo * mlConst) >> 27); uPtr[0].ull += (uint64_t)(scratch.uls.hi >> 13); // 54*10^18*10^18 /* first 18 of 45 remainder */ u64 = ((u32 & 0x3FFFF) * mlConst) + uPtr[1].ull; /* second 18 of 45 remainder, scratch.uls.lo &= 0x07FFFFFF; */ u32 = scratch.uls.lo & 0x07FFFFFF; scratch.ull = (u32 >> 9) * mlConst; u64 += (scratch.ull >> 18); uPtr[2].ull += (scratch.uls.lo & 0x3FFFF) * mlConst; if (uPtr[2].ull >= _CARRY) uPtr[2].ull -= _CARRY, u64 += 1ULL; uPtr[1].ull = u64; /* last 9 of 45 remainder */ scratch.ull = ((u32 & 0x000001FF) << 9) * mlConst; uPtr[1].ull += (uint64_t)(scratch.uls.hi >> 4); if (uPtr[1].ull >= _CARRY) uPtr[1].ull -= _CARRY, uPtr[0].ull += 1ULL; /* of scratch2 36 remainder */ scratch.uls.hi &= 0x0000000F; uPtr[2].ull += ((uint32_t)(scratch.ull >> 18)) * mlConst; uPtr[2].ull += ((scratch.uls.lo & 0x3FFFF) * mlConst) >> 18; if (uPtr[2].ull >= _CARRY) uPtr[2].ull -= _CARRY, uPtr[1].ull += 1ULL; /* of 18 remainder */ scratch.uls.lo = (uint32_t)((scratch.uls.lo & 0x3FFFF) * mlConst) & 0x3FFFF; uPtr[3].ull = scratch.uls.lo * mlConst; } } } return last; } /* this is for second register of frational */ /* expected entry use: __fractional(uPtr, reg2); * expected range of reg2: 0x7fffffffffffc000 thru 0x0000000000004000 * reg2 expectation: shifted right 1, matching reg 1, where high bit is bit 62 * to avoid initial allocation problems, create stack use instead of uPtr. 
* also avoids re-zeroing of uPtr regs * uPtr parameter for adding to reg1's converted fractional, the final result */ static uint32_t __fractional_128(_ull_t *uPtr, _ull_t reg) { uint32_t last; _ull_t sreg[15]; /* StackREGisters */ memset(sreg, 0, sizeof(sreg)); /* create regs of multiplly by 5^63 (3 max) */ __fractional(sreg, reg); /* creates sr3-sr6, affects uPtr[4] */ last = __fractional(&sreg[ 3], sreg[0]); if ((sreg[1].ull)) { last = 5; // creates uPtr[5] value /* creates sr7-sr10 */ __fractional(&sreg[ 7], sreg[1]); sreg[4].ull += sreg[7].ull; sreg[5].ull += sreg[8].ull; sreg[6].ull += sreg[9].ull; sreg[7].ull = sreg[10].ull; if ((sreg[2].ull)) { /* creates sr11-sr14*/ __fractional(&sreg[11], sreg[2]); sreg[5].ull += sreg[11].ull; sreg[6].ull += sreg[12].ull; sreg[7].ull += sreg[13].ull; uPtr[6].ull = sreg[14].ull; } if (sreg[7].ull >= _CARRY) sreg[6].ull += 1ULL, sreg[7].ull -= _CARRY; uPtr[5].ull = sreg[7].ull; while (sreg[6].ull >= _CARRY) sreg[5].ull += 1ULL, sreg[6].ull -= _CARRY; } /* sreg3-sreg8 set for addin transfer to uPtr */ uPtr[4].ull = sreg[6].ull; uPtr[3].ull += sreg[5].ull; while (uPtr[3].ull >= _CARRY) uPtr[2].ull += 1ULL, uPtr[3].ull -= _CARRY; uPtr[2].ull += sreg[4].ull; if (uPtr[2].ull >= _CARRY) uPtr[1].ull += 1ULL, uPtr[2].ull -= _CARRY; uPtr[1].ull += sreg[3].ull; if (uPtr[1].ull >= _CARRY) uPtr[0].ull += 1ULL, uPtr[1].ull -= _CARRY; return last; } static uint32_t __positive_lowexp128(_ull_t *uPtr, unsigned pw2, _ull_t reg0, _ull_t reg1) { uint32_t last, pw10 = 0; uint32_t unit_18 = 0; if (pw2 < 64) { /* create shifted reg0, reg1 */ /* special case: reg1 has 49bits, a shift of >= 49 bits (*2^49+) * means only 1 reg used for __fractional() */ uint32_t shiftb = 63 - pw2; if (pw2 <= 48) { /* lowest drop of 15 bits, max of 63 */ uPtr[0].ull = reg0.ull >> shiftb; /* pw2 == 0 clears explicit '1' */ reg0.ull = reg0.ull << pw2, reg0.uls.hi &= 0x7fffffff; /* pw2 == 0 ORs high bit explicit '0' */ reg0.ull |= reg1.ull >> shiftb; reg1.ull = reg1.ull << pw2, reg1.uls.hi &= 0x7fffffff; last = __fractional(&uPtr[1], reg0); if (reg1.ull != 0) last = __fractional_128(&uPtr[1], reg1); } else { uint32_t hi; /* @49 shiftb = 14, @63 shiftb = 0 */ if (shiftb != 0) { reg1.ull = (reg1.ull >> shiftb); reg1.uls.hi |= (reg0.uls.lo << (32 - shiftb)) >> 1; reg0.ull >>= shiftb; } else { /* values 8-15 possible, reducing >= _CARRY loop * 8 9 10 11 12 13 14 15 * actual 9, 10, 11, 12, 13, 14, 16, 17 e18 starting values */ hi = (uint32_t)((reg0.uss.w3 >> 12) + 1); goto hi_adjusted; } /* reg0 holds shifted whole value */ /* default action */ uPtr[0].ull = reg0.ull; if ((shiftb < 4) || ((shiftb == 4) && (reg0.ull >= _CARRY))) { /* values 0-7 possible, reducing >= _CARRY loop * 0 1 2 3 4 5 6 7 * actual 1, 1, 2, 3, 4, 5, 6, 8, e18 starting values */ hi = (uint32_t)(reg0.uss.w3 >> 12); if ((hi)) hi_adjusted: reg0.ull -= hi * _CARRY; while (reg0.ull >= _CARRY) hi += 1, reg0.ull -= _CARRY; uPtr[0].ull = (uint64_t)hi, ++uPtr; uPtr[0].ull = reg0.ull; pw10 = 18; } last = __fractional(&uPtr[1], reg1); unit_18 = 1; } } else { uint32_t n28s; _ull_t reg; /* most used 64 bit gpr */ /* 64 <= pw2 < 112, 1 fractional reg, multiple whole regs */ uint32_t hi = (uint32_t)((reg0.uss.w3 >> 12) + 1); reg0.ull -= hi * _CARRY; while (reg0.ull >= _CARRY) hi += 1, reg0.ull -= _CARRY; uPtr[0].ull = (uint64_t)hi; uPtr[1].ull = reg0.ull; /* max 50 ((111-63) = 49) [0-49] */ pw2 -= 63; uint32_t shiftb = 63 - pw2; if ((n28s = (pw2 >= 28))) pw2 -= 28; if ((pw2)) { /* shift range [27-1] */ _ull_t scratch; /* shift [18-9] left 
[27-1] */ uPtr[0].uls.lo <<= pw2; /* shift remainder (uPtr[1], size < 10^19) left [27-1] */ reg.ull = divlu2r(uPtr[1].ull); scratch.ull = divlu2r((uint64_t)reg.uls.hi << pw2); uPtr[0].ull += scratch.uls.hi; reg.ull = ((uint64_t)scratch.uls.lo * 1000000000U) + ((uint64_t)reg.uls.lo << pw2); /* do carry, set uPtr[1] */ if (reg.ull >= _CARRY) uPtr[0].ull += 1ULL, reg.ull -= _CARRY; uPtr[1].ull = reg.ull; } unit_18 = 1; pw10 = 18; if (n28s) { /* shift of 28 multiples */ /* set initial limiter to current regs valid (<= unit_max) */ _ull_t *cap = &uPtr[unit_18]; _ull_t *input, *output, next; reg.ull = (output = (input = uPtr))->ull; output->ull = 0; next.ull = input[1].ull; unit_18++; if (reg.ull >= 3725290298ULL) goto L1; unit_18--; output->ull = reg.ull << 28; output--; do { _ull_t scratch0; uint32_t u32; reg.ull = next.ull; next.ull = (++input)[1].ull; output++; L1: scratch0.ull = divlu2r(reg.ull); u32 = (uint32_t)(((uint64_t)scratch0.uls.hi * 2305843009) >> 32) >> 1; scratch0.ull = ((((uint64_t)scratch0.uls.hi << 28) - ((uint64_t)u32 * 1000000000U)) * 1000000000U) + ((uint64_t)scratch0.uls.lo << 28); if (scratch0.ull >= _CARRY) u32 += 1, scratch0.ull -= _CARRY; output[1].ull = scratch0.ull; if ((output->ull += u32) >= _CARRY) output->ull -= _CARRY, output[-1].ull += 1; } while (input < cap); } pw10 += ((unit_18 - 1) * 18); /* reg0 has addin to shifted whole value */ pw2 = 63 - shiftb; reg0.ull = (reg1.ull >> shiftb); reg0.ull += uPtr[1].ull; if (reg0.ull >= _CARRY) uPtr[0].ull += 1, reg0.ull -= _CARRY; uPtr[1].ull = reg0.ull; /* put fractional bits into place */ reg1.ull = reg1.ull << pw2, reg1.uls.hi &= 0x7fffffff; last = __fractional(&uPtr[2], reg1); last++; } return ((pw10 << 16) | ((last + unit_18) << 4) | last); } static uint32_t __positive_lowexp(_ull_t *uPtr, unsigned pw2, _ull_t reg0) { /* do less than 63 code */ uint16_t last, whole_incr, pw10; uint64_t up0; pw10 = (whole_incr = 0); if (pw2 < 31) { up0 = (uint64_t)(reg0.uls.hi >> (31 - pw2)); reg0.uls.hi = (reg0.uls.hi << (pw2 + 1)) >> (pw2 + 1); } else if (pw2 == 31) { up0 = (uint64_t)reg0.uls.hi; reg0.uls.hi = 0; } else { uint32_t shift = pw2 - 32; up0 = reg0.ull >> (31 - shift); reg0.uls.lo = (reg0.uls.lo << (shift + 1)) >> (shift + 1); if ((shift > 27) || ((shift == 27) && (up0 >= _CARRY))) { do uPtr[0].ull += 1ULL; while ((up0 -= _CARRY) >= _CARRY); uPtr++, whole_incr++, pw10 = 18; } } uPtr[0].ull = up0; last = 1; if ((reg0.ull <<= pw2) != 0) last += __fractional(&uPtr[1], reg0); return ((pw10 << 16) | ((last + whole_incr) << 4) | last); } /* now grows low address to high address, or left to right numerically */ /* expects implicit/explicit 1 as 63rd bit */ /* multiplies by 2^28, 2^28 used to keep large values within 10^18 ulls */ /* addition of such values create at max 'overflow' of 1 */ /* valid division range on /28 [0-16383] */ static uint32_t __positive_exponent_fp(_ull_t *uPtr, int type) { unsigned pw2, pw10, n28s; /* power of 2, power of 10, shifts of 28 */ _ull_t reg0; /* most used 64 bit gpr */ /* track reg expansion, initial is to read 2 registers (&uPtr[unit_18]) */ uint32_t unit_18 = 1; /* load 'passed' data */ pw2 = uPtr[0].uss.w0 - 16383; /* note: retain explicit '1' */ reg0.ull = uPtr[1].ull; uPtr[1].ull = (uPtr[0].ull = 0); if (uPtr[2].ull != 0) { _ull_t reg1; uint32_t hi; reg1.ull = uPtr[2].ull; uPtr[2].ull = 0; if (pw2 < 112) return __positive_lowexp128(uPtr, pw2, reg0, reg1); /* set up with all whole */ hi = (uint32_t)((reg0.uss.w3 >> 12) + 1); reg0.ull -= hi * _CARRY; while (reg0.ull >= _CARRY) hi += 1, 
reg0.ull -= _CARRY; uPtr[0].ull = (uint64_t)hi << 49; uPtr[1].ull = reg1.ull >> 14; /* reg1 free to use as gpr */ reg0.ull = divlu2r(reg0.ull); reg1.ull = divlu2r((uint64_t)reg0.uls.hi << 21); uPtr[0].ull += (uint64_t)reg1.uls.hi << 28; reg0.ull = ((uint64_t)reg1.uls.lo * 1000000000U) + ((uint64_t)reg0.uls.lo << 21); reg0.ull = divlu2r(reg0.ull); reg1.ull = divlu2r((uint64_t)reg0.uls.hi << 28); uPtr[0].ull += (uint64_t)reg1.uls.hi; uPtr[1].ull += ((uint64_t)reg1.uls.lo * 1000000000U) + ((uint64_t)reg0.uls.lo << 28); if (uPtr[1].ull >= _CARRY) uPtr[0].ull += 1, uPtr[1].ull -= _CARRY; pw10 = 18; pw2 -= 112; /* uPtr[0] range [10384593717069655-5192296858534827] */ if ((pw2 -= ((n28s = (pw2 * 18725U) >> 19) * 28)) != 0) { _ull_t next, *output; uint32_t u32; next.ull = uPtr[1].ull; reg0.ull = divlu2r(uPtr[0].ull); /* 2^61 / 10^9 based on << 28 forces all into higher register */ u32 = (uint32_t)(((uint64_t)reg0.uls.hi * 2305843009U) >> 32) >> (29 - pw2); reg0.ull = ((((uint64_t)reg0.uls.hi << pw2) - ((uint64_t)u32 * 1000000000U)) * 1000000000U) + ((uint64_t)reg0.uls.lo << pw2); if (reg0.ull >= _CARRY) u32 += 1, reg0.ull -= _CARRY; if ((u32)) { uPtr[1].ull = reg0.ull; uPtr[0].ull = (uint64_t)u32; /* creates new register, increases uPtr[unit_18] limit (3) */ /* adds pw10 += 18 by return math addition of unit_18 */ unit_18++; output = &uPtr[1]; } else { /* no new register, still uPtr[unit_18] limit (2) */ uPtr[0].ull = reg0.ull; output = &uPtr[0]; } reg0.ull = divlu2r(next.ull); u32 = (uint32_t)(((uint64_t)reg0.uls.hi * 2305843009U) >> 32) >> (29 - pw2); reg0.ull = ((((uint64_t)reg0.uls.hi << pw2) - ((uint64_t)u32 * 1000000000U)) * 1000000000U) + ((uint64_t)reg0.uls.lo << pw2); if (reg0.ull >= _CARRY) u32 += 1, reg0.ull -= _CARRY; output[1].ull = reg0.ull; /* case: * w160[4] = 0x4092, w160[3] = 0x8001a8fb, w160[2] = 0xec8ed23f; */ if ((output[0].ull += u32) >= _CARRY) output[-1].ull += 1, output[0].ull -= _CARRY; } } else { uint32_t hi; if (pw2 < 63) return __positive_lowexp(uPtr, pw2, reg0); /* below, all use min of 2 regs (uPtr) */ /* creates new register, still uPtr[unit_18] limit (2) */ /* values 8-15 possible, reducing >= _CARRY loop * 8 9 10 11 12 13 14 15 * actual 9, 10, 11, 12, 13, 14, 16, 17 e18 starting values */ hi = (uint32_t)((reg0.uss.w3 >> 12) + 1); reg0.ull -= hi * _CARRY; while (reg0.ull >= _CARRY) hi += 1, reg0.ull -= _CARRY; uPtr[0].ull = (uint64_t)hi; uPtr[1].ull = reg0.ull; pw10 = 18; pw2 -= 63; if ((pw2 -= ((n28s = (pw2 * 18725U) >> 19) * 28)) != 0) { /* shift range [27-1] */ _ull_t scratch; /* shift [18-9] left [27-1] */ uPtr[0].uls.lo <<= pw2; /* shift remainder (uPtr[1], size < 10^19) left [27-1] */ reg0.ull = divlu2r(uPtr[1].ull); scratch.ull = divlu2r((uint64_t)reg0.uls.hi << pw2); uPtr[0].ull += scratch.uls.hi; reg0.ull = ((uint64_t)scratch.uls.lo * 1000000000U) + ((uint64_t)reg0.uls.lo << pw2); /* do carry, set uPtr[1] */ if (reg0.ull >= _CARRY) uPtr[0].ull += 1ULL, reg0.ull -= _CARRY; uPtr[1].ull = reg0.ull; } } if (n28s != 0) { /* shift of 28 multiples */ /* set max regs use, cap used as pointer compare limiter */ uint32_t unit_max = type >> 16; /* set initial limiter to current regs valid (<= unit_max) */ _ull_t *cap = &uPtr[unit_18]; do { /* start at base */ /* create loop counter, each 28 loop expects increase of new * reg created. A test used to verify if one is created. 
loop counter * then used to increase both pw10, and possibly the ending of * last math reg used (cap) */ _ull_t *input, *output, next; reg0.ull = (output = (input = uPtr))->ull; output->ull = 0; next.ull = input[1].ull; /* critical point where addin to reg0.ull can cause overflow * and propagation into a prior ull */ unit_18++; if (reg0.ull >= 3725290298ULL) goto L1; unit_18--; /* shift of reg0.ull with addin remains below 10^19 */ output->ull = reg0.ull << 28; /* undo loop increment of first addin location, * and lcnt (new reg increase) initial assumption */ output--; do { _ull_t scratch0; uint32_t u32; reg0.ull = next.ull; next.ull = (++input)[1].ull; output++; L1: scratch0.ull = divlu2r(reg0.ull); u32 = (uint32_t)(((uint64_t)scratch0.uls.hi * 2305843009) >> 32) >> 1; scratch0.ull = ((((uint64_t)scratch0.uls.hi << 28) - ((uint64_t)u32 * 1000000000U)) * 1000000000U) + ((uint64_t)scratch0.uls.lo << 28); if (scratch0.ull >= _CARRY) u32 += 1, scratch0.ull -= _CARRY; output[1].ull = scratch0.ull; if ((output->ull += u32) >= _CARRY) output->ull -= _CARRY, output[-1].ull += 1; } while (input < cap); /* tail pointer update based on new reg add vs needs for precision */ /* limit allowed to increase above cap for pw10 counter */ cap = &uPtr[((unit_18 > unit_max) ? unit_max : unit_18)]; } while ((--n28s)); /* convert regs used into pw10. add to pre-28 value */ } pw10 += ((unit_18 - 1) * 18); return ((pw10 << 16) | ((unit_18 + 1) << 4)); } static uint32_t __negative_exponent_fp(_ull_t *uPtr, int type) { int pw2, pw10; /* power of 2, power of 10 */ uint32_t index; _ull_t reg0; pw10 = -18; pw2 = 16383 - uPtr[0].uls.lo; pw2 -= (index = ((uint32_t)pw2 == ((IS_DBL(type)) ? 1023U : 16383U))); /* remove explicit '1'. add back later as 1e18 if not denormal */ reg0.ull = uPtr[1].ull; reg0.uls.hi &= 0x7FFFFFFF; uPtr[1].ull = 0; if (uPtr[2].ull != 0) { _ull_t reg1; reg1.ull = uPtr[2].ull; uPtr[2].ull = 0; __fractional(uPtr, reg0); __fractional_128(uPtr, reg1); if (index == 0) uPtr->ull += _CARRY; index = 8; } else if (reg0.ull != 0) { /* convert fractional to base 10 */ __fractional(uPtr, reg0); /* 2 cases: denormal, normal */ if (index == 0) uPtr->ull += _CARRY; index = 5; } else { if (pw2 < 18) { /* straight power of 2 */ /* first 17, [-1, -17]... shift of 10^18, * 10^18/18446744073709551616 = * 0.0542101086242752217003726400434970855712 * 0.0542101086242752217003726400434970855712 * 2^64 = * 10^18 = 0x232830643 * 2^32 + 2808348672 */ /* no whole, all fraction */ uPtr->uls.lo = ((uint32_t)232830643U << (32 - pw2)) | ((uint32_t)2808348672U >> pw2); uPtr->uls.hi = (uint32_t)232830643U >> pw2; return (uint32_t)((-(18 << 16)) | (1 << 4) | 8); } /* there are 18 trailing zeroes to _CARRY */ uPtr[0].ull = _CARRY >> 18; pw2 -= 18; index = 5; } /* 5^18 * 2^2 * 2^16 = 10^18 */ /* each shift into new ull is multiplied by 10^18. 
* or 2^-16 * 10^18 = shift * 5^18 * 2^2 */ if (pw2 & 0xf) { /* partial 2^-16, use of shorter, quicker multiply */ /* cant use shift on ulls, when partial gcc adds extra coding, test of 32 */ /* use this code, smaller, faster than using unsigned long gcc code */ uint32_t m = pw2 & 0xf; uint32_t shift = (16 - m); uint32_t shiftb = shift + 16; uint64_t fConst = 15258789062500ULL; if (index == 8) { uPtr[7].ull = ((uint16_t)(uPtr[6].uls.lo << shift)) * fConst; uPtr[6].uls.lo = (uPtr[6].uls.hi << shiftb) | (uPtr[6].uls.lo >> m); uPtr[6].uls.hi >>= m; uPtr[6].ull += ((uint16_t)(uPtr[5].uls.lo << shift)) * fConst; uPtr[5].uls.lo = (uPtr[5].uls.hi << shiftb) | (uPtr[5].uls.lo >> m); uPtr[5].uls.hi >>= m; uPtr[5].ull += ((uint16_t)(uPtr[4].uls.lo << shift)) * fConst; uPtr[4].uls.lo = (uPtr[4].uls.hi << shiftb) | (uPtr[4].uls.lo >> m); uPtr[4].uls.hi >>= m; uPtr[4].ull += ((uint16_t)(uPtr[3].uls.lo << shift)) * fConst; } else { uPtr[4].ull = ((uint16_t)(uPtr[3].uls.lo << shift)) * fConst; } uPtr[3].uls.lo = (uPtr[3].uls.hi << shiftb) | (uPtr[3].uls.lo >> m); uPtr[3].uls.hi >>= m; uPtr[3].ull += ((uint16_t)(uPtr[2].uls.lo << shift)) * fConst; uPtr[2].uls.lo = (uPtr[2].uls.hi << shiftb) | (uPtr[2].uls.lo >> m); uPtr[2].uls.hi >>= m; uPtr[2].ull += ((uint16_t)(uPtr[1].uls.lo << shift)) * fConst; uPtr[1].uls.lo = (uPtr[1].uls.hi << shiftb) | (uPtr[1].uls.lo >> m); uPtr[1].uls.hi >>= m; uPtr[1].ull += ((uint16_t)(uPtr[0].uls.lo << shift)) * fConst; uPtr[0].uls.lo = (uPtr[0].uls.hi << shiftb) | (uPtr[0].uls.lo >> m); uPtr[0].uls.hi >>= m; /* address case of denormals, below 'flag' will write pre-regs otherwise */ if ((uPtr[0].uls.hi | uPtr[0].uls.lo) == 0) { pw10 -= 18; uPtr[0].ull = uPtr[1].ull, uPtr[1].ull = uPtr[2].ull; uPtr[2].ull = uPtr[3].ull, uPtr[3].ull = uPtr[4].ull; if (index == 8) { uPtr[4].ull = uPtr[5].ull, uPtr[5].ull = uPtr[6].ull; uPtr[6].ull = uPtr[7].ull, uPtr[7].ull = 0; } else { uPtr[4].ull = 0; } } } /* 2^-16, multiples */ if ((pw2 >> 4) != 0) { _ull_t *regPtr, *limit; pw2 >>= 4; limit = uPtr + (unsigned)(type >> 16); int16_t fcnt = 0; /* code approaches best written assembler, addressed most stalls * inotherwords, use caution rewording, huge penalties. */ do { uint64_t work; uint32_t w0; int32_t flag; regPtr = uPtr; w0 = (uint32_t)regPtr->uss.w0; fcnt += (flag = -((regPtr->ull = regPtr->ull >> 16) == 0)); work = (++regPtr)->ull; do { uint64_t m0 = w0 * 15258789062500ULL; w0 = (uint16_t)work; regPtr[flag].ull = (work = m0 + (work >> 16)); ++regPtr; } while ((regPtr <= limit) && ((work = regPtr->ull) != 0)); regPtr[flag].ull = w0 * 15258789062500ULL; } while ((--pw2) != 0); index = (int)(regPtr - uPtr) + 1; pw10 += fcnt * 18; } /* only pw10 used in function as exponent of power 10 */ return ((pw10 << 16) | (index << 4) | 8); } /* enters with 'sSize = strPtr - sPtr;'. * s, sPtr could have been malloc'd on entry, but expected to have been * cleared, output used, prior to entry. Possibly previous real allocation, * output to 'buffer', FILE or supplied, and not free'd. * printf() supplies BUFSTRSIZE on entry, realtostr supplies 45 ulls. * BUFSTRSIZE considered large for normal requests, * 45 ulls covers all needs for double reals. * all real types generally need 8, [±][0][.](precision)[e±000[0]] * IS_TYPE_F adds exponent to count, [±][0](exponent)[.](precision)[e±000[0]] * IS_TYPE_F has possiblity of grouping: * GRP_PREFIX assumes thousands, xxx,xxx,xxx,xxx etc. = 6 per ull * can't reuse precision 'reg's as double duty... 
18 bytes / 8 byte ull * bufS(string) must come first for free'ing of allocation */ #pragma mark *** External Interfaces *** #if (!__GNUC_PREREQ(4,5)) #pragma GCC diagnostic ignored "-Wuninitialized" #endif unsigned char * realtostr(uint32_t w160[5], unsigned char **s, int type, int arg_prec, int arg_flags, unsigned decimal) { /*smash_stack();*/ int exponent, rpos; /* Rounding POSition */ uint32_t (*math_func)(_ull_t *, int); unsigned char *sPtr, dblt_sign; /* set up for percision of 792 digits, 18/ull * 44ull */ /* malloc if exponent and percision dictate, 44ull = 352 bytes */ /* reg: static reserved, regs: actual reg use address (stack or allocated) */ _ull_t reg[45], *regs; /* supress warning due to zero jumps */ #if __GNUC_PREREQ(4,5) #pragma GCC diagnostic ignored "-Wuninitialized" #endif /* utPtr: top of stack, ubPtr: bottom of stack, ufPtr: fractional start */ _ull_t *utPtr, *ubPtr, *ufPtr; int d0; /* Digit0 (first digit) */ #if __GNUC_PREREQ(4,5) #pragma GCC diagnostic pop #endif uint64_t a; unsigned strsz, pw2, packed_return; /*** input defaults ***/ char *decimalPtr = (char*)&decimal; if (arg_prec < 0) arg_prec = 6; /* CASE 'g': initial zero(or first digit) is significant digit */ if ((arg_prec) && (IS_TYPE_G(type))) --arg_prec; /*** sign ***/ if ((w160[4] & 0x8000) != 0) dblt_sign = '-'; else { dblt_sign = 0; if ((arg_flags & (SIGN_PREFIX | SPC_PREFIX)) != 0) dblt_sign = (arg_flags & SPC_PREFIX) ? ' ' : '+'; } /*** exponent ***/ /* extra math used for limiting register updating in math units. Also * allows allocation size when precision exceeds static buffersize */ if ((exponent = w160[4] & 0x7FFF) == 0x7FFF) { /* inf or nan, nan(n-char-sequence) */ *(sPtr = *s) = dblt_sign, sPtr += (dblt_sign != 0); return __n_char_sequence(w160, sPtr, (type | ((arg_flags & ALT_PREFIX) << 1))); } if (exponent < 0x3FFF) { uint32_t reg_min = (IS_QUAD(type)) ? 7 : 4; math_func = __negative_exponent_fp; pw2 = reg_min; /* these 'need' pw10 pre-calculations for extra decisions */ /* !E can result in '0', no caculation needed, * >> 4 == 0 results in use of default, * default pw2 == 4 covers precision < 64 */ /* XXX with !e80, does pw2 cover precision < (NEG_REG_BITS + 1)? */ if ( (!IS_TYPE_E(type)) || ( (((uint32_t)(0x3FFF - exponent) >> 4) != 0) && (arg_prec >= 63) ) ) { pw2 = 0x3FFF - exponent; /* number of prefixing zeroes */ pw2 = (((uint32_t)(((uint64_t)((uint32_t)pw2) * (uint32_t)2585827973U) >> 32)) >> 1); /* check for all zeroes on a 'f' request */ if ( (!IS_TYPE_E(type)) && ((unsigned)arg_prec < pw2) ) { if (pw2 > reg_min) { pw2 = reg_min; exponent = 0; *(uint64_t*)&w160[2] = 0; /* XXX do I need? */ *(uint64_t*)&w160[0] = 0; if (IS_TYPE_G(type)) type |= TYPE_F; type |= TYPE_ZERO; /* must not see as denormal for doubles */ /* remove 'double' label, keep or set 'long double' */ type &= ~(UPPER_TYPE << 8); } else { pw2 = reg_min; } } else { /* subtract zeroes from significant digits */ uint32_t bits = (IS_QUAD(type)) ? 
112 : 63; pw2 = (0x3FFF - exponent) + bits - pw2; if (pw2 > (unsigned)arg_prec) pw2 = arg_prec; /* divide by 18 for number ulls needed */ if (pw2 <= 38) pw2 = reg_min; else pw2 = reg_min + (uint32_t)((((uint64_t)(uint32_t)(pw2 - 38)) * (uint32_t)238609295U) >> 32); } } } else { math_func = __positive_exponent_fp; pw2 = 2; /* when 'f', need maximum value */ if ( ((exponent >= 0x40ce) && (arg_prec >= 27)) || (IS_TYPE_F(type)) ) { pw2 = exponent - 0x3FFF; pw2 = (((uint32_t)(((uint64_t)((uint32_t)pw2) * 2585827973U) >> 32)) >> 1); /* 'e' doesn't need full precision */ if ((IS_TYPE_E(type)) && ((unsigned)arg_prec < pw2)) pw2 = arg_prec; /* divide by 18 for number ulls needed */ pw2 = 2 + (uint32_t)(((uint64_t)((uint32_t)pw2) * 252645135U) >> 32); } } /*** allocation ***/ /* allocate reg here based on exponent/precision, see above notes */ /* group allocation, fprintf() to free, returning we allocating 's' */ strsz = arg_prec + 8; type |= pw2 << 16; /* type F has special requirements, if positive exponent */ if ((type << 31) & ~(exponent - 0x3FFF)) { /* grouping assumption of thousands: 1,000,000 */ /* add in exponent: [±][0](exponent)[.](precision)[e±000[0]] */ /* assume all pw2 as 'whole' */ /* more accuracy, for less allocations, is faster */ uint32_t triplets = (exponent - 0x3FFF); strsz += 1 + (((uint32_t)(((uint64_t)((uint32_t)triplets) * 2585827973U) >> 32)) >> 1); if ((arg_flags & GRP_PREFIX) != 0) strsz += ((uint16_t)21845 * (uint32_t)triplets) >> 16; } regs = reg; if ((pw2 > 44) || (strsz > BUFSTRSIZE)) { /* if either static exceeds limit, allocate both */ strsz = (strsz + ((1 << 6) - 1)) & ~((1 << 6) - 1); /* test for non-static string buffer */ if (arg_flags & ALLOC_BUF) free(*s); /* assignment to sPtr, sPtr is one tested and free'd in fprintf() */ if ((sPtr = malloc((size_t)(strsz + ((pw2 + 1) * sizeof(_ull_t))))) == NULL) return sPtr; *s = sPtr; regs = (_ull_t*)(sPtr + strsz); } /* zero regs, no longer part of math units */ do regs[pw2].uls.hi = 0, regs[pw2].uls.lo = 0; while ((--pw2)); /*** fractional load ***/ /* assign 'fraction', 'exponent' info for passage to math units */ /* w160[0,1] used on requested quadruple, or long double 128bit (not e80) */ regs[0].uls.lo = exponent; regs[1].ull = *(uint64_t*)&w160[2]; regs[2].ull = *(uint64_t*)&w160[0]; if ((regs[1].ull | regs[2].ull) == 0) { exponent = 0; if (IS_TYPE_G(type)) type |= TYPE_F; type |= TYPE_ZERO; goto skip_round; } /*** math ***/ packed_return = (*math_func)(regs, type); /* transfer info returned into addresses, exponent */ ubPtr = (utPtr = regs) + ((packed_return >> 4) & 0x3FF); exponent = (d0 = _first_digit(utPtr->ull)); exponent += (int32_t)packed_return >> 16; packed_return &= 0x0F; ufPtr = ((packed_return == 0) ? ubPtr : ((packed_return == 0x08) ? 
regs : ubPtr - (packed_return - 1))); /*** rounding ***/ /* assign position 1 pass display precision */ rpos = arg_prec + 1; if (IS_TYPE_F(type)) { /* this excludes positive exponents, they will not jump to skip */ if (arg_prec < ~exponent) goto skip_round; /* positive exponents must count pre-radix as precision */ if (~exponent >> 31) arg_prec += exponent; /* when negative, remove pre-zero count, positive add in whole portion */ rpos += exponent; } /* rounding position <= first digit position, has front end zeroes */ if ((rpos -= d0) > 0) { _ull_t *aPtr = utPtr; /* test rounding for second ull or higher, assume not large arg_prec */ while (((++aPtr) < ubPtr) && (rpos > 18)) rpos -= 18; /* verify position has digits to round */ if (aPtr < ubPtr) { /* set last ull, if rpos == 1 then exclude current aPtr */ ubPtr = aPtr + (rpos != 1); if (ufPtr > ubPtr) ufPtr = ubPtr; const uint64_t *fc = _fcarry; if ((aPtr->ull += fc[18 - rpos]) >= _CARRY) { uint64_t t = aPtr[-1].ull + 1ULL; /* carry forced propagation, makes ull == 0 */ (ubPtr = aPtr)->ull = 0; (--aPtr)->ull = t; if ((aPtr == utPtr) && (t >= fc[19 + d0])) { exponent++; /* type 'f' has increase in precision if positive exponent */ /* condition of == 18 never occurs */ arg_prec += (type & ((uint32_t)~exponent >> 31)); ++d0; } } } } else { /* first ull rounding */ const uint64_t *fc = _fcarry; /* limit reads of ulls to next register */ ubPtr = utPtr + 1; if (ufPtr > ubPtr) ufPtr = ubPtr; /* add '5' to rounding location */ a = utPtr->ull + fc[-rpos]; utPtr->ull = a; /* test for propagation to new first digit */ if (a >= fc[19 + d0]) { exponent++; if ((++d0) == 18) { /* rare case. inserted here to avoid alter of ulltobuffer for odd case * and testing all cases for this condition. */ /* occurs @ 1e18, 1e-18 9.9999950000000026e+17 first precision 6 9.9999999999999987e+17 last 9.9999950000000012e-19 first precision 6 9.9999999999999988e-19 last */ *(sPtr = *s) = dblt_sign, sPtr += (dblt_sign != 0); /* type f and negative exponent */ if (((uint32_t)exponent >> 31) & type) { unsigned j, k; *sPtr++ = '0'; *(uint16_t*)sPtr = *(uint16_t*)decimalPtr; j = 1 + (decimalPtr[1] != 0); sPtr += j, decimalPtr += j; if ((j = -exponent - 1)) { if (j >= (unsigned)arg_prec) goto L24; arg_prec -= j; k = j & 3; if ((j = (int)((unsigned)j >> 2))) do *(uint32_t*)sPtr = 0x30303030U, sPtr += 4; while ((--j)); *(uint32_t*)sPtr = 0x30303030U; sPtr += k; } *sPtr++ = '1'; if ((arg_prec -= 1) > 0) goto L245; return sPtr; } *sPtr++ = '1'; if ( (IS_TYPE_G(type)) && (((unsigned)exponent) <= (unsigned)arg_prec) ) type |= TYPE_F; if ( (arg_prec) && (!(IS_TYPE_G(type)) || (arg_flags & ALT_PREFIX)) ) { unsigned dflag = (decimalPtr[1] != 0); *(uint16_t*)sPtr = *(uint16_t*)decimalPtr; sPtr += dflag + 1, decimalPtr += dflag + 1; goto L245; } goto L25; } } } /*** adjust for entered type... 
g ***/ /* rounding can adjust exponent, so now can test if conditions met */ if (IS_TYPE_G(type)) { /* this should exclude negatives */ if (((unsigned)exponent) <= (unsigned)arg_prec) goto L23; if (((unsigned)(exponent + 4)) <= 4) { arg_prec -= exponent; L23: type |= TYPE_F; } } /*** output section ***/ skip_round: *(sPtr = *s) = dblt_sign, sPtr += (dblt_sign != 0); /* type f and negative exponent, */ if ((((uint32_t)exponent >> 31) & type) | (IS_TYPE_ZERO(type))) { unsigned j, k; *sPtr++ = '0'; *(uint16_t*)sPtr = *(uint16_t*)decimalPtr; j = 1 + (decimalPtr[1] != 0); if (!arg_prec) { if (IS_TYPE_E(type)) goto L11; if (arg_flags & ALT_PREFIX) return (sPtr + j); return sPtr; } sPtr += j, decimalPtr += j; if ((j = -exponent - 1)) { if (j >= (unsigned)arg_prec) goto L24; arg_prec -= j; k = j & 3; if ((j = (int)((unsigned)j >> 2))) do *(uint32_t*)sPtr = 0x30303030U, sPtr += 4; while ((--j)); *(uint32_t*)sPtr = 0x30303030U; sPtr += k; } } { /* first ull if utPtr == ufPtr then all digits are part of precision. d0 does not count digits but returns log10... log10(1-9) = 0 if 'f', data starts at sPtr. Otherwise starts sPtr + size of decimal. will write on last char of decimal, which then that char copies to sPtr then overwritten by decimal. (!IS_TYPE_F(type)) stringPtr = sPtr + decimalsz To determine if first digit is used as precision use (decimalPtr == (char*)&decimal) Two-byte decimal determined by (((char*)&decimal)[1] != 0) */ /* deal with d0 */ /* _fcarry multiplier based on d0, then can adjust d0 */ const uint64_t *mConst = &_fcarry[35 - d0]; if ((d0 += ((utPtr == ufPtr) & IS_TYPE_F(type))) > arg_prec) d0 = arg_prec; unsigned char *stringPtr = sPtr; if (!IS_TYPE_F(type)) stringPtr += 1 + (((char*)&decimal)[1] != 0); /* traps d0 = [0,9] or all ull multipliers */ if (utPtr->ull < 1000000000) { _ull_t b; /* buf output use pre-incr (++) */ unsigned char *buf = stringPtr - 1; uint32_t f = (uint32_t)((uint64_t)utPtr->uls.lo * (uint32_t)mConst[(-9)]); uint32_t n = d0 + (decimalPtr == (char*)&decimal); b.ull = (uint64_t)(f << 2) * 2882303762U; goto L2; do { b.uls.hi = (((b.uls.hi << 4) >> 4) * 10) + (b.uls.lo >> 28); b.uls.lo <<= 4; L2: *++buf = (b.uls.hi >> 28) + '0'; } while ((--n)); } else { _ulltobuffer_nlr((divlu2r(utPtr->ull * (uint32_t)*mConst)), stringPtr, (d0 + (decimalPtr == (char*)&decimal))); } /* type 'e' requires radix after first digit */ if (!IS_TYPE_F(type)) { /* move first digit to first spot */ *sPtr = *stringPtr; /* can not use short write here, * will overwrite any digits following if sz == 1 */ *++sPtr = *decimalPtr; if (*++decimalPtr != 0) *++sPtr = *decimalPtr, decimalPtr++; /* a must for if arg_prec */ sPtr++; if (!arg_prec) { if (arg_flags & ALT_PREFIX) sPtr += (((char*)&decimal)[1] != 0); else sPtr -= 1 + (((char*)&decimal)[1] != 0); goto L11; } } sPtr += d0 + (decimalPtr == (char*)&decimal); if ((arg_prec -= d0) == 0) goto L25; } do { /* case where has whole/fraction */ if ((++utPtr) >= ufPtr) { if (decimalPtr == (char*)&decimal) { /* this also used as delimiter for whole part, stop point for * trailing zero removal */ unsigned dflag = (decimalPtr[1] != 0); *(uint16_t*)sPtr = *(uint16_t*)decimalPtr; sPtr += dflag + 1, decimalPtr += dflag + 1; } /* set to last ull, if not already at */ if (ufPtr == ubPtr) break; ufPtr = ubPtr; } if ((arg_prec - 18) < 0) { _ulltobuffer_nlr(divlu2r(utPtr->ull), sPtr, arg_prec); sPtr += arg_prec; goto L25; } arg_prec -= 18; _ulltobuffer_18lr(divlu2r(utPtr->ull), sPtr); sPtr += 18; } while (1); L24: if ( (arg_prec) && (!(IS_TYPE_G(type)) || 
                                                   (arg_flags & ALT_PREFIX)) ) {
        /* safe because of added room for exponent in allocation */
        unsigned k;
L245:
        k = arg_prec & 3;
        arg_prec = (int)((unsigned)arg_prec >> 2);
        *(uint32_t*)sPtr = 0x30303030U;
        if (arg_prec)
            do sPtr += 4, *(uint32_t*)sPtr = 0x30303030U; while ((--arg_prec));
        sPtr += k;
    }
L25:
    /* case 'g', remove any trailing zeroes */
    if (IS_TYPE_E(type))
        goto L11;
    if (!(arg_flags & ALT_PREFIX)) {
        /* test only 'last' byte of decimal point.
         * '== 0' only occurs if written. decimal is at max 2 bytes */
        if (*decimalPtr == 0) {
            unsigned char ch;
            ch = *--sPtr;
            if (IS_TYPE_G(type))
                while (ch == '0') ch = *--sPtr;
            /* decimal point is at max 2 bytes */
            if (ch != (unsigned char)decimalPtr[-1])
                ++sPtr;
            else if (&decimalPtr[-1] != (char*)&decimal)
                --sPtr;
        }
    } else if (*decimalPtr != 0) {
        unsigned dflag = (decimalPtr[1] != 0);
        *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
        sPtr += dflag + 1, decimalPtr += dflag + 1;
    }

    /*** exponent output ***/
    if (IS_TYPE_F(type))
        return sPtr;
L11:
    /* converts g,G to e,E and adds sign (portable character set) */
    *(uint16_t*)sPtr = cconst16_t(('e' ^ ((type & 0x20) ^ UPPER_TYPE)),
                                  ('+' - ((exponent >> 31) << 1)));
    /* absolute value */
    exponent = (exponent ^ (exponent >> 31)) - (exponent >> 31);
    if (exponent < 10) {
        /* exponent output requires 2 digit display */
        *(uint16_t*)(&sPtr[2]) = cconst16_t('0',('0' + exponent));
        return (sPtr + 4);
    }
    /* quadruple, long doubles range [10,4966] */
    uint32_t ecd = (uint32_t)(((uint64_t)((uint32_t)exponent << 19) * 2199023989U) >> 32);
    ++sPtr;
    if (exponent >= 1000)
        *++sPtr = (ecd >> 28) + '0';
    ecd = ((ecd << 4) >> 4) * 10;
    if (exponent >= 100)
        *++sPtr = (ecd >> 28) + '0';
    sPtr[1] = ((ecd = (((ecd << 4) >> 4) * 10)) >> 28) + '0';
    sPtr[2] = ((((ecd << 4) >> 4) * 10) >> 28) + '0';
    return (sPtr + 3);
}

/* "[±]0xh.hhhhp±d"
 * normalized:   h. force to high nibble for higher precision
 * denormalized: .h force highest nibble as above
 * output precision based on 'g' style */
unsigned char *
realtohex(uint32_t w160[5], unsigned char **s, int type, int arg_prec,
          int arg_flags, unsigned decimal)
{
    unsigned char *sPtr = *s;
    int exponent;
    uint64_t frac;

    /*** input defaults ***/
    char *decimalPtr = (char*)&decimal;
    if (arg_prec < 0) {
        arg_prec = (IS_DBL(type)) ? 13 : 15;
        type |= TYPE_G;
    }
    /* determine upper or lower case output */
    const char *_outvalues = _const_outStr;
    if (IS_UPPER_TYPE(type))
        _outvalues += 32;

    /*** sign && exponent ***/
    exponent = w160[4] & 0x7FFF;
    if ((w160[4] & 0x8000) != 0)
        *sPtr++ = '-';
    else {
        if ((arg_flags & (SIGN_PREFIX | SPC_PREFIX)) != 0)
            *sPtr++ = (arg_flags & SPC_PREFIX) ?
                      ' ' : '+';
    }
    if (exponent == 0x7FFF)
        return __n_char_sequence(w160, sPtr, (type | ((arg_flags & ALT_PREFIX) << 1)));
    exponent -= 16383;

    /*** prefix 0x or 0X ***/
    *((uint16_t *)sPtr) = ((uint16_t *)_outvalues)[12];
    sPtr += 2;

    /*** fractional load ***/
    if ((frac = *(uint64_t*)&w160[2]) == 0) {
        exponent = 0;
        *sPtr++ = '0';
        if (arg_flags & ALT_PREFIX) {
            unsigned dflag = (decimalPtr[1] != 0);
            *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
            sPtr += dflag + 1, decimalPtr += dflag + 1;
        }
        goto out_exp;
    }
    /* alter exponent by 'shift' amount used to normalize */
    if ((frac & 0x8000000000000000) != 0)
        exponent -= 3;

    /*** rounding ***/
    if (arg_prec < 15) {
        uint64_t rmask = 0x0FFFFFFFFFFFFFFFULL >> (arg_prec << 2);
        uint64_t radd = (rmask + 1) >> 1;
        /* not sure we need to mask the result */
        if ((frac & rmask) > radd)
            frac = (frac + radd) & ~rmask;
    }
    /* rotate high nibble to low nibble */
    uint32_t snfct = 0;
    if ((frac & 0x8000000000000000ULL) == 0) {
        frac <<= 1;
        snfct = 1;
        *sPtr++ = '0';
    } else {
        frac = (frac >> 60) | (frac << 4);
        *sPtr++ = _outvalues[((uint32_t)frac & 0x0000000F)];
    }
    /* decimal point */
    unsigned dflag = (decimalPtr[1] != 0);
    *(uint16_t*)sPtr = *(uint16_t*)decimalPtr;
    sPtr += dflag + 1, decimalPtr += dflag + 1;
    /* precision based on type, adjust if denormal */
    dflag = (arg_prec < ((IS_DBL(type)) ? 13 : 15)) ? arg_prec : ((IS_DBL(type)) ? 13 : 15);
    arg_prec -= dflag;
    if (snfct == 1)
        dflag++;
    do {
        /* rotate into position */
        frac = (frac >> 60) | (frac << 4);
        *sPtr++ = _outvalues[((uint32_t)frac & 0x0000000F)];
    } while ((--dflag));
    if (IS_TYPE_G(type)) {
        unsigned char ch = *--sPtr;
        while (ch == '0') ch = *--sPtr;
        if (ch != (unsigned char)decimalPtr[-1])
            ++sPtr;
        else if (&decimalPtr[-1] != (char*)&decimal)
            --sPtr;
    } else if (arg_prec) {
        dflag = arg_prec & 3;
        arg_prec = (int)((unsigned)arg_prec >> 2);
        *(uint32_t*)sPtr = 0x30303030U;
        if (arg_prec)
            do sPtr += 4, *(uint32_t*)sPtr = 0x30303030U; while ((--arg_prec));
        sPtr += dflag;
    }

    /*** exponent output (p±d p[-16494,+16383]) ***/
out_exp:
    *sPtr++ = _outvalues[19];   /* based on portable character set */
    *sPtr++ = '+' - ((exponent >> 31) << 1);
    /* absolute value */
    exponent = (exponent ^ (exponent >> 31)) - (exponent >> 31);
    if (exponent < 10) {
        *sPtr = exponent + '0';
        return (sPtr + 1);
    }
    /* divide exponent by 10000 3518437209 */
    uint32_t ecd = (uint32_t)(((uint64_t)(uint32_t)exponent * 3518437209U) >> 17);
    ecd += 1 << 14;
    if (exponent >= 10000)
        *sPtr++ = (ecd >> 28) + '0';
    ecd = ((ecd << 4) >> 4) * 10;
    if (exponent >= 1000)
        *sPtr++ = (ecd >> 28) + '0';
    ecd = ((ecd << 4) >> 4) * 10;
    if (exponent >= 100)
        *sPtr++ = (ecd >> 28) + '0';
    sPtr[0] = ((ecd = (((ecd << 4) >> 4) * 10)) >> 28) + '0';
    sPtr[1] = ((((ecd << 4) >> 4) * 10) >> 28) + '0';
    return (sPtr + 2);
}

#pragma mark *** Utility External Interfaces ***

/* need different long double definitions 106,64,53 */
long double qtold(quadruple qdbl)
{
    octle iqdbl;
    unsigned char *cdbl = (unsigned char*)&iqdbl;

    iqdbl = qdbl;
#if (__LDBL_MANT_DIG__ == 64)
    iqdbl.ulls.ull0 = (iqdbl.ulls.ull1 << 15) | (iqdbl.uss.us3 >> 1);
    iqdbl.ulls.ull1 >>= 48;
    iqdbl.uls.ul1 = (((iqdbl.uss.us4 & 0x7fff) != 0) << 31) | (iqdbl.uls.ul1 & 0x7fffffff);
#else
    int16_t sign = (iqdbl.uss.us7 & 0x8000) >> 4;
    int16_t e16 = (iqdbl.uss.us7 & 0x7fff) - 16383 + 1023;
    iqdbl.uss.us7 = 0;
    iqdbl.ulls.ull1 <<= 4;
    iqdbl.uss.us4 |= iqdbl.uss.us3 >> 12;
    /* double-double and double */
#if (__LDBL_MANT_DIG__ == 106)
    /* assumes low double exponent different by 2^-52, keeps sign
     * of course this doesn't make sense on denormals
     * thus no extra precision, low double
has to be zero */ /* mantissas into correct positions */ iqdbl.ulls.ull0 >>= 8; iqdbl.uss.us3 |= ((e16 - 52) | sign) << 4; if (e16 <= 52) { iqdbl.ulls.ull0 = 0; /* unknown if benefits hardware when 0 */ iqdbl.uss.us3 = sign << 4; } #endif if (e16 <= 0) { if (e16 <= -52) iqdbl.ulls.ull1 = 0; else iqdbl.ulls.ull1 >>= (12 - e16); e16 = 0; } else if (e16 > 2047) { /* set to inf */ iqdbl.ulls.ull1 = 0; e16 = 2047; } iqdbl.uss.us7 |= (e16 | sign) << 4; #endif return (*(long double*)cdbl); } double ldtod(long double ldbl) { unsigned char *wc = (unsigned char *)&ldbl; #if ((__LDBL_MANT_DIG__ == 53) || (__LDBL_MANT_DIG__ == 106)) /* ppc double-double XXX untested, just drop lower 64, big endian */ /* wc will point to highest byte of 128 bit */ return (*(double*)wc); #else /* since padding occurs, non gpr registers tend towards 128bit use: */ octle ildbl; int16_t e16; int16_t sign; ildbl = *(octle*)wc; wc = (unsigned char *)&ildbl; /* 3 known, use IEEE-754 16bit sign/exp, biased */ #if (__LDBL_MANT_DIG__ > 65) /* trap binary128 */ e16 = ildbl.uss.us7; ildbl.ulls.ull0 = ildbl.ulls.ull1 >> 12; #else /* trap binary64, e80 (64/63 mantissas) */ e16 = ildbl.uss.us4; #if (__LDBL_MANT_DIG__ == 64) ildbl.uss.us3 &= 0x7fff; ildbl.ulls.ull0 >>= 11; #else ildbl.ulls.ull0 = (((ildbl.ulls.ull1 << 4) | (uint64_t)(ildbl.uss.us3 >> 12))) >> 12; #endif #endif sign = (e16 & 0x8000) >> 4; e16 = (e16 & 0x7FFF) - 16383 + 1023; if (e16 <= 0) { if (e16 <= -52) ildbl.ulls.ull0 = 0; else ildbl.ulls.ull0 >>= (12 - e16); e16 = 0; } else if (e16 > 2047) { /* set to inf */ ildbl.ulls.ull0 = 0; e16 = 2047; } ildbl.uss.us3 |= (e16 | sign) << 4; return (*(double*)wc); #endif } int main(void) { phx_printf("%.43f\n", 0x1.52f8a8e32e982p-140); phx_printf("%g\n", 355.0/113.0); return 0; }
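
/* A rough usage note (illustrative): __format() packs the conversion letter
 * and its length modifier into the 'type' word handed to realtostr() and
 * realtohex().  A "%f" applied to a plain double arrives as
 *
 *     type = ('l' << 8) | ('f' - 'E');    // IS_DBL(type) and IS_TYPE_F(type) hold
 *
 * "%qa" arrives as (('q' << 8) | ('a' - 'A')), and "%Lg" as
 * (('L' << 8) | ('g' - 'E')) on an x87 long-double build.  For the second
 * demo line above, 355.0/113.0 is about 3.14159292, so "%g" with its default
 * precision of 6 significant digits should print "3.14159".
 */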