public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* RFD: RISC-V vectorized libfuncs
@ 2023-07-19 12:39 Joern Rennecke
  0 siblings, 0 replies; only message in thread
From: Joern Rennecke @ 2023-07-19 12:39 UTC (permalink / raw)
  To: GCC

[-- Attachment #1: Type: text/plain, Size: 389 bytes --]

I think it would make sense to leave the exact vector layout, like
vlen and lmul, to the caller.
Attached is an attempt to implement sin and cos vectorized so it
allows lmul values of m1 and m2, while using no more than a quarter of
the vector registers.
The function could live in libgcc and be used via a special pattern in
the machine description that
shows the exact list of clobbers.

[-- Attachment #2: sin.S --]
[-- Type: application/octet-stream, Size: 3518 bytes --]

/* vectorization of newlib/libm/mathfp/s_sine.c, with a few changes:
  - overflow is a bit farther off because we use 64 bit integers.
  - all errors are EDOM; for too large inputs, the output will be NaN.
  - redundant parts have been elided.
  - using the table from newlib/libm/mathfp/k_sin.c  */
/* The caller is expected to use a suitable v*setvl* instruction;
   could use m1 or m2.
   For m1: inputs: v2  Outputs: v2  Clobbered: a0-a2, v0, v4, v6, f0-f5
   For m2: inputs: v2/v3 Outputs: v2/v3 Clobbered: a0-a2, v0/v1, v4-v7, f0-f5
  (We don't use v0 for argument passing because it's the mask register. )  */
      
/* Value stored to errno by sin when at least one lane is rejected.  */
#define EDOM 33

/* Vector register roles.  Each name denotes a single register for m1 and
   the register pair starting at that number for m2.  */
/* X: the incoming argument; also holds the range-reduced argument.  */
#define X v2
/* y: same register as X; the sign-adjusted reduced argument and, at the
   end, the return value.  */
#define y v2
/* g: y * y, the variable the polynomial is evaluated in.  */
#define g v4
/* R: (double) N during range reduction, then the polynomial accumulator.  */
#define R v6
/* tmp: scratch.  This is v0, so it doubles as the mask register for the
   error check and is overwritten afterwards.  */
#define tmp v0

 /* The multiply-add operation in the Taylor series can't actually save us
    instruction count.  Either we have to use a strided load to splat the
    values into a vector register - then we need to adjust the address
    register, as there is no offset available for vector loads.  Or we
    need to load into a scalar register first and splat that into a vector
    register.  Or we use separate multiply and add instructions so that we
    can use a scalar register directly for the add.
    Latency-wise, the strided load is not guaranteed to avoid
    repeated loads even if using x0 for the stride, and the fused multiply-add
    might be faster (and more precise) than separate multiply and add, so we
    go for loading into a scalar and moving that into a vector.  */

/* Constant pool.  cos and sin address these doubles with hard-coded negative
   offsets from their own entry points, so the layout must stay exactly
   4 + 2 + 6 eight-byte entries, immediately followed by cos.
   Each .double carries exactly one value and no trailing separator: a
   trailing comma leaves an empty operand, which the assembler may reject or
   treat as an extra element, either of which would break the layout.  */
 .balign 8 // keep the doubles naturally aligned for fld
pies:
 .double 1.57079632679489661923 // PI/2  (added to the argument by cos)
 .double 0.5                    // rounding bias for N = round (X / PI)
 .double 0.31830988618379067154 // 1/PI
 .double 3.14159265358979323846 // PI
trouble:
 .quad 0x43dfffffffffffff // minimum unsafe input (b/cause we round up)
 .quad 0x7ff8000000000000 // canonical NaN (qNaN)
/* Polynomial coefficients, lowest order first; loaded by sin as r[0]..r[5].  */
tab:
 .double -1.66666666666666324348e-01 /* 0xBFC55555, 0x55555549 */
 .double  8.33333333332248946124e-03 /* 0x3F811111, 0x1110F8A6 */
 .double -1.98412698298579493134e-04 /* 0xBF2A01A0, 0x19C161D5 */
 .double  2.75573137070700676789e-06 /* 0x3EC71DE3, 0x57B1FE7D */
 .double -2.50507602534068634195e-08 /* 0xBE5AE5E6, 0x8A2B9CEB */
 .double  1.58969099521155010221e-10 /* 0x3DE5D93A, 0x5ACFD57C */
.global cos
/* cos: lane-wise cosine of X (v2, or v2/v3 for m2), computed as
   sin (X + PI/2).  There is no ret here: execution falls through into sin,
   so inputs, outputs and clobbers are those of sin.
   The three instructions below occupy 12 bytes; sin relies on that size
   when it addresses the constant pool.  */
cos:
 auipc a0,0			// a0 = address of cos
 /* The pool (4 + 2 + 6 doubles = 12 * 8 bytes) ends exactly at cos, so
    pies[0] is at a0 - 12*8; no further adjustment applies here because
    a0 is the address of cos itself.  */
 fld f0,-12*8/*(pies-cos)*/(a0)	// f0 = PI/2
 vfadd.vf X,X,f0		// X += PI/2
.global sin
/* sin: lane-wise sine of X (v2, or v2/v3 for m2); the result replaces X.
   The caller sets up vl/vtype beforehand (m1 or m2); the constants are
   doubles, so this presumably requires SEW=64 -- confirm with the caller.
   Lanes holding Inf, NaN, or a magnitude at or above the "trouble"
   threshold produce NaN, and errno is set to EDOM if any lane is affected.
   Clobbers: a0-a2, f0-f5, tmp (v0), g (v4), R (v6) and, for m2, their
   odd-numbered partners.
   cos falls through into this entry point after adding PI/2 to X.  */
sin:
 auipc a0,0			// a0 = address of sin
 /* All constants are addressed relative to a0.  The pool ends at cos and
    cos is three 4-byte instructions long, hence the extra -12 on every
    offset.  */
 fld f0,-8*8-12/*(trouble-sin)+0*8*/(a0)	// f0 = overflow threshold
 fld f1,-7*8-12/*(trouble-sin)+1*8*/(a0)	// f1 = quiet NaN

 fld f2,-11*8-12/*(pies-sin)+1*8*/(a0)	// f2 = 0.5
 fld f3,-10*8-12/*(pies-sin)+2*8*/(a0)	// f3 = 1/PI
 fld f4, -9*8-12/*(pies-sin)+3*8*/(a0)	// f4 = PI

 /* Check for Inf / NaN / impending overflow */
 vfsgnjx.vv tmp,X,X		// tmp = |X|  (sign ^ sign == 0)
 /* Compare the bit patterns as integers to avoid an invalid operation
    exception.  The threshold is splatted from f0 into R, which is free
    until the range reduction; this keeps the check independent of XLEN.  */
 vfmv.v.f R,f0
 vmsle.vv v0,R,tmp		// v0 mask = (threshold <= |X|)
 vfmerge.vfm X,X,f1,v0 // Put a Nan in each affected lane.
 /* write errno if there's an error.  */
 vcpop.m a1,v0			// a1 = number of affected lanes
 beq a1,zero,0f
 /* NOTE(review): this assumes errno is a plain global int reachable with
    absolute %hi/%lo addressing -- confirm for the target libc.  */
 lui     a1,%hi(errno)
 li      a2, EDOM
 sw      a2,%lo(errno)(a1)
0:
 /* Range reduction: N = nearest integer to X / PI;  X = X - N * PI.  */
 vfmv.v.f tmp,f2
 vfsgnj.vv tmp,tmp,X		// tmp = copysign (0.5, X)
 vfmacc.vf tmp,f3,X		// tmp = X * (1/PI) + tmp
 vfcvt.rtz.x.f.v tmp,tmp	// tmp = N (truncate after the +-0.5 bias)
 vfcvt.f.x.v R,tmp		// R = (double) N
 vfnmsac.vf X,f4,R		// X = X - PI * R

 /* Polynomial coefficients r[0]..r[5]; same -12 adjustment as above.  */
 fld f0,-6*8-12/*(tab-sin)+0*8 */(a0)
 fld f1,-5*8-12/*(tab-sin)+1*8 */(a0)
 fld f2,-4*8-12/*(tab-sin)+2*8 */(a0)
 fld f3,-3*8-12/*(tab-sin)+3*8 */(a0)
 fld f4,-2*8-12/*(tab-sin)+4*8 */(a0)
 fld f5,-1*8-12/*(tab-sin)+5*8 */(a0)

 li a1,63
 vsll.vx tmp,tmp,a1	// tmp & 1 -> sgn
 vfsgnjx.vv y,X,tmp		// flip the sign of y where N is odd

 /* Evaluate polynomial: result = y + y * g * P(g), with g = y * y and
    P(g) = r[0] + g * (r[1] + g * (r[2] + g * (r[3] + g * (r[4] + g * r[5])))).  */
 vfmul.vv g,y,y

 vfmv.v.f R,f4			// R = r[4]
 vfmacc.vf R,f5,g		// R = r[5] * g + R
 vfmv.v.f tmp,f3
 vfmadd.vv R,g,tmp		// R = R * g + r[3]
 vfmv.v.f tmp,f2
 vfmadd.vv R,g,tmp		// R = R * g + r[2]
 vfmv.v.f tmp,f1
 vfmadd.vv R,g,tmp		// R = R * g + r[1]
 vfmv.v.f tmp,f0
 vfmadd.vv R,g,tmp		// R = R * g + r[0] = P(g)
 vfmul.vv R,R,g			// R = g * P(g); the series has no constant term
 vfmadd.vv y,R,y // y = y * R + y; return value in y == v2
 ret

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-07-19 12:39 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-07-19 12:39 RFD: RISC-V vectorized libfuncs Joern Rennecke

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).