branches compared (lja

public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed

* branches compared (lja_speed, EV56)
@ 2004-04-20 22:08 Kurt Garloff
  2004-04-25  2:02 ` Kurt Garloff
  0 siblings, 1 reply; 2+ messages in thread
From: Kurt Garloff @ 2004-04-20 22:08 UTC (permalink / raw)
  To: gcc


[-- Attachment #1.1: Type: text/plain, Size: 1706 bytes --]

Hi,

lja_speed benchmarks on my EV56-600 (DEC21164A), 768MB, KDE kicker 
background load, times in seconds.

                        -fno-new-ra  -fnew-ra	-fnew-ra 
						-fbranch-prob
*** With FFM
gcc-3.2.2 (SL81)          14.06		 9.37+	  9.24+
gcc-3.2.3                 14.12  	 
gcc-334     20040409      13.84		 9.46+	  9.25+ 
gcc-333 ham 20040409      16.56-	13.81-	 13.72 
gcc-34      20040409      12.89		11.18 
34 new_ra   20040409      16.67-	14.14-   ERROR! 
gcc-35      20040409      13.03		11.25	 ERROR!
35 tree-ssa 20040416	  12.01+	11.81	 14.10-
35 ssa-lno  20040409	  13.08		12.64
*** With CPML
gcc-3.2.2 (SL81)          14.18 	 9.27+		
ccc-6.5.9                  8.60

Compile flags: 
gcc: -Wall -O2 -ffast-math -fomit-frame-pointer -fschedule-insns2 -O3 \
 -frerun-loop-opt -mcpu=ev56 -funroll-loops -fstrict-aliasing 
ccc: -w0 -msg_display_tag -O2 -accept restrict_keyword \
 -D__USE_STD_IOSTREAM -fast -tune ev56 -arch ev56 -O4 -inline speed 

Notes:
* With the old register allocator, you can see improvements
  from 3.2 -> 3.3 -> 3.4 -> ssa.
  Both 3.5 and ssa-lno are a bit behind, hammer branch and new_ra
  branches suck.
* With the new register allocator, 3.3.4 and 3.2.2SUSE perform best,
  3.4, 3.5 are acceptable, ssa somewhat worse, ssa-lno worse
  and hammer and new-ra branches suck.
* The hammer branch slowdown is probably an import from 3.4; it used
  to be absent 9 month ago, then a slowdown appeared on 3.4, which is
  meanwhile cured.
* FDO is broken on 3.4 and 3.5.

Regards,
-- 
Kurt Garloff  <garloff@suse.de>                            Cologne, DE 
SUSE LINUX AG, Nuernberg, DE                          SUSE Labs (Head)

[-- Attachment #1.2: lja_speed.c --]
[-- Type: text/plain, Size: 7782 bytes --]

/* Test program for new-ra 
 * From ljapunov calc program
 * (c) Kurt Garloff, 10/2003, GNU GPL
 */ 

#define _GNU_SOURCE
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#if 1 //def __alpha__
#include <setjmp.h>
#include <signal.h>
#include <fenv.h>
#endif


#define LONG_DOUBLE long double
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)>(b)?(b):(a))
#if defined(__GNUC__) && __GNUC__ >= 3
# define LIKELY(x)	__builtin_expect((x) != 0,1) 
# define UNLIKELY(x)	__builtin_expect((x) != 0,0)
#else
# define LIKELY(x)	(x)
# define UNLIKELY(x)	(x)
#endif

#define MAXPATLEN 64
#define MAXCOLENTS 12
// Coltrafo description	

typedef struct _ct_inf {	
	float val;
	float h, s, i;
} ct_inf;

enum calcmode { FIXX, FIXB, FIXA };

typedef struct _lja_param {
	char magic[4];	/* "LJA0" */
	char rev;	/* revision */
	char cmode;	/* enum calcmode actually */
	unsigned char pass;
	unsigned char patlen;
/*0x08:*/
	double x0;
/*0x10:*/
	double amin, amax, bmin, bmax;
/*0x30:*/
	unsigned int xres, yres, bline, depth;
/*0x40:*/
	char pattern[MAXPATLEN];
/*0x80:*/
	/* Info for lja2bmp */
	float ampln, amplp;
/*0x88:*/
	char res2[55];
	char ctents;
/*0x0c0:*/
	ct_inf coltbl [MAXCOLENTS];
/*0x180:*/
} lja_parm;
	
lja_parm *ljp;
float *results;
unsigned long int nl;
// Count of A's and B's in pattern
static int na, nb;
static char *fullpat;
char quiet;
volatile sig_atomic_t interrupt;


#define INF 20000
#define SML 1e-24
#define DIV 160.0

const char lja_hdr_latest_rev = 2;

void lja_defaults (lja_parm * const ljp)
{
	memset (ljp, 0, sizeof (lja_parm));
	/* Ljapunov settings */
	memcpy (ljp->magic, "LJA0", 4);
	ljp->rev = lja_hdr_latest_rev; ljp->cmode = FIXX;
	ljp->pass = 0; 	ljp->patlen = 2;
	ljp->x0 = 0.4;
	ljp->amin = -2.7; ljp->amax = 4.2;
	ljp->bmin = -2.7; ljp->bmax = 4.2;
	ljp->xres = 1280; ljp->yres = 1024;
	ljp->bline = 0; ljp->depth = 256;
	memset (ljp->pattern, 0, MAXPATLEN); ljp->pattern[1] = 1;
	ljp->ampln = 0.4; ljp->amplp = 1.6;
	ljp->ctents = 0;
}

static //inline 
double extrapol_lja (const double a, const double b, const double l, 
		     const unsigned r, const unsigned dpth)
{
	++nl;
	if (UNLIKELY(l <= 0.0))
		return -INF;
	else
		return ( log(l) + r * 
			(  log (fabs (a)) * na
			 + log (fabs (b)) * nb ) ) / dpth;
}

#if 1 //def __alpha__
/* Exception handling: Will probably not work with multithreading */
jmp_buf lja_fpe_jump;

void fpe_handler (int signum, int info)
{
	if (!info)
		info = -1;
	printf ("\x1b[D!"); fflush (stdout); 
	if (lja_fpe_jump)
		longjmp (lja_fpe_jump, info);
	else
		raise (signum);
}
#endif
	
static inline
void set_pat (double* pat, const int ln, const double a, const double b)
{
	int i;
	for (i = 0; i < ln; ++i)
	       	pat[i] = (fullpat[i%ln]? b: a);
}

double ljapunov (const double a, const double b, const double xi, 
		 const unsigned dpth)
{
	int t = 0; register int p;
	const int exppln = (ljp->patlen > 16?     ljp->patlen: 
			    ljp->patlen >  8? 2 * ljp->patlen: 
			    		      4 * ljp->patlen);
	register LONG_DOUBLE l, r, x = xi;
	const int rep = dpth / exppln;
	
	double pattern[32];
	set_pat (pattern, exppln, a, b);
	//double pattern[2];
	//pattern[0] = a; pattern[1] = b;
	
	l = 1.0;
#if 1 //def __alpha__
	if (UNLIKELY(setjmp (lja_fpe_jump)))
		return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
#endif
	//asm (".align 32\n");
	for (t = 0; t < rep; ++t) {
		for (p = 0; p < exppln; ++p) {
			//r = (fullpat[p]? b: a);
			r = pattern[p];
			//r = pattern[fullpat[p]];
			x *= r*(1.0-x); 
			l *= fabs(r*(1.0-x*2.0));
		}
		{
		register const double fabsx = fabs(x);
		if ( /*t >= 2 &&*/
		    UNLIKELY(fabsx > DIV || fabsx < SML) )
			return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);

		}
	}
	// Residuals ?
	t = dpth - rep*exppln; 
	for (p = 0; p < t; ++p) {
		//r = (fullpat[p]? b : a);
		r = pattern[p];
		//r = pattern[fullpat[p]];
		x *= r*(1.0-x); 
		l *= fabs (r*(1.0-x*2.0));
	}

	if (LIKELY(l > 0.0)) 
		return log (l) / dpth;
	else 
		return -INF;
/*
 out:
	return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
 */	
}

void prepare_ljapunov (const int dpth)
{
	// pattern count
	int s; na = 0; nb = 0;
	for (s = 0; s < ljp->patlen; ++s) {
		if (ljp->pattern[s])
			++nb;
		else
			++na;
	}
	for (s = 0; s < dpth; ++s)
		fullpat[s] = ljp->pattern[s%ljp->patlen];
}

void do_calc_lines (const unsigned xstep, const unsigned ystep, 
		    const unsigned depth,
		    const unsigned start, const unsigned end)
{
	unsigned ln, col;
	const double mx = (ljp->amax - ljp->amin)/ljp->xres;
	const double my = (ljp->bmax - ljp->bmin)/ljp->yres;
#ifdef __alpha__
	fenv_t fpu;
	//fegetenv(&fpu);
	//printf("FPU: 0x%016lx", fpu);
	//fesetenv(FE_DFL_ENV);
	fegetenv(&fpu);
	//printf(" 0x%016lx", fpu);
	fpu |= FE_MAP_DMZ | FE_MAP_UMZ;
	fesetenv(&fpu);
	fedisableexcept(FE_ALL_EXCEPT);
	//fegetenv(&fpu);
	//printf(" 0x%016lx\n", fpu);
#endif	
	signal (SIGFPE, (sig_t)fpe_handler);
	
	for (ln = start; ln < end; ln += ystep) {
		double y = (double)ljp->bmax - my * (ln + (double)ystep*0.5);
		float res;
		//printf ("%i (%ix%i)\n", ln, xstep, ystep);
		switch (ljp->cmode) {
		    case FIXX:
			for (col = 0; col < ljp->xres; col += xstep) {
				double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
				register unsigned ln2, col2;
				res = ljapunov (x, y, ljp->x0, depth);
				for (ln2 = ln; ln2 < MIN(ln+ystep,end); ln2++)
					for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); ++col2)
						*(results+col2+ljp->xres*ln2) = res;
			}; break;
		    case FIXA:
			for (col = 0; col < ljp->xres; col += xstep) {
				double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
				register unsigned ln2, col2;
				res = ljapunov (ljp->x0, y, x, depth); 
				for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
					for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
						*(results+col2+ljp->xres*ln2) = res;
			}; break;
		    case FIXB:
			for (col = 0; col < ljp->xres; col += xstep) {
				double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
				register unsigned ln2, col2;
				res = ljapunov (x, ljp->x0, y, depth);
				for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
					for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
						*(results+col2+ljp->xres*ln2) = res;
			}; break;
		    default: res = 0; abort ();
		}
	}
	signal (SIGFPE, SIG_DFL);
}

void calcn (const unsigned xstep, const unsigned ystep, 
	    const unsigned dpth)
{
	unsigned int y = 0; int thr;
	const int ychunk = 32;
	unsigned yrmax = ljp->yres;
	fullpat = (char*)malloc (dpth); prepare_ljapunov (dpth);
	for (y = ljp->bline; y < yrmax && !interrupt; y += ychunk) {
		if (!quiet) { 
			printf ("."); fflush (stdout); 
		}
		/* Do the last chunk ourself */
		do_calc_lines (xstep, ystep, dpth, 
				y, y+ychunk);
	}
	if (interrupt) { 
	   	if (!quiet) 
			printf ("%i", y-1); 
	   	ljp->bline = y; 
		free (fullpat);
		return; 
	}
	if (y != yrmax)
		do_calc_lines (xstep, ystep, dpth, y, yrmax);
	//ljp->bline = ljp->yres;
	ljp -> bline = 0;
	free (fullpat);
	if (!quiet) { 
		printf ("%i\n", ljp->yres-1); fflush (stdout); 
	};
}

int main (int argc, char* argv[])
{
	clock_t st, en;
	st = clock();
	ljp = (lja_parm*)malloc(sizeof(lja_parm));
	lja_defaults(ljp);
	results = (float*) malloc(ljp->xres * ljp->yres * sizeof (float));
	calcn (1, 1, 256);
	free(results);
	free(ljp);
	en = clock();
	printf("CPU time: %.3fs\n", (double)(en-st)/CLOCKS_PER_SEC);
	return 0;
}



[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: branches compared (lja_speed, EV56)
  2004-04-20 22:08 branches compared (lja_speed, EV56) Kurt Garloff
@ 2004-04-25  2:02 ` Kurt Garloff
  0 siblings, 0 replies; 2+ messages in thread
From: Kurt Garloff @ 2004-04-25  2:02 UTC (permalink / raw)
  To: gcc


[-- Attachment #1.1: Type: text/plain, Size: 5059 bytes --]

On Mon, Apr 19, 2004 at 01:50:57PM +0200, Kurt Garloff wrote:
> lja_speed benchmarks

Update:

lja_speed benchmarks on a quad Opteron (1400MHz), 2GB, times in seconds
(average from two best out of three).

			-fno-	-fnew-ra FDO	 FDO	
			new-ra			new-ra	
gcc-3.2.2 (SLES8)	 5.56	 4.32+	 5.52	 4.15+
gcc-334 CVS 20040424	 5.55	 7.85-	 5.49	 7.66-
gcc-333 hammer  0424	 5.51	 ICE!!	 5.22	 ICE!!
gcc-340 newra   0424	 6.26-	 4.27+	 6.95-	 4.46+
gcc-3.4.0 		 5.59	 7.69-	 5.82-	 9.21-
gcc-350 CVS 20040424	 4.59+	 7.05	 5.22	 8.74
gcc-350 treessa 0424	 3.94+	 5.87	 3.43+	 4.05+
gcc-350 ssa-lno 0424	 4.98	 7.61-	 4.23+	 4.96
For comparison: 32bit
gcc-3.2.2 (SLES8) -m32	 6.11	 4.51	 6.02	 4.49

Options:
gcc: -Wall -O2 -ffast-math -fomit-frame-pointer -fschedule-insns2 -O3 
	-frerun-loop-opt  -funroll-loops -fstrict-aliasing
FDO:    -fprofile-arcs, run, -fbranch-probabilities (gcc-3.2/3.3/newra)
	-fprofile-generate, run, -fprofile-use (gcc-3.4/3.5/ssa/lno)

Compile time (seconds, user) and text sizes (stripped):
			-fno-	-fnew-ra FDO	 FDO	
			new-ra			new-ra	
gcc-3.2.2 (SLES8)	0.28	0.29	0.27	0.31
			 6601	 6201	 6585	 6169
gcc-334 CVS 20040424	0.29	0.37	0.30	0.36
			 7141 	 8389	 7059	 8255
gcc-333 hammer  0424	0.32 	ICE!	0.29	ICE!	
			 7145		 5874
gcc-340 newra   0424	0.57	0.79	0.32	0.41
			 9810	 9746	 5544	 5729
gcc-3.4.0 		0.48	0.56	0.29	0.35
			 9381	10261	 5764	 6795
gcc-350 CVS 20040424	0.51	0.60	0.36	0.41
			 9471	 9855	 6287	 7183
gcc-350 treessa 0424	0.80	0.91	0.71	0.79
			 6861	 7325	 5501	 5901	
gcc-350 ssa-lno 0424	0.55	0.64	0.42	0.46
			 8797	 9261	 5837	 6186

Notes:
* -fnew-ra performs very well on 3.2.2-SuSE and on the new-ra 
  branch, on most others it hurts.
* hammer branch has completely broken -fnew-ra.
* tree-ssa branch yields the best results, even better with FDO.
* FDO does save considerable compile time on newer versions; but
  it seems to save too much at the cost of optimization on 3.4.0,
  3.4 newra, and 3.5 mainline; they all lose compared to non-FD
  optimization. tree-ssa and tree-ssa-lno do win with FDO.
* lno loses against tree-ssa always and against mainline unless
  FDO is used.
* 32bit is slower than 64.
* If neither FDO nor new-ra is used, 3.2.2, 334 CVS, 333-hammer, 
  and 3.4.0 are all about the same speed. 3.5.0 improves on that.

Same benchmark on EV56 (DEC21164A), 600MHz, 768MB, linked with libffm

			-fno-	-fnew-ra FDO	 FDO	
			new-ra			new-ra	
gcc-3.2.2 (SL81)	14.13	 9.47+	13.99	 9.35+
gcc-3.2.3		14.07	 -	 -	 -
gcc-334 CVS 20040409	13.90	 9.32+	13.77	 9.17+
gcc-333 hammer  0409	16.47-	13.74	16.48-	13.75-
gcc-34 newra    0409	17.19-	14.04-	ERR!!	ERR!!
gcc-3.4.0		12.79+	11.25	ERR!!	ERR!!
gcc-350 CVS 20040409	13.00	11.21	ERR!!	ERR!!
gcc-35 tree-ssa 0416	11.93+	11.82	14.76-	13.73-
gcc-35 ssa-lno  0409	13.07	15.03-	13.11	12.89

For comparison: Linked with libcpml
gcc-3.2.2 (SL8.1)	13.88	 9.28
gcc-334 CVS 20040409	13.85	 9.26	13.76	 9.16
ccc-6.5.9		 8.62

Options:
gcc: -Wall -O2 -ffast-math -fomit-frame-pointer -fschedule-insns2 -O3 \
	-frerun-loop-opt -mcpu=ev56 -funroll-loops -fstrict-aliasing
FDO: See above
ccc: -w0 -msg_display_tag -O2 -accept restrict_keyword -D__USE_STD_IOSTREAM \
	-fast -tune ev56 -arch ev56 -O4 -inline speed 

Compile time (seconds, user) and text sizes (stripped):
			-fno-	-fnew-ra FDO	 FDO	
			new-ra			new-ra	
			new-ra			new-ra	
gcc-3.2.2 (SL81)	1.96	2.19	2.02	2.21
			11456	11392	11392	11328
gcc-3.2.3		2.03
			11445
gcc-334 CVS 20040409	2.16	2.33	2.15	2.31
			11777	11393	11897	11337
gcc-333 hammer  0409	1.88	2.04	1.95	2.09
			 9960	 9800	10260	10004
gcc-34 new-ra   0409	2.01	2.48
			 9912	 9808
gcc-3.4.0		2.04	2.19
			11653	11565
gcc-350 CVS 20040409	3.01	3.32
			13415	13135
gcc-35 tree-ssa 0416	5.14	5.40	4.60	4.93
			11358	11334	 9822	9670
gcc-35 ssa-lno  0409	3.23	3.49	2.57	2.75
			12542	12398	 9806	9782
ccc-6.5.9		2.83
			11427

Notes:
* The effect of -fnew-ra is larger on AXP than on x86-64. Despite more 
  registers, the higher demand on a RISC arch seems to be causing this.
* Like x86-64, 3.2.2-SuSE does well with -fnew-ra. Unlike x86-64, it
  works well with 334-CVS but not with new-ra branch.
* Early 3.4 had been performing bad on AXP and we see this both on hammer 
  branch and new-ra branch.
* With the old register allocator, you can see improvements
  from 3.2 -> 3.3 -> 3.4 -> ssa.
  Both 3.5 and ssa-lno are a bit behind, hammer branch and new_ra
  branches suck.
* The results vary a lot on this platform which is bad news. The good
  news is that we get quite close to ccc, if the right options are
  specified and the right compiler is used.
* tree-ssa does well again. If only -fnew-ra and FDO would help it the
  same way as old 3.2.2-SuSE!

Regards,
-- 
Kurt Garloff                   <kurt@garloff.de>             [Koeln, DE]
Physics:Plasma modeling <garloff@plasimo.phys.tue.nl> [TU Eindhoven, NL]
Linux: SUSE Labs (Head)        <garloff@suse.de>    [SUSE Nuernberg, DE]

[-- Attachment #1.2: lja_speed.c --]
[-- Type: text/plain, Size: 7843 bytes --]

/* Test program for new-ra 
 * From ljapunov calc program
 * (c) Kurt Garloff, 10/2003, GNU GPL
 */ 

#define _GNU_SOURCE
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#if 1 //def __alpha__
#include <setjmp.h>
#include <signal.h>
#include <fenv.h>
#endif


#define LONG_DOUBLE long double
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)>(b)?(b):(a))
#if defined(__GNUC__) && __GNUC__ >= 3
# define LIKELY(x)	__builtin_expect((x) != 0,1) 
# define UNLIKELY(x)	__builtin_expect((x) != 0,0)
#else
# define LIKELY(x)	(x)
# define UNLIKELY(x)	(x)
#endif

#define MAXPATLEN 64
#define MAXCOLENTS 12
// Coltrafo description	

typedef struct _ct_inf {	
	float val;
	float h, s, i;
} ct_inf;

enum calcmode { FIXX, FIXB, FIXA };

typedef struct _lja_param {
	char magic[4];	/* "LJA0" */
	char rev;	/* revision */
	char cmode;	/* enum calcmode actually */
	unsigned char pass;
	unsigned char patlen;
/*0x08:*/
	double x0;
/*0x10:*/
	double amin, amax, bmin, bmax;
/*0x30:*/
	unsigned int xres, yres, bline, depth;
/*0x40:*/
	char pattern[MAXPATLEN];
/*0x80:*/
	/* Info for lja2bmp */
	float ampln, amplp;
/*0x88:*/
	char res2[55];
	char ctents;
/*0x0c0:*/
	ct_inf coltbl [MAXCOLENTS];
/*0x180:*/
} lja_parm;
	
lja_parm *ljp;
float *results;
unsigned long int nl;
// Count of A's and B's in pattern
static int na, nb;
static char *fullpat;
char quiet;
volatile sig_atomic_t interrupt;

unsigned long ctr;

#define INF 20000
#define SML 1e-24
#define DIV 160.0

const char lja_hdr_latest_rev = 2;

void lja_defaults (lja_parm * const ljp)
{
	memset (ljp, 0, sizeof (lja_parm));
	/* Ljapunov settings */
	memcpy (ljp->magic, "LJA0", 4);
	ljp->rev = lja_hdr_latest_rev; ljp->cmode = FIXX;
	ljp->pass = 0; 	ljp->patlen = 2;
	ljp->x0 = 0.4;
	ljp->amin = -2.7; ljp->amax = 4.2;
	ljp->bmin = -2.7; ljp->bmax = 4.2;
	ljp->xres = 1280; ljp->yres = 1024;
	ljp->bline = 0; ljp->depth = 256;
	memset (ljp->pattern, 0, MAXPATLEN); ljp->pattern[1] = 1;
	ljp->ampln = 0.4; ljp->amplp = 1.6;
	ljp->ctents = 0;
}

static //inline 
double extrapol_lja (const double a, const double b, const double l, 
		     const unsigned r, const unsigned dpth)
{
	++nl;
	if (UNLIKELY(l <= 0.0))
		return -INF;
	else
		return ( log(l) + r * 
			(  log (fabs (a)) * na
			 + log (fabs (b)) * nb ) ) / dpth;
}

#if 1 //def __alpha__
/* Exception handling: Will probably not work with multithreading */
jmp_buf lja_fpe_jump;

void fpe_handler (int signum, int info)
{
	if (!info)
		info = -1;
	printf ("\x1b[D!"); fflush (stdout); 
	if (lja_fpe_jump)
		longjmp (lja_fpe_jump, info);
	else
		raise (signum);
}
#endif
	
static inline
void set_pat (double* pat, const int ln, const double a, const double b)
{
	int i;
	for (i = 0; i < ln; ++i)
	       	pat[i] = (fullpat[i%ln]? b: a);
}

double ljapunov (const double a, const double b, const double xi, 
		 const unsigned dpth)
{
	int t = 0; register int p;
	const int exppln = (ljp->patlen > 16?     ljp->patlen: 
			    ljp->patlen >  8? 2 * ljp->patlen: 
			    		      4 * ljp->patlen);
	register LONG_DOUBLE l, r, x = xi;
	const int rep = dpth / exppln;
	
	double pattern[32];
	set_pat (pattern, exppln, a, b);
	//double pattern[2];
	//pattern[0] = a; pattern[1] = b;
	
	l = 1.0;
#if 1 //def __alpha__
	if (UNLIKELY(setjmp (lja_fpe_jump)))
		return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
#endif
	//asm (".align 32\n");
	for (t = 0; t < rep; ++t) {
		for (p = 0; p < exppln; ++p) {
			//r = (fullpat[p]? b: a);
			r = pattern[p];
			//r = pattern[fullpat[p]];
			x *= r*(1.0-x); 
			l *= fabs(r*(1.0-x*2.0));
		}
		++ctr;
		{
		register const double fabsx = fabs(x);
		if ( /*t >= 2 &&*/
		    UNLIKELY(fabsx > DIV || fabsx < SML) )
			return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);

		}
	}
	// Residuals ?
	t = dpth - rep*exppln; 
	for (p = 0; p < t; ++p) {
		//r = (fullpat[p]? b : a);
		r = pattern[p];
		//r = pattern[fullpat[p]];
		x *= r*(1.0-x); 
		l *= fabs (r*(1.0-x*2.0));
	}

	if (LIKELY(l > 0.0)) 
		return log (l) / dpth;
	else 
		return -INF;
/*
 out:
	return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
 */	
}

void prepare_ljapunov (const int dpth)
{
	// pattern count
	int s; na = 0; nb = 0;
	for (s = 0; s < ljp->patlen; ++s) {
		if (ljp->pattern[s])
			++nb;
		else
			++na;
	}
	for (s = 0; s < dpth; ++s)
		fullpat[s] = ljp->pattern[s%ljp->patlen];
}

void do_calc_lines (const unsigned xstep, const unsigned ystep, 
		    const unsigned depth,
		    const unsigned start, const unsigned end)
{
	unsigned ln, col;
	const double mx = (ljp->amax - ljp->amin)/ljp->xres;
	const double my = (ljp->bmax - ljp->bmin)/ljp->yres;
#ifdef __alpha__
	fenv_t fpu;
	//fegetenv(&fpu);
	//printf("FPU: 0x%016lx", fpu);
	//fesetenv(FE_DFL_ENV);
	fegetenv(&fpu);
	//printf(" 0x%016lx", fpu);
	fpu |= FE_MAP_DMZ | FE_MAP_UMZ;
	fesetenv(&fpu);
	fedisableexcept(FE_ALL_EXCEPT);
	//fegetenv(&fpu);
	//printf(" 0x%016lx\n", fpu);
#endif	
	signal (SIGFPE, (sig_t)fpe_handler);
	
	for (ln = start; ln < end; ln += ystep) {
		double y = (double)ljp->bmax - my * (ln + (double)ystep*0.5);
		float res;
		//printf ("%i (%ix%i)\n", ln, xstep, ystep);
		switch (ljp->cmode) {
		    case FIXX:
			for (col = 0; col < ljp->xres; col += xstep) {
				double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
				register unsigned ln2, col2;
				res = ljapunov (x, y, ljp->x0, depth);
				for (ln2 = ln; ln2 < MIN(ln+ystep,end); ln2++)
					for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); ++col2)
						*(results+col2+ljp->xres*ln2) = res;
			}; break;
		    case FIXA:
			for (col = 0; col < ljp->xres; col += xstep) {
				double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
				register unsigned ln2, col2;
				res = ljapunov (ljp->x0, y, x, depth); 
				for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
					for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
						*(results+col2+ljp->xres*ln2) = res;
			}; break;
		    case FIXB:
			for (col = 0; col < ljp->xres; col += xstep) {
				double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
				register unsigned ln2, col2;
				res = ljapunov (x, ljp->x0, y, depth);
				for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
					for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
						*(results+col2+ljp->xres*ln2) = res;
			}; break;
		    default: res = 0; abort ();
		}
	}
	signal (SIGFPE, SIG_DFL);
}

void calcn (const unsigned xstep, const unsigned ystep, 
	    const unsigned dpth)
{
	unsigned int y = 0; //int thr;
	const int ychunk = 32;
	unsigned yrmax = ljp->yres;
	fullpat = (char*)malloc (dpth); prepare_ljapunov (dpth);
	for (y = ljp->bline; y < yrmax && !interrupt; y += ychunk) {
		if (!quiet) { 
			printf ("."); fflush (stdout); 
		}
		/* Do the last chunk ourself */
		do_calc_lines (xstep, ystep, dpth, 
				y, y+ychunk);
	}
	if (interrupt) { 
	   	if (!quiet) 
			printf ("%i", y-1); 
	   	ljp->bline = y; 
		free (fullpat);
		return; 
	}
	if (y != yrmax)
		do_calc_lines (xstep, ystep, dpth, y, yrmax);
	//ljp->bline = ljp->yres;
	ljp -> bline = 0;
	free (fullpat);
	if (!quiet) { 
		printf ("%i\n", ljp->yres-1); fflush (stdout); 
	};
}

int main (int argc, char* argv[])
{
	clock_t st, en;
	ctr = 0;
	st = clock();
	ljp = (lja_parm*)malloc(sizeof(lja_parm));
	lja_defaults(ljp);
	results = (float*) malloc(ljp->xres * ljp->yres * sizeof (float));
	calcn (1, 1, 256);
	free(results);
	free(ljp);
	en = clock();
	printf("CPU time: %.3fs for %li iter\n", (double)(en-st)/CLOCKS_PER_SEC, ctr);
	return 0;
}



[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2004-04-24 23:23 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-04-20 22:08 branches compared (lja_speed, EV56) Kurt Garloff
2004-04-25  2:02 ` Kurt Garloff

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).