* branches compared (lja_speed, EV56)
@ 2004-04-20 22:08 Kurt Garloff
2004-04-25 2:02 ` Kurt Garloff
0 siblings, 1 reply; 2+ messages in thread
From: Kurt Garloff @ 2004-04-20 22:08 UTC (permalink / raw)
To: gcc
[-- Attachment #1.1: Type: text/plain, Size: 1706 bytes --]
Hi,
lja_speed benchmarks on my EV56-600 (DEC21164A), 768MB, KDE kicker
background load, times in seconds.
-fno-new-ra -fnew-ra -fnew-ra
-fbranch-prob
*** With FFM
gcc-3.2.2 (SL81) 14.06 9.37+ 9.24+
gcc-3.2.3 14.12
gcc-334 20040409 13.84 9.46+ 9.25+
gcc-333 ham 20040409 16.56- 13.81- 13.72
gcc-34 20040409 12.89 11.18
34 new_ra 20040409 16.67- 14.14- ERROR!
gcc-35 20040409 13.03 11.25 ERROR!
35 tree-ssa 20040416 12.01+ 11.81 14.10-
35 ssa-lno 20040409 13.08 12.64
*** With CPML
gcc-3.2.2 (SL81) 14.18 9.27+
ccc-6.5.9 8.60
Compile flags:
gcc: -Wall -O2 -ffast-math -fomit-frame-pointer -fschedule-insns2 -O3 \
-frerun-loop-opt -mcpu=ev56 -funroll-loops -fstrict-aliasing
ccc: -w0 -msg_display_tag -O2 -accept restrict_keyword \
-D__USE_STD_IOSTREAM -fast -tune ev56 -arch ev56 -O4 -inline speed
Notes:
* With the old register allocator, you can see improvements
from 3.2 -> 3.3 -> 3.4 -> ssa.
Both 3.5 and ssa-lno are a bit behind, hammer branch and new_ra
branches suck.
* With the new register allocator, 3.3.4 and 3.2.2SUSE perform best,
3.4, 3.5 are acceptable, ssa somewhat worse, ssa-lno worse
and hammer and new-ra branches suck.
* The hammer branch slowdown is probably an import from 3.4; it used
to be absent 9 month ago, then a slowdown appeared on 3.4, which is
meanwhile cured.
* FDO is broken on 3.4 and 3.5.
Regards,
--
Kurt Garloff <garloff@suse.de> Cologne, DE
SUSE LINUX AG, Nuernberg, DE SUSE Labs (Head)
[-- Attachment #1.2: lja_speed.c --]
[-- Type: text/plain, Size: 7782 bytes --]
/* Test program for new-ra
* From ljapunov calc program
* (c) Kurt Garloff, 10/2003, GNU GPL
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#if 1 //def __alpha__
#include <setjmp.h>
#include <signal.h>
#include <fenv.h>
#endif
#define LONG_DOUBLE long double
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)>(b)?(b):(a))
#if defined(__GNUC__) && __GNUC__ >= 3
# define LIKELY(x) __builtin_expect((x) != 0,1)
# define UNLIKELY(x) __builtin_expect((x) != 0,0)
#else
# define LIKELY(x) (x)
# define UNLIKELY(x) (x)
#endif
#define MAXPATLEN 64
#define MAXCOLENTS 12
// Coltrafo description
typedef struct _ct_inf {
float val;
float h, s, i;
} ct_inf;
enum calcmode { FIXX, FIXB, FIXA };
typedef struct _lja_param {
char magic[4]; /* "LJA0" */
char rev; /* revision */
char cmode; /* enum calcmode actually */
unsigned char pass;
unsigned char patlen;
/*0x08:*/
double x0;
/*0x10:*/
double amin, amax, bmin, bmax;
/*0x30:*/
unsigned int xres, yres, bline, depth;
/*0x40:*/
char pattern[MAXPATLEN];
/*0x80:*/
/* Info for lja2bmp */
float ampln, amplp;
/*0x88:*/
char res2[55];
char ctents;
/*0x0c0:*/
ct_inf coltbl [MAXCOLENTS];
/*0x180:*/
} lja_parm;
lja_parm *ljp;
float *results;
unsigned long int nl;
// Count of A's and B's in pattern
static int na, nb;
static char *fullpat;
char quiet;
volatile sig_atomic_t interrupt;
#define INF 20000
#define SML 1e-24
#define DIV 160.0
const char lja_hdr_latest_rev = 2;
void lja_defaults (lja_parm * const ljp)
{
memset (ljp, 0, sizeof (lja_parm));
/* Ljapunov settings */
memcpy (ljp->magic, "LJA0", 4);
ljp->rev = lja_hdr_latest_rev; ljp->cmode = FIXX;
ljp->pass = 0; ljp->patlen = 2;
ljp->x0 = 0.4;
ljp->amin = -2.7; ljp->amax = 4.2;
ljp->bmin = -2.7; ljp->bmax = 4.2;
ljp->xres = 1280; ljp->yres = 1024;
ljp->bline = 0; ljp->depth = 256;
memset (ljp->pattern, 0, MAXPATLEN); ljp->pattern[1] = 1;
ljp->ampln = 0.4; ljp->amplp = 1.6;
ljp->ctents = 0;
}
static //inline
double extrapol_lja (const double a, const double b, const double l,
const unsigned r, const unsigned dpth)
{
++nl;
if (UNLIKELY(l <= 0.0))
return -INF;
else
return ( log(l) + r *
( log (fabs (a)) * na
+ log (fabs (b)) * nb ) ) / dpth;
}
#if 1 //def __alpha__
/* Exception handling: Will probably not work with multithreading */
jmp_buf lja_fpe_jump;
void fpe_handler (int signum, int info)
{
if (!info)
info = -1;
printf ("\x1b[D!"); fflush (stdout);
if (lja_fpe_jump)
longjmp (lja_fpe_jump, info);
else
raise (signum);
}
#endif
static inline
void set_pat (double* pat, const int ln, const double a, const double b)
{
int i;
for (i = 0; i < ln; ++i)
pat[i] = (fullpat[i%ln]? b: a);
}
double ljapunov (const double a, const double b, const double xi,
const unsigned dpth)
{
int t = 0; register int p;
const int exppln = (ljp->patlen > 16? ljp->patlen:
ljp->patlen > 8? 2 * ljp->patlen:
4 * ljp->patlen);
register LONG_DOUBLE l, r, x = xi;
const int rep = dpth / exppln;
double pattern[32];
set_pat (pattern, exppln, a, b);
//double pattern[2];
//pattern[0] = a; pattern[1] = b;
l = 1.0;
#if 1 //def __alpha__
if (UNLIKELY(setjmp (lja_fpe_jump)))
return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
#endif
//asm (".align 32\n");
for (t = 0; t < rep; ++t) {
for (p = 0; p < exppln; ++p) {
//r = (fullpat[p]? b: a);
r = pattern[p];
//r = pattern[fullpat[p]];
x *= r*(1.0-x);
l *= fabs(r*(1.0-x*2.0));
}
{
register const double fabsx = fabs(x);
if ( /*t >= 2 &&*/
UNLIKELY(fabsx > DIV || fabsx < SML) )
return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
}
}
// Residuals ?
t = dpth - rep*exppln;
for (p = 0; p < t; ++p) {
//r = (fullpat[p]? b : a);
r = pattern[p];
//r = pattern[fullpat[p]];
x *= r*(1.0-x);
l *= fabs (r*(1.0-x*2.0));
}
if (LIKELY(l > 0.0))
return log (l) / dpth;
else
return -INF;
/*
out:
return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
*/
}
void prepare_ljapunov (const int dpth)
{
// pattern count
int s; na = 0; nb = 0;
for (s = 0; s < ljp->patlen; ++s) {
if (ljp->pattern[s])
++nb;
else
++na;
}
for (s = 0; s < dpth; ++s)
fullpat[s] = ljp->pattern[s%ljp->patlen];
}
void do_calc_lines (const unsigned xstep, const unsigned ystep,
const unsigned depth,
const unsigned start, const unsigned end)
{
unsigned ln, col;
const double mx = (ljp->amax - ljp->amin)/ljp->xres;
const double my = (ljp->bmax - ljp->bmin)/ljp->yres;
#ifdef __alpha__
fenv_t fpu;
//fegetenv(&fpu);
//printf("FPU: 0x%016lx", fpu);
//fesetenv(FE_DFL_ENV);
fegetenv(&fpu);
//printf(" 0x%016lx", fpu);
fpu |= FE_MAP_DMZ | FE_MAP_UMZ;
fesetenv(&fpu);
fedisableexcept(FE_ALL_EXCEPT);
//fegetenv(&fpu);
//printf(" 0x%016lx\n", fpu);
#endif
signal (SIGFPE, (sig_t)fpe_handler);
for (ln = start; ln < end; ln += ystep) {
double y = (double)ljp->bmax - my * (ln + (double)ystep*0.5);
float res;
//printf ("%i (%ix%i)\n", ln, xstep, ystep);
switch (ljp->cmode) {
case FIXX:
for (col = 0; col < ljp->xres; col += xstep) {
double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
register unsigned ln2, col2;
res = ljapunov (x, y, ljp->x0, depth);
for (ln2 = ln; ln2 < MIN(ln+ystep,end); ln2++)
for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); ++col2)
*(results+col2+ljp->xres*ln2) = res;
}; break;
case FIXA:
for (col = 0; col < ljp->xres; col += xstep) {
double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
register unsigned ln2, col2;
res = ljapunov (ljp->x0, y, x, depth);
for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
*(results+col2+ljp->xres*ln2) = res;
}; break;
case FIXB:
for (col = 0; col < ljp->xres; col += xstep) {
double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
register unsigned ln2, col2;
res = ljapunov (x, ljp->x0, y, depth);
for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
*(results+col2+ljp->xres*ln2) = res;
}; break;
default: res = 0; abort ();
}
}
signal (SIGFPE, SIG_DFL);
}
void calcn (const unsigned xstep, const unsigned ystep,
const unsigned dpth)
{
unsigned int y = 0; int thr;
const int ychunk = 32;
unsigned yrmax = ljp->yres;
fullpat = (char*)malloc (dpth); prepare_ljapunov (dpth);
for (y = ljp->bline; y < yrmax && !interrupt; y += ychunk) {
if (!quiet) {
printf ("."); fflush (stdout);
}
/* Do the last chunk ourself */
do_calc_lines (xstep, ystep, dpth,
y, y+ychunk);
}
if (interrupt) {
if (!quiet)
printf ("%i", y-1);
ljp->bline = y;
free (fullpat);
return;
}
if (y != yrmax)
do_calc_lines (xstep, ystep, dpth, y, yrmax);
//ljp->bline = ljp->yres;
ljp -> bline = 0;
free (fullpat);
if (!quiet) {
printf ("%i\n", ljp->yres-1); fflush (stdout);
};
}
int main (int argc, char* argv[])
{
clock_t st, en;
st = clock();
ljp = (lja_parm*)malloc(sizeof(lja_parm));
lja_defaults(ljp);
results = (float*) malloc(ljp->xres * ljp->yres * sizeof (float));
calcn (1, 1, 256);
free(results);
free(ljp);
en = clock();
printf("CPU time: %.3fs\n", (double)(en-st)/CLOCKS_PER_SEC);
return 0;
}
[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: branches compared (lja_speed, EV56)
2004-04-20 22:08 branches compared (lja_speed, EV56) Kurt Garloff
@ 2004-04-25 2:02 ` Kurt Garloff
0 siblings, 0 replies; 2+ messages in thread
From: Kurt Garloff @ 2004-04-25 2:02 UTC (permalink / raw)
To: gcc
[-- Attachment #1.1: Type: text/plain, Size: 5059 bytes --]
On Mon, Apr 19, 2004 at 01:50:57PM +0200, Kurt Garloff wrote:
> lja_speed benchmarks
Update:
lja_speed benchmarks on a quad Opteron (1400MHz), 2GB, times in seconds
(average from two best out of three).
-fno- -fnew-ra FDO FDO
new-ra new-ra
gcc-3.2.2 (SLES8) 5.56 4.32+ 5.52 4.15+
gcc-334 CVS 20040424 5.55 7.85- 5.49 7.66-
gcc-333 hammer 0424 5.51 ICE!! 5.22 ICE!!
gcc-340 newra 0424 6.26- 4.27+ 6.95- 4.46+
gcc-3.4.0 5.59 7.69- 5.82- 9.21-
gcc-350 CVS 20040424 4.59+ 7.05 5.22 8.74
gcc-350 treessa 0424 3.94+ 5.87 3.43+ 4.05+
gcc-350 ssa-lno 0424 4.98 7.61- 4.23+ 4.96
For comparison: 32bit
gcc-3.2.2 (SLES8) -m32 6.11 4.51 6.02 4.49
Options:
gcc: -Wall -O2 -ffast-math -fomit-frame-pointer -fschedule-insns2 -O3
-frerun-loop-opt -funroll-loops -fstrict-aliasing
FDO: -fprofile-arcs, run, -fbranch-probabilities (gcc-3.2/3.3/newra)
-fprofile-generate, run, -fprofile-use (gcc-3.4/3.5/ssa/lno)
Compile time (seconds, user) and text sizes (stripped):
-fno- -fnew-ra FDO FDO
new-ra new-ra
gcc-3.2.2 (SLES8) 0.28 0.29 0.27 0.31
6601 6201 6585 6169
gcc-334 CVS 20040424 0.29 0.37 0.30 0.36
7141 8389 7059 8255
gcc-333 hammer 0424 0.32 ICE! 0.29 ICE!
7145 5874
gcc-340 newra 0424 0.57 0.79 0.32 0.41
9810 9746 5544 5729
gcc-3.4.0 0.48 0.56 0.29 0.35
9381 10261 5764 6795
gcc-350 CVS 20040424 0.51 0.60 0.36 0.41
9471 9855 6287 7183
gcc-350 treessa 0424 0.80 0.91 0.71 0.79
6861 7325 5501 5901
gcc-350 ssa-lno 0424 0.55 0.64 0.42 0.46
8797 9261 5837 6186
Notes:
* -fnew-ra performs very well on 3.2.2-SuSE and on the new-ra
branch, on most others it hurts.
* hammer branch has completely broken -fnew-ra.
* tree-ssa branch yields the best results, even better with FDO.
* FDO does save considerable compile time on newer versions; but
it seems to save too much at the cost of optimization on 3.4.0,
3.4 newra, and 3.5 mainline; they all lose compared to non-FD
optimization. tree-ssa and tree-ssa-lno do win with FDO.
* lno loses against tree-ssa always and against mainline unless
FDO is used.
* 32bit is slower than 64.
* If neither FDO nor new-ra is used, 3.2.2, 334 CVS, 333-hammer,
and 3.4.0 are all about the same speed. 3.5.0 improves on that.
Same benchmark on EV56 (DEC21164A), 600MHz, 768MB, linked with libffm
-fno- -fnew-ra FDO FDO
new-ra new-ra
gcc-3.2.2 (SL81) 14.13 9.47+ 13.99 9.35+
gcc-3.2.3 14.07 - - -
gcc-334 CVS 20040409 13.90 9.32+ 13.77 9.17+
gcc-333 hammer 0409 16.47- 13.74 16.48- 13.75-
gcc-34 newra 0409 17.19- 14.04- ERR!! ERR!!
gcc-3.4.0 12.79+ 11.25 ERR!! ERR!!
gcc-350 CVS 20040409 13.00 11.21 ERR!! ERR!!
gcc-35 tree-ssa 0416 11.93+ 11.82 14.76- 13.73-
gcc-35 ssa-lno 0409 13.07 15.03- 13.11 12.89
For comparison: Linked with libcpml
gcc-3.2.2 (SL8.1) 13.88 9.28
gcc-334 CVS 20040409 13.85 9.26 13.76 9.16
ccc-6.5.9 8.62
Options:
gcc: -Wall -O2 -ffast-math -fomit-frame-pointer -fschedule-insns2 -O3 \
-frerun-loop-opt -mcpu=ev56 -funroll-loops -fstrict-aliasing
FDO: See above
ccc: -w0 -msg_display_tag -O2 -accept restrict_keyword -D__USE_STD_IOSTREAM \
-fast -tune ev56 -arch ev56 -O4 -inline speed
Compile time (seconds, user) and text sizes (stripped):
-fno- -fnew-ra FDO FDO
new-ra new-ra
new-ra new-ra
gcc-3.2.2 (SL81) 1.96 2.19 2.02 2.21
11456 11392 11392 11328
gcc-3.2.3 2.03
11445
gcc-334 CVS 20040409 2.16 2.33 2.15 2.31
11777 11393 11897 11337
gcc-333 hammer 0409 1.88 2.04 1.95 2.09
9960 9800 10260 10004
gcc-34 new-ra 0409 2.01 2.48
9912 9808
gcc-3.4.0 2.04 2.19
11653 11565
gcc-350 CVS 20040409 3.01 3.32
13415 13135
gcc-35 tree-ssa 0416 5.14 5.40 4.60 4.93
11358 11334 9822 9670
gcc-35 ssa-lno 0409 3.23 3.49 2.57 2.75
12542 12398 9806 9782
ccc-6.5.9 2.83
11427
Notes:
* The effect of -fnew-ra is larger on AXP than on x86-64. Despite more
registers, the higher demand on a RISC arch seems to be causing this.
* Like x86-64, 3.2.2-SuSE does well with -fnew-ra. Unlike x86-64, it
works well with 334-CVS but not with new-ra branch.
* Early 3.4 had been performing bad on AXP and we see this both on hammer
branch and new-ra branch.
* With the old register allocator, you can see improvements
from 3.2 -> 3.3 -> 3.4 -> ssa.
Both 3.5 and ssa-lno are a bit behind, hammer branch and new_ra
branches suck.
* The results vary a lot on this platform which is bad news. The good
news is that we get quite close to ccc, if the right options are
specified and the right compiler is used.
* tree-ssa does well again. If only -fnew-ra and FDO would help it the
same way as old 3.2.2-SuSE!
Regards,
--
Kurt Garloff <kurt@garloff.de> [Koeln, DE]
Physics:Plasma modeling <garloff@plasimo.phys.tue.nl> [TU Eindhoven, NL]
Linux: SUSE Labs (Head) <garloff@suse.de> [SUSE Nuernberg, DE]
[-- Attachment #1.2: lja_speed.c --]
[-- Type: text/plain, Size: 7843 bytes --]
/* Test program for new-ra
* From ljapunov calc program
* (c) Kurt Garloff, 10/2003, GNU GPL
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#if 1 //def __alpha__
#include <setjmp.h>
#include <signal.h>
#include <fenv.h>
#endif
#define LONG_DOUBLE long double
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)>(b)?(b):(a))
#if defined(__GNUC__) && __GNUC__ >= 3
# define LIKELY(x) __builtin_expect((x) != 0,1)
# define UNLIKELY(x) __builtin_expect((x) != 0,0)
#else
# define LIKELY(x) (x)
# define UNLIKELY(x) (x)
#endif
#define MAXPATLEN 64
#define MAXCOLENTS 12
// Coltrafo description
typedef struct _ct_inf {
float val;
float h, s, i;
} ct_inf;
enum calcmode { FIXX, FIXB, FIXA };
typedef struct _lja_param {
char magic[4]; /* "LJA0" */
char rev; /* revision */
char cmode; /* enum calcmode actually */
unsigned char pass;
unsigned char patlen;
/*0x08:*/
double x0;
/*0x10:*/
double amin, amax, bmin, bmax;
/*0x30:*/
unsigned int xres, yres, bline, depth;
/*0x40:*/
char pattern[MAXPATLEN];
/*0x80:*/
/* Info for lja2bmp */
float ampln, amplp;
/*0x88:*/
char res2[55];
char ctents;
/*0x0c0:*/
ct_inf coltbl [MAXCOLENTS];
/*0x180:*/
} lja_parm;
lja_parm *ljp;
float *results;
unsigned long int nl;
// Count of A's and B's in pattern
static int na, nb;
static char *fullpat;
char quiet;
volatile sig_atomic_t interrupt;
unsigned long ctr;
#define INF 20000
#define SML 1e-24
#define DIV 160.0
const char lja_hdr_latest_rev = 2;
void lja_defaults (lja_parm * const ljp)
{
memset (ljp, 0, sizeof (lja_parm));
/* Ljapunov settings */
memcpy (ljp->magic, "LJA0", 4);
ljp->rev = lja_hdr_latest_rev; ljp->cmode = FIXX;
ljp->pass = 0; ljp->patlen = 2;
ljp->x0 = 0.4;
ljp->amin = -2.7; ljp->amax = 4.2;
ljp->bmin = -2.7; ljp->bmax = 4.2;
ljp->xres = 1280; ljp->yres = 1024;
ljp->bline = 0; ljp->depth = 256;
memset (ljp->pattern, 0, MAXPATLEN); ljp->pattern[1] = 1;
ljp->ampln = 0.4; ljp->amplp = 1.6;
ljp->ctents = 0;
}
static //inline
double extrapol_lja (const double a, const double b, const double l,
const unsigned r, const unsigned dpth)
{
++nl;
if (UNLIKELY(l <= 0.0))
return -INF;
else
return ( log(l) + r *
( log (fabs (a)) * na
+ log (fabs (b)) * nb ) ) / dpth;
}
#if 1 //def __alpha__
/* Exception handling: Will probably not work with multithreading */
jmp_buf lja_fpe_jump;
void fpe_handler (int signum, int info)
{
if (!info)
info = -1;
printf ("\x1b[D!"); fflush (stdout);
if (lja_fpe_jump)
longjmp (lja_fpe_jump, info);
else
raise (signum);
}
#endif
static inline
void set_pat (double* pat, const int ln, const double a, const double b)
{
int i;
for (i = 0; i < ln; ++i)
pat[i] = (fullpat[i%ln]? b: a);
}
double ljapunov (const double a, const double b, const double xi,
const unsigned dpth)
{
int t = 0; register int p;
const int exppln = (ljp->patlen > 16? ljp->patlen:
ljp->patlen > 8? 2 * ljp->patlen:
4 * ljp->patlen);
register LONG_DOUBLE l, r, x = xi;
const int rep = dpth / exppln;
double pattern[32];
set_pat (pattern, exppln, a, b);
//double pattern[2];
//pattern[0] = a; pattern[1] = b;
l = 1.0;
#if 1 //def __alpha__
if (UNLIKELY(setjmp (lja_fpe_jump)))
return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
#endif
//asm (".align 32\n");
for (t = 0; t < rep; ++t) {
for (p = 0; p < exppln; ++p) {
//r = (fullpat[p]? b: a);
r = pattern[p];
//r = pattern[fullpat[p]];
x *= r*(1.0-x);
l *= fabs(r*(1.0-x*2.0));
}
++ctr;
{
register const double fabsx = fabs(x);
if ( /*t >= 2 &&*/
UNLIKELY(fabsx > DIV || fabsx < SML) )
return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
}
}
// Residuals ?
t = dpth - rep*exppln;
for (p = 0; p < t; ++p) {
//r = (fullpat[p]? b : a);
r = pattern[p];
//r = pattern[fullpat[p]];
x *= r*(1.0-x);
l *= fabs (r*(1.0-x*2.0));
}
if (LIKELY(l > 0.0))
return log (l) / dpth;
else
return -INF;
/*
out:
return extrapol_lja (a, b, l, (dpth-t*exppln)/ljp->patlen, dpth);
*/
}
void prepare_ljapunov (const int dpth)
{
// pattern count
int s; na = 0; nb = 0;
for (s = 0; s < ljp->patlen; ++s) {
if (ljp->pattern[s])
++nb;
else
++na;
}
for (s = 0; s < dpth; ++s)
fullpat[s] = ljp->pattern[s%ljp->patlen];
}
void do_calc_lines (const unsigned xstep, const unsigned ystep,
const unsigned depth,
const unsigned start, const unsigned end)
{
unsigned ln, col;
const double mx = (ljp->amax - ljp->amin)/ljp->xres;
const double my = (ljp->bmax - ljp->bmin)/ljp->yres;
#ifdef __alpha__
fenv_t fpu;
//fegetenv(&fpu);
//printf("FPU: 0x%016lx", fpu);
//fesetenv(FE_DFL_ENV);
fegetenv(&fpu);
//printf(" 0x%016lx", fpu);
fpu |= FE_MAP_DMZ | FE_MAP_UMZ;
fesetenv(&fpu);
fedisableexcept(FE_ALL_EXCEPT);
//fegetenv(&fpu);
//printf(" 0x%016lx\n", fpu);
#endif
signal (SIGFPE, (sig_t)fpe_handler);
for (ln = start; ln < end; ln += ystep) {
double y = (double)ljp->bmax - my * (ln + (double)ystep*0.5);
float res;
//printf ("%i (%ix%i)\n", ln, xstep, ystep);
switch (ljp->cmode) {
case FIXX:
for (col = 0; col < ljp->xres; col += xstep) {
double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
register unsigned ln2, col2;
res = ljapunov (x, y, ljp->x0, depth);
for (ln2 = ln; ln2 < MIN(ln+ystep,end); ln2++)
for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); ++col2)
*(results+col2+ljp->xres*ln2) = res;
}; break;
case FIXA:
for (col = 0; col < ljp->xres; col += xstep) {
double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
register unsigned ln2, col2;
res = ljapunov (ljp->x0, y, x, depth);
for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
*(results+col2+ljp->xres*ln2) = res;
}; break;
case FIXB:
for (col = 0; col < ljp->xres; col += xstep) {
double x = (double)ljp->amin + mx * (col + (double)xstep*0.5);
register unsigned ln2, col2;
res = ljapunov (x, ljp->x0, y, depth);
for (ln2 = ln; ln2 < MIN(ln+ystep,end); ++ln2)
for (col2 = col; col2 < MIN(col+xstep,ljp->xres-1); col2++)
*(results+col2+ljp->xres*ln2) = res;
}; break;
default: res = 0; abort ();
}
}
signal (SIGFPE, SIG_DFL);
}
void calcn (const unsigned xstep, const unsigned ystep,
const unsigned dpth)
{
unsigned int y = 0; //int thr;
const int ychunk = 32;
unsigned yrmax = ljp->yres;
fullpat = (char*)malloc (dpth); prepare_ljapunov (dpth);
for (y = ljp->bline; y < yrmax && !interrupt; y += ychunk) {
if (!quiet) {
printf ("."); fflush (stdout);
}
/* Do the last chunk ourself */
do_calc_lines (xstep, ystep, dpth,
y, y+ychunk);
}
if (interrupt) {
if (!quiet)
printf ("%i", y-1);
ljp->bline = y;
free (fullpat);
return;
}
if (y != yrmax)
do_calc_lines (xstep, ystep, dpth, y, yrmax);
//ljp->bline = ljp->yres;
ljp -> bline = 0;
free (fullpat);
if (!quiet) {
printf ("%i\n", ljp->yres-1); fflush (stdout);
};
}
int main (int argc, char* argv[])
{
clock_t st, en;
ctr = 0;
st = clock();
ljp = (lja_parm*)malloc(sizeof(lja_parm));
lja_defaults(ljp);
results = (float*) malloc(ljp->xres * ljp->yres * sizeof (float));
calcn (1, 1, 256);
free(results);
free(ljp);
en = clock();
printf("CPU time: %.3fs for %li iter\n", (double)(en-st)/CLOCKS_PER_SEC, ctr);
return 0;
}
[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2004-04-24 23:23 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-04-20 22:08 branches compared (lja_speed, EV56) Kurt Garloff
2004-04-25 2:02 ` Kurt Garloff
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).