Understand the running time of a program compiled with GCC

public inbox for gcc-help@gcc.gnu.org
 help / color / mirror / Atom feed

* Understand the running time of a program compiled with GCC
@ 2010-03-10  3:08 Da Zheng
  2010-03-10 12:04 ` John (Eljay) Love-Jensen
  0 siblings, 1 reply; 3+ messages in thread
From: Da Zheng @ 2010-03-10  3:08 UTC (permalink / raw)
  To: gcc-help

[-- Attachment #1: Type: text/plain, Size: 1394 bytes --]

Hello,

I tried to optimize a small parallel program, which is parallelized with OpenMP.
The program itself does operations on arrays, and the optimization I did was to
reduce cache misses.

At first, I compiled the program with GCC under Linux with -O3 enabled. I ran
the program on a quad-core machine, but the running time wasn't very stable. I
thought this was because my program was interrupted by other processes or because
its threads were scheduled onto other cores. So I ran the program with each thread
attached to a CPU and with the highest real-time priority in Linux. But the running
time still varied from 60 to 90 milliseconds. I couldn't find any reason to explain
why the running time could vary so much, but I wasn't really surprised
by the result until I used the Intel C compiler.

After I compiled the code with Intel's compiler, the running time was always
about 40 ms. The performance improvement isn't surprising to me, but I don't know
why the running time no longer varies. At first, I thought the variation might
be caused by cache misses. However, when I profiled the program with AMD CodeAnalyst, I
didn't see many cache misses in either binary executable.

Since I'm doing optimization, I hope to find out the reason. Can anyone tell me
what is the possible reason that can cause time difference?

I also attach the program in case someone would like to take a look.

Thank you,
Zheng Da

[-- Attachment #2: MORGAN2.c --]
[-- Type: text/plain, Size: 23817 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "Aplc.h"
#include "Apl3lib.h"
/*
 * File-scope constants and scratch storage shared by main() and MORGAN2().
 * Almost every temporary used by the generated code lives here as a global,
 * so the functions below communicate through these variables rather than
 * through locals.
 */
/* Size passed to amalloc() for `heap` in main(); also the initial value of r15. */
#define HEAPSIZE 1048576
/* Element count of bvbf[]. */
#define BVBFLE32 512
/* IVBFLE and EVBFLE are not used by the code shown in this file. */
#define IVBFLE 16384
#define EVBFLE 16384
/* Size in bytes of cvbf[], the character buffer main() builds file names in. */
#define CVBFLE 8192
/* minn, QDCT and QDPP are not used by the code shown in this file. */
#define minn -7.237005577332262e75
#define QDCT 1e-13
/* Initial value of rseed. */
#define QDRL 16807
#define QDPP 7
int c2[2];
/* NOTE(review): 0x80000000 does not fit in a 32-bit signed int, so this
 * initialisation relies on an implementation-defined conversion -- confirm. */
int  lf1= 0x80000000;
int  bvbf[BVBFLE32];
char cvbf[CVBFLE];
/* Integer scratch variables; r0 carries element counts, new holds the result
 * of the `r0 > x.maxl` tests, time1/time2 bracket the MORGAN2 call in main(). */
int  r0, r1, r2, r8, t, n, i, j, new, time1, time2, th;
int  w0, w1, w2, w3, w4, w5, w6, u0, u1, u2, u3, u4, u5, u6;
int  q0, q1, q2, q3, q4, q5, q6, v0, v1, v2, v3, c6;
int  l0, l1, l2, l3, l4, l5, l6, m0, m1, m2, m3, m4, m5, m6;
int  qw, qi, sw, si, s0, s1, s2, s3, s4, s5, s6, freti;
double d, fretf, epsln;
char fretc;
/* Index into heap[] (used as &heap[r15]); presumably moved by the inchp2
 * macro, which is not defined in this file. */
int  r15= HEAPSIZE;
/* initf is set to 1 by main() and cleared by MORGAN2() the first time it runs. */
int  rseed= QDRL, initf;
unsigned int tl,ttl,wa,wb;
unsigned int * twp;
/* v4 and v7 start out pointing at the static buffers bvbf and cvbf. */
int * v4= bvbf;
char * v7= cvbf;
/* v5 and v6 receive amalloc() buffers in main(). */
int * fretip, * v5;
/* fretfp receives the result pointer (v18.valp) at the end of MORGAN2(). */
double * fretfp, * v6;
/* heap is the amalloc()'d arena; rop is set to the right-argument data in it. */
char * fretcp, * lop, * rop, * cad0, * heap;
/* lshp/rshp: main() reads the rank into element 0 and the dimensions into the
 * following elements; rshp[0] is then overwritten with the element count.
 * cad[], dl[] and tdl[] are scratch arrays used by MORGAN2(). */
int lshp[8], rshp[8], cad[50], dl[7], tdl[7], vdl[7];
int * p0, * p1, * lo0, * lo1, * ro0,* ro1,* t0,* t1,* p10,* p11,*p12;
int * lo13, * lo10,* ro13, * ro10, * lo11, * ro11,* lo12,* ro12;
int * id0, * id1, * id2, * id3, * id4, * id5, * id6;
/* Note: the global p20 is shadowed by MORGAN2's parameter of the same name. */
double * p2,* t2,* lo2,* ro2,* p20,* p21,* p22,* lo24,* ro24,* lo25;
double * lo20,* ro20,* lo21,* ro21,* lo22,* ro22,* lo23,* ro23,* ro25;
char *ro30, * p3, *t3, * lo3, * ro3, *pTarg, *pSrc, *pBuff0, *pBuff1;
/* Array descriptors; STOFM2/STOFM3 are not defined in this file (presumably
 * in Aplc.h or Apl3lib.h). MORGAN2() sets each one's maxl to -1 on entry. */
STOFM2 v49;
STOFM3 v50;
STOFM2 v51;
STOFM3 v52;
STOFM2 v53;
STOFM3 v54;
STOFM3 v28;

/*
 * Consume the rest of the current line of fp.
 *
 * Stops at end-of-file as well as at '\n', so a header that lacks a trailing
 * newline cannot make the caller loop forever.
 */
static void skip_rest_of_line(FILE *fp)
{
	int ch;

	do {
		ch = fgetc(fp);
	} while (ch != '\n' && ch != EOF);
}

/*
 * Report an unusable argument file on stderr and terminate with status 99,
 * the same status main() uses for its other input errors.
 *
 * side: "left" or "right"; problem: short description of what went wrong.
 */
static void bad_arg_file(const char *side, const char *problem)
{
	fprintf(stderr, "The %s argument file %s.\n", side, problem);
	exit(99);
}

/*
 * Open one argument file through a_fopen().
 *
 * stem          base name given on the command line, or NULL for the default
 * default_name  file name used when stem is NULL
 * suffix        extension (including the dot) appended to stem
 * side          "left" or "right", used only in error messages
 *
 * When stem is given, the name is built in the global buffer cvbf; a name
 * that does not fit is rejected instead of overflowing the buffer.
 * Returns the open stream; does not return on failure.
 */
static FILE *open_arg_file(char *stem, char *default_name, char *suffix,
			   const char *side)
{
	FILE *fp;
	int len;

	if (stem == NULL)
		fp = a_fopen(default_name, "r");
	else {
		len = snprintf(cvbf, sizeof cvbf, "%s%s", stem, suffix);
		if (len < 0 || (size_t) len >= sizeof cvbf)
			bad_arg_file(side, "name is too long");
		fp = a_fopen(cvbf, "r");
	}
	if (fp == NULL)
		bad_arg_file(side, "could not be opened");
	return fp;
}

/*
 * Program entry point.
 *
 * Sets up the heap and the OpenMP threads, reads the headers of the left
 * (<stem>.LEF, type 'I', rank 0) and right (<stem>.RIG, type 'E', rank 3)
 * argument files, then times one call of MORGAN2() and prints the elapsed
 * time to stderr.  <stem> is argv[1], or "MORGAN" when no argument is given.
 *
 * Exits with status 99 on any input problem and with status 0 otherwise.
 */
int main(int argc, char *argv[])
{
	/* argv[1] is only looked at when it is guaranteed to exist. */
	char *stem = (argc > 1) ? argv[1] : NULL;
	char  ltype, rtype;
	int  g19;
	int lleng= 1;
	FILE *fpl, *fpr;
	extern int apl_sec();

	init();
	initf= 1;
	heap=amalloc(HEAPSIZE);
	rt_prio();
#pragma omp parallel
	{
		printf("There are %d threads\n", omp_get_num_threads ());
		if (omp_get_num_threads() > 1)
			thread_attach_cpu(omp_get_thread_num());
	}
	v5=amalloc(65536); v6=amalloc(13107200);

	/* ---- left argument: type, rank, (no dimensions), one integer ---- */
	fpl = open_arg_file(stem, "MORGAN.LEF", ".LEF", "left");
	if (fscanf(fpl,"%c %d",&ltype,&lshp[0]) != 2)
		bad_arg_file("left", "has a malformed header");
	if( ltype!= 'I')  {
		printf("The type of the left arguement is mismatched.\n"); exit(99);}
	if( lshp[0]!= 0)  {
		printf("The rank of the left arguement is mismatched.\n"); exit(99);}
	/* The rank was just checked to be 0, so this loop reads nothing. */
	for( i = 1; i < 1 + lshp[0]; ++i)  {
		if (fscanf(fpl," %d",&lshp[i]) != 1)
			bad_arg_file("left", "has a malformed header");
		lleng *= lshp[i];}
	skip_rest_of_line(fpl);
	if (fscanf(fpl,"%d",&g19) != 1)
		bad_arg_file("left", "has no integer value");
	fclose(fpl);

	/* ---- right argument: type, rank 3, three dimensions ---- */
	lleng = 1;
	fpr = open_arg_file(stem, "MORGAN.RIG", ".RIG", "right");
	if (fscanf(fpr,"%c %d",&rtype,&rshp[0]) != 2)
		bad_arg_file("right", "has a malformed header");
	if( rtype!= 'E')  {
		printf("The type of the right arguement is mismatched.\n"); exit(99);}
	if( rshp[0]!= 3)  {
		printf("The rank of the right arguement is mismatched.\n"); exit(99);}
	for( i = 1; i < 1 + rshp[0]; ++i)  {
		if (fscanf(fpr," %d",&rshp[i]) != 1)
			bad_arg_file("right", "has a malformed header");
		lleng *= rshp[i];}
	skip_rest_of_line(fpr);
	/* rshp[0] now holds the total element count instead of the rank. */
	rshp[0]= r0 = lleng;inchp2;
	rop= &heap[r15]; p2 = (double *) rop;
	/* Reading the element values is disabled; the data area is used as is. */
//	for(i = 0; i < lleng; ++i)  {
//		fscanf(fpr,"%lf",p2);
//		++p2;
//	}
	fclose(fpr);

	/*
	 * ro2 is never assigned in main(), so it is passed with its initial
	 * (null) value; with initf == 1 MORGAN2 takes its data from rop and
	 * does not read this parameter.
	 */
	time1 = apl_sec();
	MORGAN2  (g19,ro2);
	time2 = apl_sec();
	fprintf(stderr, "\n execution time in ms %d\n",time2-time1);
	/*   OUTPUT generates the following code */
//	p2 = fretfp;
//	w0 = cad[3];
//	w1= cad[1]/w0;
//	for (u0=0; u0<w1; u0++)  {
//		if(w0<11) for(v1=0;v1<w0;v1++){a_prtD0(*p2);++p2;}
//		else for (v1=0;v1<w0;v1++)
//		{a_prtD0(*p2); ++p2; if ((v1%10)==9) printf("\n");}
//		putchar('\n');
//	}
//	putchar('\n');
	/* NOTE(review): "pause" is a Windows shell command; on other systems
	 * this call presumably just fails -- confirm it is still wanted. */
	free(heap); free(v5); free(v6); system("pause"); exit(0);
}
				/*              CODE SEGMENT FOR FUNCTION MORGAN2       */
/*
 * MORGAN2 -- generated code for the source lines marked "LINE 1" .. "LINE 9".
 *
 * Parameters (K&R style definition):
 *   v19  integer left argument.  It is used as the divisor in the final
 *        computation and to derive the PFDROP offsets (-v19 and v19-1).
 *   p20  right argument data.  It is read only when initf != 1; on the first
 *        call (initf == 1) the data already placed at `rop` is used instead.
 *
 * Result: nothing is returned through the return value; on exit fretfp points
 * at v18.valp, cad[1] holds v18.reall and cad[2..3] hold v18.dims[0..1].
 *
 * NOTE(review): the function has no declared return type (implicit int) and
 * no return statement.
 * NOTE(review): INIT_STARTP, inchp2, FORAXIS0_CACHE, FORAXIS1, FORALL2_CACHE,
 * DVAL, SLICE_DVAL, DROPPED_DVAL, CD_DVAL, unlikely and max are not defined in
 * this file (presumably they come from Aplc.h / Apl3lib.h); comments below
 * that describe what they do are assumptions to be confirmed.
 */
MORGAN2  (v19,p20)
	int  v19;
	double* p20;
{
	int num = 0;
	/* NOTE(review): num_threads is not declared in this file; cache_start is
	 * only zeroed below, its other uses are commented out. */
	int cache_start[num_threads];
	int max_cache_num;
	int start_time;
	int cache_idx;
	char v8, v11;
	int oldr15, v9, v12, v13, v17;
	double v10, v14, v15, v16;
	STOFM3 v18;
	STOFM3 v21;
	STOFM3 v22;
	STOFM3 v31;
	STOFM3 v32;
	STOFM3 v34;
	STOFM3 v35;
	STOFM3 v36;
	STOFM3 v37;
	STOFM3 v38;
	STOFM3 v39;
	STOFM3 v40;
	STOFM3 v41;
	STOFM3 v42;
	STOFM3 v43;
	STOFM3 v44;
	STOFM3 v45;
	STOFM3 v46;
	STOFM3 v47;
	STOFM3 v48;
	STOFM3 v20;
	/* maxl = -1 marks a descriptor as having no storage yet: the later
	 * `r0 > x.maxl` tests are then true for any r0 >= 0. */
	v18.maxl = -1;
	v49.maxl= -1; v50.maxl= -1; v51.maxl= -1;
	v52.maxl= -1; v53.maxl= -1; v54.maxl= -1;
	v28.maxl = -1;
	v21.maxl = -1;
	v22.maxl = -1;
	v31.maxl = -1;
	v32.maxl = -1;
	v34.maxl = -1;
	v35.maxl = -1;
	v36.maxl = -1;
	v37.maxl = -1;
	v38.maxl = -1;
	v39.maxl = -1;
	v40.maxl = -1;
	v41.maxl = -1;
	v42.maxl = -1;
	v43.maxl = -1;
	v44.maxl = -1;
	v45.maxl = -1;
	v46.maxl = -1;
	v47.maxl = -1;
	v48.maxl = -1;
	memset(cache_start, 0, sizeof (cache_start));
	/* Describe the right argument in v20: rshp[0] is its element count and
	 * rshp[1..3] are its three dimensions. */
	r0= rshp[0];
	for (v1=0; v1<3; v1++)
		v20.dims[v1+0]=rshp[v1+1];
	if (initf == 1)
	{
		/* First call: use the data the caller left at rop. */
		v20.valp= rop;
		v20.reall= v20.maxl= r0;
		initf= 0;
	} else {
		/* Later calls: copy r0 doubles from the parameter p20 into the heap. */
		v20.reall= v20.maxl= r0;
		inchp2; v20.valp= &heap[r15];
		ro2= (double *) v20.valp;
		for (v1=0; v1<r0; v1++)
			ro2[v1]= p20[v1];
	}
	new=0;
	/******************   LINE 1   ******************/
	/* INDEXV generates the following code */
	/* Copies the first dims[1] x dims[2] plane of v20 (offset n = 0) into
	 * v21.  dl[] holds the strides: dl[2] = 1, dl[1] = dims[2],
	 * dl[0] = dims[1]*dims[2]. */
	lo2 = (double *) v20.valp;
	dl[2]= t = 1;
	for (u0=0; u0<2; u0++)
		t= dl[1-u0]= t*v20.dims[2-u0];
	cad[2]= v20.dims[1];
	cad[3]= v20.dims[2];
	n=0;
	n += dl[0]*(1-1);
	tdl[0] =v20.dims[2];
	r0= tdl[0]*v20.dims[1];
	cad[1]=r0;
	INIT_STARTP(v21);
	/* `new` receives the result of the comparison r0 > v21.maxl. */
	if (new=r0>v21.maxl) {
		v21.maxl=r0;
		inchp2;cad0= &heap[r15];
	}
	/* Only the cad0 assignment belongs to the else; p2 is always set. */
	else cad0= v21.valp; p2 = (double *) cad0;
	if (r0==0) goto l2;
	for (u0=0; u0<v20.dims[1]; u0++)
	{
		w0 = dl[1]*u0;
		q0 = tdl[0]*u0;
		for (u1=0; u1<v20.dims[2]; u1++)
			p2[q0+u1] = lo2[n+w0+dl[2]*u1];
	}
l2:
	for (v1=0; v1<2; v1++)
		v21.dims[v1+0]=  cad[v1+2];
	v21.valp = cad0;
	v21.reall= cad[1];
	v21.curr_rows = NULL;
	/******************   LINE 2   ******************/
	/* INDEXV generates the following code */
	/* Same as LINE 1 but with offset n = dl[0], i.e. the second plane of
	 * v20, copied into v22. */
	lo2 = (double *) v20.valp;
	dl[2]= t = 1;
	for (u0=0; u0<2; u0++)
		t= dl[1-u0]= t*v20.dims[2-u0];
	cad[2]= v20.dims[1];
	cad[3]= v20.dims[2];
	n=0;
	n += dl[0]*(2-1);
	tdl[0] =v20.dims[2];
	r0= tdl[0]*v20.dims[1];
	cad[1]=r0;
	INIT_STARTP(v22);
	if (new=r0>v22.maxl) {
		v22.maxl=r0;
		inchp2;cad0= &heap[r15];
	}
	else cad0= v22.valp;
	p2 = (double *) cad0;
	if (r0==0) goto l3;
	for (u0=0; u0<v20.dims[1]; u0++)
	{
		w0 = dl[1]*u0;
		q0 = tdl[0]*u0;
		for (u1=0; u1<v20.dims[2]; u1++)
			p2[q0+u1] = lo2[n+w0+dl[2]*u1];
	}
l3:
	for (v1=0; v1<2; v1++)
		v22.dims[v1+0]=  cad[v1+2];
	v22.valp = cad0;
	v22.reall= cad[1];
	v22.curr_rows = NULL;
	max_cache_num = 200; // TODO
	int init = 0;
	int local_first = 1;
	start_time = apl_sec();
	/*
	 * Parallel loop over max_cache_num slices.
	 * Data sharing as written in the pragma: cache_idx, v1, v2 and d are
	 * private, local_first is firstprivate (every thread starts with 1) and
	 * default(shared) makes everything else shared -- including the locals
	 * num, init and v9 and the file-scope scratch variables (r0, r1, t,
	 * new, cad[], dl[], tdl[], p2, lo2, ro2, ...).
	 * NOTE(review): several of those shared objects are written inside the
	 * loop with no synchronisation visible in this file; see the notes at
	 * the `local_first` blocks and at `num++`.
	 */
#pragma omp parallel for default(shared) private(cache_idx, v1, v2, d) firstprivate(local_first)
	for(cache_idx = 0; cache_idx < max_cache_num; cache_idx++)
	{
		/*
		 * These descriptors are block-scope objects of the loop body; only
		 * maxl is initialised here, on every iteration.
		 * NOTE(review): v23..v27 and v33 are filled in only under
		 * `if (local_first)` further down but are used on every iteration,
		 * so later iterations of a thread appear to rely on the previous
		 * iteration's contents still being in place, which C does not
		 * guarantee for objects declared inside the loop body -- confirm.
		 */
		int slice;
		STOFM3 v23;
		STOFM3 v24;
		STOFM3 v25;
		STOFM3 v26;
		STOFM3 v27;
		STOFM3 v33;
		v23.maxl = -1;
		v24.maxl = -1;
		v25.maxl = -1;
		v26.maxl = -1;
		v27.maxl = -1;
		v33.maxl = -1;

		if (local_first)
		{
//#pragma omp critical
//			{
//				cache_start[omp_get_thread_num()] = cache_idx;
//			}
//#ifdef DEBUG
//			fprintf (stderr, "thread %d starts at %d\n", omp_get_thread_num(), cache_idx);
//#endif
		}

		/*
		 * Every iteration other than cache_idx 0 busy-waits here until
		 * `init` is non-zero.  init is set to 1 at the bottom of the loop
		 * body, so the shared set-up done under `cache_idx == 0` below is
		 * finished before any other iteration proceeds.
		 * NOTE(review): init is a plain int and the store at the bottom of
		 * the body is not followed by a flush in this file -- confirm the
		 * waiters are guaranteed to observe it.
		 */
		if (cache_idx && init == 0)
		{
			while (init == 0)
			{
#pragma omp flush(init)
			}
		}
		/******************   LINE 3   ******************/
		/******************   LINE 4   ******************/
		/* Iteration 0 only: shape the shared descriptor v28 like v21 and
		 * obtain storage for it. */
		if (cache_idx == 0)
		{
			/*   GENSCAN generates the following code */
			p2 = (double *) v28.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v21.dims[v1+0];
			r0 =   v21.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			if (r0==0) goto l4;
		}
		/* Running sum of v21 along the inner axis, stored into v28 (d is the
		 * accumulator, reset for each outer step). */
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v21);
//			SET_ROW(v28, v1);
		}

l4:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		/* Iteration 0 only: v31 becomes a view of v28 (same valp) with
		 * |v5[1]| = v19 columns dropped; v5[1] is negative here, so
		 * startp[1] stays 0. */
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3]          ;
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v31);
			v31.source = &v28;
			if (v5[0] > 0)
				v31.startp[0] = v5[0];
			else
				v31.startp[0] = 0;
			if (v5[1] > 0)
				v31.startp[1] = v5[1];
			else
				v31.startp[1] = 0;
			v31.dims[0] = cad[2];
			v31.dims[1] = cad[3];
			memcpy (v31.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v31.valp = v28.valp;
			v31.reall= v31.dims[0] * v31.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		/* Iteration 0 only: v46 becomes a view of v28 with v19-1 columns
		 * dropped; startp[1] is v19-1 when that is positive. */
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v46);
			v46.source = &v28;
			if (v5[0] > 0)
				v46.startp[0] = v5[0];
			else
				v46.startp[0] = 0;
			if (v5[1] > 0)
				v46.startp[1] = v5[1];
			else
				v46.startp[1] = 0;
			v46.dims[0] = cad[2];
			v46.dims[1] = cad[3];
			memcpy (v46.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v46.valp = v28.valp;
			v46.reall= v46.dims[0] * v46.dims[1];
		}

		/*
		 * First iteration executed by this thread: size v23 like v46 and
		 * obtain storage for it.
		 * NOTE(review): this block (and the other `local_first` blocks
		 * below) writes the shared globals r0, cad[1] and new, and inchp2
		 * presumably moves the shared heap index r15; several threads can
		 * be here at the same time -- confirm this is safe.
		 */
		if (local_first)
		{
			r0=v46.reall;
			INIT_STARTP(v23);
			for (v1=0; v1<2; v1++)
				v23.dims[v1+0]=v46.dims[v1+0];
			memcpy(v23.orig_dims, v23.dims, sizeof(v23.dims));
			cad[1]=r0;
			if (new=r0>v23.maxl) {
				v23.maxl=r0;
				inchp2;v23.valp= &heap[r15];
			}
			v23.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}

		/* v23 = v46 element minus the v31 element one position back, except
		 * at v2 == 0 where the v46 element is taken as is (presumably a
		 * windowed sum built from the running sums -- confirm macros). */
		FORAXIS0_CACHE(v23) /* code for - */
		{
//			while (!ROW_AVAIL(&v46, v1));
//			while (!ROW_AVAIL(&v31, v1));

			FORAXIS1(v23)
			{
				if (v2 == 0)
					SLICE_DVAL(v23) = DROPPED_DVAL(v46);
				else
					SLICE_DVAL(v23) = DROPPED_DVAL(v46) - CD_DVAL(v31, -1);
			}
//			SET_ROW(v23, v1);
		}
		/******************   LINE 5   ******************/
		/*   GENSCAN generates the following code */
		/* Same sequence as LINE 4, applied to v22: running sum into v28,
		 * views v32 / v47, windowed difference into v25. */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v22.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v22.dims[v1+0];
			r0 =   v22.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l5;
			r0=v22.dims[1];
			r1=v22.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v22);
		}
l5:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v32);
			if (v5[0] > 0)
				v32.startp[0] = v5[0];
			else
				v32.startp[0] = 0;
			if (v5[1] > 0)
				v32.startp[1] = v5[1];
			else
				v32.startp[1] = 0;
			v32.dims[0] = cad[2];
			v32.dims[1] = cad[3];
			memcpy (v32.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v32.valp = v28.valp;
			v32.reall= v32.dims[0] * v32.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v47);
			if (v5[0] > 0)
				v47.startp[0] = v5[0];
			else
				v47.startp[0] = 0;
			if (v5[1] > 0)
				v47.startp[1] = v5[1];
			else
				v47.startp[1] = 0;
			v47.dims[0] = cad[2];
			v47.dims[1] = cad[3];
			memcpy (v47.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v47.valp = v28.valp;
			v47.reall= v47.dims[0] * v47.dims[1];
		}

		if (local_first)
		{
			r0=v47.reall;
			INIT_STARTP(v25);
			for (v1=0; v1<2; v1++)
				v25.dims[v1+0]=v47.dims[v1+0];
			memcpy(v25.orig_dims, v25.dims, sizeof(v25.dims));
			cad[1]=r0;
			if (new=r0>v25.maxl) {
				v25.maxl=r0;
				inchp2;v25.valp= &heap[r15];
			}
			v25.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v25) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v25) = DROPPED_DVAL(v47);
			else
				SLICE_DVAL(v25) = DROPPED_DVAL(v47) - CD_DVAL(v32, -1);
		}
		/******************   LINE 6   ******************/
		/* v33 = v21 * v21 elementwise, then the LINE 4 sequence on v33:
		 * running sum into v28, views v34 / v48, windowed difference into
		 * v24. */
		if (local_first)
		{
			r0=v21.reall;
			for (v1=0; v1<2; v1++)
				v33.dims[v1+0]=  v21.dims[v1+0];
			cad[1]=r0;
			if (new=r0>v33.maxl) {
				v33.maxl=r0;
				inchp2;v33.valp= &heap[r15];
			}
			v33.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v33.valp;
			lo2 = (double *) v21.valp;
		}
		FORALL2_CACHE(v33)
			DVAL(v33) = DVAL(v21) * DVAL(v21);
		/*   GENSCAN generates the following code */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v33.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v33.dims[v1+0];
			r0 =   v33.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l6;
			r0=v33.dims[1];
			r1=v33.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v33);
		}
l6:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v34);
			if (v5[0] > 0)
				v34.startp[0] = v5[0];
			else
				v34.startp[0] = 0;
			if (v5[1] > 0)
				v34.startp[1] = v5[1];
			else
				v34.startp[1] = 0;
			v34.dims[0] = cad[2];
			v34.dims[1] = cad[3];
			memcpy (v34.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v34.valp = v28.valp;
			v34.reall= v34.dims[0] * v34.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3]          ;
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v48);
			if (v5[0] > 0)
				v48.startp[0] = v5[0];
			else
				v48.startp[0] = 0;
			if (v5[1] > 0)
				v48.startp[1] = v5[1];
			else
				v48.startp[1] = 0;
			v48.dims[0] = cad[2];
			v48.dims[1] = cad[3];
			memcpy (v48.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v48.valp = v28.valp;
			v48.reall= v48.dims[0] * v48.dims[1];
		}

		if (local_first)
		{
			r0=v48.reall;
			INIT_STARTP(v24);
			for (v1=0; v1<2; v1++)
				v24.dims[v1+0]=v48.dims[v1+0];
			memcpy(v24.orig_dims, v24.dims, sizeof(v24.dims));
			cad[1]=r0;
			if (new=r0>v24.maxl) {
				v24.maxl=r0;
				inchp2;v24.valp= &heap[r15];
			}
			v24.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v24) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v24) = DROPPED_DVAL(v48);
			else
				SLICE_DVAL(v24) = DROPPED_DVAL(v48) - CD_DVAL(v34, -1);
		}
		/******************   LINE 7   ******************/
		/* v33 = v22 * v22 elementwise, then the same sequence: running sum
		 * into v28, views v36 / v40, windowed difference into v26. */
		FORALL2_CACHE(v33)
			DVAL(v33) = DVAL(v22) * DVAL(v22);
		/*   GENSCAN generates the following code */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v33.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v33.dims[v1+0];
			r0 =   v33.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l7;
			r0=v33.dims[1];
			r1=v33.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v33);
		}
l7:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3]          ;
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v36);
			if (v5[0] > 0)
				v36.startp[0] = v5[0];
			else
				v36.startp[0] = 0;
			if (v5[1] > 0)
				v36.startp[1] = v5[1];
			else
				v36.startp[1] = 0;
			v36.dims[0] = cad[2];
			v36.dims[1] = cad[3];
			memcpy (v36.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v36.valp = v28.valp;
			v36.reall= v36.dims[0] * v36.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v40);
			if (v5[0] > 0)
				v40.startp[0] = v5[0];
			else
				v40.startp[0] = 0;
			if (v5[1] > 0)
				v40.startp[1] = v5[1];
			else
				v40.startp[1] = 0;
			v40.dims[0] = cad[2];
			v40.dims[1] = cad[3];
			memcpy (v40.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v40.valp = v28.valp;
			v40.reall= v40.dims[0] * v40.dims[1];
		}

		if (local_first)
		{
			r0=v40.reall;
			INIT_STARTP(v26);
			for (v1=0; v1<2; v1++)
				v26.dims[v1+0]=  v40.dims[v1+0];
			memcpy(v26.orig_dims, v26.dims, sizeof(v26.dims));
			cad[1]=r0;
			if (new=r0>v26.maxl) {
				v26.maxl=r0;
				inchp2;v26.valp= &heap[r15];
			}
			v26.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v26) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v26) = DROPPED_DVAL(v40);
			else
				SLICE_DVAL(v26) = DROPPED_DVAL(v40) - CD_DVAL(v36, -1);
		}
		/******************   LINE 8   ******************/
		/* v33 = v21 * v22 elementwise, then the same sequence: running sum
		 * into v28, views v38 / v39, windowed difference into v27. */
		FORALL2_CACHE(v33)
			DVAL(v33) = DVAL(v21) * DVAL(v22);
		/*   GENSCAN generates the following code */
		if (cache_idx == 0)
		{
			p2 = (double *) v28.valp;
			ro2 = (double *) v33.valp;
			INIT_STARTP(v28);
			for (v1=0; v1<2; v1++)
				v28.orig_dims[v1+0] = v28.dims[v1+0]=  v33.dims[v1+0];
			r0 =   v33.reall;
			cad[1]=r0;
			if (new=r0>v28.maxl) {
				v28.maxl=r0;
				inchp2;v28.valp= &heap[r15];
			}
			v28.reall=r0; /* 3 instructions genarated by INCHEAPP */
			p2 = (double *) v28.valp;
			if (r0==0) goto l8;
			r0=v33.dims[1];
			r1=v33.dims[0];
		}
		FORAXIS0_CACHE(v28)
		{
			v2 = 0;
			d = 0;
			FORAXIS1(v28)
				DVAL(v28)=d=d+DVAL(v33);
		}
l8:
		if (cache_idx == 0)
			v9 =  -v19; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v38);
			if (v5[0] > 0)
				v38.startp[0] = v5[0];
			else
				v38.startp[0] = 0;
			if (v5[1] > 0)
				v38.startp[1] = v5[1];
			else
				v38.startp[1] = 0;
			v38.dims[0] = cad[2];
			v38.dims[1] = cad[3];
			memcpy (v38.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v38.valp = v28.valp;
			v38.reall= v38.dims[0] * v38.dims[1];
		}
		/***** PFCATENA *****/

		if (cache_idx == 0)
			v9 = v19-1; /* code for - */
		/***** PFCATENA *****/
		if (cache_idx == 0)
		{
			r0 = cad[1]= 1+1;
			cad[1] =r0 =2;
			v5[0] = 0;
			v5[1]= v9;
			/*   PFDROP generates the following code */
			ro2 = (double *) v28.valp;
			dl[1]= t = 1;
			for (u0=0; u0<1; u0++)
				t= dl[0-u0]= t*v28.dims[1-u0];
			r0=t=1;
			cad[3] = max(0, v28.dims[1]-abs(v5[1]));
			t=tdl[0] = t*cad[3];
			cad[2] = max(0,v28.dims[0]-abs(v5[0]));
			INIT_STARTP(v39);
			if (v5[0] > 0)
				v39.startp[0] = v5[0];
			else
				v39.startp[0] = 0;
			if (v5[1] > 0)
				v39.startp[1] = v5[1];
			else
				v39.startp[1] = 0;
			v39.dims[0] = cad[2];
			v39.dims[1] = cad[3];
			memcpy (v39.orig_dims, v28.orig_dims, sizeof (v28.orig_dims));
			v39.valp = v28.valp;
			v39.reall= v39.dims[0] * v39.dims[1];
		}

		if (local_first)
		{
			r0=v39.reall;
			INIT_STARTP(v27);
			for (v1=0; v1<2; v1++)
				v27.dims[v1+0]=v39.dims[v1+0];
			memcpy(v27.orig_dims, v27.dims, sizeof(v27.dims));
			cad[1]=r0;
			if (new=r0>v27.maxl) {
				v27.maxl=r0;
				inchp2;v27.valp= &heap[r15];
			}
			v27.reall=r0; /* 3 instructions genarated by INCHEAPP */
		}
		FORALL2_CACHE(v27) /* code for - */
		{
			if (v2 == 0)
				SLICE_DVAL(v27) = DROPPED_DVAL(v39);
			else
				SLICE_DVAL(v27) = DROPPED_DVAL(v39) - CD_DVAL(v38, -1);
		}
		/******************   LINE 9   ******************/
		/* Iteration 0 only: shape the result descriptor v18 like v25 and
		 * obtain storage for it. */
		if (cache_idx == 0)
		{
			r0=v25.reall;
			cad[31]=r0;
			for (v1=0; v1<2; v1++)
				cad[32+v1]=  v25.dims[v1+0];
			lo2 = (double *) v25.valp;
			INIT_STARTP(v18);
			for (v1=0; v1<2; v1++)
				v18.dims[v1+0]=v25.dims[v1+0];
			cad[1]=r0;
			if (new=r0>v18.maxl) {
				v18.maxl=r0;
				inchp2;v18.valp= &heap[r15];
			}
			v18.reall=r0; /* 3 instructions genarated by INCHEAPP */
			memcpy(v18.orig_dims, v18.dims, sizeof(v18.dims));
		}
		/*
		 * Final per-element computation.  With a = v23, b = v24, c = v25,
		 * e = v26, f = v27 (current slice elements) and k = v19:
		 *   tmp    = sqrt(|e/k - (c/k)^2|) * |sqrt((a/k)^2 + b/k)|
		 *   v18    = (f/k - a*c/(k*k)) / tmp
		 * Every division uses the convention 0/0 -> 1.
		 * NOTE(review): the first factor subtracts the squared mean while
		 * the second adds it ((a/k)^2 + b/k); confirm against the original
		 * source that this asymmetry is intended.
		 */
		FORALL2_CACHE(v25)
		{
			double tmp;
			double tmp2;
			double tmp3;
			if (unlikely(SLICE_DVAL(v25)==0 && 0==(double)v19)) tmp =  1;
			else
				tmp = SLICE_DVAL(v25)/(double)v19;
			tmp = tmp*tmp;
			if (unlikely(SLICE_DVAL(v26)==0 && 0==(double)v19)) tmp2 =  1;
			else
				tmp2 = SLICE_DVAL(v26)/(double)v19;
			tmp = tmp2-tmp;
			tmp =  fabs(tmp);
			tmp = sqrt(tmp);
			if (unlikely(SLICE_DVAL(v23)==0 && 0==(double)v19)) tmp2 =  1;
			else
				tmp2 = SLICE_DVAL(v23)/(double)v19;
			tmp2 = tmp2 * tmp2;
			if (unlikely(SLICE_DVAL(v24)==0 && 0==(double)v19)) tmp3 =  1;
			else
				tmp3 = SLICE_DVAL(v24)/(double)v19;
			tmp2 = tmp2 + tmp3;
			tmp2 = sqrt(tmp2);
			tmp2 = fabs(tmp2);
			tmp = tmp * tmp2;
			/* v9 is shared; every thread stores the same value here. */
			v9 = v19*v19; /* code for * */
			tmp2 = SLICE_DVAL(v23) * SLICE_DVAL(v25);
			if (unlikely(tmp2==0 && 0==(double)v9))  tmp2 =  1;
			else
				tmp2 = tmp2/(double)v9;
			if (unlikely(SLICE_DVAL(v27)==0 && 0==(double)v19)) tmp3 =  1;
			else
				tmp3 = SLICE_DVAL(v27)/(double)v19;
			tmp2 = tmp3 - tmp2;
			if (unlikely(tmp2==0 && 0==tmp))  DVAL(v18) =  1;
			else
				DVAL(v18) = tmp2/tmp;
			/* NOTE(review): num is shared and incremented here by every
			 * thread without atomic/critical protection, so the count
			 * printed after the loop may be inexact. */
			num++;
		}
		/* Releases the iterations spinning on `init` at the top of the body;
		 * local_first is this thread's private "first iteration" flag. */
		init = 1;
		local_first = 0;
	}
	fprintf(stderr, "the big loop takes %dms, there are %d iterations\n",
			apl_sec() - start_time, num);
	/* Publish the result: shape in cad[1..3], data pointer in fretfp. */
	for (v1=0; v1<2; v1++)
		cad[2+v1]=  v18.dims[v1+0];
	cad[1]= v18.reall;
	fprintf (stderr, "size: %d\n", v18.reall);
	fretfp= (double *) v18.valp;
}

^ permalink raw reply	[flat|nested] 3+ messages in thread

* RE: Understand the running time of a program compiled with GCC
  2010-03-10  3:08 Understand the running time of a program compiled with GCC Da Zheng
@ 2010-03-10 12:04 ` John (Eljay) Love-Jensen
  2010-03-11  2:50   ` Da Zheng
  0 siblings, 1 reply; 3+ messages in thread
From: John (Eljay) Love-Jensen @ 2010-03-10 12:04 UTC (permalink / raw)
  To: Da Zheng, gcc-help

Hi Zheng Da,

> Can anyone tell me what is the possible reason that can cause time difference?

To understand the differences, look at the assembly dumps of the routine from the GCC compiler and the Intel compiler.

Make sure you are using the right GCC machine flags -mcpu= -mtune= -march= for your platform.  (Since you did not provide your command lines for GCC and Intel compilers, I can only offer that as a general suggestion.)

HTH,
--Eljay

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: Understand the running time of a program compiled with GCC
  2010-03-10 12:04 ` John (Eljay) Love-Jensen
@ 2010-03-11  2:50   ` Da Zheng
  0 siblings, 0 replies; 3+ messages in thread
From: Da Zheng @ 2010-03-11  2:50 UTC (permalink / raw)
  To: John (Eljay) Love-Jensen; +Cc: gcc-help

Hello Eljay,

On 10-3-10 下午8:04, John (Eljay) Love-Jensen wrote:
> Hi Zheng Da,
> 
>> Can anyone tell me what is the possible reason that can cause time difference?
> 
> To understand the differences, look at the assembly dumps of the routine from the GCC compiler and the Intel compiler.
It's not a big program, but not a small one either. I used objdump -S to see the
assembly code, and the two files are almost completely different. But I noticed one
thing: the binary file generated by the Intel compiler is much larger than the
one from GCC. Another thing I have noticed is that with -O3 enabled, the Intel
compiler generates vector instructions but GCC doesn't. But I don't see how this
can explain my problem.
> 
> Make sure you are using the right GCC machine flags -mcpu= -mtune= -march= for your platform.  (Since you did not provide your command lines for GCC and Intel compilers, I can only offer that as a general suggestion.)
I didn't use any of these machine flags. I only used -O3 for GCC and Intel
compiler. So the command line I used to compile my program is
gcc -fopenmp -DREALTIME -D__LINUX -lm -O3 MORGAN2.c -o MORGAN2-omp
icc -openmp -DREALTIME -D__LINUX -O3 MORGAN2.c -o MORGAN2-omp

I use an Athlon II X4 620 processor, so I tried -mtune=athlon or -mtune=athlon-4 or
-mtune=athlon64 or -mtune=native -march=native when I compiled the code with
GCC, but none of them seems to help :-(

Best regards,
Zheng Da

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2010-03-11  2:50 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-03-10  3:08 Understand the running time of a program compiled with GCC Da Zheng
2010-03-10 12:04 ` John (Eljay) Love-Jensen
2010-03-11  2:50   ` Da Zheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).