2015-11-19 Nathan Sidwell * libgomp.oacc-c-c++-common/reduction-dbl.c: New. * libgomp.oacc-c-c++-common/reduction-flt.c: New. * libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: Use typedef. * libgomp.oacc-c-c++-common/reduction-cplx-flt.c: Use typedef. * libgomp.oacc-c-c++-common/reduction-2.c: Uncomment broken tests and fix. * libgomp.oacc-c-c++-common/reduction-3.c: Likewise. * libgomp.oacc-c-c++-common/reduction-4.c: Likewise. Index: testsuite/libgomp.oacc-c-c++-common/reduction-2.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-2.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-2.c (working copy) @@ -50,39 +50,37 @@ main(void) if (fabs(result - vresult) > .0001) abort (); -// result = 0; -// vresult = 0; -// -// /* 'max' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result > array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult > array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); -// -// result = 0; -// vresult = 0; -// -// /* 'min' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result < array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult < array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); + result = 0; + vresult = 0; + + /* 'max' reductions. */ +#pragma acc parallel vector_length (vl) copy(result) +#pragma acc loop reduction (max:result) + for (i = 0; i < n; i++) + result = result > array[i] ? result : array[i]; + + /* Verify the reduction. */ + for (i = 0; i < n; i++) + vresult = vresult > array[i] ? 
vresult : array[i]; + + if (result != vresult) + abort (); + + result = 0; + vresult = 0; + + /* 'min' reductions. */ +#pragma acc parallel vector_length (vl) copy(result) +#pragma acc loop reduction (min:result) + for (i = 0; i < n; i++) + result = result < array[i] ? result : array[i]; + + /* Verify the reduction. */ + for (i = 0; i < n; i++) + vresult = vresult < array[i] ? vresult : array[i]; + + if (result != vresult) + abort (); result = 5; vresult = 5; Index: testsuite/libgomp.oacc-c-c++-common/reduction-3.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-3.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-3.c (working copy) @@ -22,15 +22,15 @@ main(void) result = 0; vresult = 0; - /* '+' reductions. */ + /* 'max' reductions. */ #pragma acc parallel vector_length (vl) copy(result) -#pragma acc loop reduction (+:result) +#pragma acc loop reduction (max:result) for (i = 0; i < n; i++) - result += array[i]; + result = result > array[i] ? result : array[i]; /* Verify the reduction. */ for (i = 0; i < n; i++) - vresult += array[i]; + vresult = vresult > array[i] ? vresult : array[i]; if (result != vresult) abort (); @@ -38,51 +38,18 @@ main(void) result = 0; vresult = 0; - /* '*' reductions. */ + /* 'min' reductions. */ #pragma acc parallel vector_length (vl) copy(result) -#pragma acc loop reduction (*:result) +#pragma acc loop reduction (min:result) for (i = 0; i < n; i++) - result *= array[i]; + result = result < array[i] ? result : array[i]; /* Verify the reduction. */ for (i = 0; i < n; i++) - vresult *= array[i]; + vresult = vresult < array[i] ? vresult : array[i]; - if (fabs(result - vresult) > .0001) + if (result != vresult) abort (); -// result = 0; -// vresult = 0; -// -// /* 'max' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result > array[i] ? 
result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult > array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); -// -// result = 0; -// vresult = 0; -// -// /* 'min' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result < array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult < array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); result = 5; vresult = 5; Index: testsuite/libgomp.oacc-c-c++-common/reduction-4.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-4.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-4.c (working copy) @@ -23,76 +23,6 @@ main(void) result = 0; vresult = 0; - /* '+' reductions. */ -#pragma acc parallel vector_length (vl) copy(result) -#pragma acc loop reduction (+:result) - for (i = 0; i < n; i++) - result += array[i]; - - /* Verify the reduction. */ - for (i = 0; i < n; i++) - vresult += array[i]; - - if (result != vresult) - abort (); - - result = 0; - vresult = 0; - - /* Needs support for complex multiplication. */ - -// /* '*' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (*:result) -// for (i = 0; i < n; i++) -// result *= array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult *= array[i]; -// -// if (fabs(result - vresult) > .0001) -// abort (); -// result = 0; -// vresult = 0; - -// /* 'max' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result > array[i] ? result : array[i]; -// -// /* Verify the reduction. 
*/ -// for (i = 0; i < n; i++) -// vresult = vresult > array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); -// -// result = 0; -// vresult = 0; -// -// /* 'min' reductions. */ -// #pragma acc parallel vector_length (vl) -// #pragma acc loop reduction (+:result) -// for (i = 0; i < n; i++) -// result = result < array[i] ? result : array[i]; -// -// /* Verify the reduction. */ -// for (i = 0; i < n; i++) -// vresult = vresult < array[i] ? vresult : array[i]; -// -// printf("%d != %d\n", result, vresult); -// if (result != vresult) -// abort (); - - result = 5; - vresult = 5; - - lresult = false; - lvresult = false; - /* '&&' reductions. */ #pragma acc parallel vector_length (vl) copy(lresult) #pragma acc loop reduction (&&:lresult) Index: testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (working copy) @@ -3,10 +3,11 @@ /* Double float has 53 bits of fraction. 
*/ #define FRAC (1.0 / (1LL << 48)) +typedef double _Complex Type; -int close_enough (double _Complex a, double _Complex b) +int close_enough (Type a, Type b) { - double _Complex diff = a - b; + Type diff = a - b; double mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a); double mag2_diff = (__real__(diff) * __real__ (diff) + __imag__ (diff) * __imag__ (diff)); @@ -17,9 +18,9 @@ int close_enough (double _Complex a, dou #define N 100 static int __attribute__ ((noinline)) -vector (double _Complex ary[N], double _Complex sum, double _Complex prod) +vector (Type ary[N], Type sum, Type prod) { - double _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -41,9 +42,9 @@ vector (double _Complex ary[N], double _ } static int __attribute__ ((noinline)) -worker (double _Complex ary[N], double _Complex sum, double _Complex prod) +worker (Type ary[N], Type sum, Type prod) { - double _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -65,9 +66,9 @@ worker (double _Complex ary[N], double _ } static int __attribute__ ((noinline)) -gang (double _Complex ary[N], double _Complex sum, double _Complex prod) +gang (Type ary[N], Type sum, Type prod) { - double _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -90,7 +91,7 @@ gang (double _Complex ary[N], double _Co int main (void) { - double _Complex ary[N], sum = 0, prod = 1; + Type ary[N], sum = 0, prod = 1; for (int ix = 0; ix < N; ix++) { Index: testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (revision 230605) +++ testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (working copy) @@ -3,10 +3,11 @@ /* Single float has 23 bits of 
fraction. */ #define FRAC (1.0f / (1 << 20)) +typedef float _Complex Type; -int close_enough (float _Complex a, float _Complex b) +int close_enough (Type a, Type b) { - float _Complex diff = a - b; + Type diff = a - b; float mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a); float mag2_diff = (__real__(diff) * __real__ (diff) + __imag__ (diff) * __imag__ (diff)); @@ -17,9 +18,9 @@ int close_enough (float _Complex a, floa #define N 100 static int __attribute__ ((noinline)) -vector (float _Complex ary[N], float _Complex sum, float _Complex prod) +vector (Type ary[N], Type sum, Type prod) { - float _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -41,9 +42,9 @@ vector (float _Complex ary[N], float _Co } static int __attribute__ ((noinline)) -worker (float _Complex ary[N], float _Complex sum, float _Complex prod) +worker (Type ary[N], Type sum, Type prod) { - float _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -65,9 +66,9 @@ worker (float _Complex ary[N], float _Co } static int __attribute__ ((noinline)) -gang (float _Complex ary[N], float _Complex sum, float _Complex prod) +gang (Type ary[N], Type sum, Type prod) { - float _Complex tsum = 0, tprod = 1; + Type tsum = 0, tprod = 1; #pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) { @@ -90,7 +91,7 @@ gang (float _Complex ary[N], float _Comp int main (void) { - float _Complex ary[N], sum = 0, prod = 1; + Type ary[N], sum = 0, prod = 1; for (int ix = 0; ix < N; ix++) { Index: testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c =================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c (revision 0) +++ testsuite/libgomp.oacc-c-c++-common/reduction-dbl.c (working copy) @@ -0,0 +1,112 @@ + +/* Double float has 53 bits of precision (52 stored fraction bits). 
*/ +#define FRAC (1.0 / (1LL << 48)) +typedef double Type; + +int close_enough (Type a, Type b) +{ + Type diff = a - b; + if (diff < 0) + diff = -diff; + + return diff / a < FRAC; +} + +#define N 100 + +static int __attribute__ ((noinline)) +vector (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop vector reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +worker (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop worker reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +gang (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop gang reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +int main (void) +{ + Type ary[N], sum = 0, prod = 1; + + for (int ix = 0; ix < N; ix++) + { + float frac = ix * (1.0f / 1024) + 1.0f; + + ary[ix] = frac; + sum += ary[ix]; + prod *= ary[ix]; + } + + if (vector (ary, sum, prod)) + return 1; + + if (worker (ary, sum, prod)) + return 1; + + if (gang (ary, sum, prod)) + return 1; + + return 0; +} Index: testsuite/libgomp.oacc-c-c++-common/reduction-flt.c 
=================================================================== --- testsuite/libgomp.oacc-c-c++-common/reduction-flt.c (revision 0) +++ testsuite/libgomp.oacc-c-c++-common/reduction-flt.c (working copy) @@ -0,0 +1,112 @@ + +/* Single float has 23 bits of fraction. */ +#define FRAC (1.0f / (1 << 20)) +typedef float Type; + +int close_enough (Type a, Type b) +{ + Type diff = a - b; + if (diff < 0) + diff = -diff; + + return diff / a < FRAC; +} + +#define N 100 + +static int __attribute__ ((noinline)) +vector (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel vector_length(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop vector reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +worker (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_workers(32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop worker reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +static int __attribute__ ((noinline)) +gang (Type ary[N], Type sum, Type prod) +{ + Type tsum = 0, tprod = 1; + +#pragma acc parallel num_gangs (32) copyin(ary[0:N]) copy (tsum, tprod) + { +#pragma acc loop gang reduction(+:tsum) reduction (*:tprod) + for (int ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} + +int main (void) +{ + Type ary[N], sum = 0, prod = 1; + + for (int ix = 0; ix < N; ix++) + { + float frac = ix * (1.0f / 1024) + 1.0f; + + ary[ix] = frac; + sum += 
ary[ix]; + prod *= ary[ix]; + } + + if (vector (ary, sum, prod)) + return 1; + + if (worker (ary, sum, prod)) + return 1; + + if (gang (ary, sum, prod)) + return 1; + + return 0; +}