commit 4a5e8ad6d5c5fa2e944d1318dbcba28f234abffe Author: Bernd Schmidt Date: Wed Nov 19 18:35:41 2014 +0100 Cesar's latest patch diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map index f6e70e9..0fa62ff 100644 --- a/libgomp/libgomp.map +++ b/libgomp/libgomp.map @@ -310,6 +310,8 @@ GOACC_2.0 { GOACC_parallel; GOACC_update; GOACC_wait; + GOACC_get_thread_num; + GOACC_get_num_threads; }; GOMP_PLUGIN_1.0 { diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h index 44f200c..3db5676 100644 --- a/libgomp/libgomp_g.h +++ b/libgomp/libgomp_g.h @@ -226,5 +226,7 @@ extern void GOACC_parallel (int, void (*) (void *), const void *, size_t, void **, size_t *, unsigned short *, int, int, int, int, int, ...); extern void GOACC_wait (int, int, ...); +extern int GOACC_get_num_threads (void); +extern int GOACC_get_thread_num (void); #endif /* LIBGOMP_G_H */ diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c index 0ff44bf..e142384 100644 --- a/libgomp/oacc-parallel.c +++ b/libgomp/oacc-parallel.c @@ -115,9 +115,6 @@ GOACC_parallel (int device, void (*fn) (void *), const void *openmp_target, splay_tree_key tgt_fn_key; void (*tgt_fn); - if (num_gangs != 1) - gomp_fatal ("num_gangs (%d) different from one is not yet supported", - num_gangs); if (num_workers != 1) gomp_fatal ("num_workers (%d) different from one is not yet supported", num_workers); @@ -386,3 +383,15 @@ GOACC_wait (int async, int num_waits, ...) 
va_end (ap); } + +int +GOACC_get_num_threads (void) +{ + return 1; +} + +int +GOACC_get_thread_num (void) +{ + return 0; +} diff --git a/libgomp/oacc-ptx.h b/libgomp/oacc-ptx.h new file mode 100644 index 0000000..1af81b2 --- /dev/null +++ b/libgomp/oacc-ptx.h @@ -0,0 +1,400 @@ +#define ABORT_PTX \ + ".version 3.1\n" \ + ".target sm_30\n" \ + ".address_size 64\n" \ + ".visible .func abort;\n" \ + ".visible .func abort\n" \ + "{\n" \ + "trap;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func _gfortran_abort;\n" \ + ".visible .func _gfortran_abort\n" \ + "{\n" \ + "trap;\n" \ + "ret;\n" \ + "}\n" \ + +/* Generated with: + + $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_h_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline +*/ +#define ACC_ON_DEVICE_PTX \ + " .version 3.1\n" \ + " .target sm_30\n" \ + " .address_size 64\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \ + "{\n" \ + " .reg.u32 %ar1;\n" \ + ".reg.u32 %retval;\n" \ + " .reg.u64 %hr10;\n" \ + " .reg.u32 %r24;\n" \ + " .reg.u32 %r25;\n" \ + " .reg.pred %r27;\n" \ + " .reg.u32 %r30;\n" \ + " ld.param.u32 %ar1, [%in_ar1];\n" \ + " mov.u32 %r24, %ar1;\n" \ + " setp.ne.u32 %r27,%r24,4;\n" \ + " set.u32.eq.u32 %r30,%r24,5;\n" \ + " neg.s32 %r25, %r30;\n" \ + " @%r27 bra $L3;\n" \ + " mov.u32 %r25, 1;\n" \ + "$L3:\n" \ + " mov.u32 %retval, %r25;\n" \ + " st.param.u32 [%out_retval], %retval;\n" \ + " ret;\n" \ + " }\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1);\n" \ + ".visible .func (.param.u32 %out_retval)acc_on_device_h_(.param.u64 %in_ar1)\n" \ + "{\n" \ + " .reg.u64 %ar1;\n" \ + ".reg.u32 %retval;\n" \ + " .reg.u64 %hr10;\n" \ + " .reg.u64 %r25;\n" \ + " .reg.u32 %r26;\n" \ + " .reg.u32 %r27;\n" \ + " ld.param.u64 %ar1, [%in_ar1];\n" \ + 
" mov.u64 %r25, %ar1;\n" \ + " ld.u32 %r26, [%r25];\n" \ + " {\n" \ + " .param.u32 %retval_in;\n" \ + " {\n" \ + " .param.u32 %out_arg0;\n" \ + " st.param.u32 [%out_arg0], %r26;\n" \ + " call (%retval_in), acc_on_device, (%out_arg0);\n" \ + " }\n" \ + " ld.param.u32 %r27, [%retval_in];\n" \ + "}\n" \ + " mov.u32 %retval, %r27;\n" \ + " st.param.u32 [%out_retval], %retval;\n" \ + " ret;\n" \ + " }" + + #define GOACC_INTERNAL_PTX \ + ".version 3.1\n" \ + ".target sm_30\n" \ + ".address_size 64\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" \ + ".extern .func abort;\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L4;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L5;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 %r34,%r28,%r33;\n" \ + "@!%r34 bra $L8;\n" \ + "mov.u32 %r23,%tid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L7;\n" \ + "$L4:\n" 
\ + "mov.u32 %r24,%tid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L7;\n" \ + "$L5:\n" \ + "mov.u32 %r25,%tid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L7;\n" \ + "$L8:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L7:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L11;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L12;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 %r34,%r28,%r33;\n" \ + "@!%r34 bra $L15;\n" \ + "mov.u32 %r23,%ntid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L14;\n" \ + "$L11:\n" \ + "mov.u32 %r24,%ntid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L14;\n" \ + "$L12:\n" \ + "mov.u32 %r25,%ntid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L14;\n" \ + "$L15:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L14:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg 
.u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L18;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L19;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 %r34,%r28,%r33;\n" \ + "@!%r34 bra $L22;\n" \ + "mov.u32 %r23,%ctaid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L21;\n" \ + "$L18:\n" \ + "mov.u32 %r24,%ctaid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L21;\n" \ + "$L19:\n" \ + "mov.u32 %r25,%ctaid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L21;\n" \ + "$L22:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L21:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n" \ + "{\n" \ + ".reg .u32 %ar1;\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .pred %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .pred %r32;\n" \ + ".reg .u32 %r33;\n" \ + ".reg .pred %r34;\n" \ + ".local .align 8 .b8 %frame[4];\n" \ + "ld.param.u32 %ar1,[%in_ar1];\n" \ + "mov.u32 %r27,%ar1;\n" \ + "st.local.u32 [%frame],%r27;\n" \ + "ld.local.u32 %r28,[%frame];\n" \ + "mov.u32 %r29,1;\n" \ + "setp.eq.u32 %r30,%r28,%r29;\n" \ + "@%r30 bra $L25;\n" \ + "mov.u32 %r31,2;\n" \ + "setp.eq.u32 %r32,%r28,%r31;\n" \ + "@%r32 bra $L26;\n" \ + "mov.u32 %r33,0;\n" \ + "setp.eq.u32 
%r34,%r28,%r33;\n" \ + "@!%r34 bra $L29;\n" \ + "mov.u32 %r23,%nctaid.x;\n" \ + "mov.u32 %r22,%r23;\n" \ + "bra $L28;\n" \ + "$L25:\n" \ + "mov.u32 %r24,%nctaid.y;\n" \ + "mov.u32 %r22,%r24;\n" \ + "bra $L28;\n" \ + "$L26:\n" \ + "mov.u32 %r25,%nctaid.z;\n" \ + "mov.u32 %r22,%r25;\n" \ + "bra $L28;\n" \ + "$L29:\n" \ + "{\n" \ + "{\n" \ + "call abort;\n" \ + "}\n" \ + "}\n" \ + "$L28:\n" \ + "mov.u32 %r26,%r22;\n" \ + "mov.u32 %retval,%r26;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n" \ + "{\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + "mov.u32 %r26,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r26;\n" \ + "call (%retval_in),GOACC_ntid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r27,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r22,%r27;\n" \ + "mov.u32 %r28,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r28;\n" \ + "call (%retval_in),GOACC_nctaid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r29,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r23,%r29;\n" \ + "mul.lo.u32 %r24,%r22,%r23;\n" \ + "mov.u32 %r25,%r24;\n" \ + "mov.u32 %retval,%r25;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" \ + ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n" \ + "{\n" \ + ".reg .u32 %retval;\n" \ + ".reg .u64 %hr10;\n" \ + ".reg .u32 %r22;\n" \ + ".reg .u32 %r23;\n" \ + ".reg .u32 %r24;\n" \ + ".reg .u32 %r25;\n" \ + ".reg .u32 %r26;\n" \ + ".reg .u32 %r27;\n" \ + ".reg .u32 %r28;\n" \ + ".reg .u32 %r29;\n" \ + ".reg .u32 %r30;\n" \ + ".reg .u32 %r31;\n" \ + ".reg .u32 %r32;\n" \ + ".reg .u32 %r33;\n" \ + "mov.u32 
%r28,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r28;\n" \ + "call (%retval_in),GOACC_ntid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r29,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r22,%r29;\n" \ + "mov.u32 %r30,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r30;\n" \ + "call (%retval_in),GOACC_ctaid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r31,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r23,%r31;\n" \ + "mul.lo.u32 %r24,%r22,%r23;\n" \ + "mov.u32 %r32,0;\n" \ + "{\n" \ + ".param .u32 %retval_in;\n" \ + "{\n" \ + ".param .u32 %out_arg0;\n" \ + "st.param.u32 [%out_arg0],%r32;\n" \ + "call (%retval_in),GOACC_tid,(%out_arg0);\n" \ + "}\n" \ + "ld.param.u32 %r33,[%retval_in];\n" \ + "}\n" \ + "mov.u32 %r25,%r33;\n" \ + "add.u32 %r26,%r24,%r25;\n" \ + "mov.u32 %r27,%r26;\n" \ + "mov.u32 %retval,%r27;\n" \ + "st.param.u32 [%out_retval],%retval;\n" \ + "ret;\n" \ + "}\n" diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 3d1b81b..7fedd2d 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -35,6 +35,7 @@ #include "libgomp.h" #include "libgomp_target.h" #include "libgomp-plugin.h" +#include "oacc-ptx.h" #include "oacc-plugin.h" #include <cuda.h> @@ -722,78 +723,6 @@ PTX_get_num_devices (void) return n; } -#define ABORT_PTX \ - ".version 3.1\n" \ - ".target sm_30\n" \ - ".address_size 64\n" \ - ".visible .func abort;\n" \ - ".visible .func abort\n" \ - "{\n" \ - "trap;\n" \ - "ret;\n" \ - "}\n" \ - ".visible .func _gfortran_abort;\n" \ - ".visible .func _gfortran_abort\n" \ - "{\n" \ - "trap;\n" \ - "ret;\n" \ - "}\n" \ - -/* Generated with: - - $ echo 'int acc_on_device(int d) { return __builtin_acc_on_device(d); } int acc_on_device_(int *d) { return acc_on_device(*d); }' | accel-gcc/xgcc -Baccel-gcc -x c - -o - -S -m64 -O3 -fno-builtin-acc_on_device -fno-inline -*/ -#define
ACC_ON_DEVICE_PTX \ - " .version 3.1\n" \ - " .target sm_30\n" \ - " .address_size 64\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1);\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device(.param.u32 %in_ar1)\n" \ - "{\n" \ - " .reg.u32 %ar1;\n" \ - ".reg.u32 %retval;\n" \ - " .reg.u64 %hr10;\n" \ - " .reg.u32 %r24;\n" \ - " .reg.u32 %r25;\n" \ - " .reg.pred %r27;\n" \ - " .reg.u32 %r30;\n" \ - " ld.param.u32 %ar1, [%in_ar1];\n" \ - " mov.u32 %r24, %ar1;\n" \ - " setp.ne.u32 %r27,%r24,4;\n" \ - " set.u32.eq.u32 %r30,%r24,5;\n" \ - " neg.s32 %r25, %r30;\n" \ - " @%r27 bra $L3;\n" \ - " mov.u32 %r25, 1;\n" \ - "$L3:\n" \ - " mov.u32 %retval, %r25;\n" \ - " st.param.u32 [%out_retval], %retval;\n" \ - " ret;\n" \ - " }\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1);\n" \ - ".visible .func (.param.u32 %out_retval)acc_on_device_(.param.u64 %in_ar1)\n" \ - "{\n" \ - " .reg.u64 %ar1;\n" \ - ".reg.u32 %retval;\n" \ - " .reg.u64 %hr10;\n" \ - " .reg.u64 %r25;\n" \ - " .reg.u32 %r26;\n" \ - " .reg.u32 %r27;\n" \ - " ld.param.u64 %ar1, [%in_ar1];\n" \ - " mov.u64 %r25, %ar1;\n" \ - " ld.u32 %r26, [%r25];\n" \ - " {\n" \ - " .param.u32 %retval_in;\n" \ - " {\n" \ - " .param.u32 %out_arg0;\n" \ - " st.param.u32 [%out_arg0], %r26;\n" \ - " call (%retval_in), acc_on_device, (%out_arg0);\n" \ - " }\n" \ - " ld.param.u32 %r27, [%retval_in];\n" \ - "}\n" \ - " mov.u32 %retval, %r27;\n" \ - " st.param.u32 [%out_retval], %retval;\n" \ - " ret;\n" \ - " }" static void link_ptx (CUmodule *module, char *ptx_code) @@ -856,6 +785,16 @@ link_ptx (CUmodule *module, char *ptx_code) cuda_error (r)); } + char *goacc_internal_ptx = GOACC_INTERNAL_PTX; + r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, goacc_internal_ptx, + strlen (goacc_internal_ptx) + 1, 0, 0, 0, 0); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); + GOMP_PLUGIN_fatal ("cuLinkAddData (goacc_internal_ptx) error: %s", + 
cuda_error (r)); + } + r = cuLinkAddData (linkstate, CU_JIT_INPUT_PTX, ptx_code, strlen (ptx_code) + 1, 0, 0, 0, 0); if (r != CUDA_SUCCESS) @@ -1043,7 +982,7 @@ PTX_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, kargs[0] = &dp; r = cuLaunchKernel (function, - 1, 1, 1, + num_gangs, 1, 1, nthreads_in_block, 1, 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) diff --git a/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 new file mode 100644 index 0000000..6325431 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/reduction-6.f90 @@ -0,0 +1,30 @@ +! { dg-do run } + +program reduction + implicit none + + integer, parameter :: n = 100 + integer :: i, s1, s2, vs1, vs2 + + s1 = 0 + s2 = 0 + vs1 = 0 + vs2 = 0 + + !$acc parallel vector_length (1000) + !$acc loop reduction(+:s1, s2) + do i = 1, n + s1 = s1 + 1 + s2 = s2 + 2 + end do + !$acc end parallel + + ! Verify the results + do i = 1, n + vs1 = vs1 + 1 + vs2 = vs2 + 2 + end do + + if (s1.ne.vs1) call abort () + if (s2.ne.vs2) call abort () +end program reduction