openmp: fix max_vf setting for amdgcn offloading Ensure that the "max_vf" figure used for the "safelen" attribute is large enough for the largest configured offload device. This change gives ~10x speed improvement on the Bablestream "dot" benchmark for AMD GCN. gcc/ChangeLog: * gimple-loop-versioning.cc (loop_versioning::loop_versioning): Add comment. * omp-general.cc (omp_max_simd_vf): New function. * omp-general.h (omp_max_simd_vf): New prototype. * omp-low.cc (lower_rec_simd_input_clauses): Select largest from omp_max_vf, omp_max_simt_vf, and omp_max_simd_vf. gcc/testsuite/ChangeLog: * lib/target-supports.exp (check_effective_target_amdgcn_offloading_enabled): New. (check_effective_target_nvptx_offloading_enabled): New. * gcc.dg/gomp/target-vf.c: New test. diff --git a/gcc/gimple-loop-versioning.cc b/gcc/gimple-loop-versioning.cc index 6bcf6eba691..e908c27fc44 100644 --- a/gcc/gimple-loop-versioning.cc +++ b/gcc/gimple-loop-versioning.cc @@ -555,7 +555,10 @@ loop_versioning::loop_versioning (function *fn) unvectorizable code, since it is the largest size that can be handled efficiently by scalar code. omp_max_vf calculates the maximum number of bytes in a vector, when such a value is relevant - to loop optimization. */ + to loop optimization. + FIXME: this probably needs to use omp_max_simd_vf when in a target + region, but how to tell? (And MAX_FIXED_MODE_SIZE is large enough that + it doesn't actually matter.) */ m_maximum_scale = estimated_poly_value (omp_max_vf ()); m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE); } diff --git a/gcc/omp-general.cc b/gcc/omp-general.cc index a406c578f33..8c6fcebc4b3 100644 --- a/gcc/omp-general.cc +++ b/gcc/omp-general.cc @@ -994,6 +994,24 @@ omp_max_simt_vf (void) return 0; } +/* Return maximum SIMD width if offloading may target SIMD hardware. */ + +int +omp_max_simd_vf (void) +{ + if (!optimize) + return 0; + if (ENABLE_OFFLOADING) + for (const char *c = getenv ("OFFLOAD_TARGET_NAMES"); c;) + { + if (startswith (c, "amdgcn")) + return 64; + else if ((c = strchr (c, ':'))) + c++; + } + return 0; +} + /* Store the construct selectors as tree codes from last to first, return their number. */ diff --git a/gcc/omp-general.h b/gcc/omp-general.h index 74e90e1a71a..410343e45fa 100644 --- a/gcc/omp-general.h +++ b/gcc/omp-general.h @@ -104,6 +104,7 @@ extern gimple *omp_build_barrier (tree lhs); extern tree find_combined_omp_for (tree *, int *, void *); extern poly_uint64 omp_max_vf (void); extern int omp_max_simt_vf (void); +extern int omp_max_simd_vf (void); extern int omp_constructor_traits_to_codes (tree, enum tree_code *); extern tree omp_check_context_selector (location_t loc, tree ctx); extern void omp_mark_declare_variant (location_t loc, tree variant, diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc index d73c165f029..1a9a509adb9 100644 --- a/gcc/omp-low.cc +++ b/gcc/omp-low.cc @@ -4646,7 +4646,14 @@ lower_rec_simd_input_clauses (tree new_var, omp_context *ctx, { if (known_eq (sctx->max_vf, 0U)) { - sctx->max_vf = sctx->is_simt ? omp_max_simt_vf () : omp_max_vf (); + /* If we are compiling for multiple devices choose the largest VF. */ + sctx->max_vf = omp_max_vf (); + if (omp_maybe_offloaded_ctx (ctx)) + { + if (sctx->is_simt) + sctx->max_vf = ordered_max (sctx->max_vf, omp_max_simt_vf ()); + sctx->max_vf = ordered_max (sctx->max_vf, omp_max_simd_vf ()); + } if (maybe_gt (sctx->max_vf, 1U)) { tree c = omp_find_clause (gimple_omp_for_clauses (ctx->stmt), diff --git a/gcc/testsuite/gcc.dg/gomp/target-vf.c b/gcc/testsuite/gcc.dg/gomp/target-vf.c new file mode 100644 index 00000000000..14cea45e53c --- /dev/null +++ b/gcc/testsuite/gcc.dg/gomp/target-vf.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-fopenmp -O2 -fdump-tree-omplower" } */ + +/* Ensure that the omp_max_vf, omp_max_simt_vf, and omp_max_simd_vf are working + properly to set the OpenMP vectorization factor for the offload target, and + not just for the host. */ + +float +foo (float * __restrict x, float * __restrict y) +{ + float sum = 0.0; + +#pragma omp target teams distribute parallel for simd map(tofrom: sum) reduction(+:sum) + for (int i=0; i<1024; i++) + sum += x[i] * y[i]; + + return sum; +} + +/* { dg-final { scan-tree-dump "safelen\\(64\\)" "omplower" { target amdgcn_offloading_enabled } } } */ +/* { dg-final { scan-tree-dump "safelen\\(32\\)" "omplower" { target { { nvptx_offloading_enabled } && { ! amdgcn_offloading_enabled } } } } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 4ed7b25b9a4..363354be461 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -1025,6 +1025,16 @@ proc check_effective_target_offloading_enabled {} { return [check_configured_with "--enable-offload-targets"] } +# Return 1 if compiled with --enable-offload-targets=amdgcn +proc check_effective_target_amdgcn_offloading_enabled {} { + return [check_configured_with {--enable-offload-targets=[^ ]*amdgcn}] +} + +# Return 1 if compiled with --enable-offload-targets=amdgcn +proc check_effective_target_nvptx_offloading_enabled {} { + return [check_configured_with {--enable-offload-targets=[^ ]*nvptx}] +} + # Return 1 if compilation with -fopenacc is error-free for trivial # code, 0 otherwise.