From f1ae4a3675ad1147aaa88405be9f000ceed703bc Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Tue, 25 Apr 2023 23:53:12 +0200 Subject: [PATCH] Support parallel testing in libgomp, part II [PR66005] ..., and enable if 'flock' is available for serializing execution testing. Regarding the default of 19 parallel slots, this turned out to be a local minimum for wall time when testing this on: $ uname -srvi Linux 4.2.0-42-generic #49~14.04.1-Ubuntu SMP Wed Jun 29 20:22:11 UTC 2016 x86_64 $ grep '^model name' < /proc/cpuinfo | uniq -c 32 model name : Intel(R) Xeon(R) CPU E5-2640 v3 @ 2.60GHz ... in two configurations: case (a) standard configuration, no offloading configured, case (b) offloading for GCN and nvptx configured but no devices available. For both cases, default plus '-m32' variant. $ \time make check-target-libgomp RUNTESTFLAGS="--target_board=unix\{,-m32\}" Case (a), baseline: 6432.23user 332.38system 47:32.28elapsed 237%CPU (0avgtext+0avgdata 505044maxresident)k 6382.43user 319.21system 47:06.04elapsed 237%CPU (0avgtext+0avgdata 505172maxresident)k This is what people have been complaining about, rightly so, in "libgomp make check time is excessive" and elsewhere. Case (a), parallelized: -j12 GCC_TEST_PARALLEL_SLOTS=10 3088.49user 267.74system 6:43.82elapsed 831%CPU (0avgtext+0avgdata 505188maxresident)k -j15 GCC_TEST_PARALLEL_SLOTS=15 3308.08user 294.79system 5:56.04elapsed 1011%CPU (0avgtext+0avgdata 505360maxresident)k -j17 GCC_TEST_PARALLEL_SLOTS=17 3539.93user 298.99system 5:27.86elapsed 1170%CPU (0avgtext+0avgdata 505112maxresident)k -j18 GCC_TEST_PARALLEL_SLOTS=18 3697.50user 317.18system 5:14.63elapsed 1275%CPU (0avgtext+0avgdata 505360maxresident)k -j19 GCC_TEST_PARALLEL_SLOTS=19 3765.94user 324.27system 5:13.22elapsed 1305%CPU (0avgtext+0avgdata 505128maxresident)k -j20 GCC_TEST_PARALLEL_SLOTS=20 3684.66user 312.32system 5:15.26elapsed 1267%CPU (0avgtext+0avgdata 505100maxresident)k -j23 GCC_TEST_PARALLEL_SLOTS=23 4040.59user 347.10system 5:29.12elapsed 1333%CPU (0avgtext+0avgdata 505200maxresident)k -j26 GCC_TEST_PARALLEL_SLOTS=26 3973.24user 377.96system 5:24.70elapsed 1340%CPU (0avgtext+0avgdata 505160maxresident)k -j32 GCC_TEST_PARALLEL_SLOTS=32 4004.42user 346.10system 5:16.11elapsed 1376%CPU (0avgtext+0avgdata 505160maxresident)k Yay! Case (b), baseline; 2+ h: 7227.58user 700.54system 2:14:33elapsed 98%CPU (0avgtext+0avgdata 994264maxresident)k Case (b), parallelized: -j12 GCC_TEST_PARALLEL_SLOTS=10 7377.46user 777.52system 16:06.63elapsed 843%CPU (0avgtext+0avgdata 994344maxresident)k -j15 GCC_TEST_PARALLEL_SLOTS=15 8019.18user 721.42system 12:13.56elapsed 1191%CPU (0avgtext+0avgdata 994228maxresident)k -j17 GCC_TEST_PARALLEL_SLOTS=17 8530.11user 716.95system 10:45.92elapsed 1431%CPU (0avgtext+0avgdata 994176maxresident)k -j18 GCC_TEST_PARALLEL_SLOTS=18 8776.79user 645.89system 10:27.20elapsed 1502%CPU (0avgtext+0avgdata 994248maxresident)k -j19 GCC_TEST_PARALLEL_SLOTS=19 9332.37user 641.76system 10:15.09elapsed 1621%CPU (0avgtext+0avgdata 994260maxresident)k -j20 GCC_TEST_PARALLEL_SLOTS=20 9609.54user 789.88system 10:26.94elapsed 1658%CPU (0avgtext+0avgdata 994284maxresident)k -j23 GCC_TEST_PARALLEL_SLOTS=23 10362.40user 911.14system 10:44.47elapsed 1749%CPU (0avgtext+0avgdata 994208maxresident)k -j26 GCC_TEST_PARALLEL_SLOTS=26 11159.44user 850.99system 11:09.25elapsed 1794%CPU (0avgtext+0avgdata 994256maxresident)k -j32 GCC_TEST_PARALLEL_SLOTS=32 11453.50user 939.52system 11:00.38elapsed 1876%CPU (0avgtext+0avgdata 994240maxresident)k On my Dell Precision 7530 laptop: $ uname -srvi Linux 5.15.0-71-generic #78-Ubuntu SMP Tue Apr 18 09:00:29 UTC 2023 x86_64 $ grep '^model name' < /proc/cpuinfo | uniq -c 12 model name : Intel(R) Core(TM) i7-8850H CPU @ 2.60GHz $ nvidia-smi -L GPU 0: Quadro P1000 (UUID: GPU-e043973b-b52a-d02b-c066-a8fdbf64e8ea) ... in two configurations: case (c) standard configuration, no offloading configured, case (d) offloading for nvptx configured and device available. For both cases, only default variant, no '-m32'. $ \time make check-target-libgomp Case (c), baseline; roughly half of case (a) (just one variant): 1180.98user 110.80system 19:36.40elapsed 109%CPU (0avgtext+0avgdata 505148maxresident)k 1133.22user 111.08system 19:35.75elapsed 105%CPU (0avgtext+0avgdata 505212maxresident)k Case (c), parallelized: -j12 GCC_TEST_PARALLEL_SLOTS=2 1143.83user 110.76system 10:20.46elapsed 202%CPU (0avgtext+0avgdata 505216maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=6 1737.08user 143.94system 4:59.48elapsed 628%CPU (0avgtext+0avgdata 505200maxresident)k 1730.31user 143.02system 4:58.75elapsed 627%CPU (0avgtext+0avgdata 505152maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=8 2192.63user 169.34system 4:52.96elapsed 806%CPU (0avgtext+0avgdata 505216maxresident)k 2219.04user 167.67system 4:53.19elapsed 814%CPU (0avgtext+0avgdata 505152maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=10 2463.93user 184.98system 4:48.39elapsed 918%CPU (0avgtext+0avgdata 505200maxresident)k 2455.62user 183.68system 4:47.40elapsed 918%CPU (0avgtext+0avgdata 505216maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=12 2591.04user 192.64system 4:44.98elapsed 976%CPU (0avgtext+0avgdata 505216maxresident)k 2581.23user 195.21system 4:47.51elapsed 965%CPU (0avgtext+0avgdata 505212maxresident)k -j20 GCC_TEST_PARALLEL_SLOTS=20 [oversubscribe] 2613.18user 199.51system 4:44.06elapsed 990%CPU (0avgtext+0avgdata 505216maxresident)k Case (d), baseline (compared to case (b): only nvptx offloading compilation, but also nvptx offloading execution); ~1 h: 2841.93user 653.68system 1:02:26elapsed 93%CPU (0avgtext+0avgdata 909792maxresident)k 2842.03user 654.39system 1:02:24elapsed 93%CPU (0avgtext+0avgdata 909880maxresident)k Case (d), parallelized: -j12 GCC_TEST_PARALLEL_SLOTS=2 2856.39user 606.87system 33:58.64elapsed 169%CPU (0avgtext+0avgdata 909948maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=6 3444.90user 666.86system 18:37.57elapsed 367%CPU (0avgtext+0avgdata 909856maxresident)k 3462.13user 667.13system 18:36.87elapsed 369%CPU (0avgtext+0avgdata 909872maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=8 3929.74user 716.22system 18:02.36elapsed 429%CPU (0avgtext+0avgdata 909832maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=10 4152.84user 736.16system 17:43.05elapsed 459%CPU (0avgtext+0avgdata 909872maxresident)k -j12 GCC_TEST_PARALLEL_SLOTS=12 4209.60user 749.00system 17:35.20elapsed 469%CPU (0avgtext+0avgdata 909840maxresident)k -j20 GCC_TEST_PARALLEL_SLOTS=20 [oversubscribe] 4255.54user 756.78system 17:29.06elapsed 477%CPU (0avgtext+0avgdata 909868maxresident)k Worth noting is that with nvptx offloading, there is one execution test case that times out ('libgomp.fortran/reverse-offload-5.f90'). This effectively stalls progress for almost 5 min: quickly other executions test cases queue up on the lock for all parallel slots. That's working as expected; just noting this as it accordingly does skew the wall time numbers. PR libgomp/66005 libgomp/ * configure.ac: Look for 'flock'. * testsuite/Makefile.am (gcc_test_parallel_slots): Enable parallel testing. * testsuite/config/default.exp: Don't 'load_lib "standard.exp"' here... * testsuite/lib/libgomp.exp: ... but here, instead. (libgomp_load): Override for parallel testing. * testsuite/libgomp-site-extra.exp.in (FLOCK): Set. * configure: Regenerate. * Makefile.in: Regenerate. * testsuite/Makefile.in: Regenerate. --- libgomp/Makefile.in | 1 + libgomp/configure | 48 ++++++++++++++++++++- libgomp/configure.ac | 2 + libgomp/testsuite/Makefile.am | 3 +- libgomp/testsuite/Makefile.in | 4 +- libgomp/testsuite/config/default.exp | 2 - libgomp/testsuite/lib/libgomp.exp | 29 +++++++++++++ libgomp/testsuite/libgomp-site-extra.exp.in | 1 + 8 files changed, 84 insertions(+), 6 deletions(-) diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index 2c81ccacc1d..f1afb5ef57f 100644 --- a/libgomp/Makefile.in +++ b/libgomp/Makefile.in @@ -382,6 +382,7 @@ EXEEXT = @EXEEXT@ FC = @FC@ FCFLAGS = @FCFLAGS@ FGREP = @FGREP@ +FLOCK = @FLOCK@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ diff --git a/libgomp/configure b/libgomp/configure index fd0e337b578..489c9a00421 100755 --- a/libgomp/configure +++ b/libgomp/configure @@ -656,6 +656,7 @@ tmake_file XLDFLAGS XCFLAGS config_path +FLOCK CPU_COUNT LIBGOMP_BUILD_VERSIONED_SHLIB_SUN_FALSE LIBGOMP_BUILD_VERSIONED_SHLIB_SUN_TRUE @@ -11418,7 +11419,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 11421 "configure" +#line 11422 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -11524,7 +11525,7 @@ else lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2 lt_status=$lt_dlunknown cat > conftest.$ac_ext <<_LT_EOF -#line 11527 "configure" +#line 11528 "configure" #include "confdefs.h" #if HAVE_DLFCN_H @@ -16486,6 +16487,49 @@ $as_echo "unable to detect (assuming 1)" >&6; } fi +for ac_prog in flock +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_FLOCK+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$FLOCK"; then + ac_cv_prog_FLOCK="$FLOCK" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_FLOCK="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +FLOCK=$ac_cv_prog_FLOCK +if test -n "$FLOCK"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $FLOCK" >&5 +$as_echo "$FLOCK" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$FLOCK" && break +done + + # Get target configury. . ${srcdir}/configure.tgt CFLAGS="$save_CFLAGS $XCFLAGS" diff --git a/libgomp/configure.ac b/libgomp/configure.ac index a9b1f3973f7..0fab7168cfa 100644 --- a/libgomp/configure.ac +++ b/libgomp/configure.ac @@ -339,6 +339,8 @@ fi AX_COUNT_CPUS AC_SUBST(CPU_COUNT) +AC_CHECK_PROGS(FLOCK, flock) + # Get target configury. . ${srcdir}/configure.tgt CFLAGS="$save_CFLAGS $XCFLAGS" diff --git a/libgomp/testsuite/Makefile.am b/libgomp/testsuite/Makefile.am index eef02f4a8c2..0cc91ccc4d1 100644 --- a/libgomp/testsuite/Makefile.am +++ b/libgomp/testsuite/Makefile.am @@ -73,7 +73,8 @@ check_p_numbers4:=$(foreach i,$(check_p_numbers0),$(addprefix $(i),$(check_p_num check_p_numbers5:=$(addprefix 0,$(check_p_numbers3)) $(check_p_numbers4) check_p_numbers6:=$(foreach i,$(check_p_numbers0),$(addprefix $(i),$(check_p_numbers5))) check_p_numbers:=$(check_p_numbers0) $(check_p_numbers2) $(check_p_numbers4) $(check_p_numbers6) -gcc_test_parallel_slots:=1 +# If unable to serialize execution testing, use just one parallel slot. +gcc_test_parallel_slots:=$(if $(FLOCK),$(if $(GCC_TEST_PARALLEL_SLOTS),$(GCC_TEST_PARALLEL_SLOTS),19),1) check_p_subdirs=$(wordlist 1,$(gcc_test_parallel_slots),$(check_p_numbers)) check_DEJAGNU_libgomp_targets = $(addprefix check-DEJAGNUlibgomp,$(check_p_subdirs)) $(check_DEJAGNU_libgomp_targets): check-DEJAGNUlibgomp%: libgomp%/site.exp diff --git a/libgomp/testsuite/Makefile.in b/libgomp/testsuite/Makefile.in index 3b5903daed9..70cb5e42d7b 100644 --- a/libgomp/testsuite/Makefile.in +++ b/libgomp/testsuite/Makefile.in @@ -161,6 +161,7 @@ EXEEXT = @EXEEXT@ FC = @FC@ FCFLAGS = @FCFLAGS@ FGREP = @FGREP@ +FLOCK = @FLOCK@ GREP = @GREP@ INSTALL = @INSTALL@ INSTALL_DATA = @INSTALL_DATA@ @@ -309,7 +310,8 @@ check_p_numbers4 := $(foreach i,$(check_p_numbers0),$(addprefix $(i),$(check_p_n check_p_numbers5 := $(addprefix 0,$(check_p_numbers3)) $(check_p_numbers4) check_p_numbers6 := $(foreach i,$(check_p_numbers0),$(addprefix $(i),$(check_p_numbers5))) check_p_numbers := $(check_p_numbers0) $(check_p_numbers2) $(check_p_numbers4) $(check_p_numbers6) -gcc_test_parallel_slots := 1 +# If unable to serialize execution testing, use just one parallel slot. +gcc_test_parallel_slots := $(if $(FLOCK),$(if $(GCC_TEST_PARALLEL_SLOTS),$(GCC_TEST_PARALLEL_SLOTS),19),1) check_p_subdirs = $(wordlist 1,$(gcc_test_parallel_slots),$(check_p_numbers)) check_DEJAGNU_libgomp_targets = $(addprefix check-DEJAGNUlibgomp,$(check_p_subdirs)) all: all-am diff --git a/libgomp/testsuite/config/default.exp b/libgomp/testsuite/config/default.exp index b7afc82ff03..01569e6ab62 100644 --- a/libgomp/testsuite/config/default.exp +++ b/libgomp/testsuite/config/default.exp @@ -13,5 +13,3 @@ # You should have received a copy of the GNU General Public License # along with this program; see the file COPYING3. If not see # . - -load_lib "standard.exp" diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp index 4ea6f26c674..8512f2c1e64 100644 --- a/libgomp/testsuite/lib/libgomp.exp +++ b/libgomp/testsuite/lib/libgomp.exp @@ -9,6 +9,7 @@ proc load_gcc_lib { filename } { } load_lib dg.exp +load_lib standard.exp # Required to use gcc-dg.exp - however, the latter should NOT be # loaded until ${tool}_target_compile is defined since it uses that @@ -297,6 +298,34 @@ proc libgomp_option_proc { option } { } } +if ![info exists ::env(GCC_RUNTEST_PARALLELIZE_DIR)] { + # No parallel testing. +} elseif { $FLOCK == "" } { + # Using just one parallel slot. +} else { + # Using several parallel slots. Override DejaGnu + # 'standard.exp:${tool}_load'... + rename libgomp_load standard_libgomp_load + proc libgomp_load { program args } { + # ... in order to serialize execution testing via an exclusive lock. + set lock_file ../lock + set lock_kind --exclusive + set lock_fd [open $lock_file a+] + set lock_clock_begin [clock seconds] + global FLOCK + exec $FLOCK $lock_kind 0 <@ $lock_fd + set lock_clock_end [clock seconds] + verbose -log "Got ${FLOCK}('$lock_file', '$lock_kind') at [clock format $lock_clock_end] after [expr $lock_clock_end - $lock_clock_begin] s" 2 + + set result [standard_libgomp_load $program $args] + + # Unlock (implicit with 'close'). + close $lock_fd + + return $result + } +} + # Translate offload target to OpenACC device type. Return the empty string if # not supported, and 'host' for offload target 'disable'. proc offload_target_to_openacc_device_type { offload_target } { diff --git a/libgomp/testsuite/libgomp-site-extra.exp.in b/libgomp/testsuite/libgomp-site-extra.exp.in index c0d26660bad..0a3ba059c21 100644 --- a/libgomp/testsuite/libgomp-site-extra.exp.in +++ b/libgomp/testsuite/libgomp-site-extra.exp.in @@ -1 +1,2 @@ +set FLOCK {@FLOCK@} set GCC_UNDER_TEST {@CC@} -- 2.34.1