From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 442963858D37; Thu, 29 Jun 2023 01:38:45 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 442963858D37 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1688002725; bh=yDp9f901ZpPAt8FAAUAm/YxfpWuluKKKMNKnts5abic=; h=From:To:Subject:Date:From; b=L2aR57EZADLtVXuR6aHimK9HtQddcH4Ih55j/Hvu1Ns4PrgfRDjrqNrQYO5zEclup s3TyNmHc325uR1+P9/3WsRHbSZ7Dix1/r5xNxvTg1a0uLkdhKFcFbK4EJqG1X3Mcck sZJrtLaWtodK7I9oVPkRsnX3GNybTO7DIGT3tXe0= From: "ryanpholt at me dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/110472] New: 60% slowdown with fwrapv when using openmp Date: Thu, 29 Jun 2023 01:38:44 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Version: 10.2.1 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: ryanpholt at me dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone attachments.created Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D110472 Bug ID: 110472 Summary: 60% slowdown with fwrapv when using openmp Product: gcc Version: 10.2.1 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: ryanpholt at me dot com Target Milestone: --- Created attachment 55423 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=3D55423&action=3Dedit Reproduction file 
Compiling the attached example with -fwrapv inhibits some optimizations and results in a massive slowdown. It appears to be related to the use of OpenMP. I know that -fwrapv can result in slowdowns; however, I do not believe that it needs to in this example. In the loop nest below, gcc appears to believe the computations with the loop induction variables (e.g., 'i2 = i * 21504 + i1 * 96;') will overflow. However, the code is looping over fixed-size data and so I believe gcc should be able to determine that overflow is not possible. Perhaps some range analysis stops working across the OpenMP runtime boundary? The issue is fixed if I remove the OpenMP pragma. The issue is also fixed if I change the loop induction variables to be declared as int64_t rather than int. I also tried Clang 16 and did not observe the issue with -fwrapv. #pragma omp parallel for \ num_threads(omp_get_max_threads()) \ private(i1,u0,b_u0,i2,i3,i4,i5,i6,i7,i8,i10,i12,i14,i15) \ firstprivate(r2) for (i = 0; i < 7; i++) { for (i1 = 0; i1 < 7; i1++) { u0 = i * -32 + 222; if (u0 > 32) { u0 = 32; } b_u0 = i1 * -32 + 222; if (b_u0 > 32) { b_u0 = 32; } i2 = i * 21504 + i1 * 96; i3 = i * 227328 + (i1 << 10); for (i4 = 0; i4 < u0; i4++) { for (i5 = 0; i5 < b_u0; i5++) { for (i6 = 0; i6 < 3; i6++) { i7 = i5 + i6; for (i8 = 0; i8 < 3; i8++) { for (i10 = 0; i10 < 3; i10++) { i12 = i4 + i10; for (i14 = 0; i14 < 32; i14++) { i15 = ((i3 + 7104 * i4) + (i5 << 5)) + i14; (r2)[i15] += ((float *)&in_0[0][0][0][0])[((i2 + 672 * i12) + 3 * i7) + i8] * ((float *)&__constant_3x3x3x32xf32[0][0][0][0]) [((288 * i10 + 96 * i6) + (i8 << 5)) + i14]; } } } } } } } } Repro: gcc -O3 -fopenmp -lpthread -fwrapv predict.i ./a.out (Remove the -fwrapv to observe a major speedup) Using built-in specs.
COLLECT_GCC=gcc COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/10/lto-wrapper OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa:hsa OFFLOAD_TARGET_DEFAULT=1 Target: x86_64-linux-gnu Configured with: ../src/configure -v --with-pkgversion='Debian 10.2.1-6' --with-bugurl=file:///usr/share/doc/gcc-10/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++,m2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-10 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-bootstrap --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --enable-libphobos-checking=release --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-nvptx/usr,amdgcn-amdhsa=/build/gcc-10-Km9U7s/gcc-10-10.2.1/debian/tmp-gcn/usr,hsa --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu --with-build-config=bootstrap-lto-lean --enable-link-mutex Thread model: posix Supported LTO compression algorithms: zlib zstd gcc version 10.2.1 20210110 (Debian 10.2.1-6) COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-fopenmp' '-fwrapv' '-mtune=generic' '-march=x86-64' '-pthread' /usr/lib/gcc/x86_64-linux-gnu/10/cc1 -E -quiet -v -imultiarch x86_64-linux-gnu -D_REENTRANT main.c -mtune=generic -march=x86-64 -fopenmp -fwrapv -O3 -fpch-preprocess -fasynchronous-unwind-tables -o main.i ignoring nonexistent directory
"/usr/local/include/x86_64-linux-gnu" ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/10/include-fixed" ignoring nonexistent directory "/usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include" #include "..." search starts here: #include <...> search starts here: /usr/lib/gcc/x86_64-linux-gnu/10/include /usr/local/include /usr/include/x86_64-linux-gnu /usr/include End of search list. COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-fopenmp' '-fwrapv' '-mtune=generic' '-march=x86-64' '-pthread' /usr/lib/gcc/x86_64-linux-gnu/10/cc1 -fpreprocessed main.i -quiet -dumpbase main.c -mtune=generic -march=x86-64 -auxbase main -O3 -version -fopenmp -fwrapv -fasynchronous-unwind-tables -o main.s GNU C17 (Debian 10.2.1-6) version 10.2.1 20210110 (x86_64-linux-gnu) compiled by GNU C version 10.2.1 20210110, GMP version 6.2.1, MPFR version 4.1.0, MPC version 1.2.0, isl version isl-0.23-GMP GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 GNU C17 (Debian 10.2.1-6) version 10.2.1 20210110 (x86_64-linux-gnu) compiled by GNU C version 10.2.1 20210110, GMP version 6.2.1, MPFR version 4.1.0, MPC version 1.2.0, isl version isl-0.23-GMP GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 Compiler executable checksum: 1f803793fa2e3418c492b25e7d3eac2f COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-fopenmp' '-fwrapv' '-mtune=generic' '-march=x86-64' '-pthread' as -v --64 -o main.o main.s GNU assembler version 2.35.2 (x86_64-linux-gnu) using BFD version (GNU Binutils for Debian) 2.35.2 COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/10/:/usr/lib/gcc/x86_64-linux-gnu/10/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/10/:/usr/lib/gcc/x86_64-linux-gnu/ LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/10/:/usr/lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/10/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/10/../../../:/lib/:/usr/lib/
Reading specs from /usr/lib/gcc/x86_64-linux-gnu/10/libgomp.spec COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-fopenmp' '-fwrapv' '-mtune=generic' '-march=x86-64' '-pthread' /usr/lib/gcc/x86_64-linux-gnu/10/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/10/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/10/lto-wrapper -plugin-opt=-fresolution=main.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lpthread -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -pie /usr/lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/Scrt1.o /usr/lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/10/crtbeginS.o /usr/lib/gcc/x86_64-linux-gnu/10/crtoffloadbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/10 -L/usr/lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/10/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/10/../../.. -lpthread main.o -lgomp -lgcc --push-state --as-needed -lgcc_s --pop-state -lpthread -lc -lgcc --push-state --as-needed -lgcc_s --pop-state /usr/lib/gcc/x86_64-linux-gnu/10/crtendS.o /usr/lib/gcc/x86_64-linux-gnu/10/../../../x86_64-linux-gnu/crtn.o /usr/lib/gcc/x86_64-linux-gnu/10/crtoffloadend.o COLLECT_GCC_OPTIONS='-v' '-save-temps' '-O3' '-fopenmp' '-fwrapv' '-mtune=generic' '-march=x86-64' '-pthread'=