From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id C12223858400; Mon, 18 Oct 2021 10:05:49 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C12223858400 From: "jakub at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/102789] [12 regression] libgomp.c++/simd-3.C fails after r12-4340 for 32 bits Date: Mon, 18 Oct 2021 10:05:49 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 12.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: jakub at gcc dot gnu.org X-Bugzilla-Status: NEW X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: 12.0 X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: everconfirmed component bug_status cf_reconfirmed_on cc Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 18 Oct 2021 10:05:49 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102789 Jakub Jelinek changed: What |Removed |Added ---------------------------------------------------------------------------- Ever confirmed|0 |1 Component|libgomp |target Status|UNCONFIRMED |NEW Last reconfirmed| |2021-10-18 CC| |dje at gcc dot gnu.org, | |segher at gcc dot gnu.org --- Comment #2 from Jakub Jelinek --- Ok, I can reproduce, but only with -mcpu=power7. The cost model patch just uncovered a latent power7 vectorization bug (backend or vectorizer) I'd say. 
I've instrumented the testcase a little bit: // { dg-do run } // { dg-additional-options "-msse2" { target sse2_runtime } } // { dg-additional-options "-mavx" { target avx_runtime } } extern "C" void abort (); int a[1024] __attribute__((aligned (32))) = { 1 }; int b[1024] __attribute__((aligned (32))) = { 1 }; unsigned char c[1024] __attribute__((aligned (32))) = { 1 }; int k, m; __UINTPTR_TYPE__ u, u2, u3; __attribute__((noinline, noclone)) int foo (int *p) { int i, s = 0, s2 = 0, t, t2; #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s) \ lastprivate (t2) for (i = 0; i < 512; i++) { a[i] *= p[i]; t2 = k + p[i]; k += m + 1; s += p[i] + k; c[i]++; } #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \ lastprivate (t, u, u2, u3) for (i = 512; i < 1024; i++) { a[i] *= p[i]; k += m + 1; t = k + p[i]; u = (__UINTPTR_TYPE__) &k; u2 = (__UINTPTR_TYPE__) &s2; u3 = (__UINTPTR_TYPE__) &t; s2 += t; c[i]++; } __builtin_printf ("foo %d %d %d %d\n", s, s2, t, t2); return s + s2 + t + t2; } __attribute__((noinline, noclone)) long int bar (int *p, long int n, long int o) { long int i, s = 0, s2 = 0, t, t2; #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s) \ lastprivate (t2) for (i = 0; i < n; i++) { a[i] *= p[i]; t2 = k + p[i]; k += m + 1; s += p[i] + k; c[i]++; } #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \ lastprivate (t, u, u2, u3) for (i = n; i < o; i++) { a[i] *= p[i]; k += m + 1; t = k + p[i]; u = (__UINTPTR_TYPE__) &k; u2 = (__UINTPTR_TYPE__) &s2; u3 = (__UINTPTR_TYPE__) &t; s2 += t; c[i]++; } __builtin_printf ("bar %d %d %d %d\n", s, s2, t, t2); return s + s2 + t + t2; } int main () { #if __SIZEOF_INT__ >= 4 int i; k = 4; m = 2; for (i = 0; i < 1024; i++) { a[i] = i - 512; b[i] = (i - 51) % 39; c[i] = (unsigned char) i; } int s = foo (b); for (i = 0; i < 1024; i++) { if (b[i] != (i - 51) % 39 || 
a[i] != (i - 512) * b[i] || c[i] != (unsigned char) (i + 1)) { __builtin_printf ("#1 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } a[i] = i - 512; } if (k != 4 + 3 * 1024 || s != 1596127 + (4 + 3 * 511 + b[511]) + (4 + 3 * 1024 + b[1023])) { __builtin_printf ("#2 %d %d\n", k, s); abort (); } k = 4; s = bar (b, 512, 1024); for (i = 0; i < 1024; i++) { if (b[i] != (i - 51) % 39 || a[i] != (i - 512) * b[i] || c[i] != (unsigned char) (i + 2)) { __builtin_printf ("#3 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } a[i] = i - 512; } if (k != 4 + 3 * 1024 || s != 1596127 + (4 + 3 * 511 + b[511]) + (4 + 3 * 1024 + b[1023])) { __builtin_printf ("#4 %d %d\n", k, s); abort (); } k = 4; s = bar (b, 511, 1021); for (i = 0; i < 1021; i++) { if (b[i] != (i - 51) % 39 || a[i] != (i - 512) * b[i] || c[i] != (unsigned char) (i + 3)) { __builtin_printf ("#5 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } a[i] = i - 512; } for (i = 1021; i < 1024; i++) if (b[i] != (i - 51) % 39 || a[i] != i - 512 || c[i] != (unsigned char) (i + 2)) { __builtin_printf ("#6 %d %d %d %d\n", i, b[i], a[i], c[i]); abort (); } if (k != 4 + 3 * 1021 || s != 1586803 + (4 + 3 * 510 + b[510]) + (4 + 3 * 1021 + b[1020])) { __builtin_printf ("#7 %d %d %d %d\n", k, s, b[510], b[1020]); abort (); } #endif return 0; } When compiled with -O2 -m32 -fopenmp -mcpu=power6, this prints: foo 403860 1192267 3112 1568 bar 403860 1192267 3112 1568 bar 402289 1184514 3100 1564 while with -O2 -m32 -fopenmp -mcpu=power7 it prints: foo 403860 1192267 3112 1568 bar 403860 1192267 3112 1568 bar 402289 919217 3100 1564 #7 3067 1326170 30 33 Aborted which seems to suggest it is the: #pragma omp simd aligned(a, b, p : 32) linear(k: m + 1) reduction(+:s2) \ lastprivate (t, u, u2, u3) for (i = n; i < o; i++) { a[i] *= p[i]; k += m + 1; t = k + p[i]; u = (__UINTPTR_TYPE__) &k; u2 = (__UINTPTR_TYPE__) &s2; u3 = (__UINTPTR_TYPE__) &t; s2 += t; c[i]++; } loop 
that is miscompiled and miscomputes s2. Now, the loop is invoked twice, once with n = 512 and o = 1024 and in that case it works fine, and then with n = 511 and o = 1021 and in that case it misbehaves, so I bet it must be related to the prologue or epilogue loops. Now, if I compile with -O2 -m32 -fopenmp -mcpu=power7 -fvect-cost-model=unlimited -fsimd-cost-model=unlimited, it is miscompiled the same way already in r9-1520-g42c5d1212ff6544be1061d488aa7ebee9463c375 (haven't bisected fully), but certainly r5-370-ged15c5984e10f6556dffdf397accff804bf60a7c through r9-1052-gfa725532c41ae543fd0078263ea348aa5af3997d have been ICEing on it instead: simd-3.C: In function ‘long int bar(int*, long int, long int)’: simd-3.C:44:1: internal compiler error: in vect_get_store_cost, at tree-vect-stmts.c:1123 bar (int *p, long int n, long int o) ^~~ 0x1510f87 vect_get_store_cost(_stmt_vec_info*, int, unsigned int*, vec*) ../../gcc/tree-vect-stmts.c:1123 0x1510da0 vect_model_store_cost ../../gcc/tree-vect-stmts.c:1057 0x152200b vectorizable_store ../../gcc/tree-vect-stmts.c:6396 0x152cbf2 vect_analyze_stmt(gimple*, bool*, _slp_tree*, _slp_instance*, vec*) ../../gcc/tree-vect-stmts.c:9550 0x153a01f vect_analyze_loop_operations ../../gcc/tree-vect-loop.c:1655 0x153ad28 vect_analyze_loop_2 ../../gcc/tree-vect-loop.c:2050 0x153bd78 vect_analyze_loop(loop*, _loop_vec_info*) ../../gcc/tree-vect-loop.c:2343 0x157062a vectorize_loops() ../../gcc/tree-vectorizer.c:758 0x14196b5 execute ../../gcc/tree-ssa-loop.c:414 Note, r5-370 would ICE with it even with just -O3 -fopenmp -m32 -mcpu=power7 or -O2 -fopenmp -m32 -mcpu=power7 -fvect-cost-model=unlimited.