From: "amonakov at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/106902] [11/12/13 Regression] Program compiled with -O3 -mfma produces different result
Date: Wed, 14 Sep 2022 15:20:13 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106902

Alexander Monakov changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amonakov at gcc dot gnu.org

--- Comment #6 from Alexander Monakov ---
This is a lovely showcase of how optimizations cooperatively produce
something unexpected.

TL;DR: SLP introduces redundant computations, and then fma formation
contracts some (but not all) of those, dramatically reducing numerical
stability.

In principle that's similar to incorrectly "optimizing"

double f(double x)
{
    double y = x * x;
    return y - y;
}

(which is guaranteed to return either NaN or 0) to

double f(double x)
{
    return fma(x, x, -(x * x));
}

which returns the round-off tail of x * x (or NaN). I think there's already
another bug with a similar root cause.
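As a side note, here is a minimal sketch (not from the original report) of
what "round-off tail" means; the value of x is just an illustrative pick
where x * x does not fit in a double:

#include <stdio.h>
#include <math.h>

int main(void)
{
    /* build e.g.: gcc -O2 -ffp-contract=off tail.c -lm */
    double x = 1 + 0x1p-30;      /* exact x * x needs more than 53 bits */
    double p = x * x;            /* rounded product: 1 + 0x1p-29 */
    double d = p - p;            /* exactly zero */
    double t = fma(x, x, -p);    /* exact product minus rounded product */
    printf("p - p          = %a\n", d);  /* zero */
    printf("round-off tail = %a\n", t);  /* 0x1p-60 for this x */
    return 0;
}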
In this bug, we begin with (note, all following examples are supposed to be
compiled without fma contraction, i.e. -O0, plain -O2, or -O2
-ffp-contract=off if your target has fma):

#include <stdio.h>
#include <math.h>

double one = 1;

double b1 = 0x1.70e906b54fe4fp+1;
double b2 = -0x1.62adb4752c14ep+1;
double b3 = 0x1.c7001a6f3bd8p-1;
double B = 0x1.29c9034e7cp-13;

void f1(void)
{
    double x1 = 0, x2 = 0, x3 = 0;
    for (int i = 0; i < 99; ) {
        double t = B * one + x1 * b1 + x2 * b2 + x3 * b3;
        printf("%d %g\t%a\n", i++, t, t);
        x3 = x2, x2 = x1, x1 = t;
    }
}

predcom unrolls by 3 to get rid of moves:

void f2(void)
{
    double x1 = 0, x2 = 0, x3 = 0;
    for (int i = 0; i < 99; ) {
        x3 = B * one + x1 * b1 + x2 * b2 + x3 * b3;
        printf("%d %g\t%a\n", i++, x3, x3);
        x2 = B * one + x3 * b1 + x1 * b2 + x2 * b3;
        printf("%d %g\t%a\n", i++, x2, x2);
        x1 = B * one + x2 * b1 + x3 * b2 + x1 * b3;
        printf("%d %g\t%a\n", i++, x1, x1);
    }
}

SLP introduces some redundant vector computations:

typedef double f64v2 __attribute__((vector_size(16)));

void f3(void)
{
    double x1 = 0, x2 = 0, x3 = 0;
    f64v2 x32 = { 0 }, x21 = { 0 };
    for (int i = 0; i < 99; ) {
        x3 = B * one + x21[1] * b1 + x2 * b2 + x3 * b3;
        f64v2 x13b1 = { x21[1] * b1, x3 * b1 };
        x32 = B * one + x13b1 + x21 * b2 + x32 * b3;

        x2 = B * one + x3 * b1 + x1 * b2 + x2 * b3;
        f64v2 x13b2 = { b2 * x1, b2 * x32[0] };
        x21 = B * one + x32 * b1 + x13b2 + x21 * b3;

        x1 = B * one + x2 * b1 + x32[0] * b2 + x1 * b3;

        printf("%d %g\t%a\n", i++, x32[0], x32[0]);
        printf("%d %g\t%a\n", i++, x32[1], x32[1]);
        printf("%d %g\t%a\n", i++, x21[1], x21[1]);
    }
}

Note that this is still bit-identical to the initial function. But then
tree-ssa-math-opts "randomly" forms some FMAs:

f64v2 vfma(f64v2 x, f64v2 y, f64v2 z)
{
    return (f64v2){ fma(x[0], y[0], z[0]), fma(x[1], y[1], z[1]) };
}

void f4(void)
{
    f64v2 vone = { one, one }, vB = { B, B };
    f64v2 vb1 = { b1, b1 }, vb2 = { b2, b2 }, vb3 = { b3, b3 };
    double x1 = 0, x2 = 0, x3 = 0;
    f64v2 x32 = { 0 }, x21 = { 0 };
    for (int i = 0; i < 99; ) {
        x3 = fma(b3, x3, fma(b2, x2, fma(B, one, x21[1] * b1)));
        f64v2 x13b1 = { x21[1] * b1, x3 * b1 };
        x32 = vfma(vb3, x32, vfma(vb2, x21, vfma(vB, vone, x13b1)));

        x2 = fma(b3, x2, b2 * x1 + fma(B, one, x3 * b1));
        f64v2 x13b2 = { b2 * x1, b2 * x32[0] };
        x21 = vfma(vb3, x21, x13b2 + vfma(vB, vone, x32 * vb1));

        x1 = fma(b3, x1, b2 * x32[0] + fma(B, one, b1 * x2));

        printf("%d %g\t%a\n", i++, x32[0], x32[0]);
        printf("%d %g\t%a\n", i++, x32[1], x32[1]);
        printf("%d %g\t%a\n", i++, x21[1], x21[1]);
    }
}

And here some of the redundantly computed values come out differently
depending on where the rounding after a multiplication was omitted. Somehow
this is enough to make the computation explode numerically.
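For completeness, a hypothetical driver (not part of the original comment)
that makes the divergence easy to observe; it assumes the functions above
live in one file, say fma-bug.c:

/* Hypothetical driver, not from the original report.  Build e.g.
       gcc -O2 -ffp-contract=off fma-bug.c -lm
   so that the only contractions are the explicit fma()/vfma() calls in f4,
   then compare the two traces (e.g. with diff on the captured output). */
int main(void)
{
    puts("f1 (scalar reference, no contraction):");
    f1();
    puts("f4 (SLP + partial fma formation):");
    f4();
    return 0;
}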