From: "amonakov at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/106902] [11/12/13 Regression] Program compiled with -O3 -mfma produces different result
Date: Wed, 14 Sep 2022 15:20:13 +0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106902

Alexander Monakov changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |amonakov at gcc dot gnu.org

--- Comment #6 from Alexander Monakov ---
This is a lovely showcase of how optimizations cooperatively produce
something unexpected.

TL;DR: SLP introduces redundant computations, and then fma formation
contracts some (but not all) of those, dramatically reducing numerical
stability.

In principle that's similar to incorrectly "optimizing"

double f(double x)
{
    double y = x * x;
    return y - y;
}

(which is guaranteed to return either NaN or 0) to

double f(double x)
{
    return fma(x, x, -(x * x));
}

which returns the round-off tail of x * x (or NaN). I think there's already
another bug with a similar root cause.
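As a side note, here is a minimal sketch (not from the original report) of
what "round-off tail" means; the value of x is just an illustrative pick
where x * x does not fit in a double:

#include <stdio.h>
#include <math.h>

int main(void)
{
    /* build e.g.: gcc -O2 -ffp-contract=off tail.c -lm */
    double x = 1 + 0x1p-30;      /* exact x * x needs more than 53 bits */
    double p = x * x;            /* rounded product: 1 + 0x1p-29 */
    double d = p - p;            /* exactly zero */
    double t = fma(x, x, -p);    /* exact product minus rounded product */
    printf("p - p          = %a\n", d);  /* zero */
    printf("round-off tail = %a\n", t);  /* 0x1p-60 for this x */
    return 0;
}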
In this bug, we begin with (note, all following examples are supposed to be
compiled without fma contraction, i.e. -O0, plain -O2, or -O2
-ffp-contract=off if your target has fma):

#include <stdio.h>
#include <math.h>

double one = 1;

double b1 = 0x1.70e906b54fe4fp+1;
double b2 = -0x1.62adb4752c14ep+1;
double b3 = 0x1.c7001a6f3bd8p-1;
double B = 0x1.29c9034e7cp-13;

void f1(void)
{
    double x1 = 0, x2 = 0, x3 = 0;
    for (int i = 0; i < 99; ) {
        double t = B * one + x1 * b1 + x2 * b2 + x3 * b3;
        printf("%d %g\t%a\n", i++, t, t);
        x3 = x2, x2 = x1, x1 = t;
    }
}

predcom unrolls by 3 to get rid of moves:

void f2(void)
{
    double x1 = 0, x2 = 0, x3 = 0;
    for (int i = 0; i < 99; ) {
        x3 = B * one + x1 * b1 + x2 * b2 + x3 * b3;
        printf("%d %g\t%a\n", i++, x3, x3);
        x2 = B * one + x3 * b1 + x1 * b2 + x2 * b3;
        printf("%d %g\t%a\n", i++, x2, x2);
        x1 = B * one + x2 * b1 + x3 * b2 + x1 * b3;
        printf("%d %g\t%a\n", i++, x1, x1);
    }
}

SLP introduces some redundant vector computations:

typedef double f64v2 __attribute__((vector_size(16)));

void f3(void)
{
    double x1 = 0, x2 = 0, x3 = 0;
    f64v2 x32 = { 0 }, x21 = { 0 };
    for (int i = 0; i < 99; ) {
        x3 = B * one + x21[1] * b1 + x2 * b2 + x3 * b3;
        f64v2 x13b1 = { x21[1] * b1, x3 * b1 };
        x32 = B * one + x13b1 + x21 * b2 + x32 * b3;

        x2 = B * one + x3 * b1 + x1 * b2 + x2 * b3;
        f64v2 x13b2 = { b2 * x1, b2 * x32[0] };
        x21 = B * one + x32 * b1 + x13b2 + x21 * b3;

        x1 = B * one + x2 * b1 + x32[0] * b2 + x1 * b3;

        printf("%d %g\t%a\n", i++, x32[0], x32[0]);
        printf("%d %g\t%a\n", i++, x32[1], x32[1]);
        printf("%d %g\t%a\n", i++, x21[1], x21[1]);
    }
}

Note that this is still bit-identical to the initial function. But then
tree-ssa-math-opts "randomly" forms some FMAs:

f64v2 vfma(f64v2 x, f64v2 y, f64v2 z)
{
    return (f64v2){ fma(x[0], y[0], z[0]), fma(x[1], y[1], z[1]) };
}

void f4(void)
{
    f64v2 vone = { one, one }, vB = { B, B };
    f64v2 vb1 = { b1, b1 }, vb2 = { b2, b2 }, vb3 = { b3, b3 };
    double x1 = 0, x2 = 0, x3 = 0;
    f64v2 x32 = { 0 }, x21 = { 0 };
    for (int i = 0; i < 99; ) {
        x3 = fma(b3, x3, fma(b2, x2, fma(B, one, x21[1] * b1)));
        f64v2 x13b1 = { x21[1] * b1, x3 * b1 };
        x32 = vfma(vb3, x32, vfma(vb2, x21, vfma(vB, vone, x13b1)));

        x2 = fma(b3, x2, b2 * x1 + fma(B, one, x3 * b1));
        f64v2 x13b2 = { b2 * x1, b2 * x32[0] };
        x21 = vfma(vb3, x21, x13b2 + vfma(vB, vone, x32 * vb1));

        x1 = fma(b3, x1, b2 * x32[0] + fma(B, one, b1 * x2));

        printf("%d %g\t%a\n", i++, x32[0], x32[0]);
        printf("%d %g\t%a\n", i++, x32[1], x32[1]);
        printf("%d %g\t%a\n", i++, x21[1], x21[1]);
    }
}

And here some of the redundantly computed values come out differently
depending on where the rounding after a multiplication was omitted. Somehow
this is enough to make the computation explode numerically.
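For completeness, a hypothetical driver (not part of the original comment)
that makes the divergence easy to observe; it assumes the functions above
live in one file, say fma-bug.c:

/* Hypothetical driver, not from the original report.  Build e.g.
       gcc -O2 -ffp-contract=off fma-bug.c -lm
   so that the only contractions are the explicit fma()/vfma() calls in f4,
   then compare the two traces (e.g. with diff on the captured output). */
int main(void)
{
    puts("f1 (scalar reference, no contraction):");
    f1();
    puts("f4 (SLP + partial fma formation):");
    f4();
    return 0;
}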