From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
 id C8F643858014; Wed,  2 Feb 2022 14:10:08 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C8F643858014
From: "tschwinge at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/104345] New: nvptx: "regression" after "nvptx:
 Transition nvptx backend to STORE_FLAG_VALUE = 1"
Date: Wed, 02 Feb 2022 14:10:08 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 12.0
X-Bugzilla-Keywords: openacc
X-Bugzilla-Severity: minor
X-Bugzilla-Who: tschwinge at gcc dot gnu.org
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status
 keywords bug_severity priority component assigned_to reporter cc
 target_milestone cf_gcctarget
Message-ID: <bug-104345-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-BeenThere: gcc-bugs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-bugs mailing list <gcc-bugs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Wed, 02 Feb 2022 14:10:08 -0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D104345

            Bug ID: 104345
           Summary: nvptx: "regression" after "nvptx: Transition nvptx
                    backend to STORE_FLAG_VALUE =3D 1"
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Keywords: openacc
          Severity: minor
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tschwinge at gcc dot gnu.org
                CC: sayle at gcc dot gnu.org, vries at gcc dot gnu.org
  Target Milestone: ---
            Target: nvptx

First, I do acknowledge that commit beed3f8f60492289ca6211d86c54a2254a642035
"nvptx: Transition nvptx backend to STORE_FLAG_VALUE =3D 1" generally does
improve nvptx code generation -- thanks!

I've however run into one case where it causes a regression:

    PASS: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
-DACC_DEVICE_TYPE_nvidia=3D1 -DACC_MEM_SHARED=3D0 -foffload=3Dnvptx-none  -=
O0  (test
for excess errors)
    [-PASS:-]{+FAIL:+}
libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
-DACC_DEVICE_TYPE_nvidia=3D1 -DACC_MEM_SHARED=3D0 -foffload=3Dnvptx-none  -=
O0=20
execution test
    PASS: libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
-DACC_DEVICE_TYPE_nvidia=3D1 -DACC_MEM_SHARED=3D0 -foffload=3Dnvptx-none  -=
O2  (test
for excess errors)
    [-PASS:-]{+FAIL:+}
libgomp.oacc-c/../libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
-DACC_DEVICE_TYPE_nvidia=3D1 -DACC_MEM_SHARED=3D0 -foffload=3Dnvptx-none  -=
O2=20
execution test

    libgomp: The Nvidia accelerator has insufficient resources to launch
'worker$_omp_fn$0' with num_workers =3D 32 and vector_length =3D 32; recomp=
ile the
program with 'num_workers =3D x and vector_length =3D y' on that offloaded =
region
or '-fopenacc-dim=3D:x:y' where x * y <=3D 896.

Same for C++.

That's with an Nvidia Tesla K20c, Driver Version: 346.46 -- so, rather old.

By the way: the subsequent commit 659f8161f61d3f75c3a47cf646147e8f7b4dcb34
"nvptx: Add support for PTX's cnot instruction" is not helpful or even rele=
vant
here; there are no 'cnot's appearing in the PTX code loaded to the GPU (per
'GOMP_DEBUG=3D1' execution).

Per 'diff' of  'GOMP_DEBUG=3D1' execution we indeed see *more* registers us=
ed
after "nvptx: Transition nvptx backend to STORE_FLAG_VALUE =3D 1" than befo=
re.=20
For '-O0':

    [...]
     Link log info    : 4 bytes gmem
     info    : Function properties for 'gang$_omp_fn$0':
    -info    : used 51 registers, 112 stack, 0 bytes smem, 328 bytes cmem[0=
],
16 bytes cmem[2], 0 bytes lmem
    +info    : used 68 registers, 112 stack, 0 bytes smem, 328 bytes cmem[0=
], 8
bytes cmem[2], 0 bytes lmem
     info    : Function properties for 'worker$_omp_fn$0':
    -info    : used 51 registers, 112 stack, 136 bytes smem, 328 bytes cmem=
[0],
16 bytes cmem[2], 0 bytes lmem
    +info    : used 68 registers, 112 stack, 136 bytes smem, 328 bytes cmem=
[0],
8 bytes cmem[2], 0 bytes lmem
     info    : Function properties for 'vector$_omp_fn$0':
    -info    : used 51 registers, 112 stack, 0 bytes smem, 328 bytes cmem[0=
],
16 bytes cmem[2], 0 bytes lmem
    +info    : used 68 registers, 112 stack, 0 bytes smem, 328 bytes cmem[0=
], 8
bytes cmem[2], 0 bytes lmem
       GOMP_OFFLOAD_openacc_exec: prepare mappings
       nvptx_exec: kernel vector$_omp_fn$0: launch gangs=3D1, workers=3D1,
vectors=3D32
       nvptx_exec: kernel vector$_omp_fn$0: finished
    -GOACC_parallel_keyed: mapnum=3D3, hostaddrs=3D0x7ffc760394a0, size=3D0=
x60bb30,
kinds=3D0x60bb48
    +GOACC_parallel_keyed: mapnum=3D3, hostaddrs=3D0x7fff99653530, size=3D0=
x60bad0,
kinds=3D0x60bae8
       GOMP_OFFLOAD_openacc_exec: prepare mappings
    +
    +libgomp: The Nvidia accelerator has insufficient resources to launch
'worker$_omp_fn$0' with num_workers =3D 32 and vector_length =3D 32; recomp=
ile the
program with 'num_workers =3D x and vector_length =3D y' on that offloaded =
region
or '-fopenacc-dim=3D:x:y' where x * y <=3D 896.
    -  nvptx_exec: kernel worker$_omp_fn$0: launch gangs=3D1, workers=3D32,
vectors=3D32
    -  nvptx_exec: kernel worker$_omp_fn$0: finished
    -GOACC_parallel_keyed: mapnum=3D3, hostaddrs=3D0x7ffc760394a0, size=3D0=
x60bb50,
kinds=3D0x60bb68
    -  GOMP_OFFLOAD_openacc_exec: prepare mappings
    -  nvptx_exec: kernel gang$_omp_fn$0: launch gangs=3D32, workers=3D1,
vectors=3D32
    -  nvptx_exec: kernel gang$_omp_fn$0: finished

Similar for '-O2', just with less stack usage.

Cross-checking with a more recent Driver Version: 450.119.03, I'm only seei=
ng
slightly increased register usage; 52 registers after "nvptx: Transition nv=
ptx
backend to STORE_FLAG_VALUE =3D 1" compared to 51 registers before:

    [...]
    Link log info    : 4 bytes gmem
    info    : Function properties for 'vector$_omp_fn$0':
    info    : used [-51-]{+52+} registers, 112 stack, 0 bytes smem, 328 byt=
es
cmem[0], 16 bytes cmem[2], 0 bytes lmem
    info    : Function properties for 'worker$_omp_fn$0':
    info    : used [-51-]{+52+} registers, 112 stack, 136 bytes smem, 328 b=
ytes
cmem[0], 16 bytes cmem[2], 0 bytes lmem
    info    : Function properties for 'gang$_omp_fn$0':
    info    : used [-51-]{+52+} registers, 112 stack, 0 bytes smem, 328 byt=
es
cmem[0], 16 bytes cmem[2], 0 bytes lmem
    [...]

This suggests that compared to before, after "nvptx: Transition nvptx backe=
nd
to STORE_FLAG_VALUE =3D 1" GCC is generating certain PTX code sequences tha=
t the
Driver/JIT fails to understand/optimize?  While not ideal, the code still
executes fine (with newish Driver/JIT), and I'm thus OK if we classify that=
 as
not worth looking into -- but I at least wanted to report my findings: maybe
there's a way to tune the GCC/nvptx code generation to the PTX -> SASS
compiler's liking?

Possibly (but that's just guessing!), the reason might be around the follow=
ing
PTX code change:

    [...]
    -setp.leu.f64 %r82,%r25,0d7fefffffffffffff;
    -@ ! %r82 bra $L3;
    +@ %r78 bra $L20;
    +setp.leu.f64 %r138,%r57,0d7fefffffffffffff;
    +bra $L3;
    +$L20:
     .loc 2 1976 21
    -setp.leu.f64 %r83,%r57,0d7fefffffffffffff;
    -@ %r83 bra $L19;
    +setp.leu.f64 %r138,%r57,0d7fefffffffffffff;
    +@ %r138 bra $L19;
     $L3:
    [...]

Judging from a quick look, I read this to mean that the originally
("before") unconditional 'setp.leu.f64 %r82,%r25,0d7fefffffffffffff;' is
now ("after") done conditionally.

Maybe related, maybe not: when curiously 'diff'ing the before vs. after
nvptx-none target libraries, I noticed amongst all the "noise" (improved co=
de
generation):

'nvptx-none/libatomic/gcas.o':

    [...]
     atom.cas.b32 %r137,[%r34],%r136,%r139;
     setp.eq.u32 %r140,%r137,%r136;
     selp.u32 %r138,1,0,%r140;
    -setp.ne.u32 %r141,%r138,0;
    -@ %r141 bra $L21;
    +@ %r140 bra $L18;
     st.u32 [%r201],%r137;
    -bra $L19;
    +$L18:
    +setp.eq.u32 %r142,%r138,0;
    +@ %r142 bra $L19;
     $L21:
    [...]

... which again looks like a pattern where an originally ("before")
unconditional 'setp.ne.u32 %r141,%r138,0;' is now ("after") done conditiona=
lly.

Similar in other files -- but I certainly didn't look in detail, and I'm
certainly not claiming this to be/cause any actual problem.


And, I've spotted a few cases where we're generating "maybe worse" code:

'nvptx-none/libgomp/openacc.o' (complete 'diff'):

    @@ -25,6 +25,7 @@
     .reg .u64 %r28;
     .reg .u32 %r29;
     .reg .u32 %r30;
    +.reg .pred %r31;
     mov.u64 %r27,%ar0;
     st.u64 [%frame+16],%r27;
     ld.u64 %r28,[%frame+16];
    @@ -38,8 +39,8 @@
     ld.param.u32 %r30,[%value_in];
     }
     mov.u32 %r23,%r30;
    -set.u32.ne.u32 %r24,%r23,0;
    -neg.s32 %r24,%r24;
    +setp.ne.u32 %r31,%r23,0;
    +selp.u32 %r24,1,0,%r31;
     st.u32 [%frame],%r24;
     ld.u32 %r25,[%frame];
     mov.u32 %r26,%r25;

'nvptx-none/newlib/libc/reent/lib_a-renamer.o' (complete 'diff'):

    @@ -28,6 +28,8 @@
     .reg .u32 %r32;
     .reg .pred %r33;
     .reg .u32 %r36;
    +.reg .u32 %r38;
    +.reg .pred %r39;
     mov.u64 %r26,%ar0;
     mov.u64 %r27,%ar1;
     mov.u64 %r28,%ar2;
    @@ -58,7 +60,9 @@
     ld.param.u32 %r36,[%value_in];
     }
     .loc 2 57 6
    -set.u32.eq.u32 %r25,%r36,-1;
    +setp.eq.u32 %r39,%r36,-1;
    +selp.u32 %r38,1,0,%r39;
    +neg.s32 %r25,%r38;
     $L1:
     .loc 2 64 1
     mov.u32 %value,%r25;

'nvptx-none/newlib/libc/stdio/lib_a-remove.o' (complete 'diff'):

    @@ -24,6 +24,8 @@
     .reg .u64 %r26;
     .reg .u64 %r27;
     .reg .u32 %r30;
    +.reg .u32 %r33;
    +.reg .pred %r34;
     mov.u64 %r26,%ar0;
     mov.u64 %r27,%ar1;
     .loc 2 65 7
    @@ -37,7 +39,9 @@
     ld.param.u32 %r30,[%value_in];
     }
     .loc 2 65 6
    -set.u32.eq.u32 %value,%r30,-1;
    +setp.eq.u32 %r34,%r30,-1;
    +selp.u32 %r33,1,0,%r34;
    +neg.s32 %value,%r33;
     .loc 2 69 1
     st.param.u32 [%value_out],%value;
     ret;
    @@ -51,6 +55,8 @@
     .reg .u64 %r27;
     .reg .u64 %r30;
     .reg .u32 %r31;
    +.reg .u32 %r34;
    +.reg .pred %r35;
     mov.u64 %r27,%ar0;
     .loc 2 65 7
     ld.global.u64 %r30,[_impure_ptr];
    @@ -64,7 +70,9 @@
     ld.param.u32 %r31,[%value_in];
     }
     .loc 2 65 6
    -set.u32.eq.u32 %value,%r31,-1;
    +setp.eq.u32 %r35,%r31,-1;
    +selp.u32 %r34,1,0,%r35;
    +neg.s32 %value,%r34;
     .loc 2 77 1
     st.param.u32 [%value_out],%value;
     ret;

'nvptx-none/newlib/libm/common/lib_a-s_rint.o' (complete 'diff'):

    @@ -80,6 +80,7 @@
     .reg .u32 %r119;
     .reg .pred %r120;
     .reg .u32 %r122;
    +.reg .pred %r123;
     .reg .u32 %r124;
     .reg .u32 %r125;
     .reg .u32 %r126;
    @@ -197,7 +198,8 @@
     setp.eq.u32 %r120,%r41,0;
     @ %r120 bra $L5;
     .loc 2 114 9
    -set.u32.eq.u32 %r122,%r58,19;
    +setp.eq.u32 %r123,%r58,19;
    +selp.u32 %r122,1,0,%r123;
     shl.b32 %r41,%r122,31;
     .loc 2 115 17
     not.b32 %r124,%r64;

I haven't checked whether that's "actually worse" in SASS, or just "maybe
worse" in the intermediate PTX representation.  (... and it is most
certainly not related to the regression mentioned before.)


It seems unlikely, but I'll report back if anything here changes due to
Tom's several GCC/nvptx back end commits yesterday.


So, please close this PR as "won't fix" unless you see something here that
you'd like to look into.=