[Bug target/102117] New: s390: Inefficient code for 64x64=128 signed multiply for <= z13

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug target/102117] New: s390: Inefficient code for 64x64=128 signed multiply for <= z13
@ 2021-08-29 12:30 jens.seifert at de dot ibm.com
  2021-08-29 12:49 ` [Bug target/102117] " jens.seifert at de dot ibm.com
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: jens.seifert at de dot ibm.com @ 2021-08-29 12:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102117

            Bug ID: 102117
           Summary: s390: Inefficient code for 64x64=128 signed multiply
                    for <= z13
           Product: gcc
           Version: 8.3.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jens.seifert at de dot ibm.com
  Target Milestone: ---

/*
 * Signed 64x64 -> 128-bit widening multiply: both operands are
 * sign-extended to __int128 before the multiplication, so the full
 * 128-bit product is returned.
 */
__int128 imul128(long long a, long long b)
{
   const __int128 wide_a = a;
   const __int128 wide_b = b;

   return wide_a * wide_b;
}

This creates a sequence with 3 multiplies:

_Z7imul128xx:
.LFB0:
        .cfi_startproc
        ldgr    %f2,%r12
        .cfi_register 12, 17
        ldgr    %f0,%r13
        .cfi_register 13, 16
        lgr     %r13,%r3
        mlgr    %r12,%r4
        srag    %r1,%r3,63
        msgr    %r1,%r4
        srag    %r4,%r4,63
        msgr    %r4,%r3
        agr     %r4,%r1
        agr     %r12,%r4
        stmg    %r12,%r13,0(%r2)
        lgdr    %r13,%f0
        .cfi_restore 13
        lgdr    %r12,%f2
        .cfi_restore 12
        br      %r14
        .cfi_endproc


The following sequence only requires 1 multiply:

/*
 * First posting of the proposed single-multiply sequence: one unsigned
 * 64x64 -> 128 multiply followed by a correction of the high 64 bits.
 *
 * NOTE: this version is NOT correct.  t1 and t2 below mask each operand
 * with its OWN sign, whereas the correction needs the OTHER operand.
 * The follow-up message in this thread (comment #1) replaces them with
 * (a >> 63) & b and (b >> 63) & a.  The code is kept as originally
 * posted so that it matches the compiler output listed after it.
 */
__int128 imul128_opt(long long a, long long b)
{
   /* Reinterpret both operands as unsigned and zero-extend to 128 bits. */
   unsigned __int128 x = (unsigned __int128)(unsigned long long)a;
   unsigned __int128 y = (unsigned __int128)(unsigned long long)b;
   /* (v >> 63) is meant to be all-ones for negative v and 0 otherwise;
      this relies on >> of a negative value being an arithmetic shift.
      BUG: these yield "a if a < 0" and "b if b < 0"; the operands on the
      right-hand side of '&' should be swapped (b and a respectively). */
   unsigned long long t1 = (a >> 63) & a;
   unsigned long long t2 = (b >> 63) & b;
   /* The only multiply: unsigned 64x64 -> 128. */
   unsigned __int128 u128 = x * y;
   /* Subtract the correction from the high half (modulo 2^64). */
   unsigned long long hi = (u128 >> 64) - (t1 + t2);
   /* The low half of the product needs no correction. */
   unsigned long long lo = (unsigned long long)u128;
   /* Reassemble the two 64-bit halves into the 128-bit result. */
   unsigned __int128 res = hi;
   res <<= 64;
   res |= lo;
   return (__int128)res;
}

_Z11imul128_optxx:
.LFB1:
        .cfi_startproc
        ldgr    %f2,%r12
        .cfi_register 12, 17
        ldgr    %f0,%r13
        .cfi_register 13, 16
        lgr     %r13,%r3
        mlgr    %r12,%r4
        lgr     %r1,%r3
        srag    %r3,%r3,63
        ngr     %r3,%r1
        srag    %r1,%r4,63
        ngr     %r4,%r1
        agr     %r3,%r4
        sgrk    %r3,%r12,%r3
        stg     %r13,8(%r2)
        lgdr    %r12,%f2
        .cfi_restore 12
        lgdr    %r13,%f0
        .cfi_restore 13
        stg     %r3,0(%r2)
        br      %r14
        .cfi_endproc

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/102117] s390: Inefficient code for 64x64=128 signed multiply for <= z13
  2021-08-29 12:30 [Bug target/102117] New: s390: Inefficient code for 64x64=128 signed multiply for <= z13 jens.seifert at de dot ibm.com
@ 2021-08-29 12:49 ` jens.seifert at de dot ibm.com
  2021-11-20 13:16 ` roger at nextmovesoftware dot com
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: jens.seifert at de dot ibm.com @ 2021-08-29 12:49 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102117

--- Comment #1 from Jens Seifert <jens.seifert at de dot ibm.com> ---
Sorry, there was a small bug in the optimal sequence: each sign mask must be ANDed with the other operand (t1 uses b, t2 uses a).

/*
 * Signed 64x64 -> 128-bit multiply built on a single unsigned widening
 * multiply.
 *
 * Reading a negative two's-complement operand as unsigned adds 2^64 to
 * it.  Modulo 2^128 the unsigned product therefore exceeds the signed
 * product by 2^64 * b when a < 0, and by 2^64 * a when b < 0.  Both
 * excesses land entirely in the high 64 bits, so they are subtracted
 * there and the low 64 bits are used unchanged.
 *
 * (v >> 63) is used as an all-ones / all-zeros sign mask; this relies on
 * >> of a negative value being an arithmetic shift.
 */
__int128 imul128_opt(long long a, long long b)
{
   /* Correction terms: b if a is negative, a if b is negative, else 0. */
   const unsigned long long fix_for_a = (a >> 63) & b;
   const unsigned long long fix_for_b = (b >> 63) & a;

   /* The only multiply: unsigned 64x64 -> 128. */
   const unsigned __int128 wide =
      (unsigned __int128)(unsigned long long)a * (unsigned long long)b;

   /* Correct the high half modulo 2^64; keep the low half as is. */
   const unsigned long long high =
      (unsigned long long)(wide >> 64) - (fix_for_a + fix_for_b);
   const unsigned long long low = (unsigned long long)wide;

   return (__int128)(((unsigned __int128)high << 64) | low);
}

_Z11imul128_optxx:
.LFB1:
        .cfi_startproc
        ldgr    %f2,%r12
        .cfi_register 12, 17
        ldgr    %f0,%r13
        .cfi_register 13, 16
        lgr     %r13,%r3
        mlgr    %r12,%r4
        srag    %r1,%r3,63
        ngr     %r1,%r4
        srag    %r4,%r4,63
        ngr     %r4,%r3
        agr     %r4,%r1
        sgrk    %r4,%r12,%r4
        stg     %r13,8(%r2)
        lgdr    %r12,%f2
        .cfi_restore 12
        lgdr    %r13,%f0
        .cfi_restore 13
        stg     %r4,0(%r2)
        br      %r14
        .cfi_endproc

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/102117] s390: Inefficient code for 64x64=128 signed multiply for <= z13
  2021-08-29 12:30 [Bug target/102117] New: s390: Inefficient code for 64x64=128 signed multiply for <= z13 jens.seifert at de dot ibm.com
  2021-08-29 12:49 ` [Bug target/102117] " jens.seifert at de dot ibm.com
@ 2021-11-20 13:16 ` roger at nextmovesoftware dot com
  2021-11-21 11:41 ` cvs-commit at gcc dot gnu.org
  2021-11-25 19:17 ` roger at nextmovesoftware dot com
  3 siblings, 0 replies; 5+ messages in thread
From: roger at nextmovesoftware dot com @ 2021-11-20 13:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102117

Roger Sayle <roger at nextmovesoftware dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
           Assignee|unassigned at gcc dot gnu.org      |roger at nextmovesoftware dot com
                 CC|                            |roger at nextmovesoftware dot com
   Last reconfirmed|                            |2021-11-20
             Status|UNCONFIRMED                 |ASSIGNED

--- Comment #2 from Roger Sayle <roger at nextmovesoftware dot com> ---
Patch proposed:
https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585067.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/102117] s390: Inefficient code for 64x64=128 signed multiply for <= z13
  2021-08-29 12:30 [Bug target/102117] New: s390: Inefficient code for 64x64=128 signed multiply for <= z13 jens.seifert at de dot ibm.com
  2021-08-29 12:49 ` [Bug target/102117] " jens.seifert at de dot ibm.com
  2021-11-20 13:16 ` roger at nextmovesoftware dot com
@ 2021-11-21 11:41 ` cvs-commit at gcc dot gnu.org
  2021-11-25 19:17 ` roger at nextmovesoftware dot com
  3 siblings, 0 replies; 5+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2021-11-21 11:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102117

--- Comment #3 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Roger Sayle <sayle@gcc.gnu.org>:

https://gcc.gnu.org/g:dc915b361bbc99da83fc53db7f7e0e28d0ce12c8

commit r12-5436-gdc915b361bbc99da83fc53db7f7e0e28d0ce12c8
Author: Roger Sayle <roger@nextmovesoftware.com>
Date:   Sun Nov 21 11:40:08 2021 +0000

    Tweak tree-ssa-math-opts.c to solve PR target/102117.

    This patch resolves PR target/102117 on s390.  The problem is that
    some of the functionality of GCC's RTL expanders is no longer triggered
    following the transition to tree SSA form.  On s390, unsigned widening
    multiplications are converted into WIDEN_MULT_EXPR (aka w* in tree dumps),
    but signed widening multiplies are left in their original form, which
    alas doesn't benefit from the clever logic in expand_widening_mult.

    The fix is to teach convert_mult_to_widen that RTL expansion can
    synthesize a signed widening multiplication if the target provides
    a suitable umul_widen_optab.

    On s390-linux-gnu with -O2 -m64, the code in the bugzilla PR currently
    generates:

    imul128:
            stmg    %r12,%r13,96(%r15)
            srag    %r0,%r4,63
            srag    %r1,%r3,63
            lgr     %r13,%r3
            mlgr    %r12,%r4
            msgr    %r1,%r4
            msgr    %r0,%r3
            lgr     %r4,%r12
            agr     %r1,%r0
            lgr     %r5,%r13
            agr     %r4,%r1
            stmg    %r4,%r5,0(%r2)
            lmg     %r12,%r13,96(%r15)
            br      %r14

    but with this patch should now generate the more efficient:

    imul128:
            lgr     %r1,%r3
            mlgr    %r0,%r4
            srag    %r5,%r3,63
            ngr     %r5,%r4
            srag    %r4,%r4,63
            sgr     %r0,%r5
            ngr     %r4,%r3
            sgr     %r0,%r4
            stmg    %r0,%r1,0(%r2)
            br      %r14

    2021-11-21  Roger Sayle  <roger@nextmovesoftware.com>
                Robin Dapp  <rdapp@linux.ibm.com>

    gcc/ChangeLog
            PR target/102117
            * tree-ssa-math-opts.c (convert_mult_to_widen): Recognize
            signed WIDEN_MULT_EXPR if the target supports umul_widen_optab.

    gcc/testsuite/ChangeLog
            PR target/102117
            * gcc.target/s390/mul-wide.c: New test case.
            * gcc.target/s390/umul-wide.c: New test case.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/102117] s390: Inefficient code for 64x64=128 signed multiply for <= z13
  2021-08-29 12:30 [Bug target/102117] New: s390: Inefficient code for 64x64=128 signed multiply for <= z13 jens.seifert at de dot ibm.com
                   ` (2 preceding siblings ...)
  2021-11-21 11:41 ` cvs-commit at gcc dot gnu.org
@ 2021-11-25 19:17 ` roger at nextmovesoftware dot com
  3 siblings, 0 replies; 5+ messages in thread
From: roger at nextmovesoftware dot com @ 2021-11-25 19:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102117

Roger Sayle <roger at nextmovesoftware dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|ASSIGNED                    |RESOLVED
         Resolution|---                         |FIXED
   Target Milestone|---                         |12.0

--- Comment #4 from Roger Sayle <roger at nextmovesoftware dot com> ---
This should now be fixed on mainline.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-11-25 19:17 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-29 12:30 [Bug target/102117] New: s390: Inefficient code for 64x64=128 signed multiply for <= z13 jens.seifert at de dot ibm.com
2021-08-29 12:49 ` [Bug target/102117] " jens.seifert at de dot ibm.com
2021-11-20 13:16 ` roger at nextmovesoftware dot com
2021-11-21 11:41 ` cvs-commit at gcc dot gnu.org
2021-11-25 19:17 ` roger at nextmovesoftware dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).