From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugs-return-303132-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 14465 invoked by alias); 25 Dec 2009 09:12:30 -0000
Received: (qmail 14435 invoked by uid 48); 25 Dec 2009 09:12:15 -0000
Date: Fri, 25 Dec 2009 09:12:00 -0000
Subject: [Bug target/42498]  New: GCC can't use smull to compute int * int --> long long
X-Bugzilla-Reason: CC
Message-ID: <bug-42498-17659@http.gcc.gnu.org/bugzilla/>
Reply-To: gcc-bugzilla@gcc.gnu.org
To: gcc-bugs@gcc.gnu.org
From: "carrot at google dot com" <gcc-bugzilla@gcc.gnu.org>
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
X-SW-Source: 2009-12/txt/msg02377.txt.bz2

Compile the following code with option -O2:

extern "C" void foo(long long a, long long b, long long c);

extern "C" void bar(int a, int b, int c, int d)
{
      long long x = (long long)a*b;
      long long y = (long long)b*c;
      long long z = (long long)c*d;
      foo(x,y,z);
}


gcc generates:

bar:
        stmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
        .save {r4, r5, r6, r7, r8, r9, sl, fp, lr}
.LCFI0:
        mov     fp, r1
        mov     ip, fp, asr #31
        mov     lr, r2
        mov     r4, r0
        mov     r5, r4, asr #31
        mov     sl, ip
        mov     r6, r3
        mul     r9, r1, r5
        mov     r3, r2, asr #31
        mul     sl, lr, sl
        mul     r8, r6, r3
        mov     r7, ip
        mov     fp, r6
        mov     ip, fp, asr #31
        umull   r4, r5, r1, r0
        mla     r7, r0, r7, r9
        mla     r0, r1, r3, sl
        umull   r2, r3, lr, r1
        mov     r1, ip
        mla     r1, lr, r1, r8
        umull   fp, ip, r6, lr
        add     r5, r7, r5
        add     r1, r1, ip
        mov     ip, r1
        .pad #12
        sub     sp, sp, #12
.LCFI1:
        add     r3, r0, r3
        mov     r1, r5
        mov     r0, r4
        stmia   sp, {fp-ip}
        bl      foo
        add     sp, sp, #12
        ldmfd   sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
        bx      lr

Ideally the generated code would use the ARM instruction smull to do the operation
(int * int --> long long), but gcc first sign-extends the operands and then does the
complex double-word multiplication.

The following is some analysis from Seongbae Park:


============== From Seongbae Park ==========================

Looks like the RTL expander/combiner tried its best to screw up this case, even though
there's a pattern for smull for exactly this case,
and then it's all downhill from there.


*.final_cleanup:

void bar(int, int, int, int) (a, b, c, d)
{
 long long int D.1732;
 long long int D.1731;

<bb 2>:
 D.1731 = (long long int) b;
 D.1732 = (long long int) c;
 foo (D.1731 * (long long int) a, D.1732 * D.1731, (long long int) d * D.1732)
[tail call];
 return;

}

This input, when it reaches the RTL expander, causes it to do a full 64x64->64-bit
multiply
instead of a 32x32->64 multiply, and once that full sequence is expanded,
we don't have pass(es) that can clean up the unnecessary multiplies/pluses,
so the RTL isn't in a shape the combiner can turn into an smull insn.

One possible solution is to make the expander look at more of the tree/gimple than
it currently does.
e.g. if it could see that D.1731 and D.1732 are originally int values,
it can recognize that those 64-bit multiplies are actually 32x32->64 multiplies
and use smull at the expansion time.

I wish there were an easy way to slap BURG into gcc, so we could fix this kind of
problem once and for all :(


In this case, it's not so much the registers, but how the IR looks
when it reaches the RTL expander.
In particular:

D.1731 = (long long int) b;
D.1732 = (long long int) c;
foo (D.1731 * (long long int) a, D.1732 * D.1731, (long long int) d *
D.1732) [tail call];

Those extra names, D.1731 and D.1732, hide from the expander the fact
that the values are originally 32-bit,
so the expander isn't aware that this is a 32x32->64 multiply.

One way to show this is a slightly modified example:

extern "C" void foo(long long a, long long b, long long c);

extern "C" void bar(int a, int b, int c, int d, int e, int f)
{
 long long x = a*b;
 long long y = c*d;
 long long z = e*f;
  foo(x, y, z);
}


At the end of the middle-end, the above source becomes:

void bar(int, int, int, int, int, int) (a, b, c, d, e, f)
{
<bb 2>:
 foo ((long long int) (b * a), (long long int) (d * c), (long long
int) (f * e)) [tail call];
 return;

}

And this exposes everything the expander needs to do its job properly,
and leads to the following assembly:

      stmfd   sp!, {r4, r5}
      .save {r4, r5}
.LCFI0:
      mul     r1, r0, r1
      ldr     r4, [sp, #8]
      ldr     r0, [sp, #12]
      mul     r3, r2, r3
      mul     r4, r4, r0
      mov     r2, r3
      mov     r3, r2, asr #31
      mov     r5, r4, asr #31
      mov     r0, r1
      mov     r1, r0, asr #31
      strd    r4, [sp, #8]
      ldmfd   sp!, {r4, r5}
      b       foo
.LFE2:

Nice and clean, even though there are more variables involved.


Sorry the example was missing the cast.

A proper example:

extern "C" void foo(long long a, long long b, long long c);

extern "C" void bar(int a, int b, int c, int d, int e, int f)
{
 long long x = (long long)a*b;
 long long y = (long long)c*d;
 long long z = (long long)e*f;
  foo(x, y, z);
}

and this leads to:

void bar(int, int, int, int, int, int) (a, b, c, d, e, f)
{
<bb 2>:
 foo ((long long int) b * (long long int) a, (long long int) d * (long long
int) c, (long long int) f * (long long int) e) [tail call];
 return;

}


and finally:

     stmfd   sp!, {r4, r5}
      .save {r4, r5}
.LCFI0:
      smull   r0, r1, r1, r0
      ldr     r4, [sp, #12]
      ldr     ip, [sp, #8]
      smull   r2, r3, r3, r2
      smull   r4, r5, r4, ip
      strd    r4, [sp, #8]
      ldmfd   sp!, {r4, r5}
      b       foo
.LFE2:

=============== END ===============================


-- 
           Summary: GCC can't use smull to compute int * int --> long long
           Product: gcc
           Version: 4.5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: carrot at google dot com
 GCC build triplet: i686-linux
  GCC host triplet: i686-linux
GCC target triplet: arm-eabi


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42498