public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/99412] New: s352 benchmark of TSVC is vectorized by clang and not by gcc
@ 2021-03-05 14:54 hubicka at gcc dot gnu.org
  2021-03-08  8:32 ` [Bug tree-optimization/99412] " rguenth at gcc dot gnu.org
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-05 14:54 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99412

            Bug ID: 99412
           Summary: s352 benchmark of TSVC is vectorized by clang and not
                    by gcc
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

// Element type used for the arrays and for the accumulator in main().
typedef float real_t;

// Repeat-count base: main() runs its outer loop 8*iterations times.
#define iterations 100000
// Number of elements in each of a[] and b[].
#define LEN_1D 32000
// Not referenced anywhere in this reproducer.
#define LEN_2D 256

// Input vectors. They are file-scope objects with no initializer, so both
// start out zero-filled; nothing in this snippet writes to them.
real_t a[LEN_1D],b[LEN_1D];
// Reproducer: a dot product of a[] and b[] whose inner loop is written
// manually unrolled by five. Takes no arguments and returns the last
// computed sum converted to int.
int main ()
{

//    loop rerolling
//    unrolled dot product

    real_t dot;
    // Repeat the whole dot product 8*iterations times. a[] and b[] are
    // never modified, so every pass recomputes the same value from scratch.
    for (int nl = 0; nl < 8*iterations; nl++) {
        dot = (real_t)0.;
        // i advances by 5 and each step adds five products to dot, with the
        // additions associating left to right starting from dot itself.
        // LEN_1D is a multiple of 5, so the largest index used (i + 4)
        // is LEN_1D - 1 and stays inside both arrays.
        for (int i = 0; i < LEN_1D; i += 5) {
            dot = dot + a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2]
                * b[i + 2] + a[i + 3] * b[i + 3] + a[i + 4] * b[i + 4];
        }
    }

    // Implicit real_t -> int conversion of the final sum. Presumably it is
    // returned so the computation has an observable use -- assumption, the
    // report does not say so.
    return dot;
}


clang does:
main:                                   # @main
        .cfi_startproc
# %bb.0:
        xorl    %eax, %eax
        .p2align        4, 0x90
.LBB0_1:                                # =>This Loop Header: Depth=1
                                        #     Child Loop BB0_2 Depth 2
        vxorps  %xmm0, %xmm0, %xmm0
        movq    $-5, %rcx
        .p2align        4, 0x90
.LBB0_2:                                #   Parent Loop BB0_1 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
        vmovups b+20(,%rcx,4), %xmm1
        vmovss  b+36(,%rcx,4), %xmm2            # xmm2 = mem[0],zero,zero,zero
        vmulps  a+20(,%rcx,4), %xmm1, %xmm1
        vpermilpd       $1, %xmm1, %xmm3        # xmm3 = xmm1[1,0]
        vaddps  %xmm3, %xmm1, %xmm1
        vmovshdup       %xmm1, %xmm3            # xmm3 = xmm1[1,1,3,3]
        vaddss  %xmm3, %xmm1, %xmm1
        vfmadd231ss     a+36(,%rcx,4), %xmm2, %xmm1 # xmm1 = (xmm2 * mem) + xmm1
        addq    $5, %rcx
        vaddss  %xmm0, %xmm1, %xmm0
        cmpq    $31995, %rcx                    # imm = 0x7CFB
        jb      .LBB0_2
# %bb.3:                                #   in Loop: Header=BB0_1 Depth=1
        incl    %eax
        cmpl    $800000, %eax                   # imm = 0xC3500
        jne     .LBB0_1
# %bb.4:
        vcvttss2si      %xmm0, %eax
        retq

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-01-12 13:42 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-05 14:54 [Bug middle-end/99412] New: s352 benchmark of TSVC is vectorized by clang and not by gcc hubicka at gcc dot gnu.org
2021-03-08  8:32 ` [Bug tree-optimization/99412] " rguenth at gcc dot gnu.org
2021-06-09 12:54 ` rguenth at gcc dot gnu.org
2023-01-11 19:14 ` hubicka at gcc dot gnu.org
2023-01-12  9:12 ` rguenth at gcc dot gnu.org
2023-01-12  9:30 ` rguenth at gcc dot gnu.org
2023-01-12 13:30 ` cvs-commit at gcc dot gnu.org
2023-01-12 13:42 ` rguenth at gcc dot gnu.org
2023-01-12 13:42 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).