public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
From: "siavashserver at gmail dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization
Date: Wed, 07 Aug 2013 06:31:00 -0000	[thread overview]
Message-ID: <bug-58095-4-BoIO1Ne0HN@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-58095-4@http.gcc.gnu.org/bugzilla/>

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095

--- Comment #4 from Siavash Eliasi <siavashserver at gmail dot com> ---
In the end, here is what I would really like GCC to generate for me: the same
output for function (foo) as it produces for function (bar) when using GCC
with the -O3 -march=core2 switches:

#include <xmmintrin.h>

#define BATCHSIZE 8

/*
 * Direct-store variant of the reproducer.
 *
 * For every row i in [0, size) and every column j in [0, BATCHSIZE), stores
 * a[i][j] + d[i][j] * b[i][j] into c[i][j], using the SSE packed
 * single-precision intrinsics _mm_mul_ps and _mm_add_ps on __m128 values.
 *
 * Each result is written to c immediately inside the inner loop; no
 * temporary storage is used. According to the accompanying report, this is
 * the function the author would like compiled to the same code as bar().
 *
 * NOTE(review): the exact shape of this function is the subject of the bug
 * report; do not restructure it, or the comparison with bar() is lost.
 */
void foo(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE],
__m128 c[][BATCHSIZE], unsigned int size)
{
    /* Outer loop: one row of BATCHSIZE vectors per iteration. */
    for (unsigned int i = 0; i < size; i++)
    {
        /* Inner loop: fixed trip count BATCHSIZE, known at compile time. */
        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            /* Multiply d by b, add a, and store straight into the output. */
            c[i][j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
        }
    }
}

/*
 * Staged-store variant of the reproducer.
 *
 * Computes the same values as the direct-store form: for every row i in
 * [0, size) and column j in [0, BATCHSIZE), c[i][j] receives
 * a[i][j] + d[i][j] * b[i][j]. The difference is purely structural: the
 * BATCHSIZE results of a row are first collected in the local array cx and
 * only then copied to c[i] in a second loop.
 *
 * According to the accompanying report, the code GCC generates for this
 * form is the output the author wants.
 *
 * NOTE(review): presumably the local array helps because it cannot overlap
 * the input arrays, letting loads and stores be grouped - confirm against
 * the bug discussion before relying on this explanation.
 */
void bar(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE],
__m128 c[][BATCHSIZE], unsigned int size)
{
    /* Outer loop: one row of BATCHSIZE vectors per iteration. */
    for (unsigned int i = 0; i < size; i++)
    {
        /* Per-row staging buffer; holds all BATCHSIZE results before any store to c. */
        __m128 cx[BATCHSIZE];

        /* Pass 1: compute every element of the row into the staging buffer. */
        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            cx[j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
        }

        /* Pass 2: copy the finished row from the staging buffer to the output. */
        for (unsigned int j=0; j<BATCHSIZE; j++)
        {
            c[i][j] = cx[j]; 
        }
    }
}

Generated asm code:

foo(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8],
float __vector (*) [8], unsigned int):
    test    r8d, r8d
    je    .L1
    xor    eax, eax
.L4:
    movaps    xmm0, XMMWORD PTR [rdx]
    add    eax, 1
    sub    rsi, -128
    sub    rdx, -128
    sub    rdi, -128
    sub    rcx, -128
    mulps    xmm0, XMMWORD PTR [rsi-128]
    addps    xmm0, XMMWORD PTR [rdi-128]
    movaps    XMMWORD PTR [rcx-128], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-112]
    mulps    xmm0, XMMWORD PTR [rsi-112]
    addps    xmm0, XMMWORD PTR [rdi-112]
    movaps    XMMWORD PTR [rcx-112], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-96]
    mulps    xmm0, XMMWORD PTR [rsi-96]
    addps    xmm0, XMMWORD PTR [rdi-96]
    movaps    XMMWORD PTR [rcx-96], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-80]
    mulps    xmm0, XMMWORD PTR [rsi-80]
    addps    xmm0, XMMWORD PTR [rdi-80]
    movaps    XMMWORD PTR [rcx-80], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-64]
    mulps    xmm0, XMMWORD PTR [rsi-64]
    addps    xmm0, XMMWORD PTR [rdi-64]
    movaps    XMMWORD PTR [rcx-64], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-48]
    mulps    xmm0, XMMWORD PTR [rsi-48]
    addps    xmm0, XMMWORD PTR [rdi-48]
    movaps    XMMWORD PTR [rcx-48], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-32]
    mulps    xmm0, XMMWORD PTR [rsi-32]
    addps    xmm0, XMMWORD PTR [rdi-32]
    movaps    XMMWORD PTR [rcx-32], xmm0
    movaps    xmm0, XMMWORD PTR [rdx-16]
    mulps    xmm0, XMMWORD PTR [rsi-16]
    addps    xmm0, XMMWORD PTR [rdi-16]
    movaps    XMMWORD PTR [rcx-16], xmm0
    cmp    eax, r8d
    jne    .L4
.L1:
    rep; ret
bar(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8],
float __vector (*) [8], unsigned int):
    test    r8d, r8d
    je    .L6
    xor    eax, eax
.L9:
    movaps    xmm7, XMMWORD PTR [rdx]
    add    eax, 1
    sub    rsi, -128
    movaps    xmm6, XMMWORD PTR [rdx+16]
    sub    rdi, -128
    sub    rdx, -128
    movaps    xmm5, XMMWORD PTR [rdx-96]
    sub    rcx, -128
    movaps    xmm4, XMMWORD PTR [rdx-80]
    movaps    xmm3, XMMWORD PTR [rdx-64]
    movaps    xmm2, XMMWORD PTR [rdx-48]
    movaps    xmm1, XMMWORD PTR [rdx-32]
    movaps    xmm0, XMMWORD PTR [rdx-16]
    mulps    xmm7, XMMWORD PTR [rsi-128]
    mulps    xmm6, XMMWORD PTR [rsi-112]
    mulps    xmm5, XMMWORD PTR [rsi-96]
    mulps    xmm4, XMMWORD PTR [rsi-80]
    mulps    xmm3, XMMWORD PTR [rsi-64]
    mulps    xmm2, XMMWORD PTR [rsi-48]
    mulps    xmm1, XMMWORD PTR [rsi-32]
    mulps    xmm0, XMMWORD PTR [rsi-16]
    addps    xmm7, XMMWORD PTR [rdi-128]
    addps    xmm6, XMMWORD PTR [rdi-112]
    addps    xmm5, XMMWORD PTR [rdi-96]
    addps    xmm4, XMMWORD PTR [rdi-80]
    addps    xmm3, XMMWORD PTR [rdi-64]
    addps    xmm2, XMMWORD PTR [rdi-48]
    addps    xmm1, XMMWORD PTR [rdi-32]
    addps    xmm0, XMMWORD PTR [rdi-16]
    movaps    XMMWORD PTR [rcx-128], xmm7
    movaps    XMMWORD PTR [rcx-112], xmm6
    movaps    XMMWORD PTR [rcx-96], xmm5
    movaps    XMMWORD PTR [rcx-80], xmm4
    movaps    XMMWORD PTR [rcx-64], xmm3
    movaps    XMMWORD PTR [rcx-48], xmm2
    movaps    XMMWORD PTR [rcx-32], xmm1
    movaps    XMMWORD PTR [rcx-16], xmm0
    cmp    eax, r8d
    jne    .L9
.L6:
    rep; ret


  parent reply	other threads:[~2013-08-07  6:31 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-08-06 16:03 [Bug c++/58095] New: " siavashserver at gmail dot com
2013-08-06 16:15 ` [Bug tree-optimization/58095] " paolo.carlini at oracle dot com
2013-08-06 16:54 ` pinskia at gcc dot gnu.org
2013-08-06 17:46 ` siavashserver at gmail dot com
2013-08-07  5:13 ` siavashserver at gmail dot com
2013-08-07  6:31 ` siavashserver at gmail dot com [this message]
2021-08-28 18:48 ` pinskia at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-58095-4-BoIO1Ne0HN@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).