public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
From: "siavashserver at gmail dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization
Date: Wed, 07 Aug 2013 05:13:00 -0000	[thread overview]
Message-ID: <bug-58095-4-clzFyneyF4@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-58095-4@http.gcc.gnu.org/bugzilla/>

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095

--- Comment #3 from Siavash Eliasi <siavashserver at gmail dot com> ---
I did an experiment with using raw float data types instead of __m128 data
type. This time GCC, Clang and ICC were able to generate desired code, even
without using __restric__ keyword, but a little more dirty (Pointer
arithmetics).

Not most, but I'm sure that new video decoder/encoder, game engines and similar
applications are using __m128 data types directly instead of float data types,
because (1) it guarantees them to be 16byte aligned, (2) removes the need to
manually load/store data from memory to XMM/YMM registers, (3) makes the source
code smaller and easier to maintain and (4) much more clean and smaller
generated code.

In conclusion, I don't think issue me and other people are facing is related to
not using __restrict__ keyword. All compilers fail to generate optimal code
when facing __m128 data types. However as an exception, ICC is able to generate
optimal code when facing __m128 data types and __restrict__ keyword mixed.

Here is what I have tried:

#include <xmmintrin.h>

void fooFloat(float* a, float* b, float* d, float* c, unsigned int size)
{
    for (unsigned int i = 0; i < size; i+=32)
    {
        __m128 ax[8], bx[8], cx[8], dx[8];

        ax[0] = _mm_load_ps(&a[i*32+0]);
        ax[1] = _mm_load_ps(&a[i*32+4]);
        ax[2] = _mm_load_ps(&a[i*32+8]);
        ax[3] = _mm_load_ps(&a[i*32+12]);
        ax[4] = _mm_load_ps(&a[i*32+16]);
        ax[5] = _mm_load_ps(&a[i*32+20]);
        ax[6] = _mm_load_ps(&a[i*32+24]);
        ax[7] = _mm_load_ps(&a[i*32+28]);

        bx[0] = _mm_load_ps(&b[i*32+0]);
        bx[1] = _mm_load_ps(&b[i*32+4]);
        bx[2] = _mm_load_ps(&b[i*32+8]);
        bx[3] = _mm_load_ps(&b[i*32+12]);
        bx[4] = _mm_load_ps(&b[i*32+16]);
        bx[5] = _mm_load_ps(&b[i*32+20]);
        bx[6] = _mm_load_ps(&b[i*32+24]);
        bx[7] = _mm_load_ps(&b[i*32+28]);

        dx[0] = _mm_load_ps(&d[i*32+0]);
        dx[1] = _mm_load_ps(&d[i*32+4]);
        dx[2] = _mm_load_ps(&d[i*32+8]);
        dx[3] = _mm_load_ps(&d[i*32+12]);
        dx[4] = _mm_load_ps(&d[i*32+16]);
        dx[5] = _mm_load_ps(&d[i*32+20]);
        dx[6] = _mm_load_ps(&d[i*32+24]);
        dx[7] = _mm_load_ps(&d[i*32+28]);

        cx[0] = _mm_add_ps(ax[0], _mm_mul_ps(dx[0], bx[0]));
        cx[1] = _mm_add_ps(ax[1], _mm_mul_ps(dx[1], bx[1]));
        cx[2] = _mm_add_ps(ax[2], _mm_mul_ps(dx[2], bx[2]));
        cx[3] = _mm_add_ps(ax[3], _mm_mul_ps(dx[3], bx[3]));
        cx[4] = _mm_add_ps(ax[4], _mm_mul_ps(dx[4], bx[4]));
        cx[5] = _mm_add_ps(ax[5], _mm_mul_ps(dx[5], bx[5]));
        cx[6] = _mm_add_ps(ax[6], _mm_mul_ps(dx[6], bx[6]));
        cx[7] = _mm_add_ps(ax[7], _mm_mul_ps(dx[7], bx[7]));

        _mm_store_ps(&c[i*32+0], cx[0]);
        _mm_store_ps(&c[i*32+4], cx[1]);
        _mm_store_ps(&c[i*32+8], cx[2]);
        _mm_store_ps(&c[i*32+12], cx[3]);
        _mm_store_ps(&c[i*32+16], cx[4]);
        _mm_store_ps(&c[i*32+20], cx[5]);
        _mm_store_ps(&c[i*32+24], cx[6]);
        _mm_store_ps(&c[i*32+28], cx[7]);
    }
}

And its output using GCC 4.8.1 -O2 :

fooFloat(float*, float*, float*, float*, unsigned int):
    push    r15
    xor    r15d, r15d
    test    r8d, r8d
    mov    eax, 4
    push    r14
    push    r13
    push    r12
    push    rbp
    push    rbx
    je    .L15
.L19:
    lea    r12d, [rax+4]
    lea    ebp, [rax+8]
    lea    ebx, [rax+12]
    lea    r11d, [rax+16]
    lea    r10d, [rax+20]
    lea    r9d, [rax+24]
    mov    r14d, r15d
    mov    r13d, eax
    add    r15d, 32
    sal    r14d, 5
    movaps    xmm6, XMMWORD PTR [rdx+r13*4]
    add    eax, 1024
    cmp    r8d, r15d
    movaps    xmm7, XMMWORD PTR [rdx+r14*4]
    mulps    xmm6, XMMWORD PTR [rsi+r13*4]
    movaps    xmm5, XMMWORD PTR [rdx+r12*4]
    mulps    xmm7, XMMWORD PTR [rsi+r14*4]
    movaps    xmm4, XMMWORD PTR [rdx+rbp*4]
    mulps    xmm5, XMMWORD PTR [rsi+r12*4]
    movaps    xmm3, XMMWORD PTR [rdx+rbx*4]
    mulps    xmm4, XMMWORD PTR [rsi+rbp*4]
    movaps    xmm2, XMMWORD PTR [rdx+r11*4]
    mulps    xmm3, XMMWORD PTR [rsi+rbx*4]
    movaps    xmm1, XMMWORD PTR [rdx+r10*4]
    mulps    xmm2, XMMWORD PTR [rsi+r11*4]
    movaps    xmm0, XMMWORD PTR [rdx+r9*4]
    mulps    xmm1, XMMWORD PTR [rsi+r10*4]
    addps    xmm7, XMMWORD PTR [rdi+r14*4]
    mulps    xmm0, XMMWORD PTR [rsi+r9*4]
    addps    xmm6, XMMWORD PTR [rdi+r13*4]
    addps    xmm5, XMMWORD PTR [rdi+r12*4]
    addps    xmm4, XMMWORD PTR [rdi+rbp*4]
    addps    xmm3, XMMWORD PTR [rdi+rbx*4]
    addps    xmm2, XMMWORD PTR [rdi+r11*4]
    addps    xmm1, XMMWORD PTR [rdi+r10*4]
    addps    xmm0, XMMWORD PTR [rdi+r9*4]
    movaps    XMMWORD PTR [rcx+r14*4], xmm7
    movaps    XMMWORD PTR [rcx+r13*4], xmm6
    movaps    XMMWORD PTR [rcx+r12*4], xmm5
    movaps    XMMWORD PTR [rcx+rbp*4], xmm4
    movaps    XMMWORD PTR [rcx+rbx*4], xmm3
    movaps    XMMWORD PTR [rcx+r11*4], xmm2
    movaps    XMMWORD PTR [rcx+r10*4], xmm1
    movaps    XMMWORD PTR [rcx+r9*4], xmm0
    ja    .L19
.L15:
    pop    rbx
    pop    rbp
    pop    r12
    pop    r13
    pop    r14
    pop    r15
    ret


  parent reply	other threads:[~2013-08-07  5:13 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-08-06 16:03 [Bug c++/58095] New: " siavashserver at gmail dot com
2013-08-06 16:15 ` [Bug tree-optimization/58095] " paolo.carlini at oracle dot com
2013-08-06 16:54 ` pinskia at gcc dot gnu.org
2013-08-06 17:46 ` siavashserver at gmail dot com
2013-08-07  5:13 ` siavashserver at gmail dot com [this message]
2013-08-07  6:31 ` siavashserver at gmail dot com
2021-08-28 18:48 ` pinskia at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-58095-4-clzFyneyF4@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).