public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
From: "siavashserver at gmail dot com" <gcc-bugzilla@gcc.gnu.org> To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization Date: Wed, 07 Aug 2013 06:31:00 -0000 [thread overview] Message-ID: <bug-58095-4-BoIO1Ne0HN@http.gcc.gnu.org/bugzilla/> (raw) In-Reply-To: <bug-58095-4@http.gcc.gnu.org/bugzilla/> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095 --- Comment #4 from Siavash Eliasi <siavashserver at gmail dot com> --- In the end, here is what I really like GCC to generate for me. Same output as function (bar) for function (foo) when using GCC with -O3 -march=core2 switches: #include <xmmintrin.h> #define BATCHSIZE 8 void foo(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size) { for (unsigned int i = 0; i < size; i++) { for (unsigned int j=0; j<BATCHSIZE; j++) { c[i][j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j])); } } } void bar(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE], __m128 c[][BATCHSIZE], unsigned int size) { for (unsigned int i = 0; i < size; i++) { __m128 cx[BATCHSIZE]; for (unsigned int j=0; j<BATCHSIZE; j++) { cx[j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j])); } for (unsigned int j=0; j<BATCHSIZE; j++) { c[i][j] = cx[j]; } } } Generated asm code: foo(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int): test r8d, r8d je .L1 xor eax, eax .L4: movaps xmm0, XMMWORD PTR [rdx] add eax, 1 sub rsi, -128 sub rdx, -128 sub rdi, -128 sub rcx, -128 mulps xmm0, XMMWORD PTR [rsi-128] addps xmm0, XMMWORD PTR [rdi-128] movaps XMMWORD PTR [rcx-128], xmm0 movaps xmm0, XMMWORD PTR [rdx-112] mulps xmm0, XMMWORD PTR [rsi-112] addps xmm0, XMMWORD PTR [rdi-112] movaps XMMWORD PTR [rcx-112], xmm0 movaps xmm0, XMMWORD PTR [rdx-96] mulps xmm0, XMMWORD PTR [rsi-96] addps xmm0, XMMWORD PTR [rdi-96] movaps XMMWORD PTR [rcx-96], xmm0 movaps xmm0, XMMWORD PTR 
[rdx-80] mulps xmm0, XMMWORD PTR [rsi-80] addps xmm0, XMMWORD PTR [rdi-80] movaps XMMWORD PTR [rcx-80], xmm0 movaps xmm0, XMMWORD PTR [rdx-64] mulps xmm0, XMMWORD PTR [rsi-64] addps xmm0, XMMWORD PTR [rdi-64] movaps XMMWORD PTR [rcx-64], xmm0 movaps xmm0, XMMWORD PTR [rdx-48] mulps xmm0, XMMWORD PTR [rsi-48] addps xmm0, XMMWORD PTR [rdi-48] movaps XMMWORD PTR [rcx-48], xmm0 movaps xmm0, XMMWORD PTR [rdx-32] mulps xmm0, XMMWORD PTR [rsi-32] addps xmm0, XMMWORD PTR [rdi-32] movaps XMMWORD PTR [rcx-32], xmm0 movaps xmm0, XMMWORD PTR [rdx-16] mulps xmm0, XMMWORD PTR [rsi-16] addps xmm0, XMMWORD PTR [rdi-16] movaps XMMWORD PTR [rcx-16], xmm0 cmp eax, r8d jne .L4 .L1: rep; ret bar(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], float __vector (*) [8], unsigned int): test r8d, r8d je .L6 xor eax, eax .L9: movaps xmm7, XMMWORD PTR [rdx] add eax, 1 sub rsi, -128 movaps xmm6, XMMWORD PTR [rdx+16] sub rdi, -128 sub rdx, -128 movaps xmm5, XMMWORD PTR [rdx-96] sub rcx, -128 movaps xmm4, XMMWORD PTR [rdx-80] movaps xmm3, XMMWORD PTR [rdx-64] movaps xmm2, XMMWORD PTR [rdx-48] movaps xmm1, XMMWORD PTR [rdx-32] movaps xmm0, XMMWORD PTR [rdx-16] mulps xmm7, XMMWORD PTR [rsi-128] mulps xmm6, XMMWORD PTR [rsi-112] mulps xmm5, XMMWORD PTR [rsi-96] mulps xmm4, XMMWORD PTR [rsi-80] mulps xmm3, XMMWORD PTR [rsi-64] mulps xmm2, XMMWORD PTR [rsi-48] mulps xmm1, XMMWORD PTR [rsi-32] mulps xmm0, XMMWORD PTR [rsi-16] addps xmm7, XMMWORD PTR [rdi-128] addps xmm6, XMMWORD PTR [rdi-112] addps xmm5, XMMWORD PTR [rdi-96] addps xmm4, XMMWORD PTR [rdi-80] addps xmm3, XMMWORD PTR [rdi-64] addps xmm2, XMMWORD PTR [rdi-48] addps xmm1, XMMWORD PTR [rdi-32] addps xmm0, XMMWORD PTR [rdi-16] movaps XMMWORD PTR [rcx-128], xmm7 movaps XMMWORD PTR [rcx-112], xmm6 movaps XMMWORD PTR [rcx-96], xmm5 movaps XMMWORD PTR [rcx-80], xmm4 movaps XMMWORD PTR [rcx-64], xmm3 movaps XMMWORD PTR [rcx-48], xmm2 movaps XMMWORD PTR [rcx-32], xmm1 movaps XMMWORD PTR [rcx-16], xmm0 cmp eax, r8d jne .L9 
.L6: rep; ret
next prev parent reply other threads:[~2013-08-07 6:31 UTC|newest] Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top 2013-08-06 16:03 [Bug c++/58095] New: " siavashserver at gmail dot com 2013-08-06 16:15 ` [Bug tree-optimization/58095] " paolo.carlini at oracle dot com 2013-08-06 16:54 ` pinskia at gcc dot gnu.org 2013-08-06 17:46 ` siavashserver at gmail dot com 2013-08-07 5:13 ` siavashserver at gmail dot com 2013-08-07 6:31 ` siavashserver at gmail dot com [this message] 2021-08-28 18:48 ` pinskia at gcc dot gnu.org
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-58095-4-BoIO1Ne0HN@http.gcc.gnu.org/bugzilla/ \ --to=gcc-bugzilla@gcc.gnu.org \ --cc=gcc-bugs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).