From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 2684 invoked by alias); 7 Aug 2013 05:13:36 -0000 Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org Received: (qmail 2583 invoked by uid 48); 7 Aug 2013 05:13:31 -0000 From: "siavashserver at gmail dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization Date: Wed, 07 Aug 2013 05:13:00 -0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Version: unknown X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: siavashserver at gmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: 7bit X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-SW-Source: 2013-08/txt/msg00417.txt.bz2 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095 --- Comment #3 from Siavash Eliasi --- I did an experiment using the raw float data type instead of the __m128 data type. This time GCC, Clang and ICC were all able to generate the desired code, even without the __restrict__ keyword, although a little dirtier (pointer arithmetic). Not most, but I'm sure that many new video decoders/encoders, game engines and similar applications use the __m128 data type directly instead of the float data type, because it (1) guarantees 16-byte alignment, (2) removes the need to manually load/store data between memory and the XMM/YMM registers, (3) makes the source code smaller and easier to maintain, and (4) results in much cleaner and smaller generated code. 
In conclusion, I don't think the issue that I and other people are facing is related to not using the __restrict__ keyword. All compilers fail to generate optimal code when facing __m128 data types. As an exception, however, ICC is able to generate optimal code when __m128 data types are combined with the __restrict__ keyword. Here is what I have tried: #include <xmmintrin.h> void fooFloat(float* a, float* b, float* d, float* c, unsigned int size) { for (unsigned int i = 0; i < size; i+=32) { __m128 ax[8], bx[8], cx[8], dx[8]; ax[0] = _mm_load_ps(&a[i*32+0]); ax[1] = _mm_load_ps(&a[i*32+4]); ax[2] = _mm_load_ps(&a[i*32+8]); ax[3] = _mm_load_ps(&a[i*32+12]); ax[4] = _mm_load_ps(&a[i*32+16]); ax[5] = _mm_load_ps(&a[i*32+20]); ax[6] = _mm_load_ps(&a[i*32+24]); ax[7] = _mm_load_ps(&a[i*32+28]); bx[0] = _mm_load_ps(&b[i*32+0]); bx[1] = _mm_load_ps(&b[i*32+4]); bx[2] = _mm_load_ps(&b[i*32+8]); bx[3] = _mm_load_ps(&b[i*32+12]); bx[4] = _mm_load_ps(&b[i*32+16]); bx[5] = _mm_load_ps(&b[i*32+20]); bx[6] = _mm_load_ps(&b[i*32+24]); bx[7] = _mm_load_ps(&b[i*32+28]); dx[0] = _mm_load_ps(&d[i*32+0]); dx[1] = _mm_load_ps(&d[i*32+4]); dx[2] = _mm_load_ps(&d[i*32+8]); dx[3] = _mm_load_ps(&d[i*32+12]); dx[4] = _mm_load_ps(&d[i*32+16]); dx[5] = _mm_load_ps(&d[i*32+20]); dx[6] = _mm_load_ps(&d[i*32+24]); dx[7] = _mm_load_ps(&d[i*32+28]); cx[0] = _mm_add_ps(ax[0], _mm_mul_ps(dx[0], bx[0])); cx[1] = _mm_add_ps(ax[1], _mm_mul_ps(dx[1], bx[1])); cx[2] = _mm_add_ps(ax[2], _mm_mul_ps(dx[2], bx[2])); cx[3] = _mm_add_ps(ax[3], _mm_mul_ps(dx[3], bx[3])); cx[4] = _mm_add_ps(ax[4], _mm_mul_ps(dx[4], bx[4])); cx[5] = _mm_add_ps(ax[5], _mm_mul_ps(dx[5], bx[5])); cx[6] = _mm_add_ps(ax[6], _mm_mul_ps(dx[6], bx[6])); cx[7] = _mm_add_ps(ax[7], _mm_mul_ps(dx[7], bx[7])); _mm_store_ps(&c[i*32+0], cx[0]); _mm_store_ps(&c[i*32+4], cx[1]); _mm_store_ps(&c[i*32+8], cx[2]); _mm_store_ps(&c[i*32+12], cx[3]); _mm_store_ps(&c[i*32+16], cx[4]); _mm_store_ps(&c[i*32+20], cx[5]); _mm_store_ps(&c[i*32+24], cx[6]); _mm_store_ps(&c[i*32+28], 
cx[7]); } } And its output using GCC 4.8.1 -O2 : fooFloat(float*, float*, float*, float*, unsigned int): push r15 xor r15d, r15d test r8d, r8d mov eax, 4 push r14 push r13 push r12 push rbp push rbx je .L15 .L19: lea r12d, [rax+4] lea ebp, [rax+8] lea ebx, [rax+12] lea r11d, [rax+16] lea r10d, [rax+20] lea r9d, [rax+24] mov r14d, r15d mov r13d, eax add r15d, 32 sal r14d, 5 movaps xmm6, XMMWORD PTR [rdx+r13*4] add eax, 1024 cmp r8d, r15d movaps xmm7, XMMWORD PTR [rdx+r14*4] mulps xmm6, XMMWORD PTR [rsi+r13*4] movaps xmm5, XMMWORD PTR [rdx+r12*4] mulps xmm7, XMMWORD PTR [rsi+r14*4] movaps xmm4, XMMWORD PTR [rdx+rbp*4] mulps xmm5, XMMWORD PTR [rsi+r12*4] movaps xmm3, XMMWORD PTR [rdx+rbx*4] mulps xmm4, XMMWORD PTR [rsi+rbp*4] movaps xmm2, XMMWORD PTR [rdx+r11*4] mulps xmm3, XMMWORD PTR [rsi+rbx*4] movaps xmm1, XMMWORD PTR [rdx+r10*4] mulps xmm2, XMMWORD PTR [rsi+r11*4] movaps xmm0, XMMWORD PTR [rdx+r9*4] mulps xmm1, XMMWORD PTR [rsi+r10*4] addps xmm7, XMMWORD PTR [rdi+r14*4] mulps xmm0, XMMWORD PTR [rsi+r9*4] addps xmm6, XMMWORD PTR [rdi+r13*4] addps xmm5, XMMWORD PTR [rdi+r12*4] addps xmm4, XMMWORD PTR [rdi+rbp*4] addps xmm3, XMMWORD PTR [rdi+rbx*4] addps xmm2, XMMWORD PTR [rdi+r11*4] addps xmm1, XMMWORD PTR [rdi+r10*4] addps xmm0, XMMWORD PTR [rdi+r9*4] movaps XMMWORD PTR [rcx+r14*4], xmm7 movaps XMMWORD PTR [rcx+r13*4], xmm6 movaps XMMWORD PTR [rcx+r12*4], xmm5 movaps XMMWORD PTR [rcx+rbp*4], xmm4 movaps XMMWORD PTR [rcx+rbx*4], xmm3 movaps XMMWORD PTR [rcx+r11*4], xmm2 movaps XMMWORD PTR [rcx+r10*4], xmm1 movaps XMMWORD PTR [rcx+r9*4], xmm0 ja .L19 .L15: pop rbx pop rbp pop r12 pop r13 pop r14 pop r15 ret