* [Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization
2013-08-06 16:03 [Bug c++/58095] New: SIMD code requiring auxiliary array for best optimization siavashserver at gmail dot com
` (2 preceding siblings ...)
2013-08-06 17:46 ` siavashserver at gmail dot com
@ 2013-08-07 5:13 ` siavashserver at gmail dot com
2013-08-07 6:31 ` siavashserver at gmail dot com
2021-08-28 18:48 ` pinskia at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: siavashserver at gmail dot com @ 2013-08-07 5:13 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095
--- Comment #3 from Siavash Eliasi <siavashserver at gmail dot com> ---
I did an experiment using raw float data types instead of the __m128 data
type. This time GCC, Clang and ICC were able to generate the desired code, even
without using the __restrict__ keyword, though the source is a little dirtier
(pointer arithmetic).
Not most, but I'm sure that new video decoders/encoders, game engines and similar
applications are using __m128 data types directly instead of float data types,
because (1) it guarantees the data to be 16-byte aligned, (2) it removes the need
to manually load/store data from memory to XMM/YMM registers, (3) it makes the
source code smaller and easier to maintain, and (4) it produces much cleaner and
smaller generated code.
In conclusion, I don't think the issue that I and other people are facing is
related to not using the __restrict__ keyword. All compilers fail to generate
optimal code when facing __m128 data types; the one exception is ICC, which is
able to generate optimal code when __m128 data types and the __restrict__
keyword are used together.
Here is what I have tried:
#include <xmmintrin.h>
void fooFloat(float* a, float* b, float* d, float* c, unsigned int size)
{
for (unsigned int i = 0; i < size; i+=32)
{
__m128 ax[8], bx[8], cx[8], dx[8];
ax[0] = _mm_load_ps(&a[i*32+0]);
ax[1] = _mm_load_ps(&a[i*32+4]);
ax[2] = _mm_load_ps(&a[i*32+8]);
ax[3] = _mm_load_ps(&a[i*32+12]);
ax[4] = _mm_load_ps(&a[i*32+16]);
ax[5] = _mm_load_ps(&a[i*32+20]);
ax[6] = _mm_load_ps(&a[i*32+24]);
ax[7] = _mm_load_ps(&a[i*32+28]);
bx[0] = _mm_load_ps(&b[i*32+0]);
bx[1] = _mm_load_ps(&b[i*32+4]);
bx[2] = _mm_load_ps(&b[i*32+8]);
bx[3] = _mm_load_ps(&b[i*32+12]);
bx[4] = _mm_load_ps(&b[i*32+16]);
bx[5] = _mm_load_ps(&b[i*32+20]);
bx[6] = _mm_load_ps(&b[i*32+24]);
bx[7] = _mm_load_ps(&b[i*32+28]);
dx[0] = _mm_load_ps(&d[i*32+0]);
dx[1] = _mm_load_ps(&d[i*32+4]);
dx[2] = _mm_load_ps(&d[i*32+8]);
dx[3] = _mm_load_ps(&d[i*32+12]);
dx[4] = _mm_load_ps(&d[i*32+16]);
dx[5] = _mm_load_ps(&d[i*32+20]);
dx[6] = _mm_load_ps(&d[i*32+24]);
dx[7] = _mm_load_ps(&d[i*32+28]);
cx[0] = _mm_add_ps(ax[0], _mm_mul_ps(dx[0], bx[0]));
cx[1] = _mm_add_ps(ax[1], _mm_mul_ps(dx[1], bx[1]));
cx[2] = _mm_add_ps(ax[2], _mm_mul_ps(dx[2], bx[2]));
cx[3] = _mm_add_ps(ax[3], _mm_mul_ps(dx[3], bx[3]));
cx[4] = _mm_add_ps(ax[4], _mm_mul_ps(dx[4], bx[4]));
cx[5] = _mm_add_ps(ax[5], _mm_mul_ps(dx[5], bx[5]));
cx[6] = _mm_add_ps(ax[6], _mm_mul_ps(dx[6], bx[6]));
cx[7] = _mm_add_ps(ax[7], _mm_mul_ps(dx[7], bx[7]));
_mm_store_ps(&c[i*32+0], cx[0]);
_mm_store_ps(&c[i*32+4], cx[1]);
_mm_store_ps(&c[i*32+8], cx[2]);
_mm_store_ps(&c[i*32+12], cx[3]);
_mm_store_ps(&c[i*32+16], cx[4]);
_mm_store_ps(&c[i*32+20], cx[5]);
_mm_store_ps(&c[i*32+24], cx[6]);
_mm_store_ps(&c[i*32+28], cx[7]);
}
}
And its output using GCC 4.8.1 -O2 :
fooFloat(float*, float*, float*, float*, unsigned int):
push r15
xor r15d, r15d
test r8d, r8d
mov eax, 4
push r14
push r13
push r12
push rbp
push rbx
je .L15
.L19:
lea r12d, [rax+4]
lea ebp, [rax+8]
lea ebx, [rax+12]
lea r11d, [rax+16]
lea r10d, [rax+20]
lea r9d, [rax+24]
mov r14d, r15d
mov r13d, eax
add r15d, 32
sal r14d, 5
movaps xmm6, XMMWORD PTR [rdx+r13*4]
add eax, 1024
cmp r8d, r15d
movaps xmm7, XMMWORD PTR [rdx+r14*4]
mulps xmm6, XMMWORD PTR [rsi+r13*4]
movaps xmm5, XMMWORD PTR [rdx+r12*4]
mulps xmm7, XMMWORD PTR [rsi+r14*4]
movaps xmm4, XMMWORD PTR [rdx+rbp*4]
mulps xmm5, XMMWORD PTR [rsi+r12*4]
movaps xmm3, XMMWORD PTR [rdx+rbx*4]
mulps xmm4, XMMWORD PTR [rsi+rbp*4]
movaps xmm2, XMMWORD PTR [rdx+r11*4]
mulps xmm3, XMMWORD PTR [rsi+rbx*4]
movaps xmm1, XMMWORD PTR [rdx+r10*4]
mulps xmm2, XMMWORD PTR [rsi+r11*4]
movaps xmm0, XMMWORD PTR [rdx+r9*4]
mulps xmm1, XMMWORD PTR [rsi+r10*4]
addps xmm7, XMMWORD PTR [rdi+r14*4]
mulps xmm0, XMMWORD PTR [rsi+r9*4]
addps xmm6, XMMWORD PTR [rdi+r13*4]
addps xmm5, XMMWORD PTR [rdi+r12*4]
addps xmm4, XMMWORD PTR [rdi+rbp*4]
addps xmm3, XMMWORD PTR [rdi+rbx*4]
addps xmm2, XMMWORD PTR [rdi+r11*4]
addps xmm1, XMMWORD PTR [rdi+r10*4]
addps xmm0, XMMWORD PTR [rdi+r9*4]
movaps XMMWORD PTR [rcx+r14*4], xmm7
movaps XMMWORD PTR [rcx+r13*4], xmm6
movaps XMMWORD PTR [rcx+r12*4], xmm5
movaps XMMWORD PTR [rcx+rbp*4], xmm4
movaps XMMWORD PTR [rcx+rbx*4], xmm3
movaps XMMWORD PTR [rcx+r11*4], xmm2
movaps XMMWORD PTR [rcx+r10*4], xmm1
movaps XMMWORD PTR [rcx+r9*4], xmm0
ja .L19
.L15:
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/58095] SIMD code requiring auxiliary array for best optimization
2013-08-06 16:03 [Bug c++/58095] New: SIMD code requiring auxiliary array for best optimization siavashserver at gmail dot com
` (3 preceding siblings ...)
2013-08-07 5:13 ` siavashserver at gmail dot com
@ 2013-08-07 6:31 ` siavashserver at gmail dot com
2021-08-28 18:48 ` pinskia at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: siavashserver at gmail dot com @ 2013-08-07 6:31 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58095
--- Comment #4 from Siavash Eliasi <siavashserver at gmail dot com> ---
In the end, here is what I would really like GCC to generate for me: the same
output for function (foo) as for function (bar) when using GCC with the
-O3 -march=core2 switches:
#include <xmmintrin.h>
#define BATCHSIZE 8
void foo(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE],
__m128 c[][BATCHSIZE], unsigned int size)
{
for (unsigned int i = 0; i < size; i++)
{
for (unsigned int j=0; j<BATCHSIZE; j++)
{
c[i][j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
}
}
}
void bar(__m128 a[][BATCHSIZE], __m128 b[][BATCHSIZE], __m128 d[][BATCHSIZE],
__m128 c[][BATCHSIZE], unsigned int size)
{
for (unsigned int i = 0; i < size; i++)
{
__m128 cx[BATCHSIZE];
for (unsigned int j=0; j<BATCHSIZE; j++)
{
cx[j] = _mm_add_ps(a[i][j], _mm_mul_ps(d[i][j], b[i][j]));
}
for (unsigned int j=0; j<BATCHSIZE; j++)
{
c[i][j] = cx[j];
}
}
}
Generated asm code:
foo(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8],
float __vector (*) [8], unsigned int):
test r8d, r8d
je .L1
xor eax, eax
.L4:
movaps xmm0, XMMWORD PTR [rdx]
add eax, 1
sub rsi, -128
sub rdx, -128
sub rdi, -128
sub rcx, -128
mulps xmm0, XMMWORD PTR [rsi-128]
addps xmm0, XMMWORD PTR [rdi-128]
movaps XMMWORD PTR [rcx-128], xmm0
movaps xmm0, XMMWORD PTR [rdx-112]
mulps xmm0, XMMWORD PTR [rsi-112]
addps xmm0, XMMWORD PTR [rdi-112]
movaps XMMWORD PTR [rcx-112], xmm0
movaps xmm0, XMMWORD PTR [rdx-96]
mulps xmm0, XMMWORD PTR [rsi-96]
addps xmm0, XMMWORD PTR [rdi-96]
movaps XMMWORD PTR [rcx-96], xmm0
movaps xmm0, XMMWORD PTR [rdx-80]
mulps xmm0, XMMWORD PTR [rsi-80]
addps xmm0, XMMWORD PTR [rdi-80]
movaps XMMWORD PTR [rcx-80], xmm0
movaps xmm0, XMMWORD PTR [rdx-64]
mulps xmm0, XMMWORD PTR [rsi-64]
addps xmm0, XMMWORD PTR [rdi-64]
movaps XMMWORD PTR [rcx-64], xmm0
movaps xmm0, XMMWORD PTR [rdx-48]
mulps xmm0, XMMWORD PTR [rsi-48]
addps xmm0, XMMWORD PTR [rdi-48]
movaps XMMWORD PTR [rcx-48], xmm0
movaps xmm0, XMMWORD PTR [rdx-32]
mulps xmm0, XMMWORD PTR [rsi-32]
addps xmm0, XMMWORD PTR [rdi-32]
movaps XMMWORD PTR [rcx-32], xmm0
movaps xmm0, XMMWORD PTR [rdx-16]
mulps xmm0, XMMWORD PTR [rsi-16]
addps xmm0, XMMWORD PTR [rdi-16]
movaps XMMWORD PTR [rcx-16], xmm0
cmp eax, r8d
jne .L4
.L1:
rep; ret
bar(float __vector (*) [8], float __vector (*) [8], float __vector (*) [8],
float __vector (*) [8], unsigned int):
test r8d, r8d
je .L6
xor eax, eax
.L9:
movaps xmm7, XMMWORD PTR [rdx]
add eax, 1
sub rsi, -128
movaps xmm6, XMMWORD PTR [rdx+16]
sub rdi, -128
sub rdx, -128
movaps xmm5, XMMWORD PTR [rdx-96]
sub rcx, -128
movaps xmm4, XMMWORD PTR [rdx-80]
movaps xmm3, XMMWORD PTR [rdx-64]
movaps xmm2, XMMWORD PTR [rdx-48]
movaps xmm1, XMMWORD PTR [rdx-32]
movaps xmm0, XMMWORD PTR [rdx-16]
mulps xmm7, XMMWORD PTR [rsi-128]
mulps xmm6, XMMWORD PTR [rsi-112]
mulps xmm5, XMMWORD PTR [rsi-96]
mulps xmm4, XMMWORD PTR [rsi-80]
mulps xmm3, XMMWORD PTR [rsi-64]
mulps xmm2, XMMWORD PTR [rsi-48]
mulps xmm1, XMMWORD PTR [rsi-32]
mulps xmm0, XMMWORD PTR [rsi-16]
addps xmm7, XMMWORD PTR [rdi-128]
addps xmm6, XMMWORD PTR [rdi-112]
addps xmm5, XMMWORD PTR [rdi-96]
addps xmm4, XMMWORD PTR [rdi-80]
addps xmm3, XMMWORD PTR [rdi-64]
addps xmm2, XMMWORD PTR [rdi-48]
addps xmm1, XMMWORD PTR [rdi-32]
addps xmm0, XMMWORD PTR [rdi-16]
movaps XMMWORD PTR [rcx-128], xmm7
movaps XMMWORD PTR [rcx-112], xmm6
movaps XMMWORD PTR [rcx-96], xmm5
movaps XMMWORD PTR [rcx-80], xmm4
movaps XMMWORD PTR [rcx-64], xmm3
movaps XMMWORD PTR [rcx-48], xmm2
movaps XMMWORD PTR [rcx-32], xmm1
movaps XMMWORD PTR [rcx-16], xmm0
cmp eax, r8d
jne .L9
.L6:
rep; ret
^ permalink raw reply [flat|nested] 7+ messages in thread