public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6
@ 2021-04-25 22:08 witold.baryluk+gcc at gmail dot com
2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
` (6 more replies)
0 siblings, 7 replies; 8+ messages in thread
From: witold.baryluk+gcc at gmail dot com @ 2021-04-25 22:08 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
Bug ID: 100257
Summary: poor codegen with vcvtph2ps / stride of 6
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: witold.baryluk+gcc at gmail dot com
Target Milestone: ---
gcc (Compiler-Explorer-Build) 12.0.0 20210424 (experimental)
https://godbolt.org/z/n6ooMdnz8
This C code:
```
#include <stdint.h>
#include <string.h>
#include <immintrin.h>
struct float3 {
float f1;
float f2;
float f3;
};
struct util_format_r16g16b16_float {
uint16_t r;
uint16_t g;
uint16_t b;
};
static inline struct float3 _mesa_half3_to_float3(uint16_t val_0, uint16_t
val_1, uint16_t val_2) {
#if defined(__F16C__)
//const __m128i in = {val_0, val_1, val_2};
//__m128 out;
//__asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in));
const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0);
const __m128 out = _mm_cvtph_ps(in);
const struct float3 r = {out[0], out[1], out[2]};
return r;
#endif
}
void
util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const
uint8_t *restrict src, unsigned width)
{
float *dst = dst_row;
for (unsigned x = 0; x < width; x += 1) {
const struct util_format_r16g16b16_float pixel;
memcpy(&pixel, src, sizeof pixel);
struct float3 r = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b);
dst[0] = r.f1; /* r */
dst[1] = r.f2; /* g */
dst[2] = r.f3; /* b */
dst[3] = 1; /* a */
src += 6;
dst += 4;
}
}
```
Is compiled "poorly" by gcc, and even worse when compiled on i386 (with -mf16c
enabled) when using -fPIE.
Example:
gcc -O3 -m32 -march=znver2 -mfpmath=sse -fPIE
util_format_r16g16b16_float_unpack_rgba_float:
push ebp
push edi
push esi
push ebx
sub esp, 28
mov ecx, DWORD PTR 56[esp]
mov edx, DWORD PTR 48[esp]
call __x86.get_pc_thunk.ax
add eax, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_
mov ebx, DWORD PTR 52[esp]
test ecx, ecx
je .L8
vmovss xmm3, DWORD PTR .LC0@GOTOFF[eax]
xor esi, esi
xor ebp, ebp
vpxor xmm2, xmm2, xmm2
.L3:
mov eax, DWORD PTR [ebx]
vmovss DWORD PTR 12[edx], xmm3
add ebx, 6
add edx, 16
inc esi
mov ecx, eax
vmovd xmm0, eax
shr ecx, 16
mov edi, ecx
movzx ecx, WORD PTR -2[ebx]
vpinsrw xmm0, xmm0, edi, 1
vmovd xmm1, ecx
vpinsrw xmm1, xmm1, ebp, 1
vpunpckldq xmm0, xmm0, xmm1
vpunpcklqdq xmm0, xmm0, xmm2
vcvtph2ps xmm0, xmm0
vmovss DWORD PTR -16[edx], xmm0
vextractps DWORD PTR -12[edx], xmm0, 1
vextractps DWORD PTR -8[edx], xmm0, 2
cmp DWORD PTR 56[esp], esi
jne .L3
.L8:
add esp, 28
pop ebx
pop esi
pop edi
pop ebp
ret
.LC0:
.long 1065353216
__x86.get_pc_thunk.ax:
mov eax, DWORD PTR [esp]
ret
clang:
util_format_r16g16b16_float_unpack_rgba_float: #
@util_format_r16g16b16_float_unpack_rgba_float
mov eax, dword ptr [esp + 12]
test eax, eax
je .LBB0_3
mov ecx, dword ptr [esp + 8]
mov edx, dword ptr [esp + 4]
.LBB0_2: # =>This Inner Loop Header: Depth=1
vmovd xmm0, dword ptr [ecx] # xmm0 = mem[0],zero,zero,zero
vpinsrw xmm0, xmm0, word ptr [ecx + 4], 2
add ecx, 6
vcvtph2ps xmm0, xmm0
vmovss dword ptr [edx], xmm0
vextractps dword ptr [edx + 4], xmm0, 1
vextractps dword ptr [edx + 8], xmm0, 2
mov dword ptr [edx + 12], 1065353216
add edx, 16
dec eax
jne .LBB0_2
.LBB0_3:
ret
clang code is essentially optimal.
The issue persists if I use `vcvtph2ps` directly via asm, or via intrinsics.
The issue might be the src stride of 6, instead of 8, that is confusing gcc.
Additionally, the constant 1065353216 (which is weird, I would expect it to be 0)
is stored in the data section instead of inline as an immediate; this makes the
code actually larger, and in PIE mode requires extra pointer trickery, and on
-m32, even calling an extra function.
Even without -fPIE the main loop has poor codegen, even on x86-64 / amd64,
compared to clang or what I would consider good code.
gcc -m64 -O3 -march=native
util_format_r16g16b16_float_unpack_rgba_float:
test edx, edx
je .L8
mov edx, edx
sal rdx, 4
vmovss xmm3, DWORD PTR .LC0[rip]
lea rcx, [rdi+rdx]
xor r9d, r9d
vpxor xmm2, xmm2, xmm2
.L3:
mov eax, DWORD PTR [rsi]
vmovss DWORD PTR 12[rdi], xmm3
mov edx, eax
shr edx, 16
mov r8d, edx
movzx edx, WORD PTR 4[rsi]
vmovd xmm0, eax
vmovd xmm1, edx
vpinsrw xmm0, xmm0, r8d, 1
vpinsrw xmm1, xmm1, r9d, 1
vpunpckldq xmm0, xmm0, xmm1
vpunpcklqdq xmm0, xmm0, xmm2
vcvtph2ps xmm0, xmm0
add rdi, 16
vmovlps QWORD PTR -16[rdi], xmm0
vextractps DWORD PTR -8[rdi], xmm0, 2
add rsi, 6
cmp rdi, rcx
jne .L3
.L8:
ret
.LC0:
.long 1065353216
If you know what is going on, please rename more accurately and reassign to the
proper component.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
@ 2021-04-25 23:42 ` pinskia at gcc dot gnu.org
2021-04-26 1:17 ` pinskia at gcc dot gnu.org
` (5 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-04-25 23:42 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Looks like a few missed optimizations at the tree level (and a target issue of
the store):
memcpy (&pixel, src_33, 6);
_1 = pixel.b;
_2 = pixel.g;
_3 = pixel.r;
val_2.0_21 = (short int) _1;
val_1.1_22 = (short int) _2;
val_0.2_23 = (short int) _3;
_24 = {val_0.2_23, val_1.1_22, val_2.0_21, 0, 0, 0, 0, 0};
_25 = __builtin_ia32_vcvtph2ps (_24);
_14 = BIT_FIELD_REF <_25, 64, 0>;
_28 = BIT_FIELD_REF <_25, 32, 64>;
MEM <vector(2) float> [(float *)dst_34] = _14;
MEM[(float *)dst_34 + 8B] = _28;
MEM[(float *)dst_34 + 12B] = 1.0e+0;
The store issue is now PR 100258.
This is more about the missed optimization of the first part, the conversion.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
@ 2021-04-26 1:17 ` pinskia at gcc dot gnu.org
2021-04-26 7:36 ` crazylht at gmail dot com
` (4 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-04-26 1:17 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Severity|normal |enhancement
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
2021-04-26 1:17 ` pinskia at gcc dot gnu.org
@ 2021-04-26 7:36 ` crazylht at gmail dot com
2021-04-26 8:07 ` rguenth at gcc dot gnu.org
` (3 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: crazylht at gmail dot com @ 2021-04-26 7:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
Hongtao.liu <crazylht at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |crazylht at gmail dot com
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
for vec_init, if higher part is zero, we can use vmovd/vmovq instead of vector
concat.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
` (2 preceding siblings ...)
2021-04-26 7:36 ` crazylht at gmail dot com
@ 2021-04-26 8:07 ` rguenth at gcc dot gnu.org
2021-04-26 9:01 ` pinskia at gcc dot gnu.org
` (2 subsequent siblings)
6 siblings, 0 replies; 8+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-04-26 8:07 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Confirmed. We fail to elide the 'pixel' temporary, that is, express
memcpy (&pixel, src_33, 6);
_1 = pixel.b;
_2 = pixel.g;
_3 = pixel.r;
in terms of loads from src. Then the backend intrinsic expanding to
a target builtin of course does not help things.
For the above we'd need SRA-like analysis, while VN could remat the loads
from src it lacks the global costing that would tell it that all uses of
pixel and thus the memcpy goes away.
We can fold the memcpy to
__MEM <unsigned char[6], 16> ((char * {ref-all})&pixel) = __MEM <unsigned
char[6]> ((char * {ref-all})src_13);
with the following:
diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c
index 281839d4a73..750a5d884a7 100644
--- a/gcc/gimple-fold.c
+++ b/gcc/gimple-fold.c
@@ -1215,9 +1215,7 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi,
if (TREE_CODE (dest) == ADDR_EXPR
&& var_decl_component_p (TREE_OPERAND (dest, 0))
&& tree_int_cst_equal (TYPE_SIZE_UNIT (desttype), len)
- && dest_align >= TYPE_ALIGN (desttype)
- && (is_gimple_reg_type (desttype)
- || src_align >= TYPE_ALIGN (desttype)))
+ && dest_align >= TYPE_ALIGN (desttype))
destvar = fold_build2 (MEM_REF, desttype, dest, off0);
else if (TREE_CODE (src) == ADDR_EXPR
&& var_decl_component_p (TREE_OPERAND (src, 0))
and then end up with
pixel$r_3 = MEM <const uint16_t> [(char * {ref-all})src_34];
pixel$g_2 = MEM <const uint16_t> [(char * {ref-all})src_34 + 2B];
pixel$b_1 = MEM <const uint16_t> [(char * {ref-all})src_34 + 4B];
val_2.0_19 = (short int) pixel$b_1;
val_1.1_20 = (short int) pixel$g_2;
val_0.2_21 = (short int) pixel$r_3;
_22 = {val_0.2_21, val_1.1_20, val_2.0_19, 0, 0, 0, 0, 0};
_23 = __builtin_ia32_vcvtph2ps (_22);
but that doesn't help in the end. It does help vectorizing the loop
when you avoid the intrinsic by doing
for (unsigned x = 0; x < width; x += 1) {
const struct util_format_r16g16b16_float pixel;
memcpy(&pixel, src, sizeof pixel);
struct float3 r;// = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b);
r.f1 = pixel.r;
r.f2 = pixel.g;
r.f3 = pixel.b;
dst[0] = r.f1; /* r */
dst[1] = r.f2; /* g */
dst[2] = r.f3; /* b */
dst[3] = 1; /* a */
src += 6;
dst += 4;
}
then we vectorize it as
vect_pixel_r_25.30_283 = MEM <const vector(8) short unsigned int> [(char *
{ref-all})vectp_src.29_278];
vect_pixel_r_25.31_285 = MEM <const vector(8) short unsigned int> [(char *
{ref-all})vectp_src.29_278 + 16B];
vect_pixel_r_25.32_287 = MEM <const vector(8) short unsigned int> [(char *
{ref-all})vectp_src.29_278 + 32B];
vect__8.33_288 = [vec_unpack_float_lo_expr] vect_pixel_r_25.30_283;
vect__8.33_289 = [vec_unpack_float_hi_expr] vect_pixel_r_25.30_283;
vect__8.33_290 = [vec_unpack_float_lo_expr] vect_pixel_r_25.31_285;
vect__8.33_291 = [vec_unpack_float_hi_expr] vect_pixel_r_25.31_285;
vect__8.33_292 = [vec_unpack_float_lo_expr] vect_pixel_r_25.32_287;
vect__8.33_293 = [vec_unpack_float_hi_expr] vect_pixel_r_25.32_287;
but then we somehow mess up analysis of the stores going for hybrid SLP
(we split the store group). We could just leave the dst[3] stores
unvectorized ... but then we somehow decide to emit
vpmovzxwd (%rdx), %ymm11
...
vcvtdq2ps %ymm11, %ymm11
and thus use a different conversion path (not sure if it is worse in the
end, but ...).
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
` (3 preceding siblings ...)
2021-04-26 8:07 ` rguenth at gcc dot gnu.org
@ 2021-04-26 9:01 ` pinskia at gcc dot gnu.org
2021-04-26 9:09 ` crazylht at gmail dot com
2021-04-26 9:58 ` crazylht at gmail dot com
6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-04-26 9:01 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
--- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Hongtao.liu from comment #2)
> for vec_init, if higher part is zero, we can use vmovd/vmovq instead of
> vector concat.
That is related to PR 94680 if not the same.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
` (4 preceding siblings ...)
2021-04-26 9:01 ` pinskia at gcc dot gnu.org
@ 2021-04-26 9:09 ` crazylht at gmail dot com
2021-04-26 9:58 ` crazylht at gmail dot com
6 siblings, 0 replies; 8+ messages in thread
From: crazylht at gmail dot com @ 2021-04-26 9:09 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
--- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Andrew Pinski from comment #4)
> (In reply to Hongtao.liu from comment #2)
> > for vec_init, if higher part is zero, we can use vmovd/vmovq instead of
> > vector concat.
>
> That is related to PR 94680 if not the same.
Yes, but in different phase.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
` (5 preceding siblings ...)
2021-04-26 9:09 ` crazylht at gmail dot com
@ 2021-04-26 9:58 ` crazylht at gmail dot com
6 siblings, 0 replies; 8+ messages in thread
From: crazylht at gmail dot com @ 2021-04-26 9:58 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257
--- Comment #6 from Hongtao.liu <crazylht at gmail dot com> ---
> const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0);
in ix86_expand_vector_init, we can generate asm like
vmovd val_0, %xmm0
pinsrw $1, val_1, %xmm0
pinsrw $2, val_2, %xmm0
and let rtl's optimization "merge" val_0 and val_1 since they come from
contiguous memory)
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2021-04-26 9:58 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
2021-04-26 1:17 ` pinskia at gcc dot gnu.org
2021-04-26 7:36 ` crazylht at gmail dot com
2021-04-26 8:07 ` rguenth at gcc dot gnu.org
2021-04-26 9:01 ` pinskia at gcc dot gnu.org
2021-04-26 9:09 ` crazylht at gmail dot com
2021-04-26 9:58 ` crazylht at gmail dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).