public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6
@ 2021-04-25 22:08 witold.baryluk+gcc at gmail dot com
  2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: witold.baryluk+gcc at gmail dot com @ 2021-04-25 22:08 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

            Bug ID: 100257
           Summary: poor codegen with vcvtph2ps / stride of 6
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: witold.baryluk+gcc at gmail dot com
  Target Milestone: ---

gcc (Compiler-Explorer-Build) 12.0.0 20210424 (experimental)


https://godbolt.org/z/n6ooMdnz8


This C code:

```
#include <stdint.h>
#include <string.h>
#include <immintrin.h>

struct float3 {
    float f1;
    float f2;
    float f3;
};

struct util_format_r16g16b16_float {
   uint16_t r;
   uint16_t g;
   uint16_t b;
};

static inline struct float3 _mesa_half3_to_float3(uint16_t val_0, uint16_t
val_1, uint16_t val_2) {
#if defined(__F16C__)
      //const __m128i in = {val_0, val_1, val_2};
      //__m128 out;
      //__asm volatile("vcvtph2ps %1, %0" : "=v"(out) : "v"(in));

      const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0);
      const __m128 out = _mm_cvtph_ps(in);

      const struct float3 r = {out[0], out[1], out[2]};
      return r;
#endif
}


void
util_format_r16g16b16_float_unpack_rgba_float(void *restrict dst_row, const
uint8_t *restrict src, unsigned width)
{
   float *dst = dst_row;
   for (unsigned x = 0; x < width; x += 1) {
        const struct util_format_r16g16b16_float pixel;
        memcpy(&pixel, src, sizeof pixel);

        struct float3 r = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b);
        dst[0] = r.f1; /* r */
        dst[1] = r.f2; /* g */
        dst[2] = r.f3; /* b */
        dst[3] = 1; /* a */

        src += 6;
        dst += 4;
   }
}

```

Is compiled "poorly" by gcc, even worse when compiled on i386 (with -mf16c
enabled) when using -FPIE.

Example:


gcc -O3 -m32 -march=znver2 -mfpmath=sse -fPIE

util_format_r16g16b16_float_unpack_rgba_float:
        push    ebp
        push    edi
        push    esi
        push    ebx
        sub     esp, 28
        mov     ecx, DWORD PTR 56[esp]
        mov     edx, DWORD PTR 48[esp]
        call    __x86.get_pc_thunk.ax
        add     eax, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_
        mov     ebx, DWORD PTR 52[esp]
        test    ecx, ecx
        je      .L8
        vmovss  xmm3, DWORD PTR .LC0@GOTOFF[eax]
        xor     esi, esi
        xor     ebp, ebp
        vpxor   xmm2, xmm2, xmm2
.L3:
        mov     eax, DWORD PTR [ebx]
        vmovss  DWORD PTR 12[edx], xmm3
        add     ebx, 6
        add     edx, 16
        inc     esi
        mov     ecx, eax
        vmovd   xmm0, eax
        shr     ecx, 16
        mov     edi, ecx
        movzx   ecx, WORD PTR -2[ebx]
        vpinsrw xmm0, xmm0, edi, 1
        vmovd   xmm1, ecx
        vpinsrw xmm1, xmm1, ebp, 1
        vpunpckldq      xmm0, xmm0, xmm1
        vpunpcklqdq     xmm0, xmm0, xmm2
        vcvtph2ps       xmm0, xmm0
        vmovss  DWORD PTR -16[edx], xmm0
        vextractps      DWORD PTR -12[edx], xmm0, 1
        vextractps      DWORD PTR -8[edx], xmm0, 2
        cmp     DWORD PTR 56[esp], esi
        jne     .L3
.L8:
        add     esp, 28
        pop     ebx
        pop     esi
        pop     edi
        pop     ebp
        ret
.LC0:
        .long   1065353216
__x86.get_pc_thunk.ax:
        mov     eax, DWORD PTR [esp]
        ret



clang:

util_format_r16g16b16_float_unpack_rgba_float: #
@util_format_r16g16b16_float_unpack_rgba_float
        mov     eax, dword ptr [esp + 12]
        test    eax, eax
        je      .LBB0_3
        mov     ecx, dword ptr [esp + 8]
        mov     edx, dword ptr [esp + 4]
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        vmovd   xmm0, dword ptr [ecx]           # xmm0 = mem[0],zero,zero,zero
        vpinsrw xmm0, xmm0, word ptr [ecx + 4], 2
        add     ecx, 6
        vcvtph2ps       xmm0, xmm0
        vmovss  dword ptr [edx], xmm0
        vextractps      dword ptr [edx + 4], xmm0, 1
        vextractps      dword ptr [edx + 8], xmm0, 2
        mov     dword ptr [edx + 12], 1065353216
        add     edx, 16
        dec     eax
        jne     .LBB0_2
.LBB0_3:
        ret


clang code is essentially optimal.


The issue persist if I use `vcvtph2ps` directly via asm, or via intrinsics.

The issue might be the src stride, of 6, instead 8, that is confusing gcc.

Additionally, constant 1065353216  (which is weird, I would expect it to be 0),
is stored in data section, instead inline as immediate, this makes code
actually larger, and in PIE mode, requires extra pointer trickery, and on -m32,
even calling extra function.

Even without -fPIE the main loop has poor codegen even on x86-64 / amd64
compared to clang or what I would considered good code.

gcc -m64 -O3 -march=native

util_format_r16g16b16_float_unpack_rgba_float:
        test    edx, edx
        je      .L8
        mov     edx, edx
        sal     rdx, 4
        vmovss  xmm3, DWORD PTR .LC0[rip]
        lea     rcx, [rdi+rdx]
        xor     r9d, r9d
        vpxor   xmm2, xmm2, xmm2
.L3:
        mov     eax, DWORD PTR [rsi]
        vmovss  DWORD PTR 12[rdi], xmm3
        mov     edx, eax
        shr     edx, 16
        mov     r8d, edx
        movzx   edx, WORD PTR 4[rsi]
        vmovd   xmm0, eax
        vmovd   xmm1, edx
        vpinsrw xmm0, xmm0, r8d, 1
        vpinsrw xmm1, xmm1, r9d, 1
        vpunpckldq      xmm0, xmm0, xmm1
        vpunpcklqdq     xmm0, xmm0, xmm2
        vcvtph2ps       xmm0, xmm0
        add     rdi, 16
        vmovlps QWORD PTR -16[rdi], xmm0
        vextractps      DWORD PTR -8[rdi], xmm0, 2
        add     rsi, 6
        cmp     rdi, rcx
        jne     .L3
.L8:
        ret
.LC0:
        .long   1065353216


If you know what is going on, please rename more accurately and reassign to
proper component.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
  2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
@ 2021-04-25 23:42 ` pinskia at gcc dot gnu.org
  2021-04-26  1:17 ` pinskia at gcc dot gnu.org
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-04-25 23:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Looks like a few missed optimizations at the tree level (and a target issue of
the store):
  memcpy (&pixel, src_33, 6);
  _1 = pixel.b;
  _2 = pixel.g;
  _3 = pixel.r;
  val_2.0_21 = (short int) _1;
  val_1.1_22 = (short int) _2;
  val_0.2_23 = (short int) _3;
  _24 = {val_0.2_23, val_1.1_22, val_2.0_21, 0, 0, 0, 0, 0};
  _25 = __builtin_ia32_vcvtph2ps (_24);
  _14 = BIT_FIELD_REF <_25, 64, 0>;
  _28 = BIT_FIELD_REF <_25, 32, 64>;
  MEM <vector(2) float> [(float *)dst_34] = _14;
  MEM[(float *)dst_34 + 8B] = _28;
  MEM[(float *)dst_34 + 12B] = 1.0e+0;


The store issue is now PR 100258.
This is more about the missed optimization of the first part, the conversion.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
  2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
  2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
@ 2021-04-26  1:17 ` pinskia at gcc dot gnu.org
  2021-04-26  7:36 ` crazylht at gmail dot com
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-04-26  1:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Severity|normal                      |enhancement

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
  2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
  2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
  2021-04-26  1:17 ` pinskia at gcc dot gnu.org
@ 2021-04-26  7:36 ` crazylht at gmail dot com
  2021-04-26  8:07 ` rguenth at gcc dot gnu.org
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: crazylht at gmail dot com @ 2021-04-26  7:36 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

Hongtao.liu <crazylht at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |crazylht at gmail dot com

--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
for vec_init, if higher part is zero, we can use vmovd/vmovq instead of vector
concat.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
  2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
                   ` (2 preceding siblings ...)
  2021-04-26  7:36 ` crazylht at gmail dot com
@ 2021-04-26  8:07 ` rguenth at gcc dot gnu.org
  2021-04-26  9:01 ` pinskia at gcc dot gnu.org
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-04-26  8:07 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Confirmed.  We fail to elide the 'pixel' temporary, that is, express

  memcpy (&pixel, src_33, 6);
  _1 = pixel.b;
  _2 = pixel.g;
  _3 = pixel.r;

in terms of loads from src.  Then the backend intrinsic expanding to
a target builtin of course does not help things.

For the above we'd need SRA-like analysis, while VN could remat the loads
from src it lacks the global costing that would tell it that all uses of
pixel and thus the memcpy goes away.

We can fold the memcpy to

  __MEM <unsigned char[6], 16> ((char * {ref-all})&pixel) = __MEM <unsigned
char[6]> ((char * {ref-all})src_13);

with the following:

diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c
index 281839d4a73..750a5d884a7 100644
--- a/gcc/gimple-fold.c
+++ b/gcc/gimple-fold.c
@@ -1215,9 +1215,7 @@ gimple_fold_builtin_memory_op (gimple_stmt_iterator *gsi,
       if (TREE_CODE (dest) == ADDR_EXPR
          && var_decl_component_p (TREE_OPERAND (dest, 0))
          && tree_int_cst_equal (TYPE_SIZE_UNIT (desttype), len)
-         && dest_align >= TYPE_ALIGN (desttype)
-         && (is_gimple_reg_type (desttype)
-             || src_align >= TYPE_ALIGN (desttype)))
+         && dest_align >= TYPE_ALIGN (desttype))
        destvar = fold_build2 (MEM_REF, desttype, dest, off0);
       else if (TREE_CODE (src) == ADDR_EXPR
               && var_decl_component_p (TREE_OPERAND (src, 0))

and then end up with

  pixel$r_3 = MEM <const uint16_t> [(char * {ref-all})src_34];
  pixel$g_2 = MEM <const uint16_t> [(char * {ref-all})src_34 + 2B];
  pixel$b_1 = MEM <const uint16_t> [(char * {ref-all})src_34 + 4B];
  val_2.0_19 = (short int) pixel$b_1;
  val_1.1_20 = (short int) pixel$g_2;
  val_0.2_21 = (short int) pixel$r_3;
  _22 = {val_0.2_21, val_1.1_20, val_2.0_19, 0, 0, 0, 0, 0};
  _23 = __builtin_ia32_vcvtph2ps (_22);

but that doesn't help in the end.  It does help vectorizing the loop
when you avoid the intrinsic by doing

   for (unsigned x = 0; x < width; x += 1) {
        const struct util_format_r16g16b16_float pixel;
        memcpy(&pixel, src, sizeof pixel);

        struct float3 r;// = _mesa_half3_to_float3(pixel.r, pixel.g, pixel.b);
        r.f1 = pixel.r;
        r.f2 = pixel.g;
        r.f3 = pixel.b;
        dst[0] = r.f1; /* r */
        dst[1] = r.f2; /* g */
        dst[2] = r.f3; /* b */
        dst[3] = 1; /* a */

        src += 6;
        dst += 4;
   }

then we vectorize it as

  vect_pixel_r_25.30_283 = MEM <const vector(8) short unsigned int> [(char *
{ref-all})vectp_src.29_278];
  vect_pixel_r_25.31_285 = MEM <const vector(8) short unsigned int> [(char *
{ref-all})vectp_src.29_278 + 16B];
  vect_pixel_r_25.32_287 = MEM <const vector(8) short unsigned int> [(char *
{ref-all})vectp_src.29_278 + 32B];
  vect__8.33_288 = [vec_unpack_float_lo_expr] vect_pixel_r_25.30_283;
  vect__8.33_289 = [vec_unpack_float_hi_expr] vect_pixel_r_25.30_283;
  vect__8.33_290 = [vec_unpack_float_lo_expr] vect_pixel_r_25.31_285;
  vect__8.33_291 = [vec_unpack_float_hi_expr] vect_pixel_r_25.31_285;
  vect__8.33_292 = [vec_unpack_float_lo_expr] vect_pixel_r_25.32_287;
  vect__8.33_293 = [vec_unpack_float_hi_expr] vect_pixel_r_25.32_287;

but then we somehow mess up analysis of the stores going for hybrid SLP
(we split the store group).  We could just leave the dst[3] stores
unvectorized ... but then we somehow decide to emit

        vpmovzxwd       (%rdx), %ymm11
...
        vcvtdq2ps       %ymm11, %ymm11

and thus use a different conversion path (not sure if it is worse in the
end, but ...).

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
  2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
                   ` (3 preceding siblings ...)
  2021-04-26  8:07 ` rguenth at gcc dot gnu.org
@ 2021-04-26  9:01 ` pinskia at gcc dot gnu.org
  2021-04-26  9:09 ` crazylht at gmail dot com
  2021-04-26  9:58 ` crazylht at gmail dot com
  6 siblings, 0 replies; 8+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-04-26  9:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

--- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
(In reply to Hongtao.liu from comment #2)
> for vec_init, if higher part is zero, we can use vmovd/vmovq instead of
> vector concat.

That is related to PR 94680 if not the same.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
  2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
                   ` (4 preceding siblings ...)
  2021-04-26  9:01 ` pinskia at gcc dot gnu.org
@ 2021-04-26  9:09 ` crazylht at gmail dot com
  2021-04-26  9:58 ` crazylht at gmail dot com
  6 siblings, 0 replies; 8+ messages in thread
From: crazylht at gmail dot com @ 2021-04-26  9:09 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

--- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Andrew Pinski from comment #4)
> (In reply to Hongtao.liu from comment #2)
> > for vec_init, if higher part is zero, we can use vmovd/vmovq instead of
> > vector concat.
> 
> That is related to PR 94680 if not the same.

Yes, but in different phase.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug target/100257] poor codegen with vcvtph2ps / stride of 6
  2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
                   ` (5 preceding siblings ...)
  2021-04-26  9:09 ` crazylht at gmail dot com
@ 2021-04-26  9:58 ` crazylht at gmail dot com
  6 siblings, 0 replies; 8+ messages in thread
From: crazylht at gmail dot com @ 2021-04-26  9:58 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100257

--- Comment #6 from Hongtao.liu <crazylht at gmail dot com> ---
> const __m128i in = _mm_setr_epi16(val_0, val_1, val_2, 0, 0, 0, 0, 0);

in ix86_expand_vector_init, we can generate asm like 


  vmovd val_0, %xmm0
  pinsrw $1, val_1, %xmm0
  pinsrw $2, val_2, %xmm0

and let rtl's optimization "merge" val_0 and val_1 since they come from
contiguous memory)

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-04-26  9:58 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-25 22:08 [Bug c/100257] New: poor codegen with vcvtph2ps / stride of 6 witold.baryluk+gcc at gmail dot com
2021-04-25 23:42 ` [Bug target/100257] " pinskia at gcc dot gnu.org
2021-04-26  1:17 ` pinskia at gcc dot gnu.org
2021-04-26  7:36 ` crazylht at gmail dot com
2021-04-26  8:07 ` rguenth at gcc dot gnu.org
2021-04-26  9:01 ` pinskia at gcc dot gnu.org
2021-04-26  9:09 ` crazylht at gmail dot com
2021-04-26  9:58 ` crazylht at gmail dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).