public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
@ 2021-04-26 12:48 konstantin.ananyev at intel dot com
  2021-04-27  7:08 ` [Bug middle-end/100267] " rguenth at gcc dot gnu.org
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: konstantin.ananyev at intel dot com @ 2021-04-26 12:48 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

            Bug ID: 100267
           Summary: gcc -O2 for avx512 instrincts generates extra warnings
                    and less optimizations
           Product: gcc
           Version: 10.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: konstantin.ananyev at intel dot com
  Target Milestone: ---

The code snippet below compiles cleanly with '-O2' for gcc-9.
But with gcc-10 (and gcc-11) it generates -Wuninitialized warnings.
Another thing (which is probably worse): 'gcc-10 -O2' generates code with
unnecessary loads into ymm registers from the uninitialized portion of the stack.
As I understand it, that is where these -Wuninitialized warnings come from:
for some reason gcc-10 wants to put the local '__m256i pdata[2]' variable
on the stack.
Note that only '-O2' is affected; '-O3' looks good for all versions I tried
(gcc-9, gcc-10, gcc-11).

=====================
$ cat tavx512u5.c

#include <stddef.h>
#include <stdint.h>
#include <x86intrin.h>


struct flow_avx512 { /* input descriptor read by start_flow_avx512x8() */
        uint32_t num_packets;   /* element offset added to idata before loading */
        uint32_t total_packets; /* not referenced anywhere in this test case */
        const uint8_t **idata;  /* base of the pointer array that gets vector-loaded */
};

static inline void /* loads entries from flow->idata and expands them into pdata[0..1] under msk */
start_flow_avx512x8(const struct flow_avx512 *flow, uint32_t num,
                    uint32_t msk, __m256i pdata[2])
{
        uint32_t n, m[2], nm[2];
        __m256i nd[2];
        /* m[]: expand masks for pdata[0]/pdata[1]; nm[]: load masks; nd[]: loaded data */
        m[0] = msk & 0xF;
        m[1] = msk >> 4;
        /* n = number of set bits in m[0]; it sizes the first load and offsets the second */
        n = __builtin_popcount(m[0]);
        nm[0] = (1 << n) - 1;
        nm[1] = (1 << (num - n)) - 1;
        /* zero-masked unaligned loads from idata + num_packets and idata + num_packets + n */
        nd[0] = _mm256_maskz_loadu_epi64(nm[0],
                                flow->idata + flow->num_packets);
        nd[1] = _mm256_maskz_loadu_epi64(nm[1],
                        flow->idata + flow->num_packets + n);
        /* pdata[i] is read as the first argument of each call below before it is overwritten */
        pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
        pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
}

__m256i /* test entry point: returns pdata[0] + pdata[1] after start_flow_avx512x8() fills them */
dummyf1_avx512x8(const struct flow_avx512 *flow)
{
        __m256i pdata[2];
        /* pdata is not initialized before the call; num = 8, msk = 0xFF gives m[0] = m[1] = 0xF */
        start_flow_avx512x8(flow, 8, 0xFF, pdata);
        return _mm256_add_epi64(pdata[0], pdata[1]);
}

====================
Good version (gcc-9) first:
gcc-9 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o
tavx512u5.gcc9-O2.o -c tavx512u5.c

$ objdump -d tavx512u5.gcc9-O2.o

tavx512u5.gcc9-O2.o:     file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <dummyf1_avx512x8>:
   0:   f3 0f 1e fa             endbr64
   4:   8b 17                   mov    (%rdi),%edx
   6:   48 8b 47 08             mov    0x8(%rdi),%rax
   a:   b9 0f 00 00 00          mov    $0xf,%ecx
   f:   c5 f8 92 c9             kmovw  %ecx,%k1
  13:   62 f2 fd a9 89 0c d0    vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z}
  1a:   62 f2 fd a9 89 44 d0    vpexpandq 0x20(%rax,%rdx,8),%ymm0{%k1}{z}
  21:   04
  22:   c5 f5 d4 c0             vpaddq %ymm0,%ymm1,%ymm0
  26:   c3                      retq

=======================
Now gcc-10:
$ gcc-10 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o
tavx512u5.gcc10-O2.o  -c tavx512u5.c
tavx512u5.c: In function ‘dummyf1_avx512x8’:
tavx512u5.c:32:13: warning: ‘pdata’ is used uninitialized in this function
[-Wuninitialized]
   32 |  pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
tavx512u5.c:33:13: warning: ‘*((void *)&pdata+32)’ is used uninitialized in
this function [-Wuninitialized]
   33 |  pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

$ objdump -d tavx512u5.gcc10-O2.o 

tavx512u5.gcc10-O2.o:     file format elf64-x86-64

0000000000000000 <dummyf1_avx512x8>:
   0:   f3 0f 1e fa             endbr64
   4:   55                      push   %rbp
   5:   b9 0f 00 00 00          mov    $0xf,%ecx
   a:   c5 f8 92 c9             kmovw  %ecx,%k1
   e:   48 89 e5                mov    %rsp,%rbp
  11:   48 83 e4 e0             and    $0xffffffffffffffe0,%rsp
  15:   48 83 ec 60             sub    $0x60,%rsp
  19:   8b 17                   mov    (%rdi),%edx
  1b:   64 48 8b 04 25 28 00    mov    %fs:0x28,%rax
  22:   00 00
  24:   48 89 44 24 58          mov    %rax,0x58(%rsp)
  29:   31 c0                   xor    %eax,%eax
  2b:   48 8b 47 08             mov    0x8(%rdi),%rax
  2f:   c5 fd 6f 04 24          vmovdqa (%rsp),%ymm0  <=== load uninit data
  34:   c5 fd 6f 4c 24 20       vmovdqa 0x20(%rsp),%ymm1 <=== from stack
  3a:   62 f2 fd 29 89 04 d0    vpexpandq (%rax,%rdx,8),%ymm0{%k1}
  41:   62 f2 fd 29 89 4c d0    vpexpandq 0x20(%rax,%rdx,8),%ymm1{%k1}
  48:   04
  49:   c5 fd d4 c1             vpaddq %ymm1,%ymm0,%ymm0
  4d:   48 8b 44 24 58          mov    0x58(%rsp),%rax
  52:   64 48 2b 04 25 28 00    sub    %fs:0x28,%rax
  59:   00 00
  5b:   75 02                   jne    5f <dummyf1_avx512x8+0x5f>
  5d:   c9                      leaveq
  5e:   c3                      retq
  5f:   c5 f8 77                vzeroupper
  62:   e8 00 00 00 00          callq  67 <dummyf1_avx512x8+0x67>


================
Running gcc-10 with -fdump-tree-optimized shows a similar picture
(as far as I can understand, it wants to put pdata[2] on the stack):
$ cat tavx512u5.gcc10-O2.optimized

;; Function dummyf1_avx512x8 (dummyf1_avx512x8, funcdef_no=5593,
decl_uid=32966, cgraph_uid=5594, symbol_order=5593)

dummyf1_avx512x8 (const struct flow_avx512 * flow)
{
  __m256i pdata[2];
  vector(4) long long unsigned int _6;
  vector(4) long long unsigned int _8;
  vector(4) long long unsigned int _9;
  vector(4) long long int _10;
  const uint8_t * * _22;
  unsigned int _23;
  long unsigned int _24;
  long unsigned int _25;
  const uint8_t * * _26;
  vector(4) long long int _29;
  const uint8_t * * _30;
  unsigned int _31;
  sizetype _32;
  sizetype _34;
  sizetype _35;
  const uint8_t * * _36;
  vector(4) long long int _39;
  vector(4) long long int _41;
  vector(4) long long int _42;
  vector(4) long long int _45;
  vector(4) long long int _46;

  <bb 2> [local count: 1073741824]:
  _22 = flow_4(D)->idata;
  _23 = flow_4(D)->num_packets;
  _24 = (long unsigned int) _23;
  _25 = _24 * 8;
  _26 = _22 + _25;
  _29 = __builtin_ia32_loaddqudi256_mask (_26, { 0, 0, 0, 0 }, 15);
  _30 = flow_4(D)->idata;
  _31 = flow_4(D)->num_packets;
  _32 = (sizetype) _31;
  _34 = _32 + 4;
  _35 = _34 * 8;
  _36 = _30 + _35;
  _39 = __builtin_ia32_loaddqudi256_mask (_36, { 0, 0, 0, 0 }, 15);
  _41 = MEM[(__m256i * {ref-all})&pdata];
  _42 = __builtin_ia32_expanddi256_mask (_29, _41, 15);
  _45 = MEM[(__m256i * {ref-all})&pdata + 32B];
  _46 = __builtin_ia32_expanddi256_mask (_39, _45, 15);
  _6 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_42);
  _8 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_46);
  _9 = _6 + _8;
  _10 = VIEW_CONVERT_EXPR<__m256i>(_9);
  pdata ={v} {CLOBBER};
  return _10;

}

=========================
While gcc-9:

$ cat tavx512u5.gcc9-O2.optimized                                               
;; Function dummyf1_avx512x8 (dummyf1_avx512x8, funcdef_no=5525,
decl_uid=32562, cgraph_uid=5526, symbol_order=5525)

dummyf1_avx512x8 (const struct flow_avx512 * flow)
{
  vector(4) long long int pdata$32;
  vector(4) long long int pdata;
  vector(4) long long unsigned int _3;
  vector(4) long long unsigned int _5;
  vector(4) long long unsigned int _6;
  vector(4) long long int _7;
  const uint8_t * * _9;
  unsigned int _10;
  long unsigned int _11;
  long unsigned int _12;
  const uint8_t * * _13;
  vector(4) long long int _14;
  const uint8_t * * _15;
  unsigned int _16;
  sizetype _17;
  sizetype _18;
  sizetype _19;
  const uint8_t * * _20;
  vector(4) long long int _21;
  vector(4) long long int _22;
  vector(4) long long int _23;

  <bb 2> [local count: 1073741824]:
  _9 = MEM[(const uint8_t * * const *)flow_2(D) + 8B];
  _10 = MEM[(const uint32_t *)flow_2(D)];
  _11 = (long unsigned int) _10;
  _12 = _11 * 8;
  _13 = _9 + _12;
  _14 = __builtin_ia32_loaddqudi256_mask (_13, { 0, 0, 0, 0 }, 15);
  _15 = MEM[(const uint8_t * * const *)flow_2(D) + 8B];
  _16 = MEM[(const uint32_t *)flow_2(D)];
  _17 = (sizetype) _16;
  _18 = _17 + 4;
  _19 = _18 * 8;
  _20 = _15 + _19;
  _21 = __builtin_ia32_loaddqudi256_mask (_20, { 0, 0, 0, 0 }, 15);
  _22 = __builtin_ia32_expanddi256_mask (_14, pdata_4(D), 15);
  _23 = __builtin_ia32_expanddi256_mask (_21, pdata$32_8(D), 15);
  _3 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_22);
  _5 = VIEW_CONVERT_EXPR<vector(4) long long unsigned int>(_23);
  _6 = _3 + _5;
  _7 = VIEW_CONVERT_EXPR<__m256i>(_6);
  return _7;

}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
@ 2021-04-27  7:08 ` rguenth at gcc dot gnu.org
  2021-04-27 12:07 ` crazylht at gmail dot com
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: rguenth at gcc dot gnu.org @ 2021-04-27  7:08 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
           Keywords|                            |diagnostic,
                   |                            |missed-optimization
             Target|                            |x86_64-*-* i?86-*-*
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2021-04-27

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
With GCC 9, SRA decomposes pdata and substitutes uninitialized SSA names marked
with no-warning, while later releases no longer perform this optimization.  It's
not exactly clear what _mm256_mask_expand_epi64 does and why you need to feed it
an uninitialized first argument.

The middle-end unfortunately only sees

  _41 = MEM[(__m256i * {ref-all})&pdata];
  _42 = __builtin_ia32_expanddi256_mask (_29, _41, 15);

where it doesn't know that __builtin_ia32_expanddi256_mask doesn't actually
use the uninitialized value _41 (does it?)

You could use

  __m256i pdatau = pdatau;

  pdata[0] = _mm256_mask_expand_epi64(pdatau, m[0], nd[0]);
  pdata[1] = _mm256_mask_expand_epi64(pdatau, m[1], nd[1]);

to get an uninitialized __m256i and not warn about the missed initialization.
(but I suspect the data might be initialized in other uses)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
  2021-04-27  7:08 ` [Bug middle-end/100267] " rguenth at gcc dot gnu.org
@ 2021-04-27 12:07 ` crazylht at gmail dot com
  2021-04-28  8:52 ` crazylht at gmail dot com
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-04-27 12:07 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Konstantin Ananyev from comment #0)
> The code snippet below compiles ok with '-O2' for gcc-9.
> But with gcc-10 (and gcc-11) it generates -Wuninitialized warnings.
> Another thing (which is probably worse) 'gcc-10 -O2' generates code with
> unnecessary loads for ymm registers from the initiliazed portion of the
> stack.
> As I understand, thats where from these -Wuninitialized warnings come from:
> by some reason gcc-10 wants to put local '__m256i pdatap[2]' variables
> on the stack.
> Note that only '-O2' affected, '-O3' looks good for all versions I tried
> (gcc-9, gcc-10, gcc-11)..
> 
> =====================
> $ cat tavx512u5.c
> 
> #include <stddef.h>
> #include <stdint.h>
> #include <x86intrin.h>
> 
> 
> struct flow_avx512 {
>         uint32_t num_packets;
>         uint32_t total_packets;
>         const uint8_t **idata;
> };
> 
> static inline void
> start_flow_avx512x8(const struct flow_avx512 *flow, uint32_t num,
>                     uint32_t msk, __m256i pdata[2])
> {
>         uint32_t n, m[2], nm[2];
>         __m256i nd[2];
> 
>         m[0] = msk & 0xF;
>         m[1] = msk >> 4;
> 
>         n = __builtin_popcount(m[0]);
>         nm[0] = (1 << n) - 1;
>         nm[1] = (1 << (num - n)) - 1;
> 
>         nd[0] = _mm256_maskz_loadu_epi64(nm[0],
>                                 flow->idata + flow->num_packets);
>         nd[1] = _mm256_maskz_loadu_epi64(nm[1],
>                         flow->idata + flow->num_packets + n);
> 
>         pdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);
>         pdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);
> }
> 
> __m256i
> dummyf1_avx512x8(const struct flow_avx512 *flow)
> {
>         __m256i pdata[2];
> 
>         start_flow_avx512x8(flow, 8, 0xFF, pdata);
>         return _mm256_add_epi64(pdata[0], pdata[1]);
> }
> 
> ====================
> Good version (gcc-9) first:
> gcc-9 -m64 -mavx512f -mavx512vl -mavx512cd -mavx512bw -Wall -O2 -o
> tavx512u5.gcc9-O2.o -c tavx512u5.c
> 
> $ objdump -d tavx512u5.gcc9-O2.o
> 
> tavx512u5.gcc9-O2.o:     file format elf64-x86-64
> 
> Disassembly of section .text:
> 
> 0000000000000000 <dummyf1_avx512x8>:
>    0:   f3 0f 1e fa             endbr64
>    4:   8b 17                   mov    (%rdi),%edx
>    6:   48 8b 47 08             mov    0x8(%rdi),%rax
>    a:   b9 0f 00 00 00          mov    $0xf,%ecx
>    f:   c5 f8 92 c9             kmovw  %ecx,%k1
>   13:   62 f2 fd a9 89 0c d0    vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z}
>   1a:   62 f2 fd a9 89 44 d0    vpexpandq 0x20(%rax,%rdx,8),%ymm0{%k1}{z}
>   21:   04
>   22:   c5 f5 d4 c0             vpaddq %ymm0,%ymm1,%ymm0
>   26:   c3                      retq
> 

k1 is 0xf, so pdata is not used in _mm256_mask_expand_epi64, but gcc failed to
simplify vpexpandq (%rax,%rdx,8),%ymm1{%k1}{z} to vpexpandq (%rax,%rdx,8),%ymm1,
presumably because we don't support vpexpandq without a mask. Clang's codegen
seems to be optimal: https://godbolt.org/z/d79v11Gz3

The patterns below are cut from sse.md; it seems we only support vpexpandq with
a mask, since all corresponding intrinsics take a mask.
----
;; Expander for the zero-masking ("maskz") form of the expand operation on
;; VI48F modes.  The preparation statement overwrites operand 2 with the
;; all-zero constant of the vector mode before the UNSPEC_EXPAND pattern
;; is generated; operand 3 is the mask register.
(define_expand "<avx512>_expand<mode>_maskz"
  [(set (match_operand:VI48F 0 "register_operand")
        (unspec:VI48F
          [(match_operand:VI48F 1 "nonimmediate_operand")
           (match_operand:VI48F 2 "nonimm_or_0_operand")
           (match_operand:<avx512fmaskmode> 3 "register_operand")]
          UNSPEC_EXPAND))]
  "TARGET_AVX512F"
  "operands[2] = CONST0_RTX (<MODE>mode);")

;; Masked expand instruction for VI48F modes, enabled by TARGET_AVX512F.
;; Operand 1 is the data to expand (register alternative "v", memory
;; alternative "m"), operand 2 uses the "0C" constraint in both alternatives
;; (presumably: tied to the destination, or constant zero -- confirm against
;; the i386 constraint definitions), and operand 3 is the mask ("Yk").
;; Both alternatives take a mask operand; no unmasked form is defined here.
(define_insn "<avx512>_expand<mode>_mask"
  [(set (match_operand:VI48F 0 "register_operand" "=v,v")
        (unspec:VI48F
          [(match_operand:VI48F 1 "nonimmediate_operand" "v,m")
           (match_operand:VI48F 2 "nonimm_or_0_operand" "0C,0C")
           (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")]
          UNSPEC_EXPAND))]
  "TARGET_AVX512F"
  "v<sseintprefix>expand<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
  [(set_attr "type" "ssemov")
   (set_attr "prefix" "evex")
   (set_attr "memory" "none,load")
   (set_attr "mode" "<sseinsnmode>")])

;; Masked expand instruction for the VI12_AVX512VLBW modes, enabled by
;; TARGET_AVX512VBMI2.  The operand layout, constraints and output template
;; are the same as in the VI48F pattern: operand 1 is the data (register or
;; memory), operand 2 uses "0C", and operand 3 is the mask register ("Yk").
(define_insn "expand<mode>_mask"
  [(set (match_operand:VI12_AVX512VLBW 0 "register_operand" "=v,v")
        (unspec:VI12_AVX512VLBW
          [(match_operand:VI12_AVX512VLBW 1 "nonimmediate_operand" "v,m")
           (match_operand:VI12_AVX512VLBW 2 "nonimm_or_0_operand" "0C,0C")
           (match_operand:<avx512fmaskmode> 3 "register_operand" "Yk,Yk")]
          UNSPEC_EXPAND))]
  "TARGET_AVX512VBMI2"
  "v<sseintprefix>expand<ssemodesuffix>\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
  [(set_attr "type" "ssemov")
   (set_attr "prefix" "evex")
   (set_attr "memory" "none,load")
   (set_attr "mode" "<sseinsnmode>")])
----

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
  2021-04-27  7:08 ` [Bug middle-end/100267] " rguenth at gcc dot gnu.org
  2021-04-27 12:07 ` crazylht at gmail dot com
@ 2021-04-28  8:52 ` crazylht at gmail dot com
  2021-04-28  9:56 ` crazylht at gmail dot com
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-04-28  8:52 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
After adding support for v{,p}expand* without mask operands, codegen seems to be optimal:

dummyf1_avx512x8:
.LFB5668:
        .cfi_startproc
        movl    (%rdi), %edx
        movq    8(%rdi), %rax
        vmovdqu (%rax,%rdx,8), %ymm0
        vmovdqu 32(%rax,%rdx,8), %ymm1
        vpexpandq       %ymm0, %ymm0
        vpexpandq       %ymm1, %ymm1
        vpaddq  %ymm1, %ymm0, %ymm0
        ret
        .cfi_endproc

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
                   ` (2 preceding siblings ...)
  2021-04-28  8:52 ` crazylht at gmail dot com
@ 2021-04-28  9:56 ` crazylht at gmail dot com
  2021-04-30  1:13 ` crazylht at gmail dot com
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-04-28  9:56 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #4 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Hongtao.liu from comment #3)
> After support v{,p}expand* thats w/o mask operands, codegen seems to be
> optimal
> 

I was wrong: without a mask, it's just a simple move.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
                   ` (3 preceding siblings ...)
  2021-04-28  9:56 ` crazylht at gmail dot com
@ 2021-04-30  1:13 ` crazylht at gmail dot com
  2021-05-07  2:22 ` crazylht at gmail dot com
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-04-30  1:13 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Hongtao.liu from comment #4)
> (In reply to Hongtao.liu from comment #3)
> > After support v{,p}expand* thats w/o mask operands, codegen seems to be
> > optimal
> > 
> 
> I was wrong, without mask, it's just simple move.

finally optimized to

 _Z16dummyf1_avx512x8PK11flow_avx512:
.LFB5665:
        .cfi_startproc
        movl    (%rdi), %edx
        movq    8(%rdi), %rax
        vmovdqu (%rax,%rdx,8), %ymm0
        vmovdqu 32(%rax,%rdx,8), %ymm1
        vpaddq  %ymm1, %ymm0, %ymm0
        ret

I'm testing the patch.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
                   ` (4 preceding siblings ...)
  2021-04-30  1:13 ` crazylht at gmail dot com
@ 2021-05-07  2:22 ` crazylht at gmail dot com
  2021-06-22  1:35 ` cvs-commit at gcc dot gnu.org
  2021-06-22  1:47 ` crazylht at gmail dot com
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-05-07  2:22 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #6 from Hongtao.liu <crazylht at gmail dot com> ---
a patch is posted at
https://gcc.gnu.org/pipermail/gcc-patches/2021-April/569248.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
                   ` (5 preceding siblings ...)
  2021-05-07  2:22 ` crazylht at gmail dot com
@ 2021-06-22  1:35 ` cvs-commit at gcc dot gnu.org
  2021-06-22  1:47 ` crazylht at gmail dot com
  7 siblings, 0 replies; 9+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2021-06-22  1:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #7 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>:

https://gcc.gnu.org/g:f51618f301664d02cc41205f1386c0c9b9a29a54

commit r12-1706-gf51618f301664d02cc41205f1386c0c9b9a29a54
Author: liuhongt <hongtao.liu@intel.com>
Date:   Thu Apr 29 18:27:09 2021 +0800

    Optimize vpexpand* to mask mov when mask have all ones in it's lower part
(including 0 and -1).

    gcc/ChangeLog:

            PR target/100267
            * config/i386/i386-builtin.def (BDESC): Adjust builtin name.
            * config/i386/sse.md (<avx512>_expand<mode>_mask): Rename to ..
            (expand<mode>_mask): this ..
            (*expand<mode>_mask): New pre_reload splitter to transform
            v{,p}expand* to vmov* when mask is zero, all ones, or has all
            ones in it's lower part, otherwise still generate
            v{,p}expand*.

    gcc/testsuite/ChangeLog:

            PR target/100267
            * gcc.target/i386/avx512bw-pr100267-1.c: New test.
            * gcc.target/i386/avx512bw-pr100267-b-2.c: New test.
            * gcc.target/i386/avx512bw-pr100267-d-2.c: New test.
            * gcc.target/i386/avx512bw-pr100267-q-2.c: New test.
            * gcc.target/i386/avx512bw-pr100267-w-2.c: New test.
            * gcc.target/i386/avx512f-pr100267-1.c: New test.
            * gcc.target/i386/avx512f-pr100267-pd-2.c: New test.
            * gcc.target/i386/avx512f-pr100267-ps-2.c: New test.
            * gcc.target/i386/avx512vl-pr100267-1.c: New test.
            * gcc.target/i386/avx512vl-pr100267-pd-2.c: New test.
            * gcc.target/i386/avx512vl-pr100267-ps-2.c: New test.
            * gcc.target/i386/avx512vlbw-pr100267-1.c: New test.
            * gcc.target/i386/avx512vlbw-pr100267-b-2.c: New test.
            * gcc.target/i386/avx512vlbw-pr100267-d-2.c: New test.
            * gcc.target/i386/avx512vlbw-pr100267-q-2.c: New test.
            * gcc.target/i386/avx512vlbw-pr100267-w-2.c: New test.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Bug middle-end/100267] gcc -O2 for avx512 instrincts generates extra warnings and less optimizations
  2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
                   ` (6 preceding siblings ...)
  2021-06-22  1:35 ` cvs-commit at gcc dot gnu.org
@ 2021-06-22  1:47 ` crazylht at gmail dot com
  7 siblings, 0 replies; 9+ messages in thread
From: crazylht at gmail dot com @ 2021-06-22  1:47 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100267

--- Comment #8 from Hongtao.liu <crazylht at gmail dot com> ---
"less optimizations" part should be fixed in GCC12. 


        .file   "test.c"
        .text
        .p2align 4
        .globl  dummyf1_avx512x8
        .type   dummyf1_avx512x8, @function
dummyf1_avx512x8:
.LFB5668:
        .cfi_startproc
        movl    (%rdi), %edx
        movq    8(%rdi), %rax
        vmovdqu (%rax,%rdx,8), %ymm0
        vmovdqu 32(%rax,%rdx,8), %ymm1
        vpaddq  %ymm1, %ymm0, %ymm0
        ret
        .cfi_endproc
.LFE5668:
        .size   dummyf1_avx512x8, .-dummyf1_avx512x8
        .ident  "GCC: (GNU) 12.0.0 20210621 (experimental)"
        .section        .note.GNU-stack,"",@progbits

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-06-22  1:47 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-26 12:48 [Bug middle-end/100267] New: gcc -O2 for avx512 instrincts generates extra warnings and less optimizations konstantin.ananyev at intel dot com
2021-04-27  7:08 ` [Bug middle-end/100267] " rguenth at gcc dot gnu.org
2021-04-27 12:07 ` crazylht at gmail dot com
2021-04-28  8:52 ` crazylht at gmail dot com
2021-04-28  9:56 ` crazylht at gmail dot com
2021-04-30  1:13 ` crazylht at gmail dot com
2021-05-07  2:22 ` crazylht at gmail dot com
2021-06-22  1:35 ` cvs-commit at gcc dot gnu.org
2021-06-22  1:47 ` crazylht at gmail dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).