[Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake
@ 2020-06-04  6:24 crazylht at gmail dot com
  2020-06-11  8:21 ` [Bug target/95524] " crazylht at gmail dot com
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-06-04  6:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95524

            Bug ID: 95524
           Summary: Suboptimal codegen for shift by constant for v16qi/v32qi
                    under -march=skylake
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---
            Target: x86_64-*-* i?86-*-*

cat test.c
---
typedef char v16qi __attribute__ ((vector_size (16)));
typedef char v32qi __attribute__ ((vector_size (32)));
typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
typedef unsigned char v32uqi __attribute__ ((vector_size (32)));

v16qi
ashift (v16qi a)
{
    return  a<<5;
}

v32qi
ashift2 (v32qi a, v32qi b)
{
    return  a<<5;
}

v16qi
ashiftrt (v16qi a)
{
    return  a>>5;
}

v32qi
arshiftrt2 (v32qi a)
{
    return  a>>5;
}

v16uqi
lshiftrt (v16uqi a)
{
    return  a>>5;
}

v32uqi
lshiftrt2 (v32uqi a)
{
    return  a>>5;
}
---

gcc11 -O2 -march=skylake

---
ashift(char __vector(16)):
        vpaddb  xmm0, xmm0, xmm0
        vpaddb  xmm0, xmm0, xmm0
        vpaddb  xmm0, xmm0, xmm0
        vpaddb  xmm0, xmm0, xmm0
        vpaddb  xmm0, xmm0, xmm0
        ret
ashift2(char __vector(32), char __vector(32)):
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm0, ymm0, ymm0
        vpaddb  ymm0, ymm0, ymm0
        ret
ashiftrt(char __vector(16)):
        vpmovsxbw       xmm2, xmm0
        vpsrldq xmm1, xmm0, 8
        vpmovsxbw       xmm1, xmm1
        vpsraw  xmm0, xmm2, 5
        vmovdqa xmm2, XMMWORD PTR .LC0[rip]
        vpsraw  xmm1, xmm1, 5
        vpand   xmm0, xmm2, xmm0
        vpand   xmm2, xmm2, xmm1
        vpackuswb       xmm0, xmm0, xmm2
        ret
arshiftrt2(char __vector(32)):
        vmovdqa ymm1, ymm0
        vextracti128    xmm1, ymm1, 0x1
        vmovdqa ymm2, YMMWORD PTR .LC1[rip]
        vpmovsxbw       ymm0, xmm0
        vpmovsxbw       ymm1, xmm1
        vpsraw  ymm1, ymm1, 5
        vpsraw  ymm0, ymm0, 5
        vpand   ymm0, ymm2, ymm0
        vpand   ymm2, ymm2, ymm1
        vpackuswb       ymm0, ymm0, ymm2
        vpermq  ymm0, ymm0, 216
        ret
lshiftrt(unsigned char __vector(16)):
        vpmovzxbw       xmm2, xmm0
        vpsrldq xmm1, xmm0, 8
        vpmovzxbw       xmm1, xmm1
        vpsrlw  xmm0, xmm2, 5
        vmovdqa xmm2, XMMWORD PTR .LC0[rip]
        vpsrlw  xmm1, xmm1, 5
        vpand   xmm0, xmm2, xmm0
        vpand   xmm2, xmm2, xmm1
        vpackuswb       xmm0, xmm0, xmm2
        ret
lshiftrt2(unsigned char __vector(32)):
        vmovdqa ymm1, ymm0
        vextracti128    xmm1, ymm1, 0x1
        vmovdqa ymm2, YMMWORD PTR .LC1[rip]
        vpmovzxbw       ymm0, xmm0
        vpmovzxbw       ymm1, xmm1
        vpsrlw  ymm1, ymm1, 5
        vpsrlw  ymm0, ymm0, 5
        vpand   ymm0, ymm2, ymm0
        vpand   ymm2, ymm2, ymm1
        vpackuswb       ymm0, ymm0, ymm2
        vpermq  ymm0, ymm0, 216
        ret
.LC0:
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
.LC1:
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
        .value  255
---

icc has
---
ashift(char __vector(16)):
        vpsllw    xmm1, xmm0, 5                                 #9.16
        vpand     xmm0, xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip] #9.16
        ret                                                     #9.16
ashift2(char __vector(32), char __vector(32)):
        vpsllw    ymm2, ymm0, 5                                 #15.16
        vpand     ymm0, ymm2, YMMWORD PTR .L_2il0floatpacket.1[rip] #15.16
        ret                                                     #15.16
ashiftrt(char __vector(16)):
        vpsrlw    xmm1, xmm0, 5                                 #21.16
        vpand     xmm0, xmm1, XMMWORD PTR .L_2il0floatpacket.2[rip] #21.16
        ret                                                     #21.16
arshiftrt2(char __vector(32)):
        vpsrlw    ymm1, ymm0, 5                                 #27.16
        vpand     ymm0, ymm1, YMMWORD PTR .L_2il0floatpacket.3[rip] #27.16
        ret                                                     #27.16
lshiftrt(unsigned char __vector(16)):
        vpsrlw    xmm1, xmm0, 5                                 #33.16
        vpand     xmm0, xmm1, XMMWORD PTR .L_2il0floatpacket.2[rip] #33.16
        ret                                                     #33.16
lshiftrt2(unsigned char __vector(32)):
        vpsrlw    ymm1, ymm0, 5                                 #39.16
        vpand     ymm0, ymm1, YMMWORD PTR .L_2il0floatpacket.3[rip] #39.16
        ret                                                     #39.16
.L_2il0floatpacket.1:
        .long   0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0
.L_2il0floatpacket.3:
        .long   0x07070707,0x07070707,0x07070707,0x07070707,0x07070707,0x07070707,0x07070707,0x07070707
.L_2il0floatpacket.0:
        .long   0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0,0xe0e0e0e0
.L_2il0floatpacket.2:
        .long   0x07070707,0x07070707,0x07070707,0x07070707
---

icc takes far fewer instructions than gcc.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug target/95524] Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake
  2020-06-04  6:24 [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake crazylht at gmail dot com
@ 2020-06-11  8:21 ` crazylht at gmail dot com
  2020-06-15 13:08 ` crazylht at gmail dot com
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-06-11  8:21 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95524

--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
Microbenchmark results:

interleave_ashiftrt : 69023847
magic_ashiftrt :      62488066

That is roughly a 10% improvement.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug target/95524] Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake
  2020-06-04  6:24 [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake crazylht at gmail dot com
  2020-06-11  8:21 ` [Bug target/95524] " crazylht at gmail dot com
@ 2020-06-15 13:08 ` crazylht at gmail dot com
  2020-06-15 13:16 ` crazylht at gmail dot com
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-06-15 13:08 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95524

--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
Microbenchmark results on Skylake client:
---
benchmark       Skylake client          
ashift          improvement
v16qi           13%
v32qi           5%
v64qi           7%

ashiftrt        
v16qi           5%
v32qi           7%
v64qi           6%

lshiftrt        
v16qi           16%
v32qi           13%
v64qi           6%
---

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug target/95524] Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake
  2020-06-04  6:24 [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake crazylht at gmail dot com
  2020-06-11  8:21 ` [Bug target/95524] " crazylht at gmail dot com
  2020-06-15 13:08 ` crazylht at gmail dot com
@ 2020-06-15 13:16 ` crazylht at gmail dot com
  2020-06-17  8:05 ` cvs-commit at gcc dot gnu.org
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-06-15 13:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95524

--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Hongtao.liu from comment #0)

> icc has
> ---
> ashift(char __vector(16)):
>         vpsllw    xmm1, xmm0, 5                                 #9.16
>         vpand     xmm0, xmm1, XMMWORD PTR .L_2il0floatpacket.0[rip] #9.16
>         ret                                                     #9.16
> ashift2(char __vector(32), char __vector(32)):
>         vpsllw    ymm2, ymm0, 5                                 #15.16
>         vpand     ymm0, ymm2, YMMWORD PTR .L_2il0floatpacket.1[rip] #15.16
>         ret                                                     #15.16
> ashiftrt(char __vector(16)):
>         vpsrlw    xmm1, xmm0, 5                                 #21.16
>         vpand     xmm0, xmm1, XMMWORD PTR .L_2il0floatpacket.2[rip] #21.16
>         ret                                                     #21.16
> arshiftrt2(char __vector(32)):
>         vpsrlw    ymm1, ymm0, 5                                 #27.16
>         vpand     ymm0, ymm1, YMMWORD PTR .L_2il0floatpacket.3[rip] #27.16
>         ret                                                     #27.16
>         .long  
> 

ICC seems to generate incorrect code for ashiftrt: it emits the same
vpsrlw/vpand sequence as for the unsigned lshiftrt, which does not
sign-extend. Clang's output is correct and still better than GCC's; see
https://godbolt.org/z/ttV5xY

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug target/95524] Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake
  2020-06-04  6:24 [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake crazylht at gmail dot com
                   ` (2 preceding siblings ...)
  2020-06-15 13:16 ` crazylht at gmail dot com
@ 2020-06-17  8:05 ` cvs-commit at gcc dot gnu.org
  2020-07-09  6:56 ` crazylht at gmail dot com
  2021-08-21 18:27 ` pinskia at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2020-06-17  8:05 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95524

--- Comment #4 from CVS Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by hongtao Liu <liuhongt@gcc.gnu.org>:

https://gcc.gnu.org/g:c7199fb6e694d1a0964351200648c24c3ee97973

commit r11-1411-gc7199fb6e694d1a0964351200648c24c3ee97973
Author: liuhongt <hongtao.liu@intel.com>
Date:   Mon Jun 15 13:48:45 2020 +0800

    Optimize V16QI/V32QI/V64QI shift by constant.

    gcc/ChangeLog:
            PR target/95524
            * config/i386/i386-expand.c
            (ix86_expand_vec_shift_qihi_constant): New function.
            * config/i386/i386-protos.h
            (ix86_expand_vec_shift_qihi_constant): Declare.
            * config/i386/sse.md (<shift_insn><mode>3): Optimize shift
            V*QImode by constant.

    gcc/testsuite/ChangeLog:
            * gcc.target/i386/avx2-shiftqihi-constant-1.c: New test.
            * gcc.target/i386/avx2-shiftqihi-constant-2.c: Ditto.
            * gcc.target/i386/avx512bw-shiftqihi-constant-1.c: Ditto.
            * gcc.target/i386/avx512bw-shiftqihi-constant-2.c: Ditto.
            * gcc.target/i386/sse2-shiftqihi-constant-1.c: Ditto.
            * gcc.target/i386/sse2-shiftqihi-constant-2.c: Ditto.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug target/95524] Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake
  2020-06-04  6:24 [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake crazylht at gmail dot com
                   ` (3 preceding siblings ...)
  2020-06-17  8:05 ` cvs-commit at gcc dot gnu.org
@ 2020-07-09  6:56 ` crazylht at gmail dot com
  2021-08-21 18:27 ` pinskia at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: crazylht at gmail dot com @ 2020-07-09  6:56 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95524

Hongtao.liu <crazylht at gmail dot com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |RESOLVED
         Resolution|---                         |FIXED

--- Comment #5 from Hongtao.liu <crazylht at gmail dot com> ---
Fixed in GCC 11.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug target/95524] Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake
  2020-06-04  6:24 [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake crazylht at gmail dot com
                   ` (4 preceding siblings ...)
  2020-07-09  6:56 ` crazylht at gmail dot com
@ 2021-08-21 18:27 ` pinskia at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-08-21 18:27 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95524

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|---                         |11.0

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-08-21 18:27 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-04  6:24 [Bug target/95524] New: Suboptimal codegen for shift by constant for v16qi/v32qi under -march=skylake crazylht at gmail dot com
2020-06-11  8:21 ` [Bug target/95524] " crazylht at gmail dot com
2020-06-15 13:08 ` crazylht at gmail dot com
2020-06-15 13:16 ` crazylht at gmail dot com
2020-06-17  8:05 ` cvs-commit at gcc dot gnu.org
2020-07-09  6:56 ` crazylht at gmail dot com
2021-08-21 18:27 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).