* [PATCH] Optimize vec_setv8{hi,hf}_0 + pmovzxbq to pmovzxbq.
@ 2022-05-09 6:43 liuhongt
2022-05-09 6:44 ` Hongtao Liu
0 siblings, 1 reply; 3+ messages in thread
From: liuhongt @ 2022-05-09 6:43 UTC (permalink / raw)
To: gcc-patches; +Cc: ubizjak
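Clearing the bits above the low 16 bits of the source is not needed for pmovzxbq/pmovsxbq, because these instructions read only the low two bytes of the source operand.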
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR target/105072
* config/i386/sse.md (*sse4_1_<code>v2qiv2di2<mask_name>_1):
New define_insn.
(*sse4_1_zero_extendv2qiv2di2_2): New pre_reload
define_insn_and_split.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr105072.c: New test.
---
gcc/config/i386/sse.md | 45 +++++++++++++++++++++---
gcc/testsuite/gcc.target/i386/pr105072.c | 24 +++++++++++++
2 files changed, 65 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr105072.c
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7b791def542..47f8b18b82e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -22297,15 +22297,52 @@ (define_insn "sse4_1_<code>v2qiv2di2<mask_name>"
(set_attr "prefix" "orig,orig,maybe_evex")
(set_attr "mode" "TI")])
+(define_insn "*sse4_1_<code>v2qiv2di2<mask_name>_1"
+ [(set (match_operand:V2DI 0 "register_operand" "=v")
+ (any_extend:V2DI
+ (match_operand:V2QI 1 "memory_operand" "m")))]
+ "TARGET_SSE4_1 && <mask_avx512vl_condition>"
+ "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
+ [(set_attr "type" "ssemov")
+ (set_attr "prefix_extra" "1")
+ (set_attr "prefix" "maybe_evex")
+ (set_attr "mode" "TI")])
+
(define_expand "<insn>v2qiv2di2"
[(set (match_operand:V2DI 0 "register_operand")
(any_extend:V2DI
- (match_operand:V2QI 1 "register_operand")))]
+ (match_operand:V2QI 1 "nonimmediate_operand")))]
"TARGET_SSE4_1"
{
- rtx op1 = force_reg (V2QImode, operands[1]);
- op1 = lowpart_subreg (V16QImode, op1, V2QImode);
- emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1));
+ if (!MEM_P (operands[1]))
+ {
+ rtx op1 = force_reg (V2QImode, operands[1]);
+ op1 = lowpart_subreg (V16QImode, op1, V2QImode);
+ emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1));
+ DONE;
+ }
+})
+
+(define_insn_and_split "*sse4_1_zero_extendv2qiv2di2_2"
+ [(set (match_operand:V2DI 0 "register_operand")
+ (zero_extend:V2DI
+ (vec_select:V2QI
+ (subreg:V16QI
+ (vec_merge:V8_128
+ (vec_duplicate:V8_128
+ (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))
+ (match_operand:V8_128 2 "const0_operand")
+ (const_int 1)) 0)
+ (parallel [(const_int 0) (const_int 1)]))))]
+ "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ if (!MEM_P (operands[1]))
+ operands[1] = force_reg (<ssescalarmode>mode, operands[1]);
+ operands[1] = lowpart_subreg (V2QImode, operands[1], <ssescalarmode>mode);
+ emit_insn (gen_zero_extendv2qiv2di2 (operands[0], operands[1]));
DONE;
})
diff --git a/gcc/testsuite/gcc.target/i386/pr105072.c b/gcc/testsuite/gcc.target/i386/pr105072.c
new file mode 100644
index 00000000000..54e229731b8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr105072.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)pmovzxbq[ \t]+} "4" } } */
+/* { dg-final { scan-assembler-not {(?n)pinsrw[ \t]+} } } */
+
+#include<immintrin.h>
+
+__m128i foo (void *p){
+ return _mm_cvtepu8_epi64(_mm_loadu_si16(p));
+}
+
+__m128i foo2 (short a){
+ return _mm_cvtepu8_epi64(_mm_set_epi16(0, 0, 0, 0, 0, 0, 0, a));
+}
+
+__m128i
+foo3 (void *p){
+ return _mm_cvtepu8_epi64((__m128i)__extension__(__m128h) {*(_Float16 const*)p, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
+}
+
+__m128i
+foo4 (_Float16 a){
+ return _mm_cvtepu8_epi64((__m128i)__extension__(__m128h) {a, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
+}
--
2.18.1
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] Optimize vec_setv8{hi,hf}_0 + pmovzxbq to pmovzxbq.
2022-05-09 6:43 [PATCH] Optimize vec_setv8{hi,hf}_0 + pmovzxbq to pmovzxbq liuhongt
@ 2022-05-09 6:44 ` Hongtao Liu
2022-05-09 7:56 ` Uros Bizjak
0 siblings, 1 reply; 3+ messages in thread
From: Hongtao Liu @ 2022-05-09 6:44 UTC (permalink / raw)
To: Uros Bizjak; +Cc: GCC Patches
On Mon, May 9, 2022 at 2:43 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Clean up of 16-bit uppers is not needed for pmovzxbq/pmovsxbq.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
> PR target/105072
> * config/i386/sse.md (*sse4_1_<code>v2qiv2di2<mask_name>_1):
> New define_insn.
> (*sse4_1_zero_extendv2qiv2di2_2): New pre_reload
> define_insn_and_split.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr105072.c: New test.
> ---
> gcc/config/i386/sse.md | 45 +++++++++++++++++++++---
> gcc/testsuite/gcc.target/i386/pr105072.c | 24 +++++++++++++
> 2 files changed, 65 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr105072.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 7b791def542..47f8b18b82e 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -22297,15 +22297,52 @@ (define_insn "sse4_1_<code>v2qiv2di2<mask_name>"
> (set_attr "prefix" "orig,orig,maybe_evex")
> (set_attr "mode" "TI")])
>
> +(define_insn "*sse4_1_<code>v2qiv2di2<mask_name>_1"
> + [(set (match_operand:V2DI 0 "register_operand" "=v")
> + (any_extend:V2DI
> + (match_operand:V2QI 1 "memory_operand" "m")))]
> + "TARGET_SSE4_1 && <mask_avx512vl_condition>"
> + "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
> + [(set_attr "type" "ssemov")
> + (set_attr "prefix_extra" "1")
> + (set_attr "prefix" "maybe_evex")
> + (set_attr "mode" "TI")])
> +
> (define_expand "<insn>v2qiv2di2"
> [(set (match_operand:V2DI 0 "register_operand")
> (any_extend:V2DI
> - (match_operand:V2QI 1 "register_operand")))]
> + (match_operand:V2QI 1 "nonimmediate_operand")))]
> "TARGET_SSE4_1"
> {
> - rtx op1 = force_reg (V2QImode, operands[1]);
> - op1 = lowpart_subreg (V16QImode, op1, V2QImode);
> - emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1));
> + if (!MEM_P (operands[1]))
> + {
> + rtx op1 = force_reg (V2QImode, operands[1]);
> + op1 = lowpart_subreg (V16QImode, op1, V2QImode);
> + emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1));
> + DONE;
> + }
> +})
> +
> +(define_insn_and_split "*sse4_1_zero_extendv2qiv2di2_2"
> + [(set (match_operand:V2DI 0 "register_operand")
> + (zero_extend:V2DI
> + (vec_select:V2QI
> + (subreg:V16QI
> + (vec_merge:V8_128
> + (vec_duplicate:V8_128
> + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))
> + (match_operand:V8_128 2 "const0_operand")
> + (const_int 1)) 0)
> + (parallel [(const_int 0) (const_int 1)]))))]
> + "TARGET_SSE4_1 && ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(const_int 0)]
> +{
> + if (!MEM_P (operands[1]))
> + operands[1] = force_reg (<ssescalarmode>mode, operands[1]);
> + operands[1] = lowpart_subreg (V2QImode, operands[1], <ssescalarmode>mode);
> + emit_insn (gen_zero_extendv2qiv2di2 (operands[0], operands[1]));
> DONE;
> })
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr105072.c b/gcc/testsuite/gcc.target/i386/pr105072.c
> new file mode 100644
> index 00000000000..54e229731b8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105072.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse4.1 -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)pmovzxbq[ \t]+} "4" } } */
> +/* { dg-final { scan-assembler-not {(?n)pinsrw[ \t]+} } } */
> +
> +#include<immintrin.h>
> +
> +__m128i foo (void *p){
> + return _mm_cvtepu8_epi64(_mm_loadu_si16(p));
> +}
> +
> +__m128i foo2 (short a){
> + return _mm_cvtepu8_epi64(_mm_set_epi16(0, 0, 0, 0, 0, 0, 0, a));
> +}
> +
> +__m128i
> +foo3 (void *p){
> + return _mm_cvtepu8_epi64((__m128i)__extension__(__m128h) {*(_Float16 const*)p, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
> +}
> +
> +__m128i
> +foo4 (_Float16 a){
> + return _mm_cvtepu8_epi64((__m128i)__extension__(__m128h) {a, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
> +}
> --
> 2.18.1
>
--
BR,
Hongtao
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PATCH] Optimize vec_setv8{hi,hf}_0 + pmovzxbq to pmovzxbq.
2022-05-09 6:44 ` Hongtao Liu
@ 2022-05-09 7:56 ` Uros Bizjak
0 siblings, 0 replies; 3+ messages in thread
From: Uros Bizjak @ 2022-05-09 7:56 UTC (permalink / raw)
To: Hongtao Liu; +Cc: GCC Patches
On Mon, May 9, 2022 at 8:44 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 2:43 PM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Clean up of 16-bit uppers is not needed for pmovzxbq/pmovsxbq.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> > PR target/105072
> > * config/i386/sse.md (*sse4_1_<code>v2qiv2di2<mask_name>_1):
> > New define_insn.
> > (*sse4_1_zero_extendv2qiv2di2_2): New pre_reload
> > define_insn_and_split.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr105072.c: New test.
OK.
Thanks,
Uros.
> > ---
> > gcc/config/i386/sse.md | 45 +++++++++++++++++++++---
> > gcc/testsuite/gcc.target/i386/pr105072.c | 24 +++++++++++++
> > 2 files changed, 65 insertions(+), 4 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr105072.c
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index 7b791def542..47f8b18b82e 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -22297,15 +22297,52 @@ (define_insn "sse4_1_<code>v2qiv2di2<mask_name>"
> > (set_attr "prefix" "orig,orig,maybe_evex")
> > (set_attr "mode" "TI")])
> >
> > +(define_insn "*sse4_1_<code>v2qiv2di2<mask_name>_1"
> > + [(set (match_operand:V2DI 0 "register_operand" "=v")
> > + (any_extend:V2DI
> > + (match_operand:V2QI 1 "memory_operand" "m")))]
> > + "TARGET_SSE4_1 && <mask_avx512vl_condition>"
> > + "%vpmov<extsuffix>bq\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}"
> > + [(set_attr "type" "ssemov")
> > + (set_attr "prefix_extra" "1")
> > + (set_attr "prefix" "maybe_evex")
> > + (set_attr "mode" "TI")])
> > +
> > (define_expand "<insn>v2qiv2di2"
> > [(set (match_operand:V2DI 0 "register_operand")
> > (any_extend:V2DI
> > - (match_operand:V2QI 1 "register_operand")))]
> > + (match_operand:V2QI 1 "nonimmediate_operand")))]
> > "TARGET_SSE4_1"
> > {
> > - rtx op1 = force_reg (V2QImode, operands[1]);
> > - op1 = lowpart_subreg (V16QImode, op1, V2QImode);
> > - emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1));
> > + if (!MEM_P (operands[1]))
> > + {
> > + rtx op1 = force_reg (V2QImode, operands[1]);
> > + op1 = lowpart_subreg (V16QImode, op1, V2QImode);
> > + emit_insn (gen_sse4_1_<code>v2qiv2di2 (operands[0], op1));
> > + DONE;
> > + }
> > +})
> > +
> > +(define_insn_and_split "*sse4_1_zero_extendv2qiv2di2_2"
> > + [(set (match_operand:V2DI 0 "register_operand")
> > + (zero_extend:V2DI
> > + (vec_select:V2QI
> > + (subreg:V16QI
> > + (vec_merge:V8_128
> > + (vec_duplicate:V8_128
> > + (match_operand:<ssescalarmode> 1 "nonimmediate_operand"))
> > + (match_operand:V8_128 2 "const0_operand")
> > + (const_int 1)) 0)
> > + (parallel [(const_int 0) (const_int 1)]))))]
> > + "TARGET_SSE4_1 && ix86_pre_reload_split ()"
> > + "#"
> > + "&& 1"
> > + [(const_int 0)]
> > +{
> > + if (!MEM_P (operands[1]))
> > + operands[1] = force_reg (<ssescalarmode>mode, operands[1]);
> > + operands[1] = lowpart_subreg (V2QImode, operands[1], <ssescalarmode>mode);
> > + emit_insn (gen_zero_extendv2qiv2di2 (operands[0], operands[1]));
> > DONE;
> > })
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr105072.c b/gcc/testsuite/gcc.target/i386/pr105072.c
> > new file mode 100644
> > index 00000000000..54e229731b8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr105072.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse4.1 -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)pmovzxbq[ \t]+} "4" } } */
> > +/* { dg-final { scan-assembler-not {(?n)pinsrw[ \t]+} } } */
> > +
> > +#include<immintrin.h>
> > +
> > +__m128i foo (void *p){
> > + return _mm_cvtepu8_epi64(_mm_loadu_si16(p));
> > +}
> > +
> > +__m128i foo2 (short a){
> > + return _mm_cvtepu8_epi64(_mm_set_epi16(0, 0, 0, 0, 0, 0, 0, a));
> > +}
> > +
> > +__m128i
> > +foo3 (void *p){
> > + return _mm_cvtepu8_epi64((__m128i)__extension__(__m128h) {*(_Float16 const*)p, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
> > +}
> > +
> > +__m128i
> > +foo4 (_Float16 a){
> > + return _mm_cvtepu8_epi64((__m128i)__extension__(__m128h) {a, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
> > +}
> > --
> > 2.18.1
> >
>
>
> --
> BR,
> Hongtao
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2022-05-09 7:56 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-09 6:43 [PATCH] Optimize vec_setv8{hi,hf}_0 + pmovzxbq to pmovzxbq liuhongt
2022-05-09 6:44 ` Hongtao Liu
2022-05-09 7:56 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).