* [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns. @ 2022-07-11 1:15 liuhongt 2022-07-11 8:02 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: liuhongt @ 2022-07-11 1:15 UTC (permalink / raw) To: gcc-patches And split it to GPR-version instruction after reload. This will enable below optimization for 16/32/64-bit vector bit_op - movd (%rdi), %xmm0 - movd (%rsi), %xmm1 - pand %xmm1, %xmm0 - movd %xmm0, (%rdi) + movl (%rsi), %eax + andl %eax, (%rdi) Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR target/106038 * config/i386/mmx.md (<code><mode>3): Expand with (clobber (reg:CC flags_reg)) under TARGET_64BIT (mmx_code><mode>3): Ditto. (*mmx_<code><mode>3_1): New define_insn, add post_reload splitter after it. (*<code><mode>3): New define_insn, also add post_reload splitter after it. (mmxinsnmode): New mode attribute. (VI_16_32_64): New mode iterator. (*mov<mode>_imm): Refactor with mmxinsnmode. * config/i386/predicates.md (nonimmediate_or_x86_64_vector_cst): New predicate. gcc/testsuite/ChangeLog: * gcc.target/i386/pr106038-1.c: New test. * gcc.target/i386/pr106038-2.c: New test. * gcc.target/i386/pr106038-3.c: New test. --- gcc/config/i386/mmx.md | 131 +++++++++++++++------ gcc/config/i386/predicates.md | 4 + gcc/testsuite/gcc.target/i386/pr106038-1.c | 61 ++++++++++ gcc/testsuite/gcc.target/i386/pr106038-2.c | 35 ++++++ gcc/testsuite/gcc.target/i386/pr106038-3.c | 17 +++ 5 files changed, 213 insertions(+), 35 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 3294c1e6274..85b06abea27 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64 (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT") (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")]) +(define_mode_iterator VI_16_32_64 + [V2QI V4QI V2HI + (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") + (V2SI "TARGET_64BIT")]) + ;; V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V2QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) +;; Mapping to same size integral mode. +(define_mode_attr mmxinsnmode + [(V8QI "DI") (V4QI "SI") (V2QI "HI") + (V4HI "DI") (V2HI "SI") + (V2SI "DI") + (V4HF "DI") (V2HF "SI") + (V2SF "DI")]) + (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm" HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], <MODE>mode); operands[1] = GEN_INT (val); - machine_mode mode; - switch (GET_MODE_SIZE (<MODE>mode)) - { - case 2: - mode = HImode; - break; - case 4: - mode = SImode; - break; - case 8: - mode = DImode; - break; - default: - gcc_unreachable (); - } - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) ;; For TARGET_64BIT we always round up to 8 bytes. @@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3" (match_operand:MMXMODEI 1 "register_mmxmem_operand") (match_operand:MMXMODEI 2 "register_mmxmem_operand")))] "TARGET_MMX || TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") +{ + ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands); + if (TARGET_64BIT) + { + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); + DONE; + } +}) (define_expand "<code><mode>3" [(set (match_operand:MMXMODEI 0 "register_operand") (any_logic:MMXMODEI (match_operand:MMXMODEI 1 "register_operand") (match_operand:MMXMODEI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE") + "TARGET_MMX_WITH_SSE" +{ + if (TARGET_64BIT) + { + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); + DONE; + } +}) (define_insn "*mmx_<code><mode>3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") @@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3" (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) -(define_insn "<code><mode>3" - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") - (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) +(define_insn "*mmx_<code><mode>3_1" + [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r") + (any_logic:MMXMODEI + (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0") + (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT + && (TARGET_MMX || TARGET_SSE2) + && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" + "#" + [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64") + (set_attr "mmx_isa" "native,*,*,*,*,*") + (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu") + (set_attr "mode" "DI,TI,TI,TI,DI,DI")]) + +(define_split + [(set (match_operand:MMXMODEI 0 "register_operand") + (any_logic:MMXMODEI + (match_operand:MMXMODEI 1 "register_mmxmem_operand") + (match_operand:MMXMODEI 2 "register_mmxmem_operand"))) (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT + && (TARGET_MMX || TARGET_SSE2) + && reload_completed + && !general_reg_operand (operands[0], <MODE>mode)" + [(set (match_dup 0) + (any_logic:<MODE> (match_dup 1) (match_dup 2)))]) + +(define_expand "<code><mode>3" + [(set (match_operand:VI_16_32 0 "register_operand") + (any_logic:VI_16_32 + (match_operand:VI_16_32 1 "register_operand") + (match_operand:VI_16_32 2 "register_operand")))] "" +{ + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); + DONE; +}) + +(define_insn "*<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v") + (any_logic:VI_16_32 + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "#" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "type" "alu,sselog,sselog,sselog") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") + (set_attr "type" "alu,alu,sselog,sselog,sselog") + (set_attr "mode" "SI,SI,TI,TI,TI")]) (define_split - [(set (match_operand:VI_16_32 0 "general_reg_operand") - (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "general_reg_operand") - (match_operand:VI_16_32 2 "general_reg_operand"))) + [(set (match_operand:VI_16_32_64 0 "") + (any_logic:VI_16_32_64 + (match_operand:VI_16_32_64 1 "") + (match_operand:VI_16_32_64 2 ""))) (clobber (reg:CC FLAGS_REG))] - "reload_completed" + "reload_completed + && !sse_reg_operand (operands[1], <MODE>mode) + && !sse_reg_operand (operands[2], <MODE>mode) + && !sse_reg_operand (operands[0], <MODE>mode)" [(parallel [(set (match_dup 0) - (any_logic:SI (match_dup 1) (match_dup 2))) + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); + if (CONSTANT_P (operands[2])) + { + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], + <MODE>mode); + operands[2] = GEN_INT (val); + } + else + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) (define_split diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index c71c453cceb..62280f58478 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" return trunc_int_for_mode (val, SImode) == val; }) +(define_predicate "nonimmediate_or_x86_64_vector_cst" + (ior (match_operand 0 "nonimmediate_operand") + (match_operand 0 "x86_64_const_vector_operand"))) + ;; Return true when OP is nonimmediate or standard SSE constant. (define_predicate "nonimmediate_or_sse_const_operand" (ior (match_operand 0 "nonimmediate_operand") diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c new file mode 100644 index 00000000000..ac5d1990682 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" } } */ + +void +foo (char* a, char* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; + a[2] &= b[2]; + a[3] &= b[3]; +} + +void +foo1 (char* a, char* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; +} + +void +foo2 (char* a, char* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; + a[2] &= b[2]; + a[3] &= b[3]; + a[4] &= b[4]; + a[5] &= b[5]; + a[6] &= b[6]; + a[7] &= b[7]; +} + +void +foo3 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 3; + a[3] &= 3; +} + +void +foo4 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} + +void +foo5 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 2; + a[3] &= 3; + a[4] &= 4; + a[5] &= 5; + a[6] &= 6; + a[7] &= 7; +} diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c new file mode 100644 index 00000000000..dce8a536a95 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" } } */ + +void +foo (short* a, short* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; + a[2] &= b[2]; + a[3] &= b[3]; +} + +void +foo1 (short* a, short* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; +} + +void +foo3 (short* a, short* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 3; + a[3] &= 3; +} + +void +foo4 (short* a, short* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c new file mode 100644 index 00000000000..3c7bd978f36 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" } } */ + +void +foo1 (int* a, int* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; +} + +void +foo4 (int* a, int* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} -- 2.18.1 ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns. 2022-07-11 1:15 [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns liuhongt @ 2022-07-11 8:02 ` Uros Bizjak 2022-07-12 6:37 ` Hongtao Liu 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-11 8:02 UTC (permalink / raw) To: liuhongt; +Cc: gcc-patches, H. J. Lu On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote: > > And split it to GPR-version instruction after reload. > > This will enable below optimization for 16/32/64-bit vector bit_op > > - movd (%rdi), %xmm0 > - movd (%rsi), %xmm1 > - pand %xmm1, %xmm0 > - movd %xmm0, (%rdi) > + movl (%rsi), %eax > + andl %eax, (%rdi) > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? The patch will create many interunit moves (xmm <-> gpr) for anything but the most simple logic sequences, because operations with memory/immediate will be forced into GPR registers, while reg/reg operations will remain in XMM registers. I tried to introduce GPR registers to MMX logic insns in the past and observed the above behavior, but perhaps RA evolved in the mean time to handle different register sets better (especially under register pressure). However, I would advise to be careful with this functionality. Perhaps this problem should be attacked in stages. First, please introduce GPR registers to MMX logic instructions (similar to how VI_16_32 mode instructions are handled). After RA effects will be analysed, only then memory/immediate handling should be added. Also, please don't forget to handle ANDNOT insn - TARGET_BMI slightly complicates this part, but this is also solved with VI_16_32 mode instructions. Uros. > > gcc/ChangeLog: > > PR target/106038 > * config/i386/mmx.md (<code><mode>3): Expand > with (clobber (reg:CC flags_reg)) under TARGET_64BIT > (mmx_code><mode>3): Ditto. > (*mmx_<code><mode>3_1): New define_insn, add post_reload > splitter after it. > (*<code><mode>3): New define_insn, also add post_reload > splitter after it. > (mmxinsnmode): New mode attribute. > (VI_16_32_64): New mode iterator. > (*mov<mode>_imm): Refactor with mmxinsnmode. > * config/i386/predicates.md > (nonimmediate_or_x86_64_vector_cst): New predicate. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr106038-1.c: New test. > * gcc.target/i386/pr106038-2.c: New test. > * gcc.target/i386/pr106038-3.c: New test. > --- > gcc/config/i386/mmx.md | 131 +++++++++++++++------ > gcc/config/i386/predicates.md | 4 + > gcc/testsuite/gcc.target/i386/pr106038-1.c | 61 ++++++++++ > gcc/testsuite/gcc.target/i386/pr106038-2.c | 35 ++++++ > gcc/testsuite/gcc.target/i386/pr106038-3.c | 17 +++ > 5 files changed, 213 insertions(+), 35 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 3294c1e6274..85b06abea27 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64 > (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT") > (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")]) > > +(define_mode_iterator VI_16_32_64 > + [V2QI V4QI V2HI > + (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") > + (V2SI "TARGET_64BIT")]) > + > ;; V2S* modes > (define_mode_iterator V2FI [V2SF V2SI]) > > @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize > [(V8QI "b") (V4QI "b") (V2QI "b") > (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) > > +;; Mapping to same size integral mode. > +(define_mode_attr mmxinsnmode > + [(V8QI "DI") (V4QI "SI") (V2QI "HI") > + (V4HI "DI") (V2HI "SI") > + (V2SI "DI") > + (V4HF "DI") (V2HF "SI") > + (V2SF "DI")]) > + > (define_mode_attr mmxdoublemode > [(V8QI "V8HI") (V4HI "V4SI")]) > > @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm" > HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], > <MODE>mode); > operands[1] = GEN_INT (val); > - machine_mode mode; > - switch (GET_MODE_SIZE (<MODE>mode)) > - { > - case 2: > - mode = HImode; > - break; > - case 4: > - mode = SImode; > - break; > - case 8: > - mode = DImode; > - break; > - default: > - gcc_unreachable (); > - } > - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > ;; For TARGET_64BIT we always round up to 8 bytes. > @@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3" > (match_operand:MMXMODEI 1 "register_mmxmem_operand") > (match_operand:MMXMODEI 2 "register_mmxmem_operand")))] > "TARGET_MMX || TARGET_MMX_WITH_SSE" > - "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") > +{ > + ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands); > + if (TARGET_64BIT) > + { > + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); > + DONE; > + } > +}) > > (define_expand "<code><mode>3" > [(set (match_operand:MMXMODEI 0 "register_operand") > (any_logic:MMXMODEI > (match_operand:MMXMODEI 1 "register_operand") > (match_operand:MMXMODEI 2 "register_operand")))] > - "TARGET_MMX_WITH_SSE") > + "TARGET_MMX_WITH_SSE" > +{ > + if (TARGET_64BIT) > + { > + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); > + DONE; > + } > +}) > > (define_insn "*mmx_<code><mode>3" > [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") > @@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3" > (set_attr "type" "mmxadd,sselog,sselog,sselog") > (set_attr "mode" "DI,TI,TI,TI")]) > > -(define_insn "<code><mode>3" > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > - (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > +(define_insn "*mmx_<code><mode>3_1" > + [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r") > + (any_logic:MMXMODEI > + (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0") > + (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m"))) > + (clobber (reg:CC FLAGS_REG))] > + "TARGET_64BIT > + && (TARGET_MMX || TARGET_SSE2) > + && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > + "#" > + [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64") > + (set_attr "mmx_isa" "native,*,*,*,*,*") > + (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu") > + (set_attr "mode" "DI,TI,TI,TI,DI,DI")]) > + > +(define_split > + [(set (match_operand:MMXMODEI 0 "register_operand") > + (any_logic:MMXMODEI > + (match_operand:MMXMODEI 1 "register_mmxmem_operand") > + (match_operand:MMXMODEI 2 "register_mmxmem_operand"))) > (clobber (reg:CC FLAGS_REG))] > + "TARGET_64BIT > + && (TARGET_MMX || TARGET_SSE2) > + && reload_completed > + && !general_reg_operand (operands[0], <MODE>mode)" > + [(set (match_dup 0) > + (any_logic:<MODE> (match_dup 1) (match_dup 2)))]) > + > +(define_expand "<code><mode>3" > + [(set (match_operand:VI_16_32 0 "register_operand") > + (any_logic:VI_16_32 > + (match_operand:VI_16_32 1 "register_operand") > + (match_operand:VI_16_32 2 "register_operand")))] > "" > +{ > + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); > + DONE; > +}) > + > +(define_insn "*<code><mode>3" > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v") > + (any_logic:VI_16_32 > + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") > + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v"))) > + (clobber (reg:CC FLAGS_REG))] > + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > "#" > - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") > - (set_attr "type" "alu,sselog,sselog,sselog") > - (set_attr "mode" "SI,TI,TI,TI")]) > + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") > + (set_attr "type" "alu,alu,sselog,sselog,sselog") > + (set_attr "mode" "SI,SI,TI,TI,TI")]) > > (define_split > - [(set (match_operand:VI_16_32 0 "general_reg_operand") > - (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "general_reg_operand") > - (match_operand:VI_16_32 2 "general_reg_operand"))) > + [(set (match_operand:VI_16_32_64 0 "") > + (any_logic:VI_16_32_64 > + (match_operand:VI_16_32_64 1 "") > + (match_operand:VI_16_32_64 2 ""))) > (clobber (reg:CC FLAGS_REG))] > - "reload_completed" > + "reload_completed > + && !sse_reg_operand (operands[1], <MODE>mode) > + && !sse_reg_operand (operands[2], <MODE>mode) > + && !sse_reg_operand (operands[0], <MODE>mode)" > [(parallel > [(set (match_dup 0) > - (any_logic:SI (match_dup 1) (match_dup 2))) > + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) > (clobber (reg:CC FLAGS_REG))])] > { > - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); > - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); > - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > + if (CONSTANT_P (operands[2])) > + { > + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], > + <MODE>mode); > + operands[2] = GEN_INT (val); > + } > + else > + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); > + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > (define_split > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index c71c453cceb..62280f58478 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" > return trunc_int_for_mode (val, SImode) == val; > }) > > +(define_predicate "nonimmediate_or_x86_64_vector_cst" > + (ior (match_operand 0 "nonimmediate_operand") > + (match_operand 0 "x86_64_const_vector_operand"))) > + > ;; Return true when OP is nonimmediate or standard SSE constant. > (define_predicate "nonimmediate_or_sse_const_operand" > (ior (match_operand 0 "nonimmediate_operand") > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c > new file mode 100644 > index 00000000000..ac5d1990682 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c > @@ -0,0 +1,61 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse2 -O2" } */ > +/* { dg-final { scan-assembler-not "xmm" } } */ > + > +void > +foo (char* a, char* __restrict b) > +{ > + a[0] &= b[0]; > + a[1] &= b[1]; > + a[2] &= b[2]; > + a[3] &= b[3]; > +} > + > +void > +foo1 (char* a, char* __restrict b) > +{ > + a[0] &= b[0]; > + a[1] &= b[1]; > +} > + > +void > +foo2 (char* a, char* __restrict b) > +{ > + a[0] &= b[0]; > + a[1] &= b[1]; > + a[2] &= b[2]; > + a[3] &= b[3]; > + a[4] &= b[4]; > + a[5] &= b[5]; > + a[6] &= b[6]; > + a[7] &= b[7]; > +} > + > +void > +foo3 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > + a[2] &= 3; > + a[3] &= 3; > +} > + > +void > +foo4 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > + > +void > +foo5 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > + a[2] &= 2; > + a[3] &= 3; > + a[4] &= 4; > + a[5] &= 5; > + a[6] &= 6; > + a[7] &= 7; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c > new file mode 100644 > index 00000000000..dce8a536a95 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c > @@ -0,0 +1,35 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse2 -O2" } */ > +/* { dg-final { scan-assembler-not "xmm" } } */ > + > +void > +foo (short* a, short* __restrict b) > +{ > + a[0] &= b[0]; > + a[1] &= b[1]; > + a[2] &= b[2]; > + a[3] &= b[3]; > +} > + > +void > +foo1 (short* a, short* __restrict b) > +{ > + a[0] &= b[0]; > + a[1] &= b[1]; > +} > + > +void > +foo3 (short* a, short* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > + a[2] &= 3; > + a[3] &= 3; > +} > + > +void > +foo4 (short* a, short* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c > new file mode 100644 > index 00000000000..3c7bd978f36 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse2 -O2" } */ > +/* { dg-final { scan-assembler-not "xmm" } } */ > + > +void > +foo1 (int* a, int* __restrict b) > +{ > + a[0] &= b[0]; > + a[1] &= b[1]; > +} > + > +void > +foo4 (int* a, int* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > -- > 2.18.1 > ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns. 2022-07-11 8:02 ` Uros Bizjak @ 2022-07-12 6:37 ` Hongtao Liu 2022-07-12 7:15 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: Hongtao Liu @ 2022-07-12 6:37 UTC (permalink / raw) To: Uros Bizjak; +Cc: liuhongt, gcc-patches On Mon, Jul 11, 2022 at 4:03 PM Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > And split it to GPR-version instruction after reload. > > > > This will enable below optimization for 16/32/64-bit vector bit_op > > > > - movd (%rdi), %xmm0 > > - movd (%rsi), %xmm1 > > - pand %xmm1, %xmm0 > > - movd %xmm0, (%rdi) > > + movl (%rsi), %eax > > + andl %eax, (%rdi) > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > Ok for trunk? > > The patch will create many interunit moves (xmm <-> gpr) for anything > but the most simple logic sequences, because operations with > memory/immediate will be forced into GPR registers, while reg/reg > operations will remain in XMM registers. Agree not to deal with mem/immediate at first. > > I tried to introduce GPR registers to MMX logic insns in the past and > observed the above behavior, but perhaps RA evolved in the mean time > to handle different register sets better (especially under register > pressure). However, I would advise to be careful with this > functionality. > > Perhaps this problem should be attacked in stages. First, please > introduce GPR registers to MMX logic instructions (similar to how > VI_16_32 mode instructions are handled). After RA effects will be There's "?r" in VI_16_32 logic instructions which prevent RA allocate gpr for testcase in the patch. Is it ok to remove "?" for them(Also add alternative "r" instead of "?r" in mmx logic insns)? If there's other instructions that prefer "v to "r", then RA will allocate "v", but for logic instructions, "r" and “v" should be treated equally, just as in the 16/32/64-bit vector mov<mode>_internal. > analysed, only then memory/immediate handling should be added. Also, > please don't forget to handle ANDNOT insn - TARGET_BMI slightly > complicates this part, but this is also solved with VI_16_32 mode > instructions. > > Uros. > > > > > gcc/ChangeLog: > > > > PR target/106038 > > * config/i386/mmx.md (<code><mode>3): Expand > > with (clobber (reg:CC flags_reg)) under TARGET_64BIT > > (mmx_code><mode>3): Ditto. > > (*mmx_<code><mode>3_1): New define_insn, add post_reload > > splitter after it. > > (*<code><mode>3): New define_insn, also add post_reload > > splitter after it. > > (mmxinsnmode): New mode attribute. > > (VI_16_32_64): New mode iterator. > > (*mov<mode>_imm): Refactor with mmxinsnmode. > > * config/i386/predicates.md > > (nonimmediate_or_x86_64_vector_cst): New predicate. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr106038-1.c: New test. > > * gcc.target/i386/pr106038-2.c: New test. > > * gcc.target/i386/pr106038-3.c: New test. > > --- > > gcc/config/i386/mmx.md | 131 +++++++++++++++------ > > gcc/config/i386/predicates.md | 4 + > > gcc/testsuite/gcc.target/i386/pr106038-1.c | 61 ++++++++++ > > gcc/testsuite/gcc.target/i386/pr106038-2.c | 35 ++++++ > > gcc/testsuite/gcc.target/i386/pr106038-3.c | 17 +++ > > 5 files changed, 213 insertions(+), 35 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c > > > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > > index 3294c1e6274..85b06abea27 100644 > > --- a/gcc/config/i386/mmx.md > > +++ b/gcc/config/i386/mmx.md > > @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64 > > (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT") > > (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")]) > > > > +(define_mode_iterator VI_16_32_64 > > + [V2QI V4QI V2HI > > + (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") > > + (V2SI "TARGET_64BIT")]) > > + > > ;; V2S* modes > > (define_mode_iterator V2FI [V2SF V2SI]) > > > > @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize > > [(V8QI "b") (V4QI "b") (V2QI "b") > > (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) > > > > +;; Mapping to same size integral mode. > > +(define_mode_attr mmxinsnmode > > + [(V8QI "DI") (V4QI "SI") (V2QI "HI") > > + (V4HI "DI") (V2HI "SI") > > + (V2SI "DI") > > + (V4HF "DI") (V2HF "SI") > > + (V2SF "DI")]) > > + > > (define_mode_attr mmxdoublemode > > [(V8QI "V8HI") (V4HI "V4SI")]) > > > > @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm" > > HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], > > <MODE>mode); > > operands[1] = GEN_INT (val); > > - machine_mode mode; > > - switch (GET_MODE_SIZE (<MODE>mode)) > > - { > > - case 2: > > - mode = HImode; > > - break; > > - case 4: > > - mode = SImode; > > - break; > > - case 8: > > - mode = DImode; > > - break; > > - default: > > - gcc_unreachable (); > > - } > > - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); > > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > > }) > > > > ;; For TARGET_64BIT we always round up to 8 bytes. > > @@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3" > > (match_operand:MMXMODEI 1 "register_mmxmem_operand") > > (match_operand:MMXMODEI 2 "register_mmxmem_operand")))] > > "TARGET_MMX || TARGET_MMX_WITH_SSE" > > - "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") > > +{ > > + ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands); > > + if (TARGET_64BIT) > > + { > > + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); > > + DONE; > > + } > > +}) > > > > (define_expand "<code><mode>3" > > [(set (match_operand:MMXMODEI 0 "register_operand") > > (any_logic:MMXMODEI > > (match_operand:MMXMODEI 1 "register_operand") > > (match_operand:MMXMODEI 2 "register_operand")))] > > - "TARGET_MMX_WITH_SSE") > > + "TARGET_MMX_WITH_SSE" > > +{ > > + if (TARGET_64BIT) > > + { > > + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); > > + DONE; > > + } > > +}) > > > > (define_insn "*mmx_<code><mode>3" > > [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") > > @@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3" > > (set_attr "type" "mmxadd,sselog,sselog,sselog") > > (set_attr "mode" "DI,TI,TI,TI")]) > > > > -(define_insn "<code><mode>3" > > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > > - (any_logic:VI_16_32 > > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > > +(define_insn "*mmx_<code><mode>3_1" > > + [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r") > > + (any_logic:MMXMODEI > > + (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0") > > + (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m"))) > > + (clobber (reg:CC FLAGS_REG))] > > + "TARGET_64BIT > > + && (TARGET_MMX || TARGET_SSE2) > > + && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > > + "#" > > + [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64") > > + (set_attr "mmx_isa" "native,*,*,*,*,*") > > + (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu") > > + (set_attr "mode" "DI,TI,TI,TI,DI,DI")]) > > + > > +(define_split > > + [(set (match_operand:MMXMODEI 0 "register_operand") > > + (any_logic:MMXMODEI > > + (match_operand:MMXMODEI 1 "register_mmxmem_operand") > > + (match_operand:MMXMODEI 2 "register_mmxmem_operand"))) > > (clobber (reg:CC FLAGS_REG))] > > + "TARGET_64BIT > > + && (TARGET_MMX || TARGET_SSE2) > > + && reload_completed > > + && !general_reg_operand (operands[0], <MODE>mode)" > > + [(set (match_dup 0) > > + (any_logic:<MODE> (match_dup 1) (match_dup 2)))]) > > + > > +(define_expand "<code><mode>3" > > + [(set (match_operand:VI_16_32 0 "register_operand") > > + (any_logic:VI_16_32 > > + (match_operand:VI_16_32 1 "register_operand") > > + (match_operand:VI_16_32 2 "register_operand")))] > > "" > > +{ > > + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); > > + DONE; > > +}) > > + > > +(define_insn "*<code><mode>3" > > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v") > > + (any_logic:VI_16_32 > > + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") > > + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v"))) > > + (clobber (reg:CC FLAGS_REG))] > > + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > > "#" > > - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") > > - (set_attr "type" "alu,sselog,sselog,sselog") > > - (set_attr "mode" "SI,TI,TI,TI")]) > > + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") > > + (set_attr "type" "alu,alu,sselog,sselog,sselog") > > + (set_attr "mode" "SI,SI,TI,TI,TI")]) > > > > (define_split > > - [(set (match_operand:VI_16_32 0 "general_reg_operand") > > - (any_logic:VI_16_32 > > - (match_operand:VI_16_32 1 "general_reg_operand") > > - (match_operand:VI_16_32 2 "general_reg_operand"))) > > + [(set (match_operand:VI_16_32_64 0 "") > > + (any_logic:VI_16_32_64 > > + (match_operand:VI_16_32_64 1 "") > > + (match_operand:VI_16_32_64 2 ""))) > > (clobber (reg:CC FLAGS_REG))] > > - "reload_completed" > > + "reload_completed > > + && !sse_reg_operand (operands[1], <MODE>mode) > > + && !sse_reg_operand (operands[2], <MODE>mode) > > + && !sse_reg_operand (operands[0], <MODE>mode)" > > [(parallel > > [(set (match_dup 0) > > - (any_logic:SI (match_dup 1) (match_dup 2))) > > + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) > > (clobber (reg:CC FLAGS_REG))])] > > { > > - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); > > - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); > > - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > > + if (CONSTANT_P (operands[2])) > > + { > > + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], > > + <MODE>mode); > > + operands[2] = GEN_INT (val); > > + } > > + else > > + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); > > + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); > > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > > }) > > > > (define_split > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > > index c71c453cceb..62280f58478 100644 > > --- a/gcc/config/i386/predicates.md > > +++ b/gcc/config/i386/predicates.md > > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" > > return trunc_int_for_mode (val, SImode) == val; > > }) > > > > +(define_predicate "nonimmediate_or_x86_64_vector_cst" > > + (ior (match_operand 0 "nonimmediate_operand") > > + (match_operand 0 "x86_64_const_vector_operand"))) > > + > > ;; Return true when OP is nonimmediate or standard SSE constant. > > (define_predicate "nonimmediate_or_sse_const_operand" > > (ior (match_operand 0 "nonimmediate_operand") > > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c > > new file mode 100644 > > index 00000000000..ac5d1990682 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c > > @@ -0,0 +1,61 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-msse2 -O2" } */ > > +/* { dg-final { scan-assembler-not "xmm" } } */ > > + > > +void > > +foo (char* a, char* __restrict b) > > +{ > > + a[0] &= b[0]; > > + a[1] &= b[1]; > > + a[2] &= b[2]; > > + a[3] &= b[3]; > > +} > > + > > +void > > +foo1 (char* a, char* __restrict b) > > +{ > > + a[0] &= b[0]; > > + a[1] &= b[1]; > > +} > > + > > +void > > +foo2 (char* a, char* __restrict b) > > +{ > > + a[0] &= b[0]; > > + a[1] &= b[1]; > > + a[2] &= b[2]; > > + a[3] &= b[3]; > > + a[4] &= b[4]; > > + a[5] &= b[5]; > > + a[6] &= b[6]; > > + a[7] &= b[7]; > > +} > > + > > +void > > +foo3 (char* a, char* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > + a[2] &= 3; > > + a[3] &= 3; > > +} > > + > > +void > > +foo4 (char* a, char* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > +} > > + > > +void > > +foo5 (char* a, char* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > + a[2] &= 2; > > + a[3] &= 3; > > + a[4] &= 4; > > + a[5] &= 5; > > + a[6] &= 6; > > + a[7] &= 7; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c > > new file mode 100644 > > index 00000000000..dce8a536a95 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c > > @@ -0,0 +1,35 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-msse2 -O2" } */ > > +/* { dg-final { scan-assembler-not "xmm" } } */ > > + > > +void > > +foo (short* a, short* __restrict b) > > +{ > > + a[0] &= b[0]; > > + a[1] &= b[1]; > > + a[2] &= b[2]; > > + a[3] &= b[3]; > > +} > > + > > +void > > +foo1 (short* a, short* __restrict b) > > +{ > > + a[0] &= b[0]; > > + a[1] &= b[1]; > > +} > > + > > +void > > +foo3 (short* a, short* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > + a[2] &= 3; > > + a[3] &= 3; > > +} > > + > > +void > > +foo4 (short* a, short* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > +} > > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c > > new file mode 100644 > > index 00000000000..3c7bd978f36 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c > > @@ -0,0 +1,17 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-msse2 -O2" } */ > > +/* { dg-final { scan-assembler-not "xmm" } } */ > > + > > +void > > +foo1 (int* a, int* __restrict b) > > +{ > > + a[0] &= b[0]; > > + a[1] &= b[1]; > > +} > > + > > +void > > +foo4 (int* a, int* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > +} > > -- > > 2.18.1 > > -- BR, Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns. 2022-07-12 6:37 ` Hongtao Liu @ 2022-07-12 7:15 ` Uros Bizjak 2022-07-14 5:33 ` [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative liuhongt 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-12 7:15 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, gcc-patches On Tue, Jul 12, 2022 at 8:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Mon, Jul 11, 2022 at 4:03 PM Uros Bizjak via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > And split it to GPR-version instruction after reload. > > > > > > This will enable below optimization for 16/32/64-bit vector bit_op > > > > > > - movd (%rdi), %xmm0 > > > - movd (%rsi), %xmm1 > > > - pand %xmm1, %xmm0 > > > - movd %xmm0, (%rdi) > > > + movl (%rsi), %eax > > > + andl %eax, (%rdi) > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > > Ok for trunk? > > > > The patch will create many interunit moves (xmm <-> gpr) for anything > > but the most simple logic sequences, because operations with > > memory/immediate will be forced into GPR registers, while reg/reg > > operations will remain in XMM registers. > Agree not to deal with mem/immediate at first. > > > > I tried to introduce GPR registers to MMX logic insns in the past and > > observed the above behavior, but perhaps RA evolved in the mean time > > to handle different register sets better (especially under register > > pressure). However, I would advise to be careful with this > > functionality. > > > > Perhaps this problem should be attacked in stages. First, please > > introduce GPR registers to MMX logic instructions (similar to how > > VI_16_32 mode instructions are handled). After RA effects will be > There's "?r" in VI_16_32 logic instructions which prevent RA allocate > gpr for testcase in the patch. > Is it ok to remove "?" for them(Also add alternative "r" instead of > "?r" in mmx logic insns)? > If there's other instructions that prefer "v to "r", then RA will > allocate "v", but for logic instructions, "r" and “v" should be > treated equally, just as in the 16/32/64-bit vector > mov<mode>_internal. ?r was introduced under the assumption that we want vector values mostly in vector registers. Currently there are no instructions with memory or immediate operand, so that made sense at the time. Let's keep ?r until logic instructions with mem/imm operands are introduced. So, for the patch that adds 64-bit vector logic in GPR, I would advise to first introduce only register operands. mem/imm operands should be added in a follow-up patch when the "?r" constraint will also be relaxed. Uros. ^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative 2022-07-12 7:15 ` Uros Bizjak @ 2022-07-14 5:33 ` liuhongt 2022-07-14 7:22 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: liuhongt @ 2022-07-14 5:33 UTC (permalink / raw) To: gcc-patches And split it to GPR-version instruction after reload. > ?r was introduced under the assumption that we want vector values > mostly in vector registers. Currently there are no instructions with > memory or immediate operand, so that made sense at the time. Let's > keep ?r until logic instructions with mem/imm operands are introduced. > So, for the patch that adds 64-bit vector logic in GPR, I would advise > to first introduce only register operands. mem/imm operands should be Update patch to add ?r to 64-bit bit_op patterns. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. No big imact on SPEC2017(Most same binary). Ok for trunk? gcc/ChangeLog: PR target/106038 * config/i386/mmx.md (<code><mode>3): Expand with (clobber (reg:CC flags_reg)) under TARGET_64BIT (mmx_code><mode>3): Ditto. (*mmx_<code><mode>3_gpr): New define_insn, add post_reload splitter after it. (mmx_andnot<mode>3_gpr): Ditto. (<code><mode>3): Extend follow define_split from VI_16_32 to VI_16_32_64. (*andnot<mode>3): Ditto. (mmxinsnmode): New mode attribute. (VI_16_32_64): New mode iterator. (*mov<mode>_imm): Refactor with mmxinsnmode. * config/i386/predicates.md gcc/testsuite/ChangeLog: * gcc.target/i386/pr106038-1.c: New test. * gcc.target/i386/pr106038-2.c: New test. * gcc.target/i386/pr106038-3.c: New test. --- gcc/config/i386/mmx.md | 131 +++++++++++++++------ gcc/testsuite/gcc.target/i386/pr106038-1.c | 61 ++++++++++ gcc/testsuite/gcc.target/i386/pr106038-2.c | 35 ++++++ gcc/testsuite/gcc.target/i386/pr106038-3.c | 17 +++ 4 files changed, 210 insertions(+), 34 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 3294c1e6274..5f7e40bd7a1 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64 (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT") (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")]) +(define_mode_iterator VI_16_32_64 + [V2QI V4QI V2HI + (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") + (V2SI "TARGET_64BIT")]) + ;; V2S* modes (define_mode_iterator V2FI [V2SF V2SI]) @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V2QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) +;; Mapping to same size integral mode. +(define_mode_attr mmxinsnmode + [(V8QI "DI") (V4QI "SI") (V2QI "HI") + (V4HI "DI") (V2HI "SI") + (V2SI "DI") + (V4HF "DI") (V2HF "SI") + (V2SF "DI")]) + (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm" HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], <MODE>mode); operands[1] = GEN_INT (val); - machine_mode mode; - switch (GET_MODE_SIZE (<MODE>mode)) - { - case 2: - mode = HImode; - break; - case 4: - mode = SImode; - break; - case 8: - mode = DImode; - break; - default: - gcc_unreachable (); - } - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) ;; For TARGET_64BIT we always round up to 8 bytes. @@ -2878,6 +2876,31 @@ (define_insn "mmx_andnot<mode>3" (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) +(define_insn "mmx_andnot<mode>3_gpr" + [(set (match_operand:MMXMODEI 0 "register_operand" "=?r,y,x,x,v") + (and:MMXMODEI + (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" "r,0,0,x,v")) + (match_operand:MMXMODEI 2 "register_mmxmem_operand" "r,ym,x,x,v"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && (TARGET_MMX || TARGET_SSE2)" + "#" + [(set_attr "isa" "bmi,*,sse2_noavx,avx,avx512vl") + (set_attr "mmx_isa" "*,native,*,*,*") + (set_attr "type" "alu,mmxadd,sselog,sselog,sselog") + (set_attr "mode" "DI,DI,TI,TI,TI")]) + +(define_split + [(set (match_operand:MMXMODEI 0 "register_operand") + (and:MMXMODEI + (not:MMXMODEI (match_operand:MMXMODEI 1 "register_mmxmem_operand")) + (match_operand:MMXMODEI 2 "register_mmxmem_operand"))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed + && (TARGET_MMX || TARGET_MMX_WITH_SSE) + && !GENERAL_REGNO_P (REGNO (operands[0]))" + [(set (match_dup 0) + (and:<MODE> (not:<MODE> (match_dup 1)) (match_dup 2)))]) + (define_insn "*andnot<mode>3" [(set (match_operand:VI_16_32 0 "register_operand" "=?&r,?r,x,x,v") (and:VI_16_32 @@ -2892,20 +2915,20 @@ (define_insn "*andnot<mode>3" (set_attr "mode" "SI,SI,TI,TI,TI")]) (define_split - [(set (match_operand:VI_16_32 0 "general_reg_operand") - (and:VI_16_32 - (not:VI_16_32 (match_operand:VI_16_32 1 "general_reg_operand")) - (match_operand:VI_16_32 2 "general_reg_operand"))) + [(set (match_operand:VI_16_32_64 0 "general_reg_operand") + (and:VI_16_32_64 + (not:VI_16_32_64 (match_operand:VI_16_32_64 1 "general_reg_operand")) + (match_operand:VI_16_32_64 2 "general_reg_operand"))) (clobber (reg:CC FLAGS_REG))] "TARGET_BMI && reload_completed" [(parallel [(set (match_dup 0) - (and:SI (not:SI (match_dup 1)) (match_dup 2))) + (and:<mmxinsnmode> (not:<mmxinsnmode> (match_dup 1)) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) (define_split @@ -2948,14 +2971,28 @@ (define_expand "mmx_<code><mode>3" (match_operand:MMXMODEI 1 "register_mmxmem_operand") (match_operand:MMXMODEI 2 "register_mmxmem_operand")))] "TARGET_MMX || TARGET_MMX_WITH_SSE" - "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") +{ + ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands); + if (TARGET_64BIT) + { + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); + DONE; + } +}) (define_expand "<code><mode>3" [(set (match_operand:MMXMODEI 0 "register_operand") (any_logic:MMXMODEI (match_operand:MMXMODEI 1 "register_operand") (match_operand:MMXMODEI 2 "register_operand")))] - "TARGET_MMX_WITH_SSE") + "TARGET_MMX_WITH_SSE" +{ + if (TARGET_64BIT) + { + ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); + DONE; + } +}) (define_insn "*mmx_<code><mode>3" [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v") @@ -2974,6 +3011,32 @@ (define_insn "*mmx_<code><mode>3" (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) +(define_insn "*mmx_<code><mode>3_gpr" + [(set (match_operand:MMXMODEI 0 "register_operand" "=?r,y,x,x,v") + (any_logic:MMXMODEI + (match_operand:MMXMODEI 1 "register_mmxmem_operand" "%0,0,0,x,v") + (match_operand:MMXMODEI 2 "register_mmxmem_operand" "r,ym,x,x,v"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && (TARGET_MMX || TARGET_SSE2) + && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" + "#" + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") + (set_attr "mmx_isa" "*,native,*,*,*") + (set_attr "type" "alu,mmxadd,sselog,sselog,sselog") + (set_attr "mode" "DI,DI,TI,TI,TI")]) + +(define_split + [(set (match_operand:MMXMODEI 0 "register_operand") + (any_logic:MMXMODEI + (match_operand:MMXMODEI 1 "register_mmxmem_operand") + (match_operand:MMXMODEI 2 "register_mmxmem_operand"))) + (clobber (reg:CC FLAGS_REG))] + "reload_completed && (TARGET_MMX || TARGET_MMX_WITH_SSE) + && !GENERAL_REGNO_P (REGNO (operands[0]))" + [(set (match_dup 0) + (any_logic:<MODE> (match_dup 1) + (match_dup 2)))]) + (define_insn "<code><mode>3" [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") (any_logic:VI_16_32 @@ -2987,20 +3050,20 @@ (define_insn "<code><mode>3" (set_attr "mode" "SI,TI,TI,TI")]) (define_split - [(set (match_operand:VI_16_32 0 "general_reg_operand") - (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "general_reg_operand") - (match_operand:VI_16_32 2 "general_reg_operand"))) + [(set (match_operand:VI_16_32_64 0 "general_reg_operand") + (any_logic:VI_16_32_64 + (match_operand:VI_16_32_64 1 "general_reg_operand") + (match_operand:VI_16_32_64 2 "general_reg_operand"))) (clobber (reg:CC FLAGS_REG))] "reload_completed" [(parallel [(set (match_dup 0) - (any_logic:SI (match_dup 1) (match_dup 2))) + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) (define_split diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c new file mode 100644 index 00000000000..c026329c843 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" { xfail *-*-* } } } */ + +void +foo (char* a, char* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; + a[2] &= b[2]; + a[3] &= b[3]; +} + +void +foo1 (char* a, char* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; +} + +void +foo2 (char* a, char* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; + a[2] &= b[2]; + a[3] &= b[3]; + a[4] &= b[4]; + a[5] &= b[5]; + a[6] &= b[6]; + a[7] &= b[7]; +} + +void +foo3 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 3; + a[3] &= 3; +} + +void +foo4 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} + +void +foo5 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 2; + a[3] &= 3; + a[4] &= 4; + a[5] &= 5; + a[6] &= 6; + a[7] &= 7; +} diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c new file mode 100644 index 00000000000..87d2070784f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" { xfail *-*-* } } } */ + +void +foo (short* a, short* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; + a[2] &= b[2]; + a[3] &= b[3]; +} + +void +foo1 (short* a, short* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; +} + +void +foo3 (short* a, short* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 3; + a[3] &= 3; +} + +void +foo4 (short* a, short* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c new file mode 100644 index 00000000000..91f7112395f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2 -mtune=generic" } */ +/* { dg-final { scan-assembler-not "xmm" { xfail { ! ia32 } } } } */ + +void +foo1 (int* a, int* __restrict b) +{ + a[0] &= b[0]; + a[1] &= b[1]; +} + +void +foo4 (int* a, int* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} -- 2.18.1 ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative 2022-07-14 5:33 ` [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative liuhongt @ 2022-07-14 7:22 ` Uros Bizjak 2022-07-14 9:32 ` Hongtao Liu 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-14 7:22 UTC (permalink / raw) To: liuhongt; +Cc: gcc-patches On Thu, Jul 14, 2022 at 7:33 AM liuhongt <hongtao.liu@intel.com> wrote: > > And split it to GPR-version instruction after reload. > > > ?r was introduced under the assumption that we want vector values > > mostly in vector registers. Currently there are no instructions with > > memory or immediate operand, so that made sense at the time. Let's > > keep ?r until logic instructions with mem/imm operands are introduced. > > So, for the patch that adds 64-bit vector logic in GPR, I would advise > > to first introduce only register operands. mem/imm operands should be > Update patch to add ?r to 64-bit bit_op patterns. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > No big imact on SPEC2017(Most same binary). The problem with your approach is with the combine pass, where combine first tries to recognize the combined instruction without clobber, before re-recognizing instruction with added clobber. So, if a forward propagation happens, the combine will *always* choose the insn variant without GPR. So, the solution with VI_16_32 is to always expand with a clobbered version that is split to either SImode or V16QImode. With 64-bit instructions, we have two additional complications. First, we have a native MMX instruction, and we have to split to it after reload, and second, we have a builtin that expects vector insn. To solve the first issue, we should change the mode of "*mmx<code><mode>" to V1DImode and split your new _gpr version with clobber to it for !GENERAL_REG_P operands. The second issue could be solved by emitting V1DImode instructions directly from the expander. Please note there are several expanders that expect non-clobbered logic insn in certain mode to be available, so the situation can become quite annoying... Uros. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative 2022-07-14 7:22 ` Uros Bizjak @ 2022-07-14 9:32 ` Hongtao Liu 2022-07-14 9:46 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: Hongtao Liu @ 2022-07-14 9:32 UTC (permalink / raw) To: Uros Bizjak; +Cc: liuhongt, gcc-patches On Thu, Jul 14, 2022 at 3:22 PM Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Thu, Jul 14, 2022 at 7:33 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > And split it to GPR-version instruction after reload. > > > > > ?r was introduced under the assumption that we want vector values > > > mostly in vector registers. Currently there are no instructions with > > > memory or immediate operand, so that made sense at the time. Let's > > > keep ?r until logic instructions with mem/imm operands are introduced. > > > So, for the patch that adds 64-bit vector logic in GPR, I would advise > > > to first introduce only register operands. mem/imm operands should be > > Update patch to add ?r to 64-bit bit_op patterns. > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > No big imact on SPEC2017(Most same binary). > > The problem with your approach is with the combine pass, where combine > first tries to recognize the combined instruction without clobber, > before re-recognizing instruction with added clobber. So, if a forward > propagation happens, the combine will *always* choose the insn variant > without GPR. Thank you for the explanation, I really did not know this point. > > So, the solution with VI_16_32 is to always expand with a clobbered > version that is split to either SImode or V16QImode. With 64-bit > instructions, we have two additional complications. First, we have a > native MMX instruction, and we have to split to it after reload, and > second, we have a builtin that expects vector insn. > > To solve the first issue, we should change the mode of > "*mmx<code><mode>" to V1DImode and split your new _gpr version with > clobber to it for !GENERAL_REG_P operands. > > The second issue could be solved by emitting V1DImode instructions > directly from the expander. Please note there are several expanders > that expect non-clobbered logic insn in certain mode to be available, > so the situation can become quite annoying... Yes. It looks like it would add a lot of code complexity, I'll hold the patch for now. > > Uros. -- BR, Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative 2022-07-14 9:32 ` Hongtao Liu @ 2022-07-14 9:46 ` Uros Bizjak 2022-07-18 1:59 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative liuhongt 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-14 9:46 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, gcc-patches On Thu, Jul 14, 2022 at 11:32 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Thu, Jul 14, 2022 at 3:22 PM Uros Bizjak via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > On Thu, Jul 14, 2022 at 7:33 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > And split it to GPR-version instruction after reload. > > > > > > > ?r was introduced under the assumption that we want vector values > > > > mostly in vector registers. Currently there are no instructions with > > > > memory or immediate operand, so that made sense at the time. Let's > > > > keep ?r until logic instructions with mem/imm operands are introduced. > > > > So, for the patch that adds 64-bit vector logic in GPR, I would advise > > > > to first introduce only register operands. mem/imm operands should be > > > Update patch to add ?r to 64-bit bit_op patterns. > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > > No big imact on SPEC2017(Most same binary). > > > > The problem with your approach is with the combine pass, where combine > > first tries to recognize the combined instruction without clobber, > > before re-recognizing instruction with added clobber. So, if a forward > > propagation happens, the combine will *always* choose the insn variant > > without GPR. > Thank you for the explanation, I really did not know this point. > > > > So, the solution with VI_16_32 is to always expand with a clobbered > > version that is split to either SImode or V16QImode. With 64-bit > > instructions, we have two additional complications. First, we have a > > native MMX instruction, and we have to split to it after reload, and > > second, we have a builtin that expects vector insn. > > > > To solve the first issue, we should change the mode of > > "*mmx<code><mode>" to V1DImode and split your new _gpr version with > > clobber to it for !GENERAL_REG_P operands. > > > > The second issue could be solved by emitting V1DImode instructions > > directly from the expander. Please note there are several expanders > > that expect non-clobbered logic insn in certain mode to be available, > > so the situation can become quite annoying... > Yes. It looks like it would add a lot of code complexity, I'll hold > the patch for now. I did some experimenting in the past with the idea of adding GPR instructions to 64-bit vectors. While there were some opportunities with 32- and 16-bit operations, mostly due to the fact that these arguments are passed via integer registers, 64-bit cases never triggered, because 64-bit vectors are passed via XMM registers. Also, when mem/imm alternatives were added, many inter-regunit moves were generated for everything but the most simple testcases involving logic operations, also considering the limited range of 64-bit immediates. IMO, the only case it is worth adding is a direct immediate store to memory, which HJ recently added. Uros. Uros. ^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative. 2022-07-14 9:46 ` Uros Bizjak @ 2022-07-18 1:59 ` liuhongt 2022-07-18 6:28 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: liuhongt @ 2022-07-18 1:59 UTC (permalink / raw) To: gcc-patches And split it after reload. >IMO, the only case it is worth adding is a direct immediate store to >memory, which HJ recently added. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR target/106038 * config/i386/mmx.md (<code><mode>3): Extend to AND mem,imm, and adjust below define_split. (mmxinsnmode): New mode attribute. (*mov<mode>_imm): Refactor with mmxinsnmode. * config/i386/predicates.md (register_or_x86_64_const_vector_operand): New predicate. gcc/testsuite/ChangeLog: * gcc.target/i386/pr106038-1.c: New test. --- gcc/config/i386/mmx.md | 58 +++++++++++----------- gcc/config/i386/predicates.md | 4 ++ gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++++ 3 files changed, 60 insertions(+), 29 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 3294c1e6274..fbcb34d4395 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V2QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) +;; Mapping to same size integral mode. +(define_mode_attr mmxinsnmode + [(V8QI "DI") (V4QI "SI") (V2QI "HI") + (V4HI "DI") (V2HI "SI") + (V2SI "DI") + (V4HF "DI") (V2HF "SI") + (V2SF "DI")]) + (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm" HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], <MODE>mode); operands[1] = GEN_INT (val); - machine_mode mode; - switch (GET_MODE_SIZE (<MODE>mode)) - { - case 2: - mode = HImode; - break; - case 4: - mode = SImode; - break; - case 8: - mode = DImode; - break; - default: - gcc_unreachable (); - } - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) ;; For TARGET_64BIT we always round up to 8 bytes. @@ -2975,32 +2968,39 @@ (define_insn "*mmx_<code><mode>3" (set_attr "mode" "DI,TI,TI,TI")]) (define_insn "<code><mode>3" - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v"))) (clobber (reg:CC FLAGS_REG))] "" "#" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "type" "alu,sselog,sselog,sselog") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") + (set_attr "type" "alu,alu,sselog,sselog,sselog") + (set_attr "mode" "SI,SI,TI,TI,TI")]) (define_split - [(set (match_operand:VI_16_32 0 "general_reg_operand") + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "general_reg_operand") - (match_operand:VI_16_32 2 "general_reg_operand"))) + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand"))) (clobber (reg:CC FLAGS_REG))] "reload_completed" [(parallel [(set (match_dup 0) - (any_logic:SI (match_dup 1) (match_dup 2))) + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); + if (GET_CODE (operands[2]) == CONST_VECTOR) + { + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], + <MODE>mode); + operands[2] = GEN_INT (val); + } + else + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) (define_split diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index c71c453cceb..5f63a7d52f5 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" return trunc_int_for_mode (val, SImode) == val; }) +(define_predicate "register_or_x86_64_const_vector_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "x86_64_const_vector_operand"))) + ;; Return true when OP is nonimmediate or standard SSE constant. (define_predicate "nonimmediate_or_sse_const_operand" (ior (match_operand 0 "nonimmediate_operand") diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c new file mode 100644 index 00000000000..bb52385c8a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" } } */ + +void +foo3 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 3; + a[3] &= 3; +} + +void +foo4 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} + + +void +foo1 (short* a, short* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} -- 2.18.1 ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative. 2022-07-18 1:59 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative liuhongt @ 2022-07-18 6:28 ` Uros Bizjak 2022-07-19 6:07 ` [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative liuhongt 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-18 6:28 UTC (permalink / raw) To: liuhongt; +Cc: gcc-patches On Mon, Jul 18, 2022 at 3:59 AM liuhongt <hongtao.liu@intel.com> wrote: > > And split it after reload. > > >IMO, the only case it is worth adding is a direct immediate store to > >memory, which HJ recently added. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > PR target/106038 > * config/i386/mmx.md (<code><mode>3): Extend to AND mem,imm, > and adjust below define_split. > (mmxinsnmode): New mode attribute. > (*mov<mode>_imm): Refactor with mmxinsnmode. > * config/i386/predicates.md > (register_or_x86_64_const_vector_operand): New predicate. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr106038-1.c: New test. > --- > gcc/config/i386/mmx.md | 58 +++++++++++----------- > gcc/config/i386/predicates.md | 4 ++ > gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++++ > 3 files changed, 60 insertions(+), 29 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 3294c1e6274..fbcb34d4395 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize > [(V8QI "b") (V4QI "b") (V2QI "b") > (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) > > +;; Mapping to same size integral mode. > +(define_mode_attr mmxinsnmode > + [(V8QI "DI") (V4QI "SI") (V2QI "HI") > + (V4HI "DI") (V2HI "SI") > + (V2SI "DI") > + (V4HF "DI") (V2HF "SI") > + (V2SF "DI")]) > + > (define_mode_attr mmxdoublemode > [(V8QI "V8HI") (V4HI "V4SI")]) > > @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm" > HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], > <MODE>mode); > operands[1] = GEN_INT (val); > - machine_mode mode; > - switch (GET_MODE_SIZE (<MODE>mode)) > - { > - case 2: > - mode = HImode; > - break; > - case 4: > - mode = SImode; > - break; > - case 8: > - mode = DImode; > - break; > - default: > - gcc_unreachable (); > - } > - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > ;; For TARGET_64BIT we always round up to 8 bytes. > @@ -2975,32 +2968,39 @@ (define_insn "*mmx_<code><mode>3" > (set_attr "mode" "DI,TI,TI,TI")]) > > (define_insn "<code><mode>3" > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") > (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") > + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v"))) > (clobber (reg:CC FLAGS_REG))] > "" You will need ix86_binary_operator_ok insn constraint here with corresponding expander using ix86_fixup_binary_operands_no_copy to prepare insn operands. > "#" > - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") > - (set_attr "type" "alu,sselog,sselog,sselog") > - (set_attr "mode" "SI,TI,TI,TI")]) > + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") > + (set_attr "type" "alu,alu,sselog,sselog,sselog") > + (set_attr "mode" "SI,SI,TI,TI,TI")]) > > (define_split > - [(set (match_operand:VI_16_32 0 "general_reg_operand") > + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") > (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "general_reg_operand") > - (match_operand:VI_16_32 2 "general_reg_operand"))) > + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") > + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand"))) > (clobber (reg:CC FLAGS_REG))] > "reload_completed" > [(parallel > [(set (match_dup 0) > - (any_logic:SI (match_dup 1) (match_dup 2))) > + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) > (clobber (reg:CC FLAGS_REG))])] > { > - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); > - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); > - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > + if (GET_CODE (operands[2]) == CONST_VECTOR) Please use if (!register_operand (operands[2], <MODE>mode)) instead. Uros. > + { > + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], > + <MODE>mode); > + operands[2] = GEN_INT (val); > + } > + else > + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); > + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > (define_split > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index c71c453cceb..5f63a7d52f5 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" > return trunc_int_for_mode (val, SImode) == val; > }) > > +(define_predicate "register_or_x86_64_const_vector_operand" > + (ior (match_operand 0 "register_operand") > + (match_operand 0 "x86_64_const_vector_operand"))) > + > ;; Return true when OP is nonimmediate or standard SSE constant. > (define_predicate "nonimmediate_or_sse_const_operand" > (ior (match_operand 0 "nonimmediate_operand") > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c > new file mode 100644 > index 00000000000..bb52385c8a5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse2 -O2" } */ > +/* { dg-final { scan-assembler-not "xmm" } } */ > + > +void > +foo3 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > + a[2] &= 3; > + a[3] &= 3; > +} > + > +void > +foo4 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > + > + > +void > +foo1 (short* a, short* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > -- > 2.18.1 > ^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-18 6:28 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative Uros Bizjak @ 2022-07-19 6:07 ` liuhongt 2022-07-19 6:34 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: liuhongt @ 2022-07-19 6:07 UTC (permalink / raw) To: gcc-patches And split it after reload. > You will need ix86_binary_operator_ok insn constraint here with > corresponding expander using ix86_fixup_binary_operands_no_copy to > prepare insn operands. Split define_expand with just register_operand, and allow memory/immediate in define_insn, assume combine/forwprop will do optimization. > Please use if (!register_operand (operands[2], <MODE>mode)) instead. Changed. Update patch. gcc/ChangeLog: PR target/106038 * config/i386/mmx.md (<code><mode>3): New define_expand, it's original "<code><mode>3". (*<code><mode>3): New define_insn, it's original "<code><mode>3" be extended to handle memory and immediate operand with ix86_binary_operator_ok. Also adjust define_split after it. (mmxinsnmode): New mode attribute. (*mov<mode>_imm): Refactor with mmxinsnmode. * config/i386/predicates.md (register_or_x86_64_const_vector_operand): New predicate. gcc/testsuite/ChangeLog: * gcc.target/i386/pr106038-1.c: New test. --- gcc/config/i386/mmx.md | 71 ++++++++++++---------- gcc/config/i386/predicates.md | 4 ++ gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++ 3 files changed, 71 insertions(+), 31 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 3294c1e6274..316b83dd3ac 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V2QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) +;; Mapping to same size integral mode. +(define_mode_attr mmxinsnmode + [(V8QI "DI") (V4QI "SI") (V2QI "HI") + (V4HI "DI") (V2HI "SI") + (V2SI "DI") + (V4HF "DI") (V2HF "SI") + (V2SF "DI")]) + (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm" HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], <MODE>mode); operands[1] = GEN_INT (val); - machine_mode mode; - switch (GET_MODE_SIZE (<MODE>mode)) - { - case 2: - mode = HImode; - break; - case 4: - mode = SImode; - break; - case 8: - mode = DImode; - break; - default: - gcc_unreachable (); - } - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) ;; For TARGET_64BIT we always round up to 8 bytes. @@ -2974,33 +2967,49 @@ (define_insn "*mmx_<code><mode>3" (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) -(define_insn "<code><mode>3" - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") +(define_expand "<code><mode>3" + [(parallel + [(set (match_operand:VI_16_32 0 "register_operand") + (any_logic:VI_16_32 + (match_operand:VI_16_32 1 "register_operand") + (match_operand:VI_16_32 2 "register_operand"))) + (clobber (reg:CC FLAGS_REG))])] + "") + +(define_insn "*<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v"))) (clobber (reg:CC FLAGS_REG))] - "" + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "#" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "type" "alu,sselog,sselog,sselog") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") + (set_attr "type" "alu,alu,sselog,sselog,sselog") + (set_attr "mode" "SI,SI,TI,TI,TI")]) (define_split - [(set (match_operand:VI_16_32 0 "general_reg_operand") + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "general_reg_operand") - (match_operand:VI_16_32 2 "general_reg_operand"))) + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand"))) (clobber (reg:CC FLAGS_REG))] "reload_completed" [(parallel [(set (match_dup 0) - (any_logic:SI (match_dup 1) (match_dup 2))) + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); + if (!register_operand (operands[2], <MODE>mode)) + { + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], + <MODE>mode); + operands[2] = GEN_INT (val); + } + else + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) (define_split diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index c71c453cceb..5f63a7d52f5 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" return trunc_int_for_mode (val, SImode) == val; }) +(define_predicate "register_or_x86_64_const_vector_operand" + (ior (match_operand 0 "register_operand") + (match_operand 0 "x86_64_const_vector_operand"))) + ;; Return true when OP is nonimmediate or standard SSE constant. (define_predicate "nonimmediate_or_sse_const_operand" (ior (match_operand 0 "nonimmediate_operand") diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c new file mode 100644 index 00000000000..bb52385c8a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" } } */ + +void +foo3 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 3; + a[3] &= 3; +} + +void +foo4 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} + + +void +foo1 (short* a, short* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} -- 2.18.1 ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-19 6:07 ` [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative liuhongt @ 2022-07-19 6:34 ` Uros Bizjak 2022-07-19 6:56 ` Hongtao Liu 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-19 6:34 UTC (permalink / raw) To: liuhongt; +Cc: gcc-patches On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > And split it after reload. > > > You will need ix86_binary_operator_ok insn constraint here with > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > prepare insn operands. > Split define_expand with just register_operand, and allow > memory/immediate in define_insn, assume combine/forwprop will do optimization. But you will *ease* the job of the above passes if you use ix86_fixup_binary_operands_no_copy in the expander. You can already use a nonimmediate operand for operands 0 and 1 in the expander (and register_or_x86_64_const_vector_operand as operand 2) and the fixup will try to optimize and adjust the pattern. So I really don't see what you gain with the proposed approach. Uros. > > > Please use if (!register_operand (operands[2], <MODE>mode)) instead. > Changed. > > Update patch. > > gcc/ChangeLog: > > PR target/106038 > * config/i386/mmx.md (<code><mode>3): New define_expand, it's > original "<code><mode>3". > (*<code><mode>3): New define_insn, it's original > "<code><mode>3" be extended to handle memory and immediate > operand with ix86_binary_operator_ok. Also adjust define_split > after it. > (mmxinsnmode): New mode attribute. > (*mov<mode>_imm): Refactor with mmxinsnmode. > * config/i386/predicates.md > (register_or_x86_64_const_vector_operand): New predicate. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr106038-1.c: New test. > --- > gcc/config/i386/mmx.md | 71 ++++++++++++---------- > gcc/config/i386/predicates.md | 4 ++ > gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++ > 3 files changed, 71 insertions(+), 31 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 3294c1e6274..316b83dd3ac 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize > [(V8QI "b") (V4QI "b") (V2QI "b") > (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) > > +;; Mapping to same size integral mode. > +(define_mode_attr mmxinsnmode > + [(V8QI "DI") (V4QI "SI") (V2QI "HI") > + (V4HI "DI") (V2HI "SI") > + (V2SI "DI") > + (V4HF "DI") (V2HF "SI") > + (V2SF "DI")]) > + > (define_mode_attr mmxdoublemode > [(V8QI "V8HI") (V4HI "V4SI")]) > > @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm" > HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], > <MODE>mode); > operands[1] = GEN_INT (val); > - machine_mode mode; > - switch (GET_MODE_SIZE (<MODE>mode)) > - { > - case 2: > - mode = HImode; > - break; > - case 4: > - mode = SImode; > - break; > - case 8: > - mode = DImode; > - break; > - default: > - gcc_unreachable (); > - } > - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > ;; For TARGET_64BIT we always round up to 8 bytes. > @@ -2974,33 +2967,49 @@ (define_insn "*mmx_<code><mode>3" > (set_attr "type" "mmxadd,sselog,sselog,sselog") > (set_attr "mode" "DI,TI,TI,TI")]) > > -(define_insn "<code><mode>3" > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > +(define_expand "<code><mode>3" > + [(parallel > + [(set (match_operand:VI_16_32 0 "register_operand") > + (any_logic:VI_16_32 > + (match_operand:VI_16_32 1 "register_operand") > + (match_operand:VI_16_32 2 "register_operand"))) > + (clobber (reg:CC FLAGS_REG))])] > + "") > + > +(define_insn "*<code><mode>3" > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") > (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") > + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v"))) > (clobber (reg:CC FLAGS_REG))] > - "" > + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > "#" > - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") > - (set_attr "type" "alu,sselog,sselog,sselog") > - (set_attr "mode" "SI,TI,TI,TI")]) > + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") > + (set_attr "type" "alu,alu,sselog,sselog,sselog") > + (set_attr "mode" "SI,SI,TI,TI,TI")]) > > (define_split > - [(set (match_operand:VI_16_32 0 "general_reg_operand") > + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") > (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "general_reg_operand") > - (match_operand:VI_16_32 2 "general_reg_operand"))) > + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") > + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand"))) > (clobber (reg:CC FLAGS_REG))] > "reload_completed" > [(parallel > [(set (match_dup 0) > - (any_logic:SI (match_dup 1) (match_dup 2))) > + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) > (clobber (reg:CC FLAGS_REG))])] > { > - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); > - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); > - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > + if (!register_operand (operands[2], <MODE>mode)) > + { > + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], > + <MODE>mode); > + operands[2] = GEN_INT (val); > + } > + else > + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); > + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > (define_split > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index c71c453cceb..5f63a7d52f5 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" > return trunc_int_for_mode (val, SImode) == val; > }) > > +(define_predicate "register_or_x86_64_const_vector_operand" > + (ior (match_operand 0 "register_operand") > + (match_operand 0 "x86_64_const_vector_operand"))) > + > ;; Return true when OP is nonimmediate or standard SSE constant. > (define_predicate "nonimmediate_or_sse_const_operand" > (ior (match_operand 0 "nonimmediate_operand") > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c > new file mode 100644 > index 00000000000..bb52385c8a5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse2 -O2" } */ > +/* { dg-final { scan-assembler-not "xmm" } } */ > + > +void > +foo3 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > + a[2] &= 3; > + a[3] &= 3; > +} > + > +void > +foo4 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > + > + > +void > +foo1 (short* a, short* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > -- > 2.18.1 > ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-19 6:34 ` Uros Bizjak @ 2022-07-19 6:56 ` Hongtao Liu 2022-07-19 9:37 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: Hongtao Liu @ 2022-07-19 6:56 UTC (permalink / raw) To: Uros Bizjak; +Cc: liuhongt, gcc-patches On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > And split it after reload. > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > prepare insn operands. > > Split define_expand with just register_operand, and allow > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > But you will *ease* the job of the above passes if you use > ix86_fixup_binary_operands_no_copy in the expander. for -m32, it will hit ICE in Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, mode=E_V4QImode, operands=0x7fffffffa970) a /gcc/config/i386/i386-expand.cc:1184 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); (gdb) n 1185 gcc_assert (dst == operands[0]); -- here (gdb) the original operands[0], operands[1], operands[2] are below (gdb) p debug_rtx (operands[0]) (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) $1 = void (gdb) p debug_rtx (operands[1]) (subreg:V4QI (reg:SI 129) 0) $2 = void (gdb) p debug_rtx (operands[2]) (subreg:V4QI (reg:SI 98 [ _46 ]) 0) $3 = void (gdb) since operands[0] is mem and not equal to operands[1], ix86_fixup_binary_operands will create a pseudo register for dst. and then hit ICE. Is this a bug or assumed? > > Uros. > > > > > > Please use if (!register_operand (operands[2], <MODE>mode)) instead. > > Changed. > > > > Update patch. > > > > gcc/ChangeLog: > > > > PR target/106038 > > * config/i386/mmx.md (<code><mode>3): New define_expand, it's > > original "<code><mode>3". > > (*<code><mode>3): New define_insn, it's original > > "<code><mode>3" be extended to handle memory and immediate > > operand with ix86_binary_operator_ok. Also adjust define_split > > after it. > > (mmxinsnmode): New mode attribute. > > (*mov<mode>_imm): Refactor with mmxinsnmode. > > * config/i386/predicates.md > > (register_or_x86_64_const_vector_operand): New predicate. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/i386/pr106038-1.c: New test. > > --- > > gcc/config/i386/mmx.md | 71 ++++++++++++---------- > > gcc/config/i386/predicates.md | 4 ++ > > gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++ > > 3 files changed, 71 insertions(+), 31 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c > > > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > > index 3294c1e6274..316b83dd3ac 100644 > > --- a/gcc/config/i386/mmx.md > > +++ b/gcc/config/i386/mmx.md > > @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize > > [(V8QI "b") (V4QI "b") (V2QI "b") > > (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) > > > > +;; Mapping to same size integral mode. > > +(define_mode_attr mmxinsnmode > > + [(V8QI "DI") (V4QI "SI") (V2QI "HI") > > + (V4HI "DI") (V2HI "SI") > > + (V2SI "DI") > > + (V4HF "DI") (V2HF "SI") > > + (V2SF "DI")]) > > + > > (define_mode_attr mmxdoublemode > > [(V8QI "V8HI") (V4HI "V4SI")]) > > > > @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm" > > HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], > > <MODE>mode); > > operands[1] = GEN_INT (val); > > - machine_mode mode; > > - switch (GET_MODE_SIZE (<MODE>mode)) > > - { > > - case 2: > > - mode = HImode; > > - break; > > - case 4: > > - mode = SImode; > > - break; > > - case 8: > > - mode = DImode; > > - break; > > - default: > > - gcc_unreachable (); > > - } > > - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); > > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > > }) > > > > ;; For TARGET_64BIT we always round up to 8 bytes. > > @@ -2974,33 +2967,49 @@ (define_insn "*mmx_<code><mode>3" > > (set_attr "type" "mmxadd,sselog,sselog,sselog") > > (set_attr "mode" "DI,TI,TI,TI")]) > > > > -(define_insn "<code><mode>3" > > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > > +(define_expand "<code><mode>3" > > + [(parallel > > + [(set (match_operand:VI_16_32 0 "register_operand") > > + (any_logic:VI_16_32 > > + (match_operand:VI_16_32 1 "register_operand") > > + (match_operand:VI_16_32 2 "register_operand"))) > > + (clobber (reg:CC FLAGS_REG))])] > > + "") > > + > > +(define_insn "*<code><mode>3" > > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") > > (any_logic:VI_16_32 > > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > > + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") > > + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v"))) > > (clobber (reg:CC FLAGS_REG))] > > - "" > > + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > > "#" > > - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") > > - (set_attr "type" "alu,sselog,sselog,sselog") > > - (set_attr "mode" "SI,TI,TI,TI")]) > > + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") > > + (set_attr "type" "alu,alu,sselog,sselog,sselog") > > + (set_attr "mode" "SI,SI,TI,TI,TI")]) > > > > (define_split > > - [(set (match_operand:VI_16_32 0 "general_reg_operand") > > + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") > > (any_logic:VI_16_32 > > - (match_operand:VI_16_32 1 "general_reg_operand") > > - (match_operand:VI_16_32 2 "general_reg_operand"))) > > + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") > > + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand"))) > > (clobber (reg:CC FLAGS_REG))] > > "reload_completed" > > [(parallel > > [(set (match_dup 0) > > - (any_logic:SI (match_dup 1) (match_dup 2))) > > + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) > > (clobber (reg:CC FLAGS_REG))])] > > { > > - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); > > - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); > > - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > > + if (!register_operand (operands[2], <MODE>mode)) > > + { > > + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], > > + <MODE>mode); > > + operands[2] = GEN_INT (val); > > + } > > + else > > + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); > > + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); > > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > > }) > > > > (define_split > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > > index c71c453cceb..5f63a7d52f5 100644 > > --- a/gcc/config/i386/predicates.md > > +++ b/gcc/config/i386/predicates.md > > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" > > return trunc_int_for_mode (val, SImode) == val; > > }) > > > > +(define_predicate "register_or_x86_64_const_vector_operand" > > + (ior (match_operand 0 "register_operand") > > + (match_operand 0 "x86_64_const_vector_operand"))) > > + > > ;; Return true when OP is nonimmediate or standard SSE constant. > > (define_predicate "nonimmediate_or_sse_const_operand" > > (ior (match_operand 0 "nonimmediate_operand") > > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c > > new file mode 100644 > > index 00000000000..bb52385c8a5 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c > > @@ -0,0 +1,27 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-msse2 -O2" } */ > > +/* { dg-final { scan-assembler-not "xmm" } } */ > > + > > +void > > +foo3 (char* a, char* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > + a[2] &= 3; > > + a[3] &= 3; > > +} > > + > > +void > > +foo4 (char* a, char* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > +} > > + > > + > > +void > > +foo1 (short* a, short* __restrict b) > > +{ > > + a[0] &= 1; > > + a[1] &= 2; > > +} > > -- > > 2.18.1 > > -- BR, Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-19 6:56 ` Hongtao Liu @ 2022-07-19 9:37 ` Uros Bizjak 2022-07-20 2:37 ` Hongtao Liu 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-19 9:37 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, gcc-patches On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > And split it after reload. > > > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > > prepare insn operands. > > > Split define_expand with just register_operand, and allow > > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > > > But you will *ease* the job of the above passes if you use > > ix86_fixup_binary_operands_no_copy in the expander. > for -m32, it will hit ICE in > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, > mode=E_V4QImode, operands=0x7fffffffa970) a > /gcc/config/i386/i386-expand.cc:1184 > 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); > (gdb) n > 1185 gcc_assert (dst == operands[0]); -- here > (gdb) > > the original operands[0], operands[1], operands[2] are below > (gdb) p debug_rtx (operands[0]) > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) > (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) > $1 = void > (gdb) p debug_rtx (operands[1]) > (subreg:V4QI (reg:SI 129) 0) > $2 = void > (gdb) p debug_rtx (operands[2]) > (subreg:V4QI (reg:SI 98 [ _46 ]) 0) > $3 = void > (gdb) > > since operands[0] is mem and not equal to operands[1], > ix86_fixup_binary_operands will create a pseudo register for dst. and > then hit ICE. > Is this a bug or assumed? You will need ix86_expand_binary_operator here. Uros. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-19 9:37 ` Uros Bizjak @ 2022-07-20 2:37 ` Hongtao Liu 2022-07-20 6:14 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: Hongtao Liu @ 2022-07-20 2:37 UTC (permalink / raw) To: Uros Bizjak; +Cc: liuhongt, gcc-patches On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > And split it after reload. > > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > > > prepare insn operands. > > > > Split define_expand with just register_operand, and allow > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > > > > > But you will *ease* the job of the above passes if you use > > > ix86_fixup_binary_operands_no_copy in the expander. > > for -m32, it will hit ICE in > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, > > mode=E_V4QImode, operands=0x7fffffffa970) a > > /gcc/config/i386/i386-expand.cc:1184 > > 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); > > (gdb) n > > 1185 gcc_assert (dst == operands[0]); -- here > > (gdb) > > > > the original operands[0], operands[1], operands[2] are below > > (gdb) p debug_rtx (operands[0]) > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) > > (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) > > $1 = void > > (gdb) p debug_rtx (operands[1]) > > (subreg:V4QI (reg:SI 129) 0) > > $2 = void > > (gdb) p debug_rtx (operands[2]) > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0) > > $3 = void > > (gdb) > > > > since operands[0] is mem and not equal to operands[1], > > ix86_fixup_binary_operands will create a pseudo register for dst. and > > then hit ICE. > > Is this a bug or assumed? > > You will need ix86_expand_binary_operator here. It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn. What about this? -(define_insn "<code><mode>3" - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") +(define_expand "<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) - (clobber (reg:CC FLAGS_REG))] + (match_operand:VI_16_32 1 "nonimmediate_operand") + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))] "" +{ + rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands); + if (MEM_P (operands[2])) + operands[2] = force_reg (<MODE>mode, operands[2]); + rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode, + operands[1], operands[2])); + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + DONE; +}) + > > Uros. -- BR, Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-20 2:37 ` Hongtao Liu @ 2022-07-20 6:14 ` Uros Bizjak 2022-07-20 6:18 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-20 6:14 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, gcc-patches On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > And split it after reload. > > > > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > > > > prepare insn operands. > > > > > Split define_expand with just register_operand, and allow > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > > > > > > > But you will *ease* the job of the above passes if you use > > > > ix86_fixup_binary_operands_no_copy in the expander. > > > for -m32, it will hit ICE in > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, > > > mode=E_V4QImode, operands=0x7fffffffa970) a > > > /gcc/config/i386/i386-expand.cc:1184 > > > 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); > > > (gdb) n > > > 1185 gcc_assert (dst == operands[0]); -- here > > > (gdb) > > > > > > the original operands[0], operands[1], operands[2] are below > > > (gdb) p debug_rtx (operands[0]) > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) > > > (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) > > > $1 = void > > > (gdb) p debug_rtx (operands[1]) > > > (subreg:V4QI (reg:SI 129) 0) > > > $2 = void > > > (gdb) p debug_rtx (operands[2]) > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0) > > > $3 = void > > > (gdb) > > > > > > since operands[0] is mem and not equal to operands[1], > > > ix86_fixup_binary_operands will create a pseudo register for dst. and > > > then hit ICE. > > > Is this a bug or assumed? > > > > You will need ix86_expand_binary_operator here. > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn. > > What about this? Still no good... You are using commutative operands, so the predicate of operand 2 should also allow memory. So, the predicate should be nonimmediate_or_x86_64_const_vector_operand. The intermediate insn pattern should look something like *<any_or:code><mode>_1, but with added XMM and MMX reg alternatives instead of mask regs. Uros. > > -(define_insn "<code><mode>3" > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > +(define_expand "<code><mode>3" > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") > (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > - (clobber (reg:CC FLAGS_REG))] > + (match_operand:VI_16_32 1 "nonimmediate_operand") > + (match_operand:VI_16_32 2 > "register_or_x86_64_const_vector_operand")))] > "" > +{ > + rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands); > + if (MEM_P (operands[2])) > + operands[2] = force_reg (<MODE>mode, operands[2]); > + rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode, > + operands[1], operands[2])); > + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); > + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); > + if (dst != operands[0]) > + emit_move_insn (operands[0], dst); > + DONE; > +}) > + > > > > > Uros. > > > > -- > BR, > Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-20 6:14 ` Uros Bizjak @ 2022-07-20 6:18 ` Uros Bizjak 2022-07-20 6:54 ` Hongtao Liu 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-20 6:18 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, gcc-patches On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches > > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > > > And split it after reload. > > > > > > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > > > > > prepare insn operands. > > > > > > Split define_expand with just register_operand, and allow > > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > > > > > > > > > But you will *ease* the job of the above passes if you use > > > > > ix86_fixup_binary_operands_no_copy in the expander. > > > > for -m32, it will hit ICE in > > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, > > > > mode=E_V4QImode, operands=0x7fffffffa970) a > > > > /gcc/config/i386/i386-expand.cc:1184 > > > > 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); > > > > (gdb) n > > > > 1185 gcc_assert (dst == operands[0]); -- here > > > > (gdb) > > > > > > > > the original operands[0], operands[1], operands[2] are below > > > > (gdb) p debug_rtx (operands[0]) > > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) > > > > (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) > > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) > > > > $1 = void > > > > (gdb) p debug_rtx (operands[1]) > > > > (subreg:V4QI (reg:SI 129) 0) > > > > $2 = void > > > > (gdb) p debug_rtx (operands[2]) > > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0) > > > > $3 = void > > > > (gdb) > > > > > > > > since operands[0] is mem and not equal to operands[1], > > > > ix86_fixup_binary_operands will create a pseudo register for dst. and > > > > then hit ICE. > > > > Is this a bug or assumed? > > > > > > You will need ix86_expand_binary_operator here. > > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn. > > > > What about this? > > Still no good... You are using commutative operands, so the predicate > of operand 2 should also allow memory. So, the predicate should be > nonimmediate_or_x86_64_const_vector_operand. The intermediate insn > pattern should look something like *<any_or:code><mode>_1, but with > added XMM and MMX reg alternatives instead of mask regs. Alternatively, you can use UNKNOWN operator to prevent canonicalization, but then you should not use commutative constraint in the intermediate insn. I think this is the best solution. Uros. > > > > -(define_insn "<code><mode>3" > > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > > +(define_expand "<code><mode>3" > > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") > > (any_logic:VI_16_32 > > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > > - (clobber (reg:CC FLAGS_REG))] > > + (match_operand:VI_16_32 1 "nonimmediate_operand") > > + (match_operand:VI_16_32 2 > > "register_or_x86_64_const_vector_operand")))] > > "" > > +{ > > + rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands); > > + if (MEM_P (operands[2])) > > + operands[2] = force_reg (<MODE>mode, operands[2]); > > + rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode, > > + operands[1], operands[2])); > > + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); > > + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); > > + if (dst != operands[0]) > > + emit_move_insn (operands[0], dst); > > + DONE; > > +}) > > + > > > > > > > > Uros. > > > > > > > > -- > > BR, > > Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-20 6:18 ` Uros Bizjak @ 2022-07-20 6:54 ` Hongtao Liu 2022-07-20 7:18 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: Hongtao Liu @ 2022-07-20 6:54 UTC (permalink / raw) To: Uros Bizjak; +Cc: liuhongt, gcc-patches On Wed, Jul 20, 2022 at 2:18 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches > > > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > > > > > And split it after reload. > > > > > > > > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > > > > > > prepare insn operands. > > > > > > > Split define_expand with just register_operand, and allow > > > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > > > > > > > > > > > But you will *ease* the job of the above passes if you use > > > > > > ix86_fixup_binary_operands_no_copy in the expander. > > > > > for -m32, it will hit ICE in > > > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, > > > > > mode=E_V4QImode, operands=0x7fffffffa970) a > > > > > /gcc/config/i386/i386-expand.cc:1184 > > > > > 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); > > > > > (gdb) n > > > > > 1185 gcc_assert (dst == operands[0]); -- here > > > > > (gdb) > > > > > > > > > > the original operands[0], operands[1], operands[2] are below > > > > > (gdb) p debug_rtx (operands[0]) > > > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) > > > > > (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) > > > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) > > > > > $1 = void > > > > > (gdb) p debug_rtx (operands[1]) > > > > > (subreg:V4QI (reg:SI 129) 0) > > > > > $2 = void > > > > > (gdb) p debug_rtx (operands[2]) > > > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0) > > > > > $3 = void > > > > > (gdb) > > > > > > > > > > since operands[0] is mem and not equal to operands[1], > > > > > ix86_fixup_binary_operands will create a pseudo register for dst. and > > > > > then hit ICE. > > > > > Is this a bug or assumed? > > > > > > > > You will need ix86_expand_binary_operator here. > > > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn. > > > > > > What about this? > > > > Still no good... You are using commutative operands, so the predicate > > of operand 2 should also allow memory. So, the predicate should be > > nonimmediate_or_x86_64_const_vector_operand. The intermediate insn > > pattern should look something like *<any_or:code><mode>_1, but with > > added XMM and MMX reg alternatives instead of mask regs. > > Alternatively, you can use UNKNOWN operator to prevent > canonicalization, but then you should not use commutative constraint > in the intermediate insn. I think this is the best solution. Like this? -(define_insn "<code><mode>3" - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") +(define_expand "<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) - (clobber (reg:CC FLAGS_REG))] + (match_operand:VI_16_32 1 "nonimmediate_operand") + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))] "" +{ + rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands); + if (MEM_P (operands[2])) + operands[2] = force_reg (<MODE>mode, operands[2]); + rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode, + operands[1], operands[2])); + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); + if (dst != operands[0]) + emit_move_insn (operands[0], dst); + DONE; +}) + +(define_insn "*<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") + (any_logic:VI_16_32 + (match_operand:VI_16_32 1 "nonimmediate_operand" "0,0,0,x,v") + (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (UNKNOWN, <MODE>mode, operands)" "#" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "type" "alu,sselog,sselog,sselog") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") + (set_attr "type" "alu,alu,sselog,sselog,sselog") + (set_attr "mode" "SI,SI,TI,TI,TI")]) > > Uros. > > > > > > > -(define_insn "<code><mode>3" > > > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > > > +(define_expand "<code><mode>3" > > > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") > > > (any_logic:VI_16_32 > > > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > > > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > > > - (clobber (reg:CC FLAGS_REG))] > > > + (match_operand:VI_16_32 1 "nonimmediate_operand") > > > + (match_operand:VI_16_32 2 > > > "register_or_x86_64_const_vector_operand")))] > > > "" > > > +{ > > > + rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands); > > > + if (MEM_P (operands[2])) > > > + operands[2] = force_reg (<MODE>mode, operands[2]); > > > + rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode, > > > + operands[1], operands[2])); > > > + rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); > > > + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); > > > + if (dst != operands[0]) > > > + emit_move_insn (operands[0], dst); > > > + DONE; > > > +}) > > > + > > > > > > > > > > > Uros. > > > > > > > > > > > > -- > > > BR, > > > Hongtao -- BR, Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-20 6:54 ` Hongtao Liu @ 2022-07-20 7:18 ` Uros Bizjak 2022-07-20 7:22 ` Hongtao Liu 0 siblings, 1 reply; 22+ messages in thread From: Uros Bizjak @ 2022-07-20 7:18 UTC (permalink / raw) To: Hongtao Liu; +Cc: liuhongt, gcc-patches [-- Attachment #1: Type: text/plain, Size: 3409 bytes --] On Wed, Jul 20, 2022 at 8:54 AM Hongtao Liu <crazylht@gmail.com> wrote: > > On Wed, Jul 20, 2022 at 2:18 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > > > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches > > > > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > > > > > > > And split it after reload. > > > > > > > > > > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > > > > > > > prepare insn operands. > > > > > > > > Split define_expand with just register_operand, and allow > > > > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > > > > > > > > > > > > > But you will *ease* the job of the above passes if you use > > > > > > > ix86_fixup_binary_operands_no_copy in the expander. > > > > > > for -m32, it will hit ICE in > > > > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, > > > > > > mode=E_V4QImode, operands=0x7fffffffa970) a > > > > > > /gcc/config/i386/i386-expand.cc:1184 > > > > > > 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); > > > > > > (gdb) n > > > > > > 1185 gcc_assert (dst == operands[0]); -- here > > > > > > (gdb) > > > > > > > > > > > > the original operands[0], operands[1], operands[2] are below > > > > > > (gdb) p debug_rtx (operands[0]) > > > > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) > > > > > > (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) > > > > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) > > > > > > $1 = void > > > > > > (gdb) p debug_rtx (operands[1]) > > > > > > (subreg:V4QI (reg:SI 129) 0) > > > > > > $2 = void > > > > > > (gdb) p debug_rtx (operands[2]) > > > > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0) > > > > > > $3 = void > > > > > > (gdb) > > > > > > > > > > > > since operands[0] is mem and not equal to operands[1], > > > > > > ix86_fixup_binary_operands will create a pseudo register for dst. and > > > > > > then hit ICE. > > > > > > Is this a bug or assumed? > > > > > > > > > > You will need ix86_expand_binary_operator here. > > > > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn. > > > > > > > > What about this? > > > > > > Still no good... You are using commutative operands, so the predicate > > > of operand 2 should also allow memory. So, the predicate should be > > > nonimmediate_or_x86_64_const_vector_operand. The intermediate insn > > > pattern should look something like *<any_or:code><mode>_1, but with > > > added XMM and MMX reg alternatives instead of mask regs. > > > > Alternatively, you can use UNKNOWN operator to prevent > > canonicalization, but then you should not use commutative constraint > > in the intermediate insn. I think this is the best solution. > Like this? Please check the attached (lightly tested) patch that keeps commutative operands. Uros. [-- Attachment #2: p.diff.txt --] [-- Type: text/plain, Size: 4499 bytes --] diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 3294c1e6274..0a39d396430 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -86,6 +86,14 @@ [(V8QI "b") (V4QI "b") (V2QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) +;; Mapping to same size integral mode. +(define_mode_attr mmxinsnmode + [(V8QI "DI") (V4QI "SI") (V2QI "HI") + (V4HI "DI") (V2HI "SI") + (V2SI "DI") + (V4HF "DI") (V2HF "SI") + (V2SF "DI")]) + (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -350,22 +358,7 @@ HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], <MODE>mode); operands[1] = GEN_INT (val); - machine_mode mode; - switch (GET_MODE_SIZE (<MODE>mode)) - { - case 2: - mode = HImode; - break; - case 4: - mode = SImode; - break; - case 8: - mode = DImode; - break; - default: - gcc_unreachable (); - } - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) ;; For TARGET_64BIT we always round up to 8 bytes. @@ -2974,33 +2967,50 @@ (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) -(define_insn "<code><mode>3" - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") +(define_expand "<code><mode>3" + [(parallel + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) - (clobber (reg:CC FLAGS_REG))] + (match_operand:VI_16_32 1 "nonimmediate_operand") + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand"))) + (clobber (reg:CC FLAGS_REG))])] "" + "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;") + +(define_insn "*<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") + (any_logic:VI_16_32 + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand" "r,i,x,x,v"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "#" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "type" "alu,sselog,sselog,sselog") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") + (set_attr "type" "alu,alu,sselog,sselog,sselog") + (set_attr "mode" "SI,SI,TI,TI,TI")]) (define_split - [(set (match_operand:VI_16_32 0 "general_reg_operand") + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "general_reg_operand") - (match_operand:VI_16_32 2 "general_reg_operand"))) + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") + (match_operand:VI_16_32 2 "reg_or_const_vector_operand"))) (clobber (reg:CC FLAGS_REG))] "reload_completed" [(parallel [(set (match_dup 0) - (any_logic:SI (match_dup 1) (match_dup 2))) + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); + if (!register_operand (operands[2], <MODE>mode)) + { + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], + <MODE>mode); + operands[2] = GEN_INT (val); + } + else + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) (define_split diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index 42053ea7209..064596d9594 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1209,6 +1209,10 @@ return trunc_int_for_mode (val, SImode) == val; }) +(define_predicate "nonimmediate_or_x86_64_const_vector_operand" + (ior (match_operand 0 "nonimmediate_operand") + (match_operand 0 "x86_64_const_vector_operand"))) + ;; Return true when OP is nonimmediate or standard SSE constant. (define_predicate "nonimmediate_or_sse_const_operand" (ior (match_operand 0 "nonimmediate_operand") ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-20 7:18 ` Uros Bizjak @ 2022-07-20 7:22 ` Hongtao Liu 2022-07-21 5:18 ` [PATCH V3] " liuhongt 0 siblings, 1 reply; 22+ messages in thread From: Hongtao Liu @ 2022-07-20 7:22 UTC (permalink / raw) To: Uros Bizjak; +Cc: liuhongt, gcc-patches On Wed, Jul 20, 2022 at 3:18 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Wed, Jul 20, 2022 at 8:54 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > On Wed, Jul 20, 2022 at 2:18 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > > > On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote: > > > > > > > > > > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote: > > > > > > > > > > > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches > > > > > > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > > > > > > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote: > > > > > > > > > > > > > > > > > > And split it after reload. > > > > > > > > > > > > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with > > > > > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to > > > > > > > > > > prepare insn operands. > > > > > > > > > Split define_expand with just register_operand, and allow > > > > > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization. > > > > > > > > > > > > > > > > But you will *ease* the job of the above passes if you use > > > > > > > > ix86_fixup_binary_operands_no_copy in the expander. > > > > > > > for -m32, it will hit ICE in > > > > > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR, > > > > > > > mode=E_V4QImode, operands=0x7fffffffa970) a > > > > > > > /gcc/config/i386/i386-expand.cc:1184 > > > > > > > 1184 rtx dst = ix86_fixup_binary_operands (code, mode, operands); > > > > > > > (gdb) n > > > > > > > 1185 gcc_assert (dst == operands[0]); -- here > > > > > > > (gdb) > > > > > > > > > > > > > > the original operands[0], operands[1], operands[2] are below > > > > > > > (gdb) p debug_rtx (operands[0]) > > > > > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars) > > > > > > > (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4) > > > > > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32]) > > > > > > > $1 = void > > > > > > > (gdb) p debug_rtx (operands[1]) > > > > > > > (subreg:V4QI (reg:SI 129) 0) > > > > > > > $2 = void > > > > > > > (gdb) p debug_rtx (operands[2]) > > > > > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0) > > > > > > > $3 = void > > > > > > > (gdb) > > > > > > > > > > > > > > since operands[0] is mem and not equal to operands[1], > > > > > > > ix86_fixup_binary_operands will create a pseudo register for dst. and > > > > > > > then hit ICE. > > > > > > > Is this a bug or assumed? > > > > > > > > > > > > You will need ix86_expand_binary_operator here. > > > > > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn. > > > > > > > > > > What about this? > > > > > > > > Still no good... You are using commutative operands, so the predicate > > > > of operand 2 should also allow memory. So, the predicate should be > > > > nonimmediate_or_x86_64_const_vector_operand. The intermediate insn > > > > pattern should look something like *<any_or:code><mode>_1, but with > > > > added XMM and MMX reg alternatives instead of mask regs. > > > > > > Alternatively, you can use UNKNOWN operator to prevent > > > canonicalization, but then you should not use commutative constraint > > > in the intermediate insn. I think this is the best solution. > > Like this? > > Please check the attached (lightly tested) patch that keeps > commutative operands. Yes, it looks best, I'll fully test the patch. > > Uros. -- BR, Hongtao ^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCH V3] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-20 7:22 ` Hongtao Liu @ 2022-07-21 5:18 ` liuhongt 2022-07-21 5:55 ` Uros Bizjak 0 siblings, 1 reply; 22+ messages in thread From: liuhongt @ 2022-07-21 5:18 UTC (permalink / raw) To: gcc-patches And split it after reload. gcc/ChangeLog: PR target/106038 * config/i386/mmx.md (<code><mode>3): New define_expand, it's original "<code><mode>3". (*<code><mode>3): New define_insn, it's original "<code><mode>3" be extended to handle memory and immediate operand with ix86_binary_operator_ok. Also adjust define_split after it. (mmxinsnmode): New mode attribute. (*mov<mode>_imm): Refactor with mmxinsnmode. * config/i386/predicates.md (register_or_x86_64_const_vector_operand): New predicate. gcc/testsuite/ChangeLog: * gcc.target/i386/pr106038-1.c: New test. --- gcc/config/i386/mmx.md | 70 ++++++++++++---------- gcc/config/i386/predicates.md | 4 ++ gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 +++++++++ 3 files changed, 70 insertions(+), 31 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 3294c1e6274..dda4b43f5c1 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize [(V8QI "b") (V4QI "b") (V2QI "b") (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) +;; Mapping to same size integral mode. +(define_mode_attr mmxinsnmode + [(V8QI "DI") (V4QI "SI") (V2QI "HI") + (V4HI "DI") (V2HI "SI") + (V2SI "DI") + (V4HF "DI") (V2HF "SI") + (V2SF "DI")]) + (define_mode_attr mmxdoublemode [(V8QI "V8HI") (V4HI "V4SI")]) @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm" HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], <MODE>mode); operands[1] = GEN_INT (val); - machine_mode mode; - switch (GET_MODE_SIZE (<MODE>mode)) - { - case 2: - mode = HImode; - break; - case 4: - mode = SImode; - break; - case 8: - mode = DImode; - break; - default: - gcc_unreachable (); - } - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) ;; For TARGET_64BIT we always round up to 8 bytes. @@ -2974,33 +2967,48 @@ (define_insn "*mmx_<code><mode>3" (set_attr "type" "mmxadd,sselog,sselog,sselog") (set_attr "mode" "DI,TI,TI,TI")]) -(define_insn "<code><mode>3" - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") +(define_expand "<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) - (clobber (reg:CC FLAGS_REG))] + (match_operand:VI_16_32 1 "nonimmediate_operand") + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand")))] "" + "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;") + +(define_insn "*<code><mode>3" + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") + (any_logic:VI_16_32 + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand" "r,i,x,x,v"))) + (clobber (reg:CC FLAGS_REG))] + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" "#" - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") - (set_attr "type" "alu,sselog,sselog,sselog") - (set_attr "mode" "SI,TI,TI,TI")]) + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") + (set_attr "type" "alu,alu,sselog,sselog,sselog") + (set_attr "mode" "SI,SI,TI,TI,TI")]) (define_split - [(set (match_operand:VI_16_32 0 "general_reg_operand") + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") (any_logic:VI_16_32 - (match_operand:VI_16_32 1 "general_reg_operand") - (match_operand:VI_16_32 2 "general_reg_operand"))) + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") + (match_operand:VI_16_32 2 "reg_or_const_vector_operand"))) (clobber (reg:CC FLAGS_REG))] "reload_completed" [(parallel [(set (match_dup 0) - (any_logic:SI (match_dup 1) (match_dup 2))) + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] { - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); + if (!register_operand (operands[2], <MODE>mode)) + { + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], + <MODE>mode); + operands[2] = GEN_INT (val); + } + else + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); }) (define_split diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index c71c453cceb..73dfd46bf90 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" return trunc_int_for_mode (val, SImode) == val; }) +(define_predicate "nonimmediate_or_x86_64_const_vector_operand" + (ior (match_operand 0 "nonimmediate_operand") + (match_operand 0 "x86_64_const_vector_operand"))) + ;; Return true when OP is nonimmediate or standard SSE constant. (define_predicate "nonimmediate_or_sse_const_operand" (ior (match_operand 0 "nonimmediate_operand") diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c new file mode 100644 index 00000000000..bb52385c8a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-msse2 -O2" } */ +/* { dg-final { scan-assembler-not "xmm" } } */ + +void +foo3 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; + a[2] &= 3; + a[3] &= 3; +} + +void +foo4 (char* a, char* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} + + +void +foo1 (short* a, short* __restrict b) +{ + a[0] &= 1; + a[1] &= 2; +} -- 2.18.1 ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCH V3] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative. 2022-07-21 5:18 ` [PATCH V3] " liuhongt @ 2022-07-21 5:55 ` Uros Bizjak 0 siblings, 0 replies; 22+ messages in thread From: Uros Bizjak @ 2022-07-21 5:55 UTC (permalink / raw) To: liuhongt; +Cc: gcc-patches On Thu, Jul 21, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote: > > And split it after reload. > > gcc/ChangeLog: > > PR target/106038 > * config/i386/mmx.md (<code><mode>3): New define_expand, it's > original "<code><mode>3". > (*<code><mode>3): New define_insn, it's original > "<code><mode>3" be extended to handle memory and immediate > operand with ix86_binary_operator_ok. Also adjust define_split > after it. > (mmxinsnmode): New mode attribute. > (*mov<mode>_imm): Refactor with mmxinsnmode. > * config/i386/predicates.md > (register_or_x86_64_const_vector_operand): New predicate. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr106038-1.c: New test. OK. Thanks, Uros. > --- > gcc/config/i386/mmx.md | 70 ++++++++++++---------- > gcc/config/i386/predicates.md | 4 ++ > gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 +++++++++ > 3 files changed, 70 insertions(+), 31 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c > > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md > index 3294c1e6274..dda4b43f5c1 100644 > --- a/gcc/config/i386/mmx.md > +++ b/gcc/config/i386/mmx.md > @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize > [(V8QI "b") (V4QI "b") (V2QI "b") > (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")]) > > +;; Mapping to same size integral mode. > +(define_mode_attr mmxinsnmode > + [(V8QI "DI") (V4QI "SI") (V2QI "HI") > + (V4HI "DI") (V2HI "SI") > + (V2SI "DI") > + (V4HF "DI") (V2HF "SI") > + (V2SF "DI")]) > + > (define_mode_attr mmxdoublemode > [(V8QI "V8HI") (V4HI "V4SI")]) > > @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm" > HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1], > <MODE>mode); > operands[1] = GEN_INT (val); > - machine_mode mode; > - switch (GET_MODE_SIZE (<MODE>mode)) > - { > - case 2: > - mode = HImode; > - break; > - case 4: > - mode = SImode; > - break; > - case 8: > - mode = DImode; > - break; > - default: > - gcc_unreachable (); > - } > - operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > ;; For TARGET_64BIT we always round up to 8 bytes. > @@ -2974,33 +2967,48 @@ (define_insn "*mmx_<code><mode>3" > (set_attr "type" "mmxadd,sselog,sselog,sselog") > (set_attr "mode" "DI,TI,TI,TI")]) > > -(define_insn "<code><mode>3" > - [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v") > +(define_expand "<code><mode>3" > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand") > (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v") > - (match_operand:VI_16_32 2 "register_operand" "r,x,x,v"))) > - (clobber (reg:CC FLAGS_REG))] > + (match_operand:VI_16_32 1 "nonimmediate_operand") > + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand")))] > "" > + "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;") > + > +(define_insn "*<code><mode>3" > + [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v") > + (any_logic:VI_16_32 > + (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v") > + (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand" "r,i,x,x,v"))) > + (clobber (reg:CC FLAGS_REG))] > + "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" > "#" > - [(set_attr "isa" "*,sse2_noavx,avx,avx512vl") > - (set_attr "type" "alu,sselog,sselog,sselog") > - (set_attr "mode" "SI,TI,TI,TI")]) > + [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl") > + (set_attr "type" "alu,alu,sselog,sselog,sselog") > + (set_attr "mode" "SI,SI,TI,TI,TI")]) > > (define_split > - [(set (match_operand:VI_16_32 0 "general_reg_operand") > + [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand") > (any_logic:VI_16_32 > - (match_operand:VI_16_32 1 "general_reg_operand") > - (match_operand:VI_16_32 2 "general_reg_operand"))) > + (match_operand:VI_16_32 1 "nonimmediate_gr_operand") > + (match_operand:VI_16_32 2 "reg_or_const_vector_operand"))) > (clobber (reg:CC FLAGS_REG))] > "reload_completed" > [(parallel > [(set (match_dup 0) > - (any_logic:SI (match_dup 1) (match_dup 2))) > + (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2))) > (clobber (reg:CC FLAGS_REG))])] > { > - operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode); > - operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode); > - operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode); > + if (!register_operand (operands[2], <MODE>mode)) > + { > + HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2], > + <MODE>mode); > + operands[2] = GEN_INT (val); > + } > + else > + operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode); > + operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode); > + operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode); > }) > > (define_split > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index c71c453cceb..73dfd46bf90 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand" > return trunc_int_for_mode (val, SImode) == val; > }) > > +(define_predicate "nonimmediate_or_x86_64_const_vector_operand" > + (ior (match_operand 0 "nonimmediate_operand") > + (match_operand 0 "x86_64_const_vector_operand"))) > + > ;; Return true when OP is nonimmediate or standard SSE constant. > (define_predicate "nonimmediate_or_sse_const_operand" > (ior (match_operand 0 "nonimmediate_operand") > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c > new file mode 100644 > index 00000000000..bb52385c8a5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-msse2 -O2" } */ > +/* { dg-final { scan-assembler-not "xmm" } } */ > + > +void > +foo3 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > + a[2] &= 3; > + a[3] &= 3; > +} > + > +void > +foo4 (char* a, char* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > + > + > +void > +foo1 (short* a, short* __restrict b) > +{ > + a[0] &= 1; > + a[1] &= 2; > +} > -- > 2.18.1 > ^ permalink raw reply [flat|nested] 22+ messages in thread
end of thread, other threads:[~2022-07-21 5:55 UTC | newest] Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2022-07-11 1:15 [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns liuhongt 2022-07-11 8:02 ` Uros Bizjak 2022-07-12 6:37 ` Hongtao Liu 2022-07-12 7:15 ` Uros Bizjak 2022-07-14 5:33 ` [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative liuhongt 2022-07-14 7:22 ` Uros Bizjak 2022-07-14 9:32 ` Hongtao Liu 2022-07-14 9:46 ` Uros Bizjak 2022-07-18 1:59 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative liuhongt 2022-07-18 6:28 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative Uros Bizjak 2022-07-19 6:07 ` [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative liuhongt 2022-07-19 6:34 ` Uros Bizjak 2022-07-19 6:56 ` Hongtao Liu 2022-07-19 9:37 ` Uros Bizjak 2022-07-20 2:37 ` Hongtao Liu 2022-07-20 6:14 ` Uros Bizjak 2022-07-20 6:18 ` Uros Bizjak 2022-07-20 6:54 ` Hongtao Liu 2022-07-20 7:18 ` Uros Bizjak 2022-07-20 7:22 ` Hongtao Liu 2022-07-21 5:18 ` [PATCH V3] " liuhongt 2022-07-21 5:55 ` Uros Bizjak
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).