public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns.
@ 2022-07-11  1:15 liuhongt
  2022-07-11  8:02 ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: liuhongt @ 2022-07-11  1:15 UTC (permalink / raw)
  To: gcc-patches

And split it to GPR-version instruction after reload.

This will enable below optimization for 16/32/64-bit vector bit_op

-       movd    (%rdi), %xmm0
-       movd    (%rsi), %xmm1
-       pand    %xmm1, %xmm0
-       movd    %xmm0, (%rdi)
+       movl    (%rsi), %eax
+       andl    %eax, (%rdi)

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/106038
	* config/i386/mmx.md (<code><mode>3): Expand
	with (clobber (reg:CC flags_reg)) under TARGET_64BIT
	(mmx_code><mode>3): Ditto.
	(*mmx_<code><mode>3_1): New define_insn, add post_reload
	splitter after it.
	(*<code><mode>3): New define_insn, also add post_reload
	splitter after it.
	(mmxinsnmode): New mode attribute.
	(VI_16_32_64): New mode iterator.
	(*mov<mode>_imm): Refactor with mmxinsnmode.
	* config/i386/predicates.md
	(nonimmediate_or_x86_64_vector_cst): New predicate.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106038-1.c: New test.
	* gcc.target/i386/pr106038-2.c: New test.
	* gcc.target/i386/pr106038-3.c: New test.
---
 gcc/config/i386/mmx.md                     | 131 +++++++++++++++------
 gcc/config/i386/predicates.md              |   4 +
 gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++++++
 gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
 5 files changed, 213 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..85b06abea27 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
     (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
     (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
 
+(define_mode_iterator VI_16_32_64
+   [V2QI V4QI V2HI
+    (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
+    (V2SI "TARGET_64BIT")])
+
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
@@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
    (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
 							    <MODE>mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (<MODE>mode))
-    {
-    case 2:
-      mode = HImode;
-      break;
-    case 4:
-      mode = SImode;
-      break;
-    case 8:
-      mode = DImode;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3"
 	  (match_operand:MMXMODEI 1 "register_mmxmem_operand")
 	  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
-  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+{
+  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+  if (TARGET_64BIT)
+  {
+    ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+    DONE;
+  }
+})
 
 (define_expand "<code><mode>3"
   [(set (match_operand:MMXMODEI 0 "register_operand")
 	(any_logic:MMXMODEI
 	  (match_operand:MMXMODEI 1 "register_operand")
 	  (match_operand:MMXMODEI 2 "register_operand")))]
-  "TARGET_MMX_WITH_SSE")
+  "TARGET_MMX_WITH_SSE"
+{
+  if (TARGET_64BIT)
+    {
+      ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+      DONE;
+    }
+})
 
 (define_insn "*mmx_<code><mode>3"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
@@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3"
    (set_attr "type" "mmxadd,sselog,sselog,sselog")
    (set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
-        (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-	  (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
+(define_insn "*mmx_<code><mode>3_1"
+  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r")
+        (any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0")
+	  (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m")))
+    (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && (TARGET_MMX || TARGET_SSE2)
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "#"
+  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64")
+   (set_attr "mmx_isa" "native,*,*,*,*,*")
+   (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu")
+   (set_attr "mode" "DI,TI,TI,TI,DI,DI")])
+
+(define_split
+  [(set (match_operand:MMXMODEI 0 "register_operand")
+        (any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "register_mmxmem_operand")
+	  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
    (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && (TARGET_MMX || TARGET_SSE2)
+   && reload_completed
+   && !general_reg_operand (operands[0], <MODE>mode)"
+  [(set (match_dup 0)
+	(any_logic:<MODE> (match_dup 1) (match_dup 2)))])
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "register_operand")
+        (any_logic:VI_16_32
+	  (match_operand:VI_16_32 1 "register_operand")
+	  (match_operand:VI_16_32 2 "register_operand")))]
   ""
+{
+  ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+  DONE;
+})
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v")
+        (any_logic:VI_16_32
+	  (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+	  (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
-        (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "general_reg_operand")
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+  [(set (match_operand:VI_16_32_64 0 "")
+        (any_logic:VI_16_32_64
+	  (match_operand:VI_16_32_64 1 "")
+	  (match_operand:VI_16_32_64 2 "")))
    (clobber (reg:CC FLAGS_REG))]
-  "reload_completed"
+  "reload_completed
+  && !sse_reg_operand (operands[1], <MODE>mode)
+  && !sse_reg_operand (operands[2], <MODE>mode)
+  && !sse_reg_operand (operands[0], <MODE>mode)"
   [(parallel
      [(set (match_dup 0)
-	   (any_logic:SI (match_dup 1) (match_dup 2)))
+	   (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  if (CONSTANT_P (operands[2]))
+    {
+      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+								<MODE>mode);
+      operands[2] = GEN_INT (val);
+     }
+   else
+    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index c71c453cceb..62280f58478 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
   return trunc_int_for_mode (val, SImode) == val;
 })
 
+(define_predicate "nonimmediate_or_x86_64_vector_cst"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_operand 0 "x86_64_const_vector_operand")))
+
 ;; Return true when OP is nonimmediate or standard SSE constant.
 (define_predicate "nonimmediate_or_sse_const_operand"
   (ior (match_operand 0 "nonimmediate_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
new file mode 100644
index 00000000000..ac5d1990682
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+}
+
+void
+foo1 (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo2 (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+  a[4] &= b[4];
+  a[5] &= b[5];
+  a[6] &= b[6];
+  a[7] &= b[7];
+}
+
+void
+foo3 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
+
+void
+foo5 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 2;
+  a[3] &= 3;
+  a[4] &= 4;
+  a[5] &= 5;
+  a[6] &= 6;
+  a[7] &= 7;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c
new file mode 100644
index 00000000000..dce8a536a95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo (short* a, short* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+}
+
+void
+foo1 (short* a, short* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo3 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c
new file mode 100644
index 00000000000..3c7bd978f36
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo1 (int* a, int* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo4 (int* a, int* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns.
  2022-07-11  1:15 [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns liuhongt
@ 2022-07-11  8:02 ` Uros Bizjak
  2022-07-12  6:37   ` Hongtao Liu
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-11  8:02 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches, H. J. Lu

On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> And split it to GPR-version instruction after reload.
>
> This will enable below optimization for 16/32/64-bit vector bit_op
>
> -       movd    (%rdi), %xmm0
> -       movd    (%rsi), %xmm1
> -       pand    %xmm1, %xmm0
> -       movd    %xmm0, (%rdi)
> +       movl    (%rsi), %eax
> +       andl    %eax, (%rdi)
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

The patch will create many interunit moves (xmm <-> gpr) for anything
but the most simple logic sequences, because operations with
memory/immediate will be forced into GPR registers, while reg/reg
operations will remain in XMM registers.

I tried to introduce GPR registers to MMX logic insns in the past and
observed the above behavior, but perhaps RA evolved in the mean time
to handle different register sets better (especially under register
pressure). However, I would advise to be careful with this
functionality.

Perhaps this problem should be attacked in stages. First, please
introduce GPR registers to MMX logic instructions (similar to how
VI_16_32 mode instructions are handled). After RA effects will be
analysed, only then memory/immediate handling should be added. Also,
please don't forget to handle ANDNOT insn - TARGET_BMI slightly
complicates this part, but this is also solved with VI_16_32 mode
instructions.

Uros.

>
> gcc/ChangeLog:
>
>         PR target/106038
>         * config/i386/mmx.md (<code><mode>3): Expand
>         with (clobber (reg:CC flags_reg)) under TARGET_64BIT
>         (mmx_code><mode>3): Ditto.
>         (*mmx_<code><mode>3_1): New define_insn, add post_reload
>         splitter after it.
>         (*<code><mode>3): New define_insn, also add post_reload
>         splitter after it.
>         (mmxinsnmode): New mode attribute.
>         (VI_16_32_64): New mode iterator.
>         (*mov<mode>_imm): Refactor with mmxinsnmode.
>         * config/i386/predicates.md
>         (nonimmediate_or_x86_64_vector_cst): New predicate.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr106038-1.c: New test.
>         * gcc.target/i386/pr106038-2.c: New test.
>         * gcc.target/i386/pr106038-3.c: New test.
> ---
>  gcc/config/i386/mmx.md                     | 131 +++++++++++++++------
>  gcc/config/i386/predicates.md              |   4 +
>  gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++++++++++
>  gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++++++
>  gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
>  5 files changed, 213 insertions(+), 35 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 3294c1e6274..85b06abea27 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
>      (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
>      (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
>
> +(define_mode_iterator VI_16_32_64
> +   [V2QI V4QI V2HI
> +    (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
> +    (V2SI "TARGET_64BIT")])
> +
>  ;; V2S* modes
>  (define_mode_iterator V2FI [V2SF V2SI])
>
> @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
>    [(V8QI "b") (V4QI "b") (V2QI "b")
>     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
>
> +;; Mapping to same size integral mode.
> +(define_mode_attr mmxinsnmode
> +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> +   (V4HI "DI") (V2HI "SI")
> +   (V2SI "DI")
> +   (V4HF "DI") (V2HF "SI")
> +   (V2SF "DI")])
> +
>  (define_mode_attr mmxdoublemode
>    [(V8QI "V8HI") (V4HI "V4SI")])
>
> @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm"
>    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
>                                                             <MODE>mode);
>    operands[1] = GEN_INT (val);
> -  machine_mode mode;
> -  switch (GET_MODE_SIZE (<MODE>mode))
> -    {
> -    case 2:
> -      mode = HImode;
> -      break;
> -    case 4:
> -      mode = SImode;
> -      break;
> -    case 8:
> -      mode = DImode;
> -      break;
> -    default:
> -      gcc_unreachable ();
> -    }
> -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  ;; For TARGET_64BIT we always round up to 8 bytes.
> @@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3"
>           (match_operand:MMXMODEI 1 "register_mmxmem_operand")
>           (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
>    "TARGET_MMX || TARGET_MMX_WITH_SSE"
> -  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
> +{
> +  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
> +  if (TARGET_64BIT)
> +  {
> +    ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> +    DONE;
> +  }
> +})
>
>  (define_expand "<code><mode>3"
>    [(set (match_operand:MMXMODEI 0 "register_operand")
>         (any_logic:MMXMODEI
>           (match_operand:MMXMODEI 1 "register_operand")
>           (match_operand:MMXMODEI 2 "register_operand")))]
> -  "TARGET_MMX_WITH_SSE")
> +  "TARGET_MMX_WITH_SSE"
> +{
> +  if (TARGET_64BIT)
> +    {
> +      ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> +      DONE;
> +    }
> +})
>
>  (define_insn "*mmx_<code><mode>3"
>    [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
> @@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3"
>     (set_attr "type" "mmxadd,sselog,sselog,sselog")
>     (set_attr "mode" "DI,TI,TI,TI")])
>
> -(define_insn "<code><mode>3"
> -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> -        (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> +(define_insn "*mmx_<code><mode>3_1"
> +  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r")
> +        (any_logic:MMXMODEI
> +         (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0")
> +         (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m")))
> +    (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_64BIT
> +   && (TARGET_MMX || TARGET_SSE2)
> +   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> +  "#"
> +  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64")
> +   (set_attr "mmx_isa" "native,*,*,*,*,*")
> +   (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu")
> +   (set_attr "mode" "DI,TI,TI,TI,DI,DI")])
> +
> +(define_split
> +  [(set (match_operand:MMXMODEI 0 "register_operand")
> +        (any_logic:MMXMODEI
> +         (match_operand:MMXMODEI 1 "register_mmxmem_operand")
> +         (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
>     (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_64BIT
> +   && (TARGET_MMX || TARGET_SSE2)
> +   && reload_completed
> +   && !general_reg_operand (operands[0], <MODE>mode)"
> +  [(set (match_dup 0)
> +       (any_logic:<MODE> (match_dup 1) (match_dup 2)))])
> +
> +(define_expand "<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "register_operand")
> +        (any_logic:VI_16_32
> +         (match_operand:VI_16_32 1 "register_operand")
> +         (match_operand:VI_16_32 2 "register_operand")))]
>    ""
> +{
> +  ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> +  DONE;
> +})
> +
> +(define_insn "*<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v")
> +        (any_logic:VI_16_32
> +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> +         (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>    "#"
> -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> -   (set_attr "type" "alu,sselog,sselog,sselog")
> -   (set_attr "mode" "SI,TI,TI,TI")])
> +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> +   (set_attr "mode" "SI,SI,TI,TI,TI")])
>
>  (define_split
> -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> -        (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "general_reg_operand")
> -         (match_operand:VI_16_32 2 "general_reg_operand")))
> +  [(set (match_operand:VI_16_32_64 0 "")
> +        (any_logic:VI_16_32_64
> +         (match_operand:VI_16_32_64 1 "")
> +         (match_operand:VI_16_32_64 2 "")))
>     (clobber (reg:CC FLAGS_REG))]
> -  "reload_completed"
> +  "reload_completed
> +  && !sse_reg_operand (operands[1], <MODE>mode)
> +  && !sse_reg_operand (operands[2], <MODE>mode)
> +  && !sse_reg_operand (operands[0], <MODE>mode)"
>    [(parallel
>       [(set (match_dup 0)
> -          (any_logic:SI (match_dup 1) (match_dup 2)))
> +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>  {
> -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +  if (CONSTANT_P (operands[2]))
> +    {
> +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> +                                                               <MODE>mode);
> +      operands[2] = GEN_INT (val);
> +     }
> +   else
> +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  (define_split
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index c71c453cceb..62280f58478 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
>    return trunc_int_for_mode (val, SImode) == val;
>  })
>
> +(define_predicate "nonimmediate_or_x86_64_vector_cst"
> +  (ior (match_operand 0 "nonimmediate_operand")
> +       (match_operand 0 "x86_64_const_vector_operand")))
> +
>  ;; Return true when OP is nonimmediate or standard SSE constant.
>  (define_predicate "nonimmediate_or_sse_const_operand"
>    (ior (match_operand 0 "nonimmediate_operand")
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> new file mode 100644
> index 00000000000..ac5d1990682
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo (char* a, char* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +  a[2] &= b[2];
> +  a[3] &= b[3];
> +}
> +
> +void
> +foo1 (char* a, char* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +}
> +
> +void
> +foo2 (char* a, char* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +  a[2] &= b[2];
> +  a[3] &= b[3];
> +  a[4] &= b[4];
> +  a[5] &= b[5];
> +  a[6] &= b[6];
> +  a[7] &= b[7];
> +}
> +
> +void
> +foo3 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 3;
> +  a[3] &= 3;
> +}
> +
> +void
> +foo4 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> +
> +void
> +foo5 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 2;
> +  a[3] &= 3;
> +  a[4] &= 4;
> +  a[5] &= 5;
> +  a[6] &= 6;
> +  a[7] &= 7;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> new file mode 100644
> index 00000000000..dce8a536a95
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> @@ -0,0 +1,35 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo (short* a, short* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +  a[2] &= b[2];
> +  a[3] &= b[3];
> +}
> +
> +void
> +foo1 (short* a, short* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +}
> +
> +void
> +foo3 (short* a, short* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 3;
> +  a[3] &= 3;
> +}
> +
> +void
> +foo4 (short* a, short* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> new file mode 100644
> index 00000000000..3c7bd978f36
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo1 (int* a, int* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +}
> +
> +void
> +foo4 (int* a, int* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns.
  2022-07-11  8:02 ` Uros Bizjak
@ 2022-07-12  6:37   ` Hongtao Liu
  2022-07-12  7:15     ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: Hongtao Liu @ 2022-07-12  6:37 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: liuhongt, gcc-patches

On Mon, Jul 11, 2022 at 4:03 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > And split it to GPR-version instruction after reload.
> >
> > This will enable below optimization for 16/32/64-bit vector bit_op
> >
> > -       movd    (%rdi), %xmm0
> > -       movd    (%rsi), %xmm1
> > -       pand    %xmm1, %xmm0
> > -       movd    %xmm0, (%rdi)
> > +       movl    (%rsi), %eax
> > +       andl    %eax, (%rdi)
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> The patch will create many interunit moves (xmm <-> gpr) for anything
> but the most simple logic sequences, because operations with
> memory/immediate will be forced into GPR registers, while reg/reg
> operations will remain in XMM registers.
Agree not to deal with mem/immediate at first.
>
> I tried to introduce GPR registers to MMX logic insns in the past and
> observed the above behavior, but perhaps RA evolved in the mean time
> to handle different register sets better (especially under register
> pressure). However, I would advise to be careful with this
> functionality.
>
> Perhaps this problem should be attacked in stages. First, please
> introduce GPR registers to MMX logic instructions (similar to how
> VI_16_32 mode instructions are handled). After RA effects will be
There's "?r" in VI_16_32 logic instructions which prevent RA allocate
gpr for testcase in the patch.
Is it ok to remove "?" for them(Also add alternative "r" instead of
"?r" in mmx logic insns)?
If there's other instructions that prefer "v to "r", then RA will
allocate "v", but for logic instructions, "r" and “v" should be
treated equally, just as in the 16/32/64-bit vector
mov<mode>_internal.
> analysed, only then memory/immediate handling should be added. Also,
> please don't forget to handle ANDNOT insn - TARGET_BMI slightly
> complicates this part, but this is also solved with VI_16_32 mode
> instructions.
>
> Uros.
>
> >
> > gcc/ChangeLog:
> >
> >         PR target/106038
> >         * config/i386/mmx.md (<code><mode>3): Expand
> >         with (clobber (reg:CC flags_reg)) under TARGET_64BIT
> >         (mmx_code><mode>3): Ditto.
> >         (*mmx_<code><mode>3_1): New define_insn, add post_reload
> >         splitter after it.
> >         (*<code><mode>3): New define_insn, also add post_reload
> >         splitter after it.
> >         (mmxinsnmode): New mode attribute.
> >         (VI_16_32_64): New mode iterator.
> >         (*mov<mode>_imm): Refactor with mmxinsnmode.
> >         * config/i386/predicates.md
> >         (nonimmediate_or_x86_64_vector_cst): New predicate.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr106038-1.c: New test.
> >         * gcc.target/i386/pr106038-2.c: New test.
> >         * gcc.target/i386/pr106038-3.c: New test.
> > ---
> >  gcc/config/i386/mmx.md                     | 131 +++++++++++++++------
> >  gcc/config/i386/predicates.md              |   4 +
> >  gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++++++
> >  gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
> >  5 files changed, 213 insertions(+), 35 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c
> >
> > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > index 3294c1e6274..85b06abea27 100644
> > --- a/gcc/config/i386/mmx.md
> > +++ b/gcc/config/i386/mmx.md
> > @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
> >      (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
> >      (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
> >
> > +(define_mode_iterator VI_16_32_64
> > +   [V2QI V4QI V2HI
> > +    (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
> > +    (V2SI "TARGET_64BIT")])
> > +
> >  ;; V2S* modes
> >  (define_mode_iterator V2FI [V2SF V2SI])
> >
> > @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
> >    [(V8QI "b") (V4QI "b") (V2QI "b")
> >     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
> >
> > +;; Mapping to same size integral mode.
> > +(define_mode_attr mmxinsnmode
> > +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> > +   (V4HI "DI") (V2HI "SI")
> > +   (V2SI "DI")
> > +   (V4HF "DI") (V2HF "SI")
> > +   (V2SF "DI")])
> > +
> >  (define_mode_attr mmxdoublemode
> >    [(V8QI "V8HI") (V4HI "V4SI")])
> >
> > @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm"
> >    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
> >                                                             <MODE>mode);
> >    operands[1] = GEN_INT (val);
> > -  machine_mode mode;
> > -  switch (GET_MODE_SIZE (<MODE>mode))
> > -    {
> > -    case 2:
> > -      mode = HImode;
> > -      break;
> > -    case 4:
> > -      mode = SImode;
> > -      break;
> > -    case 8:
> > -      mode = DImode;
> > -      break;
> > -    default:
> > -      gcc_unreachable ();
> > -    }
> > -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> > +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
> >  })
> >
> >  ;; For TARGET_64BIT we always round up to 8 bytes.
> > @@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3"
> >           (match_operand:MMXMODEI 1 "register_mmxmem_operand")
> >           (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
> >    "TARGET_MMX || TARGET_MMX_WITH_SSE"
> > -  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
> > +{
> > +  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
> > +  if (TARGET_64BIT)
> > +  {
> > +    ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> > +    DONE;
> > +  }
> > +})
> >
> >  (define_expand "<code><mode>3"
> >    [(set (match_operand:MMXMODEI 0 "register_operand")
> >         (any_logic:MMXMODEI
> >           (match_operand:MMXMODEI 1 "register_operand")
> >           (match_operand:MMXMODEI 2 "register_operand")))]
> > -  "TARGET_MMX_WITH_SSE")
> > +  "TARGET_MMX_WITH_SSE"
> > +{
> > +  if (TARGET_64BIT)
> > +    {
> > +      ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> > +      DONE;
> > +    }
> > +})
> >
> >  (define_insn "*mmx_<code><mode>3"
> >    [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
> > @@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3"
> >     (set_attr "type" "mmxadd,sselog,sselog,sselog")
> >     (set_attr "mode" "DI,TI,TI,TI")])
> >
> > -(define_insn "<code><mode>3"
> > -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> > -        (any_logic:VI_16_32
> > -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> > -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> > +(define_insn "*mmx_<code><mode>3_1"
> > +  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r")
> > +        (any_logic:MMXMODEI
> > +         (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0")
> > +         (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m")))
> > +    (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_64BIT
> > +   && (TARGET_MMX || TARGET_SSE2)
> > +   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> > +  "#"
> > +  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64")
> > +   (set_attr "mmx_isa" "native,*,*,*,*,*")
> > +   (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu")
> > +   (set_attr "mode" "DI,TI,TI,TI,DI,DI")])
> > +
> > +(define_split
> > +  [(set (match_operand:MMXMODEI 0 "register_operand")
> > +        (any_logic:MMXMODEI
> > +         (match_operand:MMXMODEI 1 "register_mmxmem_operand")
> > +         (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
> >     (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_64BIT
> > +   && (TARGET_MMX || TARGET_SSE2)
> > +   && reload_completed
> > +   && !general_reg_operand (operands[0], <MODE>mode)"
> > +  [(set (match_dup 0)
> > +       (any_logic:<MODE> (match_dup 1) (match_dup 2)))])
> > +
> > +(define_expand "<code><mode>3"
> > +  [(set (match_operand:VI_16_32 0 "register_operand")
> > +        (any_logic:VI_16_32
> > +         (match_operand:VI_16_32 1 "register_operand")
> > +         (match_operand:VI_16_32 2 "register_operand")))]
> >    ""
> > +{
> > +  ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> > +  DONE;
> > +})
> > +
> > +(define_insn "*<code><mode>3"
> > +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v")
> > +        (any_logic:VI_16_32
> > +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> > +         (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> >    "#"
> > -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> > -   (set_attr "type" "alu,sselog,sselog,sselog")
> > -   (set_attr "mode" "SI,TI,TI,TI")])
> > +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> > +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> > +   (set_attr "mode" "SI,SI,TI,TI,TI")])
> >
> >  (define_split
> > -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> > -        (any_logic:VI_16_32
> > -         (match_operand:VI_16_32 1 "general_reg_operand")
> > -         (match_operand:VI_16_32 2 "general_reg_operand")))
> > +  [(set (match_operand:VI_16_32_64 0 "")
> > +        (any_logic:VI_16_32_64
> > +         (match_operand:VI_16_32_64 1 "")
> > +         (match_operand:VI_16_32_64 2 "")))
> >     (clobber (reg:CC FLAGS_REG))]
> > -  "reload_completed"
> > +  "reload_completed
> > +  && !sse_reg_operand (operands[1], <MODE>mode)
> > +  && !sse_reg_operand (operands[2], <MODE>mode)
> > +  && !sse_reg_operand (operands[0], <MODE>mode)"
> >    [(parallel
> >       [(set (match_dup 0)
> > -          (any_logic:SI (match_dup 1) (match_dup 2)))
> > +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
> >        (clobber (reg:CC FLAGS_REG))])]
> >  {
> > -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> > -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> > -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> > +  if (CONSTANT_P (operands[2]))
> > +    {
> > +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> > +                                                               <MODE>mode);
> > +      operands[2] = GEN_INT (val);
> > +     }
> > +   else
> > +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> > +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> > +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
> >  })
> >
> >  (define_split
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index c71c453cceb..62280f58478 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
> >    return trunc_int_for_mode (val, SImode) == val;
> >  })
> >
> > +(define_predicate "nonimmediate_or_x86_64_vector_cst"
> > +  (ior (match_operand 0 "nonimmediate_operand")
> > +       (match_operand 0 "x86_64_const_vector_operand")))
> > +
> >  ;; Return true when OP is nonimmediate or standard SSE constant.
> >  (define_predicate "nonimmediate_or_sse_const_operand"
> >    (ior (match_operand 0 "nonimmediate_operand")
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> > new file mode 100644
> > index 00000000000..ac5d1990682
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> > @@ -0,0 +1,61 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -O2" } */
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > +
> > +void
> > +foo (char* a, char* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +  a[2] &= b[2];
> > +  a[3] &= b[3];
> > +}
> > +
> > +void
> > +foo1 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +}
> > +
> > +void
> > +foo2 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +  a[2] &= b[2];
> > +  a[3] &= b[3];
> > +  a[4] &= b[4];
> > +  a[5] &= b[5];
> > +  a[6] &= b[6];
> > +  a[7] &= b[7];
> > +}
> > +
> > +void
> > +foo3 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +  a[2] &= 3;
> > +  a[3] &= 3;
> > +}
> > +
> > +void
> > +foo4 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > +
> > +void
> > +foo5 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +  a[2] &= 2;
> > +  a[3] &= 3;
> > +  a[4] &= 4;
> > +  a[5] &= 5;
> > +  a[6] &= 6;
> > +  a[7] &= 7;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> > new file mode 100644
> > index 00000000000..dce8a536a95
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> > @@ -0,0 +1,35 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -O2" } */
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > +
> > +void
> > +foo (short* a, short* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +  a[2] &= b[2];
> > +  a[3] &= b[3];
> > +}
> > +
> > +void
> > +foo1 (short* a, short* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +}
> > +
> > +void
> > +foo3 (short* a, short* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +  a[2] &= 3;
> > +  a[3] &= 3;
> > +}
> > +
> > +void
> > +foo4 (short* a, short* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> > new file mode 100644
> > index 00000000000..3c7bd978f36
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -O2" } */
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > +
> > +void
> > +foo1 (int* a, int* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +}
> > +
> > +void
> > +foo4 (int* a, int* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > --
> > 2.18.1
> >



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns.
  2022-07-12  6:37   ` Hongtao Liu
@ 2022-07-12  7:15     ` Uros Bizjak
  2022-07-14  5:33       ` [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative liuhongt
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-12  7:15 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches

On Tue, Jul 12, 2022 at 8:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 4:03 PM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > And split it to GPR-version instruction after reload.
> > >
> > > This will enable below optimization for 16/32/64-bit vector bit_op
> > >
> > > -       movd    (%rdi), %xmm0
> > > -       movd    (%rsi), %xmm1
> > > -       pand    %xmm1, %xmm0
> > > -       movd    %xmm0, (%rdi)
> > > +       movl    (%rsi), %eax
> > > +       andl    %eax, (%rdi)
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> >
> > The patch will create many interunit moves (xmm <-> gpr) for anything
> > but the most simple logic sequences, because operations with
> > memory/immediate will be forced into GPR registers, while reg/reg
> > operations will remain in XMM registers.
> Agree not to deal with mem/immediate at first.
> >
> > I tried to introduce GPR registers to MMX logic insns in the past and
> > observed the above behavior, but perhaps RA evolved in the mean time
> > to handle different register sets better (especially under register
> > pressure). However, I would advise to be careful with this
> > functionality.
> >
> > Perhaps this problem should be attacked in stages. First, please
> > introduce GPR registers to MMX logic instructions (similar to how
> > VI_16_32 mode instructions are handled). After RA effects will be
> There's "?r" in VI_16_32 logic instructions which prevent RA allocate
> gpr for testcase in the patch.
> Is it ok to remove "?" for them(Also add alternative "r" instead of
> "?r" in mmx logic insns)?
> If there's other instructions that prefer "v to "r", then RA will
> allocate "v", but for logic instructions, "r" and “v" should be
> treated equally, just as in the 16/32/64-bit vector
> mov<mode>_internal.

?r was introduced under the assumption that we want vector values
mostly in vector registers. Currently there are no instructions with
memory or immediate operand, so that made sense at the time. Let's
keep ?r until logic instructions with mem/imm operands are introduced.
So, for the patch that adds 64-bit vector logic in GPR, I would advise
to first introduce only register operands. mem/imm operands should be
added in a follow-up patch when the "?r" constraint will also be
relaxed.

Uros.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative
  2022-07-12  7:15     ` Uros Bizjak
@ 2022-07-14  5:33       ` liuhongt
  2022-07-14  7:22         ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: liuhongt @ 2022-07-14  5:33 UTC (permalink / raw)
  To: gcc-patches

And split it to GPR-version instruction after reload.

> ?r was introduced under the assumption that we want vector values
> mostly in vector registers. Currently there are no instructions with
> memory or immediate operand, so that made sense at the time. Let's
> keep ?r until logic instructions with mem/imm operands are introduced.
> So, for the patch that adds 64-bit vector logic in GPR, I would advise
> to first introduce only register operands. mem/imm operands should be
Update patch to add ?r to 64-bit bit_op patterns.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big imact on SPEC2017(Most same binary).

Ok for trunk?

gcc/ChangeLog:

	PR target/106038
	* config/i386/mmx.md (<code><mode>3): Expand
	with (clobber (reg:CC flags_reg)) under TARGET_64BIT
	(mmx_code><mode>3): Ditto.
	(*mmx_<code><mode>3_gpr): New define_insn, add post_reload
	splitter after it.
	(mmx_andnot<mode>3_gpr): Ditto.
	(<code><mode>3): Extend follow define_split from VI_16_32 to
	VI_16_32_64.
	(*andnot<mode>3): Ditto.
	(mmxinsnmode): New mode attribute.
	(VI_16_32_64): New mode iterator.
	(*mov<mode>_imm): Refactor with mmxinsnmode.
	* config/i386/predicates.md

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106038-1.c: New test.
	* gcc.target/i386/pr106038-2.c: New test.
	* gcc.target/i386/pr106038-3.c: New test.
---
 gcc/config/i386/mmx.md                     | 131 +++++++++++++++------
 gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++++++
 gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
 4 files changed, 210 insertions(+), 34 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..5f7e40bd7a1 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
     (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
     (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
 
+(define_mode_iterator VI_16_32_64
+   [V2QI V4QI V2HI
+    (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
+    (V2SI "TARGET_64BIT")])
+
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
@@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
    (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
 							    <MODE>mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (<MODE>mode))
-    {
-    case 2:
-      mode = HImode;
-      break;
-    case 4:
-      mode = SImode;
-      break;
-    case 8:
-      mode = DImode;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2878,6 +2876,31 @@ (define_insn "mmx_andnot<mode>3"
    (set_attr "type" "mmxadd,sselog,sselog,sselog")
    (set_attr "mode" "DI,TI,TI,TI")])
 
+(define_insn "mmx_andnot<mode>3_gpr"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=?r,y,x,x,v")
+	(and:MMXMODEI
+	  (not:MMXMODEI (match_operand:MMXMODEI 1 "register_operand" "r,0,0,x,v"))
+	  (match_operand:MMXMODEI 2 "register_mmxmem_operand" "r,ym,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && (TARGET_MMX || TARGET_SSE2)"
+  "#"
+  [(set_attr "isa" "bmi,*,sse2_noavx,avx,avx512vl")
+   (set_attr "mmx_isa" "*,native,*,*,*")
+   (set_attr "type" "alu,mmxadd,sselog,sselog,sselog")
+   (set_attr "mode" "DI,DI,TI,TI,TI")])
+
+(define_split
+  [(set (match_operand:MMXMODEI 0 "register_operand")
+	(and:MMXMODEI
+	  (not:MMXMODEI (match_operand:MMXMODEI 1 "register_mmxmem_operand"))
+	  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed
+   && (TARGET_MMX || TARGET_MMX_WITH_SSE)
+   && !GENERAL_REGNO_P (REGNO (operands[0]))"
+  [(set (match_dup 0)
+	(and:<MODE> (not:<MODE> (match_dup 1)) (match_dup 2)))])
+
 (define_insn "*andnot<mode>3"
   [(set (match_operand:VI_16_32 0 "register_operand" "=?&r,?r,x,x,v")
         (and:VI_16_32
@@ -2892,20 +2915,20 @@ (define_insn "*andnot<mode>3"
    (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
-        (and:VI_16_32
-	  (not:VI_16_32 (match_operand:VI_16_32 1 "general_reg_operand"))
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+  [(set (match_operand:VI_16_32_64 0 "general_reg_operand")
+        (and:VI_16_32_64
+	  (not:VI_16_32_64 (match_operand:VI_16_32_64 1 "general_reg_operand"))
+	  (match_operand:VI_16_32_64 2 "general_reg_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI && reload_completed"
   [(parallel
      [(set (match_dup 0)
-	   (and:SI (not:SI (match_dup 1)) (match_dup 2)))
+	   (and:<mmxinsnmode> (not:<mmxinsnmode> (match_dup 1)) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
@@ -2948,14 +2971,28 @@ (define_expand "mmx_<code><mode>3"
 	  (match_operand:MMXMODEI 1 "register_mmxmem_operand")
 	  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
-  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+{
+  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+  if (TARGET_64BIT)
+  {
+    ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+    DONE;
+  }
+})
 
 (define_expand "<code><mode>3"
   [(set (match_operand:MMXMODEI 0 "register_operand")
 	(any_logic:MMXMODEI
 	  (match_operand:MMXMODEI 1 "register_operand")
 	  (match_operand:MMXMODEI 2 "register_operand")))]
-  "TARGET_MMX_WITH_SSE")
+  "TARGET_MMX_WITH_SSE"
+{
+  if (TARGET_64BIT)
+    {
+      ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+      DONE;
+    }
+})
 
 (define_insn "*mmx_<code><mode>3"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
@@ -2974,6 +3011,32 @@ (define_insn "*mmx_<code><mode>3"
    (set_attr "type" "mmxadd,sselog,sselog,sselog")
    (set_attr "mode" "DI,TI,TI,TI")])
 
+(define_insn "*mmx_<code><mode>3_gpr"
+  [(set (match_operand:MMXMODEI 0 "register_operand" "=?r,y,x,x,v")
+        (any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "register_mmxmem_operand" "%0,0,0,x,v")
+	  (match_operand:MMXMODEI 2 "register_mmxmem_operand" "r,ym,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT && (TARGET_MMX || TARGET_SSE2)
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "#"
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "mmx_isa" "*,native,*,*,*")
+   (set_attr "type" "alu,mmxadd,sselog,sselog,sselog")
+   (set_attr "mode" "DI,DI,TI,TI,TI")])
+
+(define_split
+  [(set (match_operand:MMXMODEI 0 "register_operand")
+	(any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "register_mmxmem_operand")
+	  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
+   (clobber (reg:CC FLAGS_REG))]
+  "reload_completed && (TARGET_MMX || TARGET_MMX_WITH_SSE)
+   && !GENERAL_REGNO_P (REGNO (operands[0]))"
+ [(set (match_dup 0)
+       (any_logic:<MODE> (match_dup 1)
+			 (match_dup 2)))])
+
 (define_insn "<code><mode>3"
   [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
         (any_logic:VI_16_32
@@ -2987,20 +3050,20 @@ (define_insn "<code><mode>3"
    (set_attr "mode" "SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
-        (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "general_reg_operand")
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+  [(set (match_operand:VI_16_32_64 0 "general_reg_operand")
+        (any_logic:VI_16_32_64
+	  (match_operand:VI_16_32_64 1 "general_reg_operand")
+	  (match_operand:VI_16_32_64 2 "general_reg_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
      [(set (match_dup 0)
-	   (any_logic:SI (match_dup 1) (match_dup 2)))
+	   (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
new file mode 100644
index 00000000000..c026329c843
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" { xfail *-*-* } } } */
+
+void
+foo (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+}
+
+void
+foo1 (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo2 (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+  a[4] &= b[4];
+  a[5] &= b[5];
+  a[6] &= b[6];
+  a[7] &= b[7];
+}
+
+void
+foo3 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
+
+void
+foo5 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 2;
+  a[3] &= 3;
+  a[4] &= 4;
+  a[5] &= 5;
+  a[6] &= 6;
+  a[7] &= 7;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c
new file mode 100644
index 00000000000..87d2070784f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" { xfail *-*-* } } } */
+
+void
+foo (short* a, short* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+}
+
+void
+foo1 (short* a, short* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo3 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c
new file mode 100644
index 00000000000..91f7112395f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -mtune=generic" } */
+/* { dg-final { scan-assembler-not "xmm" { xfail { ! ia32 } } } } */
+
+void
+foo1 (int* a, int* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo4 (int* a, int* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative
  2022-07-14  5:33       ` [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative liuhongt
@ 2022-07-14  7:22         ` Uros Bizjak
  2022-07-14  9:32           ` Hongtao Liu
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-14  7:22 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches

On Thu, Jul 14, 2022 at 7:33 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> And split it to GPR-version instruction after reload.
>
> > ?r was introduced under the assumption that we want vector values
> > mostly in vector registers. Currently there are no instructions with
> > memory or immediate operand, so that made sense at the time. Let's
> > keep ?r until logic instructions with mem/imm operands are introduced.
> > So, for the patch that adds 64-bit vector logic in GPR, I would advise
> > to first introduce only register operands. mem/imm operands should be
> Update patch to add ?r to 64-bit bit_op patterns.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> No big imact on SPEC2017(Most same binary).

The problem with your approach is with the combine pass, where combine
first tries to recognize the combined instruction without clobber,
before re-recognizing instruction with added clobber. So, if a forward
propagation happens, the combine will *always* choose the insn variant
without GPR.

So, the solution with VI_16_32 is to always expand with a clobbered
version that is split to either SImode or V16QImode. With 64-bit
instructions, we have two additional complications. First, we have a
native MMX instruction, and we have to split to it after reload, and
second, we have a builtin that expects vector insn.

To solve the first issue, we should change the mode of
"*mmx<code><mode>" to V1DImode and split your new _gpr version with
clobber to it for !GENERAL_REG_P operands.

The second issue could be solved by emitting V1DImode instructions
directly from the expander. Please note there are several expanders
that expect non-clobbered logic insn in certain mode to be available,
so the situation can become quite annoying...

Uros.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative
  2022-07-14  7:22         ` Uros Bizjak
@ 2022-07-14  9:32           ` Hongtao Liu
  2022-07-14  9:46             ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: Hongtao Liu @ 2022-07-14  9:32 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: liuhongt, gcc-patches

On Thu, Jul 14, 2022 at 3:22 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Jul 14, 2022 at 7:33 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > And split it to GPR-version instruction after reload.
> >
> > > ?r was introduced under the assumption that we want vector values
> > > mostly in vector registers. Currently there are no instructions with
> > > memory or immediate operand, so that made sense at the time. Let's
> > > keep ?r until logic instructions with mem/imm operands are introduced.
> > > So, for the patch that adds 64-bit vector logic in GPR, I would advise
> > > to first introduce only register operands. mem/imm operands should be
> > Update patch to add ?r to 64-bit bit_op patterns.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > No big imact on SPEC2017(Most same binary).
>
> The problem with your approach is with the combine pass, where combine
> first tries to recognize the combined instruction without clobber,
> before re-recognizing instruction with added clobber. So, if a forward
> propagation happens, the combine will *always* choose the insn variant
> without GPR.
Thank you for the explanation, I really did not know this point.
>
> So, the solution with VI_16_32 is to always expand with a clobbered
> version that is split to either SImode or V16QImode. With 64-bit
> instructions, we have two additional complications. First, we have a
> native MMX instruction, and we have to split to it after reload, and
> second, we have a builtin that expects vector insn.
>
> To solve the first issue, we should change the mode of
> "*mmx<code><mode>" to V1DImode and split your new _gpr version with
> clobber to it for !GENERAL_REG_P operands.
>
> The second issue could be solved by emitting V1DImode instructions
> directly from the expander. Please note there are several expanders
> that expect non-clobbered logic insn in certain mode to be available,
> so the situation can become quite annoying...
Yes. It looks like it would add a lot of code complexity, I'll hold
the patch for now.
>
> Uros.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative
  2022-07-14  9:32           ` Hongtao Liu
@ 2022-07-14  9:46             ` Uros Bizjak
  2022-07-18  1:59               ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative liuhongt
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-14  9:46 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches

On Thu, Jul 14, 2022 at 11:32 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Thu, Jul 14, 2022 at 3:22 PM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Thu, Jul 14, 2022 at 7:33 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > And split it to GPR-version instruction after reload.
> > >
> > > > ?r was introduced under the assumption that we want vector values
> > > > mostly in vector registers. Currently there are no instructions with
> > > > memory or immediate operand, so that made sense at the time. Let's
> > > > keep ?r until logic instructions with mem/imm operands are introduced.
> > > > So, for the patch that adds 64-bit vector logic in GPR, I would advise
> > > > to first introduce only register operands. mem/imm operands should be
> > > Update patch to add ?r to 64-bit bit_op patterns.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > No big imact on SPEC2017(Most same binary).
> >
> > The problem with your approach is with the combine pass, where combine
> > first tries to recognize the combined instruction without clobber,
> > before re-recognizing instruction with added clobber. So, if a forward
> > propagation happens, the combine will *always* choose the insn variant
> > without GPR.
> Thank you for the explanation, I really did not know this point.
> >
> > So, the solution with VI_16_32 is to always expand with a clobbered
> > version that is split to either SImode or V16QImode. With 64-bit
> > instructions, we have two additional complications. First, we have a
> > native MMX instruction, and we have to split to it after reload, and
> > second, we have a builtin that expects vector insn.
> >
> > To solve the first issue, we should change the mode of
> > "*mmx<code><mode>" to V1DImode and split your new _gpr version with
> > clobber to it for !GENERAL_REG_P operands.
> >
> > The second issue could be solved by emitting V1DImode instructions
> > directly from the expander. Please note there are several expanders
> > that expect non-clobbered logic insn in certain mode to be available,
> > so the situation can become quite annoying...
> Yes. It looks like it would add a lot of code complexity, I'll hold
> the patch for now.

I did some experimenting in the past with the idea of adding GPR
instructions to 64-bit vectors. While there were some opportunities
with 32- and 16-bit operations, mostly due to the fact that these
arguments are passed via integer registers, 64-bit cases never
triggered, because 64-bit vectors are passed via XMM registers. Also,
when mem/imm alternatives were added, many inter-regunit moves were
generated for everything but the most simple testcases involving logic
operations, also considering the limited range of 64-bit immediates.
IMO, the only case it is worth adding is a direct immediate store to
memory, which HJ recently added.

Uros.


Uros.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative.
  2022-07-14  9:46             ` Uros Bizjak
@ 2022-07-18  1:59               ` liuhongt
  2022-07-18  6:28                 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: liuhongt @ 2022-07-18  1:59 UTC (permalink / raw)
  To: gcc-patches

And split it after reload.

>IMO, the only case it is worth adding is a direct immediate store to
>memory, which HJ recently added.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/106038
	* config/i386/mmx.md (<code><mode>3): Extend to AND mem,imm,
	and adjust below define_split.
	(mmxinsnmode): New mode attribute.
	(*mov<mode>_imm): Refactor with mmxinsnmode.
	* config/i386/predicates.md
	(register_or_x86_64_const_vector_operand): New predicate.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106038-1.c: New test.
---
 gcc/config/i386/mmx.md                     | 58 +++++++++++-----------
 gcc/config/i386/predicates.md              |  4 ++
 gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++++
 3 files changed, 60 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..fbcb34d4395 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
    (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
 							    <MODE>mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (<MODE>mode))
-    {
-    case 2:
-      mode = HImode;
-      break;
-    case 4:
-      mode = SImode;
-      break;
-    case 8:
-      mode = DImode;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2975,32 +2968,39 @@ (define_insn "*mmx_<code><mode>3"
    (set_attr "mode" "DI,TI,TI,TI")])
 
 (define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-	  (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
+	  (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+	  (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v")))
    (clobber (reg:CC FLAGS_REG))]
   ""
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "general_reg_operand")
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+	  (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
+	  (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
      [(set (match_dup 0)
-	   (any_logic:SI (match_dup 1) (match_dup 2)))
+	   (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  if (GET_CODE (operands[2]) == CONST_VECTOR)
+    {
+      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+								<MODE>mode);
+      operands[2] = GEN_INT (val);
+    }
+  else
+    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index c71c453cceb..5f63a7d52f5 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
   return trunc_int_for_mode (val, SImode) == val;
 })
 
+(define_predicate "register_or_x86_64_const_vector_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "x86_64_const_vector_operand")))
+
 ;; Return true when OP is nonimmediate or standard SSE constant.
 (define_predicate "nonimmediate_or_sse_const_operand"
   (ior (match_operand 0 "nonimmediate_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
new file mode 100644
index 00000000000..bb52385c8a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo3 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
+
+
+void
+foo1 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative.
  2022-07-18  1:59               ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative liuhongt
@ 2022-07-18  6:28                 ` Uros Bizjak
  2022-07-19  6:07                   ` [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative liuhongt
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-18  6:28 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches

On Mon, Jul 18, 2022 at 3:59 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> And split it after reload.
>
> >IMO, the only case it is worth adding is a direct immediate store to
> >memory, which HJ recently added.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR target/106038
>         * config/i386/mmx.md (<code><mode>3): Extend to AND mem,imm,
>         and adjust below define_split.
>         (mmxinsnmode): New mode attribute.
>         (*mov<mode>_imm): Refactor with mmxinsnmode.
>         * config/i386/predicates.md
>         (register_or_x86_64_const_vector_operand): New predicate.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr106038-1.c: New test.
> ---
>  gcc/config/i386/mmx.md                     | 58 +++++++++++-----------
>  gcc/config/i386/predicates.md              |  4 ++
>  gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++++
>  3 files changed, 60 insertions(+), 29 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 3294c1e6274..fbcb34d4395 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
>    [(V8QI "b") (V4QI "b") (V2QI "b")
>     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
>
> +;; Mapping to same size integral mode.
> +(define_mode_attr mmxinsnmode
> +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> +   (V4HI "DI") (V2HI "SI")
> +   (V2SI "DI")
> +   (V4HF "DI") (V2HF "SI")
> +   (V2SF "DI")])
> +
>  (define_mode_attr mmxdoublemode
>    [(V8QI "V8HI") (V4HI "V4SI")])
>
> @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm"
>    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
>                                                             <MODE>mode);
>    operands[1] = GEN_INT (val);
> -  machine_mode mode;
> -  switch (GET_MODE_SIZE (<MODE>mode))
> -    {
> -    case 2:
> -      mode = HImode;
> -      break;
> -    case 4:
> -      mode = SImode;
> -      break;
> -    case 8:
> -      mode = DImode;
> -      break;
> -    default:
> -      gcc_unreachable ();
> -    }
> -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  ;; For TARGET_64BIT we always round up to 8 bytes.
> @@ -2975,32 +2968,39 @@ (define_insn "*mmx_<code><mode>3"
>     (set_attr "mode" "DI,TI,TI,TI")])
>
>  (define_insn "<code><mode>3"
> -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
>          (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> +         (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v")))
>     (clobber (reg:CC FLAGS_REG))]
>    ""

You will need ix86_binary_operator_ok insn constraint here with
corresponding expander using ix86_fixup_binary_operands_no_copy to
prepare insn operands.

>    "#"
> -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> -   (set_attr "type" "alu,sselog,sselog,sselog")
> -   (set_attr "mode" "SI,TI,TI,TI")])
> +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> +   (set_attr "mode" "SI,SI,TI,TI,TI")])
>
>  (define_split
> -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
>          (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "general_reg_operand")
> -         (match_operand:VI_16_32 2 "general_reg_operand")))
> +         (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
> +         (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))
>     (clobber (reg:CC FLAGS_REG))]
>    "reload_completed"
>    [(parallel
>       [(set (match_dup 0)
> -          (any_logic:SI (match_dup 1) (match_dup 2)))
> +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>  {
> -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +  if (GET_CODE (operands[2]) == CONST_VECTOR)

Please use if (!register_operand (operands[2], <MODE>mode)) instead.

Uros.

> +    {
> +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> +                                                               <MODE>mode);
> +      operands[2] = GEN_INT (val);
> +    }
> +  else
> +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  (define_split
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index c71c453cceb..5f63a7d52f5 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
>    return trunc_int_for_mode (val, SImode) == val;
>  })
>
> +(define_predicate "register_or_x86_64_const_vector_operand"
> +  (ior (match_operand 0 "register_operand")
> +       (match_operand 0 "x86_64_const_vector_operand")))
> +
>  ;; Return true when OP is nonimmediate or standard SSE constant.
>  (define_predicate "nonimmediate_or_sse_const_operand"
>    (ior (match_operand 0 "nonimmediate_operand")
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> new file mode 100644
> index 00000000000..bb52385c8a5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo3 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 3;
> +  a[3] &= 3;
> +}
> +
> +void
> +foo4 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> +
> +
> +void
> +foo1 (short* a, short* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-18  6:28                 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative Uros Bizjak
@ 2022-07-19  6:07                   ` liuhongt
  2022-07-19  6:34                     ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: liuhongt @ 2022-07-19  6:07 UTC (permalink / raw)
  To: gcc-patches

And split it after reload.

> You will need ix86_binary_operator_ok insn constraint here with
> corresponding expander using ix86_fixup_binary_operands_no_copy to
> prepare insn operands.
Split define_expand with just register_operand, and allow
memory/immediate in define_insn, assume combine/forwprop will do optimization.

> Please use if (!register_operand (operands[2], <MODE>mode)) instead.
Changed.

Update patch.

gcc/ChangeLog:

	PR target/106038
	* config/i386/mmx.md (<code><mode>3): New define_expand, it's
	original "<code><mode>3".
	(*<code><mode>3): New define_insn, it's original
	"<code><mode>3" be extended to handle memory and immediate
	operand with ix86_binary_operator_ok. Also adjust define_split
	after it.
	(mmxinsnmode): New mode attribute.
	(*mov<mode>_imm): Refactor with mmxinsnmode.
	* config/i386/predicates.md
	(register_or_x86_64_const_vector_operand): New predicate.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106038-1.c: New test.
---
 gcc/config/i386/mmx.md                     | 71 ++++++++++++----------
 gcc/config/i386/predicates.md              |  4 ++
 gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++
 3 files changed, 71 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..316b83dd3ac 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
    (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
 							    <MODE>mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (<MODE>mode))
-    {
-    case 2:
-      mode = HImode;
-      break;
-    case 4:
-      mode = SImode;
-      break;
-    case 8:
-      mode = DImode;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2974,33 +2967,49 @@ (define_insn "*mmx_<code><mode>3"
    (set_attr "type" "mmxadd,sselog,sselog,sselog")
    (set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+(define_expand "<code><mode>3"
+  [(parallel
+    [(set (match_operand:VI_16_32 0 "register_operand")
+        (any_logic:VI_16_32
+	  (match_operand:VI_16_32 1 "register_operand")
+	  (match_operand:VI_16_32 2 "register_operand")))
+   (clobber (reg:CC FLAGS_REG))])]
+  "")
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-	  (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
+	  (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+	  (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v")))
    (clobber (reg:CC FLAGS_REG))]
-  ""
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "general_reg_operand")
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+	  (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
+	  (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
      [(set (match_dup 0)
-	   (any_logic:SI (match_dup 1) (match_dup 2)))
+	   (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  if (!register_operand (operands[2], <MODE>mode))
+    {
+      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+								<MODE>mode);
+      operands[2] = GEN_INT (val);
+    }
+  else
+    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index c71c453cceb..5f63a7d52f5 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
   return trunc_int_for_mode (val, SImode) == val;
 })
 
+(define_predicate "register_or_x86_64_const_vector_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "x86_64_const_vector_operand")))
+
 ;; Return true when OP is nonimmediate or standard SSE constant.
 (define_predicate "nonimmediate_or_sse_const_operand"
   (ior (match_operand 0 "nonimmediate_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
new file mode 100644
index 00000000000..bb52385c8a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo3 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
+
+
+void
+foo1 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-19  6:07                   ` [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative liuhongt
@ 2022-07-19  6:34                     ` Uros Bizjak
  2022-07-19  6:56                       ` Hongtao Liu
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-19  6:34 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches

On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> And split it after reload.
>
> > You will need ix86_binary_operator_ok insn constraint here with
> > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > prepare insn operands.
> Split define_expand with just register_operand, and allow
> memory/immediate in define_insn, assume combine/forwprop will do optimization.

But you will *ease* the job of the above passes if you use
ix86_fixup_binary_operands_no_copy in the expander. You can already
use a nonimmediate operand for operands 0 and 1 in the expander (and
register_or_x86_64_const_vector_operand as operand 2) and the fixup
will try to optimize and adjust the pattern. So I really don't see
what you gain with the proposed approach.

Uros.

>
> > Please use if (!register_operand (operands[2], <MODE>mode)) instead.
> Changed.
>
> Update patch.
>
> gcc/ChangeLog:
>
>         PR target/106038
>         * config/i386/mmx.md (<code><mode>3): New define_expand, it's
>         original "<code><mode>3".
>         (*<code><mode>3): New define_insn, it's original
>         "<code><mode>3" be extended to handle memory and immediate
>         operand with ix86_binary_operator_ok. Also adjust define_split
>         after it.
>         (mmxinsnmode): New mode attribute.
>         (*mov<mode>_imm): Refactor with mmxinsnmode.
>         * config/i386/predicates.md
>         (register_or_x86_64_const_vector_operand): New predicate.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr106038-1.c: New test.
> ---
>  gcc/config/i386/mmx.md                     | 71 ++++++++++++----------
>  gcc/config/i386/predicates.md              |  4 ++
>  gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++
>  3 files changed, 71 insertions(+), 31 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 3294c1e6274..316b83dd3ac 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
>    [(V8QI "b") (V4QI "b") (V2QI "b")
>     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
>
> +;; Mapping to same size integral mode.
> +(define_mode_attr mmxinsnmode
> +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> +   (V4HI "DI") (V2HI "SI")
> +   (V2SI "DI")
> +   (V4HF "DI") (V2HF "SI")
> +   (V2SF "DI")])
> +
>  (define_mode_attr mmxdoublemode
>    [(V8QI "V8HI") (V4HI "V4SI")])
>
> @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm"
>    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
>                                                             <MODE>mode);
>    operands[1] = GEN_INT (val);
> -  machine_mode mode;
> -  switch (GET_MODE_SIZE (<MODE>mode))
> -    {
> -    case 2:
> -      mode = HImode;
> -      break;
> -    case 4:
> -      mode = SImode;
> -      break;
> -    case 8:
> -      mode = DImode;
> -      break;
> -    default:
> -      gcc_unreachable ();
> -    }
> -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  ;; For TARGET_64BIT we always round up to 8 bytes.
> @@ -2974,33 +2967,49 @@ (define_insn "*mmx_<code><mode>3"
>     (set_attr "type" "mmxadd,sselog,sselog,sselog")
>     (set_attr "mode" "DI,TI,TI,TI")])
>
> -(define_insn "<code><mode>3"
> -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> +(define_expand "<code><mode>3"
> +  [(parallel
> +    [(set (match_operand:VI_16_32 0 "register_operand")
> +        (any_logic:VI_16_32
> +         (match_operand:VI_16_32 1 "register_operand")
> +         (match_operand:VI_16_32 2 "register_operand")))
> +   (clobber (reg:CC FLAGS_REG))])]
> +  "")
> +
> +(define_insn "*<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
>          (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> +         (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v")))
>     (clobber (reg:CC FLAGS_REG))]
> -  ""
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>    "#"
> -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> -   (set_attr "type" "alu,sselog,sselog,sselog")
> -   (set_attr "mode" "SI,TI,TI,TI")])
> +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> +   (set_attr "mode" "SI,SI,TI,TI,TI")])
>
>  (define_split
> -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
>          (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "general_reg_operand")
> -         (match_operand:VI_16_32 2 "general_reg_operand")))
> +         (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
> +         (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))
>     (clobber (reg:CC FLAGS_REG))]
>    "reload_completed"
>    [(parallel
>       [(set (match_dup 0)
> -          (any_logic:SI (match_dup 1) (match_dup 2)))
> +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>  {
> -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +  if (!register_operand (operands[2], <MODE>mode))
> +    {
> +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> +                                                               <MODE>mode);
> +      operands[2] = GEN_INT (val);
> +    }
> +  else
> +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  (define_split
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index c71c453cceb..5f63a7d52f5 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
>    return trunc_int_for_mode (val, SImode) == val;
>  })
>
> +(define_predicate "register_or_x86_64_const_vector_operand"
> +  (ior (match_operand 0 "register_operand")
> +       (match_operand 0 "x86_64_const_vector_operand")))
> +
>  ;; Return true when OP is nonimmediate or standard SSE constant.
>  (define_predicate "nonimmediate_or_sse_const_operand"
>    (ior (match_operand 0 "nonimmediate_operand")
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> new file mode 100644
> index 00000000000..bb52385c8a5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo3 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 3;
> +  a[3] &= 3;
> +}
> +
> +void
> +foo4 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> +
> +
> +void
> +foo1 (short* a, short* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-19  6:34                     ` Uros Bizjak
@ 2022-07-19  6:56                       ` Hongtao Liu
  2022-07-19  9:37                         ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: Hongtao Liu @ 2022-07-19  6:56 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: liuhongt, gcc-patches

On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > And split it after reload.
> >
> > > You will need ix86_binary_operator_ok insn constraint here with
> > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > prepare insn operands.
> > Split define_expand with just register_operand, and allow
> > memory/immediate in define_insn, assume combine/forwprop will do optimization.
>
> But you will *ease* the job of the above passes if you use
> ix86_fixup_binary_operands_no_copy in the expander.
for -m32, it will hit ICE in
Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
mode=E_V4QImode, operands=0x7fffffffa970) a
/gcc/config/i386/i386-expand.cc:1184
1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
(gdb) n
1185      gcc_assert (dst == operands[0]); -- here
(gdb)

the original operands[0], operands[1], operands[2] are below
(gdb) p debug_rtx (operands[0])
(mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
        (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
$1 = void
(gdb) p debug_rtx (operands[1])
(subreg:V4QI (reg:SI 129) 0)
$2 = void
(gdb) p debug_rtx (operands[2])
(subreg:V4QI (reg:SI 98 [ _46 ]) 0)
$3 = void
(gdb)

since operands[0] is mem and not equal to operands[1],
ix86_fixup_binary_operands will create a pseudo register for dst. and
then hit ICE.
Is this a bug or assumed?

>
> Uros.
>
> >
> > > Please use if (!register_operand (operands[2], <MODE>mode)) instead.
> > Changed.
> >
> > Update patch.
> >
> > gcc/ChangeLog:
> >
> >         PR target/106038
> >         * config/i386/mmx.md (<code><mode>3): New define_expand, it's
> >         original "<code><mode>3".
> >         (*<code><mode>3): New define_insn, it's original
> >         "<code><mode>3" be extended to handle memory and immediate
> >         operand with ix86_binary_operator_ok. Also adjust define_split
> >         after it.
> >         (mmxinsnmode): New mode attribute.
> >         (*mov<mode>_imm): Refactor with mmxinsnmode.
> >         * config/i386/predicates.md
> >         (register_or_x86_64_const_vector_operand): New predicate.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr106038-1.c: New test.
> > ---
> >  gcc/config/i386/mmx.md                     | 71 ++++++++++++----------
> >  gcc/config/i386/predicates.md              |  4 ++
> >  gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 ++++++++
> >  3 files changed, 71 insertions(+), 31 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
> >
> > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > index 3294c1e6274..316b83dd3ac 100644
> > --- a/gcc/config/i386/mmx.md
> > +++ b/gcc/config/i386/mmx.md
> > @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
> >    [(V8QI "b") (V4QI "b") (V2QI "b")
> >     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
> >
> > +;; Mapping to same size integral mode.
> > +(define_mode_attr mmxinsnmode
> > +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> > +   (V4HI "DI") (V2HI "SI")
> > +   (V2SI "DI")
> > +   (V4HF "DI") (V2HF "SI")
> > +   (V2SF "DI")])
> > +
> >  (define_mode_attr mmxdoublemode
> >    [(V8QI "V8HI") (V4HI "V4SI")])
> >
> > @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm"
> >    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
> >                                                             <MODE>mode);
> >    operands[1] = GEN_INT (val);
> > -  machine_mode mode;
> > -  switch (GET_MODE_SIZE (<MODE>mode))
> > -    {
> > -    case 2:
> > -      mode = HImode;
> > -      break;
> > -    case 4:
> > -      mode = SImode;
> > -      break;
> > -    case 8:
> > -      mode = DImode;
> > -      break;
> > -    default:
> > -      gcc_unreachable ();
> > -    }
> > -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> > +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
> >  })
> >
> >  ;; For TARGET_64BIT we always round up to 8 bytes.
> > @@ -2974,33 +2967,49 @@ (define_insn "*mmx_<code><mode>3"
> >     (set_attr "type" "mmxadd,sselog,sselog,sselog")
> >     (set_attr "mode" "DI,TI,TI,TI")])
> >
> > -(define_insn "<code><mode>3"
> > -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> > +(define_expand "<code><mode>3"
> > +  [(parallel
> > +    [(set (match_operand:VI_16_32 0 "register_operand")
> > +        (any_logic:VI_16_32
> > +         (match_operand:VI_16_32 1 "register_operand")
> > +         (match_operand:VI_16_32 2 "register_operand")))
> > +   (clobber (reg:CC FLAGS_REG))])]
> > +  "")
> > +
> > +(define_insn "*<code><mode>3"
> > +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
> >          (any_logic:VI_16_32
> > -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> > -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> > +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> > +         (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand" "r,i,x,x,v")))
> >     (clobber (reg:CC FLAGS_REG))]
> > -  ""
> > +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> >    "#"
> > -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> > -   (set_attr "type" "alu,sselog,sselog,sselog")
> > -   (set_attr "mode" "SI,TI,TI,TI")])
> > +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> > +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> > +   (set_attr "mode" "SI,SI,TI,TI,TI")])
> >
> >  (define_split
> > -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> > +  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
> >          (any_logic:VI_16_32
> > -         (match_operand:VI_16_32 1 "general_reg_operand")
> > -         (match_operand:VI_16_32 2 "general_reg_operand")))
> > +         (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
> > +         (match_operand:VI_16_32 2 "register_or_x86_64_const_vector_operand")))
> >     (clobber (reg:CC FLAGS_REG))]
> >    "reload_completed"
> >    [(parallel
> >       [(set (match_dup 0)
> > -          (any_logic:SI (match_dup 1) (match_dup 2)))
> > +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
> >        (clobber (reg:CC FLAGS_REG))])]
> >  {
> > -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> > -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> > -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> > +  if (!register_operand (operands[2], <MODE>mode))
> > +    {
> > +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> > +                                                               <MODE>mode);
> > +      operands[2] = GEN_INT (val);
> > +    }
> > +  else
> > +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> > +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> > +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
> >  })
> >
> >  (define_split
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index c71c453cceb..5f63a7d52f5 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
> >    return trunc_int_for_mode (val, SImode) == val;
> >  })
> >
> > +(define_predicate "register_or_x86_64_const_vector_operand"
> > +  (ior (match_operand 0 "register_operand")
> > +       (match_operand 0 "x86_64_const_vector_operand")))
> > +
> >  ;; Return true when OP is nonimmediate or standard SSE constant.
> >  (define_predicate "nonimmediate_or_sse_const_operand"
> >    (ior (match_operand 0 "nonimmediate_operand")
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> > new file mode 100644
> > index 00000000000..bb52385c8a5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> > @@ -0,0 +1,27 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -O2" } */
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > +
> > +void
> > +foo3 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +  a[2] &= 3;
> > +  a[3] &= 3;
> > +}
> > +
> > +void
> > +foo4 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > +
> > +
> > +void
> > +foo1 (short* a, short* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > --
> > 2.18.1
> >



--
BR,
Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-19  6:56                       ` Hongtao Liu
@ 2022-07-19  9:37                         ` Uros Bizjak
  2022-07-20  2:37                           ` Hongtao Liu
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-19  9:37 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches

On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > And split it after reload.
> > >
> > > > You will need ix86_binary_operator_ok insn constraint here with
> > > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > > prepare insn operands.
> > > Split define_expand with just register_operand, and allow
> > > memory/immediate in define_insn, assume combine/forwprop will do optimization.
> >
> > But you will *ease* the job of the above passes if you use
> > ix86_fixup_binary_operands_no_copy in the expander.
> for -m32, it will hit ICE in
> Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
> mode=E_V4QImode, operands=0x7fffffffa970) a
> /gcc/config/i386/i386-expand.cc:1184
> 1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
> (gdb) n
> 1185      gcc_assert (dst == operands[0]); -- here
> (gdb)
>
> the original operands[0], operands[1], operands[2] are below
> (gdb) p debug_rtx (operands[0])
> (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
>         (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
> unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
> $1 = void
> (gdb) p debug_rtx (operands[1])
> (subreg:V4QI (reg:SI 129) 0)
> $2 = void
> (gdb) p debug_rtx (operands[2])
> (subreg:V4QI (reg:SI 98 [ _46 ]) 0)
> $3 = void
> (gdb)
>
> since operands[0] is mem and not equal to operands[1],
> ix86_fixup_binary_operands will create a pseudo register for dst. and
> then hit ICE.
> Is this a bug or assumed?

You will need ix86_expand_binary_operator here.

Uros.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-19  9:37                         ` Uros Bizjak
@ 2022-07-20  2:37                           ` Hongtao Liu
  2022-07-20  6:14                             ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: Hongtao Liu @ 2022-07-20  2:37 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: liuhongt, gcc-patches

On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > And split it after reload.
> > > >
> > > > > You will need ix86_binary_operator_ok insn constraint here with
> > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > > > prepare insn operands.
> > > > Split define_expand with just register_operand, and allow
> > > > memory/immediate in define_insn, assume combine/forwprop will do optimization.
> > >
> > > But you will *ease* the job of the above passes if you use
> > > ix86_fixup_binary_operands_no_copy in the expander.
> > for -m32, it will hit ICE in
> > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
> > mode=E_V4QImode, operands=0x7fffffffa970) a
> > /gcc/config/i386/i386-expand.cc:1184
> > 1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
> > (gdb) n
> > 1185      gcc_assert (dst == operands[0]); -- here
> > (gdb)
> >
> > the original operands[0], operands[1], operands[2] are below
> > (gdb) p debug_rtx (operands[0])
> > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
> >         (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
> > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
> > $1 = void
> > (gdb) p debug_rtx (operands[1])
> > (subreg:V4QI (reg:SI 129) 0)
> > $2 = void
> > (gdb) p debug_rtx (operands[2])
> > (subreg:V4QI (reg:SI 98 [ _46 ]) 0)
> > $3 = void
> > (gdb)
> >
> > since operands[0] is mem and not equal to operands[1],
> > ix86_fixup_binary_operands will create a pseudo register for dst. and
> > then hit ICE.
> > Is this a bug or assumed?
>
> You will need ix86_expand_binary_operator here.
It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn.

What about this?

-(define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
         (any_logic:VI_16_32
-         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
-   (clobber (reg:CC FLAGS_REG))]
+         (match_operand:VI_16_32 1 "nonimmediate_operand")
+         (match_operand:VI_16_32 2
"register_or_x86_64_const_vector_operand")))]
   ""
+{
+  rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands);
+  if (MEM_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode,
+                                            operands[1], operands[2]));
+  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+   DONE;
+})
+

>
> Uros.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-20  2:37                           ` Hongtao Liu
@ 2022-07-20  6:14                             ` Uros Bizjak
  2022-07-20  6:18                               ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-20  6:14 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches

On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > >
> > > > > And split it after reload.
> > > > >
> > > > > > You will need ix86_binary_operator_ok insn constraint here with
> > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > > > > prepare insn operands.
> > > > > Split define_expand with just register_operand, and allow
> > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization.
> > > >
> > > > But you will *ease* the job of the above passes if you use
> > > > ix86_fixup_binary_operands_no_copy in the expander.
> > > for -m32, it will hit ICE in
> > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
> > > mode=E_V4QImode, operands=0x7fffffffa970) a
> > > /gcc/config/i386/i386-expand.cc:1184
> > > 1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
> > > (gdb) n
> > > 1185      gcc_assert (dst == operands[0]); -- here
> > > (gdb)
> > >
> > > the original operands[0], operands[1], operands[2] are below
> > > (gdb) p debug_rtx (operands[0])
> > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
> > >         (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
> > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
> > > $1 = void
> > > (gdb) p debug_rtx (operands[1])
> > > (subreg:V4QI (reg:SI 129) 0)
> > > $2 = void
> > > (gdb) p debug_rtx (operands[2])
> > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0)
> > > $3 = void
> > > (gdb)
> > >
> > > since operands[0] is mem and not equal to operands[1],
> > > ix86_fixup_binary_operands will create a pseudo register for dst. and
> > > then hit ICE.
> > > Is this a bug or assumed?
> >
> > You will need ix86_expand_binary_operator here.
> It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn.
>
> What about this?

Still no good... You are using commutative operands, so the predicate
of operand 2 should also allow memory. So, the predicate should be
nonimmediate_or_x86_64_const_vector_operand. The intermediate insn
pattern should look something like *<any_or:code><mode>_1, but with
added XMM and MMX reg alternatives instead of mask regs.

Uros.

>
> -(define_insn "<code><mode>3"
> -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> +(define_expand "<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
>          (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> -   (clobber (reg:CC FLAGS_REG))]
> +         (match_operand:VI_16_32 1 "nonimmediate_operand")
> +         (match_operand:VI_16_32 2
> "register_or_x86_64_const_vector_operand")))]
>    ""
> +{
> +  rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands);
> +  if (MEM_P (operands[2]))
> +    operands[2] = force_reg (<MODE>mode, operands[2]);
> +  rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode,
> +                                            operands[1], operands[2]));
> +  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
> +  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
> +  if (dst != operands[0])
> +    emit_move_insn (operands[0], dst);
> +   DONE;
> +})
> +
>
> >
> > Uros.
>
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-20  6:14                             ` Uros Bizjak
@ 2022-07-20  6:18                               ` Uros Bizjak
  2022-07-20  6:54                                 ` Hongtao Liu
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-20  6:18 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches

On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
> > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > >
> > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > >
> > > > > > And split it after reload.
> > > > > >
> > > > > > > You will need ix86_binary_operator_ok insn constraint here with
> > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > > > > > prepare insn operands.
> > > > > > Split define_expand with just register_operand, and allow
> > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization.
> > > > >
> > > > > But you will *ease* the job of the above passes if you use
> > > > > ix86_fixup_binary_operands_no_copy in the expander.
> > > > for -m32, it will hit ICE in
> > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
> > > > mode=E_V4QImode, operands=0x7fffffffa970) a
> > > > /gcc/config/i386/i386-expand.cc:1184
> > > > 1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
> > > > (gdb) n
> > > > 1185      gcc_assert (dst == operands[0]); -- here
> > > > (gdb)
> > > >
> > > > the original operands[0], operands[1], operands[2] are below
> > > > (gdb) p debug_rtx (operands[0])
> > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
> > > >         (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
> > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
> > > > $1 = void
> > > > (gdb) p debug_rtx (operands[1])
> > > > (subreg:V4QI (reg:SI 129) 0)
> > > > $2 = void
> > > > (gdb) p debug_rtx (operands[2])
> > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0)
> > > > $3 = void
> > > > (gdb)
> > > >
> > > > since operands[0] is mem and not equal to operands[1],
> > > > ix86_fixup_binary_operands will create a pseudo register for dst. and
> > > > then hit ICE.
> > > > Is this a bug or assumed?
> > >
> > > You will need ix86_expand_binary_operator here.
> > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn.
> >
> > What about this?
>
> Still no good... You are using commutative operands, so the predicate
> of operand 2 should also allow memory. So, the predicate should be
> nonimmediate_or_x86_64_const_vector_operand. The intermediate insn
> pattern should look something like *<any_or:code><mode>_1, but with
> added XMM and MMX reg alternatives instead of mask regs.

Alternatively, you can use UNKNOWN operator to prevent
canonicalization, but then you should not use commutative constraint
in the intermediate insn. I think this is the best solution.

Uros.

> >
> > -(define_insn "<code><mode>3"
> > -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> > +(define_expand "<code><mode>3"
> > +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
> >          (any_logic:VI_16_32
> > -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> > -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> > -   (clobber (reg:CC FLAGS_REG))]
> > +         (match_operand:VI_16_32 1 "nonimmediate_operand")
> > +         (match_operand:VI_16_32 2
> > "register_or_x86_64_const_vector_operand")))]
> >    ""
> > +{
> > +  rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands);
> > +  if (MEM_P (operands[2]))
> > +    operands[2] = force_reg (<MODE>mode, operands[2]);
> > +  rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode,
> > +                                            operands[1], operands[2]));
> > +  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
> > +  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
> > +  if (dst != operands[0])
> > +    emit_move_insn (operands[0], dst);
> > +   DONE;
> > +})
> > +
> >
> > >
> > > Uros.
> >
> >
> >
> > --
> > BR,
> > Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-20  6:18                               ` Uros Bizjak
@ 2022-07-20  6:54                                 ` Hongtao Liu
  2022-07-20  7:18                                   ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: Hongtao Liu @ 2022-07-20  6:54 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: liuhongt, gcc-patches

On Wed, Jul 20, 2022 at 2:18 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
> > > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > > >
> > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > > >
> > > > > > > And split it after reload.
> > > > > > >
> > > > > > > > You will need ix86_binary_operator_ok insn constraint here with
> > > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > > > > > > prepare insn operands.
> > > > > > > Split define_expand with just register_operand, and allow
> > > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization.
> > > > > >
> > > > > > But you will *ease* the job of the above passes if you use
> > > > > > ix86_fixup_binary_operands_no_copy in the expander.
> > > > > for -m32, it will hit ICE in
> > > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
> > > > > mode=E_V4QImode, operands=0x7fffffffa970) a
> > > > > /gcc/config/i386/i386-expand.cc:1184
> > > > > 1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
> > > > > (gdb) n
> > > > > 1185      gcc_assert (dst == operands[0]); -- here
> > > > > (gdb)
> > > > >
> > > > > the original operands[0], operands[1], operands[2] are below
> > > > > (gdb) p debug_rtx (operands[0])
> > > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
> > > > >         (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
> > > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
> > > > > $1 = void
> > > > > (gdb) p debug_rtx (operands[1])
> > > > > (subreg:V4QI (reg:SI 129) 0)
> > > > > $2 = void
> > > > > (gdb) p debug_rtx (operands[2])
> > > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0)
> > > > > $3 = void
> > > > > (gdb)
> > > > >
> > > > > since operands[0] is mem and not equal to operands[1],
> > > > > ix86_fixup_binary_operands will create a pseudo register for dst. and
> > > > > then hit ICE.
> > > > > Is this a bug or assumed?
> > > >
> > > > You will need ix86_expand_binary_operator here.
> > > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn.
> > >
> > > What about this?
> >
> > Still no good... You are using commutative operands, so the predicate
> > of operand 2 should also allow memory. So, the predicate should be
> > nonimmediate_or_x86_64_const_vector_operand. The intermediate insn
> > pattern should look something like *<any_or:code><mode>_1, but with
> > added XMM and MMX reg alternatives instead of mask regs.
>
> Alternatively, you can use UNKNOWN operator to prevent
> canonicalization, but then you should not use commutative constraint
> in the intermediate insn. I think this is the best solution.
Like this?

-(define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
         (any_logic:VI_16_32
-         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
-   (clobber (reg:CC FLAGS_REG))]
+         (match_operand:VI_16_32 1 "nonimmediate_operand")
+         (match_operand:VI_16_32 2
"register_or_x86_64_const_vector_operand")))]
   ""
+{
+  rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands);
+  if (MEM_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+  rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode,
+                                            operands[1], operands[2]));
+  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
+  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
+  if (dst != operands[0])
+    emit_move_insn (operands[0], dst);
+   DONE;
+})
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
+        (any_logic:VI_16_32
+         (match_operand:VI_16_32 1 "nonimmediate_operand" "0,0,0,x,v")
+         (match_operand:VI_16_32 2
"register_or_x86_64_const_vector_operand" "r,i,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (UNKNOWN, <MODE>mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])

>
> Uros.
>
> > >
> > > -(define_insn "<code><mode>3"
> > > -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> > > +(define_expand "<code><mode>3"
> > > +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
> > >          (any_logic:VI_16_32
> > > -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> > > -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> > > -   (clobber (reg:CC FLAGS_REG))]
> > > +         (match_operand:VI_16_32 1 "nonimmediate_operand")
> > > +         (match_operand:VI_16_32 2
> > > "register_or_x86_64_const_vector_operand")))]
> > >    ""
> > > +{
> > > +  rtx dst = ix86_fixup_binary_operands (<CODE>, <MODE>mode, operands);
> > > +  if (MEM_P (operands[2]))
> > > +    operands[2] = force_reg (<MODE>mode, operands[2]);
> > > +  rtx op = gen_rtx_SET (dst, gen_rtx_fmt_ee (<CODE>, <MODE>mode,
> > > +                                            operands[1], operands[2]));
> > > +  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
> > > +  emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
> > > +  if (dst != operands[0])
> > > +    emit_move_insn (operands[0], dst);
> > > +   DONE;
> > > +})
> > > +
> > >
> > > >
> > > > Uros.
> > >
> > >
> > >
> > > --
> > > BR,
> > > Hongtao



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-20  6:54                                 ` Hongtao Liu
@ 2022-07-20  7:18                                   ` Uros Bizjak
  2022-07-20  7:22                                     ` Hongtao Liu
  0 siblings, 1 reply; 22+ messages in thread
From: Uros Bizjak @ 2022-07-20  7:18 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3409 bytes --]

On Wed, Jul 20, 2022 at 8:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Jul 20, 2022 at 2:18 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > >
> > > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
> > > > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > > > >
> > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > > > >
> > > > > > > > And split it after reload.
> > > > > > > >
> > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with
> > > > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > > > > > > > prepare insn operands.
> > > > > > > > Split define_expand with just register_operand, and allow
> > > > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization.
> > > > > > >
> > > > > > > But you will *ease* the job of the above passes if you use
> > > > > > > ix86_fixup_binary_operands_no_copy in the expander.
> > > > > > for -m32, it will hit ICE in
> > > > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
> > > > > > mode=E_V4QImode, operands=0x7fffffffa970) a
> > > > > > /gcc/config/i386/i386-expand.cc:1184
> > > > > > 1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
> > > > > > (gdb) n
> > > > > > 1185      gcc_assert (dst == operands[0]); -- here
> > > > > > (gdb)
> > > > > >
> > > > > > the original operands[0], operands[1], operands[2] are below
> > > > > > (gdb) p debug_rtx (operands[0])
> > > > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
> > > > > >         (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
> > > > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
> > > > > > $1 = void
> > > > > > (gdb) p debug_rtx (operands[1])
> > > > > > (subreg:V4QI (reg:SI 129) 0)
> > > > > > $2 = void
> > > > > > (gdb) p debug_rtx (operands[2])
> > > > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0)
> > > > > > $3 = void
> > > > > > (gdb)
> > > > > >
> > > > > > since operands[0] is mem and not equal to operands[1],
> > > > > > ix86_fixup_binary_operands will create a pseudo register for dst. and
> > > > > > then hit ICE.
> > > > > > Is this a bug or assumed?
> > > > >
> > > > > You will need ix86_expand_binary_operator here.
> > > > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn.
> > > >
> > > > What about this?
> > >
> > > Still no good... You are using commutative operands, so the predicate
> > > of operand 2 should also allow memory. So, the predicate should be
> > > nonimmediate_or_x86_64_const_vector_operand. The intermediate insn
> > > pattern should look something like *<any_or:code><mode>_1, but with
> > > added XMM and MMX reg alternatives instead of mask regs.
> >
> > Alternatively, you can use UNKNOWN operator to prevent
> > canonicalization, but then you should not use commutative constraint
> > in the intermediate insn. I think this is the best solution.
> Like this?

Please check the attached (lightly tested) patch that keeps
commutative operands.

Uros.

[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 4499 bytes --]

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..0a39d396430 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -86,6 +86,14 @@
   [(V8QI "b") (V4QI "b") (V2QI "b")
    (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +358,7 @@
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
 							    <MODE>mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (<MODE>mode))
-    {
-    case 2:
-      mode = HImode;
-      break;
-    case 4:
-      mode = SImode;
-      break;
-    case 8:
-      mode = DImode;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2974,33 +2967,50 @@
    (set_attr "type" "mmxadd,sselog,sselog,sselog")
    (set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+(define_expand "<code><mode>3"
+  [(parallel
+    [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-	  (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
-   (clobber (reg:CC FLAGS_REG))]
+	  (match_operand:VI_16_32 1 "nonimmediate_operand")
+	  (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand")))
+   (clobber (reg:CC FLAGS_REG))])]
   ""
+  "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
+        (any_logic:VI_16_32
+	  (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+	  (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand" "r,i,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "general_reg_operand")
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+	  (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
+	  (match_operand:VI_16_32 2 "reg_or_const_vector_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
      [(set (match_dup 0)
-	   (any_logic:SI (match_dup 1) (match_dup 2)))
+	   (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  if (!register_operand (operands[2], <MODE>mode))
+    {
+      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+								<MODE>mode);
+      operands[2] = GEN_INT (val);
+    }
+  else
+    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 42053ea7209..064596d9594 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1209,6 +1209,10 @@
   return trunc_int_for_mode (val, SImode) == val;
 })
 
+(define_predicate "nonimmediate_or_x86_64_const_vector_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_operand 0 "x86_64_const_vector_operand")))
+
 ;; Return true when OP is nonimmediate or standard SSE constant.
 (define_predicate "nonimmediate_or_sse_const_operand"
   (ior (match_operand 0 "nonimmediate_operand")

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-20  7:18                                   ` Uros Bizjak
@ 2022-07-20  7:22                                     ` Hongtao Liu
  2022-07-21  5:18                                       ` [PATCH V3] " liuhongt
  0 siblings, 1 reply; 22+ messages in thread
From: Hongtao Liu @ 2022-07-20  7:22 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: liuhongt, gcc-patches

On Wed, Jul 20, 2022 at 3:18 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Jul 20, 2022 at 8:54 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Wed, Jul 20, 2022 at 2:18 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Wed, Jul 20, 2022 at 8:14 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > >
> > > > On Wed, Jul 20, 2022 at 4:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > >
> > > > > On Tue, Jul 19, 2022 at 5:37 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > > > > >
> > > > > > On Tue, Jul 19, 2022 at 8:56 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > > > > > >
> > > > > > > On Tue, Jul 19, 2022 at 2:35 PM Uros Bizjak via Gcc-patches
> > > > > > > <gcc-patches@gcc.gnu.org> wrote:
> > > > > > > >
> > > > > > > > On Tue, Jul 19, 2022 at 8:07 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > > > > > >
> > > > > > > > > And split it after reload.
> > > > > > > > >
> > > > > > > > > > You will need ix86_binary_operator_ok insn constraint here with
> > > > > > > > > > corresponding expander using ix86_fixup_binary_operands_no_copy to
> > > > > > > > > > prepare insn operands.
> > > > > > > > > Split define_expand with just register_operand, and allow
> > > > > > > > > memory/immediate in define_insn, assume combine/forwprop will do optimization.
> > > > > > > >
> > > > > > > > But you will *ease* the job of the above passes if you use
> > > > > > > > ix86_fixup_binary_operands_no_copy in the expander.
> > > > > > > for -m32, it will hit ICE in
> > > > > > > Breakpoint 1, ix86_fixup_binary_operands_no_copy (code=XOR,
> > > > > > > mode=E_V4QImode, operands=0x7fffffffa970) a
> > > > > > > /gcc/config/i386/i386-expand.cc:1184
> > > > > > > 1184      rtx dst = ix86_fixup_binary_operands (code, mode, operands);
> > > > > > > (gdb) n
> > > > > > > 1185      gcc_assert (dst == operands[0]); -- here
> > > > > > > (gdb)
> > > > > > >
> > > > > > > the original operands[0], operands[1], operands[2] are below
> > > > > > > (gdb) p debug_rtx (operands[0])
> > > > > > > (mem/c:V4QI (plus:SI (reg/f:SI 77 virtual-stack-vars)
> > > > > > >         (const_int -8220 [0xffffffffffffdfe4])) [0 MEM <vector(4)
> > > > > > > unsigned char> [(unsigned char *)&tmp2 + 4B]+0 S4 A32])
> > > > > > > $1 = void
> > > > > > > (gdb) p debug_rtx (operands[1])
> > > > > > > (subreg:V4QI (reg:SI 129) 0)
> > > > > > > $2 = void
> > > > > > > (gdb) p debug_rtx (operands[2])
> > > > > > > (subreg:V4QI (reg:SI 98 [ _46 ]) 0)
> > > > > > > $3 = void
> > > > > > > (gdb)
> > > > > > >
> > > > > > > since operands[0] is mem and not equal to operands[1],
> > > > > > > ix86_fixup_binary_operands will create a pseudo register for dst. and
> > > > > > > then hit ICE.
> > > > > > > Is this a bug or assumed?
> > > > > >
> > > > > > You will need ix86_expand_binary_operator here.
> > > > > It will swap memory operand from op1 to op2 and hit ICE for unrecognized insn.
> > > > >
> > > > > What about this?
> > > >
> > > > Still no good... You are using commutative operands, so the predicate
> > > > of operand 2 should also allow memory. So, the predicate should be
> > > > nonimmediate_or_x86_64_const_vector_operand. The intermediate insn
> > > > pattern should look something like *<any_or:code><mode>_1, but with
> > > > added XMM and MMX reg alternatives instead of mask regs.
> > >
> > > Alternatively, you can use UNKNOWN operator to prevent
> > > canonicalization, but then you should not use commutative constraint
> > > in the intermediate insn. I think this is the best solution.
> > Like this?
>
> Please check the attached (lightly tested) patch that keeps
> commutative operands.
Yes, it looks best, I'll fully test the patch.
>
> Uros.



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH V3] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-20  7:22                                     ` Hongtao Liu
@ 2022-07-21  5:18                                       ` liuhongt
  2022-07-21  5:55                                         ` Uros Bizjak
  0 siblings, 1 reply; 22+ messages in thread
From: liuhongt @ 2022-07-21  5:18 UTC (permalink / raw)
  To: gcc-patches

And split it after reload.

gcc/ChangeLog:

	PR target/106038
	* config/i386/mmx.md (<code><mode>3): New define_expand, it's
	original "<code><mode>3".
	(*<code><mode>3): New define_insn, it's original
	"<code><mode>3" be extended to handle memory and immediate
	operand with ix86_binary_operator_ok. Also adjust define_split
	after it.
	(mmxinsnmode): New mode attribute.
	(*mov<mode>_imm): Refactor with mmxinsnmode.
	* config/i386/predicates.md
	(register_or_x86_64_const_vector_operand): New predicate.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106038-1.c: New test.
---
 gcc/config/i386/mmx.md                     | 70 ++++++++++++----------
 gcc/config/i386/predicates.md              |  4 ++
 gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 +++++++++
 3 files changed, 70 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..dda4b43f5c1 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
    (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
 							    <MODE>mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (<MODE>mode))
-    {
-    case 2:
-      mode = HImode;
-      break;
-    case 4:
-      mode = SImode;
-      break;
-    case 8:
-      mode = DImode;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2974,33 +2967,48 @@ (define_insn "*mmx_<code><mode>3"
    (set_attr "type" "mmxadd,sselog,sselog,sselog")
    (set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-	  (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
-   (clobber (reg:CC FLAGS_REG))]
+	  (match_operand:VI_16_32 1 "nonimmediate_operand")
+	  (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand")))]
   ""
+  "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
+        (any_logic:VI_16_32
+	  (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+	  (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand" "r,i,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
+  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
         (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "general_reg_operand")
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+	  (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
+	  (match_operand:VI_16_32 2 "reg_or_const_vector_operand")))
    (clobber (reg:CC FLAGS_REG))]
   "reload_completed"
   [(parallel
      [(set (match_dup 0)
-	   (any_logic:SI (match_dup 1) (match_dup 2)))
+	   (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  if (!register_operand (operands[2], <MODE>mode))
+    {
+      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+								<MODE>mode);
+      operands[2] = GEN_INT (val);
+    }
+  else
+    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index c71c453cceb..73dfd46bf90 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
   return trunc_int_for_mode (val, SImode) == val;
 })
 
+(define_predicate "nonimmediate_or_x86_64_const_vector_operand"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_operand 0 "x86_64_const_vector_operand")))
+
 ;; Return true when OP is nonimmediate or standard SSE constant.
 (define_predicate "nonimmediate_or_sse_const_operand"
   (ior (match_operand 0 "nonimmediate_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
new file mode 100644
index 00000000000..bb52385c8a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo3 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
+
+
+void
+foo1 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH V3] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative.
  2022-07-21  5:18                                       ` [PATCH V3] " liuhongt
@ 2022-07-21  5:55                                         ` Uros Bizjak
  0 siblings, 0 replies; 22+ messages in thread
From: Uros Bizjak @ 2022-07-21  5:55 UTC (permalink / raw)
  To: liuhongt; +Cc: gcc-patches

On Thu, Jul 21, 2022 at 7:19 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> And split it after reload.
>
> gcc/ChangeLog:
>
>         PR target/106038
>         * config/i386/mmx.md (<code><mode>3): New define_expand, it's
>         original "<code><mode>3".
>         (*<code><mode>3): New define_insn, it's original
>         "<code><mode>3" be extended to handle memory and immediate
>         operand with ix86_binary_operator_ok. Also adjust define_split
>         after it.
>         (mmxinsnmode): New mode attribute.
>         (*mov<mode>_imm): Refactor with mmxinsnmode.
>         * config/i386/predicates.md
>         (register_or_x86_64_const_vector_operand): New predicate.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr106038-1.c: New test.

OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/mmx.md                     | 70 ++++++++++++----------
>  gcc/config/i386/predicates.md              |  4 ++
>  gcc/testsuite/gcc.target/i386/pr106038-1.c | 27 +++++++++
>  3 files changed, 70 insertions(+), 31 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 3294c1e6274..dda4b43f5c1 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -86,6 +86,14 @@ (define_mode_attr mmxvecsize
>    [(V8QI "b") (V4QI "b") (V2QI "b")
>     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
>
> +;; Mapping to same size integral mode.
> +(define_mode_attr mmxinsnmode
> +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> +   (V4HI "DI") (V2HI "SI")
> +   (V2SI "DI")
> +   (V4HF "DI") (V2HF "SI")
> +   (V2SF "DI")])
> +
>  (define_mode_attr mmxdoublemode
>    [(V8QI "V8HI") (V4HI "V4SI")])
>
> @@ -350,22 +358,7 @@ (define_insn_and_split "*mov<mode>_imm"
>    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
>                                                             <MODE>mode);
>    operands[1] = GEN_INT (val);
> -  machine_mode mode;
> -  switch (GET_MODE_SIZE (<MODE>mode))
> -    {
> -    case 2:
> -      mode = HImode;
> -      break;
> -    case 4:
> -      mode = SImode;
> -      break;
> -    case 8:
> -      mode = DImode;
> -      break;
> -    default:
> -      gcc_unreachable ();
> -    }
> -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  ;; For TARGET_64BIT we always round up to 8 bytes.
> @@ -2974,33 +2967,48 @@ (define_insn "*mmx_<code><mode>3"
>     (set_attr "type" "mmxadd,sselog,sselog,sselog")
>     (set_attr "mode" "DI,TI,TI,TI")])
>
> -(define_insn "<code><mode>3"
> -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> +(define_expand "<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand")
>          (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> -   (clobber (reg:CC FLAGS_REG))]
> +         (match_operand:VI_16_32 1 "nonimmediate_operand")
> +         (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand")))]
>    ""
> +  "ix86_expand_binary_operator (<CODE>, <MODE>mode, operands); DONE;")
> +
> +(define_insn "*<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=?r,m,x,x,v")
> +        (any_logic:VI_16_32
> +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> +         (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_const_vector_operand" "r,i,x,x,v")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>    "#"
> -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> -   (set_attr "type" "alu,sselog,sselog,sselog")
> -   (set_attr "mode" "SI,TI,TI,TI")])
> +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> +   (set_attr "mode" "SI,SI,TI,TI,TI")])
>
>  (define_split
> -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_gr_operand")
>          (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "general_reg_operand")
> -         (match_operand:VI_16_32 2 "general_reg_operand")))
> +         (match_operand:VI_16_32 1 "nonimmediate_gr_operand")
> +         (match_operand:VI_16_32 2 "reg_or_const_vector_operand")))
>     (clobber (reg:CC FLAGS_REG))]
>    "reload_completed"
>    [(parallel
>       [(set (match_dup 0)
> -          (any_logic:SI (match_dup 1) (match_dup 2)))
> +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>  {
> -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +  if (!register_operand (operands[2], <MODE>mode))
> +    {
> +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> +                                                               <MODE>mode);
> +      operands[2] = GEN_INT (val);
> +    }
> +  else
> +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  (define_split
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index c71c453cceb..73dfd46bf90 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
>    return trunc_int_for_mode (val, SImode) == val;
>  })
>
> +(define_predicate "nonimmediate_or_x86_64_const_vector_operand"
> +  (ior (match_operand 0 "nonimmediate_operand")
> +       (match_operand 0 "x86_64_const_vector_operand")))
> +
>  ;; Return true when OP is nonimmediate or standard SSE constant.
>  (define_predicate "nonimmediate_or_sse_const_operand"
>    (ior (match_operand 0 "nonimmediate_operand")
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> new file mode 100644
> index 00000000000..bb52385c8a5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo3 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 3;
> +  a[3] &= 3;
> +}
> +
> +void
> +foo4 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> +
> +
> +void
> +foo1 (short* a, short* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2022-07-21  5:55 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-11  1:15 [PATCH] Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns liuhongt
2022-07-11  8:02 ` Uros Bizjak
2022-07-12  6:37   ` Hongtao Liu
2022-07-12  7:15     ` Uros Bizjak
2022-07-14  5:33       ` [PATCH] Extend 64-bit vector bit_op patterns with ?r alternative liuhongt
2022-07-14  7:22         ` Uros Bizjak
2022-07-14  9:32           ` Hongtao Liu
2022-07-14  9:46             ` Uros Bizjak
2022-07-18  1:59               ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m, 0, i)(vertical) alternative liuhongt
2022-07-18  6:28                 ` [PATCH] Extend 16/32-bit vector bit_op patterns with (m,0,i)(vertical) alternative Uros Bizjak
2022-07-19  6:07                   ` [PATCH V2] Extend 16/32-bit vector bit_op patterns with (m, 0, i) alternative liuhongt
2022-07-19  6:34                     ` Uros Bizjak
2022-07-19  6:56                       ` Hongtao Liu
2022-07-19  9:37                         ` Uros Bizjak
2022-07-20  2:37                           ` Hongtao Liu
2022-07-20  6:14                             ` Uros Bizjak
2022-07-20  6:18                               ` Uros Bizjak
2022-07-20  6:54                                 ` Hongtao Liu
2022-07-20  7:18                                   ` Uros Bizjak
2022-07-20  7:22                                     ` Hongtao Liu
2022-07-21  5:18                                       ` [PATCH V3] " liuhongt
2022-07-21  5:55                                         ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).