public inbox for gcc-patches@gcc.gnu.org
* [PATCH] Strip of a vector load which is only used partially.
@ 2022-05-05  5:04 liuhongt
  2022-05-05  8:26 ` Richard Biener
  0 siblings, 1 reply; 9+ messages in thread
From: liuhongt @ 2022-05-05  5:04 UTC (permalink / raw)
  To: gcc-patches

Optimize

  _1 = *srcp_3(D);
  _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
  _5 = BIT_FIELD_REF <_4, 128, 0>;

to

  _1 = *srcp_3(D);
  _5 = BIT_FIELD_REF <_1, 128, 128>;

the above will eventually be optimized to

_5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;

Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/102583
	* gimple.h (gate_optimize_vector_load): Declare.
	* match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6,
	7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128).
	* tree-ssa-forwprop.cc (gate_optimize_vector_load): New
	function.
	(pass_forwprop::execute): Move the conditions into the new
	function.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102583.c: New test.
---
 gcc/gimple.h                             |  1 +
 gcc/match.pd                             | 56 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++
 gcc/tree-ssa-forwprop.cc                 | 32 +++++++++-----
 4 files changed, 109 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c

diff --git a/gcc/gimple.h b/gcc/gimple.h
index 6b1e89ad74e..1747dae1193 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -1638,6 +1638,7 @@ extern void maybe_remove_unused_call_args (struct function *, gimple *);
 extern bool gimple_inexpensive_call_p (gcall *);
 extern bool stmt_can_terminate_bb_p (gimple *);
 extern location_t gimple_or_expr_nonartificial_location (gimple *, tree);
+extern bool gate_optimize_vector_load (gimple *);
 
 /* Return the disposition for a warning (or all warnings by default)
    for a statement.  */
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b3..ac214310251 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6832,6 +6832,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	}
 	(cmp @0 { res; })))))))))
 
+#if GIMPLE
+/* Simplify partial vector access, transform
+
+   V8SI A;
+   V4SI B;
+   A = *PA;
+   B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 });
+   C = BIT_FIELD_REF (B, 128, 0)
+
+to
+
+   A = *PA;
+   C = BIT_FIELD_REF (A, 128, 128);
+
+optimize_vector_load will eventually optimize the above to
+
+   C = BIT_FIELD_REF (*PA, 128, 128);  */
+
+(simplify
+ (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos)
+ (if (VECTOR_TYPE_P (type)
+     && TYPE_MODE (type) != BLKmode
+     && single_use (@2)
+     && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0))
+     && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0))))
+  (with
+   {
+     unsigned HOST_WIDE_INT nelts = -1;
+     if (!VECTOR_CST_NELTS (@1).is_constant (&nelts))
+       return NULL_TREE;
+     tree inner_type = TREE_TYPE (type);
+     unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type));
+     unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos);
+     unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize);
+     unsigned HOST_WIDE_INT start
+       = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w));
+
+     for (unsigned HOST_WIDE_INT i  = pos / elt_w + 1; i != size / elt_w; i++)
+       {
+	 /* Continuous area.  */
+	 if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1
+	     != tree_to_uhwi (vector_cst_elt (@1, i - 1)))
+	   return NULL_TREE;
+       }
+
+     /* Aligned or support movmisalign_optab.  */
+     unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type));
+     if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align
+	  || start * elt_w % dest_align)
+	&& (optab_handler (movmisalign_optab, TYPE_MODE (type))
+	    == CODE_FOR_nothing))
+       return NULL_TREE;
+   }
+   (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); }))))
+#endif
+
 /* Canonicalizations of BIT_FIELD_REFs.  */
 
 (simplify
diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
new file mode 100644
index 00000000000..ff2ffb5e671
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102583.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v2sf __attribute__((vector_size(8)));
+
+v8sf part (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8],
+      (float)src[9], (float)src[10], (float)src[11], (float)src[12] };
+}
+
+v4sf part1 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] };
+}
+
+v2sf part2 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v2sf) { (float)src[4], (float)src[5] };
+}
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 484491fa1c5..2c8d8bc6dce 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -3074,6 +3074,27 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
   return true;
 }
 
+/* Gate for optimize_vector_load.  */
+bool
+gate_optimize_vector_load (gimple* stmt)
+{
+  if (!is_gimple_assign (stmt))
+    return false;
+
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  return (cfun
+	  && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
+	  && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
+	      /* After vector lowering rewrite all loads, but
+		 initially do not since this conflicts with
+		 vector CONSTRUCTOR to shuffle optimization.  */
+	      || (cfun->curr_properties & PROP_gimple_lvec))
+	  && gimple_assign_load_p (stmt)
+	  && !gimple_has_volatile_ops (stmt)
+	  && !stmt_can_throw_internal (cfun, stmt)
+	  && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)));
+}
 
 /* Rewrite the vector load at *GSI to component-wise loads if the load
    is only used in BIT_FIELD_REF extractions with eventual intermediate
@@ -3500,16 +3521,7 @@ pass_forwprop::execute (function *fun)
 	      else
 		gsi_next (&gsi);
 	    }
-	  else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
-		   && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
-		       /* After vector lowering rewrite all loads, but
-			  initially do not since this conflicts with
-			  vector CONSTRUCTOR to shuffle optimization.  */
-		       || (fun->curr_properties & PROP_gimple_lvec))
-		   && gimple_assign_load_p (stmt)
-		   && !gimple_has_volatile_ops (stmt)
-		   && !stmt_can_throw_internal (cfun, stmt)
-		   && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)))
+	  else if (gate_optimize_vector_load (stmt))
 	    optimize_vector_load (&gsi);
 
 	  else if (code == COMPLEX_EXPR)
-- 
2.18.1



* Re: [PATCH] Strip of a vector load which is only used partially.
  2022-05-05  5:04 [PATCH] Strip of a vector load which is only used partially liuhongt
@ 2022-05-05  8:26 ` Richard Biener
  2022-05-09  5:11   ` [PATCH v2] " liuhongt
  2022-05-09 22:57   ` [PATCH] " Jeff Law
  0 siblings, 2 replies; 9+ messages in thread
From: Richard Biener @ 2022-05-05  8:26 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches

On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Optimize
>
>   _1 = *srcp_3(D);
>   _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>   _5 = BIT_FIELD_REF <_4, 128, 0>;
>
> to
>
>   _1 = *srcp_3(D);
>   _5 = BIT_FIELD_REF <_1, 128, 128>;
>
> the upper will finally be optimized to
>
> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
> Ok for trunk?

Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
handle this in the

  if (code == VEC_PERM_EXPR
      && constant_multiple_p (bit_field_offset (op), size, &idx))
    {

part of the code - maybe that needs to be enhanced to cover
a contiguous stride in the VEC_PERM_EXPR.  I see
we have

  size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
  if (maybe_ne (bit_field_size (op), size))
    return false;

where it will currently bail, so adjust that to check for a
constant multiple.  I also think we should only handle the
case where the new bit_field_offset alignment is not
worse than the original one.
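
Completely untested, but for the constant-multiple part I'm thinking
of something along these lines (with nelts_op the number of vector
elements the BIT_FIELD_REF covers):

  unsigned HOST_WIDE_INT nelts_op;
  if (!constant_multiple_p (bit_field_size (op), size, &nelts_op))
    return false;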

That said, I'd prefer if you integrate this transform with
simplify_bitfield_ref.

Richard.

>
> gcc/ChangeLog:
>
>         PR tree-optimization/102583
>         * gimple.h (gate_optimize_vector_load): Declare.
>         * match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6,
>         7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128).
>         * tree-ssa-forwprop.cc (gate_optimize_vector_load): New
>         function.
>         (pass_forwprop::execute): Put condition codes in the upper new
>         function.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102583.c: New test.
> ---
>  gcc/gimple.h                             |  1 +
>  gcc/match.pd                             | 56 ++++++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++
>  gcc/tree-ssa-forwprop.cc                 | 32 +++++++++-----
>  4 files changed, 109 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
>
> diff --git a/gcc/gimple.h b/gcc/gimple.h
> index 6b1e89ad74e..1747dae1193 100644
> --- a/gcc/gimple.h
> +++ b/gcc/gimple.h
> @@ -1638,6 +1638,7 @@ extern void maybe_remove_unused_call_args (struct function *, gimple *);
>  extern bool gimple_inexpensive_call_p (gcall *);
>  extern bool stmt_can_terminate_bb_p (gimple *);
>  extern location_t gimple_or_expr_nonartificial_location (gimple *, tree);
> +extern bool gate_optimize_vector_load (gimple *);
>
>  /* Return the disposition for a warning (or all warnings by default)
>     for a statement.  */
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6d691d302b3..ac214310251 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -6832,6 +6832,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>         }
>         (cmp @0 { res; })))))))))
>
> +#if GIMPLE
> +/* Simplify partail vector access, transform
> +
> +   V8SI A;
> +   V4SI B;
> +   A = *PA;
> +   B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 });
> +   C = BIT_FIELD_REF (B, 128, 0)
> +
> +to
> +
> +   A = *PA;
> +   C = BIT_FIELD_REF (B, 128, 128);
> +
> +optimize_vector_load will eventually optimize the upper to
> +
> +   C = BIT_FIELD_REF (*PA, 128, 128);  */
> +
> +(simplify
> + (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos)
> + (if (VECTOR_TYPE_P (type)
> +     && TYPE_MODE (type) != BLKmode
> +     && single_use (@2)
> +     && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0))
> +     && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0))))
> +  (with
> +   {
> +     unsigned HOST_WIDE_INT nelts = -1;
> +     if (!VECTOR_CST_NELTS (@1).is_constant (&nelts))
> +       return NULL_TREE;
> +     tree inner_type = TREE_TYPE (type);
> +     unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type));
> +     unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos);
> +     unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize);
> +     unsigned HOST_WIDE_INT start
> +       = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w));
> +
> +     for (unsigned HOST_WIDE_INT i  = pos / elt_w + 1; i != size / elt_w; i++)
> +       {
> +        /* Continuous area.  */
> +        if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1
> +            != tree_to_uhwi (vector_cst_elt (@1, i - 1)))
> +          return NULL_TREE;
> +       }
> +
> +     /* Aligned or support movmisalign_optab.  */
> +     unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type));
> +     if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align
> +         || start * elt_w % dest_align)
> +       && (optab_handler (movmisalign_optab, TYPE_MODE (type))
> +           == CODE_FOR_nothing))
> +       return NULL_TREE;
> +   }
> +   (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); }))))
> +#endif
> +
>  /* Canonicalizations of BIT_FIELD_REFs.  */
>
>  (simplify
> diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
> new file mode 100644
> index 00000000000..ff2ffb5e671
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102583.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
> +
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef float v8sf __attribute__((vector_size(32)));
> +typedef float v4sf __attribute__((vector_size(16)));
> +typedef float v2sf __attribute__((vector_size(8)));
> +
> +v8sf part (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8],
> +      (float)src[9], (float)src[10], (float)src[11], (float)src[12] };
> +}
> +
> +v4sf part1 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] };
> +}
> +
> +v2sf part2 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v2sf) { (float)src[4], (float)src[5] };
> +}
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 484491fa1c5..2c8d8bc6dce 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3074,6 +3074,27 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
>    return true;
>  }
>
> +/* Gate for optimize_vector_load.  */
> +bool
> +gate_optimize_vector_load (gimple* stmt)
> +{
> +  if (!is_gimple_assign (stmt))
> +    return false;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +  return (cfun
> +         && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> +         && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> +             /* After vector lowering rewrite all loads, but
> +                initially do not since this conflicts with
> +                vector CONSTRUCTOR to shuffle optimization.  */
> +             || (cfun->curr_properties & PROP_gimple_lvec))
> +         && gimple_assign_load_p (stmt)
> +         && !gimple_has_volatile_ops (stmt)
> +         && !stmt_can_throw_internal (cfun, stmt)
> +         && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)));
> +}
>
>  /* Rewrite the vector load at *GSI to component-wise loads if the load
>     is only used in BIT_FIELD_REF extractions with eventual intermediate
> @@ -3500,16 +3521,7 @@ pass_forwprop::execute (function *fun)
>               else
>                 gsi_next (&gsi);
>             }
> -         else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> -                  && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> -                      /* After vector lowering rewrite all loads, but
> -                         initially do not since this conflicts with
> -                         vector CONSTRUCTOR to shuffle optimization.  */
> -                      || (fun->curr_properties & PROP_gimple_lvec))
> -                  && gimple_assign_load_p (stmt)
> -                  && !gimple_has_volatile_ops (stmt)
> -                  && !stmt_can_throw_internal (cfun, stmt)
> -                  && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)))
> +         else if (gate_optimize_vector_load (stmt))
>             optimize_vector_load (&gsi);
>
>           else if (code == COMPLEX_EXPR)
> --
> 2.18.1
>


* [PATCH v2] Strip of a vector load which is only used partially.
  2022-05-05  8:26 ` Richard Biener
@ 2022-05-09  5:11   ` liuhongt
  2022-05-10  6:53     ` Richard Biener
  2022-05-09 22:57   ` [PATCH] " Jeff Law
  1 sibling, 1 reply; 9+ messages in thread
From: liuhongt @ 2022-05-09  5:11 UTC (permalink / raw)
  To: gcc-patches

Here's the adjusted patch.
Ok for trunk?

Optimize

  _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
  _5 = BIT_FIELD_REF <_4, 128, 0>;

to

  _5 = BIT_FIELD_REF <_1, 128, 128>;

gcc/ChangeLog:

	PR tree-optimization/102583
	* tree-ssa-forwprop.cc (simplify_bitfield_ref): Extend to handle a
	contiguous stride in the VEC_PERM_EXPR.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102583.c: New test.
	* gcc.target/i386/pr92645-2.c: Adjust testcase.
	* gcc.target/i386/pr92645-3.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr102583.c  | 30 ++++++++
 gcc/testsuite/gcc.target/i386/pr92645-2.c |  4 +-
 gcc/testsuite/gcc.target/i386/pr92645-3.c |  4 +-
 gcc/tree-ssa-forwprop.cc                  | 89 ++++++++++++++++-------
 4 files changed, 96 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c

diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
new file mode 100644
index 00000000000..4ef2f296d0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102583.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+32\(%.*%ymm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+16\(%.*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v2sf __attribute__((vector_size(8)));
+
+v8sf part (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v8sf) { (float)src[8], (float) src[9], (float)src[10], (float)src[11],
+      (float)src[12], (float)src[13], (float)src[14], (float)src[15] };
+}
+
+v4sf part1 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v4sf) { (float)src[4], (float)src[5], (float)src[6], (float)src[7] };
+}
+
+v2sf part2 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v2sf) { (float)src[4], (float)src[5] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c
index d34ed3aa8e5..f0608de938a 100644
--- a/gcc/testsuite/gcc.target/i386/pr92645-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c
@@ -29,6 +29,6 @@ void odd (v2si *dst, v4si *srcp)
 }
 
 /* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
 /* Ideally highpart extraction would elide the permutation as well.  */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c
index 9c08c9fb632..691011195c9 100644
--- a/gcc/testsuite/gcc.target/i386/pr92645-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c
@@ -32,6 +32,6 @@ void odd (v4sf *dst, v8si *srcp)
 /* Four conversions, on the smaller vector type, to not convert excess
    elements.  */
 /* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
 /* Ideally highpart extraction would elide the VEC_PERM_EXPR as well.  */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 484491fa1c5..f91f738895d 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -2334,8 +2334,10 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
   gimple *stmt = gsi_stmt (*gsi);
   gimple *def_stmt;
   tree op, op0, op1;
-  tree elem_type;
-  unsigned idx, size;
+  tree elem_type, type;
+  tree p, m, tem;
+  unsigned HOST_WIDE_INT nelts;
+  unsigned idx, size, elem_size;
   enum tree_code code;
 
   op = gimple_assign_rhs1 (stmt);
@@ -2353,42 +2355,75 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
   op1 = TREE_OPERAND (op, 1);
   code = gimple_assign_rhs_code (def_stmt);
   elem_type = TREE_TYPE (TREE_TYPE (op0));
-  if (TREE_TYPE (op) != elem_type)
+  type = TREE_TYPE (op);
+  /* Also handle vector types,
+   i.e.
+   _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
+   _11 = BIT_FIELD_REF <_7, 64, 0>;
+
+   to
+
+   _11 = BIT_FIELD_REF <_1, 64, 64>.  */
+  if (type != elem_type
+      && (!VECTOR_TYPE_P (type) || TREE_TYPE (type) != elem_type))
     return false;
 
-  size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+  elem_size = size = TREE_INT_CST_LOW (TYPE_SIZE (type));
   if (maybe_ne (bit_field_size (op), size))
     return false;
 
-  if (code == VEC_PERM_EXPR
-      && constant_multiple_p (bit_field_offset (op), size, &idx))
+  if (code != VEC_PERM_EXPR
+      || !constant_multiple_p (bit_field_offset (op), size, &idx))
+    return false;
+
+  m = gimple_assign_rhs3 (def_stmt);
+  if (TREE_CODE (m) != VECTOR_CST
+      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
+    return false;
+
+  /* One element.  */
+  if (type == elem_type)
+    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
+  else
     {
-      tree p, m, tem;
-      unsigned HOST_WIDE_INT nelts;
-      m = gimple_assign_rhs3 (def_stmt);
-      if (TREE_CODE (m) != VECTOR_CST
-	  || !VECTOR_CST_NELTS (m).is_constant (&nelts))
+      elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+      unsigned nelts_op;
+      if (!constant_multiple_p (bit_field_size (op), elem_size, &nelts_op))
 	return false;
-      idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
-      idx %= 2 * nelts;
-      if (idx < nelts)
-	{
-	  p = gimple_assign_rhs1 (def_stmt);
-	}
-      else
+      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx));
+      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1));
+      /* Both ends must be in the same input vector.  */
+      if ((start < nelts) != (end < nelts))
+	return false;
+      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
 	{
-	  p = gimple_assign_rhs2 (def_stmt);
-	  idx -= nelts;
+	  /* Continuous area.  */
+	  if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) - 1
+	      != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1)))
+	    return false;
 	}
-      tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
-		    unshare_expr (p), op1, bitsize_int (idx * size));
-      gimple_assign_set_rhs1 (stmt, tem);
-      fold_stmt (gsi);
-      update_stmt (gsi_stmt (*gsi));
-      return true;
+      /* Alignment not worse than before.  */
+      unsigned dest_align = TREE_INT_CST_LOW (TYPE_SIZE (type));
+      if (start * elem_size % dest_align)
+       return false;
+      idx = start;
     }
 
-  return false;
+  idx %= 2 * nelts;
+  if (idx < nelts)
+    p = gimple_assign_rhs1 (def_stmt);
+  else
+    {
+      p = gimple_assign_rhs2 (def_stmt);
+      idx -= nelts;
+    }
+
+  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
+		unshare_expr (p), op1, bitsize_int (idx * elem_size));
+  gimple_assign_set_rhs1 (stmt, tem);
+  fold_stmt (gsi);
+  update_stmt (gsi_stmt (*gsi));
+  return true;
 }
 
 /* Determine whether applying the 2 permutations (mask1 then mask2)
-- 
2.18.1



* Re: [PATCH] Strip of a vector load which is only used partially.
  2022-05-05  8:26 ` Richard Biener
  2022-05-09  5:11   ` [PATCH v2] " liuhongt
@ 2022-05-09 22:57   ` Jeff Law
  2022-05-10  6:30     ` Richard Biener
  1 sibling, 1 reply; 9+ messages in thread
From: Jeff Law @ 2022-05-09 22:57 UTC (permalink / raw)
  To: gcc-patches



On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
> On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>> Optimize
>>
>>    _1 = *srcp_3(D);
>>    _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>>    _5 = BIT_FIELD_REF <_4, 128, 0>;
>>
>> to
>>
>>    _1 = *srcp_3(D);
>>    _5 = BIT_FIELD_REF <_1, 128, 128>;
>>
>> the upper will finally be optimized to
>>
>> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>>
>> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
>> Ok for trunk?
> Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
> handle this in the
>
>    if (code == VEC_PERM_EXPR
>        && constant_multiple_p (bit_field_offset (op), size, &idx))
>      {
>
> part of the code - maybe that needs to be enhanced to cover
> a contiguous stride in the VEC_PERM_EXPR.  I see
> we have
>
>    size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>    if (maybe_ne (bit_field_size (op), size))
>      return false;
>
> where it will currently bail, so adjust that to check for a
> constant multiple.  I also think we should only handle the
> case where the new bit_field_offset alignment is not
> worse than the original one.
>
> That said, I'd prefer if you integrate this transform with
> simplify_bitfield_ref.
I've got a hack here that tries to do something similar, but it's trying 
to catch the case where a CONSTRUCTOR feeds the BIT_FIELD_REF.  It 
walks the CONSTRUCTOR elements to see if an element has the right 
offset/size to satisfy the BIT_FIELD_REF.  For x264 we're often able to 
eliminate the VEC_PERMUTE entirely and just forward operands into the 
BIT_FIELD_REF.
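
Not the actual hack, but roughly the walk looks like the following
(ctor and elem_size here are just placeholders for the feeding
CONSTRUCTOR and the size of one of its elements):

  unsigned HOST_WIDE_INT i;
  tree val;
  FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (ctor), i, val)
    if (known_eq (bit_field_offset (op), i * elem_size)
        && known_eq (bit_field_size (op), elem_size))
      {
        /* Forward the matching CONSTRUCTOR element into the
           BIT_FIELD_REF.  */
        gimple_assign_set_rhs_from_tree (gsi, val);
        update_stmt (gsi_stmt (*gsi));
        return true;
      }
  return false;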

I was leaning towards moving those bits into match.pd before submitting, 
but if you'd prefer them in tree-ssa-forwprop, that's even easier.

Jeff




* Re: [PATCH] Strip of a vector load which is only used partially.
  2022-05-09 22:57   ` [PATCH] " Jeff Law
@ 2022-05-10  6:30     ` Richard Biener
  2022-06-26 18:59       ` Jeff Law
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Biener @ 2022-05-10  6:30 UTC (permalink / raw)
  To: Jeff Law; +Cc: GCC Patches

On Tue, May 10, 2022 at 12:58 AM Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
> > On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
> >> Optimize
> >>
> >>    _1 = *srcp_3(D);
> >>    _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
> >>    _5 = BIT_FIELD_REF <_4, 128, 0>;
> >>
> >> to
> >>
> >>    _1 = *srcp_3(D);
> >>    _5 = BIT_FIELD_REF <_1, 128, 128>;
> >>
> >> the upper will finally be optimized to
> >>
> >> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
> >>
> >> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
> >> Ok for trunk?
> > Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
> > handle this in the
> >
> >    if (code == VEC_PERM_EXPR
> >        && constant_multiple_p (bit_field_offset (op), size, &idx))
> >      {
> >
> > part of the code - maybe that needs to be enhanced to cover
> > a contiguous stride in the VEC_PERM_EXPR.  I see
> > we have
> >
> >    size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> >    if (maybe_ne (bit_field_size (op), size))
> >      return false;
> >
> > where it will currently bail, so adjust that to check for a
> > constant multiple.  I also think we should only handle the
> > case where the new bit_field_offset alignment is not
> > worse than the original one.
> >
> > That said, I'd prefer if you integrate this transform with
> > simplify_bitfield_ref.
> I've got a hack here that tries to do something similar, but it's trying
> to catch the case where we CONSTRUCTOR feeds the BIT_FIELD_REF.  It
> walks the CONSTRUCTOR elements to see if an element has the right
> offset/size to satisify the BIT_FIELD_REF. For x264 we're often able to
> eliminate the VEC_PERMUTE entirely and just forward operands into the
> BIT_FIELD_REF.
>
> I was leaning towards moving those bits into match.pd before submitting,
> but if you'd prefer them in tree-ssa-forwprop, that's even easier.

I think when deciding where to put things it's important to look at where
related transforms reside.  We already have a
(simplify (BIT_FIELD_REF CONSTRUCTOR@ ...)) pattern which should handle
your case, so instead of adding something new it would be nice to figure
out why it doesn't handle the case you are interested in and eventually
just adjust the existing pattern.

In the case of the above patch there isn't a match.pd pattern for this yet but
forwprop already has code to match bit-field-refs with vec-perms, so that's the
reason I preferred extending that.  But of course the whole thing could live in
match.pd as well.

Richard.

> Jeff
>
>


* Re: [PATCH v2] Strip of a vector load which is only used partially.
  2022-05-09  5:11   ` [PATCH v2] " liuhongt
@ 2022-05-10  6:53     ` Richard Biener
  2022-05-12  1:48       ` Hongtao Liu
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Biener @ 2022-05-10  6:53 UTC (permalink / raw)
  To: liuhongt; +Cc: GCC Patches

On Mon, May 9, 2022 at 7:11 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Here's adjused patch.
> Ok for trunk?
>
> Optimize
>
>   _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>   _5 = BIT_FIELD_REF <_4, 128, 0>;
>
> to
>
>   _5 = BIT_FIELD_REF <_1, 128, 128>;
>
> gcc/ChangeLog:
>
>         PR tree-optimization/102583
>         * tree-ssa-forwprop.cc (simplify_bitfield_ref): Extended to a
>         contiguous stride in the VEC_PERM_EXPR.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr102583.c: New test.
>         * gcc.target/i386/pr92645-2.c: Adjust testcase.
>         * gcc.target/i386/pr92645-3.c: Ditto.
> ---
>  gcc/testsuite/gcc.target/i386/pr102583.c  | 30 ++++++++
>  gcc/testsuite/gcc.target/i386/pr92645-2.c |  4 +-
>  gcc/testsuite/gcc.target/i386/pr92645-3.c |  4 +-
>  gcc/tree-ssa-forwprop.cc                  | 89 ++++++++++++++++-------
>  4 files changed, 96 insertions(+), 31 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
> new file mode 100644
> index 00000000000..4ef2f296d0c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102583.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+32\(%.*%ymm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+16\(%.*%xmm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
> +
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef float v8sf __attribute__((vector_size(32)));
> +typedef float v4sf __attribute__((vector_size(16)));
> +typedef float v2sf __attribute__((vector_size(8)));
> +
> +v8sf part (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v8sf) { (float)src[8], (float) src[9], (float)src[10], (float)src[11],
> +      (float)src[12], (float)src[13], (float)src[14], (float)src[15] };
> +}
> +
> +v4sf part1 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v4sf) { (float)src[4], (float)src[5], (float)src[6], (float)src[7] };
> +}
> +
> +v2sf part2 (v16si *srcp)
> +{
> +  v16si src = *srcp;
> +  return (v2sf) { (float)src[4], (float)src[5] };
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c
> index d34ed3aa8e5..f0608de938a 100644
> --- a/gcc/testsuite/gcc.target/i386/pr92645-2.c
> +++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c
> @@ -29,6 +29,6 @@ void odd (v2si *dst, v4si *srcp)
>  }
>
>  /* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */
> -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
>  /* Ideally highpart extraction would elide the permutation as well.  */
> -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c
> index 9c08c9fb632..691011195c9 100644
> --- a/gcc/testsuite/gcc.target/i386/pr92645-3.c
> +++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c
> @@ -32,6 +32,6 @@ void odd (v4sf *dst, v8si *srcp)
>  /* Four conversions, on the smaller vector type, to not convert excess
>     elements.  */
>  /* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */
> -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
>  /* Ideally highpart extraction would elide the VEC_PERM_EXPR as well.  */
> -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 484491fa1c5..f91f738895d 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -2334,8 +2334,10 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
>    gimple *stmt = gsi_stmt (*gsi);
>    gimple *def_stmt;
>    tree op, op0, op1;
> -  tree elem_type;
> -  unsigned idx, size;
> +  tree elem_type, type;
> +  tree p, m, tem;
> +  unsigned HOST_WIDE_INT nelts;
> +  unsigned idx, size, elem_size;
>    enum tree_code code;
>
>    op = gimple_assign_rhs1 (stmt);
> @@ -2353,42 +2355,75 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
>    op1 = TREE_OPERAND (op, 1);
>    code = gimple_assign_rhs_code (def_stmt);
>    elem_type = TREE_TYPE (TREE_TYPE (op0));
> -  if (TREE_TYPE (op) != elem_type)
> +  type = TREE_TYPE (op);
> +  /* Also hanlde vector type.
> +   .i.e.
> +   _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
> +   _11 = BIT_FIELD_REF <_7, 64, 0>;
> +
> +   to
> +
> +   _11 = BIT_FIELD_REF <_1, 64, 64>.  */
> +  if (type != elem_type
> +      && (!VECTOR_TYPE_P (type) || TREE_TYPE (type) != elem_type))
>      return false;

I'm not sure we actually need this check.  I think we could handle
a __int128_t extract from the permuted vector as well, no?

>
> -  size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> +  elem_size = size = TREE_INT_CST_LOW (TYPE_SIZE (type));

That's not going to work with VLA vectors, I think you need to make
elem_size/size poly_uint64 and use tree_to_poly_uint64 (TYPE_SIZE (type))
here.
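
I.e. roughly

  poly_uint64 size = tree_to_poly_uint64 (TYPE_SIZE (type));
  poly_uint64 elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));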

>    if (maybe_ne (bit_field_size (op), size))
>      return false;
>
> -  if (code == VEC_PERM_EXPR
> -      && constant_multiple_p (bit_field_offset (op), size, &idx))
> +  if (code != VEC_PERM_EXPR
> +      || !constant_multiple_p (bit_field_offset (op), size, &idx))

That might be a bit restrictive?  I think we want elem_size here
with elem_size picked from the vector type of the VEC_PERM_EXPR
operand.

> +    return false;
> +
> +  m = gimple_assign_rhs3 (def_stmt);
> +  if (TREE_CODE (m) != VECTOR_CST
> +      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> +    return false;
> +
> +  /* One element.  */
> +  if (type == elem_type)

I think this should work on the sizes, so

      if (known_eq (size, elem_size))
...

that makes sure we do not end up with nelts_op == 1 below and also
handle the case where the BIT_FIELD_REF would do int<->float
punning.

> +    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
> +  else
>      {
> -      tree p, m, tem;
> -      unsigned HOST_WIDE_INT nelts;
> -      m = gimple_assign_rhs3 (def_stmt);
> -      if (TREE_CODE (m) != VECTOR_CST
> -         || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> +      elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> +      unsigned nelts_op;
> +      if (!constant_multiple_p (bit_field_size (op), elem_size, &nelts_op))

bit_field_size (op) == size?  I think we also want to handle
power-of-two nelts_op
only (I can't think of how we get a non-power-of-two, but checking
pow2p_hwi (nelts_op)
would be nice if only for documentation purposes)
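
So maybe

  if (!constant_multiple_p (size, elem_size, &nelts_op)
      || !pow2p_hwi (nelts_op))
    return false;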

>         return false;
> -      idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
> -      idx %= 2 * nelts;
> -      if (idx < nelts)
> -       {
> -         p = gimple_assign_rhs1 (def_stmt);
> -       }
> -      else
> +      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx));
> +      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1));
> +      /* Be in the same vector.  */
> +      if ((start < nelts) != (end < nelts))
> +       return false;
> +      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
>         {
> -         p = gimple_assign_rhs2 (def_stmt);
> -         idx -= nelts;
> +         /* Continuous area.  */
> +         if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) - 1
> +             != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1)))
> +           return false;
>         }
> -      tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> -                   unshare_expr (p), op1, bitsize_int (idx * size));
> -      gimple_assign_set_rhs1 (stmt, tem);
> -      fold_stmt (gsi);
> -      update_stmt (gsi_stmt (*gsi));
> -      return true;
> +      /* Alignment not worse than before.  */
> +      unsigned dest_align = TREE_INT_CST_LOW (TYPE_SIZE (type));
> +      if (start * elem_size % dest_align)
> +       return false;

So I think we want to have start * elem_size be a multiple of size
(the check I suggested above is too restrictive on the _original_ bit-field-ref but
we want to have it for the target bit-field-ref).  I think that's what you check
here but see above for the use of poly-ints.
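
With poly-ints that check could then read something like

  if (!multiple_p (start * elem_size, size))
    return false;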

> +      idx = start;
>      }
>
> -  return false;
> +  idx %= 2 * nelts;

that's odd - why do you need to clamp idx?

> +  if (idx < nelts)
> +    p = gimple_assign_rhs1 (def_stmt);
> +  else
> +    {
> +      p = gimple_assign_rhs2 (def_stmt);
> +      idx -= nelts;
> +    }
> +
> +  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> +               unshare_expr (p), op1, bitsize_int (idx * elem_size));

You don't need to unshare_expr 'p' here.

Otherwise looks OK.

Thanks,
Richard.

> +  gimple_assign_set_rhs1 (stmt, tem);
> +  fold_stmt (gsi);
> +  update_stmt (gsi_stmt (*gsi));
> +  return true;
>  }
>
>  /* Determine whether applying the 2 permutations (mask1 then mask2)
> --
> 2.18.1
>


* Re: [PATCH v2] Strip of a vector load which is only used partially.
  2022-05-10  6:53     ` Richard Biener
@ 2022-05-12  1:48       ` Hongtao Liu
  2022-05-12 11:26         ` Richard Biener
  0 siblings, 1 reply; 9+ messages in thread
From: Hongtao Liu @ 2022-05-12  1:48 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 10259 bytes --]

On Tue, May 10, 2022 at 2:54 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, May 9, 2022 at 7:11 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Here's adjused patch.
> > Ok for trunk?
> >
> > Optimize
> >
> >   _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
> >   _5 = BIT_FIELD_REF <_4, 128, 0>;
> >
> > to
> >
> >   _5 = BIT_FIELD_REF <_1, 128, 128>;
> >
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/102583
> >         * tree-ssa-forwprop.cc (simplify_bitfield_ref): Extended to a
> >         contiguous stride in the VEC_PERM_EXPR.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr102583.c: New test.
> >         * gcc.target/i386/pr92645-2.c: Adjust testcase.
> >         * gcc.target/i386/pr92645-3.c: Ditto.
> > ---
> >  gcc/testsuite/gcc.target/i386/pr102583.c  | 30 ++++++++
> >  gcc/testsuite/gcc.target/i386/pr92645-2.c |  4 +-
> >  gcc/testsuite/gcc.target/i386/pr92645-3.c |  4 +-
> >  gcc/tree-ssa-forwprop.cc                  | 89 ++++++++++++++++-------
> >  4 files changed, 96 insertions(+), 31 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
> > new file mode 100644
> > index 00000000000..4ef2f296d0c
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr102583.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx512f -O2" } */
> > +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+32\(%.*%ymm} 1 } } */
> > +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+16\(%.*%xmm} 1 } } */
> > +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
> > +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
> > +
> > +typedef int v16si __attribute__((vector_size(64)));
> > +typedef float v8sf __attribute__((vector_size(32)));
> > +typedef float v4sf __attribute__((vector_size(16)));
> > +typedef float v2sf __attribute__((vector_size(8)));
> > +
> > +v8sf part (v16si *srcp)
> > +{
> > +  v16si src = *srcp;
> > +  return (v8sf) { (float)src[8], (float) src[9], (float)src[10], (float)src[11],
> > +      (float)src[12], (float)src[13], (float)src[14], (float)src[15] };
> > +}
> > +
> > +v4sf part1 (v16si *srcp)
> > +{
> > +  v16si src = *srcp;
> > +  return (v4sf) { (float)src[4], (float)src[5], (float)src[6], (float)src[7] };
> > +}
> > +
> > +v2sf part2 (v16si *srcp)
> > +{
> > +  v16si src = *srcp;
> > +  return (v2sf) { (float)src[4], (float)src[5] };
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c
> > index d34ed3aa8e5..f0608de938a 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr92645-2.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c
> > @@ -29,6 +29,6 @@ void odd (v2si *dst, v4si *srcp)
> >  }
> >
> >  /* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */
> > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
> > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
> >  /* Ideally highpart extraction would elide the permutation as well.  */
> > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
> > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c
> > index 9c08c9fb632..691011195c9 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr92645-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c
> > @@ -32,6 +32,6 @@ void odd (v4sf *dst, v8si *srcp)
> >  /* Four conversions, on the smaller vector type, to not convert excess
> >     elements.  */
> >  /* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */
> > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
> > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
> >  /* Ideally highpart extraction would elide the VEC_PERM_EXPR as well.  */
> > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
> > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
> > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> > index 484491fa1c5..f91f738895d 100644
> > --- a/gcc/tree-ssa-forwprop.cc
> > +++ b/gcc/tree-ssa-forwprop.cc
> > @@ -2334,8 +2334,10 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
> >    gimple *stmt = gsi_stmt (*gsi);
> >    gimple *def_stmt;
> >    tree op, op0, op1;
> > -  tree elem_type;
> > -  unsigned idx, size;
> > +  tree elem_type, type;
> > +  tree p, m, tem;
> > +  unsigned HOST_WIDE_INT nelts;
> > +  unsigned idx, size, elem_size;
> >    enum tree_code code;
> >
> >    op = gimple_assign_rhs1 (stmt);
> > @@ -2353,42 +2355,75 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
> >    op1 = TREE_OPERAND (op, 1);
> >    code = gimple_assign_rhs_code (def_stmt);
> >    elem_type = TREE_TYPE (TREE_TYPE (op0));
> > -  if (TREE_TYPE (op) != elem_type)
> > +  type = TREE_TYPE (op);
> > +  /* Also hanlde vector type.
> > +   .i.e.
> > +   _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
> > +   _11 = BIT_FIELD_REF <_7, 64, 0>;
> > +
> > +   to
> > +
> > +   _11 = BIT_FIELD_REF <_1, 64, 64>.  */
> > +  if (type != elem_type
> > +      && (!VECTOR_TYPE_P (type) || TREE_TYPE (type) != elem_type))
> >      return false;
>
> I'm not sure we actually need this check.  I think we could handle
> a __int128_t extract from the permuted vector as well, no?
Yes, changed.
>
> >
> > -  size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> > +  elem_size = size = TREE_INT_CST_LOW (TYPE_SIZE (type));
>
> That's not going to work with VLA vectors, I think you need to make
> elem_size/size poly_uint64 and use tree_to_poly_uint64 (TYPE_SIZE (type))
> here.
>
Changed.
> >    if (maybe_ne (bit_field_size (op), size))
> >      return false;
> >
> > -  if (code == VEC_PERM_EXPR
> > -      && constant_multiple_p (bit_field_offset (op), size, &idx))
> > +  if (code != VEC_PERM_EXPR
> > +      || !constant_multiple_p (bit_field_offset (op), size, &idx))
>
> That might be a bit restrictive?  I think we want elem_size here
> with elem_size picked from the vector type of the VEC_PERM_EXPR
> operand.
>
Changed.
> > +    return false;
> > +
> > +  m = gimple_assign_rhs3 (def_stmt);
> > +  if (TREE_CODE (m) != VECTOR_CST
> > +      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> > +    return false;
> > +
> > +  /* One element.  */
> > +  if (type == elem_type)
>
> I think this should work on the sizes, so
>
>       if (known_eq (size, elem_size))
> ...
>
> that makes sure we do not end up with nelts_op == 1 below and also
> handle the case where the BIT_FIELD_REF would do int<->float
> punning.
>
Yes, also for V1TImode <-> TImode punning.
> > +    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
> > +  else
> >      {
> > -      tree p, m, tem;
> > -      unsigned HOST_WIDE_INT nelts;
> > -      m = gimple_assign_rhs3 (def_stmt);
> > -      if (TREE_CODE (m) != VECTOR_CST
> > -         || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> > +      elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> > +      unsigned nelts_op;
> > +      if (!constant_multiple_p (bit_field_size (op), elem_size, &nelts_op))
>
> bit_field_size (op) == size?  I think we also want to handle
> power-of-two nelts_op
> only (I can't think of how we get a non-power-of-two, but checking
> pow2p_hwi (nelts_op)
> woudl be nice if only for documentation purposes)
Add the check.
>
> >         return false;
> > -      idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
> > -      idx %= 2 * nelts;
> > -      if (idx < nelts)
> > -       {
> > -         p = gimple_assign_rhs1 (def_stmt);
> > -       }
> > -      else
> > +      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx));
> > +      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1));
> > +      /* Be in the same vector.  */
> > +      if ((start < nelts) != (end < nelts))
> > +       return false;
> > +      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
> >         {
> > -         p = gimple_assign_rhs2 (def_stmt);
> > -         idx -= nelts;
> > +         /* Continuous area.  */
> > +         if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) - 1
> > +             != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1)))
> > +           return false;
> >         }
> > -      tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> > -                   unshare_expr (p), op1, bitsize_int (idx * size));
> > -      gimple_assign_set_rhs1 (stmt, tem);
> > -      fold_stmt (gsi);
> > -      update_stmt (gsi_stmt (*gsi));
> > -      return true;
> > +      /* Alignment not worse than before.  */
> > +      unsigned dest_align = TREE_INT_CST_LOW (TYPE_SIZE (type));
> > +      if (start * elem_size % dest_align)
> > +       return false;
>
> So I think we want to have start * elem_size be a multiple of size
> (the check I said above is too restrictive on the _original_ bit-field-ref but
> we want to have for the target bit-field-ref).  I think that's what you check
> here but see above for the use of poly-ints.
Changed.
>
> > +      idx = start;
> >      }
> >
> > -  return false;
> > +  idx %= 2 * nelts;
>
> that's odd - why do you need to clamp idx?
Changed.
This was just copied from the original code; it looks like it's not needed
any more, and similarly for your comment below.
>
> > +  if (idx < nelts)
> > +    p = gimple_assign_rhs1 (def_stmt);
> > +  else
> > +    {
> > +      p = gimple_assign_rhs2 (def_stmt);
> > +      idx -= nelts;
> > +    }
> > +
> > +  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> > +               unshare_expr (p), op1, bitsize_int (idx * elem_size));
>
> You don't need to unshare_expr 'p' here.
>
> Otherwise looks OK.
>
Here's the updated patch.
> Thanks,
> Richard.
>
> > +  gimple_assign_set_rhs1 (stmt, tem);
> > +  fold_stmt (gsi);
> > +  update_stmt (gsi_stmt (*gsi));
> > +  return true;
> >  }
> >
> >  /* Determine whether applying the 2 permutations (mask1 then mask2)
> > --
> > 2.18.1
> >



-- 
BR,
Hongtao

[-- Attachment #2: v3-0001-Strip-of-a-vector-load-which-is-only-used-partial.patch --]
[-- Type: application/octet-stream, Size: 7494 bytes --]

From 5bb438c82cedd2c79f579ed6ea17cdd7be3a5900 Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 8 Apr 2022 11:26:46 +0800
Subject: [PATCH v3] Strip of a vector load which is only used partially.

Optimize

  _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
  _5 = BIT_FIELD_REF <_4, 128, 0>;

to

  _5 = BIT_FIELD_REF <_1, 128, 128>;

gcc/ChangeLog:

	PR tree-optimization/102583
	* tree-ssa-forwprop.cc (simplify_bitfield_ref): Extend to handle a
	contiguous stride in the VEC_PERM_EXPR.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr102583.c: New test.
	* gcc.target/i386/pr92645-2.c: Adjust testcase.
	* gcc.target/i386/pr92645-3.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr102583.c  | 30 ++++++++
 gcc/testsuite/gcc.target/i386/pr92645-2.c |  4 +-
 gcc/testsuite/gcc.target/i386/pr92645-3.c |  4 +-
 gcc/tree-ssa-forwprop.cc                  | 87 +++++++++++++++--------
 4 files changed, 93 insertions(+), 32 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c

diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
new file mode 100644
index 00000000000..4ef2f296d0c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr102583.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+32\(%.*%ymm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+16\(%.*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v2sf __attribute__((vector_size(8)));
+
+v8sf part (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v8sf) { (float)src[8], (float) src[9], (float)src[10], (float)src[11],
+      (float)src[12], (float)src[13], (float)src[14], (float)src[15] };
+}
+
+v4sf part1 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v4sf) { (float)src[4], (float)src[5], (float)src[6], (float)src[7] };
+}
+
+v2sf part2 (v16si *srcp)
+{
+  v16si src = *srcp;
+  return (v2sf) { (float)src[4], (float)src[5] };
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c
index d34ed3aa8e5..f0608de938a 100644
--- a/gcc/testsuite/gcc.target/i386/pr92645-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c
@@ -29,6 +29,6 @@ void odd (v2si *dst, v4si *srcp)
 }
 
 /* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
 /* Ideally highpart extraction would elide the permutation as well.  */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c
index 9c08c9fb632..691011195c9 100644
--- a/gcc/testsuite/gcc.target/i386/pr92645-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c
@@ -32,6 +32,6 @@ void odd (v4sf *dst, v8si *srcp)
 /* Four conversions, on the smaller vector type, to not convert excess
    elements.  */
 /* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
 /* Ideally highpart extraction would elide the VEC_PERM_EXPR as well.  */
-/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 484491fa1c5..c5b2a4f9f42 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -2334,8 +2334,10 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
   gimple *stmt = gsi_stmt (*gsi);
   gimple *def_stmt;
   tree op, op0, op1;
-  tree elem_type;
-  unsigned idx, size;
+  tree elem_type, type;
+  tree p, m, tem;
+  unsigned HOST_WIDE_INT nelts, idx;
+  poly_uint64 size, elem_size;
   enum tree_code code;
 
   op = gimple_assign_rhs1 (stmt);
@@ -2353,42 +2355,71 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
   op1 = TREE_OPERAND (op, 1);
   code = gimple_assign_rhs_code (def_stmt);
   elem_type = TREE_TYPE (TREE_TYPE (op0));
-  if (TREE_TYPE (op) != elem_type)
-    return false;
+  type = TREE_TYPE (op);
+  /* Also handle vector types,
+   i.e.
+   _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
+   _11 = BIT_FIELD_REF <_7, 64, 0>;
+
+   to
 
-  size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
+   _11 = BIT_FIELD_REF <_1, 64, 64>.  */
+
+  size = tree_to_poly_uint64 (TYPE_SIZE (type));
   if (maybe_ne (bit_field_size (op), size))
     return false;
 
-  if (code == VEC_PERM_EXPR
-      && constant_multiple_p (bit_field_offset (op), size, &idx))
+  elem_size = tree_to_poly_uint64 (TYPE_SIZE (elem_type));
+  if (code != VEC_PERM_EXPR
+      || !constant_multiple_p (bit_field_offset (op), elem_size, &idx))
+    return false;
+
+  m = gimple_assign_rhs3 (def_stmt);
+  if (TREE_CODE (m) != VECTOR_CST
+      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
+    return false;
+
+  /* One element.  */
+  if (known_eq (size, elem_size))
+    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
+  else
     {
-      tree p, m, tem;
-      unsigned HOST_WIDE_INT nelts;
-      m = gimple_assign_rhs3 (def_stmt);
-      if (TREE_CODE (m) != VECTOR_CST
-	  || !VECTOR_CST_NELTS (m).is_constant (&nelts))
+      unsigned HOST_WIDE_INT nelts_op;
+      if (!constant_multiple_p (size, elem_size, &nelts_op)
+	  || !pow2p_hwi (nelts_op))
 	return false;
-      idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
-      idx %= 2 * nelts;
-      if (idx < nelts)
-	{
-	  p = gimple_assign_rhs1 (def_stmt);
-	}
-      else
+      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx));
+      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1));
+      /* Both ends must be in the same input vector.  */
+      if ((start < nelts) != (end < nelts))
+	return false;
+      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
 	{
-	  p = gimple_assign_rhs2 (def_stmt);
-	  idx -= nelts;
+	  /* Continuous area.  */
+	  if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) - 1
+	      != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1)))
+	    return false;
 	}
-      tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
-		    unshare_expr (p), op1, bitsize_int (idx * size));
-      gimple_assign_set_rhs1 (stmt, tem);
-      fold_stmt (gsi);
-      update_stmt (gsi_stmt (*gsi));
-      return true;
+      /* Alignment not worse than before.  */
+      if (start % nelts_op)
+       return false;
+      idx = start;
     }
 
-  return false;
+  if (idx < nelts)
+    p = gimple_assign_rhs1 (def_stmt);
+  else
+    {
+      p = gimple_assign_rhs2 (def_stmt);
+      idx -= nelts;
+    }
+
+  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
+		p, op1, bitsize_int (idx * elem_size));
+  gimple_assign_set_rhs1 (stmt, tem);
+  fold_stmt (gsi);
+  update_stmt (gsi_stmt (*gsi));
+  return true;
 }
 
 /* Determine whether applying the 2 permutations (mask1 then mask2)
-- 
2.18.1
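
For a concrete picture of the checks added above, here is a minimal sketch
(not part of the patch; the exact GIMPLE shape depends on earlier folding,
and the lane numbers below assume 32-bit elements):

typedef int v8si __attribute__((vector_size(32)));
typedef float v2sf __attribute__((vector_size(8)));

v2sf mid_pair (v8si *srcp)
{
  v8si src = *srcp;
  /* If forwprop sees roughly
       _2 = VEC_PERM_EXPR <_1, _1, { 4, 5, ... }>;
       _3 = BIT_FIELD_REF <_2, 64, 0>;
     then nelts_op = 2, start = 4, end = 5: the selected lanes are
     contiguous, both come from the first operand, and start is a
     multiple of nelts_op, so the extract can be retargeted to
     BIT_FIELD_REF <_1, 64, 128>.  */
  return (v2sf) { (float)src[4], (float)src[5] };
}
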


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] Strip of a vector load which is only used partially.
  2022-05-12  1:48       ` Hongtao Liu
@ 2022-05-12 11:26         ` Richard Biener
  0 siblings, 0 replies; 9+ messages in thread
From: Richard Biener @ 2022-05-12 11:26 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: liuhongt, GCC Patches

On Thu, May 12, 2022 at 3:48 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, May 10, 2022 at 2:54 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Mon, May 9, 2022 at 7:11 AM liuhongt via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Here's adjused patch.
> > > Ok for trunk?
> > >
> > > Optimize
> > >
> > >   _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
> > >   _5 = BIT_FIELD_REF <_4, 128, 0>;
> > >
> > > to
> > >
> > >   _5 = BIT_FIELD_REF <_1, 128, 128>;
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR tree-optimization/102583
> > >         * tree-ssa-forwprop.cc (simplify_bitfield_ref): Extended to a
> > >         contiguous stride in the VEC_PERM_EXPR.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr102583.c: New test.
> > >         * gcc.target/i386/pr92645-2.c: Adjust testcase.
> > >         * gcc.target/i386/pr92645-3.c: Ditto.
> > > ---
> > >  gcc/testsuite/gcc.target/i386/pr102583.c  | 30 ++++++++
> > >  gcc/testsuite/gcc.target/i386/pr92645-2.c |  4 +-
> > >  gcc/testsuite/gcc.target/i386/pr92645-3.c |  4 +-
> > >  gcc/tree-ssa-forwprop.cc                  | 89 ++++++++++++++++-------
> > >  4 files changed, 96 insertions(+), 31 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
> > >
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
> > > new file mode 100644
> > > index 00000000000..4ef2f296d0c
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr102583.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-mavx512f -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+32\(%.*%ymm} 1 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+16\(%.*%xmm} 1 } } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
> > > +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
> > > +
> > > +typedef int v16si __attribute__((vector_size(64)));
> > > +typedef float v8sf __attribute__((vector_size(32)));
> > > +typedef float v4sf __attribute__((vector_size(16)));
> > > +typedef float v2sf __attribute__((vector_size(8)));
> > > +
> > > +v8sf part (v16si *srcp)
> > > +{
> > > +  v16si src = *srcp;
> > > +  return (v8sf) { (float)src[8], (float) src[9], (float)src[10], (float)src[11],
> > > +      (float)src[12], (float)src[13], (float)src[14], (float)src[15] };
> > > +}
> > > +
> > > +v4sf part1 (v16si *srcp)
> > > +{
> > > +  v16si src = *srcp;
> > > +  return (v4sf) { (float)src[4], (float)src[5], (float)src[6], (float)src[7] };
> > > +}
> > > +
> > > +v2sf part2 (v16si *srcp)
> > > +{
> > > +  v16si src = *srcp;
> > > +  return (v2sf) { (float)src[4], (float)src[5] };
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c
> > > index d34ed3aa8e5..f0608de938a 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr92645-2.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c
> > > @@ -29,6 +29,6 @@ void odd (v2si *dst, v4si *srcp)
> > >  }
> > >
> > >  /* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */
> > > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
> > > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
> > >  /* Ideally highpart extraction would elide the permutation as well.  */
> > > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
> > > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
> > > diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c
> > > index 9c08c9fb632..691011195c9 100644
> > > --- a/gcc/testsuite/gcc.target/i386/pr92645-3.c
> > > +++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c
> > > @@ -32,6 +32,6 @@ void odd (v4sf *dst, v8si *srcp)
> > >  /* Four conversions, on the smaller vector type, to not convert excess
> > >     elements.  */
> > >  /* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */
> > > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */
> > > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" { xfail *-*-* } } } */
> > >  /* Ideally highpart extraction would elide the VEC_PERM_EXPR as well.  */
> > > -/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */
> > > +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" } } */
> > > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> > > index 484491fa1c5..f91f738895d 100644
> > > --- a/gcc/tree-ssa-forwprop.cc
> > > +++ b/gcc/tree-ssa-forwprop.cc
> > > @@ -2334,8 +2334,10 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
> > >    gimple *stmt = gsi_stmt (*gsi);
> > >    gimple *def_stmt;
> > >    tree op, op0, op1;
> > > -  tree elem_type;
> > > -  unsigned idx, size;
> > > +  tree elem_type, type;
> > > +  tree p, m, tem;
> > > +  unsigned HOST_WIDE_INT nelts;
> > > +  unsigned idx, size, elem_size;
> > >    enum tree_code code;
> > >
> > >    op = gimple_assign_rhs1 (stmt);
> > > @@ -2353,42 +2355,75 @@ simplify_bitfield_ref (gimple_stmt_iterator *gsi)
> > >    op1 = TREE_OPERAND (op, 1);
> > >    code = gimple_assign_rhs_code (def_stmt);
> > >    elem_type = TREE_TYPE (TREE_TYPE (op0));
> > > -  if (TREE_TYPE (op) != elem_type)
> > > +  type = TREE_TYPE (op);
> > > +  /* Also hanlde vector type.
> > > +   .i.e.
> > > +   _7 = VEC_PERM_EXPR <_1, _1, { 2, 3, 2, 3 }>;
> > > +   _11 = BIT_FIELD_REF <_7, 64, 0>;
> > > +
> > > +   to
> > > +
> > > +   _11 = BIT_FIELD_REF <_1, 64, 64>.  */
> > > +  if (type != elem_type
> > > +      && (!VECTOR_TYPE_P (type) || TREE_TYPE (type) != elem_type))
> > >      return false;
> >
> > I'm not sure we actually need this check.  I think we could handle
> > a __int128_t extract from the permuted vector as well, no?
> Yes, changed.
> >
> > >
> > > -  size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> > > +  elem_size = size = TREE_INT_CST_LOW (TYPE_SIZE (type));
> >
> > That's not going to work with VLA vectors, I think you need to make
> > elem_size/size poly_uint64 and use tree_to_poly_uint64 (TYPE_SIZE (type))
> > here.
> >
> Changed.
> > >    if (maybe_ne (bit_field_size (op), size))
> > >      return false;
> > >
> > > -  if (code == VEC_PERM_EXPR
> > > -      && constant_multiple_p (bit_field_offset (op), size, &idx))
> > > +  if (code != VEC_PERM_EXPR
> > > +      || !constant_multiple_p (bit_field_offset (op), size, &idx))
> >
> > That might be a bit restrictive?  I think we want elem_size here
> > with elem_size picked from the vector type of the VEC_PERM_EXPR
> > operand.
> >
> Changed.
> > > +    return false;
> > > +
> > > +  m = gimple_assign_rhs3 (def_stmt);
> > > +  if (TREE_CODE (m) != VECTOR_CST
> > > +      || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> > > +    return false;
> > > +
> > > +  /* One element.  */
> > > +  if (type == elem_type)
> >
> > I think this should work on the sizes, so
> >
> >       if (known_eq (size, elem_size))
> > ...
> >
> > that makes sure we do not end up with nelts_op == 1 below and also
> > handle the case where the BIT_FIELD_REF would do int<->float
> > punning.
> >
> Yes, also for V1TImode <-> TImode punning.
> > > +    idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
> > > +  else
> > >      {
> > > -      tree p, m, tem;
> > > -      unsigned HOST_WIDE_INT nelts;
> > > -      m = gimple_assign_rhs3 (def_stmt);
> > > -      if (TREE_CODE (m) != VECTOR_CST
> > > -         || !VECTOR_CST_NELTS (m).is_constant (&nelts))
> > > +      elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> > > +      unsigned nelts_op;
> > > +      if (!constant_multiple_p (bit_field_size (op), elem_size, &nelts_op))
> >
> > bit_field_size (op) == size?  I think we also want to handle
> > power-of-two nelts_op
> > only (I can't think of how we get a non-power-of-two, but checking
> > pow2p_hwi (nelts_op)
> > would be nice if only for documentation purposes)
> Add the check.
> >
> > >         return false;
> > > -      idx = TREE_INT_CST_LOW (VECTOR_CST_ELT (m, idx));
> > > -      idx %= 2 * nelts;
> > > -      if (idx < nelts)
> > > -       {
> > > -         p = gimple_assign_rhs1 (def_stmt);
> > > -       }
> > > -      else
> > > +      unsigned start = TREE_INT_CST_LOW (vector_cst_elt (m, idx));
> > > +      unsigned end = TREE_INT_CST_LOW (vector_cst_elt (m, idx + nelts_op - 1));
> > > +      /* Be in the same vector.  */
> > > +      if ((start < nelts) != (end < nelts))
> > > +       return false;
> > > +      for (unsigned HOST_WIDE_INT i = 1; i != nelts_op; i++)
> > >         {
> > > -         p = gimple_assign_rhs2 (def_stmt);
> > > -         idx -= nelts;
> > > +         /* Continuous area.  */
> > > +         if (TREE_INT_CST_LOW (vector_cst_elt (m, idx + i)) - 1
> > > +             != TREE_INT_CST_LOW (vector_cst_elt (m, idx + i - 1)))
> > > +           return false;
> > >         }
> > > -      tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> > > -                   unshare_expr (p), op1, bitsize_int (idx * size));
> > > -      gimple_assign_set_rhs1 (stmt, tem);
> > > -      fold_stmt (gsi);
> > > -      update_stmt (gsi_stmt (*gsi));
> > > -      return true;
> > > +      /* Alignment not worse than before.  */
> > > +      unsigned dest_align = TREE_INT_CST_LOW (TYPE_SIZE (type));
> > > +      if (start * elem_size % dest_align)
> > > +       return false;
> >
> > So I think we want to have start * elem_size be a multiple of size
> > (the check I said above is too restrictive on the _original_ bit-field-ref but
> > we want to have for the target bit-field-ref).  I think that's what you check
> > here but see above for the use of poly-ints.
> Changed.
> >
> > > +      idx = start;
> > >      }
> > >
> > > -  return false;
> > > +  idx %= 2 * nelts;
> >
> > that's odd - why do you need to clamp idx?
> Changed.
> This was just copied from the original code; it looks like it's not needed
> any more, similarly for your comment below.
> >
> > > +  if (idx < nelts)
> > > +    p = gimple_assign_rhs1 (def_stmt);
> > > +  else
> > > +    {
> > > +      p = gimple_assign_rhs2 (def_stmt);
> > > +      idx -= nelts;
> > > +    }
> > > +
> > > +  tem = build3 (BIT_FIELD_REF, TREE_TYPE (op),
> > > +               unshare_expr (p), op1, bitsize_int (idx * elem_size));
> >
> > You don't need to unshare_expr 'p' here.
> >
> > Otherwise looks OK.
> >
> Here's updated patch.

Awesome.

OK for trunk if it bootstraps/tests.

Thanks,
Richard.

> > Thanks,
> > Richard.
> >
> > > +  gimple_assign_set_rhs1 (stmt, tem);
> > > +  fold_stmt (gsi);
> > > +  update_stmt (gsi_stmt (*gsi));
> > > +  return true;
> > >  }
> > >
> > >  /* Determine whether applying the 2 permutations (mask1 then mask2)
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao
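
To make the alignment point discussed above concrete, a hypothetical case
the check is meant to reject (whether the middle end actually forms this
exact VEC_PERM_EXPR depends on earlier passes):

typedef int v8si __attribute__((vector_size(32)));
typedef float v4sf __attribute__((vector_size(16)));

v4sf unaligned_quad (v8si *srcp)
{
  v8si src = *srcp;
  /* A selector starting at lane 1 with a 4-lane (128-bit) extract:
     the lanes are contiguous, but 1 * 32 bits is not a multiple of the
     128-bit access size, so the rewrite would be declined and the
     permutation kept.  */
  return (v4sf) { (float)src[1], (float)src[2], (float)src[3], (float)src[4] };
}
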

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Strip of a vector load which is only used partially.
  2022-05-10  6:30     ` Richard Biener
@ 2022-06-26 18:59       ` Jeff Law
  0 siblings, 0 replies; 9+ messages in thread
From: Jeff Law @ 2022-06-26 18:59 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches



On 5/10/2022 12:30 AM, Richard Biener wrote:
> On Tue, May 10, 2022 at 12:58 AM Jeff Law via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>>
>> On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
>>> On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>>>> Optimize
>>>>
>>>>     _1 = *srcp_3(D);
>>>>     _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>>>>     _5 = BIT_FIELD_REF <_4, 128, 0>;
>>>>
>>>> to
>>>>
>>>>     _1 = *srcp_3(D);
>>>>     _5 = BIT_FIELD_REF <_1, 128, 128>;
>>>>
>>>> the upper will finally be optimized to
>>>>
>>>> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>>>>
>>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
>>>> Ok for trunk?
>>> Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
>>> handle this in the
>>>
>>>     if (code == VEC_PERM_EXPR
>>>         && constant_multiple_p (bit_field_offset (op), size, &idx))
>>>       {
>>>
>>> part of the code - maybe that needs to be enhanced to cover
>>> a contiguous stride in the VEC_PERM_EXPR.  I see
>>> we have
>>>
>>>     size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>>>     if (maybe_ne (bit_field_size (op), size))
>>>       return false;
>>>
>>> where it will currently bail, so adjust that to check for a
>>> constant multiple.  I also think we should only handle the
>>> case where the new bit_field_offset alignment is not
>>> worse than the original one.
>>>
>>> That said, I'd prefer if you integrate this transform with
>>> simplify_bitfield_ref.
>> I've got a hack here that tries to do something similar, but it's trying
>> to catch the case where we CONSTRUCTOR feeds the BIT_FIELD_REF.  It
>> walks the CONSTRUCTOR elements to see if an element has the right
>> offset/size to satisify the BIT_FIELD_REF. For x264 we're often able to
>> eliminate the VEC_PERMUTE entirely and just forward operands into the
>> BIT_FIELD_REF.
>>
>> I was leaning towards moving those bits into match.pd before submitting,
>> but if you'd prefer them in tree-ssa-forwprop, that's even easier.
> I think when deciding where to put things it's important to look where related
> transforms reside.  We already do have a (simplify (BIT_FIELD_REF
> CONSTRUCTOR@ ...))
> pattern which should also handle your case already.  So instead of
> adding something
> new it would be nice to figure why it doesn't handle the case you are
> interested in and
> eventually just adjust the existing pattern.
I'm aware of that pattern.  I've found it painfully inadequate in every 
case I've looked at.    In general I've found tree-ssa-forwprop is a 
reasonable place to prototype a lot of stuff to see how it works in 
practice, but I think match.pd is better for most of the transformations 
in the long term.

It sounds like you'd prefer this particular case to move into match.pd.  
Fine.  That's what I'd originally planned to do.  It's pretty simple 
support code, so doing it in match.pd shouldn't be too hard.

Jeff
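
A rough sketch of the CONSTRUCTOR-fed shape described above (the function
and types are made up for illustration; the GIMPLE mentioned in the comment
is the intended shape, not an actual dump):

typedef float v4sf __attribute__((vector_size(16)));
typedef float v2sf __attribute__((vector_size(8)));

v2sf low_pair (float a, float b, float c, float d)
{
  v4sf t = (v4sf) { a, b, c, d };   /* a CONSTRUCTOR <a, b, c, d> at GIMPLE level */
  v2sf lo;
  /* Reading back the low 64 bits typically shows up as
     BIT_FIELD_REF <t, 64, 0>: the slice lines up exactly with elements
     a and b, so the ref can be satisfied from the CONSTRUCTOR operands
     without materialising t or any permutation feeding it.  */
  __builtin_memcpy (&lo, &t, sizeof (lo));
  return lo;
}
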

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-06-26 18:59 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-05  5:04 [PATCH] Strip of a vector load which is only used partially liuhongt
2022-05-05  8:26 ` Richard Biener
2022-05-09  5:11   ` [PATCH v2] " liuhongt
2022-05-10  6:53     ` Richard Biener
2022-05-12  1:48       ` Hongtao Liu
2022-05-12 11:26         ` Richard Biener
2022-05-09 22:57   ` [PATCH] " Jeff Law
2022-05-10  6:30     ` Richard Biener
2022-06-26 18:59       ` Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).