[PATCH 1/8 v9]middle-end slp: Support optimizing load distribution

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution
@ 2020-12-28 13:35 Tamar Christina
  2020-12-28 13:36 ` [PATCH 2/8 v9]middle-end slp: fix is_linear_load_p to prevent multiple answers Tamar Christina
                   ` (7 more replies)
  0 siblings, 8 replies; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 5403 bytes --]

Hi All,

This introduces a post processing step for the pattern matcher to flatten
permutes introduced by the complex multiplications patterns. 

This performs a blend early such that SLP is not cancelled by the LOAD_LANES
permute.  This is a temporary workaround to the fact that loads are not CSEd
during building and is required to produce efficient code.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp.c (optimize_load_redistribution_1): New.
	(optimize_load_redistribution): New.
	(vect_match_slp_patterns): Use it.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325cf8406466ac25c 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+				hash_set<slp_tree> *visited, slp_tree root)
+{
+  if (visited->add (root))
+    return NULL;
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now, we don't know anything about externals so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
+      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
+      && SLP_TREE_LANE_PERMUTATION (root).exists ()
+      && !SLP_TREE_SCALAR_STMTS (root).exists ())
+    {
+      /* First convert this node into a load node and add it to the leaves
+         list and flatten the permute from a lane to a load one.  If it's
+         unneeded it will be elided later.  */
+      auto_vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      load_permutation_t load_perm;
+      load_perm.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+      for (unsigned j = 0; j < lane_perm.length (); j++)
+        {
+          std::pair<unsigned, unsigned> perm = lane_perm[j];
+	  /* This isn't strictly needed, but this function is a temporary
+	     one for specifically pattern matching, so don't want it to
+	     optimize things the remainder of the pipeline will.  */
+	  if (perm.first != j)
+	    goto next;
+          node = SLP_TREE_CHILDREN (root)[perm.first];
+
+	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+	    return NULL;
+
+	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
+        }
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "converting stmts on permute node %p\n", root);
+
+      slp_tree *value = bst_map->get (stmts);
+      if (value)
+	node = *value;
+      else
+	{
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+	    SLP_TREE_REF_COUNT (node)++;
+
+	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
+	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
+	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
+	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
+	  bst_map->put (stmts_cpy, node);
+	}
+      SLP_TREE_REF_COUNT (node)++;
+
+      return node;
+    }
+
+next:
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, visited, node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
+   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.  Such
+   NODES are then directly converted into LOADS themselves.  The nodes are
+   CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+			      slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+  hash_set<slp_tree> visited;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited, node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2276,7 +2385,7 @@ static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 			 hash_set<slp_tree> *visited,
 			 slp_tree_to_load_perm_map_t *perm_cache,
-			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+			 scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2291,6 +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 
   if (found_p)
     {
+      optimize_load_redistribution (bst_map, *ref_node);
+
       if (dump_enabled_p ())
 	{
 	  dump_printf_loc (MSG_NOTE, vect_location,


-- 

[-- Attachment #2: rb13956.patch --]
[-- Type: text/x-diff, Size: 4743 bytes --]

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325cf8406466ac25c 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+				hash_set<slp_tree> *visited, slp_tree root)
+{
+  if (visited->add (root))
+    return NULL;
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now, we don't know anything about externals so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
+      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
+      && SLP_TREE_LANE_PERMUTATION (root).exists ()
+      && !SLP_TREE_SCALAR_STMTS (root).exists ())
+    {
+      /* First convert this node into a load node and add it to the leaves
+         list and flatten the permute from a lane to a load one.  If it's
+         unneeded it will be elided later.  */
+      auto_vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      load_permutation_t load_perm;
+      load_perm.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+      for (unsigned j = 0; j < lane_perm.length (); j++)
+        {
+          std::pair<unsigned, unsigned> perm = lane_perm[j];
+	  /* This isn't strictly needed, but this function is a temporary
+	     one for specifically pattern matching, so don't want it to
+	     optimize things the remainder of the pipeline will.  */
+	  if (perm.first != j)
+	    goto next;
+          node = SLP_TREE_CHILDREN (root)[perm.first];
+
+	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+	    return NULL;
+
+	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
+        }
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "converting stmts on permute node %p\n", root);
+
+      slp_tree *value = bst_map->get (stmts);
+      if (value)
+	node = *value;
+      else
+	{
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+	    SLP_TREE_REF_COUNT (node)++;
+
+	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
+	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
+	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
+	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
+	  bst_map->put (stmts_cpy, node);
+	}
+      SLP_TREE_REF_COUNT (node)++;
+
+      return node;
+    }
+
+next:
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, visited, node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
+   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.  Such
+   NODES are then directly converted into LOADS themselves.  The nodes are
+   CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+			      slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+  hash_set<slp_tree> visited;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited, node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2276,7 +2385,7 @@ static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 			 hash_set<slp_tree> *visited,
 			 slp_tree_to_load_perm_map_t *perm_cache,
-			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+			 scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2291,6 +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 
   if (found_p)
     {
+      optimize_load_redistribution (bst_map, *ref_node);
+
       if (dump_enabled_p ())
 	{
 	  dump_printf_loc (MSG_NOTE, vect_location,


^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 2/8 v9]middle-end slp: fix is_linear_load_p to prevent multiple answers
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
@ 2020-12-28 13:36 ` Tamar Christina
  2021-01-07 13:17   ` Richard Biener
  2020-12-28 13:36 ` [PATCH 3/8 v9]middle-end slp: handle externals correctly in linear_loads_p Tamar Christina
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 1900 bytes --]

Hi All,

This fixes an issue where is_linear_load_p could return the incorrect
permutation kind because it is singe pass.

This arranges the candidates in such a way that there won't be any ambiguity so
that the function can still be linear but give correct values.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp-patterns.c (is_linear_load_p): Fix ambiguity.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index fede88923af8521ee4954c8ae27b0e589f975610..7fd79d91c6ba4ccdbf361307a6105fb7e46aa961 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -140,32 +140,32 @@ is_linear_load_p (load_permutation_t loads)
 
   unsigned load, i;
   complex_perm_kinds_t candidates[4]
-    = { PERM_EVENODD
-      , PERM_ODDEVEN
-      , PERM_ODDODD
+    = { PERM_ODDODD
       , PERM_EVENEVEN
+      , PERM_EVENODD
+      , PERM_ODDEVEN
       };
 
   int valid_patterns = 4;
-  FOR_EACH_VEC_ELT_FROM (loads, i, load, 1)
+  FOR_EACH_VEC_ELT (loads, i, load)
     {
-      if (candidates[0] != PERM_UNKNOWN && load != i)
+      if (candidates[0] != PERM_UNKNOWN && load != 1)
 	{
 	  candidates[0] = PERM_UNKNOWN;
 	  valid_patterns--;
 	}
-      if (candidates[1] != PERM_UNKNOWN
-	  && load != (i % 2 == 0 ? i + 1 : i - 1))
+      if (candidates[1] != PERM_UNKNOWN && load != 0)
 	{
 	  candidates[1] = PERM_UNKNOWN;
 	  valid_patterns--;
 	}
-      if (candidates[2] != PERM_UNKNOWN && load != 1)
+      if (candidates[2] != PERM_UNKNOWN && load != i)
 	{
 	  candidates[2] = PERM_UNKNOWN;
 	  valid_patterns--;
 	}
-      if (candidates[3] != PERM_UNKNOWN && load != 0)
+      if (candidates[3] != PERM_UNKNOWN
+	  && load != (i % 2 == 0 ? i + 1 : i - 1))
 	{
 	  candidates[3] = PERM_UNKNOWN;
 	  valid_patterns--;


-- 

[-- Attachment #2: rb13957.patch --]
[-- Type: text/x-diff, Size: 1402 bytes --]

diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index fede88923af8521ee4954c8ae27b0e589f975610..7fd79d91c6ba4ccdbf361307a6105fb7e46aa961 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -140,32 +140,32 @@ is_linear_load_p (load_permutation_t loads)
 
   unsigned load, i;
   complex_perm_kinds_t candidates[4]
-    = { PERM_EVENODD
-      , PERM_ODDEVEN
-      , PERM_ODDODD
+    = { PERM_ODDODD
       , PERM_EVENEVEN
+      , PERM_EVENODD
+      , PERM_ODDEVEN
       };
 
   int valid_patterns = 4;
-  FOR_EACH_VEC_ELT_FROM (loads, i, load, 1)
+  FOR_EACH_VEC_ELT (loads, i, load)
     {
-      if (candidates[0] != PERM_UNKNOWN && load != i)
+      if (candidates[0] != PERM_UNKNOWN && load != 1)
 	{
 	  candidates[0] = PERM_UNKNOWN;
 	  valid_patterns--;
 	}
-      if (candidates[1] != PERM_UNKNOWN
-	  && load != (i % 2 == 0 ? i + 1 : i - 1))
+      if (candidates[1] != PERM_UNKNOWN && load != 0)
 	{
 	  candidates[1] = PERM_UNKNOWN;
 	  valid_patterns--;
 	}
-      if (candidates[2] != PERM_UNKNOWN && load != 1)
+      if (candidates[2] != PERM_UNKNOWN && load != i)
 	{
 	  candidates[2] = PERM_UNKNOWN;
 	  valid_patterns--;
 	}
-      if (candidates[3] != PERM_UNKNOWN && load != 0)
+      if (candidates[3] != PERM_UNKNOWN
+	  && load != (i % 2 == 0 ? i + 1 : i - 1))
 	{
 	  candidates[3] = PERM_UNKNOWN;
 	  valid_patterns--;


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/8 v9]middle-end slp: fix is_linear_load_p to prevent multiple answers
  2020-12-28 13:36 ` [PATCH 2/8 v9]middle-end slp: fix is_linear_load_p to prevent multiple answers Tamar Christina
@ 2021-01-07 13:17   ` Richard Biener
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-07 13:17 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd

On Mon, 28 Dec 2020, Tamar Christina wrote:

> Hi All,
> 
> This fixes an issue where is_linear_load_p could return the incorrect
> permutation kind because it is singe pass.
> 
> This arranges the candidates in such a way that there won't be any ambiguity so
> that the function can still be linear but give correct values.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?

OK.  I see no testcases in any of the patch in this series though.

Thanks,
Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-slp-patterns.c (is_linear_load_p): Fix ambiguity.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index fede88923af8521ee4954c8ae27b0e589f975610..7fd79d91c6ba4ccdbf361307a6105fb7e46aa961 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -140,32 +140,32 @@ is_linear_load_p (load_permutation_t loads)
>  
>    unsigned load, i;
>    complex_perm_kinds_t candidates[4]
> -    = { PERM_EVENODD
> -      , PERM_ODDEVEN
> -      , PERM_ODDODD
> +    = { PERM_ODDODD
>        , PERM_EVENEVEN
> +      , PERM_EVENODD
> +      , PERM_ODDEVEN
>        };
>  
>    int valid_patterns = 4;
> -  FOR_EACH_VEC_ELT_FROM (loads, i, load, 1)
> +  FOR_EACH_VEC_ELT (loads, i, load)
>      {
> -      if (candidates[0] != PERM_UNKNOWN && load != i)
> +      if (candidates[0] != PERM_UNKNOWN && load != 1)
>  	{
>  	  candidates[0] = PERM_UNKNOWN;
>  	  valid_patterns--;
>  	}
> -      if (candidates[1] != PERM_UNKNOWN
> -	  && load != (i % 2 == 0 ? i + 1 : i - 1))
> +      if (candidates[1] != PERM_UNKNOWN && load != 0)
>  	{
>  	  candidates[1] = PERM_UNKNOWN;
>  	  valid_patterns--;
>  	}
> -      if (candidates[2] != PERM_UNKNOWN && load != 1)
> +      if (candidates[2] != PERM_UNKNOWN && load != i)
>  	{
>  	  candidates[2] = PERM_UNKNOWN;
>  	  valid_patterns--;
>  	}
> -      if (candidates[3] != PERM_UNKNOWN && load != 0)
> +      if (candidates[3] != PERM_UNKNOWN
> +	  && load != (i % 2 == 0 ? i + 1 : i - 1))
>  	{
>  	  candidates[3] = PERM_UNKNOWN;
>  	  valid_patterns--;
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 3/8 v9]middle-end slp: handle externals correctly in linear_loads_p
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
  2020-12-28 13:36 ` [PATCH 2/8 v9]middle-end slp: fix is_linear_load_p to prevent multiple answers Tamar Christina
@ 2020-12-28 13:36 ` Tamar Christina
  2021-01-07 13:17   ` Richard Biener
  2020-12-28 13:37 ` [PATCH 4/8 v9]middle-end slp: upgrade complex add to new format and fix memory leaks Tamar Christina
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 1761 bytes --]

Hi All,

This fixes a bug with externals and linear_loads_p where I forgot to save the
value before returning.

It also fixes handling of nodes with multiple children on a non VEC_PERM node.
There the child iteration would already resolve the kind and the loads are All
expected to be the same if valid so just return one.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp-patterns.c (linear_loads_p): Fix externals.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 7fd79d91c6ba4ccdbf361307a6105fb7e46aa961..235c0741c78b04f14725751ec399c0fdb32a0823 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -229,6 +229,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
   else if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
     {
       retval.first = PERM_TOP;
+      perm_cache->put (root, retval);
       return retval;
     }
 
@@ -241,6 +242,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
       complex_load_perm_t res = linear_loads_p (perm_cache, child);
       kind = vect_merge_perms (kind, res.first);
       /* Unknown and Top are not valid on blends as they produce no permute.  */
+      retval.first = kind;
       if (kind == PERM_UNKNOWN || kind == PERM_TOP)
 	return retval;
       all_loads.safe_push (res.second);
@@ -258,7 +260,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
       retval.first = kind;
       retval.second = nloads;
     }
-  else if (all_loads.length () == 1)
+  else
     {
       retval.first = kind;
       retval.second = all_loads[0];


-- 

[-- Attachment #2: rb13958.patch --]
[-- Type: text/x-diff, Size: 1208 bytes --]

diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 7fd79d91c6ba4ccdbf361307a6105fb7e46aa961..235c0741c78b04f14725751ec399c0fdb32a0823 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -229,6 +229,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
   else if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
     {
       retval.first = PERM_TOP;
+      perm_cache->put (root, retval);
       return retval;
     }
 
@@ -241,6 +242,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
       complex_load_perm_t res = linear_loads_p (perm_cache, child);
       kind = vect_merge_perms (kind, res.first);
       /* Unknown and Top are not valid on blends as they produce no permute.  */
+      retval.first = kind;
       if (kind == PERM_UNKNOWN || kind == PERM_TOP)
 	return retval;
       all_loads.safe_push (res.second);
@@ -258,7 +260,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
       retval.first = kind;
       retval.second = nloads;
     }
-  else if (all_loads.length () == 1)
+  else
     {
       retval.first = kind;
       retval.second = all_loads[0];


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 3/8 v9]middle-end slp: handle externals correctly in linear_loads_p
  2020-12-28 13:36 ` [PATCH 3/8 v9]middle-end slp: handle externals correctly in linear_loads_p Tamar Christina
@ 2021-01-07 13:17   ` Richard Biener
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-07 13:17 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Mon, 28 Dec 2020, Tamar Christina wrote:

> Hi All,
> 
> This fixes a bug with externals and linear_loads_p where I forgot to save the
> value before returning.
> 
> It also fixes handling of nodes with multiple children on a non VEC_PERM node.
> There the child iteration would already resolve the kind and the loads are All
> expected to be the same if valid so just return one.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?

OK.

Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-slp-patterns.c (linear_loads_p): Fix externals.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index 7fd79d91c6ba4ccdbf361307a6105fb7e46aa961..235c0741c78b04f14725751ec399c0fdb32a0823 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -229,6 +229,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
>    else if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
>      {
>        retval.first = PERM_TOP;
> +      perm_cache->put (root, retval);
>        return retval;
>      }
>  
> @@ -241,6 +242,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
>        complex_load_perm_t res = linear_loads_p (perm_cache, child);
>        kind = vect_merge_perms (kind, res.first);
>        /* Unknown and Top are not valid on blends as they produce no permute.  */
> +      retval.first = kind;
>        if (kind == PERM_UNKNOWN || kind == PERM_TOP)
>  	return retval;
>        all_loads.safe_push (res.second);
> @@ -258,7 +260,7 @@ linear_loads_p (slp_tree_to_load_perm_map_t *perm_cache, slp_tree root)
>        retval.first = kind;
>        retval.second = nloads;
>      }
> -  else if (all_loads.length () == 1)
> +  else
>      {
>        retval.first = kind;
>        retval.second = all_loads[0];
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 4/8 v9]middle-end slp: upgrade complex add to new format and fix memory leaks
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
  2020-12-28 13:36 ` [PATCH 2/8 v9]middle-end slp: fix is_linear_load_p to prevent multiple answers Tamar Christina
  2020-12-28 13:36 ` [PATCH 3/8 v9]middle-end slp: handle externals correctly in linear_loads_p Tamar Christina
@ 2020-12-28 13:37 ` Tamar Christina
  2021-01-07 13:18   ` Richard Biener
  2020-12-28 13:37 ` [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate Tamar Christina
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:37 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 3360 bytes --]

Hi All,

This fixes a memory leak in complex_add_pattern because I was not calling
vect_free_slp_tree when dissolving one side of the TWO_OPERANDS nodes.

Secondly it also upgrades the class to the new inteface required by the other
patterns.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp-patterns.c (class complex_pattern,
	class complex_add_pattern): Add parameters to matches.
	(complex_add_pattern::build): Free memory.
	(complex_add_pattern::matches): Move validation end of match.
	(complex_add_pattern::recognize): Likewise.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 235c0741c78b04f14725751ec399c0fdb32a0823..dbc58f7c53868ed431fc67de1f0162eb0d3b2c24 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -503,7 +503,7 @@ class complex_pattern : public vect_pattern
     void build (vec_info *);
 
     static internal_fn
-    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
 	     vec<slp_tree> *);
 };
 
@@ -608,11 +608,17 @@ class complex_add_pattern : public complex_pattern
   public:
     void build (vec_info *);
     static internal_fn
-    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
 	     vec<slp_tree> *);
 
     static vect_pattern*
     recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_add_pattern (node, m_ops, ifn);
+    }
 };
 
 /* Perform a replacement of the detected complex add pattern with the new
@@ -630,6 +636,11 @@ complex_add_pattern::build (vec_info *vinfo)
   nodes.quick_push (children[0]);
   nodes.quick_push (vect_build_swap_evenodd_node (children[1]));
 
+  SLP_TREE_REF_COUNT (nodes[0])++;
+  SLP_TREE_REF_COUNT (nodes[1])++;
+  vect_free_slp_tree (this->m_ops[0]);
+  vect_free_slp_tree (this->m_ops[1]);
+
   SLP_TREE_CHILDREN (*this->m_node).truncate (0);
   SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);
 
@@ -650,7 +661,7 @@ complex_add_pattern::build (vec_info *vinfo)
 internal_fn
 complex_add_pattern::matches (complex_operation_t op,
 			      slp_tree_to_load_perm_map_t *perm_cache,
-			      vec<slp_tree> *ops)
+			      slp_tree *node, vec<slp_tree> *ops)
 {
   internal_fn ifn = IFN_LAST;
 
@@ -685,6 +696,9 @@ complex_add_pattern::matches (complex_operation_t op,
   if (linear_loads_p (perm_cache, children[1]).first != PERM_ODDEVEN)
     return IFN_LAST;
 
+  if (!vect_pattern_validate_optab (ifn, *node))
+    return IFN_LAST;
+
   return ifn;
 }
 
@@ -697,8 +711,9 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
   auto_vec<slp_tree> ops;
   complex_operation_t op
     = vect_detect_pair_op (*node, true, &ops);
-  internal_fn ifn = complex_add_pattern::matches (op, perm_cache, &ops);
-  if (!vect_pattern_validate_optab (ifn, *node))
+  internal_fn ifn
+    = complex_add_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
     return NULL;
 
   return new complex_add_pattern (node, &ops, ifn);


-- 

[-- Attachment #2: rb13959.patch --]
[-- Type: text/x-diff, Size: 2688 bytes --]

diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 235c0741c78b04f14725751ec399c0fdb32a0823..dbc58f7c53868ed431fc67de1f0162eb0d3b2c24 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -503,7 +503,7 @@ class complex_pattern : public vect_pattern
     void build (vec_info *);
 
     static internal_fn
-    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
 	     vec<slp_tree> *);
 };
 
@@ -608,11 +608,17 @@ class complex_add_pattern : public complex_pattern
   public:
     void build (vec_info *);
     static internal_fn
-    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
 	     vec<slp_tree> *);
 
     static vect_pattern*
     recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_add_pattern (node, m_ops, ifn);
+    }
 };
 
 /* Perform a replacement of the detected complex add pattern with the new
@@ -630,6 +636,11 @@ complex_add_pattern::build (vec_info *vinfo)
   nodes.quick_push (children[0]);
   nodes.quick_push (vect_build_swap_evenodd_node (children[1]));
 
+  SLP_TREE_REF_COUNT (nodes[0])++;
+  SLP_TREE_REF_COUNT (nodes[1])++;
+  vect_free_slp_tree (this->m_ops[0]);
+  vect_free_slp_tree (this->m_ops[1]);
+
   SLP_TREE_CHILDREN (*this->m_node).truncate (0);
   SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);
 
@@ -650,7 +661,7 @@ complex_add_pattern::build (vec_info *vinfo)
 internal_fn
 complex_add_pattern::matches (complex_operation_t op,
 			      slp_tree_to_load_perm_map_t *perm_cache,
-			      vec<slp_tree> *ops)
+			      slp_tree *node, vec<slp_tree> *ops)
 {
   internal_fn ifn = IFN_LAST;
 
@@ -685,6 +696,9 @@ complex_add_pattern::matches (complex_operation_t op,
   if (linear_loads_p (perm_cache, children[1]).first != PERM_ODDEVEN)
     return IFN_LAST;
 
+  if (!vect_pattern_validate_optab (ifn, *node))
+    return IFN_LAST;
+
   return ifn;
 }
 
@@ -697,8 +711,9 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
   auto_vec<slp_tree> ops;
   complex_operation_t op
     = vect_detect_pair_op (*node, true, &ops);
-  internal_fn ifn = complex_add_pattern::matches (op, perm_cache, &ops);
-  if (!vect_pattern_validate_optab (ifn, *node))
+  internal_fn ifn
+    = complex_add_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
     return NULL;
 
   return new complex_add_pattern (node, &ops, ifn);


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 4/8 v9]middle-end slp: upgrade complex add to new format and fix memory leaks
  2020-12-28 13:37 ` [PATCH 4/8 v9]middle-end slp: upgrade complex add to new format and fix memory leaks Tamar Christina
@ 2021-01-07 13:18   ` Richard Biener
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-07 13:18 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Mon, 28 Dec 2020, Tamar Christina wrote:

> Hi All,
> 
> This fixes a memory leak in complex_add_pattern because I was not calling
> vect_free_slp_tree when dissolving one side of the TWO_OPERANDS nodes.
> 
> Secondly it also upgrades the class to the new inteface required by the other
> patterns.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?

OK.

Thanks,
Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-slp-patterns.c (class complex_pattern,
> 	class complex_add_pattern): Add parameters to matches.
> 	(complex_add_pattern::build): Free memory.
> 	(complex_add_pattern::matches): Move validation end of match.
> 	(complex_add_pattern::recognize): Likewise.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index 235c0741c78b04f14725751ec399c0fdb32a0823..dbc58f7c53868ed431fc67de1f0162eb0d3b2c24 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -503,7 +503,7 @@ class complex_pattern : public vect_pattern
>      void build (vec_info *);
>  
>      static internal_fn
> -    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
>  	     vec<slp_tree> *);
>  };
>  
> @@ -608,11 +608,17 @@ class complex_add_pattern : public complex_pattern
>    public:
>      void build (vec_info *);
>      static internal_fn
> -    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
>  	     vec<slp_tree> *);
>  
>      static vect_pattern*
>      recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> +
> +    static vect_pattern*
> +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +    {
> +      return new complex_add_pattern (node, m_ops, ifn);
> +    }
>  };
>  
>  /* Perform a replacement of the detected complex add pattern with the new
> @@ -630,6 +636,11 @@ complex_add_pattern::build (vec_info *vinfo)
>    nodes.quick_push (children[0]);
>    nodes.quick_push (vect_build_swap_evenodd_node (children[1]));
>  
> +  SLP_TREE_REF_COUNT (nodes[0])++;
> +  SLP_TREE_REF_COUNT (nodes[1])++;
> +  vect_free_slp_tree (this->m_ops[0]);
> +  vect_free_slp_tree (this->m_ops[1]);
> +
>    SLP_TREE_CHILDREN (*this->m_node).truncate (0);
>    SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);
>  
> @@ -650,7 +661,7 @@ complex_add_pattern::build (vec_info *vinfo)
>  internal_fn
>  complex_add_pattern::matches (complex_operation_t op,
>  			      slp_tree_to_load_perm_map_t *perm_cache,
> -			      vec<slp_tree> *ops)
> +			      slp_tree *node, vec<slp_tree> *ops)
>  {
>    internal_fn ifn = IFN_LAST;
>  
> @@ -685,6 +696,9 @@ complex_add_pattern::matches (complex_operation_t op,
>    if (linear_loads_p (perm_cache, children[1]).first != PERM_ODDEVEN)
>      return IFN_LAST;
>  
> +  if (!vect_pattern_validate_optab (ifn, *node))
> +    return IFN_LAST;
> +
>    return ifn;
>  }
>  
> @@ -697,8 +711,9 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
>    auto_vec<slp_tree> ops;
>    complex_operation_t op
>      = vect_detect_pair_op (*node, true, &ops);
> -  internal_fn ifn = complex_add_pattern::matches (op, perm_cache, &ops);
> -  if (!vect_pattern_validate_optab (ifn, *node))
> +  internal_fn ifn
> +    = complex_add_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn == IFN_LAST)
>      return NULL;
>  
>    return new complex_add_pattern (node, &ops, ifn);
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
                   ` (2 preceding siblings ...)
  2020-12-28 13:37 ` [PATCH 4/8 v9]middle-end slp: upgrade complex add to new format and fix memory leaks Tamar Christina
@ 2020-12-28 13:37 ` Tamar Christina
  2021-01-08  9:37   ` Richard Biener
  2020-12-28 13:37 ` [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate Tamar Christina
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:37 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 15665 bytes --]

Hi All,

This adds support for complex multiply and complex multiply and accumulate to
the vect pattern detector.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
	* optabs.def (cmul_optab, cmul_conj_optab): New.
	* doc/md.texi: Document them.
	* tree-vect-slp-patterns.c (vect_match_call_complex_mla,
	vect_normalize_conj_loc, is_eq_or_top, vect_validate_multiplication,
	vect_build_combine_node, class complex_mul_pattern,
	complex_mul_pattern::matches, complex_mul_pattern::recognize,
	complex_mul_pattern::build): New.

--- inline copy of patch -- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6202,6 +6202,50 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmul@var{m}4} instruction pattern
+@item @samp{cmul@var{m}4}
+Perform a vector multiply that is semantically the same as multiply of
+complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmul_conj@var{m}4} instruction pattern
+@item @samp{cmul_conj@var{m}4}
+Perform a vector multiply by conjugate that is semantically the same as a
+multiply of complex numbers where the second multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{ffs@var{m}2} instruction pattern
 @item @samp{ffs@var{m}2}
 Store into operand 0 one plus the index of the least significant 1-bit
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e60f6996b28164ae38 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
 DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
 
 
 /* FP scales.  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761a560b7fbcb69ce1 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
 OPTAB_D (xorsign_optab, "xorsign$F$a3")
 OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
+OPTAB_D (cmul_optab, "cmul$a3")
+OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..82721acbab8cf81c4d6f9954c98fb913a7bb6282 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -719,6 +719,368 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
   return new complex_add_pattern (node, &ops, ifn);
 }
 
+/*******************************************************************************
+ * complex_mul_pattern
+ ******************************************************************************/
+
+/* Helper function of that looks for a match in the CHILDth child of NODE.  The
+   child used is stored in RES.
+
+   If the match is successful then ARGS will contain the operands matched
+   and the complex_operation_t type is returned.  If match is not successful
+   then CMPLX_NONE is returned and ARGS is left unmodified.  */
+
+static inline complex_operation_t
+vect_match_call_complex_mla (slp_tree node, unsigned child,
+			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
+{
+  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
+
+  slp_tree data = SLP_TREE_CHILDREN (node)[child];
+
+  if (res)
+    *res = data;
+
+  return vect_detect_pair_op (data, false, args);
+}
+
+/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the first
+   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
+
+   If a negate is found then the values in ARGS are reordered such that the
+   negate node is always the second one and the entry is replaced by the child
+   of the negate node.  */
+
+static inline bool
+vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p = NULL)
+{
+  gcc_assert (args.length () == 2);
+  bool neg_found = false;
+
+  if (vect_match_expression_p (args[0], NEGATE_EXPR))
+    {
+      std::swap (args[0], args[1]);
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = true;
+    }
+  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
+    {
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = false;
+    }
+
+  if (neg_found)
+    args[1] = SLP_TREE_CHILDREN (args[1])[0];
+
+  return neg_found;
+}
+
+/* Helper function to check if PERM is KIND or PERM_TOP.  */
+
+static inline bool
+is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
+{
+  return perm.first == kind || perm.first == PERM_TOP;
+}
+
+/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both MULT_EXPR
+   nodes but also that they represent an operation that is either a complex
+   multiplication or a complex multiplication by conjugated value.
+
+   Of the negation is expected to be in the first half of the tree (As required
+   by an FMS pattern) then NEG_FIRST is true.  If the operation is a conjugate
+   operation then CONJ_FIRST_OPERAND is set to indicate whether the first or
+   second operand contains the conjugate operation.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> left_op, vec<slp_tree> right_op,
+			     bool neg_first, bool *conj_first_operand,
+			     bool fms)
+{
+  /* The presence of a negation indicates that we have either a conjugate or a
+     rotation.  We need to distinguish which one.  */
+  *conj_first_operand = false;
+  complex_perm_kinds_t kind;
+
+  /* Complex conjugates have the negation on the imaginary part of the
+     number where rotations affect the real component.  So check if the
+     negation is on a dup of lane 1.  */
+  if (fms)
+    {
+      /* Canonicalization for fms is not consistent. So have to test both
+	 variants to be sure.  This needs to be fixed in the mid-end so
+	 this part can be simpler.  */
+      kind = linear_loads_p (perm_cache, right_op[0]).first;
+      if (!((kind == PERM_ODDODD
+	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDEVEN))
+	  || (kind == PERM_ODDEVEN
+	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDODD))))
+	return false;
+    }
+  else
+    {
+      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
+	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
+			    PERM_ODDEVEN))
+	return false;
+    }
+
+  /* Deal with differences in indexes.  */
+  int index1 = fms ? 1 : 0;
+  int index2 = fms ? 0 : 1;
+
+  /* Check if the conjugate is on the second first or second operand.  The
+     order of the node with the conjugate value determines this, and the dup
+     node must be one of lane 0 of the same DR as the neg node.  */
+  kind = linear_loads_p (perm_cache, left_op[index1]).first;
+  if (kind == PERM_TOP)
+    {
+      if (linear_loads_p (perm_cache, left_op[index2]).first == PERM_EVENODD)
+	return true;
+    }
+  else if (kind == PERM_EVENODD)
+    {
+      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) == PERM_EVENODD)
+	return false;
+    }
+  else if (!neg_first)
+    *conj_first_operand = true;
+  else
+    return false;
+
+  if (kind != PERM_EVENEVEN)
+    return false;
+
+  return true;
+}
+
+/* Helper function to help distinguish between a conjugate and a rotation in a
+   complex multiplication.  The operations have similar shapes but the order of
+   the load permutes are different.  This function returns TRUE when the order
+   is consistent with a multiplication or multiplication by conjugated
+   operand but returns FALSE if it's a multiplication by rotated operand.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> op, complex_perm_kinds_t permKind)
+{
+  /* The left node is the more common case, test it first.  */
+  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
+    {
+      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
+	return false;
+    }
+  return true;
+}
+
+/* This function combines two nodes containing only even and only odd lanes
+   together into a single node which contains the nodes in even/odd order
+   by using a lane permute.  */
+
+static slp_tree
+vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep)
+{
+  auto_vec<slp_tree> nodes;
+  nodes.create (2);
+  vec<std::pair<unsigned, unsigned> > perm;
+  perm.create (SLP_TREE_LANES (rep));
+
+  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
+    {
+      perm.quick_push (std::make_pair (0, x));
+      perm.quick_push (std::make_pair (1, x));
+    }
+
+  nodes.quick_push (even);
+  nodes.quick_push (odd);
+
+  SLP_TREE_REF_COUNT (even)++;
+  SLP_TREE_REF_COUNT (odd)++;
+
+  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE (even));
+  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
+  SLP_TREE_LANE_PERMUTATION (vnode) = perm;
+  SLP_TREE_CHILDREN (vnode).safe_splice (nodes);
+  SLP_TREE_REF_COUNT (vnode) = 1;
+  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);
+  gcc_assert (perm.length () == SLP_TREE_LANES (vnode));
+  /* Representation is set to that of the current node as the vectorizer
+     can't deal with VEC_PERMs with no representation, as would be the
+     case with invariants.  */
+  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
+  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
+  return vnode;
+}
+
+class complex_mul_pattern : public complex_pattern
+{
+  protected:
+    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 2;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_mul_pattern (node, m_ops, ifn);
+    }
+
+};
+
+/* Pattern matcher for trying to match complex multiply pattern in SLP tree
+   If the operation matches then IFN is set to the operation it matched
+   and the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]);
+   double bx = (a[i+1] * b[i]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The initial match is
+   expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_mul_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t *perm_cache,
+			      slp_tree *node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  if (op != MINUS_PLUS)
+    return IFN_LAST;
+
+  slp_tree root = *node;
+  /* First two nodes must be a multiply.  */
+  auto_vec<slp_tree> muls;
+  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
+      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
+    return IFN_LAST;
+
+  /* Now operand2+4 may lead to another expression.  */
+  auto_vec<slp_tree> left_op, right_op;
+  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
+  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
+
+  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
+    return IFN_LAST;
+
+  bool neg_first;
+  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
+
+  if (!is_neg)
+    {
+      /* A multiplication needs to multiply agains the real pair, otherwise
+	 the pattern matches that of FMS.   */
+      if (!vect_validate_multiplication (perm_cache, left_op, PERM_EVENEVEN)
+	  || vect_normalize_conj_loc (left_op))
+	return IFN_LAST;
+      ifn = IFN_COMPLEX_MUL;
+    }
+  else if (is_neg)
+    {
+      bool conj_first_operand;
+      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
+					 neg_first, &conj_first_operand,
+					 false))
+	return IFN_LAST;
+
+      ifn = IFN_COMPLEX_MUL_CONJ;
+    }
+
+  if (!vect_pattern_validate_optab (ifn, *node))
+    return IFN_LAST;
+
+  ops->truncate (0);
+  ops->create (3);
+
+  complex_perm_kinds_t kind = linear_loads_p (perm_cache, left_op[0]).first;
+  if (kind == PERM_EVENODD)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_TOP)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else
+    {
+      ops->quick_push (left_op[0]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_mul_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_mul_pattern::build (vec_info *vinfo)
+{
+  auto_vec<slp_tree> nodes;
+
+  /* First re-arrange the children.  */
+  nodes.create (2);
+
+  nodes.quick_push (this->m_ops[2]);
+  nodes.quick_push (
+    vect_build_combine_node (this->m_ops[0], this->m_ops[1], *this->m_node));
+  SLP_TREE_REF_COUNT (this->m_ops[2])++;
+
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
+    vect_free_slp_tree (node);
+
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);
+
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/


-- 

[-- Attachment #2: rb13960.patch --]
[-- Type: text/x-diff, Size: 14965 bytes --]

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6202,6 +6202,50 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmul@var{m}4} instruction pattern
+@item @samp{cmul@var{m}4}
+Perform a vector multiply that is semantically the same as multiply of
+complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmul_conj@var{m}4} instruction pattern
+@item @samp{cmul_conj@var{m}4}
+Perform a vector multiply by conjugate that is semantically the same as a
+multiply of complex numbers where the second multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{ffs@var{m}2} instruction pattern
 @item @samp{ffs@var{m}2}
 Store into operand 0 one plus the index of the least significant 1-bit
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e60f6996b28164ae38 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
 DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
 
 
 /* FP scales.  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761a560b7fbcb69ce1 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
 OPTAB_D (xorsign_optab, "xorsign$F$a3")
 OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
+OPTAB_D (cmul_optab, "cmul$a3")
+OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..82721acbab8cf81c4d6f9954c98fb913a7bb6282 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -719,6 +719,368 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
   return new complex_add_pattern (node, &ops, ifn);
 }
 
+/*******************************************************************************
+ * complex_mul_pattern
+ ******************************************************************************/
+
+/* Helper function of that looks for a match in the CHILDth child of NODE.  The
+   child used is stored in RES.
+
+   If the match is successful then ARGS will contain the operands matched
+   and the complex_operation_t type is returned.  If match is not successful
+   then CMPLX_NONE is returned and ARGS is left unmodified.  */
+
+static inline complex_operation_t
+vect_match_call_complex_mla (slp_tree node, unsigned child,
+			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
+{
+  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
+
+  slp_tree data = SLP_TREE_CHILDREN (node)[child];
+
+  if (res)
+    *res = data;
+
+  return vect_detect_pair_op (data, false, args);
+}
+
+/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the first
+   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
+
+   If a negate is found then the values in ARGS are reordered such that the
+   negate node is always the second one and the entry is replaced by the child
+   of the negate node.  */
+
+static inline bool
+vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p = NULL)
+{
+  gcc_assert (args.length () == 2);
+  bool neg_found = false;
+
+  if (vect_match_expression_p (args[0], NEGATE_EXPR))
+    {
+      std::swap (args[0], args[1]);
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = true;
+    }
+  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
+    {
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = false;
+    }
+
+  if (neg_found)
+    args[1] = SLP_TREE_CHILDREN (args[1])[0];
+
+  return neg_found;
+}
+
+/* Helper function to check if PERM is KIND or PERM_TOP.  */
+
+static inline bool
+is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
+{
+  return perm.first == kind || perm.first == PERM_TOP;
+}
+
+/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both MULT_EXPR
+   nodes but also that they represent an operation that is either a complex
+   multiplication or a complex multiplication by conjugated value.
+
+   Of the negation is expected to be in the first half of the tree (As required
+   by an FMS pattern) then NEG_FIRST is true.  If the operation is a conjugate
+   operation then CONJ_FIRST_OPERAND is set to indicate whether the first or
+   second operand contains the conjugate operation.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> left_op, vec<slp_tree> right_op,
+			     bool neg_first, bool *conj_first_operand,
+			     bool fms)
+{
+  /* The presence of a negation indicates that we have either a conjugate or a
+     rotation.  We need to distinguish which one.  */
+  *conj_first_operand = false;
+  complex_perm_kinds_t kind;
+
+  /* Complex conjugates have the negation on the imaginary part of the
+     number where rotations affect the real component.  So check if the
+     negation is on a dup of lane 1.  */
+  if (fms)
+    {
+      /* Canonicalization for fms is not consistent. So have to test both
+	 variants to be sure.  This needs to be fixed in the mid-end so
+	 this part can be simpler.  */
+      kind = linear_loads_p (perm_cache, right_op[0]).first;
+      if (!((kind == PERM_ODDODD
+	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDEVEN))
+	  || (kind == PERM_ODDEVEN
+	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDODD))))
+	return false;
+    }
+  else
+    {
+      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
+	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
+			    PERM_ODDEVEN))
+	return false;
+    }
+
+  /* Deal with differences in indexes.  */
+  int index1 = fms ? 1 : 0;
+  int index2 = fms ? 0 : 1;
+
+  /* Check if the conjugate is on the second first or second operand.  The
+     order of the node with the conjugate value determines this, and the dup
+     node must be one of lane 0 of the same DR as the neg node.  */
+  kind = linear_loads_p (perm_cache, left_op[index1]).first;
+  if (kind == PERM_TOP)
+    {
+      if (linear_loads_p (perm_cache, left_op[index2]).first == PERM_EVENODD)
+	return true;
+    }
+  else if (kind == PERM_EVENODD)
+    {
+      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) == PERM_EVENODD)
+	return false;
+    }
+  else if (!neg_first)
+    *conj_first_operand = true;
+  else
+    return false;
+
+  if (kind != PERM_EVENEVEN)
+    return false;
+
+  return true;
+}
+
+/* Helper function to help distinguish between a conjugate and a rotation in a
+   complex multiplication.  The operations have similar shapes but the order of
+   the load permutes are different.  This function returns TRUE when the order
+   is consistent with a multiplication or multiplication by conjugated
+   operand but returns FALSE if it's a multiplication by rotated operand.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> op, complex_perm_kinds_t permKind)
+{
+  /* The left node is the more common case, test it first.  */
+  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
+    {
+      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
+	return false;
+    }
+  return true;
+}
+
+/* This function combines two nodes containing only even and only odd lanes
+   together into a single node which contains the nodes in even/odd order
+   by using a lane permute.  */
+
+static slp_tree
+vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep)
+{
+  auto_vec<slp_tree> nodes;
+  nodes.create (2);
+  vec<std::pair<unsigned, unsigned> > perm;
+  perm.create (SLP_TREE_LANES (rep));
+
+  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
+    {
+      perm.quick_push (std::make_pair (0, x));
+      perm.quick_push (std::make_pair (1, x));
+    }
+
+  nodes.quick_push (even);
+  nodes.quick_push (odd);
+
+  SLP_TREE_REF_COUNT (even)++;
+  SLP_TREE_REF_COUNT (odd)++;
+
+  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE (even));
+  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
+  SLP_TREE_LANE_PERMUTATION (vnode) = perm;
+  SLP_TREE_CHILDREN (vnode).safe_splice (nodes);
+  SLP_TREE_REF_COUNT (vnode) = 1;
+  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);
+  gcc_assert (perm.length () == SLP_TREE_LANES (vnode));
+  /* Representation is set to that of the current node as the vectorizer
+     can't deal with VEC_PERMs with no representation, as would be the
+     case with invariants.  */
+  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
+  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
+  return vnode;
+}
+
+class complex_mul_pattern : public complex_pattern
+{
+  protected:
+    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 2;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_mul_pattern (node, m_ops, ifn);
+    }
+
+};
+
+/* Pattern matcher for trying to match complex multiply pattern in SLP tree
+   If the operation matches then IFN is set to the operation it matched
+   and the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]);
+   double bx = (a[i+1] * b[i]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The initial match is
+   expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_mul_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t *perm_cache,
+			      slp_tree *node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  if (op != MINUS_PLUS)
+    return IFN_LAST;
+
+  slp_tree root = *node;
+  /* First two nodes must be a multiply.  */
+  auto_vec<slp_tree> muls;
+  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
+      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
+    return IFN_LAST;
+
+  /* Now operand2+4 may lead to another expression.  */
+  auto_vec<slp_tree> left_op, right_op;
+  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
+  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
+
+  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
+    return IFN_LAST;
+
+  bool neg_first;
+  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
+
+  if (!is_neg)
+    {
+      /* A multiplication needs to multiply agains the real pair, otherwise
+	 the pattern matches that of FMS.   */
+      if (!vect_validate_multiplication (perm_cache, left_op, PERM_EVENEVEN)
+	  || vect_normalize_conj_loc (left_op))
+	return IFN_LAST;
+      ifn = IFN_COMPLEX_MUL;
+    }
+  else if (is_neg)
+    {
+      bool conj_first_operand;
+      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
+					 neg_first, &conj_first_operand,
+					 false))
+	return IFN_LAST;
+
+      ifn = IFN_COMPLEX_MUL_CONJ;
+    }
+
+  if (!vect_pattern_validate_optab (ifn, *node))
+    return IFN_LAST;
+
+  ops->truncate (0);
+  ops->create (3);
+
+  complex_perm_kinds_t kind = linear_loads_p (perm_cache, left_op[0]).first;
+  if (kind == PERM_EVENODD)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_TOP)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else
+    {
+      ops->quick_push (left_op[0]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_mul_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_mul_pattern::build (vec_info *vinfo)
+{
+  auto_vec<slp_tree> nodes;
+
+  /* First re-arrange the children.  */
+  nodes.create (2);
+
+  nodes.quick_push (this->m_ops[2]);
+  nodes.quick_push (
+    vect_build_combine_node (this->m_ops[0], this->m_ops[1], *this->m_node));
+  SLP_TREE_REF_COUNT (this->m_ops[2])++;
+
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
+    vect_free_slp_tree (node);
+
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);
+
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate
  2020-12-28 13:37 ` [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate Tamar Christina
@ 2021-01-08  9:37   ` Richard Biener
  2021-01-11 11:01     ` Tamar Christina
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2021-01-08  9:37 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Mon, 28 Dec 2020, Tamar Christina wrote:

> Hi All,
> 
> This adds support for complex multiply and complex multiply and accumulate to
> the vect pattern detector.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
> 	* optabs.def (cmul_optab, cmul_conj_optab): New.
> 	* doc/md.texi: Document them.
> 	* tree-vect-slp-patterns.c (vect_match_call_complex_mla,
> 	vect_normalize_conj_loc, is_eq_or_top, vect_validate_multiplication,
> 	vect_build_combine_node, class complex_mul_pattern,
> 	complex_mul_pattern::matches, complex_mul_pattern::recognize,
> 	complex_mul_pattern::build): New.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -6202,6 +6202,50 @@ The operation is only supported for vector modes @var{m}.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> +@cindex @code{cmul@var{m}4} instruction pattern
> +@item @samp{cmul@var{m}4}
> +Perform a vector multiply that is semantically the same as multiply of
> +complex numbers.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] = a[i] * b[i];
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
> +@cindex @code{cmul_conj@var{m}4} instruction pattern
> +@item @samp{cmul_conj@var{m}4}
> +Perform a vector multiply by conjugate that is semantically the same as a
> +multiply of complex numbers where the second multiply arguments is conjugated.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] = a[i] * conj (b[i]);
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{ffs@var{m}2} instruction pattern
>  @item @samp{ffs@var{m}2}
>  Store into operand 0 one plus the index of the least significant 1-bit
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e60f6996b28164ae38 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
>  DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
>  
>  
>  /* FP scales.  */
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761a560b7fbcb69ce1 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
>  OPTAB_D (xorsign_optab, "xorsign$F$a3")
>  OPTAB_D (cadd90_optab, "cadd90$a3")
>  OPTAB_D (cadd270_optab, "cadd270$a3")
> +OPTAB_D (cmul_optab, "cmul$a3")
> +OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
>  OPTAB_D (cos_optab, "cos$a2")
>  OPTAB_D (cosh_optab, "cosh$a2")
>  OPTAB_D (exp10_optab, "exp10$a2")
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..82721acbab8cf81c4d6f9954c98fb913a7bb6282 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -719,6 +719,368 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
>    return new complex_add_pattern (node, &ops, ifn);
>  }
>  
> +/*******************************************************************************
> + * complex_mul_pattern
> + ******************************************************************************/
> +
> +/* Helper function of that looks for a match in the CHILDth child of NODE.  The
> +   child used is stored in RES.
> +
> +   If the match is successful then ARGS will contain the operands matched
> +   and the complex_operation_t type is returned.  If match is not successful
> +   then CMPLX_NONE is returned and ARGS is left unmodified.  */
> +
> +static inline complex_operation_t
> +vect_match_call_complex_mla (slp_tree node, unsigned child,
> +			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
> +{
> +  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
> +
> +  slp_tree data = SLP_TREE_CHILDREN (node)[child];
> +
> +  if (res)
> +    *res = data;
> +
> +  return vect_detect_pair_op (data, false, args);
> +}
> +
> +/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the first
> +   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
> +
> +   If a negate is found then the values in ARGS are reordered such that the
> +   negate node is always the second one and the entry is replaced by the child
> +   of the negate node.  */
> +
> +static inline bool
> +vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p = NULL)
> +{
> +  gcc_assert (args.length () == 2);
> +  bool neg_found = false;
> +
> +  if (vect_match_expression_p (args[0], NEGATE_EXPR))
> +    {
> +      std::swap (args[0], args[1]);
> +      neg_found = true;
> +      if (neg_first_p)
> +	*neg_first_p = true;
> +    }
> +  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
> +    {
> +      neg_found = true;
> +      if (neg_first_p)
> +	*neg_first_p = false;
> +    }
> +
> +  if (neg_found)
> +    args[1] = SLP_TREE_CHILDREN (args[1])[0];
> +
> +  return neg_found;
> +}
> +
> +/* Helper function to check if PERM is KIND or PERM_TOP.  */
> +
> +static inline bool
> +is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
> +{
> +  return perm.first == kind || perm.first == PERM_TOP;
> +}
> +
> +/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both MULT_EXPR
> +   nodes but also that they represent an operation that is either a complex
> +   multiplication or a complex multiplication by conjugated value.
> +
> +   Of the negation is expected to be in the first half of the tree (As required
> +   by an FMS pattern) then NEG_FIRST is true.  If the operation is a conjugate
> +   operation then CONJ_FIRST_OPERAND is set to indicate whether the first or
> +   second operand contains the conjugate operation.  */
> +
> +static inline bool
> +vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
> +			     vec<slp_tree> left_op, vec<slp_tree> right_op,
> +			     bool neg_first, bool *conj_first_operand,
> +			     bool fms)
> +{
> +  /* The presence of a negation indicates that we have either a conjugate or a
> +     rotation.  We need to distinguish which one.  */
> +  *conj_first_operand = false;
> +  complex_perm_kinds_t kind;
> +
> +  /* Complex conjugates have the negation on the imaginary part of the
> +     number where rotations affect the real component.  So check if the
> +     negation is on a dup of lane 1.  */
> +  if (fms)
> +    {
> +      /* Canonicalization for fms is not consistent. So have to test both
> +	 variants to be sure.  This needs to be fixed in the mid-end so
> +	 this part can be simpler.  */
> +      kind = linear_loads_p (perm_cache, right_op[0]).first;
> +      if (!((kind == PERM_ODDODD
> +	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> +			     PERM_ODDEVEN))
> +	  || (kind == PERM_ODDEVEN
> +	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> +			     PERM_ODDODD))))
> +	return false;
> +    }
> +  else
> +    {
> +      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
> +	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
> +			    PERM_ODDEVEN))
> +	return false;
> +    }
> +
> +  /* Deal with differences in indexes.  */
> +  int index1 = fms ? 1 : 0;
> +  int index2 = fms ? 0 : 1;
> +
> +  /* Check if the conjugate is on the second first or second operand.  The
> +     order of the node with the conjugate value determines this, and the dup
> +     node must be one of lane 0 of the same DR as the neg node.  */
> +  kind = linear_loads_p (perm_cache, left_op[index1]).first;
> +  if (kind == PERM_TOP)
> +    {
> +      if (linear_loads_p (perm_cache, left_op[index2]).first == PERM_EVENODD)
> +	return true;
> +    }
> +  else if (kind == PERM_EVENODD)
> +    {
> +      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) == PERM_EVENODD)
> +	return false;
> +    }
> +  else if (!neg_first)
> +    *conj_first_operand = true;
> +  else
> +    return false;
> +
> +  if (kind != PERM_EVENEVEN)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* Helper function to help distinguish between a conjugate and a rotation in a
> +   complex multiplication.  The operations have similar shapes but the order of
> +   the load permutes are different.  This function returns TRUE when the order
> +   is consistent with a multiplication or multiplication by conjugated
> +   operand but returns FALSE if it's a multiplication by rotated operand.  */
> +
> +static inline bool
> +vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
> +			     vec<slp_tree> op, complex_perm_kinds_t permKind)
> +{
> +  /* The left node is the more common case, test it first.  */
> +  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
> +    {
> +      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
> +	return false;
> +    }
> +  return true;
> +}
> +
> +/* This function combines two nodes containing only even and only odd lanes
> +   together into a single node which contains the nodes in even/odd order
> +   by using a lane permute.  */
> +
> +static slp_tree
> +vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep)
> +{
> +  auto_vec<slp_tree> nodes;
> +  nodes.create (2);
> +  vec<std::pair<unsigned, unsigned> > perm;
> +  perm.create (SLP_TREE_LANES (rep));
> +
> +  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
> +    {
> +      perm.quick_push (std::make_pair (0, x));
> +      perm.quick_push (std::make_pair (1, x));
> +    }

That looks wrong, it creates {0,0}, {1, 0}, {0, 2}, {1, 2}
but you want {0, 0}, {1, 0}, {0, 1}, {1, 1} AFAICS.  At least
I assume SLP_TREE_LANES (odd/even) == SLP_TREE_LANES (rep) / 2?

'rep' isn't documented, I assume it's supoosed to be a "representative"
for the result?

> +
> +  nodes.quick_push (even);
> +  nodes.quick_push (odd);

No need for this intermediate nodes array, just push to ...

> +  SLP_TREE_REF_COUNT (even)++;
> +  SLP_TREE_REF_COUNT (odd)++;
> +
> +  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE (even));
> +  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
> +  SLP_TREE_LANE_PERMUTATION (vnode) = perm;
> +  SLP_TREE_CHILDREN (vnode).safe_splice (nodes);

... the children array directly (even with quick_push, we've
already allocated 2 elements for the children).

> +  SLP_TREE_REF_COUNT (vnode) = 1;
> +  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);
> +  gcc_assert (perm.length () == SLP_TREE_LANES (vnode));
> +  /* Representation is set to that of the current node as the vectorizer
> +     can't deal with VEC_PERMs with no representation, as would be the
> +     case with invariants.  */

Yeah, I need to fix this ...

> +  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
> +  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
> +  return vnode;
> +}
> +
> +class complex_mul_pattern : public complex_pattern
> +{
> +  protected:
> +    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +      : complex_pattern (node, m_ops, ifn)
> +    {
> +      this->m_num_args = 2;
> +    }
> +
> +  public:
> +    void build (vec_info *);
> +    static internal_fn
> +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
> +	     vec<slp_tree> *);
> +
> +    static vect_pattern*
> +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> +
> +    static vect_pattern*
> +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +    {
> +      return new complex_mul_pattern (node, m_ops, ifn);
> +    }
> +
> +};
> +
> +/* Pattern matcher for trying to match complex multiply pattern in SLP tree
> +   If the operation matches then IFN is set to the operation it matched
> +   and the arguments to the two replacement statements are put in m_ops.
> +
> +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> +
> +   This function matches the patterns shaped as:
> +
> +   double ax = (b[i+1] * a[i]);
> +   double bx = (a[i+1] * b[i]);
> +
> +   c[i] = c[i] - ax;
> +   c[i+1] = c[i+1] + bx;
> +
> +   If a match occurred then TRUE is returned, else FALSE.  The initial match is
> +   expected to be in OP1 and the initial match operands in args0.  */
> +
> +internal_fn
> +complex_mul_pattern::matches (complex_operation_t op,
> +			      slp_tree_to_load_perm_map_t *perm_cache,
> +			      slp_tree *node, vec<slp_tree> *ops)
> +{
> +  internal_fn ifn = IFN_LAST;
> +
> +  if (op != MINUS_PLUS)
> +    return IFN_LAST;
> +
> +  slp_tree root = *node;
> +  /* First two nodes must be a multiply.  */
> +  auto_vec<slp_tree> muls;
> +  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
> +      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
> +    return IFN_LAST;
> +
> +  /* Now operand2+4 may lead to another expression.  */
> +  auto_vec<slp_tree> left_op, right_op;
> +  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
> +  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
> +
> +  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
> +    return IFN_LAST;
> +
> +  bool neg_first;
> +  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
> +
> +  if (!is_neg)
> +    {
> +      /* A multiplication needs to multiply agains the real pair, otherwise
> +	 the pattern matches that of FMS.   */
> +      if (!vect_validate_multiplication (perm_cache, left_op, PERM_EVENEVEN)
> +	  || vect_normalize_conj_loc (left_op))
> +	return IFN_LAST;
> +      ifn = IFN_COMPLEX_MUL;
> +    }
> +  else if (is_neg)
> +    {
> +      bool conj_first_operand;
> +      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
> +					 neg_first, &conj_first_operand,
> +					 false))
> +	return IFN_LAST;
> +
> +      ifn = IFN_COMPLEX_MUL_CONJ;
> +    }
> +
> +  if (!vect_pattern_validate_optab (ifn, *node))
> +    return IFN_LAST;
> +
> +  ops->truncate (0);
> +  ops->create (3);
> +
> +  complex_perm_kinds_t kind = linear_loads_p (perm_cache, left_op[0]).first;
> +  if (kind == PERM_EVENODD)
> +    {
> +      ops->quick_push (left_op[1]);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (left_op[0]);
> +    }
> +  else if (kind == PERM_TOP)
> +    {
> +      ops->quick_push (left_op[1]);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (left_op[0]);
> +    }
> +  else
> +    {
> +      ops->quick_push (left_op[0]);
> +      ops->quick_push (right_op[0]);
> +      ops->quick_push (left_op[1]);
> +    }
> +
> +  return ifn;
> +}
> +
> +/* Attempt to recognize a complex mul pattern.  */
> +
> +vect_pattern*
> +complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
> +				slp_tree *node)
> +{
> +  auto_vec<slp_tree> ops;
> +  complex_operation_t op
> +    = vect_detect_pair_op (*node, true, &ops);
> +  internal_fn ifn
> +    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn == IFN_LAST)
> +    return NULL;
> +
> +  return new complex_mul_pattern (node, &ops, ifn);
> +}
> +
> +/* Perform a replacement of the detected complex mul pattern with the new
> +   instruction sequences.  */
> +
> +void
> +complex_mul_pattern::build (vec_info *vinfo)
> +{
> +  auto_vec<slp_tree> nodes;
> +
> +  /* First re-arrange the children.  */
> +  nodes.create (2);
> +
> +  nodes.quick_push (this->m_ops[2]);
> +  nodes.quick_push (
> +    vect_build_combine_node (this->m_ops[0], this->m_ops[1], *this->m_node));
> +  SLP_TREE_REF_COUNT (this->m_ops[2])++;
> +
> +  slp_tree node;
> +  unsigned i;
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
> +    vect_free_slp_tree (node);
> +
> +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> +  SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);

please elide the nodes array.  *this->m_node now has a "wrong"
representative but I guess

> +  complex_pattern::build (vinfo);

will fix that up?  I still find the structure of the pattern matching
& transform hard to follow.  But well - I've settled with the idea
of refactoring it for next stage1 after the fact ;)

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate
  2021-01-08  9:37   ` Richard Biener
@ 2021-01-11 11:01     ` Tamar Christina
  2021-01-11 12:04       ` Richard Biener
  0 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2021-01-11 11:01 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd

[-- Attachment #1: Type: text/plain, Size: 37116 bytes --]

Hi Richi,

This adds support for complex multiply and complex multiply and accumulate to
the vect pattern detector.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master? (I replied to a comment at the very end of the mail from the previous patch)

Example of instructions matched:

#include <stdio.h>
#include <complex.h>

#define N 200
#define ROT
#define TYPE float
#define TYPE2 float


void g (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
{
  for (int i=0; i < N; i++)
    {
      c[i] =  a[i] * (b[i] ROT);
    }
}

void g_f1 (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
{
  for (int i=0; i < N; i++)
    {
      c[i] =  conjf (a[i]) * (b[i] ROT);
    }
}

void g_s1 (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
{
  for (int i=0; i < N; i++)
    {
      c[i] =  a[i] * conjf (b[i] ROT);
    }
}

Thanks,
Tamar

gcc/ChangeLog:

	* internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
	* optabs.def (cmul_optab, cmul_conj_optab): New.
	* doc/md.texi: Document them.
	* tree-vect-slp-patterns.c (vect_match_call_complex_mla,
	vect_normalize_conj_loc, is_eq_or_top, vect_validate_multiplication,
	vect_build_combine_node, class complex_mul_pattern,
	complex_mul_pattern::matches, complex_mul_pattern::recognize,
	complex_mul_pattern::build): New.

--- inline copy of patch --

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6202,6 +6202,50 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmul@var{m}4} instruction pattern
+@item @samp{cmul@var{m}4}
+Perform a vector multiply that is semantically the same as multiply of
+complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmul_conj@var{m}4} instruction pattern
+@item @samp{cmul_conj@var{m}4}
+Perform a vector multiply by conjugate that is semantically the same as a
+multiply of complex numbers where the second multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{ffs@var{m}2} instruction pattern
 @item @samp{ffs@var{m}2}
 Store into operand 0 one plus the index of the least significant 1-bit
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e60f6996b28164ae38 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
 DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
 
 
 /* FP scales.  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761a560b7fbcb69ce1 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
 OPTAB_D (xorsign_optab, "xorsign$F$a3")
 OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
+OPTAB_D (cmul_optab, "cmul$a3")
+OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..fb58b45602f00a440ef7c27853276945ba696522 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -719,6 +719,375 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
   return new complex_add_pattern (node, &ops, ifn);
 }
 
+/*******************************************************************************
+ * complex_mul_pattern
+ ******************************************************************************/
+
+/* Helper function of that looks for a match in the CHILDth child of NODE.  The
+   child used is stored in RES.
+
+   If the match is successful then ARGS will contain the operands matched
+   and the complex_operation_t type is returned.  If match is not successful
+   then CMPLX_NONE is returned and ARGS is left unmodified.  */
+
+static inline complex_operation_t
+vect_match_call_complex_mla (slp_tree node, unsigned child,
+			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
+{
+  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
+
+  slp_tree data = SLP_TREE_CHILDREN (node)[child];
+
+  if (res)
+    *res = data;
+
+  return vect_detect_pair_op (data, false, args);
+}
+
+/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the first
+   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
+
+   If a negate is found then the values in ARGS are reordered such that the
+   negate node is always the second one and the entry is replaced by the child
+   of the negate node.  */
+
+static inline bool
+vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p = NULL)
+{
+  gcc_assert (args.length () == 2);
+  bool neg_found = false;
+
+  if (vect_match_expression_p (args[0], NEGATE_EXPR))
+    {
+      std::swap (args[0], args[1]);
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = true;
+    }
+  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
+    {
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = false;
+    }
+
+  if (neg_found)
+    args[1] = SLP_TREE_CHILDREN (args[1])[0];
+
+  return neg_found;
+}
+
+/* Helper function to check if PERM is KIND or PERM_TOP.  */
+
+static inline bool
+is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
+{
+  return perm.first == kind || perm.first == PERM_TOP;
+}
+
+/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both MULT_EXPR
+   nodes but also that they represent an operation that is either a complex
+   multiplication or a complex multiplication by conjugated value.
+
+   Of the negation is expected to be in the first half of the tree (As required
+   by an FMS pattern) then NEG_FIRST is true.  If the operation is a conjugate
+   operation then CONJ_FIRST_OPERAND is set to indicate whether the first or
+   second operand contains the conjugate operation.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> left_op, vec<slp_tree> right_op,
+			     bool neg_first, bool *conj_first_operand,
+			     bool fms)
+{
+  /* The presence of a negation indicates that we have either a conjugate or a
+     rotation.  We need to distinguish which one.  */
+  *conj_first_operand = false;
+  complex_perm_kinds_t kind;
+
+  /* Complex conjugates have the negation on the imaginary part of the
+     number where rotations affect the real component.  So check if the
+     negation is on a dup of lane 1.  */
+  if (fms)
+    {
+      /* Canonicalization for fms is not consistent. So have to test both
+	 variants to be sure.  This needs to be fixed in the mid-end so
+	 this part can be simpler.  */
+      kind = linear_loads_p (perm_cache, right_op[0]).first;
+      if (!((kind == PERM_ODDODD
+	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDEVEN))
+	  || (kind == PERM_ODDEVEN
+	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDODD))))
+	return false;
+    }
+  else
+    {
+      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
+	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
+			    PERM_ODDEVEN))
+	return false;
+    }
+
+  /* Deal with differences in indexes.  */
+  int index1 = fms ? 1 : 0;
+  int index2 = fms ? 0 : 1;
+
+  /* Check if the conjugate is on the second first or second operand.  The
+     order of the node with the conjugate value determines this, and the dup
+     node must be one of lane 0 of the same DR as the neg node.  */
+  kind = linear_loads_p (perm_cache, left_op[index1]).first;
+  if (kind == PERM_TOP)
+    {
+      if (linear_loads_p (perm_cache, left_op[index2]).first == PERM_EVENODD)
+	return true;
+    }
+  else if (kind == PERM_EVENODD)
+    {
+      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) == PERM_EVENODD)
+	return false;
+    }
+  else if (!neg_first)
+    *conj_first_operand = true;
+  else
+    return false;
+
+  if (kind != PERM_EVENEVEN)
+    return false;
+
+  return true;
+}
+
+/* Helper function to help distinguish between a conjugate and a rotation in a
+   complex multiplication.  The operations have similar shapes but the order of
+   the load permutes are different.  This function returns TRUE when the order
+   is consistent with a multiplication or multiplication by conjugated
+   operand but returns FALSE if it's a multiplication by rotated operand.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> op, complex_perm_kinds_t permKind)
+{
+  /* The left node is the more common case, test it first.  */
+  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
+    {
+      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
+	return false;
+    }
+  return true;
+}
+
+/* This function combines two nodes containing only even and only odd lanes
+   together into a single node which contains the nodes in even/odd order
+   by using a lane permute.
+
+   The lanes in EVEN and ODD are duplicated 2 times inside the vectors.
+   So for a lanes = 4 EVEN contains {EVEN1, EVEN1, EVEN2, EVEN2}.
+
+   The tree REPRESENTATION is taken from the supplied REP along with the
+   vectype which must be the same between all three nodes.
+*/
+
+static slp_tree
+vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep)
+{
+  vec<std::pair<unsigned, unsigned> > perm;
+  perm.create (SLP_TREE_LANES (rep));
+
+  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
+    {
+      perm.quick_push (std::make_pair (0, x));
+      perm.quick_push (std::make_pair (1, x+1));
+    }
+
+  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE (even));
+  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
+  SLP_TREE_LANE_PERMUTATION (vnode) = perm;
+
+  SLP_TREE_CHILDREN (vnode).create (2);
+  SLP_TREE_CHILDREN (vnode).quick_push (even);
+  SLP_TREE_CHILDREN (vnode).quick_push (odd);
+  SLP_TREE_REF_COUNT (even)++;
+  SLP_TREE_REF_COUNT (odd)++;
+  SLP_TREE_REF_COUNT (vnode) = 1;
+
+  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);
+  gcc_assert (perm.length () == SLP_TREE_LANES (vnode));
+  /* Representation is set to that of the current node as the vectorizer
+     can't deal with VEC_PERMs with no representation, as would be the
+     case with invariants.  */
+  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
+  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
+  return vnode;
+}
+
+class complex_mul_pattern : public complex_pattern
+{
+  protected:
+    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 2;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_mul_pattern (node, m_ops, ifn);
+    }
+
+};
+
+/* Pattern matcher for trying to match complex multiply pattern in SLP tree
+   If the operation matches then IFN is set to the operation it matched
+   and the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]);
+   double bx = (a[i+1] * b[i]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The initial match is
+   expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_mul_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t *perm_cache,
+			      slp_tree *node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  if (op != MINUS_PLUS)
+    return IFN_LAST;
+
+  slp_tree root = *node;
+  /* First two nodes must be a multiply.  */
+  auto_vec<slp_tree> muls;
+  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
+      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
+    return IFN_LAST;
+
+  /* Now operand2+4 may lead to another expression.  */
+  auto_vec<slp_tree> left_op, right_op;
+  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
+  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
+
+  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
+    return IFN_LAST;
+
+  bool neg_first = false;
+  bool conj_first_operand = false;
+  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
+
+  if (!is_neg)
+    {
+      /* A multiplication needs to multiply agains the real pair, otherwise
+	 the pattern matches that of FMS.   */
+      if (!vect_validate_multiplication (perm_cache, left_op, PERM_EVENEVEN)
+	  || vect_normalize_conj_loc (left_op))
+	return IFN_LAST;
+      ifn = IFN_COMPLEX_MUL;
+    }
+  else if (is_neg)
+    {
+      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
+					 neg_first, &conj_first_operand,
+					 false))
+	return IFN_LAST;
+
+      ifn = IFN_COMPLEX_MUL_CONJ;
+    }
+
+  if (!vect_pattern_validate_optab (ifn, *node))
+    return IFN_LAST;
+
+  ops->truncate (0);
+  ops->create (3);
+
+  complex_perm_kinds_t kind = linear_loads_p (perm_cache, left_op[0]).first;
+  if (kind == PERM_EVENODD)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_TOP)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_EVENEVEN && !conj_first_operand)
+    {
+      ops->quick_push (left_op[0]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[1]);
+    }
+  else
+    {
+      ops->quick_push (left_op[0]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_mul_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_mul_pattern::build (vec_info *vinfo)
+{
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
+    vect_free_slp_tree (node);
+
+  /* First re-arrange the children.  */
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).create (2);
+  SLP_TREE_CHILDREN (*this->m_node).quick_push (this->m_ops[2]);
+  SLP_TREE_CHILDREN (*this->m_node).quick_push (
+    vect_build_combine_node (this->m_ops[0], this->m_ops[1], *this->m_node));
+  SLP_TREE_REF_COUNT (this->m_ops[2])++;
+
+  /* And then rewrite the node itself.  */
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, January 8, 2021 9:37 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: Re: [PATCH 5/8 v9]middle-end slp: support complex multiply and
> complex multiply conjugate
> 
> On Mon, 28 Dec 2020, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This adds support for complex multiply and complex multiply and
> > accumulate to the vect pattern detector.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
> > 	* optabs.def (cmul_optab, cmul_conj_optab): New.
> > 	* doc/md.texi: Document them.
> > 	* tree-vect-slp-patterns.c (vect_match_call_complex_mla,
> > 	vect_normalize_conj_loc, is_eq_or_top, vect_validate_multiplication,
> > 	vect_build_combine_node, class complex_mul_pattern,
> > 	complex_mul_pattern::matches, complex_mul_pattern::recognize,
> > 	complex_mul_pattern::build): New.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> >
> ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2ef
> e
> > fc1a333f8b3a 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -6202,6 +6202,50 @@ The operation is only supported for vector
> modes @var{m}.
> >
> >  This pattern is not allowed to @code{FAIL}.
> >
> > +@cindex @code{cmul@var{m}4} instruction pattern @item
> > +@samp{cmul@var{m}4} Perform a vector multiply that is semantically
> > +the same as multiply of complex numbers.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] = a[i] * b[i];
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> > +@cindex @code{cmul_conj@var{m}4} instruction pattern @item
> > +@samp{cmul_conj@var{m}4} Perform a vector multiply by conjugate that
> > +is semantically the same as a multiply of complex numbers where the
> > +second multiply arguments is conjugated.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] = a[i] * conj (b[i]);
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> >  @cindex @code{ffs@var{m}2} instruction pattern  @item
> > @samp{ffs@var{m}2}  Store into operand 0 one plus the index of the
> > least significant 1-bit diff --git a/gcc/internal-fn.def
> > b/gcc/internal-fn.def index
> >
> 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e6
> 0f69
> > 96b28164ae38 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX,
> ECF_CONST, fmax,
> > binary)  DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
> > DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90,
> binary)
> > DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST,
> cadd270, binary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST,
> cmul_conj,
> > +binary)
> >
> >
> >  /* FP scales.  */
> > diff --git a/gcc/optabs.def b/gcc/optabs.def index
> >
> e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761
> a56
> > 0b7fbcb69ce1 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
> OPTAB_D
> > (xorsign_optab, "xorsign$F$a3")  OPTAB_D (cadd90_optab, "cadd90$a3")
> > OPTAB_D (cadd270_optab, "cadd270$a3")
> > +OPTAB_D (cmul_optab, "cmul$a3")
> > +OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
> >  OPTAB_D (cos_optab, "cos$a2")
> >  OPTAB_D (cosh_optab, "cosh$a2")
> >  OPTAB_D (exp10_optab, "exp10$a2")
> > diff --git a/gcc/tree-vect-slp-patterns.c
> > b/gcc/tree-vect-slp-patterns.c index
> >
> dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..82721acbab8cf81c4d6f9954c9
> 8f
> > b913a7bb6282 100644
> > --- a/gcc/tree-vect-slp-patterns.c
> > +++ b/gcc/tree-vect-slp-patterns.c
> > @@ -719,6 +719,368 @@ complex_add_pattern::recognize
> (slp_tree_to_load_perm_map_t *perm_cache,
> >    return new complex_add_pattern (node, &ops, ifn);  }
> >
> >
> +/*********************************************************
> ***********
> > +***********
> > + * complex_mul_pattern
> > +
> >
> +*********************************************************
> ************
> > +*********/
> > +
> > +/* Helper function of that looks for a match in the CHILDth child of NODE.
> The
> > +   child used is stored in RES.
> > +
> > +   If the match is successful then ARGS will contain the operands matched
> > +   and the complex_operation_t type is returned.  If match is not
> successful
> > +   then CMPLX_NONE is returned and ARGS is left unmodified.  */
> > +
> > +static inline complex_operation_t
> > +vect_match_call_complex_mla (slp_tree node, unsigned child,
> > +			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
> {
> > +  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
> > +
> > +  slp_tree data = SLP_TREE_CHILDREN (node)[child];
> > +
> > +  if (res)
> > +    *res = data;
> > +
> > +  return vect_detect_pair_op (data, false, args); }
> > +
> > +/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the
> first
> > +   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
> > +
> > +   If a negate is found then the values in ARGS are reordered such that the
> > +   negate node is always the second one and the entry is replaced by the
> child
> > +   of the negate node.  */
> > +
> > +static inline bool
> > +vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p =
> > +NULL) {
> > +  gcc_assert (args.length () == 2);
> > +  bool neg_found = false;
> > +
> > +  if (vect_match_expression_p (args[0], NEGATE_EXPR))
> > +    {
> > +      std::swap (args[0], args[1]);
> > +      neg_found = true;
> > +      if (neg_first_p)
> > +	*neg_first_p = true;
> > +    }
> > +  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
> > +    {
> > +      neg_found = true;
> > +      if (neg_first_p)
> > +	*neg_first_p = false;
> > +    }
> > +
> > +  if (neg_found)
> > +    args[1] = SLP_TREE_CHILDREN (args[1])[0];
> > +
> > +  return neg_found;
> > +}
> > +
> > +/* Helper function to check if PERM is KIND or PERM_TOP.  */
> > +
> > +static inline bool
> > +is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
> {
> > +  return perm.first == kind || perm.first == PERM_TOP; }
> > +
> > +/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both
> MULT_EXPR
> > +   nodes but also that they represent an operation that is either a complex
> > +   multiplication or a complex multiplication by conjugated value.
> > +
> > +   Of the negation is expected to be in the first half of the tree (As
> required
> > +   by an FMS pattern) then NEG_FIRST is true.  If the operation is a
> conjugate
> > +   operation then CONJ_FIRST_OPERAND is set to indicate whether the
> first or
> > +   second operand contains the conjugate operation.  */
> > +
> > +static inline bool
> > +vect_validate_multiplication (slp_tree_to_load_perm_map_t
> *perm_cache,
> > +			     vec<slp_tree> left_op, vec<slp_tree> right_op,
> > +			     bool neg_first, bool *conj_first_operand,
> > +			     bool fms)
> > +{
> > +  /* The presence of a negation indicates that we have either a conjugate
> or a
> > +     rotation.  We need to distinguish which one.  */
> > +  *conj_first_operand = false;
> > +  complex_perm_kinds_t kind;
> > +
> > +  /* Complex conjugates have the negation on the imaginary part of the
> > +     number where rotations affect the real component.  So check if the
> > +     negation is on a dup of lane 1.  */
> > +  if (fms)
> > +    {
> > +      /* Canonicalization for fms is not consistent. So have to test both
> > +	 variants to be sure.  This needs to be fixed in the mid-end so
> > +	 this part can be simpler.  */
> > +      kind = linear_loads_p (perm_cache, right_op[0]).first;
> > +      if (!((kind == PERM_ODDODD
> > +	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> > +			     PERM_ODDEVEN))
> > +	  || (kind == PERM_ODDEVEN
> > +	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> > +			     PERM_ODDODD))))
> > +	return false;
> > +    }
> > +  else
> > +    {
> > +      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
> > +	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
> > +			    PERM_ODDEVEN))
> > +	return false;
> > +    }
> > +
> > +  /* Deal with differences in indexes.  */  int index1 = fms ? 1 : 0;
> > + int index2 = fms ? 0 : 1;
> > +
> > +  /* Check if the conjugate is on the second first or second operand.  The
> > +     order of the node with the conjugate value determines this, and the
> dup
> > +     node must be one of lane 0 of the same DR as the neg node.  */
> > +  kind = linear_loads_p (perm_cache, left_op[index1]).first;
> > +  if (kind == PERM_TOP)
> > +    {
> > +      if (linear_loads_p (perm_cache, left_op[index2]).first ==
> PERM_EVENODD)
> > +	return true;
> > +    }
> > +  else if (kind == PERM_EVENODD)
> > +    {
> > +      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) ==
> PERM_EVENODD)
> > +	return false;
> > +    }
> > +  else if (!neg_first)
> > +    *conj_first_operand = true;
> > +  else
> > +    return false;
> > +
> > +  if (kind != PERM_EVENEVEN)
> > +    return false;
> > +
> > +  return true;
> > +}
> > +
> > +/* Helper function to help distinguish between a conjugate and a rotation
> in a
> > +   complex multiplication.  The operations have similar shapes but the
> order of
> > +   the load permutes are different.  This function returns TRUE when the
> order
> > +   is consistent with a multiplication or multiplication by conjugated
> > +   operand but returns FALSE if it's a multiplication by rotated
> > +operand.  */
> > +
> > +static inline bool
> > +vect_validate_multiplication (slp_tree_to_load_perm_map_t
> *perm_cache,
> > +			     vec<slp_tree> op, complex_perm_kinds_t
> permKind) {
> > +  /* The left node is the more common case, test it first.  */
> > +  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
> > +    {
> > +      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
> > +	return false;
> > +    }
> > +  return true;
> > +}
> > +
> > +/* This function combines two nodes containing only even and only odd
> lanes
> > +   together into a single node which contains the nodes in even/odd order
> > +   by using a lane permute.  */
> > +
> > +static slp_tree
> > +vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep) {
> > +  auto_vec<slp_tree> nodes;
> > +  nodes.create (2);
> > +  vec<std::pair<unsigned, unsigned> > perm;
> > +  perm.create (SLP_TREE_LANES (rep));
> > +
> > +  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
> > +    {
> > +      perm.quick_push (std::make_pair (0, x));
> > +      perm.quick_push (std::make_pair (1, x));
> > +    }
> 
> That looks wrong, it creates {0,0}, {1, 0}, {0, 2}, {1, 2} but you want {0, 0}, {1, 0},
> {0, 1}, {1, 1} AFAICS.  At least I assume SLP_TREE_LANES (odd/even) ==
> SLP_TREE_LANES (rep) / 2?
> 
> 'rep' isn't documented, I assume it's supoosed to be a "representative"
> for the result?
> 
> > +
> > +  nodes.quick_push (even);
> > +  nodes.quick_push (odd);
> 
> No need for this intermediate nodes array, just push to ...
> 
> > +  SLP_TREE_REF_COUNT (even)++;
> > +  SLP_TREE_REF_COUNT (odd)++;
> > +
> > +  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE
> > + (even));  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
> > + SLP_TREE_LANE_PERMUTATION (vnode) = perm;  SLP_TREE_CHILDREN
> > + (vnode).safe_splice (nodes);
> 
> ... the children array directly (even with quick_push, we've already allocated
> 2 elements for the children).
> 
> > +  SLP_TREE_REF_COUNT (vnode) = 1;
> > +  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);  gcc_assert
> > + (perm.length () == SLP_TREE_LANES (vnode));
> > +  /* Representation is set to that of the current node as the vectorizer
> > +     can't deal with VEC_PERMs with no representation, as would be the
> > +     case with invariants.  */
> 
> Yeah, I need to fix this ...
> 
> > +  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
> > +  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
> > +  return vnode;
> > +}
> > +
> > +class complex_mul_pattern : public complex_pattern {
> > +  protected:
> > +    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> internal_fn ifn)
> > +      : complex_pattern (node, m_ops, ifn)
> > +    {
> > +      this->m_num_args = 2;
> > +    }
> > +
> > +  public:
> > +    void build (vec_info *);
> > +    static internal_fn
> > +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> slp_tree *,
> > +	     vec<slp_tree> *);
> > +
> > +    static vect_pattern*
> > +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> > +
> > +    static vect_pattern*
> > +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> > +    {
> > +      return new complex_mul_pattern (node, m_ops, ifn);
> > +    }
> > +
> > +};
> > +
> > +/* Pattern matcher for trying to match complex multiply pattern in SLP
> tree
> > +   If the operation matches then IFN is set to the operation it matched
> > +   and the arguments to the two replacement statements are put in
> m_ops.
> > +
> > +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> > +
> > +   This function matches the patterns shaped as:
> > +
> > +   double ax = (b[i+1] * a[i]);
> > +   double bx = (a[i+1] * b[i]);
> > +
> > +   c[i] = c[i] - ax;
> > +   c[i+1] = c[i+1] + bx;
> > +
> > +   If a match occurred then TRUE is returned, else FALSE.  The initial match
> is
> > +   expected to be in OP1 and the initial match operands in args0.  */
> > +
> > +internal_fn
> > +complex_mul_pattern::matches (complex_operation_t op,
> > +			      slp_tree_to_load_perm_map_t *perm_cache,
> > +			      slp_tree *node, vec<slp_tree> *ops) {
> > +  internal_fn ifn = IFN_LAST;
> > +
> > +  if (op != MINUS_PLUS)
> > +    return IFN_LAST;
> > +
> > +  slp_tree root = *node;
> > +  /* First two nodes must be a multiply.  */  auto_vec<slp_tree>
> > + muls;  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
> > +      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
> > +    return IFN_LAST;
> > +
> > +  /* Now operand2+4 may lead to another expression.  */
> > + auto_vec<slp_tree> left_op, right_op;  left_op.safe_splice
> > + (SLP_TREE_CHILDREN (muls[0]));  right_op.safe_splice
> > + (SLP_TREE_CHILDREN (muls[1]));
> > +
> > +  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
> > +    return IFN_LAST;
> > +
> > +  bool neg_first;
> > +  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
> > +
> > +  if (!is_neg)
> > +    {
> > +      /* A multiplication needs to multiply agains the real pair, otherwise
> > +	 the pattern matches that of FMS.   */
> > +      if (!vect_validate_multiplication (perm_cache, left_op,
> PERM_EVENEVEN)
> > +	  || vect_normalize_conj_loc (left_op))
> > +	return IFN_LAST;
> > +      ifn = IFN_COMPLEX_MUL;
> > +    }
> > +  else if (is_neg)
> > +    {
> > +      bool conj_first_operand;
> > +      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
> > +					 neg_first, &conj_first_operand,
> > +					 false))
> > +	return IFN_LAST;
> > +
> > +      ifn = IFN_COMPLEX_MUL_CONJ;
> > +    }
> > +
> > +  if (!vect_pattern_validate_optab (ifn, *node))
> > +    return IFN_LAST;
> > +
> > +  ops->truncate (0);
> > +  ops->create (3);
> > +
> > +  complex_perm_kinds_t kind = linear_loads_p (perm_cache,
> > + left_op[0]).first;  if (kind == PERM_EVENODD)
> > +    {
> > +      ops->quick_push (left_op[1]);
> > +      ops->quick_push (right_op[1]);
> > +      ops->quick_push (left_op[0]);
> > +    }
> > +  else if (kind == PERM_TOP)
> > +    {
> > +      ops->quick_push (left_op[1]);
> > +      ops->quick_push (right_op[1]);
> > +      ops->quick_push (left_op[0]);
> > +    }
> > +  else
> > +    {
> > +      ops->quick_push (left_op[0]);
> > +      ops->quick_push (right_op[0]);
> > +      ops->quick_push (left_op[1]);
> > +    }
> > +
> > +  return ifn;
> > +}
> > +
> > +/* Attempt to recognize a complex mul pattern.  */
> > +
> > +vect_pattern*
> > +complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t
> *perm_cache,
> > +				slp_tree *node)
> > +{
> > +  auto_vec<slp_tree> ops;
> > +  complex_operation_t op
> > +    = vect_detect_pair_op (*node, true, &ops);
> > +  internal_fn ifn
> > +    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
> > +  if (ifn == IFN_LAST)
> > +    return NULL;
> > +
> > +  return new complex_mul_pattern (node, &ops, ifn); }
> > +
> > +/* Perform a replacement of the detected complex mul pattern with the
> new
> > +   instruction sequences.  */
> > +
> > +void
> > +complex_mul_pattern::build (vec_info *vinfo) {
> > +  auto_vec<slp_tree> nodes;
> > +
> > +  /* First re-arrange the children.  */  nodes.create (2);
> > +
> > +  nodes.quick_push (this->m_ops[2]);
> > +  nodes.quick_push (
> > +    vect_build_combine_node (this->m_ops[0], this->m_ops[1],
> > + *this->m_node));  SLP_TREE_REF_COUNT (this->m_ops[2])++;
> > +
> > +  slp_tree node;
> > +  unsigned i;
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
> > +    vect_free_slp_tree (node);
> > +
> > +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> SLP_TREE_CHILDREN
> > + (*this->m_node).safe_splice (nodes);
> 
> please elide the nodes array.  *this->m_node now has a "wrong"
> representative but I guess
> 
> > +  complex_pattern::build (vinfo);
> 
> will fix that up?  I still find the structure of the pattern matching & transform
> hard to follow.  But well - I've settled with the idea of refactoring it for next
> stage1 after the fact ;)

Indeed it does, I can flip order if replacing them last is clearer.

For next stage1 I wonder if it's not easier to not have build_slp produce the two_operands
At all and have a generic expander that targets can override.  This of course would require
It to change the VF, but will have to to support LDn/STn in SLP anyway.

But this would simplify things a lot.  I have some ideas but will write them up to get a review on
the idea before I start implementing.

Thanks for allowing this version in!

Regards,
Tamar

> 
> Thanks,
> Richard.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: pr13960.patch --]
[-- Type: text/x-diff; name="pr13960.patch", Size: 15919 bytes --]

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6202,6 +6202,50 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmul@var{m}4} instruction pattern
+@item @samp{cmul@var{m}4}
+Perform a vector multiply that is semantically the same as multiply of
+complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmul_conj@var{m}4} instruction pattern
+@item @samp{cmul_conj@var{m}4}
+Perform a vector multiply by conjugate that is semantically the same as a
+multiply of complex numbers where the second multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] = a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{ffs@var{m}2} instruction pattern
 @item @samp{ffs@var{m}2}
 Store into operand 0 one plus the index of the least significant 1-bit
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e60f6996b28164ae38 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
 DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
 
 
 /* FP scales.  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761a560b7fbcb69ce1 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
 OPTAB_D (xorsign_optab, "xorsign$F$a3")
 OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
+OPTAB_D (cmul_optab, "cmul$a3")
+OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..fb58b45602f00a440ef7c27853276945ba696522 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -719,6 +719,375 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
   return new complex_add_pattern (node, &ops, ifn);
 }
 
+/*******************************************************************************
+ * complex_mul_pattern
+ ******************************************************************************/
+
+/* Helper function of that looks for a match in the CHILDth child of NODE.  The
+   child used is stored in RES.
+
+   If the match is successful then ARGS will contain the operands matched
+   and the complex_operation_t type is returned.  If match is not successful
+   then CMPLX_NONE is returned and ARGS is left unmodified.  */
+
+static inline complex_operation_t
+vect_match_call_complex_mla (slp_tree node, unsigned child,
+			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
+{
+  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
+
+  slp_tree data = SLP_TREE_CHILDREN (node)[child];
+
+  if (res)
+    *res = data;
+
+  return vect_detect_pair_op (data, false, args);
+}
+
+/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the first
+   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
+
+   If a negate is found then the values in ARGS are reordered such that the
+   negate node is always the second one and the entry is replaced by the child
+   of the negate node.  */
+
+static inline bool
+vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p = NULL)
+{
+  gcc_assert (args.length () == 2);
+  bool neg_found = false;
+
+  if (vect_match_expression_p (args[0], NEGATE_EXPR))
+    {
+      std::swap (args[0], args[1]);
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = true;
+    }
+  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
+    {
+      neg_found = true;
+      if (neg_first_p)
+	*neg_first_p = false;
+    }
+
+  if (neg_found)
+    args[1] = SLP_TREE_CHILDREN (args[1])[0];
+
+  return neg_found;
+}
+
+/* Helper function to check if PERM is KIND or PERM_TOP.  */
+
+static inline bool
+is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
+{
+  return perm.first == kind || perm.first == PERM_TOP;
+}
+
+/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both MULT_EXPR
+   nodes but also that they represent an operation that is either a complex
+   multiplication or a complex multiplication by conjugated value.
+
+   Of the negation is expected to be in the first half of the tree (As required
+   by an FMS pattern) then NEG_FIRST is true.  If the operation is a conjugate
+   operation then CONJ_FIRST_OPERAND is set to indicate whether the first or
+   second operand contains the conjugate operation.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> left_op, vec<slp_tree> right_op,
+			     bool neg_first, bool *conj_first_operand,
+			     bool fms)
+{
+  /* The presence of a negation indicates that we have either a conjugate or a
+     rotation.  We need to distinguish which one.  */
+  *conj_first_operand = false;
+  complex_perm_kinds_t kind;
+
+  /* Complex conjugates have the negation on the imaginary part of the
+     number where rotations affect the real component.  So check if the
+     negation is on a dup of lane 1.  */
+  if (fms)
+    {
+      /* Canonicalization for fms is not consistent. So have to test both
+	 variants to be sure.  This needs to be fixed in the mid-end so
+	 this part can be simpler.  */
+      kind = linear_loads_p (perm_cache, right_op[0]).first;
+      if (!((kind == PERM_ODDODD
+	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDEVEN))
+	  || (kind == PERM_ODDEVEN
+	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
+			     PERM_ODDODD))))
+	return false;
+    }
+  else
+    {
+      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
+	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
+			    PERM_ODDEVEN))
+	return false;
+    }
+
+  /* Deal with differences in indexes.  */
+  int index1 = fms ? 1 : 0;
+  int index2 = fms ? 0 : 1;
+
+  /* Check if the conjugate is on the second first or second operand.  The
+     order of the node with the conjugate value determines this, and the dup
+     node must be one of lane 0 of the same DR as the neg node.  */
+  kind = linear_loads_p (perm_cache, left_op[index1]).first;
+  if (kind == PERM_TOP)
+    {
+      if (linear_loads_p (perm_cache, left_op[index2]).first == PERM_EVENODD)
+	return true;
+    }
+  else if (kind == PERM_EVENODD)
+    {
+      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) == PERM_EVENODD)
+	return false;
+    }
+  else if (!neg_first)
+    *conj_first_operand = true;
+  else
+    return false;
+
+  if (kind != PERM_EVENEVEN)
+    return false;
+
+  return true;
+}
+
+/* Helper function to help distinguish between a conjugate and a rotation in a
+   complex multiplication.  The operations have similar shapes but the order of
+   the load permutes are different.  This function returns TRUE when the order
+   is consistent with a multiplication or multiplication by conjugated
+   operand but returns FALSE if it's a multiplication by rotated operand.  */
+
+static inline bool
+vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
+			     vec<slp_tree> op, complex_perm_kinds_t permKind)
+{
+  /* The left node is the more common case, test it first.  */
+  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
+    {
+      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
+	return false;
+    }
+  return true;
+}
+
+/* This function combines two nodes containing only even and only odd lanes
+   together into a single node which contains the nodes in even/odd order
+   by using a lane permute.
+
+   The lanes in EVEN and ODD are duplicated 2 times inside the vectors.
+   So for a lanes = 4 EVEN contains {EVEN1, EVEN1, EVEN2, EVEN2}.
+
+   The tree REPRESENTATION is taken from the supplied REP along with the
+   vectype which must be the same between all three nodes.
+*/
+
+static slp_tree
+vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep)
+{
+  vec<std::pair<unsigned, unsigned> > perm;
+  perm.create (SLP_TREE_LANES (rep));
+
+  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
+    {
+      perm.quick_push (std::make_pair (0, x));
+      perm.quick_push (std::make_pair (1, x+1));
+    }
+
+  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE (even));
+  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
+  SLP_TREE_LANE_PERMUTATION (vnode) = perm;
+
+  SLP_TREE_CHILDREN (vnode).create (2);
+  SLP_TREE_CHILDREN (vnode).quick_push (even);
+  SLP_TREE_CHILDREN (vnode).quick_push (odd);
+  SLP_TREE_REF_COUNT (even)++;
+  SLP_TREE_REF_COUNT (odd)++;
+  SLP_TREE_REF_COUNT (vnode) = 1;
+
+  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);
+  gcc_assert (perm.length () == SLP_TREE_LANES (vnode));
+  /* Representation is set to that of the current node as the vectorizer
+     can't deal with VEC_PERMs with no representation, as would be the
+     case with invariants.  */
+  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
+  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
+  return vnode;
+}
+
+class complex_mul_pattern : public complex_pattern
+{
+  protected:
+    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 2;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_mul_pattern (node, m_ops, ifn);
+    }
+
+};
+
+/* Pattern matcher for trying to match complex multiply pattern in SLP tree
+   If the operation matches then IFN is set to the operation it matched
+   and the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]);
+   double bx = (a[i+1] * b[i]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The initial match is
+   expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_mul_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t *perm_cache,
+			      slp_tree *node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  if (op != MINUS_PLUS)
+    return IFN_LAST;
+
+  slp_tree root = *node;
+  /* First two nodes must be a multiply.  */
+  auto_vec<slp_tree> muls;
+  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
+      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
+    return IFN_LAST;
+
+  /* Now operand2+4 may lead to another expression.  */
+  auto_vec<slp_tree> left_op, right_op;
+  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
+  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
+
+  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
+    return IFN_LAST;
+
+  bool neg_first = false;
+  bool conj_first_operand = false;
+  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
+
+  if (!is_neg)
+    {
+      /* A multiplication needs to multiply agains the real pair, otherwise
+	 the pattern matches that of FMS.   */
+      if (!vect_validate_multiplication (perm_cache, left_op, PERM_EVENEVEN)
+	  || vect_normalize_conj_loc (left_op))
+	return IFN_LAST;
+      ifn = IFN_COMPLEX_MUL;
+    }
+  else if (is_neg)
+    {
+      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
+					 neg_first, &conj_first_operand,
+					 false))
+	return IFN_LAST;
+
+      ifn = IFN_COMPLEX_MUL_CONJ;
+    }
+
+  if (!vect_pattern_validate_optab (ifn, *node))
+    return IFN_LAST;
+
+  ops->truncate (0);
+  ops->create (3);
+
+  complex_perm_kinds_t kind = linear_loads_p (perm_cache, left_op[0]).first;
+  if (kind == PERM_EVENODD)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_TOP)
+    {
+      ops->quick_push (left_op[1]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_EVENEVEN && !conj_first_operand)
+    {
+      ops->quick_push (left_op[0]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[1]);
+    }
+  else
+    {
+      ops->quick_push (left_op[0]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_mul_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_mul_pattern::build (vec_info *vinfo)
+{
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
+    vect_free_slp_tree (node);
+
+  /* First re-arrange the children.  */
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).create (2);
+  SLP_TREE_CHILDREN (*this->m_node).quick_push (this->m_ops[2]);
+  SLP_TREE_CHILDREN (*this->m_node).quick_push (
+    vect_build_combine_node (this->m_ops[0], this->m_ops[1], *this->m_node));
+  SLP_TREE_REF_COUNT (this->m_ops[2])++;
+
+  /* And then rewrite the node itself.  */
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate
  2021-01-11 11:01     ` Tamar Christina
@ 2021-01-11 12:04       ` Richard Biener
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-11 12:04 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd

On Mon, 11 Jan 2021, Tamar Christina wrote:

> Hi Richi,
> 
> This adds support for complex multiply and complex multiply and accumulate to
> the vect pattern detector.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master? (I replied to a comment at the very end of the mail from the previous patch)
> 
> Example of instructions matched:
> 
> #include <stdio.h>
> #include <complex.h>
> 
> #define N 200
> #define ROT
> #define TYPE float
> #define TYPE2 float
> 
> 
> void g (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
> {
>   for (int i=0; i < N; i++)
>     {
>       c[i] =  a[i] * (b[i] ROT);
>     }
> }
> 
> void g_f1 (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
> {
>   for (int i=0; i < N; i++)
>     {
>       c[i] =  conjf (a[i]) * (b[i] ROT);
>     }
> }
> 
> void g_s1 (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
> {
>   for (int i=0; i < N; i++)
>     {
>       c[i] =  a[i] * conjf (b[i] ROT);
>     }
> }
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
> 	* optabs.def (cmul_optab, cmul_conj_optab): New.
> 	* doc/md.texi: Document them.
> 	* tree-vect-slp-patterns.c (vect_match_call_complex_mla,
> 	vect_normalize_conj_loc, is_eq_or_top, vect_validate_multiplication,
> 	vect_build_combine_node, class complex_mul_pattern,
> 	complex_mul_pattern::matches, complex_mul_pattern::recognize,
> 	complex_mul_pattern::build): New.
> 
> --- inline copy of patch --
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -6202,6 +6202,50 @@ The operation is only supported for vector modes @var{m}.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> +@cindex @code{cmul@var{m}4} instruction pattern
> +@item @samp{cmul@var{m}4}
> +Perform a vector multiply that is semantically the same as multiply of
> +complex numbers.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] = a[i] * b[i];
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
> +@cindex @code{cmul_conj@var{m}4} instruction pattern
> +@item @samp{cmul_conj@var{m}4}
> +Perform a vector multiply by conjugate that is semantically the same as a
> +multiply of complex numbers where the second multiply arguments is conjugated.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] = a[i] * conj (b[i]);
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{ffs@var{m}2} instruction pattern
>  @item @samp{ffs@var{m}2}
>  Store into operand 0 one plus the index of the least significant 1-bit
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e60f6996b28164ae38 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX, ECF_CONST, fmax, binary)
>  DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
>  
>  
>  /* FP scales.  */
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761a560b7fbcb69ce1 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
>  OPTAB_D (xorsign_optab, "xorsign$F$a3")
>  OPTAB_D (cadd90_optab, "cadd90$a3")
>  OPTAB_D (cadd270_optab, "cadd270$a3")
> +OPTAB_D (cmul_optab, "cmul$a3")
> +OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
>  OPTAB_D (cos_optab, "cos$a2")
>  OPTAB_D (cosh_optab, "cosh$a2")
>  OPTAB_D (exp10_optab, "exp10$a2")
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..fb58b45602f00a440ef7c27853276945ba696522 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -719,6 +719,375 @@ complex_add_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
>    return new complex_add_pattern (node, &ops, ifn);
>  }
>  
> +/*******************************************************************************
> + * complex_mul_pattern
> + ******************************************************************************/
> +
> +/* Helper function of that looks for a match in the CHILDth child of NODE.  The
> +   child used is stored in RES.
> +
> +   If the match is successful then ARGS will contain the operands matched
> +   and the complex_operation_t type is returned.  If match is not successful
> +   then CMPLX_NONE is returned and ARGS is left unmodified.  */
> +
> +static inline complex_operation_t
> +vect_match_call_complex_mla (slp_tree node, unsigned child,
> +			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
> +{
> +  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
> +
> +  slp_tree data = SLP_TREE_CHILDREN (node)[child];
> +
> +  if (res)
> +    *res = data;
> +
> +  return vect_detect_pair_op (data, false, args);
> +}
> +
> +/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the first
> +   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
> +
> +   If a negate is found then the values in ARGS are reordered such that the
> +   negate node is always the second one and the entry is replaced by the child
> +   of the negate node.  */
> +
> +static inline bool
> +vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p = NULL)
> +{
> +  gcc_assert (args.length () == 2);
> +  bool neg_found = false;
> +
> +  if (vect_match_expression_p (args[0], NEGATE_EXPR))
> +    {
> +      std::swap (args[0], args[1]);
> +      neg_found = true;
> +      if (neg_first_p)
> +	*neg_first_p = true;
> +    }
> +  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
> +    {
> +      neg_found = true;
> +      if (neg_first_p)
> +	*neg_first_p = false;
> +    }
> +
> +  if (neg_found)
> +    args[1] = SLP_TREE_CHILDREN (args[1])[0];
> +
> +  return neg_found;
> +}
> +
> +/* Helper function to check if PERM is KIND or PERM_TOP.  */
> +
> +static inline bool
> +is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
> +{
> +  return perm.first == kind || perm.first == PERM_TOP;
> +}
> +
> +/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both MULT_EXPR
> +   nodes but also that they represent an operation that is either a complex
> +   multiplication or a complex multiplication by conjugated value.
> +
> +   Of the negation is expected to be in the first half of the tree (As required
> +   by an FMS pattern) then NEG_FIRST is true.  If the operation is a conjugate
> +   operation then CONJ_FIRST_OPERAND is set to indicate whether the first or
> +   second operand contains the conjugate operation.  */
> +
> +static inline bool
> +vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
> +			     vec<slp_tree> left_op, vec<slp_tree> right_op,
> +			     bool neg_first, bool *conj_first_operand,
> +			     bool fms)
> +{
> +  /* The presence of a negation indicates that we have either a conjugate or a
> +     rotation.  We need to distinguish which one.  */
> +  *conj_first_operand = false;
> +  complex_perm_kinds_t kind;
> +
> +  /* Complex conjugates have the negation on the imaginary part of the
> +     number where rotations affect the real component.  So check if the
> +     negation is on a dup of lane 1.  */
> +  if (fms)
> +    {
> +      /* Canonicalization for fms is not consistent. So have to test both
> +	 variants to be sure.  This needs to be fixed in the mid-end so
> +	 this part can be simpler.  */
> +      kind = linear_loads_p (perm_cache, right_op[0]).first;
> +      if (!((kind == PERM_ODDODD
> +	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> +			     PERM_ODDEVEN))
> +	  || (kind == PERM_ODDEVEN
> +	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> +			     PERM_ODDODD))))
> +	return false;
> +    }
> +  else
> +    {
> +      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
> +	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
> +			    PERM_ODDEVEN))
> +	return false;
> +    }
> +
> +  /* Deal with differences in indexes.  */
> +  int index1 = fms ? 1 : 0;
> +  int index2 = fms ? 0 : 1;
> +
> +  /* Check if the conjugate is on the second first or second operand.  The
> +     order of the node with the conjugate value determines this, and the dup
> +     node must be one of lane 0 of the same DR as the neg node.  */
> +  kind = linear_loads_p (perm_cache, left_op[index1]).first;
> +  if (kind == PERM_TOP)
> +    {
> +      if (linear_loads_p (perm_cache, left_op[index2]).first == PERM_EVENODD)
> +	return true;
> +    }
> +  else if (kind == PERM_EVENODD)
> +    {
> +      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) == PERM_EVENODD)
> +	return false;
> +    }
> +  else if (!neg_first)
> +    *conj_first_operand = true;
> +  else
> +    return false;
> +
> +  if (kind != PERM_EVENEVEN)
> +    return false;
> +
> +  return true;
> +}
> +
> +/* Helper function to help distinguish between a conjugate and a rotation in a
> +   complex multiplication.  The operations have similar shapes but the order of
> +   the load permutes are different.  This function returns TRUE when the order
> +   is consistent with a multiplication or multiplication by conjugated
> +   operand but returns FALSE if it's a multiplication by rotated operand.  */
> +
> +static inline bool
> +vect_validate_multiplication (slp_tree_to_load_perm_map_t *perm_cache,
> +			     vec<slp_tree> op, complex_perm_kinds_t permKind)
> +{
> +  /* The left node is the more common case, test it first.  */
> +  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
> +    {
> +      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
> +	return false;
> +    }
> +  return true;
> +}
> +
> +/* This function combines two nodes containing only even and only odd lanes
> +   together into a single node which contains the nodes in even/odd order
> +   by using a lane permute.
> +
> +   The lanes in EVEN and ODD are duplicated 2 times inside the vectors.
> +   So for a lanes = 4 EVEN contains {EVEN1, EVEN1, EVEN2, EVEN2}.
> +
> +   The tree REPRESENTATION is taken from the supplied REP along with the
> +   vectype which must be the same between all three nodes.
> +*/
> +
> +static slp_tree
> +vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep)
> +{
> +  vec<std::pair<unsigned, unsigned> > perm;
> +  perm.create (SLP_TREE_LANES (rep));
> +
> +  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
> +    {
> +      perm.quick_push (std::make_pair (0, x));
> +      perm.quick_push (std::make_pair (1, x+1));
> +    }
> +
> +  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE (even));
> +  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
> +  SLP_TREE_LANE_PERMUTATION (vnode) = perm;
> +
> +  SLP_TREE_CHILDREN (vnode).create (2);
> +  SLP_TREE_CHILDREN (vnode).quick_push (even);
> +  SLP_TREE_CHILDREN (vnode).quick_push (odd);
> +  SLP_TREE_REF_COUNT (even)++;
> +  SLP_TREE_REF_COUNT (odd)++;
> +  SLP_TREE_REF_COUNT (vnode) = 1;
> +
> +  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);
> +  gcc_assert (perm.length () == SLP_TREE_LANES (vnode));
> +  /* Representation is set to that of the current node as the vectorizer
> +     can't deal with VEC_PERMs with no representation, as would be the
> +     case with invariants.  */
> +  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
> +  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
> +  return vnode;
> +}
> +
> +class complex_mul_pattern : public complex_pattern
> +{
> +  protected:
> +    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +      : complex_pattern (node, m_ops, ifn)
> +    {
> +      this->m_num_args = 2;
> +    }
> +
> +  public:
> +    void build (vec_info *);
> +    static internal_fn
> +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
> +	     vec<slp_tree> *);
> +
> +    static vect_pattern*
> +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> +
> +    static vect_pattern*
> +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +    {
> +      return new complex_mul_pattern (node, m_ops, ifn);
> +    }
> +
> +};
> +
> +/* Pattern matcher for trying to match complex multiply pattern in SLP tree
> +   If the operation matches then IFN is set to the operation it matched
> +   and the arguments to the two replacement statements are put in m_ops.
> +
> +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> +
> +   This function matches the patterns shaped as:
> +
> +   double ax = (b[i+1] * a[i]);
> +   double bx = (a[i+1] * b[i]);
> +
> +   c[i] = c[i] - ax;
> +   c[i+1] = c[i+1] + bx;
> +
> +   If a match occurred then TRUE is returned, else FALSE.  The initial match is
> +   expected to be in OP1 and the initial match operands in args0.  */
> +
> +internal_fn
> +complex_mul_pattern::matches (complex_operation_t op,
> +			      slp_tree_to_load_perm_map_t *perm_cache,
> +			      slp_tree *node, vec<slp_tree> *ops)
> +{
> +  internal_fn ifn = IFN_LAST;
> +
> +  if (op != MINUS_PLUS)
> +    return IFN_LAST;
> +
> +  slp_tree root = *node;
> +  /* First two nodes must be a multiply.  */
> +  auto_vec<slp_tree> muls;
> +  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
> +      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
> +    return IFN_LAST;
> +
> +  /* Now operand2+4 may lead to another expression.  */
> +  auto_vec<slp_tree> left_op, right_op;
> +  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
> +  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
> +
> +  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
> +    return IFN_LAST;
> +
> +  bool neg_first = false;
> +  bool conj_first_operand = false;
> +  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
> +
> +  if (!is_neg)
> +    {
> +      /* A multiplication needs to multiply agains the real pair, otherwise
> +	 the pattern matches that of FMS.   */
> +      if (!vect_validate_multiplication (perm_cache, left_op, PERM_EVENEVEN)
> +	  || vect_normalize_conj_loc (left_op))
> +	return IFN_LAST;
> +      ifn = IFN_COMPLEX_MUL;
> +    }
> +  else if (is_neg)
> +    {
> +      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
> +					 neg_first, &conj_first_operand,
> +					 false))
> +	return IFN_LAST;
> +
> +      ifn = IFN_COMPLEX_MUL_CONJ;
> +    }
> +
> +  if (!vect_pattern_validate_optab (ifn, *node))
> +    return IFN_LAST;
> +
> +  ops->truncate (0);
> +  ops->create (3);
> +
> +  complex_perm_kinds_t kind = linear_loads_p (perm_cache, left_op[0]).first;
> +  if (kind == PERM_EVENODD)
> +    {
> +      ops->quick_push (left_op[1]);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (left_op[0]);
> +    }
> +  else if (kind == PERM_TOP)
> +    {
> +      ops->quick_push (left_op[1]);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (left_op[0]);
> +    }
> +  else if (kind == PERM_EVENEVEN && !conj_first_operand)
> +    {
> +      ops->quick_push (left_op[0]);
> +      ops->quick_push (right_op[0]);
> +      ops->quick_push (left_op[1]);
> +    }
> +  else
> +    {
> +      ops->quick_push (left_op[0]);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (left_op[1]);
> +    }
> +
> +  return ifn;
> +}
> +
> +/* Attempt to recognize a complex mul pattern.  */
> +
> +vect_pattern*
> +complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
> +				slp_tree *node)
> +{
> +  auto_vec<slp_tree> ops;
> +  complex_operation_t op
> +    = vect_detect_pair_op (*node, true, &ops);
> +  internal_fn ifn
> +    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn == IFN_LAST)
> +    return NULL;
> +
> +  return new complex_mul_pattern (node, &ops, ifn);
> +}
> +
> +/* Perform a replacement of the detected complex mul pattern with the new
> +   instruction sequences.  */
> +
> +void
> +complex_mul_pattern::build (vec_info *vinfo)
> +{
> +  slp_tree node;
> +  unsigned i;
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
> +    vect_free_slp_tree (node);
> +
> +  /* First re-arrange the children.  */
> +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> +  SLP_TREE_CHILDREN (*this->m_node).create (2);

So technically this is a leak - either do .release () instead
of truncate (0) or use .reserve_exact (2) in place of .create ()
(I prefer the latter since it avoids re-allocation in most cases).

OK with that change.

> +  SLP_TREE_CHILDREN (*this->m_node).quick_push (this->m_ops[2]);
> +  SLP_TREE_CHILDREN (*this->m_node).quick_push (
> +    vect_build_combine_node (this->m_ops[0], this->m_ops[1], *this->m_node));
> +  SLP_TREE_REF_COUNT (this->m_ops[2])++;
> +
> +  /* And then rewrite the node itself.  */
> +  complex_pattern::build (vinfo);
> +}
> +
>  /*******************************************************************************
>   * Pattern matching definitions
>   ******************************************************************************/
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Friday, January 8, 2021 9:37 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > Subject: Re: [PATCH 5/8 v9]middle-end slp: support complex multiply and
> > complex multiply conjugate
> > 
> > On Mon, 28 Dec 2020, Tamar Christina wrote:
> > 
> > > Hi All,
> > >
> > > This adds support for complex multiply and complex multiply and
> > > accumulate to the vect pattern detector.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* internal-fn.def (COMPLEX_MUL, COMPLEX_MUL_CONJ): New.
> > > 	* optabs.def (cmul_optab, cmul_conj_optab): New.
> > > 	* doc/md.texi: Document them.
> > > 	* tree-vect-slp-patterns.c (vect_match_call_complex_mla,
> > > 	vect_normalize_conj_loc, is_eq_or_top, vect_validate_multiplication,
> > > 	vect_build_combine_node, class complex_mul_pattern,
> > > 	complex_mul_pattern::matches, complex_mul_pattern::recognize,
> > > 	complex_mul_pattern::build): New.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > >
> > ec6ec180b91fcf9f481b6754c044483787fd923c..b8cc90e1a75e402abbf8a8cf2ef
> > e
> > > fc1a333f8b3a 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -6202,6 +6202,50 @@ The operation is only supported for vector
> > modes @var{m}.
> > >
> > >  This pattern is not allowed to @code{FAIL}.
> > >
> > > +@cindex @code{cmul@var{m}4} instruction pattern @item
> > > +@samp{cmul@var{m}4} Perform a vector multiply that is semantically
> > > +the same as multiply of complex numbers.
> > > +
> > > +@smallexample
> > > +  complex TYPE c[N];
> > > +  complex TYPE a[N];
> > > +  complex TYPE b[N];
> > > +  for (int i = 0; i < N; i += 1)
> > > +    @{
> > > +      c[i] = a[i] * b[i];
> > > +    @}
> > > +@end smallexample
> > > +
> > > +In GCC lane ordering the real part of the number must be in the even
> > > +lanes with the imaginary part in the odd lanes.
> > > +
> > > +The operation is only supported for vector modes @var{m}.
> > > +
> > > +This pattern is not allowed to @code{FAIL}.
> > > +
> > > +@cindex @code{cmul_conj@var{m}4} instruction pattern @item
> > > +@samp{cmul_conj@var{m}4} Perform a vector multiply by conjugate that
> > > +is semantically the same as a multiply of complex numbers where the
> > > +second multiply arguments is conjugated.
> > > +
> > > +@smallexample
> > > +  complex TYPE c[N];
> > > +  complex TYPE a[N];
> > > +  complex TYPE b[N];
> > > +  for (int i = 0; i < N; i += 1)
> > > +    @{
> > > +      c[i] = a[i] * conj (b[i]);
> > > +    @}
> > > +@end smallexample
> > > +
> > > +In GCC lane ordering the real part of the number must be in the even
> > > +lanes with the imaginary part in the odd lanes.
> > > +
> > > +The operation is only supported for vector modes @var{m}.
> > > +
> > > +This pattern is not allowed to @code{FAIL}.
> > > +
> > >  @cindex @code{ffs@var{m}2} instruction pattern  @item
> > > @samp{ffs@var{m}2}  Store into operand 0 one plus the index of the
> > > least significant 1-bit diff --git a/gcc/internal-fn.def
> > > b/gcc/internal-fn.def index
> > >
> > 511fe70162b5d9db3a61a5285d31c008f6835487..5a0bbe3fe5dee591d54130e6
> > 0f69
> > > 96b28164ae38 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -279,6 +279,8 @@ DEF_INTERNAL_FLT_FLOATN_FN (FMAX,
> > ECF_CONST, fmax,
> > > binary)  DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
> > > DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90,
> > binary)
> > > DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST,
> > cadd270, binary)
> > > +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
> > > +DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST,
> > cmul_conj,
> > > +binary)
> > >
> > >
> > >  /* FP scales.  */
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def index
> > >
> > e9727def4dbf941bb9ac8b56f83f8ea0f52b262c..e82396bae1117c6de91304761
> > a56
> > > 0b7fbcb69ce1 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -292,6 +292,8 @@ OPTAB_D (copysign_optab, "copysign$F$a3")
> > OPTAB_D
> > > (xorsign_optab, "xorsign$F$a3")  OPTAB_D (cadd90_optab, "cadd90$a3")
> > > OPTAB_D (cadd270_optab, "cadd270$a3")
> > > +OPTAB_D (cmul_optab, "cmul$a3")
> > > +OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
> > >  OPTAB_D (cos_optab, "cos$a2")
> > >  OPTAB_D (cosh_optab, "cosh$a2")
> > >  OPTAB_D (exp10_optab, "exp10$a2")
> > > diff --git a/gcc/tree-vect-slp-patterns.c
> > > b/gcc/tree-vect-slp-patterns.c index
> > >
> > dbc58f7c53868ed431fc67de1f0162eb0d3b2c24..82721acbab8cf81c4d6f9954c9
> > 8f
> > > b913a7bb6282 100644
> > > --- a/gcc/tree-vect-slp-patterns.c
> > > +++ b/gcc/tree-vect-slp-patterns.c
> > > @@ -719,6 +719,368 @@ complex_add_pattern::recognize
> > (slp_tree_to_load_perm_map_t *perm_cache,
> > >    return new complex_add_pattern (node, &ops, ifn);  }
> > >
> > >
> > +/*********************************************************
> > ***********
> > > +***********
> > > + * complex_mul_pattern
> > > +
> > >
> > +*********************************************************
> > ************
> > > +*********/
> > > +
> > > +/* Helper function of that looks for a match in the CHILDth child of NODE.
> > The
> > > +   child used is stored in RES.
> > > +
> > > +   If the match is successful then ARGS will contain the operands matched
> > > +   and the complex_operation_t type is returned.  If match is not
> > successful
> > > +   then CMPLX_NONE is returned and ARGS is left unmodified.  */
> > > +
> > > +static inline complex_operation_t
> > > +vect_match_call_complex_mla (slp_tree node, unsigned child,
> > > +			     vec<slp_tree> *args = NULL, slp_tree *res = NULL)
> > {
> > > +  gcc_assert (child < SLP_TREE_CHILDREN (node).length ());
> > > +
> > > +  slp_tree data = SLP_TREE_CHILDREN (node)[child];
> > > +
> > > +  if (res)
> > > +    *res = data;
> > > +
> > > +  return vect_detect_pair_op (data, false, args); }
> > > +
> > > +/* Check to see if either of the trees in ARGS are a NEGATE_EXPR.  If the
> > first
> > > +   child (args[0]) is a NEGATE_EXPR then NEG_FIRST_P is set to TRUE.
> > > +
> > > +   If a negate is found then the values in ARGS are reordered such that the
> > > +   negate node is always the second one and the entry is replaced by the
> > child
> > > +   of the negate node.  */
> > > +
> > > +static inline bool
> > > +vect_normalize_conj_loc (vec<slp_tree> args, bool *neg_first_p =
> > > +NULL) {
> > > +  gcc_assert (args.length () == 2);
> > > +  bool neg_found = false;
> > > +
> > > +  if (vect_match_expression_p (args[0], NEGATE_EXPR))
> > > +    {
> > > +      std::swap (args[0], args[1]);
> > > +      neg_found = true;
> > > +      if (neg_first_p)
> > > +	*neg_first_p = true;
> > > +    }
> > > +  else if (vect_match_expression_p (args[1], NEGATE_EXPR))
> > > +    {
> > > +      neg_found = true;
> > > +      if (neg_first_p)
> > > +	*neg_first_p = false;
> > > +    }
> > > +
> > > +  if (neg_found)
> > > +    args[1] = SLP_TREE_CHILDREN (args[1])[0];
> > > +
> > > +  return neg_found;
> > > +}
> > > +
> > > +/* Helper function to check if PERM is KIND or PERM_TOP.  */
> > > +
> > > +static inline bool
> > > +is_eq_or_top (complex_load_perm_t perm, complex_perm_kinds_t kind)
> > {
> > > +  return perm.first == kind || perm.first == PERM_TOP; }
> > > +
> > > +/* Helper function that checks to see if LEFT_OP and RIGHT_OP are both
> > MULT_EXPR
> > > +   nodes but also that they represent an operation that is either a complex
> > > +   multiplication or a complex multiplication by conjugated value.
> > > +
> > > +   Of the negation is expected to be in the first half of the tree (As
> > required
> > > +   by an FMS pattern) then NEG_FIRST is true.  If the operation is a
> > conjugate
> > > +   operation then CONJ_FIRST_OPERAND is set to indicate whether the
> > first or
> > > +   second operand contains the conjugate operation.  */
> > > +
> > > +static inline bool
> > > +vect_validate_multiplication (slp_tree_to_load_perm_map_t
> > *perm_cache,
> > > +			     vec<slp_tree> left_op, vec<slp_tree> right_op,
> > > +			     bool neg_first, bool *conj_first_operand,
> > > +			     bool fms)
> > > +{
> > > +  /* The presence of a negation indicates that we have either a conjugate
> > or a
> > > +     rotation.  We need to distinguish which one.  */
> > > +  *conj_first_operand = false;
> > > +  complex_perm_kinds_t kind;
> > > +
> > > +  /* Complex conjugates have the negation on the imaginary part of the
> > > +     number where rotations affect the real component.  So check if the
> > > +     negation is on a dup of lane 1.  */
> > > +  if (fms)
> > > +    {
> > > +      /* Canonicalization for fms is not consistent. So have to test both
> > > +	 variants to be sure.  This needs to be fixed in the mid-end so
> > > +	 this part can be simpler.  */
> > > +      kind = linear_loads_p (perm_cache, right_op[0]).first;
> > > +      if (!((kind == PERM_ODDODD
> > > +	   && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> > > +			     PERM_ODDEVEN))
> > > +	  || (kind == PERM_ODDEVEN
> > > +	      && is_eq_or_top (linear_loads_p (perm_cache, right_op[1]),
> > > +			     PERM_ODDODD))))
> > > +	return false;
> > > +    }
> > > +  else
> > > +    {
> > > +      if (linear_loads_p (perm_cache, right_op[1]).first != PERM_ODDODD
> > > +	  && !is_eq_or_top (linear_loads_p (perm_cache, right_op[0]),
> > > +			    PERM_ODDEVEN))
> > > +	return false;
> > > +    }
> > > +
> > > +  /* Deal with differences in indexes.  */  int index1 = fms ? 1 : 0;
> > > + int index2 = fms ? 0 : 1;
> > > +
> > > +  /* Check if the conjugate is on the second first or second operand.  The
> > > +     order of the node with the conjugate value determines this, and the
> > dup
> > > +     node must be one of lane 0 of the same DR as the neg node.  */
> > > +  kind = linear_loads_p (perm_cache, left_op[index1]).first;
> > > +  if (kind == PERM_TOP)
> > > +    {
> > > +      if (linear_loads_p (perm_cache, left_op[index2]).first ==
> > PERM_EVENODD)
> > > +	return true;
> > > +    }
> > > +  else if (kind == PERM_EVENODD)
> > > +    {
> > > +      if ((kind = linear_loads_p (perm_cache, left_op[index2]).first) ==
> > PERM_EVENODD)
> > > +	return false;
> > > +    }
> > > +  else if (!neg_first)
> > > +    *conj_first_operand = true;
> > > +  else
> > > +    return false;
> > > +
> > > +  if (kind != PERM_EVENEVEN)
> > > +    return false;
> > > +
> > > +  return true;
> > > +}
> > > +
> > > +/* Helper function to help distinguish between a conjugate and a rotation
> > in a
> > > +   complex multiplication.  The operations have similar shapes but the
> > order of
> > > +   the load permutes are different.  This function returns TRUE when the
> > order
> > > +   is consistent with a multiplication or multiplication by conjugated
> > > +   operand but returns FALSE if it's a multiplication by rotated
> > > +operand.  */
> > > +
> > > +static inline bool
> > > +vect_validate_multiplication (slp_tree_to_load_perm_map_t
> > *perm_cache,
> > > +			     vec<slp_tree> op, complex_perm_kinds_t
> > permKind) {
> > > +  /* The left node is the more common case, test it first.  */
> > > +  if (!is_eq_or_top (linear_loads_p (perm_cache, op[0]), permKind))
> > > +    {
> > > +      if (!is_eq_or_top (linear_loads_p (perm_cache, op[1]), permKind))
> > > +	return false;
> > > +    }
> > > +  return true;
> > > +}
> > > +
> > > +/* This function combines two nodes containing only even and only odd
> > lanes
> > > +   together into a single node which contains the nodes in even/odd order
> > > +   by using a lane permute.  */
> > > +
> > > +static slp_tree
> > > +vect_build_combine_node (slp_tree even, slp_tree odd, slp_tree rep) {
> > > +  auto_vec<slp_tree> nodes;
> > > +  nodes.create (2);
> > > +  vec<std::pair<unsigned, unsigned> > perm;
> > > +  perm.create (SLP_TREE_LANES (rep));
> > > +
> > > +  for (unsigned x = 0; x < SLP_TREE_LANES (rep); x+=2)
> > > +    {
> > > +      perm.quick_push (std::make_pair (0, x));
> > > +      perm.quick_push (std::make_pair (1, x));
> > > +    }
> > 
> > That looks wrong, it creates {0,0}, {1, 0}, {0, 2}, {1, 2} but you want {0, 0}, {1, 0},
> > {0, 1}, {1, 1} AFAICS.  At least I assume SLP_TREE_LANES (odd/even) ==
> > SLP_TREE_LANES (rep) / 2?
> > 
> > 'rep' isn't documented, I assume it's supoosed to be a "representative"
> > for the result?
> > 
> > > +
> > > +  nodes.quick_push (even);
> > > +  nodes.quick_push (odd);
> > 
> > No need for this intermediate nodes array, just push to ...
> > 
> > > +  SLP_TREE_REF_COUNT (even)++;
> > > +  SLP_TREE_REF_COUNT (odd)++;
> > > +
> > > +  slp_tree vnode = vect_create_new_slp_node (2, SLP_TREE_CODE
> > > + (even));  SLP_TREE_CODE (vnode) = VEC_PERM_EXPR;
> > > + SLP_TREE_LANE_PERMUTATION (vnode) = perm;  SLP_TREE_CHILDREN
> > > + (vnode).safe_splice (nodes);
> > 
> > ... the children array directly (even with quick_push, we've already allocated
> > 2 elements for the children).
> > 
> > > +  SLP_TREE_REF_COUNT (vnode) = 1;
> > > +  SLP_TREE_LANES (vnode) = SLP_TREE_LANES (rep);  gcc_assert
> > > + (perm.length () == SLP_TREE_LANES (vnode));
> > > +  /* Representation is set to that of the current node as the vectorizer
> > > +     can't deal with VEC_PERMs with no representation, as would be the
> > > +     case with invariants.  */
> > 
> > Yeah, I need to fix this ...
> > 
> > > +  SLP_TREE_REPRESENTATIVE (vnode) = SLP_TREE_REPRESENTATIVE (rep);
> > > +  SLP_TREE_VECTYPE (vnode) = SLP_TREE_VECTYPE (rep);
> > > +  return vnode;
> > > +}
> > > +
> > > +class complex_mul_pattern : public complex_pattern {
> > > +  protected:
> > > +    complex_mul_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> > internal_fn ifn)
> > > +      : complex_pattern (node, m_ops, ifn)
> > > +    {
> > > +      this->m_num_args = 2;
> > > +    }
> > > +
> > > +  public:
> > > +    void build (vec_info *);
> > > +    static internal_fn
> > > +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> > slp_tree *,
> > > +	     vec<slp_tree> *);
> > > +
> > > +    static vect_pattern*
> > > +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> > > +
> > > +    static vect_pattern*
> > > +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> > > +    {
> > > +      return new complex_mul_pattern (node, m_ops, ifn);
> > > +    }
> > > +
> > > +};
> > > +
> > > +/* Pattern matcher for trying to match complex multiply pattern in SLP
> > tree
> > > +   If the operation matches then IFN is set to the operation it matched
> > > +   and the arguments to the two replacement statements are put in
> > m_ops.
> > > +
> > > +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> > > +
> > > +   This function matches the patterns shaped as:
> > > +
> > > +   double ax = (b[i+1] * a[i]);
> > > +   double bx = (a[i+1] * b[i]);
> > > +
> > > +   c[i] = c[i] - ax;
> > > +   c[i+1] = c[i+1] + bx;
> > > +
> > > +   If a match occurred then TRUE is returned, else FALSE.  The initial match
> > is
> > > +   expected to be in OP1 and the initial match operands in args0.  */
> > > +
> > > +internal_fn
> > > +complex_mul_pattern::matches (complex_operation_t op,
> > > +			      slp_tree_to_load_perm_map_t *perm_cache,
> > > +			      slp_tree *node, vec<slp_tree> *ops) {
> > > +  internal_fn ifn = IFN_LAST;
> > > +
> > > +  if (op != MINUS_PLUS)
> > > +    return IFN_LAST;
> > > +
> > > +  slp_tree root = *node;
> > > +  /* First two nodes must be a multiply.  */  auto_vec<slp_tree>
> > > + muls;  if (vect_match_call_complex_mla (root, 0) != MULT_MULT
> > > +      || vect_match_call_complex_mla (root, 1, &muls) != MULT_MULT)
> > > +    return IFN_LAST;
> > > +
> > > +  /* Now operand2+4 may lead to another expression.  */
> > > + auto_vec<slp_tree> left_op, right_op;  left_op.safe_splice
> > > + (SLP_TREE_CHILDREN (muls[0]));  right_op.safe_splice
> > > + (SLP_TREE_CHILDREN (muls[1]));
> > > +
> > > +  if (linear_loads_p (perm_cache, left_op[1]).first == PERM_ODDEVEN)
> > > +    return IFN_LAST;
> > > +
> > > +  bool neg_first;
> > > +  bool is_neg = vect_normalize_conj_loc (right_op, &neg_first);
> > > +
> > > +  if (!is_neg)
> > > +    {
> > > +      /* A multiplication needs to multiply agains the real pair, otherwise
> > > +	 the pattern matches that of FMS.   */
> > > +      if (!vect_validate_multiplication (perm_cache, left_op,
> > PERM_EVENEVEN)
> > > +	  || vect_normalize_conj_loc (left_op))
> > > +	return IFN_LAST;
> > > +      ifn = IFN_COMPLEX_MUL;
> > > +    }
> > > +  else if (is_neg)
> > > +    {
> > > +      bool conj_first_operand;
> > > +      if (!vect_validate_multiplication (perm_cache, left_op, right_op,
> > > +					 neg_first, &conj_first_operand,
> > > +					 false))
> > > +	return IFN_LAST;
> > > +
> > > +      ifn = IFN_COMPLEX_MUL_CONJ;
> > > +    }
> > > +
> > > +  if (!vect_pattern_validate_optab (ifn, *node))
> > > +    return IFN_LAST;
> > > +
> > > +  ops->truncate (0);
> > > +  ops->create (3);
> > > +
> > > +  complex_perm_kinds_t kind = linear_loads_p (perm_cache,
> > > + left_op[0]).first;  if (kind == PERM_EVENODD)
> > > +    {
> > > +      ops->quick_push (left_op[1]);
> > > +      ops->quick_push (right_op[1]);
> > > +      ops->quick_push (left_op[0]);
> > > +    }
> > > +  else if (kind == PERM_TOP)
> > > +    {
> > > +      ops->quick_push (left_op[1]);
> > > +      ops->quick_push (right_op[1]);
> > > +      ops->quick_push (left_op[0]);
> > > +    }
> > > +  else
> > > +    {
> > > +      ops->quick_push (left_op[0]);
> > > +      ops->quick_push (right_op[0]);
> > > +      ops->quick_push (left_op[1]);
> > > +    }
> > > +
> > > +  return ifn;
> > > +}
> > > +
> > > +/* Attempt to recognize a complex mul pattern.  */
> > > +
> > > +vect_pattern*
> > > +complex_mul_pattern::recognize (slp_tree_to_load_perm_map_t
> > *perm_cache,
> > > +				slp_tree *node)
> > > +{
> > > +  auto_vec<slp_tree> ops;
> > > +  complex_operation_t op
> > > +    = vect_detect_pair_op (*node, true, &ops);
> > > +  internal_fn ifn
> > > +    = complex_mul_pattern::matches (op, perm_cache, node, &ops);
> > > +  if (ifn == IFN_LAST)
> > > +    return NULL;
> > > +
> > > +  return new complex_mul_pattern (node, &ops, ifn); }
> > > +
> > > +/* Perform a replacement of the detected complex mul pattern with the
> > new
> > > +   instruction sequences.  */
> > > +
> > > +void
> > > +complex_mul_pattern::build (vec_info *vinfo) {
> > > +  auto_vec<slp_tree> nodes;
> > > +
> > > +  /* First re-arrange the children.  */  nodes.create (2);
> > > +
> > > +  nodes.quick_push (this->m_ops[2]);
> > > +  nodes.quick_push (
> > > +    vect_build_combine_node (this->m_ops[0], this->m_ops[1],
> > > + *this->m_node));  SLP_TREE_REF_COUNT (this->m_ops[2])++;
> > > +
> > > +  slp_tree node;
> > > +  unsigned i;
> > > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
> > > +    vect_free_slp_tree (node);
> > > +
> > > +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> > SLP_TREE_CHILDREN
> > > + (*this->m_node).safe_splice (nodes);
> > 
> > please elide the nodes array.  *this->m_node now has a "wrong"
> > representative but I guess
> > 
> > > +  complex_pattern::build (vinfo);
> > 
> > will fix that up?  I still find the structure of the pattern matching & transform
> > hard to follow.  But well - I've settled with the idea of refactoring it for next
> > stage1 after the fact ;)
> 
> Indeed it does, I can flip order if replacing them last is clearer.
> 
> For next stage1 I wonder if it's not easier to not have build_slp produce the two_operands
> At all and have a generic expander that targets can override.  This of course would require
> It to change the VF, but will have to to support LDn/STn in SLP anyway.
> 
> But this would simplify things a lot.  I have some ideas but will write them up to get a review on
> the idea before I start implementing.

Yeah, I hope to find some time before stage1 opens to experiment with
some x86 specific patterns to see where things can/need to be improved.

Thanks,
Richard.

> 
> Thanks for allowing this version in!
> 
> Regards,
> Tamar
> 
> > 
> > Thanks,
> > Richard.
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
                   ` (3 preceding siblings ...)
  2020-12-28 13:37 ` [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate Tamar Christina
@ 2020-12-28 13:37 ` Tamar Christina
  2021-01-08  9:45   ` Richard Biener
  2020-12-28 13:38 ` [PATCH 7/8 v9]middle-end slp: support complex FMS and complex FMS conjugate Tamar Christina
                   ` (2 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:37 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 9819 bytes --]

Hi All,

This adds support for FMA and FMA conjugated to the slp pattern matcher.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ): New.
	* optabs.def (cmla_optab, cmla_conj_optab): New.
	* doc/md.texi: Document them.
	* tree-vect-slp-patterns.c (vect_match_call_p,
	class complex_fma_pattern, vect_slp_reset_pattern,
	complex_fma_pattern::matches, complex_fma_pattern::recognize,
	complex_fma_pattern::build): New.

--- inline copy of patch -- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a..6d5a98c4946d3ff4c2b8abea5c29caa6863fd3f7 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6202,6 +6202,51 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmla@var{m}4} instruction pattern
+@item @samp{cmla@var{m}4}
+Perform a vector multiply and accumulate that is semantically the same as
+a multiply and accumulate of complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] += a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmla_conj@var{m}4} instruction pattern
+@item @samp{cmla_conj@var{m}4}
+Perform a vector multiply by conjugate and accumulate that is semantically
+the same as a multiply and accumulate of complex numbers where the second
+multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] += a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{cmul@var{m}4} instruction pattern
 @item @samp{cmul@var{m}4}
 Perform a vector multiply that is semantically the same as multiply of
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 5a0bbe3fe5dee591d54130e60f6996b28164ae38..305450e026d4b94ab62ceb9ca719ec5570ff43eb 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -288,6 +288,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
 
 /* Ternary math functions.  */
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST, cmla_conj, ternary)
 
 /* Unary integer ops.  */
 DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index e82396bae1117c6de91304761a560b7fbcb69ce1..8e2758d685ed85e02df10dac571eb40d45a294ed 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -294,6 +294,8 @@ OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
 OPTAB_D (cmul_optab, "cmul$a3")
 OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
+OPTAB_D (cmla_optab, "cmla$a4")
+OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 82721acbab8cf81c4d6f9954c98fb913a7bb6282..3625a80c08e3d70fd362fc52e17e65b3b2c7da83 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -325,6 +325,24 @@ vect_match_expression_p (slp_tree node, tree_code code)
   return true;
 }
 
+/* Checks to see if the expression represented by NODE is a call to the internal
+   function FN.  */
+
+static inline bool
+vect_match_call_p (slp_tree node, internal_fn fn)
+{
+  if (!node
+      || !SLP_TREE_REPRESENTATIVE (node))
+    return false;
+
+  gimple* expr = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node));
+  if (!expr
+      || !gimple_call_internal_p (expr, fn))
+    return false;
+
+   return true;
+}
+
 /* Check if the given lane permute in PERMUTES matches an alternating sequence
    of {even odd even odd ...}.  This to account for unrolled loops.  Further
    mode there resulting permute must be linear.   */
@@ -1081,6 +1099,161 @@ complex_mul_pattern::build (vec_info *vinfo)
   complex_pattern::build (vinfo);
 }
 
+/*******************************************************************************
+ * complex_fma_pattern class
+ ******************************************************************************/
+
+class complex_fma_pattern : public complex_pattern
+{
+  protected:
+    complex_fma_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 3;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_fma_pattern (node, m_ops, ifn);
+    }
+};
+
+/* Helper function to "reset" a previously matched node and undo the changes
+   made enough so that the node is treated as an irrelevant node.  */
+
+static inline void
+vect_slp_reset_pattern (slp_tree node)
+{
+  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
+  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
+  STMT_SLP_TYPE (stmt_info) = pure_slp;
+  SLP_TREE_REPRESENTATIVE (node) = stmt_info;
+}
+
+/* Pattern matcher for trying to match complex multiply and accumulate
+   and multiply and subtract patterns in SLP tree.
+   If the operation matches then IFN is set to the operation it matched and
+   the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
+   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The match is
+   performed after COMPLEX_MUL which would have done the majority of the work.
+   This function merely matches an ADD with a COMPLEX_MUL IFN.  The initial
+   match is expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_fma_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t * /* perm_cache */,
+			      slp_tree *ref_node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  /* Find the two components.  We match Complex MUL first which reduces the
+     amount of work this pattern has to do.  After that we just match the
+     head node and we're done.:
+
+     * FMA: + +.
+
+     We need to ignore the two_operands nodes that may also match.
+     For that we can check if they have any scalar statements and also
+     check that it's not a permute node as we're looking for a normal
+     PLUS_EXPR operation.  */
+  if (op != CMPLX_NONE)
+    return IFN_LAST;
+
+  /* Find the two components.  We match Complex MUL first which reduces the
+     amount of work this pattern has to do.  After that we just match the
+     head node and we're done.:
+
+   * FMA: + + on a non-two_operands node.  */
+  slp_tree vnode = *ref_node;
+  if (SLP_TREE_LANE_PERMUTATION (vnode).exists ()
+      /* Need to exclude the plus two-operands node.  These are not marked
+	 so we have to infer it based on conditions.  */
+      || !SLP_TREE_SCALAR_STMTS (vnode).exists ()
+      || !vect_match_expression_p (vnode, PLUS_EXPR))
+    return IFN_LAST;
+
+  slp_tree node = SLP_TREE_CHILDREN (vnode)[1];
+
+  if (vect_match_call_p (node, IFN_COMPLEX_MUL))
+    ifn = IFN_COMPLEX_FMA;
+  else if (vect_match_call_p (node, IFN_COMPLEX_MUL_CONJ))
+    ifn = IFN_COMPLEX_FMA_CONJ;
+  else
+    return IFN_LAST;
+
+  if (!vect_pattern_validate_optab (ifn, vnode))
+    return IFN_LAST;
+
+  vect_slp_reset_pattern (node);
+  ops->truncate (0);
+  ops->create (3);
+
+  if (ifn == IFN_COMPLEX_FMA)
+    {
+      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
+    }
+  else
+    {
+      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_fma_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_fma_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_fma_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_fma_pattern::build (vec_info *vinfo)
+{
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).safe_splice (this->m_ops);
+
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/


-- 

[-- Attachment #2: rb13961.patch --]
[-- Type: text/x-diff, Size: 9232 bytes --]

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a..6d5a98c4946d3ff4c2b8abea5c29caa6863fd3f7 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6202,6 +6202,51 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmla@var{m}4} instruction pattern
+@item @samp{cmla@var{m}4}
+Perform a vector multiply and accumulate that is semantically the same as
+a multiply and accumulate of complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] += a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmla_conj@var{m}4} instruction pattern
+@item @samp{cmla_conj@var{m}4}
+Perform a vector multiply by conjugate and accumulate that is semantically
+the same as a multiply and accumulate of complex numbers where the second
+multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] += a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{cmul@var{m}4} instruction pattern
 @item @samp{cmul@var{m}4}
 Perform a vector multiply that is semantically the same as multiply of
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 5a0bbe3fe5dee591d54130e60f6996b28164ae38..305450e026d4b94ab62ceb9ca719ec5570ff43eb 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -288,6 +288,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
 
 /* Ternary math functions.  */
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST, cmla_conj, ternary)
 
 /* Unary integer ops.  */
 DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index e82396bae1117c6de91304761a560b7fbcb69ce1..8e2758d685ed85e02df10dac571eb40d45a294ed 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -294,6 +294,8 @@ OPTAB_D (cadd90_optab, "cadd90$a3")
 OPTAB_D (cadd270_optab, "cadd270$a3")
 OPTAB_D (cmul_optab, "cmul$a3")
 OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
+OPTAB_D (cmla_optab, "cmla$a4")
+OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 82721acbab8cf81c4d6f9954c98fb913a7bb6282..3625a80c08e3d70fd362fc52e17e65b3b2c7da83 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -325,6 +325,24 @@ vect_match_expression_p (slp_tree node, tree_code code)
   return true;
 }
 
+/* Checks to see if the expression represented by NODE is a call to the internal
+   function FN.  */
+
+static inline bool
+vect_match_call_p (slp_tree node, internal_fn fn)
+{
+  if (!node
+      || !SLP_TREE_REPRESENTATIVE (node))
+    return false;
+
+  gimple* expr = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node));
+  if (!expr
+      || !gimple_call_internal_p (expr, fn))
+    return false;
+
+   return true;
+}
+
 /* Check if the given lane permute in PERMUTES matches an alternating sequence
    of {even odd even odd ...}.  This to account for unrolled loops.  Further
    mode there resulting permute must be linear.   */
@@ -1081,6 +1099,161 @@ complex_mul_pattern::build (vec_info *vinfo)
   complex_pattern::build (vinfo);
 }
 
+/*******************************************************************************
+ * complex_fma_pattern class
+ ******************************************************************************/
+
+class complex_fma_pattern : public complex_pattern
+{
+  protected:
+    complex_fma_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 3;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_fma_pattern (node, m_ops, ifn);
+    }
+};
+
+/* Helper function to "reset" a previously matched node and undo the changes
+   made enough so that the node is treated as an irrelevant node.  */
+
+static inline void
+vect_slp_reset_pattern (slp_tree node)
+{
+  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
+  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
+  STMT_SLP_TYPE (stmt_info) = pure_slp;
+  SLP_TREE_REPRESENTATIVE (node) = stmt_info;
+}
+
+/* Pattern matcher for trying to match complex multiply and accumulate
+   and multiply and subtract patterns in SLP tree.
+   If the operation matches then IFN is set to the operation it matched and
+   the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
+   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The match is
+   performed after COMPLEX_MUL which would have done the majority of the work.
+   This function merely matches an ADD with a COMPLEX_MUL IFN.  The initial
+   match is expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_fma_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t * /* perm_cache */,
+			      slp_tree *ref_node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  /* Find the two components.  We match Complex MUL first which reduces the
+     amount of work this pattern has to do.  After that we just match the
+     head node and we're done.:
+
+     * FMA: + +.
+
+     We need to ignore the two_operands nodes that may also match.
+     For that we can check if they have any scalar statements and also
+     check that it's not a permute node as we're looking for a normal
+     PLUS_EXPR operation.  */
+  if (op != CMPLX_NONE)
+    return IFN_LAST;
+
+  /* Find the two components.  We match Complex MUL first which reduces the
+     amount of work this pattern has to do.  After that we just match the
+     head node and we're done.:
+
+   * FMA: + + on a non-two_operands node.  */
+  slp_tree vnode = *ref_node;
+  if (SLP_TREE_LANE_PERMUTATION (vnode).exists ()
+      /* Need to exclude the plus two-operands node.  These are not marked
+	 so we have to infer it based on conditions.  */
+      || !SLP_TREE_SCALAR_STMTS (vnode).exists ()
+      || !vect_match_expression_p (vnode, PLUS_EXPR))
+    return IFN_LAST;
+
+  slp_tree node = SLP_TREE_CHILDREN (vnode)[1];
+
+  if (vect_match_call_p (node, IFN_COMPLEX_MUL))
+    ifn = IFN_COMPLEX_FMA;
+  else if (vect_match_call_p (node, IFN_COMPLEX_MUL_CONJ))
+    ifn = IFN_COMPLEX_FMA_CONJ;
+  else
+    return IFN_LAST;
+
+  if (!vect_pattern_validate_optab (ifn, vnode))
+    return IFN_LAST;
+
+  vect_slp_reset_pattern (node);
+  ops->truncate (0);
+  ops->create (3);
+
+  if (ifn == IFN_COMPLEX_FMA)
+    {
+      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
+    }
+  else
+    {
+      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
+      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_fma_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_fma_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_fma_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_fma_pattern::build (vec_info *vinfo)
+{
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).safe_splice (this->m_ops);
+
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate
  2020-12-28 13:37 ` [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate Tamar Christina
@ 2021-01-08  9:45   ` Richard Biener
  2021-01-08  9:59     ` Tamar Christina
  2021-01-11 10:24     ` Tamar Christina
  0 siblings, 2 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-08  9:45 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Mon, 28 Dec 2020, Tamar Christina wrote:

> Hi All,
> 
> This adds support for FMA and FMA conjugated to the slp pattern matcher.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ): New.
> 	* optabs.def (cmla_optab, cmla_conj_optab): New.
> 	* doc/md.texi: Document them.
> 	* tree-vect-slp-patterns.c (vect_match_call_p,
> 	class complex_fma_pattern, vect_slp_reset_pattern,
> 	complex_fma_pattern::matches, complex_fma_pattern::recognize,
> 	complex_fma_pattern::build): New.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a..6d5a98c4946d3ff4c2b8abea5c29caa6863fd3f7 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -6202,6 +6202,51 @@ The operation is only supported for vector modes @var{m}.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> +@cindex @code{cmla@var{m}4} instruction pattern
> +@item @samp{cmla@var{m}4}
> +Perform a vector multiply and accumulate that is semantically the same as
> +a multiply and accumulate of complex numbers.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] += a[i] * b[i];
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
> +@cindex @code{cmla_conj@var{m}4} instruction pattern
> +@item @samp{cmla_conj@var{m}4}
> +Perform a vector multiply by conjugate and accumulate that is semantically
> +the same as a multiply and accumulate of complex numbers where the second
> +multiply arguments is conjugated.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] += a[i] * conj (b[i]);
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{cmul@var{m}4} instruction pattern
>  @item @samp{cmul@var{m}4}
>  Perform a vector multiply that is semantically the same as multiply of
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 5a0bbe3fe5dee591d54130e60f6996b28164ae38..305450e026d4b94ab62ceb9ca719ec5570ff43eb 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -288,6 +288,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
>  
>  /* Ternary math functions.  */
>  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST, cmla_conj, ternary)
>  
>  /* Unary integer ops.  */
>  DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index e82396bae1117c6de91304761a560b7fbcb69ce1..8e2758d685ed85e02df10dac571eb40d45a294ed 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -294,6 +294,8 @@ OPTAB_D (cadd90_optab, "cadd90$a3")
>  OPTAB_D (cadd270_optab, "cadd270$a3")
>  OPTAB_D (cmul_optab, "cmul$a3")
>  OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
> +OPTAB_D (cmla_optab, "cmla$a4")
> +OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
>  OPTAB_D (cos_optab, "cos$a2")
>  OPTAB_D (cosh_optab, "cosh$a2")
>  OPTAB_D (exp10_optab, "exp10$a2")
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index 82721acbab8cf81c4d6f9954c98fb913a7bb6282..3625a80c08e3d70fd362fc52e17e65b3b2c7da83 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -325,6 +325,24 @@ vect_match_expression_p (slp_tree node, tree_code code)
>    return true;
>  }
>  
> +/* Checks to see if the expression represented by NODE is a call to the internal
> +   function FN.  */
> +
> +static inline bool
> +vect_match_call_p (slp_tree node, internal_fn fn)
> +{
> +  if (!node
> +      || !SLP_TREE_REPRESENTATIVE (node))
> +    return false;
> +
> +  gimple* expr = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (node));
> +  if (!expr
> +      || !gimple_call_internal_p (expr, fn))
> +    return false;
> +
> +   return true;
> +}
> +
>  /* Check if the given lane permute in PERMUTES matches an alternating sequence
>     of {even odd even odd ...}.  This to account for unrolled loops.  Further
>     mode there resulting permute must be linear.   */
> @@ -1081,6 +1099,161 @@ complex_mul_pattern::build (vec_info *vinfo)
>    complex_pattern::build (vinfo);
>  }
>  
> +/*******************************************************************************
> + * complex_fma_pattern class
> + ******************************************************************************/
> +
> +class complex_fma_pattern : public complex_pattern
> +{
> +  protected:
> +    complex_fma_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +      : complex_pattern (node, m_ops, ifn)
> +    {
> +      this->m_num_args = 3;
> +    }
> +
> +  public:
> +    void build (vec_info *);
> +    static internal_fn
> +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
> +	     vec<slp_tree> *);
> +
> +    static vect_pattern*
> +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> +
> +    static vect_pattern*
> +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +    {
> +      return new complex_fma_pattern (node, m_ops, ifn);
> +    }
> +};
> +
> +/* Helper function to "reset" a previously matched node and undo the changes
> +   made enough so that the node is treated as an irrelevant node.  */
> +
> +static inline void
> +vect_slp_reset_pattern (slp_tree node)
> +{
> +  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
> +  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
> +  STMT_SLP_TYPE (stmt_info) = pure_slp;
> +  SLP_TREE_REPRESENTATIVE (node) = stmt_info;
> +}
> +
> +/* Pattern matcher for trying to match complex multiply and accumulate
> +   and multiply and subtract patterns in SLP tree.
> +   If the operation matches then IFN is set to the operation it matched and
> +   the arguments to the two replacement statements are put in m_ops.
> +
> +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> +
> +   This function matches the patterns shaped as:
> +
> +   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
> +   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
> +
> +   c[i] = c[i] - ax;
> +   c[i+1] = c[i+1] + bx;
> +
> +   If a match occurred then TRUE is returned, else FALSE.  The match is
> +   performed after COMPLEX_MUL which would have done the majority of the work.
> +   This function merely matches an ADD with a COMPLEX_MUL IFN.  The initial
> +   match is expected to be in OP1 and the initial match operands in args0.  */
> +
> +internal_fn
> +complex_fma_pattern::matches (complex_operation_t op,
> +			      slp_tree_to_load_perm_map_t * /* perm_cache */,
> +			      slp_tree *ref_node, vec<slp_tree> *ops)
> +{
> +  internal_fn ifn = IFN_LAST;
> +
> +  /* Find the two components.  We match Complex MUL first which reduces the
> +     amount of work this pattern has to do.  After that we just match the
> +     head node and we're done.:
> +
> +     * FMA: + +.
> +
> +     We need to ignore the two_operands nodes that may also match.
> +     For that we can check if they have any scalar statements and also
> +     check that it's not a permute node as we're looking for a normal
> +     PLUS_EXPR operation.  */
> +  if (op != CMPLX_NONE)
> +    return IFN_LAST;
> +
> +  /* Find the two components.  We match Complex MUL first which reduces the
> +     amount of work this pattern has to do.  After that we just match the
> +     head node and we're done.:
> +
> +   * FMA: + + on a non-two_operands node.  */
> +  slp_tree vnode = *ref_node;
> +  if (SLP_TREE_LANE_PERMUTATION (vnode).exists ()
> +      /* Need to exclude the plus two-operands node.  These are not marked
> +	 so we have to infer it based on conditions.  */
> +      || !SLP_TREE_SCALAR_STMTS (vnode).exists ()

as said earlier we shouldn't test this.  The existing lane permute
should already cover this - where the test would better be

 SLP_TREE_CODE (vnode) == VEC_PERM_EXPR

> +      || !vect_match_expression_p (vnode, PLUS_EXPR))

But then it shouldn't match this (the vect_match_expression_p should
only ever match SLP_TREE_CODE (vnode) != VEC_PERM_EXPR) anyway.

> +    return IFN_LAST;
> +
> +  slp_tree node = SLP_TREE_CHILDREN (vnode)[1];
> +
> +  if (vect_match_call_p (node, IFN_COMPLEX_MUL))
> +    ifn = IFN_COMPLEX_FMA;
> +  else if (vect_match_call_p (node, IFN_COMPLEX_MUL_CONJ))
> +    ifn = IFN_COMPLEX_FMA_CONJ;
> +  else
> +    return IFN_LAST;
> +
> +  if (!vect_pattern_validate_optab (ifn, vnode))
> +    return IFN_LAST;
> +
> +  vect_slp_reset_pattern (node);

I don't understand this ... it deserves a comment at least.
Having no testcases with this patch makes it impossible for
me to dig in myself :/

Otherwise looks OK.

Thanks,
Richard.

> +  ops->truncate (0);
> +  ops->create (3);
> +
> +  if (ifn == IFN_COMPLEX_FMA)
> +    {
> +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> +    }
> +  else
> +    {
> +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> +    }
> +
> +  return ifn;
> +}
> +
> +/* Attempt to recognize a complex mul pattern.  */
> +
> +vect_pattern*
> +complex_fma_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
> +				slp_tree *node)
> +{
> +  auto_vec<slp_tree> ops;
> +  complex_operation_t op
> +    = vect_detect_pair_op (*node, true, &ops);
> +  internal_fn ifn
> +    = complex_fma_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn == IFN_LAST)
> +    return NULL;
> +
> +  return new complex_fma_pattern (node, &ops, ifn);
> +}
> +
> +/* Perform a replacement of the detected complex mul pattern with the new
> +   instruction sequences.  */
> +
> +void
> +complex_fma_pattern::build (vec_info *vinfo)
> +{
> +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> +  SLP_TREE_CHILDREN (*this->m_node).safe_splice (this->m_ops);
> +
> +  complex_pattern::build (vinfo);
> +}
> +
>  /*******************************************************************************
>   * Pattern matching definitions
>   ******************************************************************************/
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate
  2021-01-08  9:45   ` Richard Biener
@ 2021-01-08  9:59     ` Tamar Christina
  2021-01-08 10:17       ` Richard Biener
  2021-01-11 10:24     ` Tamar Christina
  1 sibling, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2021-01-08  9:59 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, ook

Hi Richi,

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, January 8, 2021 9:45 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: Re: [PATCH 6/8 v9]middle-end slp: support complex FMA and
> complex FMA conjugate
> 
> On Mon, 28 Dec 2020, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This adds support for FMA and FMA conjugated to the slp pattern matcher.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ): New.
> > 	* optabs.def (cmla_optab, cmla_conj_optab): New.
> > 	* doc/md.texi: Document them.
> > 	* tree-vect-slp-patterns.c (vect_match_call_p,
> > 	class complex_fma_pattern, vect_slp_reset_pattern,
> > 	complex_fma_pattern::matches, complex_fma_pattern::recognize,
> > 	complex_fma_pattern::build): New.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> >
> b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a..6d5a98c4946d3ff4c2b8abea5c2
> 9
> > caa6863fd3f7 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -6202,6 +6202,51 @@ The operation is only supported for vector
> modes @var{m}.
> >
> >  This pattern is not allowed to @code{FAIL}.
> >
> > +@cindex @code{cmla@var{m}4} instruction pattern @item
> > +@samp{cmla@var{m}4} Perform a vector multiply and accumulate that is
> > +semantically the same as a multiply and accumulate of complex
> > +numbers.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] += a[i] * b[i];
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> > +@cindex @code{cmla_conj@var{m}4} instruction pattern @item
> > +@samp{cmla_conj@var{m}4} Perform a vector multiply by conjugate and
> > +accumulate that is semantically the same as a multiply and accumulate
> > +of complex numbers where the second multiply arguments is conjugated.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] += a[i] * conj (b[i]);
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> >  @cindex @code{cmul@var{m}4} instruction pattern  @item
> > @samp{cmul@var{m}4}  Perform a vector multiply that is semantically
> > the same as multiply of diff --git a/gcc/internal-fn.def
> > b/gcc/internal-fn.def index
> >
> 5a0bbe3fe5dee591d54130e60f6996b28164ae38..305450e026d4b94ab62ceb9c
> a719
> > ec5570ff43eb 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -288,6 +288,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp,
> > binary)
> >
> >  /* Ternary math functions.  */
> >  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST,
> cmla_conj,
> > +ternary)
> >
> >  /* Unary integer ops.  */
> >  DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb,
> unary)
> > diff --git a/gcc/optabs.def b/gcc/optabs.def index
> >
> e82396bae1117c6de91304761a560b7fbcb69ce1..8e2758d685ed85e02df10dac
> 571e
> > b40d45a294ed 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -294,6 +294,8 @@ OPTAB_D (cadd90_optab, "cadd90$a3")  OPTAB_D
> > (cadd270_optab, "cadd270$a3")  OPTAB_D (cmul_optab, "cmul$a3")
> > OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
> > +OPTAB_D (cmla_optab, "cmla$a4")
> > +OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
> >  OPTAB_D (cos_optab, "cos$a2")
> >  OPTAB_D (cosh_optab, "cosh$a2")
> >  OPTAB_D (exp10_optab, "exp10$a2")
> > diff --git a/gcc/tree-vect-slp-patterns.c
> > b/gcc/tree-vect-slp-patterns.c index
> >
> 82721acbab8cf81c4d6f9954c98fb913a7bb6282..3625a80c08e3d70fd362fc52e1
> 7e
> > 65b3b2c7da83 100644
> > --- a/gcc/tree-vect-slp-patterns.c
> > +++ b/gcc/tree-vect-slp-patterns.c
> > @@ -325,6 +325,24 @@ vect_match_expression_p (slp_tree node,
> tree_code code)
> >    return true;
> >  }
> >
> > +/* Checks to see if the expression represented by NODE is a call to the
> internal
> > +   function FN.  */
> > +
> > +static inline bool
> > +vect_match_call_p (slp_tree node, internal_fn fn) {
> > +  if (!node
> > +      || !SLP_TREE_REPRESENTATIVE (node))
> > +    return false;
> > +
> > +  gimple* expr = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE
> (node));
> > + if (!expr
> > +      || !gimple_call_internal_p (expr, fn))
> > +    return false;
> > +
> > +   return true;
> > +}
> > +
> >  /* Check if the given lane permute in PERMUTES matches an alternating
> sequence
> >     of {even odd even odd ...}.  This to account for unrolled loops.  Further
> >     mode there resulting permute must be linear.   */
> > @@ -1081,6 +1099,161 @@ complex_mul_pattern::build (vec_info *vinfo)
> >    complex_pattern::build (vinfo);
> >  }
> >
> >
> +/*********************************************************
> ***********
> > +***********
> > + * complex_fma_pattern class
> > +
> >
> +*********************************************************
> ************
> > +*********/
> > +
> > +class complex_fma_pattern : public complex_pattern {
> > +  protected:
> > +    complex_fma_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> internal_fn ifn)
> > +      : complex_pattern (node, m_ops, ifn)
> > +    {
> > +      this->m_num_args = 3;
> > +    }
> > +
> > +  public:
> > +    void build (vec_info *);
> > +    static internal_fn
> > +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> slp_tree *,
> > +	     vec<slp_tree> *);
> > +
> > +    static vect_pattern*
> > +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> > +
> > +    static vect_pattern*
> > +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> > +    {
> > +      return new complex_fma_pattern (node, m_ops, ifn);
> > +    }
> > +};
> > +
> > +/* Helper function to "reset" a previously matched node and undo the
> changes
> > +   made enough so that the node is treated as an irrelevant node.  */
> > +
> > +static inline void
> > +vect_slp_reset_pattern (slp_tree node) {
> > +  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE
> > +(node));
> > +  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
> > +  STMT_SLP_TYPE (stmt_info) = pure_slp;
> > +  SLP_TREE_REPRESENTATIVE (node) = stmt_info; }
> > +
> > +/* Pattern matcher for trying to match complex multiply and accumulate
> > +   and multiply and subtract patterns in SLP tree.
> > +   If the operation matches then IFN is set to the operation it matched and
> > +   the arguments to the two replacement statements are put in m_ops.
> > +
> > +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> > +
> > +   This function matches the patterns shaped as:
> > +
> > +   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
> > +   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
> > +
> > +   c[i] = c[i] - ax;
> > +   c[i+1] = c[i+1] + bx;
> > +
> > +   If a match occurred then TRUE is returned, else FALSE.  The match is
> > +   performed after COMPLEX_MUL which would have done the majority of
> the work.
> > +   This function merely matches an ADD with a COMPLEX_MUL IFN.  The
> initial
> > +   match is expected to be in OP1 and the initial match operands in
> > + args0.  */
> > +
> > +internal_fn
> > +complex_fma_pattern::matches (complex_operation_t op,
> > +			      slp_tree_to_load_perm_map_t * /* perm_cache
> */,
> > +			      slp_tree *ref_node, vec<slp_tree> *ops) {
> > +  internal_fn ifn = IFN_LAST;
> > +
> > +  /* Find the two components.  We match Complex MUL first which
> reduces the
> > +     amount of work this pattern has to do.  After that we just match the
> > +     head node and we're done.:
> > +
> > +     * FMA: + +.
> > +
> > +     We need to ignore the two_operands nodes that may also match.
> > +     For that we can check if they have any scalar statements and also
> > +     check that it's not a permute node as we're looking for a normal
> > +     PLUS_EXPR operation.  */
> > +  if (op != CMPLX_NONE)
> > +    return IFN_LAST;
> > +
> > +  /* Find the two components.  We match Complex MUL first which
> reduces the
> > +     amount of work this pattern has to do.  After that we just match the
> > +     head node and we're done.:
> > +
> > +   * FMA: + + on a non-two_operands node.  */
> > +  slp_tree vnode = *ref_node;
> > +  if (SLP_TREE_LANE_PERMUTATION (vnode).exists ()
> > +      /* Need to exclude the plus two-operands node.  These are not
> marked
> > +	 so we have to infer it based on conditions.  */
> > +      || !SLP_TREE_SCALAR_STMTS (vnode).exists ()
> 
> as said earlier we shouldn't test this.  The existing lane permute should
> already cover this - where the test would better be
> 
>  SLP_TREE_CODE (vnode) == VEC_PERM_EXPR
> 
> > +      || !vect_match_expression_p (vnode, PLUS_EXPR))
> 
> But then it shouldn't match this (the vect_match_expression_p should only
> ever match SLP_TREE_CODE (vnode) != VEC_PERM_EXPR) anyway.
> 

How so? An FMA doesn't have a TWO_OPERANDS node as the root since the operations
Are always two PLUS operations.

The corresponding tree is

note:   SLP size 10 vs. limit 24.
note:   Final SLP tree for instance 0x48f68d0:
note:   node 0x4809870 (max_nunits=4, refcnt=2)
note:   op template: REALPART_EXPR <*_3> = _31;
note:     stmt 0 REALPART_EXPR <*_3> = _31;
note:     stmt 1 IMAGPART_EXPR <*_3> = _32;
note:     children 0x48098f8
note:   node 0x48098f8 (max_nunits=4, refcnt=2)
note:   op template: _31 = _12 + _29;
note:     stmt 0 _31 = _12 + _29;
note:     stmt 1 _32 = _11 + _30;
note:     children 0x4809980 0x4809a08
note:   node 0x4809980 (max_nunits=4, refcnt=2)
note:   op template: _12 = REALPART_EXPR <*_3>;
note:     stmt 0 _12 = REALPART_EXPR <*_3>;
note:     stmt 1 _11 = IMAGPART_EXPR <*_3>;
note:     load permutation { 0 1 }
note:   node 0x4809a08 (max_nunits=4, refcnt=2)
note:   op: VEC_PERM_EXPR
note:     stmt 0 _29 = _25 - _26;
note:     stmt 1 _30 = _27 + _28;
note:     lane permutation { 0[0] 1[1] }
note:     children 0x4809dc0 0x4809e48
note:   node 0x4809dc0 (max_nunits=1, refcnt=1)
note:   op template: _29 = _25 - _26;
note:     { }
note:     children 0x4809a90 0x4809c28
note:   node 0x4809a90 (max_nunits=4, refcnt=3)
note:   op template: _25 = _19 * _22;
note:     stmt 0 _25 = _19 * _22;
note:     stmt 1 _27 = _20 * _22;
note:     children 0x4809b18 0x4809ba0
note:   node 0x4809b18 (max_nunits=4, refcnt=2)
note:   op template: _19 = REALPART_EXPR <*_7>;
note:     stmt 0 _19 = REALPART_EXPR <*_7>;
note:     stmt 1 _20 = IMAGPART_EXPR <*_7>;
note:     load permutation { 0 1 }
note:   node 0x4809ba0 (max_nunits=4, refcnt=2)
note:   op template: _22 = REALPART_EXPR <*_5>;
note:     stmt 0 _22 = REALPART_EXPR <*_5>;
note:     stmt 1 _22 = REALPART_EXPR <*_5>;
note:     load permutation { 0 0 }
note:   node 0x4809c28 (max_nunits=4, refcnt=3)
note:   op template: _26 = _20 * _21;
note:     stmt 0 _26 = _20 * _21;
note:     stmt 1 _28 = _19 * _21;
note:     children 0x4809cb0 0x4809d38
note:   node 0x4809cb0 (max_nunits=4, refcnt=2)
note:   op template: _20 = IMAGPART_EXPR <*_7>;
note:     stmt 0 _20 = IMAGPART_EXPR <*_7>;
note:     stmt 1 _19 = REALPART_EXPR <*_7>;
note:     load permutation { 1 0 }
note:   node 0x4809d38 (max_nunits=4, refcnt=2)
note:   op template: _21 = IMAGPART_EXPR <*_5>;
note:     stmt 0 _21 = IMAGPART_EXPR <*_5>;
note:     stmt 1 _21 = IMAGPART_EXPR <*_5>;
note:     load permutation { 1 1 }
note:   node 0x4809e48 (max_nunits=1, refcnt=1)
note:   op template: _30 = _27 + _28;
note:     { }
note:     children 0x4809a90 0x4809c28

and after matching the MUL all you have are the ADD node going into a COMPLEX_MUL node.

> > +    return IFN_LAST;
> > +
> > +  slp_tree node = SLP_TREE_CHILDREN (vnode)[1];
> > +
> > +  if (vect_match_call_p (node, IFN_COMPLEX_MUL))
> > +    ifn = IFN_COMPLEX_FMA;
> > +  else if (vect_match_call_p (node, IFN_COMPLEX_MUL_CONJ))
> > +    ifn = IFN_COMPLEX_FMA_CONJ;
> > +  else
> > +    return IFN_LAST;
> > +
> > +  if (!vect_pattern_validate_optab (ifn, vnode))
> > +    return IFN_LAST;
> > +
> > +  vect_slp_reset_pattern (node);
> 
> I don't understand this ... it deserves a comment at least.

The previous pass detecting COMPLEX_MUL would have marked the
Instructions as being inside of a MUL pattern.  These need to be unmarked
As being part of the COMPLEX_MUL and instead be marked as COMPLEX_FMA.

> Having no testcases with this patch makes it impossible for me to dig in
> myself :/

Sorry, the tests would have made the file too big again.. The previous test for complex add
Added gcc/testsuite/gcc.dg/vect/complex/complex-operations.c which is an overarching test
Testing everything in one go.

The individual tests are split off from that large test.

> 
> Otherwise looks OK.
> 
> Thanks,
> Richard.
> 
> > +  ops->truncate (0);
> > +  ops->create (3);
> > +
> > +  if (ifn == IFN_COMPLEX_FMA)
> > +    {
> > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > +    }
> > +  else
> > +    {
> > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > +    }
> > +
> > +  return ifn;
> > +}
> > +
> > +/* Attempt to recognize a complex mul pattern.  */
> > +
> > +vect_pattern*
> > +complex_fma_pattern::recognize (slp_tree_to_load_perm_map_t
> *perm_cache,
> > +				slp_tree *node)
> > +{
> > +  auto_vec<slp_tree> ops;
> > +  complex_operation_t op
> > +    = vect_detect_pair_op (*node, true, &ops);
> > +  internal_fn ifn
> > +    = complex_fma_pattern::matches (op, perm_cache, node, &ops);
> > +  if (ifn == IFN_LAST)
> > +    return NULL;
> > +
> > +  return new complex_fma_pattern (node, &ops, ifn); }
> > +
> > +/* Perform a replacement of the detected complex mul pattern with the
> new
> > +   instruction sequences.  */
> > +
> > +void
> > +complex_fma_pattern::build (vec_info *vinfo) {
> > +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> > +  SLP_TREE_CHILDREN (*this->m_node).safe_splice (this->m_ops);
> > +
> > +  complex_pattern::build (vinfo);
> > +}
> > +
> >
> /**********************************************************
> *********************
> >   * Pattern matching definitions
> >
> >
> **********************************************************
> ************
> > ********/
> >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate
  2021-01-08  9:59     ` Tamar Christina
@ 2021-01-08 10:17       ` Richard Biener
  2021-01-08 10:21         ` Tamar Christina
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2021-01-08 10:17 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Fri, 8 Jan 2021, Tamar Christina wrote:

> Hi Richi,
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Friday, January 8, 2021 9:45 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > Subject: Re: [PATCH 6/8 v9]middle-end slp: support complex FMA and
> > complex FMA conjugate
> > 
> > On Mon, 28 Dec 2020, Tamar Christina wrote:
> > 
> > > Hi All,
> > >
> > > This adds support for FMA and FMA conjugated to the slp pattern matcher.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ): New.
> > > 	* optabs.def (cmla_optab, cmla_conj_optab): New.
> > > 	* doc/md.texi: Document them.
> > > 	* tree-vect-slp-patterns.c (vect_match_call_p,
> > > 	class complex_fma_pattern, vect_slp_reset_pattern,
> > > 	complex_fma_pattern::matches, complex_fma_pattern::recognize,
> > > 	complex_fma_pattern::build): New.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > >
> > b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a..6d5a98c4946d3ff4c2b8abea5c2
> > 9
> > > caa6863fd3f7 100644
> > > --- a/gcc/doc/md.texi
> > > +++ b/gcc/doc/md.texi
> > > @@ -6202,6 +6202,51 @@ The operation is only supported for vector
> > modes @var{m}.
> > >
> > >  This pattern is not allowed to @code{FAIL}.
> > >
> > > +@cindex @code{cmla@var{m}4} instruction pattern @item
> > > +@samp{cmla@var{m}4} Perform a vector multiply and accumulate that is
> > > +semantically the same as a multiply and accumulate of complex
> > > +numbers.
> > > +
> > > +@smallexample
> > > +  complex TYPE c[N];
> > > +  complex TYPE a[N];
> > > +  complex TYPE b[N];
> > > +  for (int i = 0; i < N; i += 1)
> > > +    @{
> > > +      c[i] += a[i] * b[i];
> > > +    @}
> > > +@end smallexample
> > > +
> > > +In GCC lane ordering the real part of the number must be in the even
> > > +lanes with the imaginary part in the odd lanes.
> > > +
> > > +The operation is only supported for vector modes @var{m}.
> > > +
> > > +This pattern is not allowed to @code{FAIL}.
> > > +
> > > +@cindex @code{cmla_conj@var{m}4} instruction pattern @item
> > > +@samp{cmla_conj@var{m}4} Perform a vector multiply by conjugate and
> > > +accumulate that is semantically the same as a multiply and accumulate
> > > +of complex numbers where the second multiply arguments is conjugated.
> > > +
> > > +@smallexample
> > > +  complex TYPE c[N];
> > > +  complex TYPE a[N];
> > > +  complex TYPE b[N];
> > > +  for (int i = 0; i < N; i += 1)
> > > +    @{
> > > +      c[i] += a[i] * conj (b[i]);
> > > +    @}
> > > +@end smallexample
> > > +
> > > +In GCC lane ordering the real part of the number must be in the even
> > > +lanes with the imaginary part in the odd lanes.
> > > +
> > > +The operation is only supported for vector modes @var{m}.
> > > +
> > > +This pattern is not allowed to @code{FAIL}.
> > > +
> > >  @cindex @code{cmul@var{m}4} instruction pattern  @item
> > > @samp{cmul@var{m}4}  Perform a vector multiply that is semantically
> > > the same as multiply of diff --git a/gcc/internal-fn.def
> > > b/gcc/internal-fn.def index
> > >
> > 5a0bbe3fe5dee591d54130e60f6996b28164ae38..305450e026d4b94ab62ceb9c
> > a719
> > > ec5570ff43eb 100644
> > > --- a/gcc/internal-fn.def
> > > +++ b/gcc/internal-fn.def
> > > @@ -288,6 +288,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp,
> > > binary)
> > >
> > >  /* Ternary math functions.  */
> > >  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
> > > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
> > > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST,
> > cmla_conj,
> > > +ternary)
> > >
> > >  /* Unary integer ops.  */
> > >  DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb,
> > unary)
> > > diff --git a/gcc/optabs.def b/gcc/optabs.def index
> > >
> > e82396bae1117c6de91304761a560b7fbcb69ce1..8e2758d685ed85e02df10dac
> > 571e
> > > b40d45a294ed 100644
> > > --- a/gcc/optabs.def
> > > +++ b/gcc/optabs.def
> > > @@ -294,6 +294,8 @@ OPTAB_D (cadd90_optab, "cadd90$a3")  OPTAB_D
> > > (cadd270_optab, "cadd270$a3")  OPTAB_D (cmul_optab, "cmul$a3")
> > > OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
> > > +OPTAB_D (cmla_optab, "cmla$a4")
> > > +OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
> > >  OPTAB_D (cos_optab, "cos$a2")
> > >  OPTAB_D (cosh_optab, "cosh$a2")
> > >  OPTAB_D (exp10_optab, "exp10$a2")
> > > diff --git a/gcc/tree-vect-slp-patterns.c
> > > b/gcc/tree-vect-slp-patterns.c index
> > >
> > 82721acbab8cf81c4d6f9954c98fb913a7bb6282..3625a80c08e3d70fd362fc52e1
> > 7e
> > > 65b3b2c7da83 100644
> > > --- a/gcc/tree-vect-slp-patterns.c
> > > +++ b/gcc/tree-vect-slp-patterns.c
> > > @@ -325,6 +325,24 @@ vect_match_expression_p (slp_tree node,
> > tree_code code)
> > >    return true;
> > >  }
> > >
> > > +/* Checks to see if the expression represented by NODE is a call to the
> > internal
> > > +   function FN.  */
> > > +
> > > +static inline bool
> > > +vect_match_call_p (slp_tree node, internal_fn fn) {
> > > +  if (!node
> > > +      || !SLP_TREE_REPRESENTATIVE (node))
> > > +    return false;
> > > +
> > > +  gimple* expr = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE
> > (node));
> > > + if (!expr
> > > +      || !gimple_call_internal_p (expr, fn))
> > > +    return false;
> > > +
> > > +   return true;
> > > +}
> > > +
> > >  /* Check if the given lane permute in PERMUTES matches an alternating
> > sequence
> > >     of {even odd even odd ...}.  This to account for unrolled loops.  Further
> > >     mode there resulting permute must be linear.   */
> > > @@ -1081,6 +1099,161 @@ complex_mul_pattern::build (vec_info *vinfo)
> > >    complex_pattern::build (vinfo);
> > >  }
> > >
> > >
> > +/*********************************************************
> > ***********
> > > +***********
> > > + * complex_fma_pattern class
> > > +
> > >
> > +*********************************************************
> > ************
> > > +*********/
> > > +
> > > +class complex_fma_pattern : public complex_pattern {
> > > +  protected:
> > > +    complex_fma_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> > internal_fn ifn)
> > > +      : complex_pattern (node, m_ops, ifn)
> > > +    {
> > > +      this->m_num_args = 3;
> > > +    }
> > > +
> > > +  public:
> > > +    void build (vec_info *);
> > > +    static internal_fn
> > > +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> > slp_tree *,
> > > +	     vec<slp_tree> *);
> > > +
> > > +    static vect_pattern*
> > > +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> > > +
> > > +    static vect_pattern*
> > > +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> > > +    {
> > > +      return new complex_fma_pattern (node, m_ops, ifn);
> > > +    }
> > > +};
> > > +
> > > +/* Helper function to "reset" a previously matched node and undo the
> > changes
> > > +   made enough so that the node is treated as an irrelevant node.  */
> > > +
> > > +static inline void
> > > +vect_slp_reset_pattern (slp_tree node) {
> > > +  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE
> > > +(node));
> > > +  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
> > > +  STMT_SLP_TYPE (stmt_info) = pure_slp;
> > > +  SLP_TREE_REPRESENTATIVE (node) = stmt_info; }
> > > +
> > > +/* Pattern matcher for trying to match complex multiply and accumulate
> > > +   and multiply and subtract patterns in SLP tree.
> > > +   If the operation matches then IFN is set to the operation it matched and
> > > +   the arguments to the two replacement statements are put in m_ops.
> > > +
> > > +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> > > +
> > > +   This function matches the patterns shaped as:
> > > +
> > > +   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
> > > +   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
> > > +
> > > +   c[i] = c[i] - ax;
> > > +   c[i+1] = c[i+1] + bx;
> > > +
> > > +   If a match occurred then TRUE is returned, else FALSE.  The match is
> > > +   performed after COMPLEX_MUL which would have done the majority of
> > the work.
> > > +   This function merely matches an ADD with a COMPLEX_MUL IFN.  The
> > initial
> > > +   match is expected to be in OP1 and the initial match operands in
> > > + args0.  */
> > > +
> > > +internal_fn
> > > +complex_fma_pattern::matches (complex_operation_t op,
> > > +			      slp_tree_to_load_perm_map_t * /* perm_cache
> > */,
> > > +			      slp_tree *ref_node, vec<slp_tree> *ops) {
> > > +  internal_fn ifn = IFN_LAST;
> > > +
> > > +  /* Find the two components.  We match Complex MUL first which
> > reduces the
> > > +     amount of work this pattern has to do.  After that we just match the
> > > +     head node and we're done.:
> > > +
> > > +     * FMA: + +.
> > > +
> > > +     We need to ignore the two_operands nodes that may also match.
> > > +     For that we can check if they have any scalar statements and also
> > > +     check that it's not a permute node as we're looking for a normal
> > > +     PLUS_EXPR operation.  */
> > > +  if (op != CMPLX_NONE)
> > > +    return IFN_LAST;
> > > +
> > > +  /* Find the two components.  We match Complex MUL first which
> > reduces the
> > > +     amount of work this pattern has to do.  After that we just match the
> > > +     head node and we're done.:
> > > +
> > > +   * FMA: + + on a non-two_operands node.  */
> > > +  slp_tree vnode = *ref_node;
> > > +  if (SLP_TREE_LANE_PERMUTATION (vnode).exists ()
> > > +      /* Need to exclude the plus two-operands node.  These are not
> > marked
> > > +	 so we have to infer it based on conditions.  */
> > > +      || !SLP_TREE_SCALAR_STMTS (vnode).exists ()
> > 
> > as said earlier we shouldn't test this.  The existing lane permute should
> > already cover this - where the test would better be
> > 
> >  SLP_TREE_CODE (vnode) == VEC_PERM_EXPR
> > 
> > > +      || !vect_match_expression_p (vnode, PLUS_EXPR))
> > 
> > But then it shouldn't match this (the vect_match_expression_p should only
> > ever match SLP_TREE_CODE (vnode) != VEC_PERM_EXPR) anyway.
> > 
> 
> How so? An FMA doesn't have a TWO_OPERANDS node as the root since the operations
> Are always two PLUS operations.
> 
> The corresponding tree is
> 
> note:   SLP size 10 vs. limit 24.
> note:   Final SLP tree for instance 0x48f68d0:
> note:   node 0x4809870 (max_nunits=4, refcnt=2)
> note:   op template: REALPART_EXPR <*_3> = _31;
> note:     stmt 0 REALPART_EXPR <*_3> = _31;
> note:     stmt 1 IMAGPART_EXPR <*_3> = _32;
> note:     children 0x48098f8
> note:   node 0x48098f8 (max_nunits=4, refcnt=2)
> note:   op template: _31 = _12 + _29;
> note:     stmt 0 _31 = _12 + _29;
> note:     stmt 1 _32 = _11 + _30;
> note:     children 0x4809980 0x4809a08
> note:   node 0x4809980 (max_nunits=4, refcnt=2)
> note:   op template: _12 = REALPART_EXPR <*_3>;
> note:     stmt 0 _12 = REALPART_EXPR <*_3>;
> note:     stmt 1 _11 = IMAGPART_EXPR <*_3>;
> note:     load permutation { 0 1 }
> note:   node 0x4809a08 (max_nunits=4, refcnt=2)
> note:   op: VEC_PERM_EXPR
> note:     stmt 0 _29 = _25 - _26;
> note:     stmt 1 _30 = _27 + _28;
> note:     lane permutation { 0[0] 1[1] }
> note:     children 0x4809dc0 0x4809e48
> note:   node 0x4809dc0 (max_nunits=1, refcnt=1)
> note:   op template: _29 = _25 - _26;
> note:     { }
> note:     children 0x4809a90 0x4809c28
> note:   node 0x4809a90 (max_nunits=4, refcnt=3)
> note:   op template: _25 = _19 * _22;
> note:     stmt 0 _25 = _19 * _22;
> note:     stmt 1 _27 = _20 * _22;
> note:     children 0x4809b18 0x4809ba0
> note:   node 0x4809b18 (max_nunits=4, refcnt=2)
> note:   op template: _19 = REALPART_EXPR <*_7>;
> note:     stmt 0 _19 = REALPART_EXPR <*_7>;
> note:     stmt 1 _20 = IMAGPART_EXPR <*_7>;
> note:     load permutation { 0 1 }
> note:   node 0x4809ba0 (max_nunits=4, refcnt=2)
> note:   op template: _22 = REALPART_EXPR <*_5>;
> note:     stmt 0 _22 = REALPART_EXPR <*_5>;
> note:     stmt 1 _22 = REALPART_EXPR <*_5>;
> note:     load permutation { 0 0 }
> note:   node 0x4809c28 (max_nunits=4, refcnt=3)
> note:   op template: _26 = _20 * _21;
> note:     stmt 0 _26 = _20 * _21;
> note:     stmt 1 _28 = _19 * _21;
> note:     children 0x4809cb0 0x4809d38
> note:   node 0x4809cb0 (max_nunits=4, refcnt=2)
> note:   op template: _20 = IMAGPART_EXPR <*_7>;
> note:     stmt 0 _20 = IMAGPART_EXPR <*_7>;
> note:     stmt 1 _19 = REALPART_EXPR <*_7>;
> note:     load permutation { 1 0 }
> note:   node 0x4809d38 (max_nunits=4, refcnt=2)
> note:   op template: _21 = IMAGPART_EXPR <*_5>;
> note:     stmt 0 _21 = IMAGPART_EXPR <*_5>;
> note:     stmt 1 _21 = IMAGPART_EXPR <*_5>;
> note:     load permutation { 1 1 }
> note:   node 0x4809e48 (max_nunits=1, refcnt=1)
> note:   op template: _30 = _27 + _28;
> note:     { }
> note:     children 0x4809a90 0x4809c28
> 
> and after matching the MUL all you have are the ADD node going into a COMPLEX_MUL node.

So the above is the original tree - how does the tree look like
at the point you need to rule out 0x4809e48 (and thus have those
COMPLEX_MUL ones)?

As said, !SLP_TREE_SCALAR_STMTS (vnode).exists () is not a good test.

If complex pattern detection doesn't want to see the direct(?) children
of permute nodes then the pattern machinery should provide means
to do this.  Likewise if the pattern should not contain parents
outside of the supposed match the machinery should provide means
to restrict matches to single entry SLP subgraphs (here the
children of the plus have an alternate "entry").

> > > +    return IFN_LAST;
> > > +
> > > +  slp_tree node = SLP_TREE_CHILDREN (vnode)[1];
> > > +
> > > +  if (vect_match_call_p (node, IFN_COMPLEX_MUL))
> > > +    ifn = IFN_COMPLEX_FMA;
> > > +  else if (vect_match_call_p (node, IFN_COMPLEX_MUL_CONJ))
> > > +    ifn = IFN_COMPLEX_FMA_CONJ;
> > > +  else
> > > +    return IFN_LAST;
> > > +
> > > +  if (!vect_pattern_validate_optab (ifn, vnode))
> > > +    return IFN_LAST;
> > > +
> > > +  vect_slp_reset_pattern (node);
> > 
> > I don't understand this ... it deserves a comment at least.
> 
> The previous pass detecting COMPLEX_MUL would have marked the
> Instructions as being inside of a MUL pattern.  These need to be unmarked
> As being part of the COMPLEX_MUL and instead be marked as COMPLEX_FMA.

The scalar pattern code adjusts this at the point it marks the
pattern "consuming" the earlier pattern.   We should try follow that
style here I think.

> > Having no testcases with this patch makes it impossible for me to dig in
> > myself :/
> 
> Sorry, the tests would have made the file too big again.. The previous test for complex add
> Added gcc/testsuite/gcc.dg/vect/complex/complex-operations.c which is an overarching test
> Testing everything in one go.
> 
> The individual tests are split off from that large test.
> 
> > 
> > Otherwise looks OK.
> > 
> > Thanks,
> > Richard.
> > 
> > > +  ops->truncate (0);
> > > +  ops->create (3);
> > > +
> > > +  if (ifn == IFN_COMPLEX_FMA)
> > > +    {
> > > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > > +    }
> > > +  else
> > > +    {
> > > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > > +    }
> > > +
> > > +  return ifn;
> > > +}
> > > +
> > > +/* Attempt to recognize a complex mul pattern.  */
> > > +
> > > +vect_pattern*
> > > +complex_fma_pattern::recognize (slp_tree_to_load_perm_map_t
> > *perm_cache,
> > > +				slp_tree *node)
> > > +{
> > > +  auto_vec<slp_tree> ops;
> > > +  complex_operation_t op
> > > +    = vect_detect_pair_op (*node, true, &ops);
> > > +  internal_fn ifn
> > > +    = complex_fma_pattern::matches (op, perm_cache, node, &ops);
> > > +  if (ifn == IFN_LAST)
> > > +    return NULL;
> > > +
> > > +  return new complex_fma_pattern (node, &ops, ifn); }
> > > +
> > > +/* Perform a replacement of the detected complex mul pattern with the
> > new
> > > +   instruction sequences.  */
> > > +
> > > +void
> > > +complex_fma_pattern::build (vec_info *vinfo) {
> > > +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> > > +  SLP_TREE_CHILDREN (*this->m_node).safe_splice (this->m_ops);
> > > +
> > > +  complex_pattern::build (vinfo);
> > > +}
> > > +
> > >
> > /**********************************************************
> > *********************
> > >   * Pattern matching definitions
> > >
> > >
> > **********************************************************
> > ************
> > > ********/
> > >
> > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> > Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate
  2021-01-08 10:17       ` Richard Biener
@ 2021-01-08 10:21         ` Tamar Christina
  0 siblings, 0 replies; 27+ messages in thread
From: Tamar Christina @ 2021-01-08 10:21 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, ook



> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, January 8, 2021 10:17 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: RE: [PATCH 6/8 v9]middle-end slp: support complex FMA and
> complex FMA conjugate
> 
> On Fri, 8 Jan 2021, Tamar Christina wrote:
> 
> > Hi Richi,
> >
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Friday, January 8, 2021 9:45 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > > Subject: Re: [PATCH 6/8 v9]middle-end slp: support complex FMA and
> > > complex FMA conjugate
> > >
> > > On Mon, 28 Dec 2020, Tamar Christina wrote:
> > >
> > > > Hi All,
> > > >
> > > > This adds support for FMA and FMA conjugated to the slp pattern
> matcher.
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > x86_64-pc-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > 	* internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ): New.
> > > > 	* optabs.def (cmla_optab, cmla_conj_optab): New.
> > > > 	* doc/md.texi: Document them.
> > > > 	* tree-vect-slp-patterns.c (vect_match_call_p,
> > > > 	class complex_fma_pattern, vect_slp_reset_pattern,
> > > > 	complex_fma_pattern::matches, complex_fma_pattern::recognize,
> > > > 	complex_fma_pattern::build): New.
> > > >
> > > > --- inline copy of patch --
> > > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> > > >
> > >
> b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a..6d5a98c4946d3ff4c2b8abea5c
> > > 2
> > > 9
> > > > caa6863fd3f7 100644
> > > > --- a/gcc/doc/md.texi
> > > > +++ b/gcc/doc/md.texi
> > > > @@ -6202,6 +6202,51 @@ The operation is only supported for vector
> > > modes @var{m}.
> > > >
> > > >  This pattern is not allowed to @code{FAIL}.
> > > >
> > > > +@cindex @code{cmla@var{m}4} instruction pattern @item
> > > > +@samp{cmla@var{m}4} Perform a vector multiply and accumulate
> that
> > > > +is semantically the same as a multiply and accumulate of complex
> > > > +numbers.
> > > > +
> > > > +@smallexample
> > > > +  complex TYPE c[N];
> > > > +  complex TYPE a[N];
> > > > +  complex TYPE b[N];
> > > > +  for (int i = 0; i < N; i += 1)
> > > > +    @{
> > > > +      c[i] += a[i] * b[i];
> > > > +    @}
> > > > +@end smallexample
> > > > +
> > > > +In GCC lane ordering the real part of the number must be in the
> > > > +even lanes with the imaginary part in the odd lanes.
> > > > +
> > > > +The operation is only supported for vector modes @var{m}.
> > > > +
> > > > +This pattern is not allowed to @code{FAIL}.
> > > > +
> > > > +@cindex @code{cmla_conj@var{m}4} instruction pattern @item
> > > > +@samp{cmla_conj@var{m}4} Perform a vector multiply by conjugate
> > > > +and accumulate that is semantically the same as a multiply and
> > > > +accumulate of complex numbers where the second multiply
> arguments is conjugated.
> > > > +
> > > > +@smallexample
> > > > +  complex TYPE c[N];
> > > > +  complex TYPE a[N];
> > > > +  complex TYPE b[N];
> > > > +  for (int i = 0; i < N; i += 1)
> > > > +    @{
> > > > +      c[i] += a[i] * conj (b[i]);
> > > > +    @}
> > > > +@end smallexample
> > > > +
> > > > +In GCC lane ordering the real part of the number must be in the
> > > > +even lanes with the imaginary part in the odd lanes.
> > > > +
> > > > +The operation is only supported for vector modes @var{m}.
> > > > +
> > > > +This pattern is not allowed to @code{FAIL}.
> > > > +
> > > >  @cindex @code{cmul@var{m}4} instruction pattern  @item
> > > > @samp{cmul@var{m}4}  Perform a vector multiply that is
> > > > semantically the same as multiply of diff --git
> > > > a/gcc/internal-fn.def b/gcc/internal-fn.def index
> > > >
> > >
> 5a0bbe3fe5dee591d54130e60f6996b28164ae38..305450e026d4b94ab62ceb9c
> > > a719
> > > > ec5570ff43eb 100644
> > > > --- a/gcc/internal-fn.def
> > > > +++ b/gcc/internal-fn.def
> > > > @@ -288,6 +288,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST,
> ldexp,
> > > > binary)
> > > >
> > > >  /* Ternary math functions.  */
> > > >  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
> > > > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla,
> ternary)
> > > > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST,
> > > cmla_conj,
> > > > +ternary)
> > > >
> > > >  /* Unary integer ops.  */
> > > >  DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb,
> > > unary)
> > > > diff --git a/gcc/optabs.def b/gcc/optabs.def index
> > > >
> > >
> e82396bae1117c6de91304761a560b7fbcb69ce1..8e2758d685ed85e02df10dac
> > > 571e
> > > > b40d45a294ed 100644
> > > > --- a/gcc/optabs.def
> > > > +++ b/gcc/optabs.def
> > > > @@ -294,6 +294,8 @@ OPTAB_D (cadd90_optab, "cadd90$a3")
> OPTAB_D
> > > > (cadd270_optab, "cadd270$a3")  OPTAB_D (cmul_optab, "cmul$a3")
> > > > OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
> > > > +OPTAB_D (cmla_optab, "cmla$a4")
> > > > +OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
> > > >  OPTAB_D (cos_optab, "cos$a2")
> > > >  OPTAB_D (cosh_optab, "cosh$a2")
> > > >  OPTAB_D (exp10_optab, "exp10$a2") diff --git
> > > > a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> > > > index
> > > >
> > >
> 82721acbab8cf81c4d6f9954c98fb913a7bb6282..3625a80c08e3d70fd362fc52e1
> > > 7e
> > > > 65b3b2c7da83 100644
> > > > --- a/gcc/tree-vect-slp-patterns.c
> > > > +++ b/gcc/tree-vect-slp-patterns.c
> > > > @@ -325,6 +325,24 @@ vect_match_expression_p (slp_tree node,
> > > tree_code code)
> > > >    return true;
> > > >  }
> > > >
> > > > +/* Checks to see if the expression represented by NODE is a call
> > > > +to the
> > > internal
> > > > +   function FN.  */
> > > > +
> > > > +static inline bool
> > > > +vect_match_call_p (slp_tree node, internal_fn fn) {
> > > > +  if (!node
> > > > +      || !SLP_TREE_REPRESENTATIVE (node))
> > > > +    return false;
> > > > +
> > > > +  gimple* expr = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE
> > > (node));
> > > > + if (!expr
> > > > +      || !gimple_call_internal_p (expr, fn))
> > > > +    return false;
> > > > +
> > > > +   return true;
> > > > +}
> > > > +
> > > >  /* Check if the given lane permute in PERMUTES matches an
> > > > alternating
> > > sequence
> > > >     of {even odd even odd ...}.  This to account for unrolled loops.
> Further
> > > >     mode there resulting permute must be linear.   */
> > > > @@ -1081,6 +1099,161 @@ complex_mul_pattern::build (vec_info
> *vinfo)
> > > >    complex_pattern::build (vinfo);  }
> > > >
> > > >
> > >
> +/*********************************************************
> > > ***********
> > > > +***********
> > > > + * complex_fma_pattern class
> > > > +
> > > >
> > >
> +*********************************************************
> > > ************
> > > > +*********/
> > > > +
> > > > +class complex_fma_pattern : public complex_pattern {
> > > > +  protected:
> > > > +    complex_fma_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> > > internal_fn ifn)
> > > > +      : complex_pattern (node, m_ops, ifn)
> > > > +    {
> > > > +      this->m_num_args = 3;
> > > > +    }
> > > > +
> > > > +  public:
> > > > +    void build (vec_info *);
> > > > +    static internal_fn
> > > > +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t
> *,
> > > slp_tree *,
> > > > +	     vec<slp_tree> *);
> > > > +
> > > > +    static vect_pattern*
> > > > +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> > > > +
> > > > +    static vect_pattern*
> > > > +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> > > > +    {
> > > > +      return new complex_fma_pattern (node, m_ops, ifn);
> > > > +    }
> > > > +};
> > > > +
> > > > +/* Helper function to "reset" a previously matched node and undo the
> > > changes
> > > > +   made enough so that the node is treated as an irrelevant node.  */
> > > > +
> > > > +static inline void
> > > > +vect_slp_reset_pattern (slp_tree node) {
> > > > +  stmt_vec_info stmt_info = vect_orig_stmt
> (SLP_TREE_REPRESENTATIVE
> > > > +(node));
> > > > +  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
> > > > +  STMT_SLP_TYPE (stmt_info) = pure_slp;
> > > > +  SLP_TREE_REPRESENTATIVE (node) = stmt_info; }
> > > > +
> > > > +/* Pattern matcher for trying to match complex multiply and
> accumulate
> > > > +   and multiply and subtract patterns in SLP tree.
> > > > +   If the operation matches then IFN is set to the operation it matched
> and
> > > > +   the arguments to the two replacement statements are put in m_ops.
> > > > +
> > > > +   If no match is found then IFN is set to IFN_LAST and m_ops is
> unchanged.
> > > > +
> > > > +   This function matches the patterns shaped as:
> > > > +
> > > > +   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
> > > > +   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
> > > > +
> > > > +   c[i] = c[i] - ax;
> > > > +   c[i+1] = c[i+1] + bx;
> > > > +
> > > > +   If a match occurred then TRUE is returned, else FALSE.  The match is
> > > > +   performed after COMPLEX_MUL which would have done the
> majority of
> > > the work.
> > > > +   This function merely matches an ADD with a COMPLEX_MUL IFN.
> The
> > > initial
> > > > +   match is expected to be in OP1 and the initial match operands in
> > > > + args0.  */
> > > > +
> > > > +internal_fn
> > > > +complex_fma_pattern::matches (complex_operation_t op,
> > > > +			      slp_tree_to_load_perm_map_t * /* perm_cache
> > > */,
> > > > +			      slp_tree *ref_node, vec<slp_tree> *ops) {
> > > > +  internal_fn ifn = IFN_LAST;
> > > > +
> > > > +  /* Find the two components.  We match Complex MUL first which
> > > reduces the
> > > > +     amount of work this pattern has to do.  After that we just match the
> > > > +     head node and we're done.:
> > > > +
> > > > +     * FMA: + +.
> > > > +
> > > > +     We need to ignore the two_operands nodes that may also match.
> > > > +     For that we can check if they have any scalar statements and also
> > > > +     check that it's not a permute node as we're looking for a normal
> > > > +     PLUS_EXPR operation.  */
> > > > +  if (op != CMPLX_NONE)
> > > > +    return IFN_LAST;
> > > > +
> > > > +  /* Find the two components.  We match Complex MUL first which
> > > reduces the
> > > > +     amount of work this pattern has to do.  After that we just match the
> > > > +     head node and we're done.:
> > > > +
> > > > +   * FMA: + + on a non-two_operands node.  */
> > > > +  slp_tree vnode = *ref_node;
> > > > +  if (SLP_TREE_LANE_PERMUTATION (vnode).exists ()
> > > > +      /* Need to exclude the plus two-operands node.  These are not
> > > marked
> > > > +	 so we have to infer it based on conditions.  */
> > > > +      || !SLP_TREE_SCALAR_STMTS (vnode).exists ()
> > >
> > > as said earlier we shouldn't test this.  The existing lane permute should
> > > already cover this - where the test would better be
> > >
> > >  SLP_TREE_CODE (vnode) == VEC_PERM_EXPR
> > >
> > > > +      || !vect_match_expression_p (vnode, PLUS_EXPR))
> > >
> > > But then it shouldn't match this (the vect_match_expression_p should
> only
> > > ever match SLP_TREE_CODE (vnode) != VEC_PERM_EXPR) anyway.
> > >
> >
> > How so? An FMA doesn't have a TWO_OPERANDS node as the root since
> the operations
> > Are always two PLUS operations.
> >
> > The corresponding tree is
> >
> > note:   SLP size 10 vs. limit 24.
> > note:   Final SLP tree for instance 0x48f68d0:
> > note:   node 0x4809870 (max_nunits=4, refcnt=2)
> > note:   op template: REALPART_EXPR <*_3> = _31;
> > note:     stmt 0 REALPART_EXPR <*_3> = _31;
> > note:     stmt 1 IMAGPART_EXPR <*_3> = _32;
> > note:     children 0x48098f8
> > note:   node 0x48098f8 (max_nunits=4, refcnt=2)
> > note:   op template: _31 = _12 + _29;
> > note:     stmt 0 _31 = _12 + _29;
> > note:     stmt 1 _32 = _11 + _30;
> > note:     children 0x4809980 0x4809a08
> > note:   node 0x4809980 (max_nunits=4, refcnt=2)
> > note:   op template: _12 = REALPART_EXPR <*_3>;
> > note:     stmt 0 _12 = REALPART_EXPR <*_3>;
> > note:     stmt 1 _11 = IMAGPART_EXPR <*_3>;
> > note:     load permutation { 0 1 }
> > note:   node 0x4809a08 (max_nunits=4, refcnt=2)
> > note:   op: VEC_PERM_EXPR
> > note:     stmt 0 _29 = _25 - _26;
> > note:     stmt 1 _30 = _27 + _28;
> > note:     lane permutation { 0[0] 1[1] }
> > note:     children 0x4809dc0 0x4809e48
> > note:   node 0x4809dc0 (max_nunits=1, refcnt=1)
> > note:   op template: _29 = _25 - _26;
> > note:     { }
> > note:     children 0x4809a90 0x4809c28
> > note:   node 0x4809a90 (max_nunits=4, refcnt=3)
> > note:   op template: _25 = _19 * _22;
> > note:     stmt 0 _25 = _19 * _22;
> > note:     stmt 1 _27 = _20 * _22;
> > note:     children 0x4809b18 0x4809ba0
> > note:   node 0x4809b18 (max_nunits=4, refcnt=2)
> > note:   op template: _19 = REALPART_EXPR <*_7>;
> > note:     stmt 0 _19 = REALPART_EXPR <*_7>;
> > note:     stmt 1 _20 = IMAGPART_EXPR <*_7>;
> > note:     load permutation { 0 1 }
> > note:   node 0x4809ba0 (max_nunits=4, refcnt=2)
> > note:   op template: _22 = REALPART_EXPR <*_5>;
> > note:     stmt 0 _22 = REALPART_EXPR <*_5>;
> > note:     stmt 1 _22 = REALPART_EXPR <*_5>;
> > note:     load permutation { 0 0 }
> > note:   node 0x4809c28 (max_nunits=4, refcnt=3)
> > note:   op template: _26 = _20 * _21;
> > note:     stmt 0 _26 = _20 * _21;
> > note:     stmt 1 _28 = _19 * _21;
> > note:     children 0x4809cb0 0x4809d38
> > note:   node 0x4809cb0 (max_nunits=4, refcnt=2)
> > note:   op template: _20 = IMAGPART_EXPR <*_7>;
> > note:     stmt 0 _20 = IMAGPART_EXPR <*_7>;
> > note:     stmt 1 _19 = REALPART_EXPR <*_7>;
> > note:     load permutation { 1 0 }
> > note:   node 0x4809d38 (max_nunits=4, refcnt=2)
> > note:   op template: _21 = IMAGPART_EXPR <*_5>;
> > note:     stmt 0 _21 = IMAGPART_EXPR <*_5>;
> > note:     stmt 1 _21 = IMAGPART_EXPR <*_5>;
> > note:     load permutation { 1 1 }
> > note:   node 0x4809e48 (max_nunits=1, refcnt=1)
> > note:   op template: _30 = _27 + _28;
> > note:     { }
> > note:     children 0x4809a90 0x4809c28
> >
> > and after matching the MUL all you have are the ADD node going into a
> COMPLEX_MUL node.
> 
> So the above is the original tree - how does the tree look like
> at the point you need to rule out 0x4809e48 (and thus have those
> COMPLEX_MUL ones)?
> 
> As said, !SLP_TREE_SCALAR_STMTS (vnode).exists () is not a good test.

Agreed, Sorry I need to start drinking coffee.. I read the above 

> > > But then it shouldn't match this (the vect_match_expression_p should
> only
> > > ever match SLP_TREE_CODE (vnode) != VEC_PERM_EXPR) anyway.
> 

as SLP_TREE_CODE (vnode) == VEC_PERM_EXPR...

I will go respin patches 😊

Thanks!


> If complex pattern detection doesn't want to see the direct(?) children
> of permute nodes then the pattern machinery should provide means
> to do this.  Likewise if the pattern should not contain parents
> outside of the supposed match the machinery should provide means
> to restrict matches to single entry SLP subgraphs (here the
> children of the plus have an alternate "entry").
> 
> > > > +    return IFN_LAST;
> > > > +
> > > > +  slp_tree node = SLP_TREE_CHILDREN (vnode)[1];
> > > > +
> > > > +  if (vect_match_call_p (node, IFN_COMPLEX_MUL))
> > > > +    ifn = IFN_COMPLEX_FMA;
> > > > +  else if (vect_match_call_p (node, IFN_COMPLEX_MUL_CONJ))
> > > > +    ifn = IFN_COMPLEX_FMA_CONJ;
> > > > +  else
> > > > +    return IFN_LAST;
> > > > +
> > > > +  if (!vect_pattern_validate_optab (ifn, vnode))
> > > > +    return IFN_LAST;
> > > > +
> > > > +  vect_slp_reset_pattern (node);
> > >
> > > I don't understand this ... it deserves a comment at least.
> >
> > The previous pass detecting COMPLEX_MUL would have marked the
> > Instructions as being inside of a MUL pattern.  These need to be unmarked
> > As being part of the COMPLEX_MUL and instead be marked as
> COMPLEX_FMA.
> 
> The scalar pattern code adjusts this at the point it marks the
> pattern "consuming" the earlier pattern.   We should try follow that
> style here I think.
> 
> > > Having no testcases with this patch makes it impossible for me to dig in
> > > myself :/
> >
> > Sorry, the tests would have made the file too big again.. The previous test
> for complex add
> > Added gcc/testsuite/gcc.dg/vect/complex/complex-operations.c which is
> an overarching test
> > Testing everything in one go.
> >
> > The individual tests are split off from that large test.
> >
> > >
> > > Otherwise looks OK.
> > >
> > > Thanks,
> > > Richard.
> > >
> > > > +  ops->truncate (0);
> > > > +  ops->create (3);
> > > > +
> > > > +  if (ifn == IFN_COMPLEX_FMA)
> > > > +    {
> > > > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > > > +    }
> > > > +  else
> > > > +    {
> > > > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > > > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > > > +    }
> > > > +
> > > > +  return ifn;
> > > > +}
> > > > +
> > > > +/* Attempt to recognize a complex mul pattern.  */
> > > > +
> > > > +vect_pattern*
> > > > +complex_fma_pattern::recognize (slp_tree_to_load_perm_map_t
> > > *perm_cache,
> > > > +				slp_tree *node)
> > > > +{
> > > > +  auto_vec<slp_tree> ops;
> > > > +  complex_operation_t op
> > > > +    = vect_detect_pair_op (*node, true, &ops);
> > > > +  internal_fn ifn
> > > > +    = complex_fma_pattern::matches (op, perm_cache, node, &ops);
> > > > +  if (ifn == IFN_LAST)
> > > > +    return NULL;
> > > > +
> > > > +  return new complex_fma_pattern (node, &ops, ifn); }
> > > > +
> > > > +/* Perform a replacement of the detected complex mul pattern with
> the
> > > new
> > > > +   instruction sequences.  */
> > > > +
> > > > +void
> > > > +complex_fma_pattern::build (vec_info *vinfo) {
> > > > +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> > > > +  SLP_TREE_CHILDREN (*this->m_node).safe_splice (this->m_ops);
> > > > +
> > > > +  complex_pattern::build (vinfo);
> > > > +}
> > > > +
> > > >
> > >
> /**********************************************************
> > > *********************
> > > >   * Pattern matching definitions
> > > >
> > > >
> > >
> **********************************************************
> > > ************
> > > > ********/
> > > >
> > > >
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> > > Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg,
> Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate
  2021-01-08  9:45   ` Richard Biener
  2021-01-08  9:59     ` Tamar Christina
@ 2021-01-11 10:24     ` Tamar Christina
  1 sibling, 0 replies; 27+ messages in thread
From: Tamar Christina @ 2021-01-11 10:24 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd

Hi Richi,

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, January 8, 2021 9:45 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: Re: [PATCH 6/8 v9]middle-end slp: support complex FMA and
> complex FMA conjugate
> 
> On Mon, 28 Dec 2020, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This adds support for FMA and FMA conjugated to the slp pattern matcher.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* internal-fn.def (COMPLEX_FMA, COMPLEX_FMA_CONJ): New.
> > 	* optabs.def (cmla_optab, cmla_conj_optab): New.
> > 	* doc/md.texi: Document them.
> > 	* tree-vect-slp-patterns.c (vect_match_call_p,
> > 	class complex_fma_pattern, vect_slp_reset_pattern,
> > 	complex_fma_pattern::matches, complex_fma_pattern::recognize,
> > 	complex_fma_pattern::build): New.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> >
> b8cc90e1a75e402abbf8a8cf2efefc1a333f8b3a..6d5a98c4946d3ff4c2b8abea5c2
> 9
> > caa6863fd3f7 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -6202,6 +6202,51 @@ The operation is only supported for vector
> modes @var{m}.
> >
> >  This pattern is not allowed to @code{FAIL}.
> >
> > +@cindex @code{cmla@var{m}4} instruction pattern @item
> > +@samp{cmla@var{m}4} Perform a vector multiply and accumulate that is
> > +semantically the same as a multiply and accumulate of complex
> > +numbers.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] += a[i] * b[i];
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> > +@cindex @code{cmla_conj@var{m}4} instruction pattern @item
> > +@samp{cmla_conj@var{m}4} Perform a vector multiply by conjugate and
> > +accumulate that is semantically the same as a multiply and accumulate
> > +of complex numbers where the second multiply arguments is conjugated.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] += a[i] * conj (b[i]);
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> >  @cindex @code{cmul@var{m}4} instruction pattern  @item
> > @samp{cmul@var{m}4}  Perform a vector multiply that is semantically
> > the same as multiply of diff --git a/gcc/internal-fn.def
> > b/gcc/internal-fn.def index
> >
> 5a0bbe3fe5dee591d54130e60f6996b28164ae38..305450e026d4b94ab62ceb9c
> a719
> > ec5570ff43eb 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -288,6 +288,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp,
> > binary)
> >
> >  /* Ternary math functions.  */
> >  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST,
> cmla_conj,
> > +ternary)
> >
> >  /* Unary integer ops.  */
> >  DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb,
> unary)
> > diff --git a/gcc/optabs.def b/gcc/optabs.def index
> >
> e82396bae1117c6de91304761a560b7fbcb69ce1..8e2758d685ed85e02df10dac
> 571e
> > b40d45a294ed 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -294,6 +294,8 @@ OPTAB_D (cadd90_optab, "cadd90$a3")  OPTAB_D
> > (cadd270_optab, "cadd270$a3")  OPTAB_D (cmul_optab, "cmul$a3")
> > OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
> > +OPTAB_D (cmla_optab, "cmla$a4")
> > +OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
> >  OPTAB_D (cos_optab, "cos$a2")
> >  OPTAB_D (cosh_optab, "cosh$a2")
> >  OPTAB_D (exp10_optab, "exp10$a2")
> > diff --git a/gcc/tree-vect-slp-patterns.c
> > b/gcc/tree-vect-slp-patterns.c index
> >
> 82721acbab8cf81c4d6f9954c98fb913a7bb6282..3625a80c08e3d70fd362fc52e1
> 7e
> > 65b3b2c7da83 100644
> > --- a/gcc/tree-vect-slp-patterns.c
> > +++ b/gcc/tree-vect-slp-patterns.c
> > @@ -325,6 +325,24 @@ vect_match_expression_p (slp_tree node,
> tree_code code)
> >    return true;
> >  }
> >
> > +/* Checks to see if the expression represented by NODE is a call to the
> internal
> > +   function FN.  */
> > +
> > +static inline bool
> > +vect_match_call_p (slp_tree node, internal_fn fn) {
> > +  if (!node
> > +      || !SLP_TREE_REPRESENTATIVE (node))
> > +    return false;
> > +
> > +  gimple* expr = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE
> (node));
> > + if (!expr
> > +      || !gimple_call_internal_p (expr, fn))
> > +    return false;
> > +
> > +   return true;
> > +}
> > +
> >  /* Check if the given lane permute in PERMUTES matches an alternating
> sequence
> >     of {even odd even odd ...}.  This to account for unrolled loops.  Further
> >     mode there resulting permute must be linear.   */
> > @@ -1081,6 +1099,161 @@ complex_mul_pattern::build (vec_info *vinfo)
> >    complex_pattern::build (vinfo);
> >  }
> >
> >
> +/*********************************************************
> ***********
> > +***********
> > + * complex_fma_pattern class
> > +
> >
> +*********************************************************
> ************
> > +*********/
> > +
> > +class complex_fma_pattern : public complex_pattern {
> > +  protected:
> > +    complex_fma_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> internal_fn ifn)
> > +      : complex_pattern (node, m_ops, ifn)
> > +    {
> > +      this->m_num_args = 3;
> > +    }
> > +
> > +  public:
> > +    void build (vec_info *);
> > +    static internal_fn
> > +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> slp_tree *,
> > +	     vec<slp_tree> *);
> > +
> > +    static vect_pattern*
> > +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> > +
> > +    static vect_pattern*
> > +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> > +    {
> > +      return new complex_fma_pattern (node, m_ops, ifn);
> > +    }
> > +};
> > +
> > +/* Helper function to "reset" a previously matched node and undo the
> changes
> > +   made enough so that the node is treated as an irrelevant node.  */
> > +
> > +static inline void
> > +vect_slp_reset_pattern (slp_tree node) {
> > +  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE
> > +(node));
> > +  STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
> > +  STMT_SLP_TYPE (stmt_info) = pure_slp;
> > +  SLP_TREE_REPRESENTATIVE (node) = stmt_info; }
> > +
> > +/* Pattern matcher for trying to match complex multiply and accumulate
> > +   and multiply and subtract patterns in SLP tree.
> > +   If the operation matches then IFN is set to the operation it matched and
> > +   the arguments to the two replacement statements are put in m_ops.
> > +
> > +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> > +
> > +   This function matches the patterns shaped as:
> > +
> > +   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
> > +   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
> > +
> > +   c[i] = c[i] - ax;
> > +   c[i+1] = c[i+1] + bx;
> > +
> > +   If a match occurred then TRUE is returned, else FALSE.  The match is
> > +   performed after COMPLEX_MUL which would have done the majority of
> the work.
> > +   This function merely matches an ADD with a COMPLEX_MUL IFN.  The
> initial
> > +   match is expected to be in OP1 and the initial match operands in
> > + args0.  */
> > +
> > +internal_fn
> > +complex_fma_pattern::matches (complex_operation_t op,
> > +			      slp_tree_to_load_perm_map_t * /* perm_cache
> */,
> > +			      slp_tree *ref_node, vec<slp_tree> *ops) {
> > +  internal_fn ifn = IFN_LAST;
> > +
> > +  /* Find the two components.  We match Complex MUL first which
> reduces the
> > +     amount of work this pattern has to do.  After that we just match the
> > +     head node and we're done.:
> > +
> > +     * FMA: + +.
> > +
> > +     We need to ignore the two_operands nodes that may also match.
> > +     For that we can check if they have any scalar statements and also
> > +     check that it's not a permute node as we're looking for a normal
> > +     PLUS_EXPR operation.  */
> > +  if (op != CMPLX_NONE)
> > +    return IFN_LAST;
> > +
> > +  /* Find the two components.  We match Complex MUL first which
> reduces the
> > +     amount of work this pattern has to do.  After that we just match the
> > +     head node and we're done.:
> > +
> > +   * FMA: + + on a non-two_operands node.  */
> > +  slp_tree vnode = *ref_node;
> > +  if (SLP_TREE_LANE_PERMUTATION (vnode).exists ()
> > +      /* Need to exclude the plus two-operands node.  These are not
> marked
> > +	 so we have to infer it based on conditions.  */
> > +      || !SLP_TREE_SCALAR_STMTS (vnode).exists ()
> 
> as said earlier we shouldn't test this.  The existing lane permute should
> already cover this - where the test would better be
> 
>  SLP_TREE_CODE (vnode) == VEC_PERM_EXPR
> 
> > +      || !vect_match_expression_p (vnode, PLUS_EXPR))
> 
> But then it shouldn't match this (the vect_match_expression_p should only
> ever match SLP_TREE_CODE (vnode) != VEC_PERM_EXPR) anyway.
> 
> > +    return IFN_LAST;
> > +
> > +  slp_tree node = SLP_TREE_CHILDREN (vnode)[1];
> > +
> > +  if (vect_match_call_p (node, IFN_COMPLEX_MUL))
> > +    ifn = IFN_COMPLEX_FMA;
> > +  else if (vect_match_call_p (node, IFN_COMPLEX_MUL_CONJ))
> > +    ifn = IFN_COMPLEX_FMA_CONJ;
> > +  else
> > +    return IFN_LAST;
> > +
> > +  if (!vect_pattern_validate_optab (ifn, vnode))
> > +    return IFN_LAST;
> > +
> > +  vect_slp_reset_pattern (node);
> 
> I don't understand this ... it deserves a comment at least.
> Having no testcases with this patch makes it impossible for me to dig in
> myself :/

I cleaned up the things pointed out in the review and added this comment:

  /* FMA matched ADD + CMUL.  During the matching of CMUL the
     stmt that starts the pattern is marked as being in a pattern,
     namely the CMUL.  When replacing this with a CFMA we have to
     unmark this statement as being in a pattern.  This is because
     vect_mark_pattern_stmts will only mark the current stmt as being
     in a pattern.  Later on when the scalar stmts are examined the
     old statement which is supposed to be irrelevant will point to
     CMUL unless we undo the pattern relationship here.  */
  vect_slp_reset_pattern (node);

As for testcases, they're quite simple, the final commit will contain a full range.
I usually try to send the testcases a long but the number of ISA this supports at
Once has forced me to test outside of the testsuite for quick iteration and moving
them into the testsuite proper is taking a bit of time :/

But they're all in the shape of

#include <stdio.h>
#include <complex.h>

#define N 200
#define ROT
#define TYPE float
#define TYPE2 float

void g (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
{
  for (int i=0; i < N; i++)
    {
      c[i] +=  a[i] * (b[i] ROT);
    }
}

void g_f1 (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
{
  for (int i=0; i < N; i++)
    {
      c[i] +=  conjf (a[i]) * (b[i] ROT);
    }
}

void g_s1 (TYPE2 complex a[restrict N], TYPE complex b[restrict N], TYPE complex c[restrict N])
{
  for (int i=0; i < N; i++)
    {
      c[i] +=  a[i] * conjf (b[i] ROT);
    }
}

void caxpy_add(double complex * restrict y, double complex * restrict x, size_t N, double complex f) {
  for (size_t i = 0; i < N; ++i)
    y[i] += x[i]* f;
}

Regards,
Tamar

> 
> Otherwise looks OK.
> 
> Thanks,
> Richard.
> 
> > +  ops->truncate (0);
> > +  ops->create (3);
> > +
> > +  if (ifn == IFN_COMPLEX_FMA)
> > +    {
> > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > +    }
> > +  else
> > +    {
> > +      ops->quick_push (SLP_TREE_CHILDREN (vnode)[0]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[0]);
> > +      ops->quick_push (SLP_TREE_CHILDREN (node)[1]);
> > +    }
> > +
> > +  return ifn;
> > +}
> > +
> > +/* Attempt to recognize a complex mul pattern.  */
> > +
> > +vect_pattern*
> > +complex_fma_pattern::recognize (slp_tree_to_load_perm_map_t
> *perm_cache,
> > +				slp_tree *node)
> > +{
> > +  auto_vec<slp_tree> ops;
> > +  complex_operation_t op
> > +    = vect_detect_pair_op (*node, true, &ops);
> > +  internal_fn ifn
> > +    = complex_fma_pattern::matches (op, perm_cache, node, &ops);
> > +  if (ifn == IFN_LAST)
> > +    return NULL;
> > +
> > +  return new complex_fma_pattern (node, &ops, ifn); }
> > +
> > +/* Perform a replacement of the detected complex mul pattern with the
> new
> > +   instruction sequences.  */
> > +
> > +void
> > +complex_fma_pattern::build (vec_info *vinfo) {
> > +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> > +  SLP_TREE_CHILDREN (*this->m_node).safe_splice (this->m_ops);
> > +
> > +  complex_pattern::build (vinfo);
> > +}
> > +
> >
> /**********************************************************
> *********************
> >   * Pattern matching definitions
> >
> >
> **********************************************************
> ************
> > ********/
> >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 7/8 v9]middle-end slp: support complex FMS and complex FMS conjugate
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
                   ` (4 preceding siblings ...)
  2020-12-28 13:37 ` [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate Tamar Christina
@ 2020-12-28 13:38 ` Tamar Christina
  2021-01-08  9:49   ` Richard Biener
  2020-12-28 13:38 ` [PATCH 8/8 v9]middle-end slp: Add complex operations class to share first match among all matchers Tamar Christina
  2021-01-07 13:20 ` [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Richard Biener
  7 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:38 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 9344 bytes --]

Hi All,

This adds support for FMS and FMS conjugated to the slp pattern matcher.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* internal-fn.def (COMPLEX_FMS, COMPLEX_FMS_CONJ): New.
	* optabs.def (cmls_optab, cmls_conj_optab): New.
	* doc/md.texi: Document them.
	* tree-vect-slp-patterns.c (class complex_fms_pattern,
	complex_fms_pattern::matches, complex_fms_pattern::recognize,
	complex_fms_pattern::build): New.

--- inline copy of patch -- 
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6d5a98c4946d3ff4c2b8abea5c29caa6863fd3f7..3f5a42df285b3ee162edc9ec661f25c0eec5e4fa 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6247,6 +6247,51 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmls@var{m}4} instruction pattern
+@item @samp{cmls@var{m}4}
+Perform a vector multiply and subtract that is semantically the same as
+a multiply and subtract of complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] -= a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmls_conj@var{m}4} instruction pattern
+@item @samp{cmls_conj@var{m}4}
+Perform a vector multiply by conjugate and subtract that is semantically
+the same as a multiply and subtract of complex numbers where the second
+multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] -= a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{cmul@var{m}4} instruction pattern
 @item @samp{cmul@var{m}4}
 Perform a vector multiply that is semantically the same as multiply of
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 305450e026d4b94ab62ceb9ca719ec5570ff43eb..c8161509d9497afe58f32bde12d8e6bd7b876a3c 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -290,6 +290,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST, cmla_conj, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS, ECF_CONST, cmls, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS_CONJ, ECF_CONST, cmls_conj, ternary)
 
 /* Unary integer ops.  */
 DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 8e2758d685ed85e02df10dac571eb40d45a294ed..320bb5f3dce31867d312bbbb6a4c6e31c534254e 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -296,6 +296,8 @@ OPTAB_D (cmul_optab, "cmul$a3")
 OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
 OPTAB_D (cmla_optab, "cmla$a4")
 OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
+OPTAB_D (cmls_optab, "cmls$a4")
+OPTAB_D (cmls_conj_optab, "cmls_conj$a4")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 3625a80c08e3d70fd362fc52e17e65b3b2c7da83..ab6587f0b8522ec5f916f74e7e7401b1f7a35bbb 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -1254,6 +1254,181 @@ complex_fma_pattern::build (vec_info *vinfo)
   complex_pattern::build (vinfo);
 }
 
+/*******************************************************************************
+ * complex_fms_pattern class
+ ******************************************************************************/
+
+class complex_fms_pattern : public complex_pattern
+{
+  protected:
+    complex_fms_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 3;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_fms_pattern (node, m_ops, ifn);
+    }
+};
+
+
+/* Pattern matcher for trying to match complex multiply and accumulate
+   and multiply and subtract patterns in SLP tree.
+   If the operation matches then IFN is set to the operation it matched and
+   the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
+   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The initial match is
+   expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_fms_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t *perm_cache,
+			      slp_tree * ref_node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  /* Find the two components.  We match Complex MUL first which reduces the
+     amount of work this pattern has to do.  After that we just match the
+     head node and we're done.:
+
+     * FMS: - +.  */
+  slp_tree child = NULL;
+
+  /* We need to ignore the two_operands nodes that may also match,
+     for that we can check if they have any scalar statements and also
+     check that it's not a permute node as we're looking for a normal
+     PLUS_EXPR operation.  */
+  if (op != PLUS_MINUS)
+    return IFN_LAST;
+
+  child = SLP_TREE_CHILDREN ((*ops)[1])[1];
+  if (vect_detect_pair_op (child) != MINUS_PLUS)
+    return IFN_LAST;
+
+  /* First two nodes must be a multiply.  */
+  auto_vec<slp_tree> muls;
+  if (vect_match_call_complex_mla (child, 0) != MULT_MULT
+      || vect_match_call_complex_mla (child, 1, &muls) != MULT_MULT)
+    return IFN_LAST;
+
+  /* Now operand2+4 may lead to another expression.  */
+  auto_vec<slp_tree> left_op, right_op;
+  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
+  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
+
+  bool is_neg = vect_normalize_conj_loc (left_op);
+
+  child = SLP_TREE_CHILDREN ((*ops)[1])[0];
+  bool conj_first_operand;
+  if (!vect_validate_multiplication (perm_cache, right_op, left_op, false,
+				     &conj_first_operand, true))
+    return IFN_LAST;
+
+  if (!is_neg)
+    ifn = IFN_COMPLEX_FMS;
+  else if (is_neg)
+    ifn = IFN_COMPLEX_FMS_CONJ;
+
+  if (!vect_pattern_validate_optab (ifn, *ref_node))
+    return IFN_LAST;
+
+  ops->truncate (0);
+  ops->create (4);
+
+  complex_perm_kinds_t kind = linear_loads_p (perm_cache, right_op[0]).first;
+  if (kind == PERM_EVENODD)
+    {
+      ops->quick_push (child);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_TOP)
+    {
+      ops->quick_push (child);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[0]);
+    }
+  else
+    {
+      ops->quick_push (child);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_fms_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_fms_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_fms_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_fms_pattern::build (vec_info *vinfo)
+{
+  auto_vec<slp_tree> nodes;
+
+  /* First re-arrange the children.  */
+  nodes.create (3);
+
+  nodes.quick_push (this->m_ops[0]);
+  nodes.quick_push (this->m_ops[1]);
+  nodes.quick_push (
+    vect_build_combine_node (this->m_ops[2], this->m_ops[3], *this->m_node));
+  SLP_TREE_REF_COUNT (this->m_ops[0])++;
+  SLP_TREE_REF_COUNT (this->m_ops[1])++;
+
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
+    vect_free_slp_tree (node);
+
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);
+
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/


-- 

[-- Attachment #2: rb13962.patch --]
[-- Type: text/x-diff, Size: 8801 bytes --]

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6d5a98c4946d3ff4c2b8abea5c29caa6863fd3f7..3f5a42df285b3ee162edc9ec661f25c0eec5e4fa 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6247,6 +6247,51 @@ The operation is only supported for vector modes @var{m}.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{cmls@var{m}4} instruction pattern
+@item @samp{cmls@var{m}4}
+Perform a vector multiply and subtract that is semantically the same as
+a multiply and subtract of complex numbers.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] -= a[i] * b[i];
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{cmls_conj@var{m}4} instruction pattern
+@item @samp{cmls_conj@var{m}4}
+Perform a vector multiply by conjugate and subtract that is semantically
+the same as a multiply and subtract of complex numbers where the second
+multiply arguments is conjugated.
+
+@smallexample
+  complex TYPE c[N];
+  complex TYPE a[N];
+  complex TYPE b[N];
+  for (int i = 0; i < N; i += 1)
+    @{
+      c[i] -= a[i] * conj (b[i]);
+    @}
+@end smallexample
+
+In GCC lane ordering the real part of the number must be in the even lanes with
+the imaginary part in the odd lanes.
+
+The operation is only supported for vector modes @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{cmul@var{m}4} instruction pattern
 @item @samp{cmul@var{m}4}
 Perform a vector multiply that is semantically the same as multiply of
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 305450e026d4b94ab62ceb9ca719ec5570ff43eb..c8161509d9497afe58f32bde12d8e6bd7b876a3c 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -290,6 +290,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST, cmla_conj, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS, ECF_CONST, cmls, ternary)
+DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS_CONJ, ECF_CONST, cmls_conj, ternary)
 
 /* Unary integer ops.  */
 DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 8e2758d685ed85e02df10dac571eb40d45a294ed..320bb5f3dce31867d312bbbb6a4c6e31c534254e 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -296,6 +296,8 @@ OPTAB_D (cmul_optab, "cmul$a3")
 OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
 OPTAB_D (cmla_optab, "cmla$a4")
 OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
+OPTAB_D (cmls_optab, "cmls$a4")
+OPTAB_D (cmls_conj_optab, "cmls_conj$a4")
 OPTAB_D (cos_optab, "cos$a2")
 OPTAB_D (cosh_optab, "cosh$a2")
 OPTAB_D (exp10_optab, "exp10$a2")
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 3625a80c08e3d70fd362fc52e17e65b3b2c7da83..ab6587f0b8522ec5f916f74e7e7401b1f7a35bbb 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -1254,6 +1254,181 @@ complex_fma_pattern::build (vec_info *vinfo)
   complex_pattern::build (vinfo);
 }
 
+/*******************************************************************************
+ * complex_fms_pattern class
+ ******************************************************************************/
+
+class complex_fms_pattern : public complex_pattern
+{
+  protected:
+    complex_fms_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 3;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+
+    static vect_pattern*
+    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
+    {
+      return new complex_fms_pattern (node, m_ops, ifn);
+    }
+};
+
+
+/* Pattern matcher for trying to match complex multiply and accumulate
+   and multiply and subtract patterns in SLP tree.
+   If the operation matches then IFN is set to the operation it matched and
+   the arguments to the two replacement statements are put in m_ops.
+
+   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
+
+   This function matches the patterns shaped as:
+
+   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
+   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
+
+   c[i] = c[i] - ax;
+   c[i+1] = c[i+1] + bx;
+
+   If a match occurred then TRUE is returned, else FALSE.  The initial match is
+   expected to be in OP1 and the initial match operands in args0.  */
+
+internal_fn
+complex_fms_pattern::matches (complex_operation_t op,
+			      slp_tree_to_load_perm_map_t *perm_cache,
+			      slp_tree * ref_node, vec<slp_tree> *ops)
+{
+  internal_fn ifn = IFN_LAST;
+
+  /* Find the two components.  We match Complex MUL first which reduces the
+     amount of work this pattern has to do.  After that we just match the
+     head node and we're done.:
+
+     * FMS: - +.  */
+  slp_tree child = NULL;
+
+  /* We need to ignore the two_operands nodes that may also match,
+     for that we can check if they have any scalar statements and also
+     check that it's not a permute node as we're looking for a normal
+     PLUS_EXPR operation.  */
+  if (op != PLUS_MINUS)
+    return IFN_LAST;
+
+  child = SLP_TREE_CHILDREN ((*ops)[1])[1];
+  if (vect_detect_pair_op (child) != MINUS_PLUS)
+    return IFN_LAST;
+
+  /* First two nodes must be a multiply.  */
+  auto_vec<slp_tree> muls;
+  if (vect_match_call_complex_mla (child, 0) != MULT_MULT
+      || vect_match_call_complex_mla (child, 1, &muls) != MULT_MULT)
+    return IFN_LAST;
+
+  /* Now operand2+4 may lead to another expression.  */
+  auto_vec<slp_tree> left_op, right_op;
+  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
+  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
+
+  bool is_neg = vect_normalize_conj_loc (left_op);
+
+  child = SLP_TREE_CHILDREN ((*ops)[1])[0];
+  bool conj_first_operand;
+  if (!vect_validate_multiplication (perm_cache, right_op, left_op, false,
+				     &conj_first_operand, true))
+    return IFN_LAST;
+
+  if (!is_neg)
+    ifn = IFN_COMPLEX_FMS;
+  else if (is_neg)
+    ifn = IFN_COMPLEX_FMS_CONJ;
+
+  if (!vect_pattern_validate_optab (ifn, *ref_node))
+    return IFN_LAST;
+
+  ops->truncate (0);
+  ops->create (4);
+
+  complex_perm_kinds_t kind = linear_loads_p (perm_cache, right_op[0]).first;
+  if (kind == PERM_EVENODD)
+    {
+      ops->quick_push (child);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (left_op[0]);
+    }
+  else if (kind == PERM_TOP)
+    {
+      ops->quick_push (child);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[0]);
+    }
+  else
+    {
+      ops->quick_push (child);
+      ops->quick_push (right_op[1]);
+      ops->quick_push (right_op[0]);
+      ops->quick_push (left_op[1]);
+    }
+
+  return ifn;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_fms_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn
+    = complex_fms_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn == IFN_LAST)
+    return NULL;
+
+  return new complex_fms_pattern (node, &ops, ifn);
+}
+
+/* Perform a replacement of the detected complex mul pattern with the new
+   instruction sequences.  */
+
+void
+complex_fms_pattern::build (vec_info *vinfo)
+{
+  auto_vec<slp_tree> nodes;
+
+  /* First re-arrange the children.  */
+  nodes.create (3);
+
+  nodes.quick_push (this->m_ops[0]);
+  nodes.quick_push (this->m_ops[1]);
+  nodes.quick_push (
+    vect_build_combine_node (this->m_ops[2], this->m_ops[3], *this->m_node));
+  SLP_TREE_REF_COUNT (this->m_ops[0])++;
+  SLP_TREE_REF_COUNT (this->m_ops[1])++;
+
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
+    vect_free_slp_tree (node);
+
+  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
+  SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);
+
+  complex_pattern::build (vinfo);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 7/8 v9]middle-end slp: support complex FMS and complex FMS conjugate
  2020-12-28 13:38 ` [PATCH 7/8 v9]middle-end slp: support complex FMS and complex FMS conjugate Tamar Christina
@ 2021-01-08  9:49   ` Richard Biener
  2021-01-08 10:02     ` Tamar Christina
  0 siblings, 1 reply; 27+ messages in thread
From: Richard Biener @ 2021-01-08  9:49 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd

On Mon, 28 Dec 2020, Tamar Christina wrote:

> Hi All,
> 
> This adds support for FMS and FMS conjugated to the slp pattern matcher.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?

Interestingly this patch looks different from the FMA one(!?).  I
would have expected to have the same pattern for FMA and FMS in the
end.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* internal-fn.def (COMPLEX_FMS, COMPLEX_FMS_CONJ): New.
> 	* optabs.def (cmls_optab, cmls_conj_optab): New.
> 	* doc/md.texi: Document them.
> 	* tree-vect-slp-patterns.c (class complex_fms_pattern,
> 	complex_fms_pattern::matches, complex_fms_pattern::recognize,
> 	complex_fms_pattern::build): New.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 6d5a98c4946d3ff4c2b8abea5c29caa6863fd3f7..3f5a42df285b3ee162edc9ec661f25c0eec5e4fa 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -6247,6 +6247,51 @@ The operation is only supported for vector modes @var{m}.
>  
>  This pattern is not allowed to @code{FAIL}.
>  
> +@cindex @code{cmls@var{m}4} instruction pattern
> +@item @samp{cmls@var{m}4}
> +Perform a vector multiply and subtract that is semantically the same as
> +a multiply and subtract of complex numbers.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] -= a[i] * b[i];
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
> +@cindex @code{cmls_conj@var{m}4} instruction pattern
> +@item @samp{cmls_conj@var{m}4}
> +Perform a vector multiply by conjugate and subtract that is semantically
> +the same as a multiply and subtract of complex numbers where the second
> +multiply arguments is conjugated.
> +
> +@smallexample
> +  complex TYPE c[N];
> +  complex TYPE a[N];
> +  complex TYPE b[N];
> +  for (int i = 0; i < N; i += 1)
> +    @{
> +      c[i] -= a[i] * conj (b[i]);
> +    @}
> +@end smallexample
> +
> +In GCC lane ordering the real part of the number must be in the even lanes with
> +the imaginary part in the odd lanes.
> +
> +The operation is only supported for vector modes @var{m}.
> +
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{cmul@var{m}4} instruction pattern
>  @item @samp{cmul@var{m}4}
>  Perform a vector multiply that is semantically the same as multiply of
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 305450e026d4b94ab62ceb9ca719ec5570ff43eb..c8161509d9497afe58f32bde12d8e6bd7b876a3c 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -290,6 +290,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
>  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
>  DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST, cmla_conj, ternary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS, ECF_CONST, cmls, ternary)
> +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS_CONJ, ECF_CONST, cmls_conj, ternary)
>  
>  /* Unary integer ops.  */
>  DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb, unary)
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 8e2758d685ed85e02df10dac571eb40d45a294ed..320bb5f3dce31867d312bbbb6a4c6e31c534254e 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -296,6 +296,8 @@ OPTAB_D (cmul_optab, "cmul$a3")
>  OPTAB_D (cmul_conj_optab, "cmul_conj$a3")
>  OPTAB_D (cmla_optab, "cmla$a4")
>  OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
> +OPTAB_D (cmls_optab, "cmls$a4")
> +OPTAB_D (cmls_conj_optab, "cmls_conj$a4")
>  OPTAB_D (cos_optab, "cos$a2")
>  OPTAB_D (cosh_optab, "cosh$a2")
>  OPTAB_D (exp10_optab, "exp10$a2")
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index 3625a80c08e3d70fd362fc52e17e65b3b2c7da83..ab6587f0b8522ec5f916f74e7e7401b1f7a35bbb 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -1254,6 +1254,181 @@ complex_fma_pattern::build (vec_info *vinfo)
>    complex_pattern::build (vinfo);
>  }
>  
> +/*******************************************************************************
> + * complex_fms_pattern class
> + ******************************************************************************/
> +
> +class complex_fms_pattern : public complex_pattern
> +{
> +  protected:
> +    complex_fms_pattern (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +      : complex_pattern (node, m_ops, ifn)
> +    {
> +      this->m_num_args = 3;
> +    }
> +
> +  public:
> +    void build (vec_info *);
> +    static internal_fn
> +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
> +	     vec<slp_tree> *);
> +
> +    static vect_pattern*
> +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> +
> +    static vect_pattern*
> +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> +    {
> +      return new complex_fms_pattern (node, m_ops, ifn);
> +    }
> +};
> +
> +
> +/* Pattern matcher for trying to match complex multiply and accumulate
> +   and multiply and subtract patterns in SLP tree.
> +   If the operation matches then IFN is set to the operation it matched and
> +   the arguments to the two replacement statements are put in m_ops.
> +
> +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> +
> +   This function matches the patterns shaped as:
> +
> +   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
> +   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
> +
> +   c[i] = c[i] - ax;
> +   c[i+1] = c[i+1] + bx;
> +
> +   If a match occurred then TRUE is returned, else FALSE.  The initial match is
> +   expected to be in OP1 and the initial match operands in args0.  */
> +
> +internal_fn
> +complex_fms_pattern::matches (complex_operation_t op,
> +			      slp_tree_to_load_perm_map_t *perm_cache,
> +			      slp_tree * ref_node, vec<slp_tree> *ops)
> +{
> +  internal_fn ifn = IFN_LAST;
> +
> +  /* Find the two components.  We match Complex MUL first which reduces the
> +     amount of work this pattern has to do.  After that we just match the
> +     head node and we're done.:
> +
> +     * FMS: - +.  */
> +  slp_tree child = NULL;
> +
> +  /* We need to ignore the two_operands nodes that may also match,
> +     for that we can check if they have any scalar statements and also
> +     check that it's not a permute node as we're looking for a normal
> +     PLUS_EXPR operation.  */
> +  if (op != PLUS_MINUS)
> +    return IFN_LAST;
> +
> +  child = SLP_TREE_CHILDREN ((*ops)[1])[1];
> +  if (vect_detect_pair_op (child) != MINUS_PLUS)
> +    return IFN_LAST;
> +
> +  /* First two nodes must be a multiply.  */
> +  auto_vec<slp_tree> muls;
> +  if (vect_match_call_complex_mla (child, 0) != MULT_MULT
> +      || vect_match_call_complex_mla (child, 1, &muls) != MULT_MULT)
> +    return IFN_LAST;
> +
> +  /* Now operand2+4 may lead to another expression.  */
> +  auto_vec<slp_tree> left_op, right_op;
> +  left_op.safe_splice (SLP_TREE_CHILDREN (muls[0]));
> +  right_op.safe_splice (SLP_TREE_CHILDREN (muls[1]));
> +
> +  bool is_neg = vect_normalize_conj_loc (left_op);
> +
> +  child = SLP_TREE_CHILDREN ((*ops)[1])[0];
> +  bool conj_first_operand;
> +  if (!vect_validate_multiplication (perm_cache, right_op, left_op, false,
> +				     &conj_first_operand, true))
> +    return IFN_LAST;
> +
> +  if (!is_neg)
> +    ifn = IFN_COMPLEX_FMS;
> +  else if (is_neg)
> +    ifn = IFN_COMPLEX_FMS_CONJ;
> +
> +  if (!vect_pattern_validate_optab (ifn, *ref_node))
> +    return IFN_LAST;
> +
> +  ops->truncate (0);
> +  ops->create (4);
> +
> +  complex_perm_kinds_t kind = linear_loads_p (perm_cache, right_op[0]).first;
> +  if (kind == PERM_EVENODD)
> +    {
> +      ops->quick_push (child);
> +      ops->quick_push (right_op[0]);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (left_op[0]);
> +    }
> +  else if (kind == PERM_TOP)
> +    {
> +      ops->quick_push (child);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (right_op[0]);
> +      ops->quick_push (left_op[0]);
> +    }
> +  else
> +    {
> +      ops->quick_push (child);
> +      ops->quick_push (right_op[1]);
> +      ops->quick_push (right_op[0]);
> +      ops->quick_push (left_op[1]);
> +    }
> +
> +  return ifn;
> +}
> +
> +/* Attempt to recognize a complex mul pattern.  */
> +
> +vect_pattern*
> +complex_fms_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
> +				slp_tree *node)
> +{
> +  auto_vec<slp_tree> ops;
> +  complex_operation_t op
> +    = vect_detect_pair_op (*node, true, &ops);
> +  internal_fn ifn
> +    = complex_fms_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn == IFN_LAST)
> +    return NULL;
> +
> +  return new complex_fms_pattern (node, &ops, ifn);
> +}
> +
> +/* Perform a replacement of the detected complex mul pattern with the new
> +   instruction sequences.  */
> +
> +void
> +complex_fms_pattern::build (vec_info *vinfo)
> +{
> +  auto_vec<slp_tree> nodes;
> +
> +  /* First re-arrange the children.  */
> +  nodes.create (3);
> +
> +  nodes.quick_push (this->m_ops[0]);
> +  nodes.quick_push (this->m_ops[1]);
> +  nodes.quick_push (
> +    vect_build_combine_node (this->m_ops[2], this->m_ops[3], *this->m_node));
> +  SLP_TREE_REF_COUNT (this->m_ops[0])++;
> +  SLP_TREE_REF_COUNT (this->m_ops[1])++;
> +
> +  slp_tree node;
> +  unsigned i;
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
> +    vect_free_slp_tree (node);
> +
> +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> +  SLP_TREE_CHILDREN (*this->m_node).safe_splice (nodes);

please elide the nodes vector.

Otherwise OK.
Richard.

> +
> +  complex_pattern::build (vinfo);
> +}
> +
>  /*******************************************************************************
>   * Pattern matching definitions
>   ******************************************************************************/
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 7/8 v9]middle-end slp: support complex FMS and complex FMS conjugate
  2021-01-08  9:49   ` Richard Biener
@ 2021-01-08 10:02     ` Tamar Christina
  0 siblings, 0 replies; 27+ messages in thread
From: Tamar Christina @ 2021-01-08 10:02 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd



> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, January 8, 2021 9:49 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH 7/8 v9]middle-end slp: support complex FMS and
> complex FMS conjugate
> 
> On Mon, 28 Dec 2020, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This adds support for FMS and FMS conjugated to the slp pattern matcher.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> 
> Interestingly this patch looks different from the FMA one(!?).  I would have
> expected to have the same pattern for FMA and FMS in the end.

No, because the mid-end canonization of the tree for FMA and FMS are different.
Because FMS has two TWO_OPERANDS nodes the order of the tree is swapped.

There's no real reason for it (as far as I can tell) but that results in a reverse tree.
However the operations are not sufficiently different that I can detect the MUL part.

I have a note for next year's rewrite to fix this during slp build so they can be shared.

> 
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* internal-fn.def (COMPLEX_FMS, COMPLEX_FMS_CONJ): New.
> > 	* optabs.def (cmls_optab, cmls_conj_optab): New.
> > 	* doc/md.texi: Document them.
> > 	* tree-vect-slp-patterns.c (class complex_fms_pattern,
> > 	complex_fms_pattern::matches, complex_fms_pattern::recognize,
> > 	complex_fms_pattern::build): New.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> >
> 6d5a98c4946d3ff4c2b8abea5c29caa6863fd3f7..3f5a42df285b3ee162edc9ec66
> 1f
> > 25c0eec5e4fa 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -6247,6 +6247,51 @@ The operation is only supported for vector
> modes @var{m}.
> >
> >  This pattern is not allowed to @code{FAIL}.
> >
> > +@cindex @code{cmls@var{m}4} instruction pattern @item
> > +@samp{cmls@var{m}4} Perform a vector multiply and subtract that is
> > +semantically the same as a multiply and subtract of complex numbers.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] -= a[i] * b[i];
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> > +@cindex @code{cmls_conj@var{m}4} instruction pattern @item
> > +@samp{cmls_conj@var{m}4} Perform a vector multiply by conjugate and
> > +subtract that is semantically the same as a multiply and subtract of
> > +complex numbers where the second multiply arguments is conjugated.
> > +
> > +@smallexample
> > +  complex TYPE c[N];
> > +  complex TYPE a[N];
> > +  complex TYPE b[N];
> > +  for (int i = 0; i < N; i += 1)
> > +    @{
> > +      c[i] -= a[i] * conj (b[i]);
> > +    @}
> > +@end smallexample
> > +
> > +In GCC lane ordering the real part of the number must be in the even
> > +lanes with the imaginary part in the odd lanes.
> > +
> > +The operation is only supported for vector modes @var{m}.
> > +
> > +This pattern is not allowed to @code{FAIL}.
> > +
> >  @cindex @code{cmul@var{m}4} instruction pattern  @item
> > @samp{cmul@var{m}4}  Perform a vector multiply that is semantically
> > the same as multiply of diff --git a/gcc/internal-fn.def
> > b/gcc/internal-fn.def index
> >
> 305450e026d4b94ab62ceb9ca719ec5570ff43eb..c8161509d9497afe58f32bde1
> 2d8
> > e6bd7b876a3c 100644
> > --- a/gcc/internal-fn.def
> > +++ b/gcc/internal-fn.def
> > @@ -290,6 +290,8 @@ DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp,
> > binary)  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
> > DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA, ECF_CONST, cmla, ternary)
> > DEF_INTERNAL_OPTAB_FN (COMPLEX_FMA_CONJ, ECF_CONST,
> cmla_conj,
> > ternary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS, ECF_CONST, cmls, ternary)
> > +DEF_INTERNAL_OPTAB_FN (COMPLEX_FMS_CONJ, ECF_CONST,
> cmls_conj,
> > +ternary)
> >
> >  /* Unary integer ops.  */
> >  DEF_INTERNAL_INT_FN (CLRSB, ECF_CONST | ECF_NOTHROW, clrsb,
> unary)
> > diff --git a/gcc/optabs.def b/gcc/optabs.def index
> >
> 8e2758d685ed85e02df10dac571eb40d45a294ed..320bb5f3dce31867d312bbb
> b6a4c
> > 6e31c534254e 100644
> > --- a/gcc/optabs.def
> > +++ b/gcc/optabs.def
> > @@ -296,6 +296,8 @@ OPTAB_D (cmul_optab, "cmul$a3")  OPTAB_D
> > (cmul_conj_optab, "cmul_conj$a3")  OPTAB_D (cmla_optab, "cmla$a4")
> > OPTAB_D (cmla_conj_optab, "cmla_conj$a4")
> > +OPTAB_D (cmls_optab, "cmls$a4")
> > +OPTAB_D (cmls_conj_optab, "cmls_conj$a4")
> >  OPTAB_D (cos_optab, "cos$a2")
> >  OPTAB_D (cosh_optab, "cosh$a2")
> >  OPTAB_D (exp10_optab, "exp10$a2")
> > diff --git a/gcc/tree-vect-slp-patterns.c
> > b/gcc/tree-vect-slp-patterns.c index
> >
> 3625a80c08e3d70fd362fc52e17e65b3b2c7da83..ab6587f0b8522ec5f916f74e7e
> 74
> > 01b1f7a35bbb 100644
> > --- a/gcc/tree-vect-slp-patterns.c
> > +++ b/gcc/tree-vect-slp-patterns.c
> > @@ -1254,6 +1254,181 @@ complex_fma_pattern::build (vec_info *vinfo)
> >    complex_pattern::build (vinfo);
> >  }
> >
> >
> +/*********************************************************
> ***********
> > +***********
> > + * complex_fms_pattern class
> > +
> >
> +*********************************************************
> ************
> > +*********/
> > +
> > +class complex_fms_pattern : public complex_pattern {
> > +  protected:
> > +    complex_fms_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> internal_fn ifn)
> > +      : complex_pattern (node, m_ops, ifn)
> > +    {
> > +      this->m_num_args = 3;
> > +    }
> > +
> > +  public:
> > +    void build (vec_info *);
> > +    static internal_fn
> > +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *,
> slp_tree *,
> > +	     vec<slp_tree> *);
> > +
> > +    static vect_pattern*
> > +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> > +
> > +    static vect_pattern*
> > +    mkInstance (slp_tree *node, vec<slp_tree> *m_ops, internal_fn ifn)
> > +    {
> > +      return new complex_fms_pattern (node, m_ops, ifn);
> > +    }
> > +};
> > +
> > +
> > +/* Pattern matcher for trying to match complex multiply and accumulate
> > +   and multiply and subtract patterns in SLP tree.
> > +   If the operation matches then IFN is set to the operation it matched and
> > +   the arguments to the two replacement statements are put in m_ops.
> > +
> > +   If no match is found then IFN is set to IFN_LAST and m_ops is unchanged.
> > +
> > +   This function matches the patterns shaped as:
> > +
> > +   double ax = (b[i+1] * a[i]) + (b[i] * a[i]);
> > +   double bx = (a[i+1] * b[i]) - (a[i+1] * b[i+1]);
> > +
> > +   c[i] = c[i] - ax;
> > +   c[i+1] = c[i+1] + bx;
> > +
> > +   If a match occurred then TRUE is returned, else FALSE.  The initial match
> is
> > +   expected to be in OP1 and the initial match operands in args0.  */
> > +
> > +internal_fn
> > +complex_fms_pattern::matches (complex_operation_t op,
> > +			      slp_tree_to_load_perm_map_t *perm_cache,
> > +			      slp_tree * ref_node, vec<slp_tree> *ops) {
> > +  internal_fn ifn = IFN_LAST;
> > +
> > +  /* Find the two components.  We match Complex MUL first which
> reduces the
> > +     amount of work this pattern has to do.  After that we just match the
> > +     head node and we're done.:
> > +
> > +     * FMS: - +.  */
> > +  slp_tree child = NULL;
> > +
> > +  /* We need to ignore the two_operands nodes that may also match,
> > +     for that we can check if they have any scalar statements and also
> > +     check that it's not a permute node as we're looking for a normal
> > +     PLUS_EXPR operation.  */
> > +  if (op != PLUS_MINUS)
> > +    return IFN_LAST;
> > +
> > +  child = SLP_TREE_CHILDREN ((*ops)[1])[1];  if (vect_detect_pair_op
> > + (child) != MINUS_PLUS)
> > +    return IFN_LAST;
> > +
> > +  /* First two nodes must be a multiply.  */  auto_vec<slp_tree>
> > + muls;  if (vect_match_call_complex_mla (child, 0) != MULT_MULT
> > +      || vect_match_call_complex_mla (child, 1, &muls) != MULT_MULT)
> > +    return IFN_LAST;
> > +
> > +  /* Now operand2+4 may lead to another expression.  */
> > + auto_vec<slp_tree> left_op, right_op;  left_op.safe_splice
> > + (SLP_TREE_CHILDREN (muls[0]));  right_op.safe_splice
> > + (SLP_TREE_CHILDREN (muls[1]));
> > +
> > +  bool is_neg = vect_normalize_conj_loc (left_op);
> > +
> > +  child = SLP_TREE_CHILDREN ((*ops)[1])[0];
> > +  bool conj_first_operand;
> > +  if (!vect_validate_multiplication (perm_cache, right_op, left_op, false,
> > +				     &conj_first_operand, true))
> > +    return IFN_LAST;
> > +
> > +  if (!is_neg)
> > +    ifn = IFN_COMPLEX_FMS;
> > +  else if (is_neg)
> > +    ifn = IFN_COMPLEX_FMS_CONJ;
> > +
> > +  if (!vect_pattern_validate_optab (ifn, *ref_node))
> > +    return IFN_LAST;
> > +
> > +  ops->truncate (0);
> > +  ops->create (4);
> > +
> > +  complex_perm_kinds_t kind = linear_loads_p (perm_cache,
> > + right_op[0]).first;  if (kind == PERM_EVENODD)
> > +    {
> > +      ops->quick_push (child);
> > +      ops->quick_push (right_op[0]);
> > +      ops->quick_push (right_op[1]);
> > +      ops->quick_push (left_op[0]);
> > +    }
> > +  else if (kind == PERM_TOP)
> > +    {
> > +      ops->quick_push (child);
> > +      ops->quick_push (right_op[1]);
> > +      ops->quick_push (right_op[0]);
> > +      ops->quick_push (left_op[0]);
> > +    }
> > +  else
> > +    {
> > +      ops->quick_push (child);
> > +      ops->quick_push (right_op[1]);
> > +      ops->quick_push (right_op[0]);
> > +      ops->quick_push (left_op[1]);
> > +    }
> > +
> > +  return ifn;
> > +}
> > +
> > +/* Attempt to recognize a complex mul pattern.  */
> > +
> > +vect_pattern*
> > +complex_fms_pattern::recognize (slp_tree_to_load_perm_map_t
> *perm_cache,
> > +				slp_tree *node)
> > +{
> > +  auto_vec<slp_tree> ops;
> > +  complex_operation_t op
> > +    = vect_detect_pair_op (*node, true, &ops);
> > +  internal_fn ifn
> > +    = complex_fms_pattern::matches (op, perm_cache, node, &ops);
> > +  if (ifn == IFN_LAST)
> > +    return NULL;
> > +
> > +  return new complex_fms_pattern (node, &ops, ifn); }
> > +
> > +/* Perform a replacement of the detected complex mul pattern with the
> new
> > +   instruction sequences.  */
> > +
> > +void
> > +complex_fms_pattern::build (vec_info *vinfo) {
> > +  auto_vec<slp_tree> nodes;
> > +
> > +  /* First re-arrange the children.  */  nodes.create (3);
> > +
> > +  nodes.quick_push (this->m_ops[0]);
> > +  nodes.quick_push (this->m_ops[1]);
> > +  nodes.quick_push (
> > +    vect_build_combine_node (this->m_ops[2], this->m_ops[3],
> > + *this->m_node));  SLP_TREE_REF_COUNT (this->m_ops[0])++;
> > + SLP_TREE_REF_COUNT (this->m_ops[1])++;
> > +
> > +  slp_tree node;
> > +  unsigned i;
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (*this->m_node), i, node)
> > +    vect_free_slp_tree (node);
> > +
> > +  SLP_TREE_CHILDREN (*this->m_node).truncate (0);
> SLP_TREE_CHILDREN
> > + (*this->m_node).safe_splice (nodes);
> 
> please elide the nodes vector.
> 
> Otherwise OK.
> Richard.
> 
> > +
> > +  complex_pattern::build (vinfo);
> > +}
> > +
> >
> /**********************************************************
> *********************
> >   * Pattern matching definitions
> >
> >
> **********************************************************
> ************
> > ********/
> >
> >
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409
> Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 8/8 v9]middle-end slp: Add complex operations class to share first match among all matchers
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
                   ` (5 preceding siblings ...)
  2020-12-28 13:38 ` [PATCH 7/8 v9]middle-end slp: support complex FMS and complex FMS conjugate Tamar Christina
@ 2020-12-28 13:38 ` Tamar Christina
  2021-01-08  9:50   ` Richard Biener
  2021-01-07 13:20 ` [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Richard Biener
  7 siblings, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2020-12-28 13:38 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, rguenther, ook

[-- Attachment #1: Type: text/plain, Size: 3694 bytes --]

Hi All,

This introduces a common class complex_operations_pattern which encapsulates
the complex add, mul, fma and fms pattern in such a way so that the first match
is shared.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp-patterns.c (class complex_operations_pattern,
	complex_operations_pattern::matches,
	complex_operations_pattern::recognize,
	complex_operations_pattern::build): New.
	(slp_patterns): Use it.

--- inline copy of patch -- 
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index ab6587f0b8522ec5f916f74e7e7401b1f7a35bbb..33d22e657ebf1d0454a134bab4febb2b65581822 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -1429,6 +1429,83 @@ complex_fms_pattern::build (vec_info *vinfo)
   complex_pattern::build (vinfo);
 }
 
+/*******************************************************************************
+ * complex_operations_pattern class
+ ******************************************************************************/
+
+/* This function combines all the existing pattern matchers above into one class
+   that shares the functionality between them.  The initial match is shared
+   between all complex operations.  */
+
+class complex_operations_pattern : public complex_pattern
+{
+  protected:
+    complex_operations_pattern (slp_tree *node, vec<slp_tree> *m_ops,
+				internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 0;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+};
+
+/* Dummy matches implementation for proxy object.  */
+
+internal_fn
+complex_operations_pattern::
+matches (complex_operation_t /* op */,
+	 slp_tree_to_load_perm_map_t * /* perm_cache */,
+	 slp_tree * /* ref_node */, vec<slp_tree> * /* ops */)
+{
+  return IFN_LAST;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_operations_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				       slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn = IFN_LAST;
+
+  ifn  = complex_fms_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_fms_pattern::mkInstance (node, &ops, ifn);
+
+  ifn  = complex_mul_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_mul_pattern::mkInstance (node, &ops, ifn);
+
+  ifn  = complex_fma_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_fma_pattern::mkInstance (node, &ops, ifn);
+
+  ifn  = complex_add_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_add_pattern::mkInstance (node, &ops, ifn);
+
+  return NULL;
+}
+
+/* Dummy implementation of build.  */
+
+void
+complex_operations_pattern::build (vec_info * /* vinfo */)
+{
+  gcc_unreachable ();
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/
@@ -1440,7 +1517,7 @@ vect_pattern_decl_t slp_patterns[]
      order patterns from the largest to the smallest.  Especially if they
      overlap in what they can detect.  */
 
-  SLP_PATTERN (complex_add_pattern),
+  SLP_PATTERN (complex_operations_pattern),
 };
 #undef SLP_PATTERN
 


-- 

[-- Attachment #2: rb13963.patch --]
[-- Type: text/x-diff, Size: 3140 bytes --]

diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index ab6587f0b8522ec5f916f74e7e7401b1f7a35bbb..33d22e657ebf1d0454a134bab4febb2b65581822 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -1429,6 +1429,83 @@ complex_fms_pattern::build (vec_info *vinfo)
   complex_pattern::build (vinfo);
 }
 
+/*******************************************************************************
+ * complex_operations_pattern class
+ ******************************************************************************/
+
+/* This function combines all the existing pattern matchers above into one class
+   that shares the functionality between them.  The initial match is shared
+   between all complex operations.  */
+
+class complex_operations_pattern : public complex_pattern
+{
+  protected:
+    complex_operations_pattern (slp_tree *node, vec<slp_tree> *m_ops,
+				internal_fn ifn)
+      : complex_pattern (node, m_ops, ifn)
+    {
+      this->m_num_args = 0;
+    }
+
+  public:
+    void build (vec_info *);
+    static internal_fn
+    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
+	     vec<slp_tree> *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+};
+
+/* Dummy matches implementation for proxy object.  */
+
+internal_fn
+complex_operations_pattern::
+matches (complex_operation_t /* op */,
+	 slp_tree_to_load_perm_map_t * /* perm_cache */,
+	 slp_tree * /* ref_node */, vec<slp_tree> * /* ops */)
+{
+  return IFN_LAST;
+}
+
+/* Attempt to recognize a complex mul pattern.  */
+
+vect_pattern*
+complex_operations_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
+				       slp_tree *node)
+{
+  auto_vec<slp_tree> ops;
+  complex_operation_t op
+    = vect_detect_pair_op (*node, true, &ops);
+  internal_fn ifn = IFN_LAST;
+
+  ifn  = complex_fms_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_fms_pattern::mkInstance (node, &ops, ifn);
+
+  ifn  = complex_mul_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_mul_pattern::mkInstance (node, &ops, ifn);
+
+  ifn  = complex_fma_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_fma_pattern::mkInstance (node, &ops, ifn);
+
+  ifn  = complex_add_pattern::matches (op, perm_cache, node, &ops);
+  if (ifn != IFN_LAST)
+    return complex_add_pattern::mkInstance (node, &ops, ifn);
+
+  return NULL;
+}
+
+/* Dummy implementation of build.  */
+
+void
+complex_operations_pattern::build (vec_info * /* vinfo */)
+{
+  gcc_unreachable ();
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/
@@ -1440,7 +1517,7 @@ vect_pattern_decl_t slp_patterns[]
      order patterns from the largest to the smallest.  Especially if they
      overlap in what they can detect.  */
 
-  SLP_PATTERN (complex_add_pattern),
+  SLP_PATTERN (complex_operations_pattern),
 };
 #undef SLP_PATTERN
 


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 8/8 v9]middle-end slp: Add complex operations class to share first match among all matchers
  2020-12-28 13:38 ` [PATCH 8/8 v9]middle-end slp: Add complex operations class to share first match among all matchers Tamar Christina
@ 2021-01-08  9:50   ` Richard Biener
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-08  9:50 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Mon, 28 Dec 2020, Tamar Christina wrote:

> Hi All,
> 
> This introduces a common class complex_operations_pattern which encapsulates
> the complex add, mul, fma and fms pattern in such a way so that the first match
> is shared.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?

OK.

Thanks,
Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-slp-patterns.c (class complex_operations_pattern,
> 	complex_operations_pattern::matches,
> 	complex_operations_pattern::recognize,
> 	complex_operations_pattern::build): New.
> 	(slp_patterns): Use it.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
> index ab6587f0b8522ec5f916f74e7e7401b1f7a35bbb..33d22e657ebf1d0454a134bab4febb2b65581822 100644
> --- a/gcc/tree-vect-slp-patterns.c
> +++ b/gcc/tree-vect-slp-patterns.c
> @@ -1429,6 +1429,83 @@ complex_fms_pattern::build (vec_info *vinfo)
>    complex_pattern::build (vinfo);
>  }
>  
> +/*******************************************************************************
> + * complex_operations_pattern class
> + ******************************************************************************/
> +
> +/* This function combines all the existing pattern matchers above into one class
> +   that shares the functionality between them.  The initial match is shared
> +   between all complex operations.  */
> +
> +class complex_operations_pattern : public complex_pattern
> +{
> +  protected:
> +    complex_operations_pattern (slp_tree *node, vec<slp_tree> *m_ops,
> +				internal_fn ifn)
> +      : complex_pattern (node, m_ops, ifn)
> +    {
> +      this->m_num_args = 0;
> +    }
> +
> +  public:
> +    void build (vec_info *);
> +    static internal_fn
> +    matches (complex_operation_t op, slp_tree_to_load_perm_map_t *, slp_tree *,
> +	     vec<slp_tree> *);
> +
> +    static vect_pattern*
> +    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
> +};
> +
> +/* Dummy matches implementation for proxy object.  */
> +
> +internal_fn
> +complex_operations_pattern::
> +matches (complex_operation_t /* op */,
> +	 slp_tree_to_load_perm_map_t * /* perm_cache */,
> +	 slp_tree * /* ref_node */, vec<slp_tree> * /* ops */)
> +{
> +  return IFN_LAST;
> +}
> +
> +/* Attempt to recognize a complex mul pattern.  */
> +
> +vect_pattern*
> +complex_operations_pattern::recognize (slp_tree_to_load_perm_map_t *perm_cache,
> +				       slp_tree *node)
> +{
> +  auto_vec<slp_tree> ops;
> +  complex_operation_t op
> +    = vect_detect_pair_op (*node, true, &ops);
> +  internal_fn ifn = IFN_LAST;
> +
> +  ifn  = complex_fms_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn != IFN_LAST)
> +    return complex_fms_pattern::mkInstance (node, &ops, ifn);
> +
> +  ifn  = complex_mul_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn != IFN_LAST)
> +    return complex_mul_pattern::mkInstance (node, &ops, ifn);
> +
> +  ifn  = complex_fma_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn != IFN_LAST)
> +    return complex_fma_pattern::mkInstance (node, &ops, ifn);
> +
> +  ifn  = complex_add_pattern::matches (op, perm_cache, node, &ops);
> +  if (ifn != IFN_LAST)
> +    return complex_add_pattern::mkInstance (node, &ops, ifn);
> +
> +  return NULL;
> +}
> +
> +/* Dummy implementation of build.  */
> +
> +void
> +complex_operations_pattern::build (vec_info * /* vinfo */)
> +{
> +  gcc_unreachable ();
> +}
> +
>  /*******************************************************************************
>   * Pattern matching definitions
>   ******************************************************************************/
> @@ -1440,7 +1517,7 @@ vect_pattern_decl_t slp_patterns[]
>       order patterns from the largest to the smallest.  Especially if they
>       overlap in what they can detect.  */
>  
> -  SLP_PATTERN (complex_add_pattern),
> +  SLP_PATTERN (complex_operations_pattern),
>  };
>  #undef SLP_PATTERN
>  
> 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution
  2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
                   ` (6 preceding siblings ...)
  2020-12-28 13:38 ` [PATCH 8/8 v9]middle-end slp: Add complex operations class to share first match among all matchers Tamar Christina
@ 2021-01-07 13:20 ` Richard Biener
  2021-01-07 13:25   ` Tamar Christina
  2021-01-11 11:01   ` Tamar Christina
  7 siblings, 2 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-07 13:20 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

> From tamar.christina@arm.com Mon Dec 28 14:36:32 2020
> Date: Mon, 28 Dec 2020 13:35:56 +0000
> From: Tamar Christina <tamar.christina@arm.com>
> To: gcc-patches@gcc.gnu.org
> Cc: nd@arm.com, rguenther@suse.de, ook@ucw.cz
> Subject: [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution
> 
> Hi All,
> 
> This introduces a post processing step for the pattern matcher to flatten
> permutes introduced by the complex multiplications patterns. 
> 
> This performs a blend early such that SLP is not cancelled by the LOAD_LANES
> permute.  This is a temporary workaround to the fact that loads are not CSEd
> during building and is required to produce efficient code.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
>
> 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> 	(optimize_load_redistribution): New.
> 	(vect_match_slp_patterns): Use it.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325cf8406466ac25c 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
>    return exact_div (common_multiple (nunits, group_size), group_size);
>  }
> 
> +/* Helper function of optimize_load_redistribution that performs the operation
> +   recursively.  */
> +
> +static slp_tree
> +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
> +				hash_set<slp_tree> *visited, slp_tree root)
> +{
> +  if (visited->add (root))
> +    return NULL;
> +
> +  slp_tree node;
> +  unsigned i;
> +
> +  /* For now, we don't know anything about externals so do not do anything.  */
> +  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
> +      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)

use a single != vect_internal_def test please

> +    return NULL;
> +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
> +      && SLP_TREE_LANE_PERMUTATION (root).exists ()
> +      && !SLP_TREE_SCALAR_STMTS (root).exists ())

I think both last tests are unnecessary

> +    {
> +      /* First convert this node into a load node and add it to the leaves
> +         list and flatten the permute from a lane to a load one.  If it's
> +         unneeded it will be elided later.  */
> +      auto_vec<stmt_vec_info> stmts;
> +      stmts.create (SLP_TREE_LANES (root));
> +      load_permutation_t load_perm;
> +      load_perm.create (SLP_TREE_LANES (root));
> +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);

load_perm leaks when any of the below outs is taken

> +      for (unsigned j = 0; j < lane_perm.length (); j++)
> +        {
> +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> +	  /* This isn't strictly needed, but this function is a temporary
> +	     one for specifically pattern matching, so don't want it to
> +	     optimize things the remainder of the pipeline will.  */
> +	  if (perm.first != j)
> +	    goto next;

but please elide it nevertheless

> +          node = SLP_TREE_CHILDREN (root)[perm.first];
> +
> +	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> +	    return NULL;

so you want to check whether this is a load, I think more to the point
would be a vect_internal_def + zero SLP children check.  And a comment
on what we test (we do lack classification of SLP nodes, so a helper
like vect_is_slp_load_node or so would be OK as well)

> +
> +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
> +          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);

As you're doing here lacks a check that we are actually loading from
the same DR group.  I think it might be easier to just collect scalar
stmts and throw them at vect_build_slp_tree?  That should perform
the necessary verification, build the appropriate lane permute and
perform the CSE.  Which leads to the question why the VEC_PERM node
doesn't have scalar stmts set while we are actually be able to compute
them here ... that is, the CSE opportunity could have been noticed
during pattern matching itself?

> +        }
> +
> +      if (dump_enabled_p ())
> +	dump_printf_loc (MSG_NOTE, vect_location,
> +			 "converting stmts on permute node %p\n", root);
> +
> +      slp_tree *value = bst_map->get (stmts);
> +      if (value)
> +	node = *value;
> +      else
> +	{
> +	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> +	    SLP_TREE_REF_COUNT (node)++;
> +
> +	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
> +	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
> +	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
> +	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
> +	  bst_map->put (stmts_cpy, node);
> +	}
> +      SLP_TREE_REF_COUNT (node)++;

Adjusting the refcount here but doing the replacement in the caller
is a bit awkward to follow - how about passing a reference so you
can adjust the edge here?

> +
> +      return node;
> +    }
> +
> +next:
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> +    {
> +      slp_tree value = optimize_load_redistribution_1 (bst_map, visited, node);
> +      if (value)
> +	{
> +          SLP_TREE_CHILDREN (root)[i] = value;
> +          vect_free_slp_tree (node);
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Temporary workaround for loads not being CSEd during SLP build.  This
> +   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
> +   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
> +   same DR such that the final operation is equal to a permuted load.  Such
> +   NODES are then directly converted into LOADS themselves.  The nodes are
> +   CSEd using BST_MAP.  */
> +
> +static void
> +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
> +			      slp_tree root)
> +{
> +  slp_tree node;
> +  unsigned i;
> +  hash_set<slp_tree> visited;
> +
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> +    {
> +      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited, node);
> +      if (value)
> +	{
> +          SLP_TREE_CHILDREN (root)[i] = value;
> +          vect_free_slp_tree (node);
> +	}
> +    }
> +}
> +
>  /* Helper function of vect_match_slp_patterns.
>
>     Attempts to match patterns against the slp tree rooted in REF_NODE using
> @@ -2276,7 +2385,7 @@ static bool
>  vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
>  			 hash_set<slp_tree> *visited,
>  			 slp_tree_to_load_perm_map_t *perm_cache,
> -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> +			 scalar_stmts_to_slp_tree_map_t *bst_map)
>  {
>    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
>    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
> @@ -2291,6 +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
>
>    if (found_p)
>      {
> +      optimize_load_redistribution (bst_map, *ref_node);
> +
>        if (dump_enabled_p ())
>  	{
>  	  dump_printf_loc (MSG_NOTE, vect_location,
> 
> 
> -- 
> 
>
>     [ Part 2, Text/X-DIFF 140 lines. ]
>     [ Unable to print this part. ]


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution
  2021-01-07 13:20 ` [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Richard Biener
@ 2021-01-07 13:25   ` Tamar Christina
  2021-01-07 13:36     ` Richard Biener
  2021-01-11 11:01   ` Tamar Christina
  1 sibling, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2021-01-07 13:25 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd, ook

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Thursday, January 7, 2021 1:21 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: Re: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> distribution
> 
> > From tamar.christina@arm.com Mon Dec 28 14:36:32 2020
> > Date: Mon, 28 Dec 2020 13:35:56 +0000
> > From: Tamar Christina <tamar.christina@arm.com>
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd@arm.com, rguenther@suse.de, ook@ucw.cz
> > Subject: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> > distribution
> >
> > Hi All,
> >
> > This introduces a post processing step for the pattern matcher to
> > flatten permutes introduced by the complex multiplications patterns.
> >
> > This performs a blend early such that SLP is not cancelled by the
> > LOAD_LANES permute.  This is a temporary workaround to the fact that
> > loads are not CSEd during building and is required to produce efficient code.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> > 	(optimize_load_redistribution): New.
> > 	(vect_match_slp_patterns): Use it.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index
> >
> 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325c
> f
> > 8406466ac25c 100644
> > --- a/gcc/tree-vect-slp.c
> > +++ b/gcc/tree-vect-slp.c
> > @@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits,
> unsigned int group_size)
> >    return exact_div (common_multiple (nunits, group_size),
> > group_size);  }
> >
> > +/* Helper function of optimize_load_redistribution that performs the
> operation
> > +   recursively.  */
> > +
> > +static slp_tree
> > +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
> *bst_map,
> > +				hash_set<slp_tree> *visited, slp_tree root) {
> > +  if (visited->add (root))
> > +    return NULL;
> > +
> > +  slp_tree node;
> > +  unsigned i;
> > +
> > +  /* For now, we don't know anything about externals so do not do
> > + anything.  */  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
> > +      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
> 
> use a single != vect_internal_def test please
> 
> > +    return NULL;
> > +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
> > +      && SLP_TREE_LANE_PERMUTATION (root).exists ()
> > +      && !SLP_TREE_SCALAR_STMTS (root).exists ())
> 
> I think both last tests are unnecessary

It's there to prevent it from trying to optimize  two_operands nodes
which are a vec_perm but contain no scalar statements. I didn't find a different
way to distinguish between the two. The SLP tree can contain a number of these
that haven't been pattern matched away.

> 
> > +    {
> > +      /* First convert this node into a load node and add it to the leaves
> > +         list and flatten the permute from a lane to a load one.  If it's
> > +         unneeded it will be elided later.  */
> > +      auto_vec<stmt_vec_info> stmts;
> > +      stmts.create (SLP_TREE_LANES (root));
> > +      load_permutation_t load_perm;
> > +      load_perm.create (SLP_TREE_LANES (root));
> > +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION
> > + (root);
> 
> load_perm leaks when any of the below outs is taken
> 
> > +      for (unsigned j = 0; j < lane_perm.length (); j++)
> > +        {
> > +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> > +	  /* This isn't strictly needed, but this function is a temporary
> > +	     one for specifically pattern matching, so don't want it to
> > +	     optimize things the remainder of the pipeline will.  */
> > +	  if (perm.first != j)
> > +	    goto next;
> 
> but please elide it nevertheless
> 
> > +          node = SLP_TREE_CHILDREN (root)[perm.first];
> > +
> > +	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > +	    return NULL;
> 
> so you want to check whether this is a load, I think more to the point would
> be a vect_internal_def + zero SLP children check.  And a comment on what
> we test (we do lack classification of SLP nodes, so a helper like
> vect_is_slp_load_node or so would be OK as well)
> 
> > +
> > +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS
> (node)[perm.second]);
> > +          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION
> > +(node)[perm.second]);
> 
> As you're doing here lacks a check that we are actually loading from the same
> DR group.  I think it might be easier to just collect scalar stmts and throw
> them at vect_build_slp_tree?  That should perform the necessary
> verification, build the appropriate lane permute and perform the CSE.  Which
> leads to the question why the VEC_PERM node doesn't have scalar stmts set
> while we are actually be able to compute them here ... that is, the CSE
> opportunity could have been noticed during pattern matching itself?
> 
> > +        }
> > +
> > +      if (dump_enabled_p ())
> > +	dump_printf_loc (MSG_NOTE, vect_location,
> > +			 "converting stmts on permute node %p\n", root);
> > +
> > +      slp_tree *value = bst_map->get (stmts);
> > +      if (value)
> > +	node = *value;
> > +      else
> > +	{
> > +	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > +	    SLP_TREE_REF_COUNT (node)++;
> > +
> > +	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
> > +	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
> > +	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
> > +	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
> > +	  bst_map->put (stmts_cpy, node);
> > +	}
> > +      SLP_TREE_REF_COUNT (node)++;
> 
> Adjusting the refcount here but doing the replacement in the caller is a bit
> awkward to follow - how about passing a reference so you can adjust the
> edge here?
> 
> > +
> > +      return node;
> > +    }
> > +
> > +next:
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > +    {
> > +      slp_tree value = optimize_load_redistribution_1 (bst_map, visited,
> node);
> > +      if (value)
> > +	{
> > +          SLP_TREE_CHILDREN (root)[i] = value;
> > +          vect_free_slp_tree (node);
> > +	}
> > +    }
> > +
> > +  return NULL;
> > +}
> > +
> > +/* Temporary workaround for loads not being CSEd during SLP build.  This
> > +   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
> > +   VEC_PERM nodes that blend vectors from multiple nodes that all read
> from the
> > +   same DR such that the final operation is equal to a permuted load.  Such
> > +   NODES are then directly converted into LOADS themselves.  The nodes
> are
> > +   CSEd using BST_MAP.  */
> > +
> > +static void
> > +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
> *bst_map,
> > +			      slp_tree root)
> > +{
> > +  slp_tree node;
> > +  unsigned i;
> > +  hash_set<slp_tree> visited;
> > +
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > +    {
> > +      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited,
> node);
> > +      if (value)
> > +	{
> > +          SLP_TREE_CHILDREN (root)[i] = value;
> > +          vect_free_slp_tree (node);
> > +	}
> > +    }
> > +}
> > +
> >  /* Helper function of vect_match_slp_patterns.
> >
> >     Attempts to match patterns against the slp tree rooted in REF_NODE
> > using @@ -2276,7 +2385,7 @@ static bool  vect_match_slp_patterns
> > (slp_instance instance, vec_info *vinfo,
> >  			 hash_set<slp_tree> *visited,
> >  			 slp_tree_to_load_perm_map_t *perm_cache,
> > -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> > +			 scalar_stmts_to_slp_tree_map_t *bst_map)
> >  {
> >    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
> >    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); @@ -2291,6
> > +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info
> > *vinfo,
> >
> >    if (found_p)
> >      {
> > +      optimize_load_redistribution (bst_map, *ref_node);
> > +
> >        if (dump_enabled_p ())
> >  	{
> >  	  dump_printf_loc (MSG_NOTE, vect_location,
> >
> >
> > --
> >
> >
> >     [ Part 2, Text/X-DIFF 140 lines. ]
> >     [ Unable to print this part. ]


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution
  2021-01-07 13:25   ` Tamar Christina
@ 2021-01-07 13:36     ` Richard Biener
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-07 13:36 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd, ook

On Thu, 7 Jan 2021, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, January 7, 2021 1:21 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > Subject: Re: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> > distribution
> > 
> > > From tamar.christina@arm.com Mon Dec 28 14:36:32 2020
> > > Date: Mon, 28 Dec 2020 13:35:56 +0000
> > > From: Tamar Christina <tamar.christina@arm.com>
> > > To: gcc-patches@gcc.gnu.org
> > > Cc: nd@arm.com, rguenther@suse.de, ook@ucw.cz
> > > Subject: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> > > distribution
> > >
> > > Hi All,
> > >
> > > This introduces a post processing step for the pattern matcher to
> > > flatten permutes introduced by the complex multiplications patterns.
> > >
> > > This performs a blend early such that SLP is not cancelled by the
> > > LOAD_LANES permute.  This is a temporary workaround to the fact that
> > > loads are not CSEd during building and is required to produce efficient code.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> > > 	(optimize_load_redistribution): New.
> > > 	(vect_match_slp_patterns): Use it.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index
> > >
> > 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325c
> > f
> > > 8406466ac25c 100644
> > > --- a/gcc/tree-vect-slp.c
> > > +++ b/gcc/tree-vect-slp.c
> > > @@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits,
> > unsigned int group_size)
> > >    return exact_div (common_multiple (nunits, group_size),
> > > group_size);  }
> > >
> > > +/* Helper function of optimize_load_redistribution that performs the
> > operation
> > > +   recursively.  */
> > > +
> > > +static slp_tree
> > > +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
> > *bst_map,
> > > +				hash_set<slp_tree> *visited, slp_tree root) {
> > > +  if (visited->add (root))
> > > +    return NULL;
> > > +
> > > +  slp_tree node;
> > > +  unsigned i;
> > > +
> > > +  /* For now, we don't know anything about externals so do not do
> > > + anything.  */  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
> > > +      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
> > 
> > use a single != vect_internal_def test please
> > 
> > > +    return NULL;
> > > +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
> > > +      && SLP_TREE_LANE_PERMUTATION (root).exists ()
> > > +      && !SLP_TREE_SCALAR_STMTS (root).exists ())
> > 
> > I think both last tests are unnecessary
> 
> It's there to prevent it from trying to optimize  two_operands nodes
> which are a vec_perm but contain no scalar statements. I didn't find a different
> way to distinguish between the two. The SLP tree can contain a number of these
> that haven't been pattern matched away.

Well, that's because of the weak check for what you want to pattern match
below.  Certainly !SLP_TREE_SCALAR_STMTS (root).exists () isn't a reliable
way to catch these.

> > 
> > > +    {
> > > +      /* First convert this node into a load node and add it to the leaves
> > > +         list and flatten the permute from a lane to a load one.  If it's
> > > +         unneeded it will be elided later.  */
> > > +      auto_vec<stmt_vec_info> stmts;
> > > +      stmts.create (SLP_TREE_LANES (root));
> > > +      load_permutation_t load_perm;
> > > +      load_perm.create (SLP_TREE_LANES (root));
> > > +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION
> > > + (root);
> > 
> > load_perm leaks when any of the below outs is taken
> > 
> > > +      for (unsigned j = 0; j < lane_perm.length (); j++)
> > > +        {
> > > +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> > > +	  /* This isn't strictly needed, but this function is a temporary
> > > +	     one for specifically pattern matching, so don't want it to
> > > +	     optimize things the remainder of the pipeline will.  */
> > > +	  if (perm.first != j)
> > > +	    goto next;
> > 
> > but please elide it nevertheless
> > 
> > > +          node = SLP_TREE_CHILDREN (root)[perm.first];
> > > +
> > > +	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > > +	    return NULL;
> > 
> > so you want to check whether this is a load, I think more to the point would
> > be a vect_internal_def + zero SLP children check.  And a comment on what
> > we test (we do lack classification of SLP nodes, so a helper like
> > vect_is_slp_load_node or so would be OK as well)
> > 
> > > +
> > > +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS
> > (node)[perm.second]);
> > > +          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION
> > > +(node)[perm.second]);
> > 
> > As you're doing here lacks a check that we are actually loading from the same
> > DR group.  I think it might be easier to just collect scalar stmts and throw
> > them at vect_build_slp_tree?  That should perform the necessary
> > verification, build the appropriate lane permute and perform the CSE.  Which
> > leads to the question why the VEC_PERM node doesn't have scalar stmts set
> > while we are actually be able to compute them here ... that is, the CSE
> > opportunity could have been noticed during pattern matching itself?
> > 
> > > +        }
> > > +
> > > +      if (dump_enabled_p ())
> > > +	dump_printf_loc (MSG_NOTE, vect_location,
> > > +			 "converting stmts on permute node %p\n", root);
> > > +
> > > +      slp_tree *value = bst_map->get (stmts);
> > > +      if (value)
> > > +	node = *value;
> > > +      else
> > > +	{
> > > +	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > > +	    SLP_TREE_REF_COUNT (node)++;
> > > +
> > > +	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
> > > +	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
> > > +	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
> > > +	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
> > > +	  bst_map->put (stmts_cpy, node);
> > > +	}
> > > +      SLP_TREE_REF_COUNT (node)++;
> > 
> > Adjusting the refcount here but doing the replacement in the caller is a bit
> > awkward to follow - how about passing a reference so you can adjust the
> > edge here?
> > 
> > > +
> > > +      return node;
> > > +    }
> > > +
> > > +next:
> > > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > > +    {
> > > +      slp_tree value = optimize_load_redistribution_1 (bst_map, visited,
> > node);
> > > +      if (value)
> > > +	{
> > > +          SLP_TREE_CHILDREN (root)[i] = value;
> > > +          vect_free_slp_tree (node);
> > > +	}
> > > +    }
> > > +
> > > +  return NULL;
> > > +}
> > > +
> > > +/* Temporary workaround for loads not being CSEd during SLP build.  This
> > > +   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
> > > +   VEC_PERM nodes that blend vectors from multiple nodes that all read
> > from the
> > > +   same DR such that the final operation is equal to a permuted load.  Such
> > > +   NODES are then directly converted into LOADS themselves.  The nodes
> > are
> > > +   CSEd using BST_MAP.  */
> > > +
> > > +static void
> > > +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
> > *bst_map,
> > > +			      slp_tree root)
> > > +{
> > > +  slp_tree node;
> > > +  unsigned i;
> > > +  hash_set<slp_tree> visited;
> > > +
> > > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > > +    {
> > > +      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited,
> > node);
> > > +      if (value)
> > > +	{
> > > +          SLP_TREE_CHILDREN (root)[i] = value;
> > > +          vect_free_slp_tree (node);
> > > +	}
> > > +    }
> > > +}
> > > +
> > >  /* Helper function of vect_match_slp_patterns.
> > >
> > >     Attempts to match patterns against the slp tree rooted in REF_NODE
> > > using @@ -2276,7 +2385,7 @@ static bool  vect_match_slp_patterns
> > > (slp_instance instance, vec_info *vinfo,
> > >  			 hash_set<slp_tree> *visited,
> > >  			 slp_tree_to_load_perm_map_t *perm_cache,
> > > -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> > > +			 scalar_stmts_to_slp_tree_map_t *bst_map)
> > >  {
> > >    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
> > >    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); @@ -2291,6
> > > +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info
> > > *vinfo,
> > >
> > >    if (found_p)
> > >      {
> > > +      optimize_load_redistribution (bst_map, *ref_node);
> > > +
> > >        if (dump_enabled_p ())
> > >  	{
> > >  	  dump_printf_loc (MSG_NOTE, vect_location,
> > >
> > >
> > > --
> > >
> > >
> > >     [ Part 2, Text/X-DIFF 140 lines. ]
> > >     [ Unable to print this part. ]
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution
  2021-01-07 13:20 ` [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Richard Biener
  2021-01-07 13:25   ` Tamar Christina
@ 2021-01-11 11:01   ` Tamar Christina
  2021-01-11 13:54     ` Richard Biener
  1 sibling, 1 reply; 27+ messages in thread
From: Tamar Christina @ 2021-01-11 11:01 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches, nd

[-- Attachment #1: Type: text/plain, Size: 13906 bytes --]

Hi Richi,

Attached is the updated patch.

Note that testcases for all of these will be committed with the patch but I'm
Finishing up the 32-bit Arm changes to mirror the changes the AArch64 maintainer
wanted and then have to do bootstrap which will take the majority of the day so
wanted to get these patches out first.

I also built spec with the matcher on and off and noticed no meaningful change in
Compile time but replacements in several benchmarks.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-slp.c (optimize_load_redistribution_1): New.
	(optimize_load_redistribution, vect_is_slp_load_node): New.
	(vect_match_slp_patterns): Use it.

-- inline copy of patch --

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2a58e54fe51471df5f55ce4a524d0022744054b0..89e226ca3a25a6c77b86d46ba234ce54bd3cb83b 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2228,6 +2228,114 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Helper that checks to see if a node is a load node. This is done based on
+   two criterias:
+   1) The node is internal
+   2) The node has no childen.  */
+
+static inline bool
+vect_is_slp_load_node  (slp_tree root)
+{
+  return (SLP_TREE_DEF_TYPE (root) == vect_internal_def
+	  && !SLP_TREE_CHILDREN (root).exists ());
+}
+
+
+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+				vec_info *vinfo, unsigned int group_size,
+				hash_set<slp_tree> *visited, slp_tree root)
+{
+  if (visited->add (root))
+    return NULL;
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now, we don't know anything about externals so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
+    {
+      /* First convert this node into a load node and add it to the leaves
+         list and flatten the permute from a lane to a load one.  If it's
+         unneeded it will be elided later.  */
+      vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+      for (unsigned j = 0; j < lane_perm.length (); j++)
+        {
+          std::pair<unsigned, unsigned> perm = lane_perm[j];
+          node = SLP_TREE_CHILDREN (root)[perm.first];
+
+	  if (!vect_is_slp_load_node (node))
+	   return NULL;
+
+	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+        }
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "converting stmts on permute node %p\n", root);
+
+      bool *matches = XALLOCAVEC (bool, group_size);
+      poly_uint64 max_nunits = 1;
+      unsigned tree_size = 0, limit = 1;
+      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
+				  matches, &limit, &tree_size, bst_map);
+      if (!node)
+	stmts.release ();
+
+      return node;
+    }
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value
+	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, visited,
+					  node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
+   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.  Such
+   NODES are then directly converted into LOADS themselves.  The nodes are
+   CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+			      vec_info *vinfo, unsigned int group_size,
+			      slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+  hash_set<slp_tree> visited;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value
+	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, &visited,
+					  node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2276,7 +2384,7 @@ static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 			 hash_set<slp_tree> *visited,
 			 slp_tree_to_load_perm_map_t *perm_cache,
-			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+			 scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2291,6 +2399,9 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 
   if (found_p)
     {
+      optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (*ref_node),
+				    *ref_node);
+
       if (dump_enabled_p ())
 	{
 	  dump_printf_loc (MSG_NOTE, vect_location,

> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Thursday, January 7, 2021 1:21 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> Subject: Re: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> distribution
> 
> > From tamar.christina@arm.com Mon Dec 28 14:36:32 2020
> > Date: Mon, 28 Dec 2020 13:35:56 +0000
> > From: Tamar Christina <tamar.christina@arm.com>
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd@arm.com, rguenther@suse.de, ook@ucw.cz
> > Subject: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> > distribution
> >
> > Hi All,
> >
> > This introduces a post processing step for the pattern matcher to
> > flatten permutes introduced by the complex multiplications patterns.
> >
> > This performs a blend early such that SLP is not cancelled by the
> > LOAD_LANES permute.  This is a temporary workaround to the fact that
> > loads are not CSEd during building and is required to produce efficient code.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> > 	(optimize_load_redistribution): New.
> > 	(vect_match_slp_patterns): Use it.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index
> >
> 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325c
> f
> > 8406466ac25c 100644
> > --- a/gcc/tree-vect-slp.c
> > +++ b/gcc/tree-vect-slp.c
> > @@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits,
> unsigned int group_size)
> >    return exact_div (common_multiple (nunits, group_size),
> > group_size);  }
> >
> > +/* Helper function of optimize_load_redistribution that performs the
> operation
> > +   recursively.  */
> > +
> > +static slp_tree
> > +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
> *bst_map,
> > +				hash_set<slp_tree> *visited, slp_tree root) {
> > +  if (visited->add (root))
> > +    return NULL;
> > +
> > +  slp_tree node;
> > +  unsigned i;
> > +
> > +  /* For now, we don't know anything about externals so do not do
> > + anything.  */  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
> > +      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
> 
> use a single != vect_internal_def test please
> 
> > +    return NULL;
> > +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
> > +      && SLP_TREE_LANE_PERMUTATION (root).exists ()
> > +      && !SLP_TREE_SCALAR_STMTS (root).exists ())
> 
> I think both last tests are unnecessary
> 
> > +    {
> > +      /* First convert this node into a load node and add it to the leaves
> > +         list and flatten the permute from a lane to a load one.  If it's
> > +         unneeded it will be elided later.  */
> > +      auto_vec<stmt_vec_info> stmts;
> > +      stmts.create (SLP_TREE_LANES (root));
> > +      load_permutation_t load_perm;
> > +      load_perm.create (SLP_TREE_LANES (root));
> > +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION
> > + (root);
> 
> load_perm leaks when any of the below outs is taken
> 
> > +      for (unsigned j = 0; j < lane_perm.length (); j++)
> > +        {
> > +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> > +	  /* This isn't strictly needed, but this function is a temporary
> > +	     one for specifically pattern matching, so don't want it to
> > +	     optimize things the remainder of the pipeline will.  */
> > +	  if (perm.first != j)
> > +	    goto next;
> 
> but please elide it nevertheless
> 
> > +          node = SLP_TREE_CHILDREN (root)[perm.first];
> > +
> > +	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > +	    return NULL;
> 
> so you want to check whether this is a load, I think more to the point would
> be a vect_internal_def + zero SLP children check.  And a comment on what
> we test (we do lack classification of SLP nodes, so a helper like
> vect_is_slp_load_node or so would be OK as well)
> 
> > +
> > +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS
> (node)[perm.second]);
> > +          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION
> > +(node)[perm.second]);
> 
> As you're doing here lacks a check that we are actually loading from the same
> DR group.  I think it might be easier to just collect scalar stmts and throw
> them at vect_build_slp_tree?  That should perform the necessary
> verification, build the appropriate lane permute and perform the CSE.  Which
> leads to the question why the VEC_PERM node doesn't have scalar stmts set
> while we are actually be able to compute them here ... that is, the CSE
> opportunity could have been noticed during pattern matching itself?
> 

I thought about this myself, but given your previous comment of not touching the scalar
Statements during matching I thought it was a cleaner abstraction to separate the two.

Also this optimization function is temporary anyway so I figured I'd leave the matchers
"as they should be" and optimize it afterwards.

Regards,
Tamar

> > +        }
> > +
> > +      if (dump_enabled_p ())
> > +	dump_printf_loc (MSG_NOTE, vect_location,
> > +			 "converting stmts on permute node %p\n", root);
> > +
> > +      slp_tree *value = bst_map->get (stmts);
> > +      if (value)
> > +	node = *value;
> > +      else
> > +	{
> > +	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > +	    SLP_TREE_REF_COUNT (node)++;
> > +
> > +	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
> > +	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
> > +	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
> > +	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
> > +	  bst_map->put (stmts_cpy, node);
> > +	}
> > +      SLP_TREE_REF_COUNT (node)++;
> 
> Adjusting the refcount here but doing the replacement in the caller is a bit
> awkward to follow - how about passing a reference so you can adjust the
> edge here?
> 
> > +
> > +      return node;
> > +    }
> > +
> > +next:
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > +    {
> > +      slp_tree value = optimize_load_redistribution_1 (bst_map, visited,
> node);
> > +      if (value)
> > +	{
> > +          SLP_TREE_CHILDREN (root)[i] = value;
> > +          vect_free_slp_tree (node);
> > +	}
> > +    }
> > +
> > +  return NULL;
> > +}
> > +
> > +/* Temporary workaround for loads not being CSEd during SLP build.  This
> > +   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
> > +   VEC_PERM nodes that blend vectors from multiple nodes that all read
> from the
> > +   same DR such that the final operation is equal to a permuted load.  Such
> > +   NODES are then directly converted into LOADS themselves.  The nodes
> are
> > +   CSEd using BST_MAP.  */
> > +
> > +static void
> > +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
> *bst_map,
> > +			      slp_tree root)
> > +{
> > +  slp_tree node;
> > +  unsigned i;
> > +  hash_set<slp_tree> visited;
> > +
> > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > +    {
> > +      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited,
> node);
> > +      if (value)
> > +	{
> > +          SLP_TREE_CHILDREN (root)[i] = value;
> > +          vect_free_slp_tree (node);
> > +	}
> > +    }
> > +}
> > +
> >  /* Helper function of vect_match_slp_patterns.
> >
> >     Attempts to match patterns against the slp tree rooted in REF_NODE
> > using @@ -2276,7 +2385,7 @@ static bool  vect_match_slp_patterns
> > (slp_instance instance, vec_info *vinfo,
> >  			 hash_set<slp_tree> *visited,
> >  			 slp_tree_to_load_perm_map_t *perm_cache,
> > -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> > +			 scalar_stmts_to_slp_tree_map_t *bst_map)
> >  {
> >    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
> >    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); @@ -2291,6
> > +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info
> > *vinfo,
> >
> >    if (found_p)
> >      {
> > +      optimize_load_redistribution (bst_map, *ref_node);
> > +
> >        if (dump_enabled_p ())
> >  	{
> >  	  dump_printf_loc (MSG_NOTE, vect_location,
> >
> >
> > --
> >
> >
> >     [ Part 2, Text/X-DIFF 140 lines. ]
> >     [ Unable to print this part. ]


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: pr13956.patch --]
[-- Type: text/x-diff; name="pr13956.patch", Size: 4633 bytes --]

diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 2a58e54fe51471df5f55ce4a524d0022744054b0..89e226ca3a25a6c77b86d46ba234ce54bd3cb83b 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2228,6 +2228,114 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Helper that checks to see if a node is a load node. This is done based on
+   two criterias:
+   1) The node is internal
+   2) The node has no childen.  */
+
+static inline bool
+vect_is_slp_load_node  (slp_tree root)
+{
+  return (SLP_TREE_DEF_TYPE (root) == vect_internal_def
+	  && !SLP_TREE_CHILDREN (root).exists ());
+}
+
+
+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+				vec_info *vinfo, unsigned int group_size,
+				hash_set<slp_tree> *visited, slp_tree root)
+{
+  if (visited->add (root))
+    return NULL;
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now, we don't know anything about externals so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
+    {
+      /* First convert this node into a load node and add it to the leaves
+         list and flatten the permute from a lane to a load one.  If it's
+         unneeded it will be elided later.  */
+      vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+      for (unsigned j = 0; j < lane_perm.length (); j++)
+        {
+          std::pair<unsigned, unsigned> perm = lane_perm[j];
+          node = SLP_TREE_CHILDREN (root)[perm.first];
+
+	  if (!vect_is_slp_load_node (node))
+	   return NULL;
+
+	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+        }
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "converting stmts on permute node %p\n", root);
+
+      bool *matches = XALLOCAVEC (bool, group_size);
+      poly_uint64 max_nunits = 1;
+      unsigned tree_size = 0, limit = 1;
+      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
+				  matches, &limit, &tree_size, bst_map);
+      if (!node)
+	stmts.release ();
+
+      return node;
+    }
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value
+	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, visited,
+					  node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
+   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.  Such
+   NODES are then directly converted into LOADS themselves.  The nodes are
+   CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+			      vec_info *vinfo, unsigned int group_size,
+			      slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+  hash_set<slp_tree> visited;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
+    {
+      slp_tree value
+	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, &visited,
+					  node);
+      if (value)
+	{
+          SLP_TREE_CHILDREN (root)[i] = value;
+          vect_free_slp_tree (node);
+	}
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2276,7 +2384,7 @@ static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 			 hash_set<slp_tree> *visited,
 			 slp_tree_to_load_perm_map_t *perm_cache,
-			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+			 scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2291,6 +2399,9 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
 
   if (found_p)
     {
+      optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (*ref_node),
+				    *ref_node);
+
       if (dump_enabled_p ())
 	{
 	  dump_printf_loc (MSG_NOTE, vect_location,


^ permalink raw reply	[flat|nested] 27+ messages in thread

* RE: [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution
  2021-01-11 11:01   ` Tamar Christina
@ 2021-01-11 13:54     ` Richard Biener
  0 siblings, 0 replies; 27+ messages in thread
From: Richard Biener @ 2021-01-11 13:54 UTC (permalink / raw)
  To: Tamar Christina; +Cc: gcc-patches, nd

On Mon, 11 Jan 2021, Tamar Christina wrote:

> Hi Richi,
> 
> Attached is the updated patch.
> 
> Note that testcases for all of these will be committed with the patch but I'm
> Finishing up the 32-bit Arm changes to mirror the changes the AArch64 maintainer
> wanted and then have to do bootstrap which will take the majority of the day so
> wanted to get these patches out first.
> 
> I also built spec with the matcher on and off and noticed no meaningful change in
> Compile time but replacements in several benchmarks.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> 	(optimize_load_redistribution, vect_is_slp_load_node): New.
> 	(vect_match_slp_patterns): Use it.
> 
> -- inline copy of patch --
> 
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 2a58e54fe51471df5f55ce4a524d0022744054b0..89e226ca3a25a6c77b86d46ba234ce54bd3cb83b 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -2228,6 +2228,114 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
>    return exact_div (common_multiple (nunits, group_size), group_size);
>  }
>  
> +/* Helper that checks to see if a node is a load node. This is done based on
> +   two criterias:
> +   1) The node is internal
> +   2) The node has no childen.  */
> +
> +static inline bool
> +vect_is_slp_load_node  (slp_tree root)
> +{
> +  return (SLP_TREE_DEF_TYPE (root) == vect_internal_def
> +	  && !SLP_TREE_CHILDREN (root).exists ());

this would return true for induction defs as well (the SLP_TREE_DEF_TYPE
only distinguishes between vect_internal_def and constant/external 
def...).  It would also not match masked loads.  A more close match
would be

  SLP_TREE_DEF_TYPE (root) == vect_internal_def
  && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
  && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)))

but not sure whether you handle masked loads OK (so you could
do the !SLP_TREE_CHILDREN (root).exists () in the caller if not).

> +}
> +
> +
> +/* Helper function of optimize_load_redistribution that performs the operation
> +   recursively.  */
> +
> +static slp_tree
> +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
> +				vec_info *vinfo, unsigned int group_size,
> +				hash_set<slp_tree> *visited, slp_tree root)
> +{
> +  if (visited->add (root))
> +    return NULL;
> +
> +  slp_tree node;
> +  unsigned i;
> +
> +  /* For now, we don't know anything about externals so do not do anything.  */
> +  if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
> +    return NULL;
> +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
> +    {
> +      /* First convert this node into a load node and add it to the leaves
> +         list and flatten the permute from a lane to a load one.  If it's
> +         unneeded it will be elided later.  */
> +      vec<stmt_vec_info> stmts;
> +      stmts.create (SLP_TREE_LANES (root));
> +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
> +      for (unsigned j = 0; j < lane_perm.length (); j++)
> +        {
> +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> +          node = SLP_TREE_CHILDREN (root)[perm.first];
> +
> +	  if (!vect_is_slp_load_node (node))

stmts leaks here - I think you also want to still recurse to the SLP
children, there can be two_operator nodes consuming the complex
ops.  So maybe a break and guard the rest with j == lane_perm.length ().

> +	   return NULL;
> +
> +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
> +        }
> +
> +      if (dump_enabled_p ())
> +	dump_printf_loc (MSG_NOTE, vect_location,
> +			 "converting stmts on permute node %p\n", root);
> +
> +      bool *matches = XALLOCAVEC (bool, group_size);
> +      poly_uint64 max_nunits = 1;
> +      unsigned tree_size = 0, limit = 1;
> +      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
> +				  matches, &limit, &tree_size, bst_map);
> +      if (!node)
> +	stmts.release ();
> +
> +      return node;
> +    }
> +
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> +    {
> +      slp_tree value
> +	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, visited,
> +					  node);
> +      if (value)
> +	{
> +          SLP_TREE_CHILDREN (root)[i] = value;
> +          vect_free_slp_tree (node);
> +	}
> +    }
> +
> +  return NULL;
> +}
> +
> +/* Temporary workaround for loads not being CSEd during SLP build.  This
> +   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
> +   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
> +   same DR such that the final operation is equal to a permuted load.  Such
> +   NODES are then directly converted into LOADS themselves.  The nodes are
> +   CSEd using BST_MAP.  */
> +
> +static void
> +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
> +			      vec_info *vinfo, unsigned int group_size,
> +			      slp_tree root)
> +{
> +  slp_tree node;
> +  unsigned i;
> +  hash_set<slp_tree> visited;
> +
> +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> +    {
> +      slp_tree value
> +	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, &visited,
> +					  node);
> +      if (value)
> +	{
> +          SLP_TREE_CHILDREN (root)[i] = value;
> +          vect_free_slp_tree (node);
> +	}
> +    }
> +}
> +
>  /* Helper function of vect_match_slp_patterns.
>  
>     Attempts to match patterns against the slp tree rooted in REF_NODE using
> @@ -2276,7 +2384,7 @@ static bool
>  vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
>  			 hash_set<slp_tree> *visited,
>  			 slp_tree_to_load_perm_map_t *perm_cache,
> -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> +			 scalar_stmts_to_slp_tree_map_t *bst_map)
>  {
>    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
>    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
> @@ -2291,6 +2399,9 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
>  
>    if (found_p)
>      {
> +      optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (*ref_node),
> +				    *ref_node);
> +

Can you move this up to the caller and share one visited set for all
SLP instances?  Because you will run into shared subtrees more than
once the way it is structured now.

Otherwise looks ok.

Richard.

>        if (dump_enabled_p ())
>  	{
>  	  dump_printf_loc (MSG_NOTE, vect_location,
> 
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Thursday, January 7, 2021 1:21 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; ook@ucw.cz
> > Subject: Re: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> > distribution
> > 
> > > From tamar.christina@arm.com Mon Dec 28 14:36:32 2020
> > > Date: Mon, 28 Dec 2020 13:35:56 +0000
> > > From: Tamar Christina <tamar.christina@arm.com>
> > > To: gcc-patches@gcc.gnu.org
> > > Cc: nd@arm.com, rguenther@suse.de, ook@ucw.cz
> > > Subject: [PATCH 1/8 v9]middle-end slp: Support optimizing load
> > > distribution
> > >
> > > Hi All,
> > >
> > > This introduces a post processing step for the pattern matcher to
> > > flatten permutes introduced by the complex multiplications patterns.
> > >
> > > This performs a blend early such that SLP is not cancelled by the
> > > LOAD_LANES permute.  This is a temporary workaround to the fact that
> > > loads are not CSEd during building and is required to produce efficient code.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* tree-vect-slp.c (optimize_load_redistribution_1): New.
> > > 	(optimize_load_redistribution): New.
> > > 	(vect_match_slp_patterns): Use it.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index
> > >
> > 2a58e54fe51471df5f55ce4a524d0022744054b0..8360a59098f517498f3155f325c
> > f
> > > 8406466ac25c 100644
> > > --- a/gcc/tree-vect-slp.c
> > > +++ b/gcc/tree-vect-slp.c
> > > @@ -2228,6 +2228,115 @@ calculate_unrolling_factor (poly_uint64 nunits,
> > unsigned int group_size)
> > >    return exact_div (common_multiple (nunits, group_size),
> > > group_size);  }
> > >
> > > +/* Helper function of optimize_load_redistribution that performs the
> > operation
> > > +   recursively.  */
> > > +
> > > +static slp_tree
> > > +optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
> > *bst_map,
> > > +				hash_set<slp_tree> *visited, slp_tree root) {
> > > +  if (visited->add (root))
> > > +    return NULL;
> > > +
> > > +  slp_tree node;
> > > +  unsigned i;
> > > +
> > > +  /* For now, we don't know anything about externals so do not do
> > > + anything.  */  if (SLP_TREE_DEF_TYPE (root) == vect_external_def
> > > +      || SLP_TREE_DEF_TYPE (root) == vect_constant_def)
> > 
> > use a single != vect_internal_def test please
> > 
> > > +    return NULL;
> > > +  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR
> > > +      && SLP_TREE_LANE_PERMUTATION (root).exists ()
> > > +      && !SLP_TREE_SCALAR_STMTS (root).exists ())
> > 
> > I think both last tests are unnecessary
> > 
> > > +    {
> > > +      /* First convert this node into a load node and add it to the leaves
> > > +         list and flatten the permute from a lane to a load one.  If it's
> > > +         unneeded it will be elided later.  */
> > > +      auto_vec<stmt_vec_info> stmts;
> > > +      stmts.create (SLP_TREE_LANES (root));
> > > +      load_permutation_t load_perm;
> > > +      load_perm.create (SLP_TREE_LANES (root));
> > > +      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION
> > > + (root);
> > 
> > load_perm leaks when any of the below outs is taken
> > 
> > > +      for (unsigned j = 0; j < lane_perm.length (); j++)
> > > +        {
> > > +          std::pair<unsigned, unsigned> perm = lane_perm[j];
> > > +	  /* This isn't strictly needed, but this function is a temporary
> > > +	     one for specifically pattern matching, so don't want it to
> > > +	     optimize things the remainder of the pipeline will.  */
> > > +	  if (perm.first != j)
> > > +	    goto next;
> > 
> > but please elide it nevertheless
> > 
> > > +          node = SLP_TREE_CHILDREN (root)[perm.first];
> > > +
> > > +	  if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
> > > +	    return NULL;
> > 
> > so you want to check whether this is a load, I think more to the point would
> > be a vect_internal_def + zero SLP children check.  And a comment on what
> > we test (we do lack classification of SLP nodes, so a helper like
> > vect_is_slp_load_node or so would be OK as well)
> > 
> > > +
> > > +	  stmts.quick_push (SLP_TREE_SCALAR_STMTS
> > (node)[perm.second]);
> > > +          load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION
> > > +(node)[perm.second]);
> > 
> > As you're doing here lacks a check that we are actually loading from the same
> > DR group.  I think it might be easier to just collect scalar stmts and throw
> > them at vect_build_slp_tree?  That should perform the necessary
> > verification, build the appropriate lane permute and perform the CSE.  Which
> > leads to the question why the VEC_PERM node doesn't have scalar stmts set
> > while we are actually be able to compute them here ... that is, the CSE
> > opportunity could have been noticed during pattern matching itself?
> > 
> 
> I thought about this myself, but given your previous comment of not touching the scalar
> Statements during matching I thought it was a cleaner abstraction to separate the two.
> 
> Also this optimization function is temporary anyway so I figured I'd leave the matchers
> "as they should be" and optimize it afterwards.
> 
> Regards,
> Tamar
> 
> > > +        }
> > > +
> > > +      if (dump_enabled_p ())
> > > +	dump_printf_loc (MSG_NOTE, vect_location,
> > > +			 "converting stmts on permute node %p\n", root);
> > > +
> > > +      slp_tree *value = bst_map->get (stmts);
> > > +      if (value)
> > > +	node = *value;
> > > +      else
> > > +	{
> > > +	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
> > > +	    SLP_TREE_REF_COUNT (node)++;
> > > +
> > > +	  vec<stmt_vec_info> stmts_cpy = stmts.copy ();
> > > +	  node = vect_create_new_slp_node (stmts_cpy.copy (), 0);
> > > +	  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (root);
> > > +	  SLP_TREE_LOAD_PERMUTATION (node) = load_perm;
> > > +	  bst_map->put (stmts_cpy, node);
> > > +	}
> > > +      SLP_TREE_REF_COUNT (node)++;
> > 
> > Adjusting the refcount here but doing the replacement in the caller is a bit
> > awkward to follow - how about passing a reference so you can adjust the
> > edge here?
> > 
> > > +
> > > +      return node;
> > > +    }
> > > +
> > > +next:
> > > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > > +    {
> > > +      slp_tree value = optimize_load_redistribution_1 (bst_map, visited,
> > node);
> > > +      if (value)
> > > +	{
> > > +          SLP_TREE_CHILDREN (root)[i] = value;
> > > +          vect_free_slp_tree (node);
> > > +	}
> > > +    }
> > > +
> > > +  return NULL;
> > > +}
> > > +
> > > +/* Temporary workaround for loads not being CSEd during SLP build.  This
> > > +   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
> > > +   VEC_PERM nodes that blend vectors from multiple nodes that all read
> > from the
> > > +   same DR such that the final operation is equal to a permuted load.  Such
> > > +   NODES are then directly converted into LOADS themselves.  The nodes
> > are
> > > +   CSEd using BST_MAP.  */
> > > +
> > > +static void
> > > +optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
> > *bst_map,
> > > +			      slp_tree root)
> > > +{
> > > +  slp_tree node;
> > > +  unsigned i;
> > > +  hash_set<slp_tree> visited;
> > > +
> > > +  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
> > > +    {
> > > +      slp_tree value = optimize_load_redistribution_1 (bst_map, &visited,
> > node);
> > > +      if (value)
> > > +	{
> > > +          SLP_TREE_CHILDREN (root)[i] = value;
> > > +          vect_free_slp_tree (node);
> > > +	}
> > > +    }
> > > +}
> > > +
> > >  /* Helper function of vect_match_slp_patterns.
> > >
> > >     Attempts to match patterns against the slp tree rooted in REF_NODE
> > > using @@ -2276,7 +2385,7 @@ static bool  vect_match_slp_patterns
> > > (slp_instance instance, vec_info *vinfo,
> > >  			 hash_set<slp_tree> *visited,
> > >  			 slp_tree_to_load_perm_map_t *perm_cache,
> > > -			 scalar_stmts_to_slp_tree_map_t * /* bst_map */)
> > > +			 scalar_stmts_to_slp_tree_map_t *bst_map)
> > >  {
> > >    DUMP_VECT_SCOPE ("vect_match_slp_patterns");
> > >    slp_tree *ref_node = &SLP_INSTANCE_TREE (instance); @@ -2291,6
> > > +2400,8 @@ vect_match_slp_patterns (slp_instance instance, vec_info
> > > *vinfo,
> > >
> > >    if (found_p)
> > >      {
> > > +      optimize_load_redistribution (bst_map, *ref_node);
> > > +
> > >        if (dump_enabled_p ())
> > >  	{
> > >  	  dump_printf_loc (MSG_NOTE, vect_location,
> > >
> > >
> > > --
> > >
> > >
> > >     [ Part 2, Text/X-DIFF 140 lines. ]
> > >     [ Unable to print this part. ]
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2021-01-11 13:54 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-28 13:35 [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Tamar Christina
2020-12-28 13:36 ` [PATCH 2/8 v9]middle-end slp: fix is_linear_load_p to prevent multiple answers Tamar Christina
2021-01-07 13:17   ` Richard Biener
2020-12-28 13:36 ` [PATCH 3/8 v9]middle-end slp: handle externals correctly in linear_loads_p Tamar Christina
2021-01-07 13:17   ` Richard Biener
2020-12-28 13:37 ` [PATCH 4/8 v9]middle-end slp: upgrade complex add to new format and fix memory leaks Tamar Christina
2021-01-07 13:18   ` Richard Biener
2020-12-28 13:37 ` [PATCH 5/8 v9]middle-end slp: support complex multiply and complex multiply conjugate Tamar Christina
2021-01-08  9:37   ` Richard Biener
2021-01-11 11:01     ` Tamar Christina
2021-01-11 12:04       ` Richard Biener
2020-12-28 13:37 ` [PATCH 6/8 v9]middle-end slp: support complex FMA and complex FMA conjugate Tamar Christina
2021-01-08  9:45   ` Richard Biener
2021-01-08  9:59     ` Tamar Christina
2021-01-08 10:17       ` Richard Biener
2021-01-08 10:21         ` Tamar Christina
2021-01-11 10:24     ` Tamar Christina
2020-12-28 13:38 ` [PATCH 7/8 v9]middle-end slp: support complex FMS and complex FMS conjugate Tamar Christina
2021-01-08  9:49   ` Richard Biener
2021-01-08 10:02     ` Tamar Christina
2020-12-28 13:38 ` [PATCH 8/8 v9]middle-end slp: Add complex operations class to share first match among all matchers Tamar Christina
2021-01-08  9:50   ` Richard Biener
2021-01-07 13:20 ` [PATCH 1/8 v9]middle-end slp: Support optimizing load distribution Richard Biener
2021-01-07 13:25   ` Tamar Christina
2021-01-07 13:36     ` Richard Biener
2021-01-11 11:01   ` Tamar Christina
2021-01-11 13:54     ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).