public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
@ 2022-07-12 13:35 Michal Jankovič
  2022-07-12 14:08 ` Iain Sandoe
  0 siblings, 1 reply; 9+ messages in thread
From: Michal Jankovič @ 2022-07-12 13:35 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1139 bytes --]

Currently, coroutine frames store all variables of a coroutine separately,
even if their lifetime does not overlap (they are in distinct scopes). This
patch implements overlapping distinct variable scopes in the coroutine frame,
by storing the frame fields in nested unions of structs. This lowers the size
of the frame for larger coroutines significantly, and makes them more usable
on systems with limited memory.

Bootstrapped and regression tested on x86_64-pc-linux-gnu; new test fails
before the patch and succeeds after with no regressions.

	PR c++/105989

gcc/cp/ChangeLog:

	* coroutines.cc (struct local_var_info): Add field_access_path.
	(build_local_var_frame_access_expr): New.
	(transform_local_var_uses): Use build_local_var_frame_access_expr.
	(coro_make_frame_entry_id): New.
	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
	(struct local_vars_frame_data): Add orig, field_access_path.
	(register_local_var_uses): Generate new frame layout. Create access
	paths to vars.
	(morph_fn_to_coro): Set new fields in local_vars_frame_data. 

gcc/testsuite/ChangeLog:

	* g++.dg/coroutines/pr105989.C: New test.


[-- Attachment #2: pr105989.patch --]
[-- Type: application/octet-stream, Size: 10652 bytes --]

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index edb3b706ddc..ed1ac4decaf 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -1997,6 +1997,7 @@ struct local_var_info
   bool is_static;
   bool has_value_expr_p;
   location_t def_loc;
+  vec<tree, va_gc> *field_access_path;
 };
 
 /* For figuring out what local variable usage we have.  */
@@ -2009,6 +2010,26 @@ struct local_vars_transform
   hash_map<tree, local_var_info> *local_var_uses;
 };
 
+/* Build a COMPONENT_REF chain for accessing a nested variable in the coroutine
+   frame.  */
+static tree
+build_local_var_frame_access_expr (local_vars_transform *lvt,
+				   local_var_info *local_var)
+{
+  tree access_expr = lvt->actor_frame;
+
+  for (tree path_elem_id : *local_var->field_access_path)
+    {
+      tree path_elem_member = lookup_member (
+	TREE_TYPE (access_expr), path_elem_id, 1, 0, tf_warning_or_error);
+      access_expr = build3_loc (
+	lvt->loc, COMPONENT_REF, TREE_TYPE (path_elem_member),
+	access_expr, path_elem_member, NULL_TREE);
+    }
+
+  return access_expr;
+}
+
 static tree
 transform_local_var_uses (tree *stmt, int *do_subtree, void *d)
 {
@@ -2040,12 +2061,7 @@ transform_local_var_uses (tree *stmt, int *do_subtree, void *d)
 	  if (local_var.field_id == NULL_TREE)
 	    continue; /* Wasn't used.  */
 
-	  tree fld_ref
-	    = lookup_member (lvd->coro_frame_type, local_var.field_id,
-			     /*protect=*/1, /*want_type=*/0,
-			     tf_warning_or_error);
-	  tree fld_idx = build3_loc (lvd->loc, COMPONENT_REF, TREE_TYPE (lvar),
-				     lvd->actor_frame, fld_ref, NULL_TREE);
+	  tree fld_idx = build_local_var_frame_access_expr (lvd, &local_var);
 	  local_var.field_idx = fld_idx;
 	  SET_DECL_VALUE_EXPR (lvar, fld_idx);
 	  DECL_HAS_VALUE_EXPR_P (lvar) = true;
@@ -3873,14 +3889,24 @@ analyze_fn_parms (tree orig)
 /* Small helper for the repetitive task of adding a new field to the coro
    frame type.  */
 
+static void
+coro_make_frame_entry_id (tree *field_list, tree id, tree fld_type,
+			  location_t loc)
+{
+  tree decl = build_decl (loc, FIELD_DECL, id, fld_type);
+  DECL_CHAIN (decl) = *field_list;
+  *field_list = decl;
+}
+
+/* Same as coro_make_frame_entry_id, but creates an identifier from string.  */
+
 static tree
 coro_make_frame_entry (tree *field_list, const char *name, tree fld_type,
 		       location_t loc)
 {
   tree id = get_identifier (name);
-  tree decl = build_decl (loc, FIELD_DECL, id, fld_type);
-  DECL_CHAIN (decl) = *field_list;
-  *field_list = decl;
+  coro_make_frame_entry_id (field_list, id, fld_type, loc);
+
   return id;
 }
 
@@ -3894,6 +3920,8 @@ struct local_vars_frame_data
   location_t loc;
   bool saw_capture;
   bool local_var_seen;
+  tree orig;
+  vec<tree, va_gc> *field_access_path;
 };
 
 /* A tree-walk callback that processes one bind expression noting local
@@ -3912,6 +3940,21 @@ register_local_var_uses (tree *stmt, int *do_subtree, void *d)
 
   if (TREE_CODE (*stmt) == BIND_EXPR)
     {
+      tree scope_field_id = NULL_TREE;
+      if (lvd->nest_depth != 0)
+	{
+	  /* Create identifier under which fields for this bind-expression will
+	     be accessed.  */
+	  char *scope_field_name
+	    = xasprintf ("_Scope%u_%u", lvd->nest_depth, lvd->bind_indx);
+	  scope_field_id = get_identifier (scope_field_name);
+	  free (scope_field_name);
+
+	  vec_safe_push (lvd->field_access_path, scope_field_id);
+	}
+
+      tree scope_variables = NULL_TREE;
+
       tree lvar;
       unsigned serial = 0;
       for (lvar = BIND_EXPR_VARS (*stmt); lvar != NULL;
@@ -3980,17 +4023,99 @@ register_local_var_uses (tree *stmt, int *do_subtree, void *d)
 
 	  /* TODO: Figure out if we should build a local type that has any
 	     excess alignment or size from the original decl.  */
-	  local_var.field_id = coro_make_frame_entry (lvd->field_list, buf,
+
+	  local_var.field_id = coro_make_frame_entry (&scope_variables, buf,
 						      lvtype, lvd->loc);
 	  free (buf);
 	  /* We don't walk any of the local var sub-trees, they won't contain
 	     any bind exprs.  */
+
+	  local_var.field_access_path = make_tree_vector_copy (
+	    lvd->field_access_path);
+	  vec_safe_push (local_var.field_access_path, local_var.field_id);
 	}
+
+      unsigned bind_indx = lvd->bind_indx;
+      tree* parent_field_list = lvd->field_list;
+
+      /* Collect the scope structs of child bind-expressions when recursing.  */
+      tree child_scopes = NULL_TREE;
+      lvd->field_list = &child_scopes;
+
+      /* Create identifier under which fields for child bind-expressions will be
+	 accessed.  */
+      char *child_scopes_field_name
+	= xasprintf ("_Scope_list%u_%u", lvd->nest_depth, bind_indx);
+      tree child_scopes_field_id = get_identifier (child_scopes_field_name);
+      free (child_scopes_field_name);
+
+      /* Recurse to child bind expressions.  */
       lvd->bind_indx++;
       lvd->nest_depth++;
+      vec_safe_push (lvd->field_access_path, child_scopes_field_id);
       cp_walk_tree (&BIND_EXPR_BODY (*stmt), register_local_var_uses, d, NULL);
       *do_subtree = 0; /* We've done this.  */
+      lvd->field_access_path->pop ();
       lvd->nest_depth--;
+
+      /* Restore the parent field list.  */
+      lvd->field_list = parent_field_list;
+
+      if (child_scopes != NULL_TREE)
+	{
+	  /* Create a union to house the child scopes, so that they are
+	     overlapped in the coroutine frame.  */
+	  char *child_scopes_union_name_suffix
+	    = xasprintf ("Frame_scope_list%u_%u", lvd->nest_depth, bind_indx);
+	  tree child_scopes_union_name = get_fn_local_identifier (
+	    lvd->orig, child_scopes_union_name_suffix);
+	  free (child_scopes_union_name_suffix);
+	  tree child_scopes_union
+	    = xref_tag (union_type, child_scopes_union_name);
+	  DECL_CONTEXT (TYPE_NAME (child_scopes_union)) = current_scope ();
+	  child_scopes_union = begin_class_definition (child_scopes_union);
+	  TYPE_FIELDS (child_scopes_union) = child_scopes;
+	  TYPE_BINFO (child_scopes_union) = make_tree_binfo (0);
+	  BINFO_OFFSET (TYPE_BINFO (child_scopes_union)) = size_zero_node;
+	  BINFO_TYPE (TYPE_BINFO (child_scopes_union)) = child_scopes_union;
+	  child_scopes_union = finish_struct (child_scopes_union, NULL_TREE);
+
+	  /* Add it to the current scope fields.  */
+	  coro_make_frame_entry_id (
+	    &scope_variables, child_scopes_field_id,
+	    child_scopes_union, lvd->loc);
+	}
+
+      if (lvd->nest_depth == 0)
+	{
+	  /* The outermost scope contains special variables, embed them directly
+	     in the coroutine frame without nesting.  */
+	  *lvd->field_list = scope_variables;
+	} else
+	{
+	  /* Create a struct for housing the vars of this bind-expr
+	     in the coroutine frame.  */
+	  char *scope_struct_name_suffix
+	    = xasprintf ("Frame_scope%u_%u", lvd->nest_depth, bind_indx);
+	  tree scope_struct_name
+	    = get_fn_local_identifier (lvd->orig, scope_struct_name_suffix);
+	  free (scope_struct_name_suffix);
+	  tree scope_struct = xref_tag (record_type, scope_struct_name);
+	  DECL_CONTEXT (TYPE_NAME (scope_struct)) = current_scope ();
+	  scope_struct = begin_class_definition (scope_struct);
+	  TYPE_FIELDS (scope_struct) = scope_variables;
+	  TYPE_BINFO (scope_struct) = make_tree_binfo (0);
+	  BINFO_OFFSET (TYPE_BINFO (scope_struct)) = size_zero_node;
+	  BINFO_TYPE (TYPE_BINFO (scope_struct)) = scope_struct;
+	  scope_struct = finish_struct (scope_struct, NULL_TREE);
+
+	  /* Add the scope struct to the parent field list.  */
+	  coro_make_frame_entry_id (parent_field_list, scope_field_id,
+				    scope_struct,
+				    lvd->loc);
+
+	  lvd->field_access_path->pop ();
+	}
     }
   return NULL_TREE;
 }
@@ -4487,7 +4612,8 @@ morph_fn_to_coro (tree orig, tree *resumer, tree *destroyer)
      would expect to delete unused entries later.  */
   hash_map<tree, local_var_info> local_var_uses;
   local_vars_frame_data local_vars_data
-    = {&field_list, &local_var_uses, 0, 0, fn_start, false, false};
+    = {&field_list, &local_var_uses, 0, 0, fn_start, false, false, orig,
+       make_tree_vector ()};
   cp_walk_tree (&fnbody, register_local_var_uses, &local_vars_data, NULL);
 
   /* Tie off the struct for now, so that we can build offsets to the
diff --git a/gcc/testsuite/g++.dg/coroutines/pr105989.C b/gcc/testsuite/g++.dg/coroutines/pr105989.C
new file mode 100644
index 00000000000..c8b9be634aa
--- /dev/null
+++ b/gcc/testsuite/g++.dg/coroutines/pr105989.C
@@ -0,0 +1,124 @@
+//  { dg-do run }
+
+#include <cstddef>
+
+#include "coro.h"
+
+struct promise;
+
+struct task
+{
+  coro::coroutine_handle<promise> handle;
+
+  ~task ()
+  {
+    if (handle)
+      {
+	handle.destroy ();
+      }
+  }
+};
+
+std::size_t frame_size = 0u;
+
+struct promise
+{
+  coro::coroutine_handle<> continuation = coro::noop_coroutine ();
+
+  auto get_return_object () noexcept
+  {
+    return task{coro::coroutine_handle<promise>::from_promise (*this)};
+  }
+
+  void unhandled_exception () noexcept {}
+
+  void return_void () {}
+
+  auto initial_suspend () noexcept { return coro::suspend_always{}; }
+
+  auto final_suspend () noexcept
+  {
+    struct awaiter_type : coro::suspend_always
+    {
+      auto await_suspend (coro::coroutine_handle <promise> handle) noexcept
+      {
+	return handle.promise ().continuation;
+      }
+    };
+
+    return awaiter_type{};
+  }
+
+  void * operator new (std::size_t size)
+  {
+    frame_size = size;
+    return new std::byte[size];
+  }
+
+  void operator delete (void *ptr, std::size_t size)
+  {
+    return delete[] static_cast<std::byte *>(ptr);
+  }
+};
+
+auto operator co_await (task &&t)
+{
+  struct awaiter_type : coro::suspend_always
+  {
+    coro::coroutine_handle<promise> handle;
+
+    auto await_suspend (coro::coroutine_handle<promise> continuation)
+    {
+      handle.promise ().continuation = continuation;
+      return handle;
+    }
+  };
+
+  return awaiter_type{{}, t.handle};
+}
+
+template<typename... Args>
+struct coro::coroutine_traits<task, Args...>
+{
+  using promise_type = promise;
+};
+
+auto coro_3 () -> task
+{
+  co_return;
+}
+
+std::byte *arr_ptr = nullptr;
+
+task coro_2 ()
+{
+  {
+    std::byte arr[256];
+    co_await coro_3 ();
+    arr_ptr = arr;
+  }
+  {
+    std::byte arr[256];
+    co_await coro_3 ();
+    arr_ptr = arr;
+  }
+}
+
+task coro_1 ()
+{
+  std::byte arr[256];
+  co_await coro_3 ();
+  arr_ptr = arr;
+}
+
+int main ()
+{
+  coro_1 ();
+  auto first_frame_size = frame_size;
+  coro_2 ();
+  auto second_frame_size = frame_size;
+
+  /* coro_1 frame should be the same size as coro_2 frame.  */
+
+  return first_frame_size == second_frame_size ? 0 : 1;
+}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2022-07-12 13:35 [PATCH] c++: coroutines - Overlap variables in frame [PR105989] Michal Jankovič
@ 2022-07-12 14:08 ` Iain Sandoe
  2022-07-12 15:14   ` Michal Jankovič
  0 siblings, 1 reply; 9+ messages in thread
From: Iain Sandoe @ 2022-07-12 14:08 UTC (permalink / raw)
  To: Michal Jankovič; +Cc: GCC Patches

Hi Michal,

> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> 
> Currently, coroutine frames store all variables of a coroutine separately,
> even if their lifetime does not overlap (they are in distinct scopes). This
> patch implements overlapping distinct variable scopes in the coroutine frame,
> by storing the frame fields in nested unions of structs. This lowers the size
> of the frame for larger coroutines significantly, and makes them more usable
> on systems with limited memory.

not a review (I will try to take a look at the weekend).

but … this is one of the two main optimisations on my TODO - so cool for doing it.

(the other related optimisation is to eliminate frame entries for scopes without any suspend
 points - which has the potential to save even more space for code with sparse use of co_xxxx)

Iain

> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new test fails
> before the patch and succeeds after with no regressions.
> 
> 	PR c++/105989
> 
> gcc/cp/ChangeLog:
> 
> 	* coroutines.cc (struct local_var_info): Add field_access_path.
> 	(build_local_var_frame_access_expr): New.
> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
> 	(coro_make_frame_entry_id): New.
> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
> 	(struct local_vars_frame_data): Add orig, field_access_path.
> 	(register_local_var_uses): Generate new frame layout. Create access
> 	paths to vars.
> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data. 
> 
> gcc/testsuite/ChangeLog:
> 
> 	* g++.dg/coroutines/pr105989.C: New test.
> 
> <pr105989.patch>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2022-07-12 14:08 ` Iain Sandoe
@ 2022-07-12 15:14   ` Michal Jankovič
  2022-07-12 17:11     ` Iain Sandoe
  0 siblings, 1 reply; 9+ messages in thread
From: Michal Jankovič @ 2022-07-12 15:14 UTC (permalink / raw)
  To: Iain Sandoe; +Cc: GCC Patches

Hi Iain,

Thanks for the reply, this is my first time contributing and I am
looking forward to your input.

One other related thing I would like to investigate is reducing the
number of compiler generated variables in the frame, particularly
_Coro_destroy_fn and _Coro_self_handle.  

As I understand it, _Coro_destroy_fn just sets a flag in
_Coro_resume_index and calls _Coro_resume_fn; it should be possible to
move this logic to __builtin_coro_destroy, so that only _Coro_resume_fn
is stored in the frame; this would however change the coroutine ABI - I
don't know if that's a problem.

The _Coro_self_handle should be constructible on-demand from the frame address.

Do you have any advice / opinions on this before I try to implement it?

Michal

On Jul 12 2022, at 4:08 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:

> Hi Michal,
>  
>> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>>  
>> Currently, coroutine frames store all variables of a coroutine separately,
>> even if their lifetime does not overlap (they are in distinct
>> scopes). This
>> patch implements overlapping distinct variable scopes in the
>> coroutine frame,
>> by storing the frame fields in nested unions of structs. This lowers
>> the size
>> of the frame for larger coroutines significantly, and makes them more usable
>> on systems with limited memory.
>  
> not a review (I will try to take a look at the weekend).
>  
> but … this is one of the two main optimisations on my TODO - so cool
> for doing it.
>  
> (the other related optimisation is to eliminate frame entries for
> scopes without any suspend
> points - which has the potential to save even more space for code with
> sparse use of co_xxxx)
>  
> Iain
>  
>> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new test fails
>> before the patch and succeeds after with no regressions.
>>  
>> 	PR c++/105989
>>  
>> gcc/cp/ChangeLog:
>>  
>> 	* coroutines.cc (struct local_var_info): Add field_access_path.
>> 	(build_local_var_frame_access_expr): New.
>> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
>> 	(coro_make_frame_entry_id): New.
>> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
>> 	(struct local_vars_frame_data): Add orig, field_access_path.
>> 	(register_local_var_uses): Generate new frame layout. Create access
>> 	paths to vars.
>> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data.  
>>  
>> gcc/testsuite/ChangeLog:
>>  
>> 	* g++.dg/coroutines/pr105989.C: New test.
>>  
>> <pr105989.patch>
>  
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2022-07-12 15:14   ` Michal Jankovič
@ 2022-07-12 17:11     ` Iain Sandoe
  2022-07-13 12:54       ` Michal Jankovic
  0 siblings, 1 reply; 9+ messages in thread
From: Iain Sandoe @ 2022-07-12 17:11 UTC (permalink / raw)
  To: Michal Jankovič; +Cc: GCC Patches

Hi Michal,

> On 12 Jul 2022, at 16:14, Michal Jankovič <michal.jankovic59@gmail.com> wrote:

> One other related thing I would like to investigate is reducing the
> number of compiler generated variables in the frame, particularly
> _Coro_destroy_fn and _Coro_self_handle.  
> 
> As I understand it, _Coro_destroy_fn just sets a flag in
> _Coro_resume_index and calls _Coro_resume_fn; it should be possible to
> move this logic to __builtin_coro_destroy, so that only _Coro_resume_fn
> is stored in the frame;

That is a particular point about GCC’s implementation … (it is not neccesarily, or even
likely to be the same for other implementations) - see below.

I was intending to do experiment with making the ramp/resume/destroy value a parameter
to the actor function so that we would have something like -

ramp calls      actor(frame, 0)
resume calls  actor(frame, 1)
destroy calls  actor(frame, 2) 
- the token values are illustrative, not intended to be a final version.

I think that should allow for more inlining opportunites and possibly a way forward to
frame elision (a.k.a halo).

> this would however change the coroutine ABI - I don't know if that's a problem.

The external ABI for the coroutine is the 
 resume,
 destroy pointers 
 and the promise 
 and that one can find each of these from the frame pointer.

This was agreed between the interested “vendors” so that one compiler could invoke
coroutines built by another.  So I do not think this is so much a useful area to explore.

Also the intent is that an indirect call through the frame pointer is the most frequent
operation so should be the most efficient.  
  resume() might be called many times, 
  destroy() just once thus it is a cold code path 
  - space can be important too - but interoperability was the goal here.

> The _Coro_self_handle should be constructible on-demand from the frame address.

Yes, and in the header the relevant items are all constexpr - so that should happen in the
user’s code.  I elected to have that value in the frame to avoid recreating it each time - I
suppose that is a trade-off of one oiptimisation c.f. another … 
> 
> Do you have any advice / opinions on this before I try to implement it?

Hopefully, the notes above help.

I will rebase my latest code changes as soon as I have a chance and put them somewhere
for you to look at - basically, these are to try and address the correctness issues we face,

Iain


> 
> Michal
> 
> On Jul 12 2022, at 4:08 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
> 
>> Hi Michal,
>> 
>>> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches
>>> <gcc-patches@gcc.gnu.org> wrote:
>>> 
>>> Currently, coroutine frames store all variables of a coroutine separately,
>>> even if their lifetime does not overlap (they are in distinct
>>> scopes). This
>>> patch implements overlapping distinct variable scopes in the
>>> coroutine frame,
>>> by storing the frame fields in nested unions of structs. This lowers
>>> the size
>>> of the frame for larger coroutines significantly, and makes them more usable
>>> on systems with limited memory.
>> 
>> not a review (I will try to take a look at the weekend).
>> 
>> but … this is one of the two main optimisations on my TODO - so cool
>> for doing it.
>> 
>> (the other related optimisation is to eliminate frame entries for
>> scopes without any suspend
>> points - which has the potential to save even more space for code with
>> sparse use of co_xxxx)
>> 
>> Iain
>> 
>>> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new test fails
>>> before the patch and succeeds after with no regressions.
>>> 
>>> 	PR c++/105989
>>> 
>>> gcc/cp/ChangeLog:
>>> 
>>> 	* coroutines.cc (struct local_var_info): Add field_access_path.
>>> 	(build_local_var_frame_access_expr): New.
>>> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
>>> 	(coro_make_frame_entry_id): New.
>>> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
>>> 	(struct local_vars_frame_data): Add orig, field_access_path.
>>> 	(register_local_var_uses): Generate new frame layout. Create access
>>> 	paths to vars.
>>> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data.  
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>> 	* g++.dg/coroutines/pr105989.C: New test.
>>> 
>>> <pr105989.patch>
>> 
>> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2022-07-12 17:11     ` Iain Sandoe
@ 2022-07-13 12:54       ` Michal Jankovic
  2023-05-14 15:36         ` Michal Jankovič
  0 siblings, 1 reply; 9+ messages in thread
From: Michal Jankovic @ 2022-07-13 12:54 UTC (permalink / raw)
  To: Iain Sandoe; +Cc: GCC Patches

Hi Iain,

thanks for the info. I have some follow-up questions.

On Jul 12 2022, at 7:11 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:

> Hi Michal,
>  
>> On 12 Jul 2022, at 16:14, Michal Jankovič
>> <michal.jankovic59@gmail.com> wrote:
>  
>> One other related thing I would like to investigate is reducing the
>> number of compiler generated variables in the frame, particularly
>> _Coro_destroy_fn and _Coro_self_handle.   
>>  
>> As I understand it, _Coro_destroy_fn just sets a flag in
>> _Coro_resume_index and calls _Coro_resume_fn; it should be possible to
>> move this logic to __builtin_coro_destroy, so that only _Coro_resume_fn
>> is stored in the frame;
>  
> That is a particular point about GCC’s implementation … (it is not
> neccesarily, or even
> likely to be the same for other implementations) - see below.
>  
> I was intending to do experiment with making the ramp/resume/destroy
> value a parameter
> to the actor function so that we would have something like -
>  
> ramp calls      actor(frame, 0)
> resume calls  actor(frame, 1)
> destroy calls  actor(frame, 2)  
> - the token values are illustrative, not intended to be a final version.
>  
> I think that should allow for more inlining opportunites and possibly
> a way forward to
> frame elision (a.k.a halo).
>  
>> this would however change the coroutine ABI - I don't know if that's
>> a problem.
>  
> The external ABI for the coroutine is the  
> resume,
> destroy pointers  
> and the promise  
> and that one can find each of these from the frame pointer.
>  
> This was agreed between the interested “vendors” so that one compiler
> could invoke
> coroutines built by another.  So I do not think this is so much a
> useful area to explore.
>  

I understand. I still want to try to implement a more light-weight frame
layout with just one function pointer; would it be possible to merge
such a change if it was made opt-in via a compiler flag, eg
`-fsmall-coroutine-frame`? My use-case for this is embedded environments
with very limited memory, and I do not care about interoperability with
other compilers there.  

> Also the intent is that an indirect call through the frame pointer is
> the most frequent
> operation so should be the most efficient.   
>  resume() might be called many times,  
>  destroy() just once thus it is a cold code path  
>  - space can be important too - but interoperability was the goal here.
>  
>> The _Coro_self_handle should be constructible on-demand from the
>> frame address.
>  
> Yes, and in the header the relevant items are all constexpr - so that
> should happen in the
> user’s code.  I elected to have that value in the frame to avoid
> recreating it each time - I
> suppose that is a trade-off of one oiptimisation c.f. another …  

If the handle construction cannot be optimized out, and its thus  
a tradeoff between frame size and number of instructions, then this
could also be enabled by a hypothetical `-fsmall-coroutine-frame`.

Coming back to this:

>>> (the other related optimisation is to eliminate frame entries for
>>> scopes without any suspend
>>> points - which has the potential to save even more space for code with
>>> sparse use of co_xxxx)

This would be nice; although it could encompassed by a more general  
optimization - eliminate frame entries for all variables which are not  
accessed (directly or via pointer / reference) beyond a suspend point.
To be fair, I do not know how to get started on such an optimization,
or if it is even possible to do on the frontend. This would however be
immensely useful for reducing the frame size taken-up by complicated
co_await expressions (among other things), for example, if I have a
composed operation:

co_await when_either(get_leaf_awaitable_1(), get_leaf_awaitable_2());

Right now, this creates space in the frame for the temporary 'leaf'  
awaitables, which were already moved into the composed awaitable.
If the awaitable has an operator co_await that returns the real awaiter,
the original awaitable is also stored in the frame, even if it  
is not referenced by the awaiter; another unused object gets stored if  
the .await_transform() customization point was used.

What are your thoughts on the feasibility / difficulty of implementing
such an optimization?

Michal

>>  
>> Do you have any advice / opinions on this before I try to implement it?
>  
> Hopefully, the notes above help.
>  
> I will rebase my latest code changes as soon as I have a chance and
> put them somewhere
> for you to look at - basically, these are to try and address the
> correctness issues we face,
>  
> Iain
>  
>  
>>  
>> Michal
>>  
>> On Jul 12 2022, at 4:08 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>>  
>>> Hi Michal,
>>>  
>>>> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches
>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>  
>>>> Currently, coroutine frames store all variables of a coroutine separately,
>>>> even if their lifetime does not overlap (they are in distinct
>>>> scopes). This
>>>> patch implements overlapping distinct variable scopes in the
>>>> coroutine frame,
>>>> by storing the frame fields in nested unions of structs. This lowers
>>>> the size
>>>> of the frame for larger coroutines significantly, and makes them
>>>> more usable
>>>> on systems with limited memory.
>>>  
>>> not a review (I will try to take a look at the weekend).
>>>  
>>> but … this is one of the two main optimisations on my TODO - so cool
>>> for doing it.
>>>  
>>> (the other related optimisation is to eliminate frame entries for
>>> scopes without any suspend
>>> points - which has the potential to save even more space for code with
>>> sparse use of co_xxxx)
>>>  
>>> Iain
>>>  
>>>> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new test fails
>>>> before the patch and succeeds after with no regressions.
>>>>  
>>>> 	PR c++/105989
>>>>  
>>>> gcc/cp/ChangeLog:
>>>>  
>>>> 	* coroutines.cc (struct local_var_info): Add field_access_path.
>>>> 	(build_local_var_frame_access_expr): New.
>>>> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
>>>> 	(coro_make_frame_entry_id): New.
>>>> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
>>>> 	(struct local_vars_frame_data): Add orig, field_access_path.
>>>> 	(register_local_var_uses): Generate new frame layout. Create access
>>>> 	paths to vars.
>>>> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data.   
>>>>  
>>>> gcc/testsuite/ChangeLog:
>>>>  
>>>> 	* g++.dg/coroutines/pr105989.C: New test.
>>>>  
>>>> <pr105989.patch>
>>>  
>>>  
>  
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2022-07-13 12:54       ` Michal Jankovic
@ 2023-05-14 15:36         ` Michal Jankovič
  2023-05-14 16:07           ` Iain Sandoe
  0 siblings, 1 reply; 9+ messages in thread
From: Michal Jankovič @ 2023-05-14 15:36 UTC (permalink / raw)
  To: Iain Sandoe; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 7332 bytes --]

Rebased the patch to GCC 14 trunk. Bootstrapped and regression tested
again on x86_64-pc-linux-gnu, only difference is the new test failing
without the patch.

On Jul 13 2022, at 2:54 pm, Michal Jankovic
<michal.jankovic59@gmail.com> wrote:

> Hi Iain,
>  
> thanks for the info. I have some follow-up questions.
>  
> On Jul 12 2022, at 7:11 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>  
>> Hi Michal,
>>   
>>> On 12 Jul 2022, at 16:14, Michal Jankovič
>>> <michal.jankovic59@gmail.com> wrote:
>>   
>>> One other related thing I would like to investigate is reducing the
>>> number of compiler generated variables in the frame, particularly
>>> _Coro_destroy_fn and _Coro_self_handle.    
>>>   
>>> As I understand it, _Coro_destroy_fn just sets a flag in
>>> _Coro_resume_index and calls _Coro_resume_fn; it should be possible to
>>> move this logic to __builtin_coro_destroy, so that only _Coro_resume_fn
>>> is stored in the frame;
>>   
>> That is a particular point about GCC’s implementation … (it is not
>> neccesarily, or even
>> likely to be the same for other implementations) - see below.
>>   
>> I was intending to do experiment with making the ramp/resume/destroy
>> value a parameter
>> to the actor function so that we would have something like -
>>   
>> ramp calls      actor(frame, 0)
>> resume calls  actor(frame, 1)
>> destroy calls  actor(frame, 2)   
>> - the token values are illustrative, not intended to be a final version.
>>   
>> I think that should allow for more inlining opportunites and possibly
>> a way forward to
>> frame elision (a.k.a halo).
>>   
>>> this would however change the coroutine ABI - I don't know if that's
>>> a problem.
>>   
>> The external ABI for the coroutine is the   
>> resume,
>> destroy pointers   
>> and the promise   
>> and that one can find each of these from the frame pointer.
>>   
>> This was agreed between the interested “vendors” so that one compiler
>> could invoke
>> coroutines built by another.  So I do not think this is so much a
>> useful area to explore.
>>   
>  
> I understand. I still want to try to implement a more light-weight frame
> layout with just one function pointer; would it be possible to merge
> such a change if it was made opt-in via a compiler flag, eg
> `-fsmall-coroutine-frame`? My use-case for this is embedded environments
> with very limited memory, and I do not care about interoperability with
> other compilers there.   
>  
>> Also the intent is that an indirect call through the frame pointer is
>> the most frequent
>> operation so should be the most efficient.    
>>  resume() might be called many times,   
>>  destroy() just once thus it is a cold code path   
>>  - space can be important too - but interoperability was the goal here.
>>   
>>> The _Coro_self_handle should be constructible on-demand from the
>>> frame address.
>>   
>> Yes, and in the header the relevant items are all constexpr - so that
>> should happen in the
>> user’s code.  I elected to have that value in the frame to avoid
>> recreating it each time - I
>> suppose that is a trade-off of one oiptimisation c.f. another …   
>  
> If the handle construction cannot be optimized out, and its thus   
> a tradeoff between frame size and number of instructions, then this
> could also be enabled by a hypothetical `-fsmall-coroutine-frame`.
>  
> Coming back to this:
>  
>>>> (the other related optimisation is to eliminate frame entries for
>>>> scopes without any suspend
>>>> points - which has the potential to save even more space for code with
>>>> sparse use of co_xxxx)
>  
> This would be nice; although it could encompassed by a more general   
> optimization - eliminate frame entries for all variables which are not
>   
> accessed (directly or via pointer / reference) beyond a suspend point.
> To be fair, I do not know how to get started on such an optimization,
> or if it is even possible to do on the frontend. This would however be
> immensely useful for reducing the frame size taken-up by complicated
> co_await expressions (among other things), for example, if I have a
> composed operation:
>  
> co_await when_either(get_leaf_awaitable_1(), get_leaf_awaitable_2());
>  
> Right now, this creates space in the frame for the temporary 'leaf'   
> awaitables, which were already moved into the composed awaitable.
> If the awaitable has an operator co_await that returns the real awaiter,
> the original awaitable is also stored in the frame, even if it   
> is not referenced by the awaiter; another unused object gets stored if
>   
> the .await_transform() customization point was used.
>  
> What are your thoughts on the feasibility / difficulty of implementing
> such an optimization?
>  
> Michal
>  
>>>   
>>> Do you have any advice / opinions on this before I try to implement it?
>>   
>> Hopefully, the notes above help.
>>   
>> I will rebase my latest code changes as soon as I have a chance and
>> put them somewhere
>> for you to look at - basically, these are to try and address the
>> correctness issues we face,
>>   
>> Iain
>>   
>>   
>>>   
>>> Michal
>>>   
>>> On Jul 12 2022, at 4:08 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>>>   
>>>> Hi Michal,
>>>>   
>>>>> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches
>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>   
>>>>> Currently, coroutine frames store all variables of a coroutine separately,
>>>>> even if their lifetime does not overlap (they are in distinct
>>>>> scopes). This
>>>>> patch implements overlapping distinct variable scopes in the
>>>>> coroutine frame,
>>>>> by storing the frame fields in nested unions of structs. This lowers
>>>>> the size
>>>>> of the frame for larger coroutines significantly, and makes them
>>>>> more usable
>>>>> on systems with limited memory.
>>>>   
>>>> not a review (I will try to take a look at the weekend).
>>>>   
>>>> but … this is one of the two main optimisations on my TODO - so cool
>>>> for doing it.
>>>>   
>>>> (the other related optimisation is to eliminate frame entries for
>>>> scopes without any suspend
>>>> points - which has the potential to save even more space for code with
>>>> sparse use of co_xxxx)
>>>>   
>>>> Iain
>>>>   
>>>>> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new
>>>>> test fails
>>>>> before the patch and succeeds after with no regressions.
>>>>>   
>>>>> 	PR c++/105989
>>>>>   
>>>>> gcc/cp/ChangeLog:
>>>>>   
>>>>> 	* coroutines.cc (struct local_var_info): Add field_access_path.
>>>>> 	(build_local_var_frame_access_expr): New.
>>>>> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
>>>>> 	(coro_make_frame_entry_id): New.
>>>>> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
>>>>> 	(struct local_vars_frame_data): Add orig, field_access_path.
>>>>> 	(register_local_var_uses): Generate new frame layout. Create access
>>>>> 	paths to vars.
>>>>> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data.    
>>>>>   
>>>>> gcc/testsuite/ChangeLog:
>>>>>   
>>>>> 	* g++.dg/coroutines/pr105989.C: New test.
>>>>>   
>>>>> <pr105989.patch>
>>>>   
>>>>   
>>   
>>  
> 

[-- Attachment #2: pr105989.patch --]
[-- Type: application/octet-stream, Size: 11152 bytes --]

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index 59a240ebd40..d965afdd3dc 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -2007,6 +2007,7 @@ struct local_var_info
   bool is_static;
   bool has_value_expr_p;
   location_t def_loc;
+  vec<tree, va_gc> *field_access_path;
 };
 
 /* For figuring out what local variable usage we have.  */
@@ -2019,6 +2020,26 @@ struct local_vars_transform
   hash_map<tree, local_var_info> *local_var_uses;
 };
 
+/* Build a COMPONENT_REF chain for accessing a nested variable in the coroutine
+   frame.  */
+static tree
+build_local_var_frame_access_expr (local_vars_transform *lvt,
+				   local_var_info *local_var)
+{
+  tree access_expr = lvt->actor_frame;
+
+  for (tree path_elem_id : *local_var->field_access_path)
+    {
+      tree path_elem_member = lookup_member (
+	TREE_TYPE (access_expr), path_elem_id, 1, 0, tf_warning_or_error);
+      access_expr = build3_loc (
+	lvt->loc, COMPONENT_REF, TREE_TYPE (path_elem_member),
+	access_expr, path_elem_member, NULL_TREE);
+    }
+
+  return access_expr;
+}
+
 static tree
 transform_local_var_uses (tree *stmt, int *do_subtree, void *d)
 {
@@ -2050,12 +2071,7 @@ transform_local_var_uses (tree *stmt, int *do_subtree, void *d)
 	  if (local_var.field_id == NULL_TREE)
 	    continue; /* Wasn't used.  */
 
-	  tree fld_ref
-	    = lookup_member (lvd->coro_frame_type, local_var.field_id,
-			     /*protect=*/1, /*want_type=*/0,
-			     tf_warning_or_error);
-	  tree fld_idx = build3 (COMPONENT_REF, TREE_TYPE (lvar),
-				 lvd->actor_frame, fld_ref, NULL_TREE);
+	  tree fld_idx = build_local_var_frame_access_expr (lvd, &local_var);
 	  local_var.field_idx = fld_idx;
 	  SET_DECL_VALUE_EXPR (lvar, fld_idx);
 	  DECL_HAS_VALUE_EXPR_P (lvar) = true;
@@ -3884,14 +3900,24 @@ analyze_fn_parms (tree orig)
 /* Small helper for the repetitive task of adding a new field to the coro
    frame type.  */
 
+static void
+coro_make_frame_entry_id (tree *field_list, tree id, tree fld_type,
+			  location_t loc)
+{
+  tree decl = build_decl (loc, FIELD_DECL, id, fld_type);
+  DECL_CHAIN (decl) = *field_list;
+  *field_list = decl;
+}
+
+/* Same as coro_make_frame_entry_id, but creates an identifier from string.  */
+
 static tree
 coro_make_frame_entry (tree *field_list, const char *name, tree fld_type,
 		       location_t loc)
 {
   tree id = get_identifier (name);
-  tree decl = build_decl (loc, FIELD_DECL, id, fld_type);
-  DECL_CHAIN (decl) = *field_list;
-  *field_list = decl;
+  coro_make_frame_entry_id (field_list, id, fld_type, loc);
+
   return id;
 }
 
@@ -3905,6 +3931,8 @@ struct local_vars_frame_data
   location_t loc;
   bool saw_capture;
   bool local_var_seen;
+  tree orig;
+  vec<tree, va_gc> *field_access_path;
 };
 
 /* A tree-walk callback that processes one bind expression noting local
@@ -3923,6 +3951,21 @@ register_local_var_uses (tree *stmt, int *do_subtree, void *d)
 
   if (TREE_CODE (*stmt) == BIND_EXPR)
     {
+      tree scope_field_id = NULL_TREE;
+      if (lvd->nest_depth != 0)
+	{
+	  /* Create identifier under which fields for this bind-expression will
+	     be accessed.  */
+	  char *scope_field_name
+	    = xasprintf ("_Scope%u_%u", lvd->nest_depth, lvd->bind_indx);
+	  scope_field_id = get_identifier (scope_field_name);
+	  free (scope_field_name);
+
+	  vec_safe_push (lvd->field_access_path, scope_field_id);
+	}
+
+      tree scope_variables = NULL_TREE;
+
       tree lvar;
       unsigned serial = 0;
       for (lvar = BIND_EXPR_VARS (*stmt); lvar != NULL;
@@ -3991,17 +4034,99 @@ register_local_var_uses (tree *stmt, int *do_subtree, void *d)
 
 	  /* TODO: Figure out if we should build a local type that has any
 	     excess alignment or size from the original decl.  */
-	  local_var.field_id = coro_make_frame_entry (lvd->field_list, buf,
+
+	  local_var.field_id = coro_make_frame_entry (&scope_variables, buf,
 						      lvtype, lvd->loc);
 	  free (buf);
 	  /* We don't walk any of the local var sub-trees, they won't contain
 	     any bind exprs.  */
+
+	  local_var.field_access_path = make_tree_vector_copy (
+	    lvd->field_access_path);
+	  vec_safe_push (local_var.field_access_path, local_var.field_id);
 	}
+
+      unsigned bind_indx = lvd->bind_indx;
+      tree* parent_field_list = lvd->field_list;
+
+      /* Collect the scope structs of child bind-expressions when recursing.  */
+      tree child_scopes = NULL_TREE;
+      lvd->field_list = &child_scopes;
+
+      /* Create identifier under which fields for child bind-expressions will be
+	 accessed.  */
+      char *child_scopes_field_name
+	= xasprintf ("_Scope_list%u_%u", lvd->nest_depth, bind_indx);
+      tree child_scopes_field_id = get_identifier (child_scopes_field_name);
+      free (child_scopes_field_name);
+
+      /* Recurse to child bind expressions.  */
       lvd->bind_indx++;
       lvd->nest_depth++;
+      vec_safe_push (lvd->field_access_path, child_scopes_field_id);
       cp_walk_tree (&BIND_EXPR_BODY (*stmt), register_local_var_uses, d, NULL);
       *do_subtree = 0; /* We've done this.  */
+      lvd->field_access_path->pop ();
       lvd->nest_depth--;
+
+      /* Restore the parent field list.  */
+      lvd->field_list = parent_field_list;
+
+      if (child_scopes != NULL_TREE)
+	{
+	  /* Create a union to house the child scopes, so that they are
+	     overlapped in the coroutine frame.  */
+	  char *child_scopes_union_name_suffix
+	    = xasprintf ("Frame_scope_list%u_%u", lvd->nest_depth, bind_indx);
+	  tree child_scopes_union_name = get_fn_local_identifier (
+	    lvd->orig, child_scopes_union_name_suffix);
+	  free (child_scopes_union_name_suffix);
+	  tree child_scopes_union
+	    = xref_tag (union_type, child_scopes_union_name);
+	  DECL_CONTEXT (TYPE_NAME (child_scopes_union)) = current_scope ();
+	  child_scopes_union = begin_class_definition (child_scopes_union);
+	  TYPE_FIELDS (child_scopes_union) = child_scopes;
+	  TYPE_BINFO (child_scopes_union) = make_tree_binfo (0);
+	  BINFO_OFFSET (TYPE_BINFO (child_scopes_union)) = size_zero_node;
+	  BINFO_TYPE (TYPE_BINFO (child_scopes_union)) = child_scopes_union;
+	  child_scopes_union = finish_struct (child_scopes_union, NULL_TREE);
+
+	  /* Add it to the current scope fields.  */
+	  coro_make_frame_entry_id (
+	    &scope_variables, child_scopes_field_id,
+	    child_scopes_union, lvd->loc);
+	}
+
+      if (lvd->nest_depth == 0)
+	{
+	  /* The outermost scope contains special variables, embed them directly
+	     in the coroutine frame without nesting.  */
+	  *lvd->field_list = scope_variables;
+	} else
+	{
+	  /* Create a struct for housing the vars of this bind-expr
+	     in the coroutine frame.  */
+	  char *scope_struct_name_suffix
+	    = xasprintf ("Frame_scope%u_%u", lvd->nest_depth, bind_indx);
+	  tree scope_struct_name
+	    = get_fn_local_identifier (lvd->orig, scope_struct_name_suffix);
+	  free (scope_struct_name_suffix);
+	  tree scope_struct = xref_tag (record_type, scope_struct_name);
+	  DECL_CONTEXT (TYPE_NAME (scope_struct)) = current_scope ();
+	  scope_struct = begin_class_definition (scope_struct);
+	  TYPE_FIELDS (scope_struct) = scope_variables;
+	  TYPE_BINFO (scope_struct) = make_tree_binfo (0);
+	  BINFO_OFFSET (TYPE_BINFO (scope_struct)) = size_zero_node;
+	  BINFO_TYPE (TYPE_BINFO (scope_struct)) = scope_struct;
+	  scope_struct = finish_struct (scope_struct, NULL_TREE);
+
+	  /* Add the scope struct to the parent field list.  */
+	  coro_make_frame_entry_id (parent_field_list, scope_field_id,
+				    scope_struct,
+				    lvd->loc);
+
+	  lvd->field_access_path->pop ();
+	}
     }
   return NULL_TREE;
 }
@@ -4101,7 +4226,7 @@ coro_rewrite_function_body (location_t fn_start, tree fnbody, tree orig,
       for (tree b = BLOCK_SUBBLOCKS (replace_blk); b; b = BLOCK_CHAIN (b))
 	BLOCK_SUPERCONTEXT (b) = replace_blk;
       BIND_EXPR_BLOCK (first) = replace_blk;
-      /* The top block has one child, so far, and we have now got a 
+      /* The top block has one child, so far, and we have now got a
 	 superblock.  */
       BLOCK_SUPERCONTEXT (replace_blk) = top_block;
       BLOCK_SUBBLOCKS (top_block) = replace_blk;
@@ -4511,7 +4636,8 @@ morph_fn_to_coro (tree orig, tree *resumer, tree *destroyer)
      would expect to delete unused entries later.  */
   hash_map<tree, local_var_info> local_var_uses;
   local_vars_frame_data local_vars_data
-    = {&field_list, &local_var_uses, 0, 0, fn_start, false, false};
+    = {&field_list, &local_var_uses, 0, 0, fn_start, false, false, orig,
+       make_tree_vector ()};
   cp_walk_tree (&fnbody, register_local_var_uses, &local_vars_data, NULL);
 
   /* Tie off the struct for now, so that we can build offsets to the
diff --git a/gcc/testsuite/g++.dg/coroutines/pr105989.C b/gcc/testsuite/g++.dg/coroutines/pr105989.C
new file mode 100644
index 00000000000..c8b9be634aa
--- /dev/null
+++ b/gcc/testsuite/g++.dg/coroutines/pr105989.C
@@ -0,0 +1,124 @@
+//  { dg-do run }
+
+#include <cstddef>
+
+#include "coro.h"
+
+struct promise;
+
+struct task
+{
+  coro::coroutine_handle<promise> handle;
+
+  ~task ()
+  {
+    if (handle)
+      {
+	handle.destroy ();
+      }
+  }
+};
+
+std::size_t frame_size = 0u;
+
+struct promise
+{
+  coro::coroutine_handle<> continuation = coro::noop_coroutine ();
+
+  auto get_return_object () noexcept
+  {
+    return task{coro::coroutine_handle<promise>::from_promise (*this)};
+  }
+
+  void unhandled_exception () noexcept {}
+
+  void return_void () {}
+
+  auto initial_suspend () noexcept { return coro::suspend_always{}; }
+
+  auto final_suspend () noexcept
+  {
+    struct awaiter_type : coro::suspend_always
+    {
+      auto await_suspend (coro::coroutine_handle <promise> handle) noexcept
+      {
+	return handle.promise ().continuation;
+      }
+    };
+
+    return awaiter_type{};
+  }
+
+  void * operator new (std::size_t size)
+  {
+    frame_size = size;
+    return new std::byte[size];
+  }
+
+  void operator delete (void *ptr, std::size_t size)
+  {
+    return delete[] static_cast<std::byte *>(ptr);
+  }
+};
+
+auto operator co_await (task &&t)
+{
+  struct awaiter_type : coro::suspend_always
+  {
+    coro::coroutine_handle<promise> handle;
+
+    auto await_suspend (coro::coroutine_handle<promise> continuation)
+    {
+      handle.promise ().continuation = continuation;
+      return handle;
+    }
+  };
+
+  return awaiter_type{{}, t.handle};
+}
+
+template<typename... Args>
+struct coro::coroutine_traits<task, Args...>
+{
+  using promise_type = promise;
+};
+
+auto coro_3 () -> task
+{
+  co_return;
+}
+
+std::byte *arr_ptr = nullptr;
+
+task coro_2 ()
+{
+  {
+    std::byte arr[256];
+    co_await coro_3 ();
+    arr_ptr = arr;
+  }
+  {
+    std::byte arr[256];
+    co_await coro_3 ();
+    arr_ptr = arr;
+  }
+}
+
+task coro_1 ()
+{
+  std::byte arr[256];
+  co_await coro_3 ();
+  arr_ptr = arr;
+}
+
+int main ()
+{
+  coro_1 ();
+  auto first_frame_size = frame_size;
+  coro_2 ();
+  auto second_frame_size = frame_size;
+
+  /* coro_1 frame should be the same size as coro_2 frame.  */
+
+  return first_frame_size == second_frame_size ? 0 : 1;
+}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2023-05-14 15:36         ` Michal Jankovič
@ 2023-05-14 16:07           ` Iain Sandoe
  2023-05-14 16:31             ` Michal Jankovič
  0 siblings, 1 reply; 9+ messages in thread
From: Iain Sandoe @ 2023-05-14 16:07 UTC (permalink / raw)
  To: Michal Jankovič; +Cc: GCC Patches

Hi Michal,

> On 14 May 2023, at 16:36, Michal Jankovič <michal.jankovic59@gmail.com> wrote:
> 
> Rebased the patch to GCC 14 trunk. Bootstrapped and regression tested
> again on x86_64-pc-linux-gnu, only difference is the new test failing
> without the patch.

(as previously noted, I am much in favour of this optimisation)

Do you have any metrics on the reductions in frame size for realistic coroutines?

thanks
Iain

> 
> On Jul 13 2022, at 2:54 pm, Michal Jankovic
> <michal.jankovic59@gmail.com> wrote:
> 
>> Hi Iain,
>> 
>> thanks for the info. I have some follow-up questions.
>> 
>> On Jul 12 2022, at 7:11 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>> 
>>> Hi Michal,
>>> 
>>>> On 12 Jul 2022, at 16:14, Michal Jankovič
>>>> <michal.jankovic59@gmail.com> wrote:
>>> 
>>>> One other related thing I would like to investigate is reducing the
>>>> number of compiler generated variables in the frame, particularly
>>>> _Coro_destroy_fn and _Coro_self_handle.    
>>>> 
>>>> As I understand it, _Coro_destroy_fn just sets a flag in
>>>> _Coro_resume_index and calls _Coro_resume_fn; it should be possible to
>>>> move this logic to __builtin_coro_destroy, so that only _Coro_resume_fn
>>>> is stored in the frame;
>>> 
>>> That is a particular point about GCC’s implementation … (it is not
>>> neccesarily, or even
>>> likely to be the same for other implementations) - see below.
>>> 
>>> I was intending to do experiment with making the ramp/resume/destroy
>>> value a parameter
>>> to the actor function so that we would have something like -
>>> 
>>> ramp calls      actor(frame, 0)
>>> resume calls  actor(frame, 1)
>>> destroy calls  actor(frame, 2)   
>>> - the token values are illustrative, not intended to be a final version.
>>> 
>>> I think that should allow for more inlining opportunites and possibly
>>> a way forward to
>>> frame elision (a.k.a halo).
>>> 
>>>> this would however change the coroutine ABI - I don't know if that's
>>>> a problem.
>>> 
>>> The external ABI for the coroutine is the   
>>> resume,
>>> destroy pointers   
>>> and the promise   
>>> and that one can find each of these from the frame pointer.
>>> 
>>> This was agreed between the interested “vendors” so that one compiler
>>> could invoke
>>> coroutines built by another.  So I do not think this is so much a
>>> useful area to explore.
>>> 
>> 
>> I understand. I still want to try to implement a more light-weight frame
>> layout with just one function pointer; would it be possible to merge
>> such a change if it was made opt-in via a compiler flag, eg
>> `-fsmall-coroutine-frame`? My use-case for this is embedded environments
>> with very limited memory, and I do not care about interoperability with
>> other compilers there.   
>> 
>>> Also the intent is that an indirect call through the frame pointer is
>>> the most frequent
>>> operation so should be the most efficient.    
>>> resume() might be called many times,   
>>> destroy() just once thus it is a cold code path   
>>> - space can be important too - but interoperability was the goal here.
>>> 
>>>> The _Coro_self_handle should be constructible on-demand from the
>>>> frame address.
>>> 
>>> Yes, and in the header the relevant items are all constexpr - so that
>>> should happen in the
>>> user’s code.  I elected to have that value in the frame to avoid
>>> recreating it each time - I
>>> suppose that is a trade-off of one oiptimisation c.f. another …   
>> 
>> If the handle construction cannot be optimized out, and its thus   
>> a tradeoff between frame size and number of instructions, then this
>> could also be enabled by a hypothetical `-fsmall-coroutine-frame`.
>> 
>> Coming back to this:
>> 
>>>>> (the other related optimisation is to eliminate frame entries for
>>>>> scopes without any suspend
>>>>> points - which has the potential to save even more space for code with
>>>>> sparse use of co_xxxx)
>> 
>> This would be nice; although it could encompassed by a more general   
>> optimization - eliminate frame entries for all variables which are not
>> 
>> accessed (directly or via pointer / reference) beyond a suspend point.
>> To be fair, I do not know how to get started on such an optimization,
>> or if it is even possible to do on the frontend. This would however be
>> immensely useful for reducing the frame size taken-up by complicated
>> co_await expressions (among other things), for example, if I have a
>> composed operation:
>> 
>> co_await when_either(get_leaf_awaitable_1(), get_leaf_awaitable_2());
>> 
>> Right now, this creates space in the frame for the temporary 'leaf'   
>> awaitables, which were already moved into the composed awaitable.
>> If the awaitable has an operator co_await that returns the real awaiter,
>> the original awaitable is also stored in the frame, even if it   
>> is not referenced by the awaiter; another unused object gets stored if
>> 
>> the .await_transform() customization point was used.
>> 
>> What are your thoughts on the feasibility / difficulty of implementing
>> such an optimization?
>> 
>> Michal
>> 
>>>> 
>>>> Do you have any advice / opinions on this before I try to implement it?
>>> 
>>> Hopefully, the notes above help.
>>> 
>>> I will rebase my latest code changes as soon as I have a chance and
>>> put them somewhere
>>> for you to look at - basically, these are to try and address the
>>> correctness issues we face,
>>> 
>>> Iain
>>> 
>>> 
>>>> 
>>>> Michal
>>>> 
>>>> On Jul 12 2022, at 4:08 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>>>> 
>>>>> Hi Michal,
>>>>> 
>>>>>> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches
>>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>> 
>>>>>> Currently, coroutine frames store all variables of a coroutine separately,
>>>>>> even if their lifetime does not overlap (they are in distinct
>>>>>> scopes). This
>>>>>> patch implements overlapping distinct variable scopes in the
>>>>>> coroutine frame,
>>>>>> by storing the frame fields in nested unions of structs. This lowers
>>>>>> the size
>>>>>> of the frame for larger coroutines significantly, and makes them
>>>>>> more usable
>>>>>> on systems with limited memory.
>>>>> 
>>>>> not a review (I will try to take a look at the weekend).
>>>>> 
>>>>> but … this is one of the two main optimisations on my TODO - so cool
>>>>> for doing it.
>>>>> 
>>>>> (the other related optimisation is to eliminate frame entries for
>>>>> scopes without any suspend
>>>>> points - which has the potential to save even more space for code with
>>>>> sparse use of co_xxxx)
>>>>> 
>>>>> Iain
>>>>> 
>>>>>> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new
>>>>>> test fails
>>>>>> before the patch and succeeds after with no regressions.
>>>>>> 
>>>>>> 	PR c++/105989
>>>>>> 
>>>>>> gcc/cp/ChangeLog:
>>>>>> 
>>>>>> 	* coroutines.cc (struct local_var_info): Add field_access_path.
>>>>>> 	(build_local_var_frame_access_expr): New.
>>>>>> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
>>>>>> 	(coro_make_frame_entry_id): New.
>>>>>> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
>>>>>> 	(struct local_vars_frame_data): Add orig, field_access_path.
>>>>>> 	(register_local_var_uses): Generate new frame layout. Create access
>>>>>> 	paths to vars.
>>>>>> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data.    
>>>>>> 
>>>>>> gcc/testsuite/ChangeLog:
>>>>>> 
>>>>>> 	* g++.dg/coroutines/pr105989.C: New test.
>>>>>> 
>>>>>> <pr105989.patch>
>>>>> 
>>>>> 
>>> 
>>> 
> <pr105989.patch>


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2023-05-14 16:07           ` Iain Sandoe
@ 2023-05-14 16:31             ` Michal Jankovič
  2023-05-14 18:51               ` Iain Sandoe
  0 siblings, 1 reply; 9+ messages in thread
From: Michal Jankovič @ 2023-05-14 16:31 UTC (permalink / raw)
  To: Iain Sandoe; +Cc: GCC Patches

Hi Iain,

I do not currently have metrics for this, but I can look into generating
them, however I currently do not know of any large open-source projects
using coroutines that I could use for this; I was thinking about using
cppcoro unit tests, but they mostly contain very simple coroutines. I
have source for ~20k LoC proprietary project relying heavily on
coroutines (using boost::asio::awaitable), but here I cannot show the
source along with the numbers - would this be enough or should I look
for more open source projects?

thanks,
Michal

On May 14 2023, at 6:07 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:

> Hi Michal,
>  
>> On 14 May 2023, at 16:36, Michal Jankovič
>> <michal.jankovic59@gmail.com> wrote:
>>  
>> Rebased the patch to GCC 14 trunk. Bootstrapped and regression tested
>> again on x86_64-pc-linux-gnu, only difference is the new test failing
>> without the patch.
>  
> (as previously noted, I am much in favour of this optimisation)
>  
> Do you have any metrics on the reductions in frame size for realistic coroutines?
>  
> thanks
> Iain
>  
>>  
>> On Jul 13 2022, at 2:54 pm, Michal Jankovic
>> <michal.jankovic59@gmail.com> wrote:
>>  
>>> Hi Iain,
>>>  
>>> thanks for the info. I have some follow-up questions.
>>>  
>>> On Jul 12 2022, at 7:11 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>>>  
>>>> Hi Michal,
>>>>  
>>>>> On 12 Jul 2022, at 16:14, Michal Jankovič
>>>>> <michal.jankovic59@gmail.com> wrote:
>>>>  
>>>>> One other related thing I would like to investigate is reducing the
>>>>> number of compiler generated variables in the frame, particularly
>>>>> _Coro_destroy_fn and _Coro_self_handle.     
>>>>>  
>>>>> As I understand it, _Coro_destroy_fn just sets a flag in
>>>>> _Coro_resume_index and calls _Coro_resume_fn; it should be
>>>>> possible to
>>>>> move this logic to __builtin_coro_destroy, so that only _Coro_resume_fn
>>>>> is stored in the frame;
>>>>  
>>>> That is a particular point about GCC’s implementation … (it is not
>>>> neccesarily, or even
>>>> likely to be the same for other implementations) - see below.
>>>>  
>>>> I was intending to do experiment with making the ramp/resume/destroy
>>>> value a parameter
>>>> to the actor function so that we would have something like -
>>>>  
>>>> ramp calls      actor(frame, 0)
>>>> resume calls  actor(frame, 1)
>>>> destroy calls  actor(frame, 2)    
>>>> - the token values are illustrative, not intended to be a final version.
>>>>  
>>>> I think that should allow for more inlining opportunites and possibly
>>>> a way forward to
>>>> frame elision (a.k.a halo).
>>>>  
>>>>> this would however change the coroutine ABI - I don't know if that's
>>>>> a problem.
>>>>  
>>>> The external ABI for the coroutine is the    
>>>> resume,
>>>> destroy pointers    
>>>> and the promise    
>>>> and that one can find each of these from the frame pointer.
>>>>  
>>>> This was agreed between the interested “vendors” so that one compiler
>>>> could invoke
>>>> coroutines built by another.  So I do not think this is so much a
>>>> useful area to explore.
>>>>  
>>>  
>>> I understand. I still want to try to implement a more light-weight frame
>>> layout with just one function pointer; would it be possible to merge
>>> such a change if it was made opt-in via a compiler flag, eg
>>> `-fsmall-coroutine-frame`? My use-case for this is embedded environments
>>> with very limited memory, and I do not care about interoperability with
>>> other compilers there.    
>>>  
>>>> Also the intent is that an indirect call through the frame pointer is
>>>> the most frequent
>>>> operation so should be the most efficient.     
>>>> resume() might be called many times,    
>>>> destroy() just once thus it is a cold code path    
>>>> - space can be important too - but interoperability was the goal here.
>>>>  
>>>>> The _Coro_self_handle should be constructible on-demand from the
>>>>> frame address.
>>>>  
>>>> Yes, and in the header the relevant items are all constexpr - so that
>>>> should happen in the
>>>> user’s code.  I elected to have that value in the frame to avoid
>>>> recreating it each time - I
>>>> suppose that is a trade-off of one oiptimisation c.f. another …    
>>>  
>>> If the handle construction cannot be optimized out, and its thus    
>>> a tradeoff between frame size and number of instructions, then this
>>> could also be enabled by a hypothetical `-fsmall-coroutine-frame`.
>>>  
>>> Coming back to this:
>>>  
>>>>>> (the other related optimisation is to eliminate frame entries for
>>>>>> scopes without any suspend
>>>>>> points - which has the potential to save even more space for code with
>>>>>> sparse use of co_xxxx)
>>>  
>>> This would be nice; although it could encompassed by a more general  
>>>   
>>> optimization - eliminate frame entries for all variables which are not
>>>  
>>> accessed (directly or via pointer / reference) beyond a suspend point.
>>> To be fair, I do not know how to get started on such an optimization,
>>> or if it is even possible to do on the frontend. This would however be
>>> immensely useful for reducing the frame size taken-up by complicated
>>> co_await expressions (among other things), for example, if I have a
>>> composed operation:
>>>  
>>> co_await when_either(get_leaf_awaitable_1(), get_leaf_awaitable_2());
>>>  
>>> Right now, this creates space in the frame for the temporary 'leaf'  
>>>   
>>> awaitables, which were already moved into the composed awaitable.
>>> If the awaitable has an operator co_await that returns the real awaiter,
>>> the original awaitable is also stored in the frame, even if it    
>>> is not referenced by the awaiter; another unused object gets stored if
>>>  
>>> the .await_transform() customization point was used.
>>>  
>>> What are your thoughts on the feasibility / difficulty of implementing
>>> such an optimization?
>>>  
>>> Michal
>>>  
>>>>>  
>>>>> Do you have any advice / opinions on this before I try to
>>>>> implement it?
>>>>  
>>>> Hopefully, the notes above help.
>>>>  
>>>> I will rebase my latest code changes as soon as I have a chance and
>>>> put them somewhere
>>>> for you to look at - basically, these are to try and address the
>>>> correctness issues we face,
>>>>  
>>>> Iain
>>>>  
>>>>  
>>>>>  
>>>>> Michal
>>>>>  
>>>>> On Jul 12 2022, at 4:08 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>>>>>  
>>>>>> Hi Michal,
>>>>>>  
>>>>>>> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches
>>>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>>>  
>>>>>>> Currently, coroutine frames store all variables of a coroutine separately,
>>>>>>> even if their lifetime does not overlap (they are in distinct
>>>>>>> scopes). This
>>>>>>> patch implements overlapping distinct variable scopes in the
>>>>>>> coroutine frame,
>>>>>>> by storing the frame fields in nested unions of structs. This lowers
>>>>>>> the size
>>>>>>> of the frame for larger coroutines significantly, and makes them
>>>>>>> more usable
>>>>>>> on systems with limited memory.
>>>>>>  
>>>>>> not a review (I will try to take a look at the weekend).
>>>>>>  
>>>>>> but … this is one of the two main optimisations on my TODO - so cool
>>>>>> for doing it.
>>>>>>  
>>>>>> (the other related optimisation is to eliminate frame entries for
>>>>>> scopes without any suspend
>>>>>> points - which has the potential to save even more space for code with
>>>>>> sparse use of co_xxxx)
>>>>>>  
>>>>>> Iain
>>>>>>  
>>>>>>> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new
>>>>>>> test fails
>>>>>>> before the patch and succeeds after with no regressions.
>>>>>>>  
>>>>>>> 	PR c++/105989
>>>>>>>  
>>>>>>> gcc/cp/ChangeLog:
>>>>>>>  
>>>>>>> 	* coroutines.cc (struct local_var_info): Add field_access_path.
>>>>>>> 	(build_local_var_frame_access_expr): New.
>>>>>>> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
>>>>>>> 	(coro_make_frame_entry_id): New.
>>>>>>> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
>>>>>>> 	(struct local_vars_frame_data): Add orig, field_access_path.
>>>>>>> 	(register_local_var_uses): Generate new frame layout. Create access
>>>>>>> 	paths to vars.
>>>>>>> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data.     
>>>>>>>  
>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>  
>>>>>>> 	* g++.dg/coroutines/pr105989.C: New test.
>>>>>>>  
>>>>>>> <pr105989.patch>
>>>>>>  
>>>>>>  
>>>>  
>>>>  
>> <pr105989.patch>
>  
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] c++: coroutines - Overlap variables in frame [PR105989]
  2023-05-14 16:31             ` Michal Jankovič
@ 2023-05-14 18:51               ` Iain Sandoe
  0 siblings, 0 replies; 9+ messages in thread
From: Iain Sandoe @ 2023-05-14 18:51 UTC (permalink / raw)
  To: Michal Jankovič; +Cc: GCC Patches

Hi Michal,

> On 14 May 2023, at 17:31, Michal Jankovič via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:

> I do not currently have metrics for this, but I can look into generating
> them,

To be clear, this is not in any way a precondition for patch acceptance
but I am curious as to some idea of the improvements seen.

>  however
> I currently do not know of any large open-source projects
> using coroutines that I could use for this; I was thinking about using
> cppcoro unit tests, but they mostly contain very simple coroutines.

Even cppcoro would be some idea.

Larger OSS projects  I know of are folly (which I have out of tree patches
to enable the coroutines tests) and seastar (which I have no idea how to
build at present).

If you are interested to try building folly (note: it has a lot of prereqs) then
I can push my WIP branch somewhere.

> I
> have source for ~20k LoC proprietary project relying heavily on
> coroutines (using boost::asio::awaitable), but here I cannot show the
> source along with the numbers - would this be enough or should I look
> for more open source projects?

The internal project figures are also useful (IMO) even tho we cannot
obviously repeat them.

thanks
Iain

> 
> thanks,
> Michal
> 
> On May 14 2023, at 6:07 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
> 
>> Hi Michal,
>> 
>>> On 14 May 2023, at 16:36, Michal Jankovič
>>> <michal.jankovic59@gmail.com> wrote:
>>> 
>>> Rebased the patch to GCC 14 trunk. Bootstrapped and regression tested
>>> again on x86_64-pc-linux-gnu, only difference is the new test failing
>>> without the patch.
>> 
>> (as previously noted, I am much in favour of this optimisation)
>> 
>> Do you have any metrics on the reductions in frame size for realistic coroutines?
>> 
>> thanks
>> Iain
>> 
>>> 
>>> On Jul 13 2022, at 2:54 pm, Michal Jankovic
>>> <michal.jankovic59@gmail.com> wrote:
>>> 
>>>> Hi Iain,
>>>> 
>>>> thanks for the info. I have some follow-up questions.
>>>> 
>>>> On Jul 12 2022, at 7:11 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>>>> 
>>>>> Hi Michal,
>>>>> 
>>>>>> On 12 Jul 2022, at 16:14, Michal Jankovič
>>>>>> <michal.jankovic59@gmail.com> wrote:
>>>>> 
>>>>>> One other related thing I would like to investigate is reducing the
>>>>>> number of compiler generated variables in the frame, particularly
>>>>>> _Coro_destroy_fn and _Coro_self_handle.     
>>>>>> 
>>>>>> As I understand it, _Coro_destroy_fn just sets a flag in
>>>>>> _Coro_resume_index and calls _Coro_resume_fn; it should be
>>>>>> possible to
>>>>>> move this logic to __builtin_coro_destroy, so that only _Coro_resume_fn
>>>>>> is stored in the frame;
>>>>> 
>>>>> That is a particular point about GCC’s implementation … (it is not
>>>>> neccesarily, or even
>>>>> likely to be the same for other implementations) - see below.
>>>>> 
>>>>> I was intending to do experiment with making the ramp/resume/destroy
>>>>> value a parameter
>>>>> to the actor function so that we would have something like -
>>>>> 
>>>>> ramp calls      actor(frame, 0)
>>>>> resume calls  actor(frame, 1)
>>>>> destroy calls  actor(frame, 2)    
>>>>> - the token values are illustrative, not intended to be a final version.
>>>>> 
>>>>> I think that should allow for more inlining opportunites and possibly
>>>>> a way forward to
>>>>> frame elision (a.k.a halo).
>>>>> 
>>>>>> this would however change the coroutine ABI - I don't know if that's
>>>>>> a problem.
>>>>> 
>>>>> The external ABI for the coroutine is the    
>>>>> resume,
>>>>> destroy pointers    
>>>>> and the promise    
>>>>> and that one can find each of these from the frame pointer.
>>>>> 
>>>>> This was agreed between the interested “vendors” so that one compiler
>>>>> could invoke
>>>>> coroutines built by another.  So I do not think this is so much a
>>>>> useful area to explore.
>>>>> 
>>>> 
>>>> I understand. I still want to try to implement a more light-weight frame
>>>> layout with just one function pointer; would it be possible to merge
>>>> such a change if it was made opt-in via a compiler flag, eg
>>>> `-fsmall-coroutine-frame`? My use-case for this is embedded environments
>>>> with very limited memory, and I do not care about interoperability with
>>>> other compilers there.    
>>>> 
>>>>> Also the intent is that an indirect call through the frame pointer is
>>>>> the most frequent
>>>>> operation so should be the most efficient.     
>>>>> resume() might be called many times,    
>>>>> destroy() just once thus it is a cold code path    
>>>>> - space can be important too - but interoperability was the goal here.
>>>>> 
>>>>>> The _Coro_self_handle should be constructible on-demand from the
>>>>>> frame address.
>>>>> 
>>>>> Yes, and in the header the relevant items are all constexpr - so that
>>>>> should happen in the
>>>>> user’s code.  I elected to have that value in the frame to avoid
>>>>> recreating it each time - I
>>>>> suppose that is a trade-off of one oiptimisation c.f. another …    
>>>> 
>>>> If the handle construction cannot be optimized out, and its thus    
>>>> a tradeoff between frame size and number of instructions, then this
>>>> could also be enabled by a hypothetical `-fsmall-coroutine-frame`.
>>>> 
>>>> Coming back to this:
>>>> 
>>>>>>> (the other related optimisation is to eliminate frame entries for
>>>>>>> scopes without any suspend
>>>>>>> points - which has the potential to save even more space for code with
>>>>>>> sparse use of co_xxxx)
>>>> 
>>>> This would be nice; although it could encompassed by a more general  
>>>> 
>>>> optimization - eliminate frame entries for all variables which are not
>>>> 
>>>> accessed (directly or via pointer / reference) beyond a suspend point.
>>>> To be fair, I do not know how to get started on such an optimization,
>>>> or if it is even possible to do on the frontend. This would however be
>>>> immensely useful for reducing the frame size taken-up by complicated
>>>> co_await expressions (among other things), for example, if I have a
>>>> composed operation:
>>>> 
>>>> co_await when_either(get_leaf_awaitable_1(), get_leaf_awaitable_2());
>>>> 
>>>> Right now, this creates space in the frame for the temporary 'leaf'  
>>>> 
>>>> awaitables, which were already moved into the composed awaitable.
>>>> If the awaitable has an operator co_await that returns the real awaiter,
>>>> the original awaitable is also stored in the frame, even if it    
>>>> is not referenced by the awaiter; another unused object gets stored if
>>>> 
>>>> the .await_transform() customization point was used.
>>>> 
>>>> What are your thoughts on the feasibility / difficulty of implementing
>>>> such an optimization?
>>>> 
>>>> Michal
>>>> 
>>>>>> 
>>>>>> Do you have any advice / opinions on this before I try to
>>>>>> implement it?
>>>>> 
>>>>> Hopefully, the notes above help.
>>>>> 
>>>>> I will rebase my latest code changes as soon as I have a chance and
>>>>> put them somewhere
>>>>> for you to look at - basically, these are to try and address the
>>>>> correctness issues we face,
>>>>> 
>>>>> Iain
>>>>> 
>>>>> 
>>>>>> 
>>>>>> Michal
>>>>>> 
>>>>>> On Jul 12 2022, at 4:08 pm, Iain Sandoe <iain@sandoe.co.uk> wrote:
>>>>>> 
>>>>>>> Hi Michal,
>>>>>>> 
>>>>>>>> On 12 Jul 2022, at 14:35, Michal Jankovič via Gcc-patches
>>>>>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>>>>>> 
>>>>>>>> Currently, coroutine frames store all variables of a coroutine separately,
>>>>>>>> even if their lifetime does not overlap (they are in distinct
>>>>>>>> scopes). This
>>>>>>>> patch implements overlapping distinct variable scopes in the
>>>>>>>> coroutine frame,
>>>>>>>> by storing the frame fields in nested unions of structs. This lowers
>>>>>>>> the size
>>>>>>>> of the frame for larger coroutines significantly, and makes them
>>>>>>>> more usable
>>>>>>>> on systems with limited memory.
>>>>>>> 
>>>>>>> not a review (I will try to take a look at the weekend).
>>>>>>> 
>>>>>>> but … this is one of the two main optimisations on my TODO - so cool
>>>>>>> for doing it.
>>>>>>> 
>>>>>>> (the other related optimisation is to eliminate frame entries for
>>>>>>> scopes without any suspend
>>>>>>> points - which has the potential to save even more space for code with
>>>>>>> sparse use of co_xxxx)
>>>>>>> 
>>>>>>> Iain
>>>>>>> 
>>>>>>>> Bootstrapped and regression tested on x86_64-pc-linux-gnu; new
>>>>>>>> test fails
>>>>>>>> before the patch and succeeds after with no regressions.
>>>>>>>> 
>>>>>>>> 	PR c++/105989
>>>>>>>> 
>>>>>>>> gcc/cp/ChangeLog:
>>>>>>>> 
>>>>>>>> 	* coroutines.cc (struct local_var_info): Add field_access_path.
>>>>>>>> 	(build_local_var_frame_access_expr): New.
>>>>>>>> 	(transform_local_var_uses): Use build_local_var_frame_access_expr.
>>>>>>>> 	(coro_make_frame_entry_id): New.
>>>>>>>> 	(coro_make_frame_entry): Delegate to coro_make_frame_entry_id.
>>>>>>>> 	(struct local_vars_frame_data): Add orig, field_access_path.
>>>>>>>> 	(register_local_var_uses): Generate new frame layout. Create access
>>>>>>>> 	paths to vars.
>>>>>>>> 	(morph_fn_to_coro): Set new fields in local_vars_frame_data.     
>>>>>>>> 
>>>>>>>> gcc/testsuite/ChangeLog:
>>>>>>>> 
>>>>>>>> 	* g++.dg/coroutines/pr105989.C: New test.
>>>>>>>> 
>>>>>>>> <pr105989.patch>
>>>>>>> 
>>>>>>> 
>>>>> 
>>>>> 
>>> <pr105989.patch>
>> 
>> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-05-14 18:51 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-12 13:35 [PATCH] c++: coroutines - Overlap variables in frame [PR105989] Michal Jankovič
2022-07-12 14:08 ` Iain Sandoe
2022-07-12 15:14   ` Michal Jankovič
2022-07-12 17:11     ` Iain Sandoe
2022-07-13 12:54       ` Michal Jankovic
2023-05-14 15:36         ` Michal Jankovič
2023-05-14 16:07           ` Iain Sandoe
2023-05-14 16:31             ` Michal Jankovič
2023-05-14 18:51               ` Iain Sandoe

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).