public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Tobias Burnus <tobias@codesourcery.com>
To: Andrew Stubbs <ams@codesourcery.com>,
	gcc-patches <gcc-patches@gcc.gnu.org>
Subject: Re: [Patch] gcn: Add __builtin_gcn_{get_stack_limit,first_call_this_thread_p}
Date: Mon, 21 Nov 2022 14:41:48 +0100	[thread overview]
Message-ID: <2249a729-d961-3939-bc63-8c7564e03dd0@codesourcery.com> (raw)
In-Reply-To: <d6f80343-3b75-e1a5-8773-f8d30bdf16c0@codesourcery.com>

[-- Attachment #1: Type: text/plain, Size: 1493 bytes --]

On 19.11.22 11:46, Tobias Burnus wrote:
>> +       stacklimit = stackbase + seg_size*64;
> (this should be '*seg_size' not 'seg_size' and the name should be
> s/seg_size/seg_size_ptr/.)
I have updated the comment and ...
> (Reading it, I think it should be '..._MEM(SImode,' and
> '..._MULT(SImode' instead of DImode.)
Additionally, there was a problem of bytes vs. bits in:
> My understanding is that
> dispatch_ptr->private_segment_size == *((char*)dispatch_ptr + 192)

which is wrong - it's 192 bits but only 24 bytes!

Finally, in the first_call_this_thread_p() call, I mixed up EQ vs. NE in one place.

BTW: It seems as if there is no problem with zero extension, if I look at the assembler result.

Updated version. Consists of: GCC patch adding the builtins,
the newlib patch using those (unchanged; used for testing + to be submitted), and
a 'test.c' using the builtins and its dump produced with amdgcn's
'cc1 -O2' to show the resulting assembly.

Tested with libgomp on gfx908 offloading and getting only the known fails:
(libgomp.c-c++-common/teams-2.c, libgomp.fortran/async_io_*.f90,
libgomp.oacc-c-c++-common/{deep-copy-10.c,static-variable-1.c,vprop.c})

OK for mainline?

Tobias
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955

[-- Attachment #2: gcn-stack-init-v2.diff --]
[-- Type: text/x-patch, Size: 5430 bytes --]

gcn: Add __builtin_gcn_{get_stack_limit,first_call_this_thread_p}

The new builtins have been added for newlib to reduce the dependency on
compiler-internal implementation choices of GCC in newlib's getreent.c.

gcc/ChangeLog:

	* config/gcn/gcn-builtins.def (FIRST_CALL_THIS_THREAD_P,
	GET_STACK_LIMIT): Add new builtins.
	* config/gcn/gcn.cc (gcn_expand_builtin_1): Expand them.
	* config/gcn/gcn.md (prologue_use): Add "register_operand" as
	arg to match_operand.
	(prologue_use_di): New; DI insn_and_split variant of the former.

Co-Authored-By: Andrew Stubbs <ams@codesourcery.com>

 gcc/config/gcn/gcn-builtins.def |  4 +++
 gcc/config/gcn/gcn.cc           | 70 ++++++++++++++++++++++++++++++++++++++++-
 gcc/config/gcn/gcn.md           | 15 ++++++++-
 3 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/gcc/config/gcn/gcn-builtins.def b/gcc/config/gcn/gcn-builtins.def
index eeeaebf9013..f1cf30bbc94 100644
--- a/gcc/config/gcn/gcn-builtins.def
+++ b/gcc/config/gcn/gcn-builtins.def
@@ -160,8 +160,12 @@ DEF_BUILTIN (ACC_BARRIER, -1, "acc_barrier", B_INSN, _A1 (GCN_BTI_VOID),
 
 /* Kernel inputs.  */
 
+DEF_BUILTIN (FIRST_CALL_THIS_THREAD_P, -1, "first_call_this_thread_p", B_INSN,
+	     _A1 (GCN_BTI_BOOL), gcn_expand_builtin_1)
 DEF_BUILTIN (KERNARG_PTR, -1, "kernarg_ptr", B_INSN, _A1 (GCN_BTI_VOIDPTR),
 	     gcn_expand_builtin_1)
+DEF_BUILTIN (GET_STACK_LIMIT, -1, "get_stack_limit", B_INSN,
+	     _A1 (GCN_BTI_VOIDPTR), gcn_expand_builtin_1)
 
 #undef _A1
 #undef _A2
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index b3814c2e7c6..ea9631e8823 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -4493,6 +4493,45 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
       emit_insn (gen_gcn_wavefront_barrier ());
       return target;
 
+    case GCN_BUILTIN_GET_STACK_LIMIT:
+      {
+	/* stackbase = (stack_segment_decr & 0x0000ffffffffffff)
+			+ stack_wave_offset;
+	   seg_size = dispatch_ptr->private_segment_size;
+	   stacklimit = stackbase + seg_size*64;
+	   with seg_size = *(uint32_t *) ((char *) dispatch_ptr
+				       + 6*sizeof(int16_t) + 3*sizeof(int32_t));
+	   cf. struct hsa_kernel_dispatch_packet_s in the HSA doc.  */
+	rtx ptr;
+	if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0
+	    && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
+	  {
+	    rtx size_rtx = gen_rtx_REG (DImode,
+			     cfun->machine->args.reg[DISPATCH_PTR_ARG]);
+	    size_rtx = gen_rtx_MEM (SImode,
+				    gen_rtx_PLUS (DImode, size_rtx,
+						  GEN_INT (6*2 + 3*4)));
+	    size_rtx = gen_rtx_MULT (SImode, size_rtx, GEN_INT (64));
+
+	    ptr = gen_rtx_REG (DImode,
+		    cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]);
+	    ptr = gen_rtx_AND (DImode, ptr, GEN_INT (0x0000ffffffffffff));
+	    ptr = gen_rtx_PLUS (DImode, ptr, size_rtx);
+	    if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0)
+	      {
+		rtx off;
+		off = gen_rtx_REG (SImode,
+		      cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]);
+		ptr = gen_rtx_PLUS (DImode, ptr, off);
+	      }
+	  }
+	else
+	  {
+	    ptr = gen_reg_rtx (DImode);
+	    emit_move_insn (ptr, const0_rtx);
+	  }
+	return ptr;
+      }
     case GCN_BUILTIN_KERNARG_PTR:
       {
 	rtx ptr;
@@ -4506,7 +4545,36 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ ,
 	  }
 	return ptr;
       }
-
+    case GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P:
+      {
+	/* Stash a marker in the unused upper 16 bits of s[0:1] to indicate
+	   whether it was the first call.  */
+	rtx result = gen_reg_rtx (BImode);
+	emit_move_insn (result, const0_rtx);
+	if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0)
+	  {
+	    rtx not_first = gen_label_rtx ();
+	    rtx reg = gen_rtx_REG (DImode,
+			cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]);
+	    rtx cmp = force_reg (DImode,
+				 gen_rtx_LSHIFTRT (DImode, reg, GEN_INT (48)));
+	    emit_insn (gen_cstoresi4 (result, gen_rtx_NE (BImode, cmp,
+							  GEN_INT(12345)),
+				      cmp, GEN_INT(12345)));
+	    emit_jump_insn (gen_cjump (not_first, gen_rtx_EQ (BImode, result,
+							      const0_rtx),
+				       result));
+	    emit_move_insn (reg,
+	      force_reg (DImode,
+		gen_rtx_IOR (DImode,
+			     gen_rtx_AND (DImode, reg,
+					  GEN_INT (0x0000ffffffffffffL)),
+			     GEN_INT (12345L << 48))));
+	    emit_insn (gen_prologue_use (reg));
+	    emit_label (not_first);
+	  }
+	return result;
+      }
     default:
       gcc_unreachable ();
     }
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index 987b76396cc..a8b9c28d115 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -692,11 +692,24 @@
 ;; {{{ Prologue/Epilogue
 
 (define_insn "prologue_use"
-  [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
+  [(unspec_volatile [(match_operand 0 "register_operand")] UNSPECV_PROLOGUE_USE)]
   ""
   ""
   [(set_attr "length" "0")])
 
+(define_insn_and_split "prologue_use_di"
+  [(unspec_volatile [(match_operand:DI 0 "register_operand")] UNSPECV_PROLOGUE_USE)]
+  ""
+  "#"
+  "reload_completed"
+  [(unspec_volatile [(match_dup 0)] UNSPECV_PROLOGUE_USE)
+   (unspec_volatile [(match_dup 1)] UNSPECV_PROLOGUE_USE)]
+  {
+    operands[1] = gcn_operand_part (DImode, operands[0], 1);
+    operands[0] = gcn_operand_part (DImode, operands[0], 0);
+  }
+  [(set_attr "length" "0")])
+
 (define_expand "prologue"
   [(const_int 0)]
   ""

[-- Attachment #3: newlib-reent.diff --]
[-- Type: text/x-patch, Size: 2594 bytes --]

amdgcn: Use __builtin_gcn_ in libc/machine/amdgcn/getreent.c

Call __builtin_gcn_get_stack_limit and __builtin_gcn_first_call_this_thread_p
to reduce dependency on some register/layout assumptions by using the new
GCC mainline (GCC 13) builtins, if they are available. If not, the existing
code is used.

 newlib/libc/machine/amdgcn/getreent.c | 38 ++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/newlib/libc/machine/amdgcn/getreent.c b/newlib/libc/machine/amdgcn/getreent.c
index be7d2edc9..ef731f649 100644
--- a/newlib/libc/machine/amdgcn/getreent.c
+++ b/newlib/libc/machine/amdgcn/getreent.c
@@ -29,22 +29,42 @@ typedef struct hsa_kernel_dispatch_packet_s {
 struct _reent *
 __getreent (void)
 {
-  /* Place the reent data at the top of the stack allocation.
-     s[0:1] contains a 48-bit private segment base address.
+  /* Place the reent data at the top of the stack allocation.  */
+  struct data {
+    int marker;
+    struct _reent reent;
+  } *data;
+
+#if defined(__has_builtin) \
+    && __has_builtin(__builtin_gcn_get_stack_limit) \
+    && __has_builtin(__builtin_gcn_first_call_this_thread_p)
+  unsigned long addr = (((unsigned long) __builtin_gcn_get_stack_limit()
+			 - sizeof(struct data)) & ~7);
+  data = (struct data *)addr;
+
+  register long sp asm("s16");
+
+  if (sp >= addr)
+    goto stackoverflow;
+  if (__builtin_gcn_first_call_this_thread_p())
+    {
+      data->marker = 12345;
+      __builtin_memset (&data->reent, 0, sizeof(struct _reent));
+      _REENT_INIT_PTR_ZEROED (&data->reent);
+    }
+  else if (data->marker != 12345)
+    goto stackoverflow;
+#else
+  /* s[0:1] contains a 48-bit private segment base address.
      s11 contains the offset to the base of the stack.
      s[4:5] contains the dispatch pointer.
-     
+
      WARNING: this code will break if s[0:1] is ever used for anything!  */
   const register unsigned long buffer_descriptor asm("s0");
   unsigned long private_segment = buffer_descriptor & 0x0000ffffffffffff;
   const register unsigned int stack_offset asm("s11");
   const register hsa_kernel_dispatch_packet_t *dispatch_ptr asm("s4");
 
-  struct data {
-    int marker;
-    struct _reent reent;
-  } *data;
-
   unsigned long stack_base = private_segment + stack_offset;
   unsigned long stack_end = stack_base + dispatch_ptr->private_segment_size * 64;
   unsigned long addr = (stack_end - sizeof(struct data)) & ~7;
@@ -69,7 +89,7 @@ __getreent (void)
     }
   else if (data->marker != 12345)
     goto stackoverflow;
-
+#endif
 
   return &data->reent;
 

[-- Attachment #4: test.c --]
[-- Type: text/x-csrc, Size: 325 bytes --]

volatile void *ptr;
void foo()
{
void *ptr2;
asm("; one");
ptr2 = __builtin_gcn_get_stack_limit() ;
asm("; two");
ptr = ptr2 + 1234;
}

volatile int b;
void bar()
{
  int v;
asm("; three");
  v = __builtin_gcn_first_call_this_thread_p ();
asm("; four");
  b = v;
asm("; five");
if (v)
  asm(";true");
else
  asm(";false");
}

[-- Attachment #5: test.s --]
[-- Type: text/plain, Size: 4981 bytes --]

	.amdgcn_target "amdgcn-unknown-amdhsa--gfx803"
	.text
	.align	4
	.globl	foo


	.type	foo,@function
foo:
.LFB0:
	; using flat addressing in function
	; frame pointer needed: true
	; lr needs saving: false
	; outgoing args size: 0
	; pretend size: 0
	; local vars size: 0
	; callee save size: 8
	v_writelane_b32 v6, s14, 0
	v_writelane_b32 v6, s15, 1
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_store_dword	v[4:5], v6
	s_add_u32	s16, s16, 8
	s_addc_u32	s17, s17, 0
.LCFI0:
	s_add_u32	s14, s16, 0
.LCFI1:
	s_addc_u32	s15, s17, 0
	s_mov_b32	m0, 65536
; 5 "test.c" 1
	; one
; 0 "" 2
	s_mov_b32	s2, -1
	s_mov_b32	s3, 65535
	s_and_b64	s[2:3], s[0:1], s[2:3]
	s_add_u32	s12, s4, 24
	s_addc_u32	s13, s5, 0
	v_writelane_b32	v4, s12, 0
	v_writelane_b32	v5, s13, 0
	s_mov_b64	exec, 1
	flat_load_dword	v0, v[4:5]
	s_waitcnt	0
	v_lshlrev_b32	v0, 6, v0
	v_readlane_b32	s12, v0, 0
	s_mov_b32	s13, 0
	s_add_u32	s2, s2, s12
	s_addc_u32	s3, s3, s13
	s_mov_b32	s13, 0
	s_add_u32	s2, s2, s11
	s_addc_u32	s3, s3, s13
; 7 "test.c" 1
	; two
; 0 "" 2
	s_getpc_b64	s[12:13]
	s_add_u32	s12, s12, ptr@rel32@lo+4
	s_addc_u32	s13, s13, ptr@rel32@hi+4
	s_add_u32	s2, s2, 1234
	s_addc_u32	s3, s3, 0
	v_writelane_b32	v4, s12, 0
	v_writelane_b32	v5, s13, 0
	v_writelane_b32	v6, s2, 0
	v_writelane_b32	v7, s3, 0
	flat_store_dwordx2	v[4:5], v[6:7]
	s_sub_u32	s16, s14, 8
	s_subb_u32	s17, s15, 0
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_load_dword	v6, v[4:5]
	s_waitcnt	0
	v_readlane_b32 s14, v6, 0
	v_readlane_b32 s15, v6, 1
	s_setpc_b64	s[18:19]
.LFE0:
	.size	foo, .-foo
	.align	4
	.globl	bar


	.type	bar,@function
bar:
.LFB1:
	; using flat addressing in function
	; frame pointer needed: true
	; lr needs saving: false
	; outgoing args size: 0
	; pretend size: 0
	; local vars size: 0
	; callee save size: 8
	v_writelane_b32 v6, s14, 0
	v_writelane_b32 v6, s15, 1
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_store_dword	v[4:5], v6
	s_add_u32	s16, s16, 8
	s_addc_u32	s17, s17, 0
.LCFI2:
	s_add_u32	s14, s16, 0
.LCFI3:
	s_addc_u32	s15, s17, 0
	s_mov_b32	m0, 65536
; 15 "test.c" 1
	; three
; 0 "" 2
	s_lshr_b64	s[2:3], s[0:1], 48
	s_cmp_lg_u64	s[2:3], 12345
	s_mov_b32	s2, scc
	s_mov_b32	vcc_lo, scc
	s_mov_b32	vcc_hi, 0
	s_cbranch_vccz	.L4
	v_writelane_b32	v4, s0, 0
	v_writelane_b32	v5, s1, 0
	s_mov_b64	exec, 1
	v_and_b32	v4, -1, v4
	v_and_b32	v5, 65535, v5
	v_or_b32	v4, 0, v4
	v_or_b32	v5, 809041920, v5
.L4:
	s_lshl_b32	s2, s2, 31
	s_lshr_b32	s2, s2, 31
; 17 "test.c" 1
	; four
; 0 "" 2
	s_getpc_b64	s[12:13]
	s_add_u32	s12, s12, b@rel32@lo+4
	s_addc_u32	s13, s13, b@rel32@hi+4
	v_writelane_b32	v4, s12, 0
	v_writelane_b32	v5, s13, 0
	v_writelane_b32	v0, s2, 0
	s_mov_b64	exec, 1
	flat_store_dword	v[4:5], v0 glc
; 19 "test.c" 1
	; five
; 0 "" 2
	s_cmp_eq_u32	s2, 0
	s_cbranch_scc1	.L5
; 21 "test.c" 1
	;true
; 0 "" 2
.L3:
	s_sub_u32	s16, s14, 8
	s_subb_u32	s17, s15, 0
	s_mov_b64	exec, -1
	v_lshlrev_b32	v3, 2, v1
	v_mov_b32	v4, s16
	v_mov_b32	v5, s17
	v_add_u32	v4, vcc, v3, v4
	v_addc_u32	v5, vcc, 0, v5, vcc
	s_mov_b64	exec, 3
	flat_load_dword	v6, v[4:5]
	s_waitcnt	0
	v_readlane_b32 s14, v6, 0
	v_readlane_b32 s15, v6, 1
	s_setpc_b64	s[18:19]
.L5:
; 23 "test.c" 1
	;false
; 0 "" 2
	s_branch	.L3
.LFE1:
	.size	bar, .-bar
	.globl	b
	.bss
	.align	16
	.type	b, @object
	.size	b, 4
b:
	.zero	4
	.globl	ptr
	.align	16
	.type	ptr, @object
	.size	ptr, 8
ptr:
	.zero	8
	.section	.debug_frame,"",@progbits
.Lframe0:
	.4byte	.LECIE0-.LSCIE0
.LSCIE0:
	.4byte	0xffffffff
	.byte	0x3
	.string	""
	.byte	0x1
	.byte	0x4
	.byte	0x10
	.byte	0xf
	.byte	0xa
	.byte	0x92
	.byte	0x31
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x30
	.byte	0
	.byte	0x22
	.byte	0x10
	.byte	0x10
	.byte	0xa
	.byte	0x92
	.byte	0x33
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x32
	.byte	0
	.byte	0x22
	.align	8
.LECIE0:
.LSFDE0:
	.4byte	.LEFDE0-.LASFDE0
.LASFDE0:
	.4byte	.Lframe0
	.8byte	.LFB0
	.8byte	.LFE0-.LFB0
	.byte	0x4
	.4byte	.LCFI0-.LFB0
	.byte	0xae
	.byte	0
	.byte	0xaf
	.byte	0x1
	.byte	0x4
	.4byte	.LCFI1-.LCFI0
	.byte	0xf
	.byte	0xc
	.byte	0x92
	.byte	0x2f
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x2e
	.byte	0
	.byte	0x22
	.byte	0x38
	.byte	0x1c
	.align	8
.LEFDE0:
.LSFDE2:
	.4byte	.LEFDE2-.LASFDE2
.LASFDE2:
	.4byte	.Lframe0
	.8byte	.LFB1
	.8byte	.LFE1-.LFB1
	.byte	0x4
	.4byte	.LCFI2-.LFB1
	.byte	0xae
	.byte	0
	.byte	0xaf
	.byte	0x1
	.byte	0x4
	.4byte	.LCFI3-.LCFI2
	.byte	0xf
	.byte	0xc
	.byte	0x92
	.byte	0x2f
	.byte	0
	.byte	0x8
	.byte	0x20
	.byte	0x24
	.byte	0x92
	.byte	0x2e
	.byte	0
	.byte	0x22
	.byte	0x38
	.byte	0x1c
	.align	8
.LEFDE2:
	.ident	"GCC: (GNU) 13.0.0 20221121 (experimental)"

  parent reply	other threads:[~2022-11-21 13:41 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-11-18 17:20 Tobias Burnus
2022-11-18 17:49 ` Andrew Stubbs
2022-11-19 10:46   ` Tobias Burnus
2022-11-20  0:23     ` Andrew Stubbs
2022-11-21 13:41     ` Tobias Burnus [this message]
2022-11-21 14:58       ` Stubbs, Andrew

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2249a729-d961-3939-bc63-8c7564e03dd0@codesourcery.com \
    --to=tobias@codesourcery.com \
    --cc=ams@codesourcery.com \
    --cc=gcc-patches@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).