From 3f5524adacff23710cf1cab393a56bf23853cafa Mon Sep 17 00:00:00 2001 From: Thomas Schwinge Date: Wed, 21 Dec 2022 21:25:19 +0100 Subject: [PATCH] [WIP] nvptx: '-mframe-malloc-threshold', '-Wframe-malloc-threshold' --- gcc/config/nvptx/nvptx.cc | 102 ++++++++++++++++-- gcc/config/nvptx/nvptx.h | 3 + gcc/config/nvptx/nvptx.opt | 12 +++ gcc/doc/invoke.texi | 16 ++- .../nvptx/frame-malloc-threshold-1.c | 29 +++++ .../nvptx/frame-malloc-threshold-2.c | 13 +++ .../nvptx/frame-malloc-threshold-3.c | 14 +++ .../nvptx/frame-malloc-threshold-4.c | 16 +++ .../nvptx/frame-malloc-threshold-5.c | 15 +++ .../nvptx/frame-malloc-threshold-6.c | 15 +++ .../nvptx/frame-malloc-threshold-7.c | 15 +++ 11 files changed, 240 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-1.c create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-2.c create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-3.c create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-4.c create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-5.c create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-6.c create mode 100644 gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-7.c diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc index b93a253ab318..2efd70595991 100644 --- a/gcc/config/nvptx/nvptx.cc +++ b/gcc/config/nvptx/nvptx.cc @@ -178,6 +178,16 @@ static hash_map gang_private_shared_hmap; /* Global lock variable, needed for 128bit worker & gang reductions. */ static GTY(()) tree global_lock_var; +/* True if any function 'has_malloc_frame'. + Because of 'nvptx_name_replacement', we can't just: + nvptx_record_fndecl (builtin_decl_explicit (BUILT_IN_FREE)); + nvptx_record_fndecl (builtin_decl_explicit (BUILT_IN_MALLOC)); + ..., but instead have to track them individually. 
+*/ +static bool need_free_malloc_decl; +static bool have_free_decl; +static bool have_malloc_decl; + /* True if any function references __nvptx_stacks. */ static bool need_softstack_decl; static bool have_softstack_decl; @@ -976,6 +986,11 @@ write_fn_marker (std::stringstream &s, bool is_defn, bool globalize, s << " GLOBAL"; s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: "); s << name << "\n"; + + if (strcmp (name, "free") == 0) + have_free_decl = true; + else if (strcmp (name, "malloc") == 0) + have_malloc_decl = true; } /* Emit a linker marker for a variable decl or defn. */ @@ -1231,22 +1246,66 @@ nvptx_maybe_record_fnsym (rtx sym) nvptx_record_needed_fndecl (decl); } +//TODO Update the comment below to also cover the 'malloc' frame path taken once SIZE reaches 'nvptx_frame_malloc_threshold'. /* Emit a local array to hold some part of a conventional stack frame and initialize REGNO to point to it. If the size is zero, it'll never be valid to dereference, so we can simply initialize to zero. */ static void -init_frame (FILE *file, int regno, unsigned align, unsigned size) +init_frame (FILE *file, int regno, int align, HOST_WIDE_INT size) { - if (size) - fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n", - align, reg_names[regno], size); fprintf (file, "\t.reg.u%d %s;\n", POINTER_SIZE, reg_names[regno]); - fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n" - : "\tmov.u%d %s, 0;\n"), - POINTER_SIZE, reg_names[regno], reg_names[regno]); + + if (regno == FRAME_POINTER_REGNUM + && ((unsigned HOST_WIDE_INT) size + >= (unsigned HOST_WIDE_INT) nvptx_frame_malloc_threshold)) + { + warning_at (DECL_SOURCE_LOCATION (current_function_decl), + OPT_Wframe_malloc_threshold, + "using %<malloc%> for frame with size of %wu bytes", size); + + /* + (2022-12-21, v12.0) states that in addition to the "in-kernel + 'malloc()' function" there also exists an "in-kernel + '__nv_aligned_device_malloc()' function", where "the address of the + allocated memory will be a multiple of 'align'". However that's not + documented on + + (2022-12-21, v12.0), so we shall not use that function. 
*/ + /* + (2022-12-21, v12.0) does not, but + + (2022-12-21, v12.0) does state that the pointer returned by + "in-kernel 'malloc()' [...] is guaranteed to be aligned to a + 16-byte boundary". */ + if (align > 16) + sorry ("unfulfilled %d bytes alignment for frame", align); + + /* We don't need to support 'realloc', so instead of newlib 'malloc' + directly use the PTX 'malloc'. */ + fprintf (file, + "\t{\n" + "\t .param .u64 %%ptr;\n" + "\t .param .u64 %%size;\n" + "\t st.param.u64 [%%size], " HOST_WIDE_INT_PRINT_DEC ";\n" + "\t call (%%ptr), malloc, (%%size);\n" + "\t ld.param.u64 %s, [%%ptr];\n" + "\t}\n", + size, reg_names[regno]); + cfun->machine->has_malloc_frame = true; + need_free_malloc_decl = true; + } + else + { + if (size) + fprintf (file, "\t.local .align %d .b8 %s_ar[" HOST_WIDE_INT_PRINT_DEC "];\n", + align, reg_names[regno], size); + fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n" + : "\tmov.u%d %s, 0;\n"), + POINTER_SIZE, reg_names[regno], reg_names[regno]); + } } /* Emit soft stack frame setup sequence. */ @@ -1744,12 +1803,22 @@ nvptx_output_set_softstack (unsigned src_regno) } return ""; } + /* Output a return instruction. Also copy the return value to its outgoing location. */ const char * nvptx_output_return (void) { + if (cfun->machine->has_malloc_frame) + fprintf (asm_out_file, + "\t{\n" + "\t .param .u64 %%ptr;\n" + "\t st.param.u64 [%%ptr], %s;\n" + "\t call free, (%%ptr);\n" + "\t}\n", + reg_names[FRAME_POINTER_REGNUM]); + machine_mode mode = (machine_mode)cfun->machine->return_mode; if (mode != VOIDmode) @@ -4470,8 +4539,8 @@ nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, rtx_code_label *label = NULL; empty = false; - /* The frame size might not be DImode compatible, but the frame - array's declaration will be. So it's ok to round up here. */ + /* The frame size might not be DImode-compatible, but the actual frame + allocated by 'init_frame' will be. So it's ok to round up here. 
*/ fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode); /* Detect single iteration loop. */ if (fs == 1) @@ -5989,6 +6058,21 @@ write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size) static void nvptx_file_end (void) { + if (need_free_malloc_decl) + { + if (!have_free_decl) + { + write_fn_marker (func_decls, false, true, "free"); + func_decls << ".extern .func free (.param .b64 %ptr);\n"; + } + if (!have_malloc_decl) + { + write_fn_marker (func_decls, false, true, "malloc"); + func_decls + << ".extern .func (.param .b64 %ptr) malloc (.param .b64 %size);\n"; + } + } + hash_table<tree_hasher>::iterator iter; tree decl; FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter) diff --git a/gcc/config/nvptx/nvptx.h b/gcc/config/nvptx/nvptx.h index bc1021a80317..82d695551090 100644 --- a/gcc/config/nvptx/nvptx.h +++ b/gcc/config/nvptx/nvptx.h @@ -214,6 +214,8 @@ struct nvptx_args { #define TRAMPOLINE_SIZE 32 #define TRAMPOLINE_ALIGNMENT 256 + +#define NVPTX_FRAME_MALLOC_THRESHOLD_INIT 257 /* We don't run reload, so this isn't actually used, but it still needs to be defined. Showing an argp->fp elimination also stops @@ -244,6 +246,7 @@ struct GTY(()) machine_function bool is_varadic; /* This call is varadic */ bool has_varadic; /* Current function has a varadic call. */ bool has_chain; /* Current function has outgoing static chain. */ + bool has_malloc_frame; /* Current function has a 'malloc'ed frame. */ bool has_softstack; /* Current function has a soft stack frame. */ bool has_simtreg; /* Current function has an OpenMP SIMD region. */ int num_args; /* Number of args of current call. */ diff --git a/gcc/config/nvptx/nvptx.opt b/gcc/config/nvptx/nvptx.opt index 71d3b68510bd..6ccd3defc776 100644 --- a/gcc/config/nvptx/nvptx.opt +++ b/gcc/config/nvptx/nvptx.opt @@ -28,6 +28,18 @@ Target RejectNegative Mask(ABI64) Ignored, but preserved for backward compatibility. Only 64-bit ABI is supported. 
+mframe-malloc-threshold= +Target Joined RejectNegative Host_Wide_Int ByteSize Var(nvptx_frame_malloc_threshold) Init(NVPTX_FRAME_MALLOC_THRESHOLD_INIT) +-mframe-malloc-threshold=<byte-size> When the frame size reaches <byte-size>, frame allocation switches from '.local' memory to 'malloc'. + +mno-frame-malloc-threshold +Target Alias(mframe-malloc-threshold=,18446744073709551615EiB,none) +Always use '.local' memory for frame allocation. Equivalent to -mframe-malloc-threshold=<SIZE_MAX> or larger. + +Wframe-malloc-threshold +Target Warning +Warn when the threshold is reached where frame allocation switches from '.local' memory to 'malloc'. + mmainkernel Target RejectNegative Link in code for a __main kernel. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 471309dfacfe..e3b6ea0fe4b8 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -1179,7 +1179,9 @@ Objective-C and Objective-C++ Dialects}. -march=@var{arch} -mbmx -mno-bmx -mcdx -mno-cdx} @emph{Nvidia PTX Options} -@gccoptlist{-m64 -mmainkernel -moptimize} +@gccoptlist{-m64 @gol +-mframe-malloc-threshold=@var{byte-size} @gol +-mmainkernel -moptimize} @emph{OpenRISC Options} @gccoptlist{-mboard=@var{name} -mnewlib -mhard-mul -mhard-div @gol @@ -28367,6 +28369,18 @@ This option sets the values of the preprocessor macros for instance, for @samp{3.1} the macros have the values @samp{3} and @samp{1}, respectively. +@item -mframe-malloc-threshold=@var{byte-size} +@opindex mframe-malloc-threshold= +@opindex mno-frame-malloc-threshold +Allocate the frame with @code{malloc} instead of in @code{.local} memory when the frame size is at least @var{byte-size}. + +This is not relevant if @code{-msoft-stack} is enabled. + +@option{-mframe-malloc-threshold=257} is enabled by default. +This may be disabled either by specifying +@var{byte-size} of @samp{SIZE_MAX} or more or by +@option{-mno-frame-malloc-threshold}. + @item -mmainkernel @opindex mmainkernel Link in code for a __main kernel. 
This is for stand-alone instead of diff --git a/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-1.c b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-1.c new file mode 100644 index 000000000000..b16c17bfdf99 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-1.c @@ -0,0 +1,29 @@ +/* { dg-do assemble } */ +/* { dg-options {-save-temps -O0} } */ +/* { dg-additional-options -Wframe-malloc-threshold } */ + +/* PTX-provided 'free', 'malloc'; cf. 'nvptx_name_replacement'. */ +void ptx_free (void *) __asm__ ("free"); +void *ptx_malloc (__SIZE_TYPE__) __asm__ ("malloc"); + +int f (void) +/* { dg-warning {using 'malloc' for frame with size of [0-9]+ bytes} {} { target *-*-* } .-1 } */ +{ + char a[1234]; + + ptx_malloc (5); + + ptx_free (ptx_malloc (1)); +} + +/* We exceed the default '-mframe-malloc-threshold=[...]'. + { dg-final { scan-assembler-not {%frame_ar} } } + { dg-final { scan-assembler-times {(?n)call free,.*;} 2 } } + { dg-final { scan-assembler-times {(?n)call .*, malloc, .*;} 3 } } +*/ + +/* Of the implicit (via 'need_free_malloc_decl') and explicit declarations of + 'free', 'malloc', only one is emitted each: + { dg-final { scan-assembler-times {(?n)\.extern .* free .*;} 1 } } + { dg-final { scan-assembler-times {(?n)\.extern .* malloc .*;} 1 } } +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-2.c b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-2.c new file mode 100644 index 000000000000..2f6a919eb1f1 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-2.c @@ -0,0 +1,13 @@ +/* { dg-do assemble } */ +/* { dg-options {-save-temps -O0} } */ + +int f (void) +{ + char a[1234]; +} + +/* We exceed the default '-mframe-malloc-threshold=[...]'. 
+ { dg-final { scan-assembler-not {%frame_ar} } } + { dg-final { scan-assembler-times {(?n)call free,.*;} 1 } } + { dg-final { scan-assembler-times {(?n)call .*, malloc, .*;} 1 } } +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-3.c b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-3.c new file mode 100644 index 000000000000..7434132b2ad5 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-3.c @@ -0,0 +1,14 @@ +/* { dg-do assemble } */ +/* { dg-options {-save-temps -O0} } */ +/* { dg-additional-options -Wframe-malloc-threshold } */ + +int f (void) +{ + char a[256]; +} + +/* We don't exceed the default '-mframe-malloc-threshold=[...]'. + { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } } + { dg-final { scan-assembler-not {free} } } + { dg-final { scan-assembler-not {malloc} } } +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-4.c b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-4.c new file mode 100644 index 000000000000..c4068ab7ad23 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-4.c @@ -0,0 +1,16 @@ +/* { dg-do assemble } */ +/* { dg-options {-save-temps -O0} } */ +/* { dg-additional-options -mframe-malloc-threshold=32 } */ +/* { dg-additional-options -Wframe-malloc-threshold } */ + +int f (void) +/* { dg-warning {using 'malloc' for frame with size of [0-9]+ bytes} {} { target *-*-* } .-1 } */ +{ + char a[32]; +} + +/* We exceed the specified '-mframe-malloc-threshold=[...]'. 
+ { dg-final { scan-assembler-not {%frame_ar} } } + { dg-final { scan-assembler-times {(?n)call free,.*;} 1 } } + { dg-final { scan-assembler-times {(?n)call .*, malloc, .*;} 1 } } +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-5.c b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-5.c new file mode 100644 index 000000000000..cc262427b03c --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-5.c @@ -0,0 +1,15 @@ +/* { dg-do assemble } */ +/* { dg-options {-save-temps -O0} } */ +/* { dg-additional-options -mframe-malloc-threshold=1249 } */ +/* { dg-additional-options -Wframe-malloc-threshold } */ + +int f (void) +{ + char a[1234]; +} + +/* We don't exceed the specified '-mframe-malloc-threshold=[...]'. + { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } } + { dg-final { scan-assembler-not {free} } } + { dg-final { scan-assembler-not {malloc} } } +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-6.c b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-6.c new file mode 100644 index 000000000000..72017ca2f439 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-6.c @@ -0,0 +1,15 @@ +/* { dg-do assemble } */ +/* { dg-options {-save-temps -O0} } */ +/* { dg-additional-options -mframe-malloc-threshold=2KiB } */ +/* { dg-additional-options -Wframe-malloc-threshold } */ + +int f (void) +{ + char a[1234]; +} + +/* We don't exceed the specified '-mframe-malloc-threshold=[...]'. 
+ { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } } + { dg-final { scan-assembler-not {free} } } + { dg-final { scan-assembler-not {malloc} } } +*/ diff --git a/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-7.c b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-7.c new file mode 100644 index 000000000000..b2f85a55f050 --- /dev/null +++ b/gcc/testsuite/gcc.target/nvptx/frame-malloc-threshold-7.c @@ -0,0 +1,15 @@ +/* { dg-do assemble } */ +/* { dg-options {-save-temps -O0} } */ +/* { dg-additional-options -mno-frame-malloc-threshold } */ +/* { dg-additional-options -Wframe-malloc-threshold } */ + +int f (void) +{ + char a[1234]; +} + +/* We'll never exceed the specified unlimited '-mframe-malloc-threshold=[...]'. + { dg-final { scan-assembler-times {(?n)cvta\.local\.u64 %frame, %frame_ar;} 1 } } + { dg-final { scan-assembler-not {free} } } + { dg-final { scan-assembler-not {malloc} } } +*/ -- 2.35.1