From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1461) id E60D93858412; Mon, 24 Oct 2022 16:19:48 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org E60D93858412 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1666628388; bh=LV3oysLooPTGQawSJD0NvdmUNovaNxDiN0WLhQ2ykH8=; h=From:To:Subject:Date:From; b=N2qutf0CjOQYhUC1U5gm2hNe51rSWBv4rta8yDdNwYwXPAXGWHwJUy81xfe1BjqU6 OfDt4zGloThdmB6lODBAazA2kCyPnzfmb1IGsSsD/AXljyLrQ51wkPHkHW0GBb47n7 qQftTnDTCMjTVMIC7lqYMxGWTUfjN8GB9geIEUz0= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Andrew Stubbs To: gcc-cvs@gcc.gnu.org Subject: [gcc/devel/omp/gcc-12] amdgcn, libgomp: USM allocation update X-Act-Checkin: gcc X-Git-Author: Andrew Stubbs X-Git-Refname: refs/heads/devel/omp/gcc-12 X-Git-Oldrev: c682c50354d95a5b118bbedb5eba8d222ada268b X-Git-Newrev: 6ec2c29dbbc19e7d2a8f991a5848e10c65c7c74c Message-Id: <20221024161948.E60D93858412@sourceware.org> Date: Mon, 24 Oct 2022 16:19:48 +0000 (GMT) List-Id: https://gcc.gnu.org/g:6ec2c29dbbc19e7d2a8f991a5848e10c65c7c74c commit 6ec2c29dbbc19e7d2a8f991a5848e10c65c7c74c Author: Andrew Stubbs Date: Sat Oct 15 23:38:50 2022 +0100 amdgcn, libgomp: USM allocation update Allocate Unified Shared Memory via malloc and hsa_amd_svm_attributes_set, instead of hsa_allocate_memory. This scheme should be more efficient for memory that is first accessed by the CPU. libgomp/ChangeLog: * plugin/plugin-gcn.c (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED): New. (HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT): New. (HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG): New. (HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED): New. (hsa_amd_svm_attribute_pair_t): New. (struct hsa_runtime_fn_info): Add hsa_amd_svm_attributes_set_fn. (dump_hsa_system_info): Dump HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED and HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT. (DLSYM_OPT_FN): New. 
(init_hsa_runtime_functions): Add hsa_amd_svm_attributes_set. (GOMP_OFFLOAD_usm_alloc): Use malloc and hsa_amd_svm_attributes_set. (GOMP_OFFLOAD_usm_free): Use regular free. * testsuite/libgomp.c/usm-1.c: Add -mxnack=on for amdgcn. * testsuite/libgomp.c/usm-2.c: Likewise. * testsuite/libgomp.c/usm-3.c: Likewise. * testsuite/libgomp.c/usm-4.c: Likewise. Diff: --- gcc/ChangeLog.omp | 19 +++++++++++ libgomp/plugin/plugin-gcn.c | 68 ++++++++++++++++++++++++++++++++++--- libgomp/testsuite/libgomp.c/usm-1.c | 1 + libgomp/testsuite/libgomp.c/usm-2.c | 1 + libgomp/testsuite/libgomp.c/usm-3.c | 1 + libgomp/testsuite/libgomp.c/usm-4.c | 1 + 6 files changed, 86 insertions(+), 5 deletions(-) diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index 8fc8e06e9ff..1e05594a4c6 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,22 @@ +2022-10-24 Andrew Stubbs + + * plugin/plugin-gcn.c (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED): New. + (HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT): New. + (HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG): New. + (HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED): New. + (hsa_amd_svm_attribute_pair_t): New. + (struct hsa_runtime_fn_info): Add hsa_amd_svm_attributes_set_fn. + (dump_hsa_system_info): Dump HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED and + HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT. + (DLSYM_OPT_FN): New. + (init_hsa_runtime_functions): Add hsa_amd_svm_attributes_set. + (GOMP_OFFLOAD_usm_alloc): Use malloc and hsa_amd_svm_attributes_set. + (GOMP_OFFLOAD_usm_free): Use regular free. + * testsuite/libgomp.c/usm-1.c: Add -mxnack=on for amdgcn. + * testsuite/libgomp.c/usm-2.c: Likewise. + * testsuite/libgomp.c/usm-3.c: Likewise. + * testsuite/libgomp.c/usm-4.c: Likewise. 
+ 2022-10-24 Tobias Burnus Backported from master: diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index dd493f63912..4871a6a793b 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -113,6 +113,16 @@ struct gcn_thread int async; }; +/* TEMPORARY IMPORT, UNTIL hsa_ext_amd.h GETS UPDATED. */ +const static int HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201; +const static int HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202; +const static int HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0; +const static int HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1; +typedef struct hsa_amd_svm_attribute_pair_s { + uint64_t attribute; + uint64_t value; +} hsa_amd_svm_attribute_pair_t; + /* As an HSA runtime is dlopened, following structure defines function pointers utilized by the HSA plug-in. */ @@ -195,6 +205,9 @@ struct hsa_runtime_fn_info hsa_status_t (*hsa_code_object_deserialize_fn) (void *serialized_code_object, size_t serialized_code_object_size, const char *options, hsa_code_object_t *code_object); + hsa_status_t (*hsa_amd_svm_attributes_set_fn) + (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); }; /* Structure describing the run-time and grid properties of an HSA kernel @@ -720,6 +733,24 @@ dump_hsa_system_info (void) } else GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n"); + + bool svm_supported; + status = hsa_fns.hsa_system_get_info_fn + (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &svm_supported); + if (status == HSA_STATUS_SUCCESS) + GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: %s\n", + (svm_supported ? "TRUE" : "FALSE")); + else + GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: FAILED\n"); + + bool svm_accessible; + status = hsa_fns.hsa_system_get_info_fn + (HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible); + if (status == HSA_STATUS_SUCCESS) + GCN_DEBUG ("HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: %s\n", + (svm_accessible ? 
"TRUE" : "FALSE")); + else + GCN_WARNING ("HSA_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: FAILED\n"); } /* Dump information about the available hardware. */ @@ -1361,6 +1392,8 @@ init_hsa_runtime_functions (void) hsa_fns.function##_fn = dlsym (handle, #function); \ if (hsa_fns.function##_fn == NULL) \ return false; +#define DLSYM_OPT_FN(function) \ + hsa_fns.function##_fn = dlsym (handle, #function); void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY); if (handle == NULL) return false; @@ -1395,6 +1428,7 @@ init_hsa_runtime_functions (void) DLSYM_FN (hsa_signal_load_acquire) DLSYM_FN (hsa_queue_destroy) DLSYM_FN (hsa_code_object_deserialize) + DLSYM_OPT_FN (hsa_amd_svm_attributes_set) return true; #undef DLSYM_FN } @@ -3886,15 +3920,38 @@ static struct usm_splay_tree_s usm_map = { NULL }; /* Allocate memory suitable for Unified Shared Memory. - In fact, AMD memory need only be "coarse grained", which target - allocations already are. We do need to track allocations so that - GOMP_OFFLOAD_is_usm_ptr can look them up. */ + Normal heap memory is already enabled for USM, but by default it is "fine- + grained" memory, meaning that the GPU must access it via the system bus, + slowly. Changing the page to "coarse-grained" mode means that the page + is migrated on-demand and can therefore be accessed quickly by both CPU and + GPU (although care should be taken to prevent thrashing the page back and + forth). + + GOMP_OFFLOAD_alloc also allocates coarse-grained memory, but in that case + the initial location is GPU memory; this function returns system memory. + + We record and track allocations so that GOMP_OFFLOAD_is_usm_ptr can look + them up. */ void * GOMP_OFFLOAD_usm_alloc (int device, size_t size) { - void *ptr = GOMP_OFFLOAD_alloc (device, size); + void *ptr = malloc (size); + if (!ptr || !hsa_fns.hsa_amd_svm_attributes_set_fn) + return ptr; + + /* Register the heap allocation as coarse grained, which implies USM. 
*/ + struct hsa_amd_svm_attribute_pair_s attr = { + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED + }; + hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (ptr, size, + &attr, 1); + if (status != HSA_STATUS_SUCCESS) + GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;" + " please update your drivers and/or kernel"); + /* Record the allocation for GOMP_OFFLOAD_is_usm_ptr. */ usm_splay_tree_node node = malloc (sizeof (struct usm_splay_tree_node_s)); node->key.addr = ptr; node->key.size = size; @@ -3918,7 +3975,8 @@ GOMP_OFFLOAD_usm_free (int device, void *ptr) free (node); } - return GOMP_OFFLOAD_free (device, ptr); + free (ptr); + return true; } /* True if the memory was allocated via GOMP_OFFLOAD_usm_alloc. */ diff --git a/libgomp/testsuite/libgomp.c/usm-1.c b/libgomp/testsuite/libgomp.c/usm-1.c index e73f1816f9a..f7bf897b839 100644 --- a/libgomp/testsuite/libgomp.c/usm-1.c +++ b/libgomp/testsuite/libgomp.c/usm-1.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */ #include #include diff --git a/libgomp/testsuite/libgomp.c/usm-2.c b/libgomp/testsuite/libgomp.c/usm-2.c index 31f2bae7145..3f52adbd7e1 100644 --- a/libgomp/testsuite/libgomp.c/usm-2.c +++ b/libgomp/testsuite/libgomp.c/usm-2.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */ #include #include diff --git a/libgomp/testsuite/libgomp.c/usm-3.c b/libgomp/testsuite/libgomp.c/usm-3.c index 2c78a0d8ced..225cba5fe58 100644 --- a/libgomp/testsuite/libgomp.c/usm-3.c +++ b/libgomp/testsuite/libgomp.c/usm-3.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */ #include #include diff --git 
a/libgomp/testsuite/libgomp.c/usm-4.c b/libgomp/testsuite/libgomp.c/usm-4.c index 1ac5498f73f..d4addfc587a 100644 --- a/libgomp/testsuite/libgomp.c/usm-4.c +++ b/libgomp/testsuite/libgomp.c/usm-4.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_usm } */ +/* { dg-options "-foffload=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn } } */ #include #include