From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 52142 invoked by alias); 13 Nov 2015 15:11:51 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Received: (qmail 52126 invoked by uid 89); 13 Nov 2015 15:11:51 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.8 required=5.0 tests=BAYES_00,FREEMAIL_FROM,KAM_ASCII_DIVIDERS,RCVD_IN_DNSWL_LOW,SPF_PASS autolearn=no version=3.3.2 X-HELO: mail-yk0-f179.google.com Received: from mail-yk0-f179.google.com (HELO mail-yk0-f179.google.com) (209.85.160.179) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-GCM-SHA256 encrypted) ESMTPS; Fri, 13 Nov 2015 15:11:48 +0000 Received: by ykdv3 with SMTP id v3so150737341ykd.0 for ; Fri, 13 Nov 2015 07:11:47 -0800 (PST) X-Received: by 10.13.213.131 with SMTP id x125mr9071192ywd.229.1447427506736; Fri, 13 Nov 2015 07:11:46 -0800 (PST) Received: from ?IPv6:2601:181:c000:c497:a2a8:cdff:fe3e:b48? ([2601:181:c000:c497:a2a8:cdff:fe3e:b48]) by smtp.googlemail.com with ESMTPSA id m128sm21841703ywm.34.2015.11.13.07.11.45 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Fri, 13 Nov 2015 07:11:46 -0800 (PST) To: GCC Patches Cc: Bernd Schmidt From: Nathan Sidwell Subject: [nvptx] complex vector reductions Message-ID: <5645FDB1.4060400@acm.org> Date: Fri, 13 Nov 2015 15:11:00 -0000 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.3.0 MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="------------070507080501060607050900" X-SW-Source: 2015-11/txt/msg01713.txt.bz2 This is a multi-part message in MIME format. --------------070507080501060607050900 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 7bit Content-length: 630 I noticed that we weren't supporting reductions of complex type, particularly complex double. I've committed this patch to add support for vector reductions. We split the complex type apart and process each half before sticking it back together again. As it happens only 32-bit shuffles exist on hardware, so splitting (say) complex float this way doesn't result in worse code that punning the whole thing to a 64-bit int. I suspect it might play better, by keeping the real and imaginary parts separate. Worker and gang-level reductions of such types will be a different patch, which I need to think about. nathan --------------070507080501060607050900 Content-Type: text/x-patch; name="trunk-nvptx-cplx-1113.patch" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="trunk-nvptx-cplx-1113.patch" Content-length: 5588 2015-11-13 Nathan Sidwell gcc/ * config/nvptx/nvptx.c (nvptx_generate_vector_shuffle): Deal with complex types. libgomp/ * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: New. * testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: New. Index: gcc/config/nvptx/nvptx.c =================================================================== --- gcc/config/nvptx/nvptx.c (revision 230320) +++ gcc/config/nvptx/nvptx.c (working copy) @@ -3634,26 +3634,51 @@ nvptx_generate_vector_shuffle (location_ { unsigned fn = NVPTX_BUILTIN_SHUFFLE; tree_code code = NOP_EXPR; - tree type = unsigned_type_node; - enum machine_mode mode = TYPE_MODE (TREE_TYPE (var)); + tree arg_type = unsigned_type_node; + tree var_type = TREE_TYPE (var); + tree dest_type = var_type; - if (!INTEGRAL_MODE_P (mode)) + if (TREE_CODE (var_type) == COMPLEX_TYPE) + var_type = TREE_TYPE (var_type); + + if (TREE_CODE (var_type) == REAL_TYPE) code = VIEW_CONVERT_EXPR; - if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode)) + + if (TYPE_SIZE (var_type) + == TYPE_SIZE (long_long_unsigned_type_node)) { fn = NVPTX_BUILTIN_SHUFFLELL; - type = long_long_unsigned_type_node; + arg_type = long_long_unsigned_type_node; } - + tree call = nvptx_builtin_decl (fn, true); - call = build_call_expr_loc - (loc, call, 3, fold_build1 (code, type, var), - build_int_cst (unsigned_type_node, shift), - build_int_cst (unsigned_type_node, SHUFFLE_DOWN)); + tree bits = build_int_cst (unsigned_type_node, shift); + tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN); + tree expr; + + if (var_type != dest_type) + { + /* Do real and imaginary parts separately. */ + tree real = fold_build1 (REALPART_EXPR, var_type, var); + real = fold_build1 (code, arg_type, real); + real = build_call_expr_loc (loc, call, 3, real, bits, kind); + real = fold_build1 (code, var_type, real); - call = fold_build1 (code, TREE_TYPE (dest_var), call); + tree imag = fold_build1 (IMAGPART_EXPR, var_type, var); + imag = fold_build1 (code, arg_type, imag); + imag = build_call_expr_loc (loc, call, 3, imag, bits, kind); + imag = fold_build1 (code, var_type, imag); + + expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag); + } + else + { + expr = fold_build1 (code, arg_type, var); + expr = build_call_expr_loc (loc, call, 3, expr, bits, kind); + expr = fold_build1 (code, dest_type, expr); + } - gimplify_assign (dest_var, call, seq); + gimplify_assign (dest_var, expr, seq); } /* Insert code to locklessly update *PTR with *PTR OP VAR just before Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c (working copy) @@ -0,0 +1,52 @@ + +#include + +/* Double float has 53 bits of fraction. */ +#define FRAC (1.0 / (1LL << 48)) + +int close_enough (double _Complex a, double _Complex b) +{ + double _Complex diff = a - b; + double mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a); + double mag2_diff = (__real__(diff) * __real__ (diff) + + __imag__ (diff) * __imag__ (diff)); + + return mag2_diff / mag2_a < (FRAC * FRAC); +} + +int main (void) +{ +#define N 100 + double _Complex ary[N], sum, prod, tsum, tprod; + int ix; + + sum = tsum = 0; + prod = tprod = 1; + + for (ix = 0; ix < N; ix++) + { + double frac = ix * (1.0 / 1024) + 1.0; + + ary[ix] = frac + frac * 2.0i - 1.0i; + sum += ary[ix]; + prod *= ary[ix]; + } + +#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod) + { +#pragma acc loop vector reduction(+:tsum) reduction (*:tprod) + for (ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c (working copy) @@ -0,0 +1,52 @@ + +#include + +/* Single float has 23 bits of fraction. */ +#define FRAC (1.0f / (1 << 20)) + +int close_enough (float _Complex a, float _Complex b) +{ + float _Complex diff = a - b; + float mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a); + float mag2_diff = (__real__(diff) * __real__ (diff) + + __imag__ (diff) * __imag__ (diff)); + + return mag2_diff / mag2_a < (FRAC * FRAC); +} + +int main (void) +{ +#define N 100 + float _Complex ary[N], sum, prod, tsum, tprod; + int ix; + + sum = tsum = 0; + prod = tprod = 1; + + for (ix = 0; ix < N; ix++) + { + float frac = ix * (1.0f / 1024) + 1.0f; + + ary[ix] = frac + frac * 2.0i - 1.0i; + sum += ary[ix]; + prod *= ary[ix]; + } + +#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod) + { +#pragma acc loop vector reduction(+:tsum) reduction (*:tprod) + for (ix = 0; ix < N; ix++) + { + tsum += ary[ix]; + tprod *= ary[ix]; + } + } + + if (!close_enough (sum, tsum)) + return 1; + + if (!close_enough (prod, tprod)) + return 1; + + return 0; +} --------------070507080501060607050900--