From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-patches-return-413997-listarch-gcc-patches=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 52142 invoked by alias); 13 Nov 2015 15:11:51 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Received: (qmail 52126 invoked by uid 89); 13 Nov 2015 15:11:51 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-1.8 required=5.0 tests=BAYES_00,FREEMAIL_FROM,KAM_ASCII_DIVIDERS,RCVD_IN_DNSWL_LOW,SPF_PASS autolearn=no version=3.3.2
X-HELO: mail-yk0-f179.google.com
Received: from mail-yk0-f179.google.com (HELO mail-yk0-f179.google.com) (209.85.160.179) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-GCM-SHA256 encrypted) ESMTPS; Fri, 13 Nov 2015 15:11:48 +0000
Received: by ykdv3 with SMTP id v3so150737341ykd.0        for <gcc-patches@gcc.gnu.org>; Fri, 13 Nov 2015 07:11:47 -0800 (PST)
X-Received: by 10.13.213.131 with SMTP id x125mr9071192ywd.229.1447427506736;        Fri, 13 Nov 2015 07:11:46 -0800 (PST)
Received: from ?IPv6:2601:181:c000:c497:a2a8:cdff:fe3e:b48? ([2601:181:c000:c497:a2a8:cdff:fe3e:b48])        by smtp.googlemail.com with ESMTPSA id m128sm21841703ywm.34.2015.11.13.07.11.45        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128);        Fri, 13 Nov 2015 07:11:46 -0800 (PST)
To: GCC Patches <gcc-patches@gcc.gnu.org>
Cc: Bernd Schmidt <bschmidt@redhat.com>
From: Nathan Sidwell <nathan@acm.org>
Subject: [nvptx] complex vector reductions
Message-ID: <5645FDB1.4060400@acm.org>
Date: Fri, 13 Nov 2015 15:11:00 -0000
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.3.0
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="------------070507080501060607050900"
X-SW-Source: 2015-11/txt/msg01713.txt.bz2

This is a multi-part message in MIME format.
--------------070507080501060607050900
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Transfer-Encoding: 7bit
Content-length: 630

I noticed that we weren't supporting reductions of complex type, particularly 
complex double.

I've committed this patch to add support for vector reductions.  We split the 
complex type apart and process each half before sticking it back together again. 
  As it happens only 32-bit shuffles exist on hardware, so splitting (say) 
complex float this way doesn't result in worse code that punning the  whole 
thing to a 64-bit int.  I suspect it might play better, by keeping the real and 
imaginary parts separate.

Worker and gang-level reductions of such types will be a different patch, which 
I need to think about.

nathan

--------------070507080501060607050900
Content-Type: text/x-patch;
 name="trunk-nvptx-cplx-1113.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="trunk-nvptx-cplx-1113.patch"
Content-length: 5588

2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_generate_vector_shuffle): Deal with
	complex types.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: New.
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: New.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 230320)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -3634,26 +3634,51 @@ nvptx_generate_vector_shuffle (location_
 {
   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
   tree_code code = NOP_EXPR;
-  tree type = unsigned_type_node;
-  enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
+  tree arg_type = unsigned_type_node;
+  tree var_type = TREE_TYPE (var);
+  tree dest_type = var_type;
 
-  if (!INTEGRAL_MODE_P (mode))
+  if (TREE_CODE (var_type) == COMPLEX_TYPE)
+    var_type = TREE_TYPE (var_type);
+
+  if (TREE_CODE (var_type) == REAL_TYPE)
     code = VIEW_CONVERT_EXPR;
-  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
+
+  if (TYPE_SIZE (var_type)
+      == TYPE_SIZE (long_long_unsigned_type_node))
     {
       fn = NVPTX_BUILTIN_SHUFFLELL;
-      type = long_long_unsigned_type_node;
+      arg_type = long_long_unsigned_type_node;
     }
-
+  
   tree call = nvptx_builtin_decl (fn, true);
-  call = build_call_expr_loc
-    (loc, call, 3, fold_build1 (code, type, var),
-     build_int_cst (unsigned_type_node, shift),
-     build_int_cst (unsigned_type_node, SHUFFLE_DOWN));
+  tree bits = build_int_cst (unsigned_type_node, shift);
+  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
+  tree expr;
+
+  if (var_type != dest_type)
+    {
+      /* Do real and imaginary parts separately.  */
+      tree real = fold_build1 (REALPART_EXPR, var_type, var);
+      real = fold_build1 (code, arg_type, real);
+      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
+      real = fold_build1 (code, var_type, real);
 
-  call = fold_build1 (code, TREE_TYPE (dest_var), call);
+      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
+      imag = fold_build1 (code, arg_type, imag);
+      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
+      imag = fold_build1 (code, var_type, imag);
+
+      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
+    }
+  else
+    {
+      expr = fold_build1 (code, arg_type, var);
+      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
+      expr = fold_build1 (code, dest_type, expr);
+    }
 
-  gimplify_assign (dest_var, call, seq);
+  gimplify_assign (dest_var, expr, seq);
 }
 
 /* Insert code to locklessly update  *PTR with *PTR OP VAR just before
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(working copy)
@@ -0,0 +1,52 @@
+
+#include <complex.h>
+
+/* Double float has 53 bits of fraction. */
+#define FRAC (1.0 / (1LL << 48))
+
+int close_enough (double _Complex a, double _Complex b)
+{
+  double _Complex diff = a - b;
+  double mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a);
+  double mag2_diff = (__real__(diff) * __real__ (diff)
+		     + __imag__ (diff) * __imag__ (diff));
+
+  return mag2_diff / mag2_a < (FRAC * FRAC);
+}
+
+int main (void)
+{
+#define N 100
+  double _Complex ary[N], sum, prod, tsum, tprod;
+  int ix;
+
+  sum = tsum = 0;
+  prod = tprod = 1;
+  
+  for (ix = 0; ix < N;  ix++)
+    {
+      double frac = ix * (1.0 / 1024) + 1.0;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+  {
+#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
+    for (ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(working copy)
@@ -0,0 +1,52 @@
+
+#include <complex.h>
+
+/* Single float has 23 bits of fraction. */
+#define FRAC (1.0f / (1 << 20))
+
+int close_enough (float _Complex a, float _Complex b)
+{
+  float _Complex diff = a - b;
+  float mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a);
+  float mag2_diff = (__real__(diff) * __real__ (diff)
+		     + __imag__ (diff) * __imag__ (diff));
+
+  return mag2_diff / mag2_a < (FRAC * FRAC);
+}
+
+int main (void)
+{
+#define N 100
+  float _Complex ary[N], sum, prod, tsum, tprod;
+  int ix;
+
+  sum = tsum = 0;
+  prod = tprod = 1;
+  
+  for (ix = 0; ix < N;  ix++)
+    {
+      float frac = ix * (1.0f / 1024) + 1.0f;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+  {
+#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
+    for (ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}

--------------070507080501060607050900--