[nvptx] complex vector reductions

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [nvptx] complex vector reductions
@ 2015-11-13 15:11 Nathan Sidwell
  2015-11-13 16:46 ` Bernd Schmidt
  0 siblings, 1 reply; 2+ messages in thread
From: Nathan Sidwell @ 2015-11-13 15:11 UTC (permalink / raw)
  To: GCC Patches; +Cc: Bernd Schmidt

[-- Attachment #1: Type: text/plain, Size: 630 bytes --]

I noticed that we weren't supporting reductions of complex type, particularly 
complex double.

I've committed this patch to add support for vector reductions.  We split the 
complex type apart and process each half before sticking it back together again. 
  As it happens only 32-bit shuffles exist on hardware, so splitting (say) 
complex float this way doesn't result in worse code that punning the  whole 
thing to a 64-bit int.  I suspect it might play better, by keeping the real and 
imaginary parts separate.

Worker and gang-level reductions of such types will be a different patch, which 
I need to think about.

nathan

[-- Attachment #2: trunk-nvptx-cplx-1113.patch --]
[-- Type: text/x-patch, Size: 5588 bytes --]

2015-11-13  Nathan Sidwell  <nathan@codesourcery.com>

	gcc/
	* config/nvptx/nvptx.c (nvptx_generate_vector_shuffle): Deal with
	complex types.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c: New.
	* testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c: New.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 230320)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -3634,26 +3634,51 @@ nvptx_generate_vector_shuffle (location_
 {
   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
   tree_code code = NOP_EXPR;
-  tree type = unsigned_type_node;
-  enum machine_mode mode = TYPE_MODE (TREE_TYPE (var));
+  tree arg_type = unsigned_type_node;
+  tree var_type = TREE_TYPE (var);
+  tree dest_type = var_type;
 
-  if (!INTEGRAL_MODE_P (mode))
+  if (TREE_CODE (var_type) == COMPLEX_TYPE)
+    var_type = TREE_TYPE (var_type);
+
+  if (TREE_CODE (var_type) == REAL_TYPE)
     code = VIEW_CONVERT_EXPR;
-  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (DImode))
+
+  if (TYPE_SIZE (var_type)
+      == TYPE_SIZE (long_long_unsigned_type_node))
     {
       fn = NVPTX_BUILTIN_SHUFFLELL;
-      type = long_long_unsigned_type_node;
+      arg_type = long_long_unsigned_type_node;
     }
-
+  
   tree call = nvptx_builtin_decl (fn, true);
-  call = build_call_expr_loc
-    (loc, call, 3, fold_build1 (code, type, var),
-     build_int_cst (unsigned_type_node, shift),
-     build_int_cst (unsigned_type_node, SHUFFLE_DOWN));
+  tree bits = build_int_cst (unsigned_type_node, shift);
+  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
+  tree expr;
+
+  if (var_type != dest_type)
+    {
+      /* Do real and imaginary parts separately.  */
+      tree real = fold_build1 (REALPART_EXPR, var_type, var);
+      real = fold_build1 (code, arg_type, real);
+      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
+      real = fold_build1 (code, var_type, real);
 
-  call = fold_build1 (code, TREE_TYPE (dest_var), call);
+      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
+      imag = fold_build1 (code, arg_type, imag);
+      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
+      imag = fold_build1 (code, var_type, imag);
+
+      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
+    }
+  else
+    {
+      expr = fold_build1 (code, arg_type, var);
+      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
+      expr = fold_build1 (code, dest_type, expr);
+    }
 
-  gimplify_assign (dest_var, call, seq);
+  gimplify_assign (dest_var, expr, seq);
 }
 
 /* Insert code to locklessly update  *PTR with *PTR OP VAR just before
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-dbl.c	(working copy)
@@ -0,0 +1,52 @@
+
+#include <complex.h>
+
+/* Double float has 53 bits of fraction. */
+#define FRAC (1.0 / (1LL << 48))
+
+int close_enough (double _Complex a, double _Complex b)
+{
+  double _Complex diff = a - b;
+  double mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a);
+  double mag2_diff = (__real__(diff) * __real__ (diff)
+		     + __imag__ (diff) * __imag__ (diff));
+
+  return mag2_diff / mag2_a < (FRAC * FRAC);
+}
+
+int main (void)
+{
+#define N 100
+  double _Complex ary[N], sum, prod, tsum, tprod;
+  int ix;
+
+  sum = tsum = 0;
+  prod = tprod = 1;
+  
+  for (ix = 0; ix < N;  ix++)
+    {
+      double frac = ix * (1.0 / 1024) + 1.0;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+  {
+#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
+    for (ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-cplx-flt.c	(working copy)
@@ -0,0 +1,52 @@
+
+#include <complex.h>
+
+/* Single float has 23 bits of fraction. */
+#define FRAC (1.0f / (1 << 20))
+
+int close_enough (float _Complex a, float _Complex b)
+{
+  float _Complex diff = a - b;
+  float mag2_a = __real__(a) * __real__ (a) + __imag__ (a) * __imag__ (a);
+  float mag2_diff = (__real__(diff) * __real__ (diff)
+		     + __imag__ (diff) * __imag__ (diff));
+
+  return mag2_diff / mag2_a < (FRAC * FRAC);
+}
+
+int main (void)
+{
+#define N 100
+  float _Complex ary[N], sum, prod, tsum, tprod;
+  int ix;
+
+  sum = tsum = 0;
+  prod = tprod = 1;
+  
+  for (ix = 0; ix < N;  ix++)
+    {
+      float frac = ix * (1.0f / 1024) + 1.0f;
+      
+      ary[ix] = frac + frac * 2.0i - 1.0i;
+      sum += ary[ix];
+      prod *= ary[ix];
+    }
+
+#pragma acc parallel vector_length(32) copyin(ary) copy (tsum, tprod)
+  {
+#pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
+    for (ix = 0; ix < N; ix++)
+      {
+	tsum += ary[ix];
+	tprod *= ary[ix];
+      }
+  }
+
+  if (!close_enough (sum, tsum))
+    return 1;
+
+  if (!close_enough (prod, tprod))
+    return 1;
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [nvptx] complex vector reductions
  2015-11-13 15:11 [nvptx] complex vector reductions Nathan Sidwell
@ 2015-11-13 16:46 ` Bernd Schmidt
  0 siblings, 0 replies; 2+ messages in thread
From: Bernd Schmidt @ 2015-11-13 16:46 UTC (permalink / raw)
  To: Nathan Sidwell, GCC Patches

On 11/13/2015 04:11 PM, Nathan Sidwell wrote:
> I noticed that we weren't supporting reductions of complex type,
> particularly complex double.
>
> I've committed this patch to add support for vector reductions.  We
> split the complex type apart and process each half before sticking it
> back together again.  As it happens only 32-bit shuffles exist on
> hardware, so splitting (say) complex float this way doesn't result in
> worse code that punning the  whole thing to a 64-bit int.  I suspect it
> might play better, by keeping the real and imaginary parts separate.
>
> Worker and gang-level reductions of such types will be a different
> patch, which I need to think about.

LGTM.


Bernd

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2015-11-13 16:46 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-13 15:11 [nvptx] complex vector reductions Nathan Sidwell
2015-11-13 16:46 ` Bernd Schmidt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).