public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-4336] rs6000: Support more SSE4 "cmp", "mul", "pack" intrinsics
@ 2021-10-12  1:32 Paul Clarke
  0 siblings, 0 replies; only message in thread
From: Paul Clarke @ 2021-10-12  1:32 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:29fb1e831bf1c25e4574bf2f98a9f534e5c67665

commit r12-4336-g29fb1e831bf1c25e4574bf2f98a9f534e5c67665
Author: Paul A. Clarke <pc@us.ibm.com>
Date:   Sun Aug 8 15:57:20 2021 -0500

    rs6000: Support more SSE4 "cmp", "mul", "pack" intrinsics
    
    Function signatures and decorations match gcc/config/i386/smmintrin.h.
    
    Also, copy tests for:
    - _mm_cmpeq_epi64
    - _mm_mullo_epi32, _mm_mul_epi32
    - _mm_packus_epi32
    - _mm_cmpgt_epi64 (SSE4.2)
    
    from gcc/testsuite/gcc.target/i386.
    
    2021-10-11  Paul A. Clarke  <pc@us.ibm.com>
    
    gcc
            * config/rs6000/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
            _mm_mullo_epi32, _mm_mul_epi32, _mm_packus_epi32): New.
            * config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.
    
    gcc/testsuite
            * gcc.target/powerpc/pr78102.c: Copy from gcc.target/i386,
            adjust dg directives to suit.
            * gcc.target/powerpc/sse4_1-packusdw.c: Same.
            * gcc.target/powerpc/sse4_1-pcmpeqq.c: Same.
            * gcc.target/powerpc/sse4_1-pmuldq.c: Same.
            * gcc.target/powerpc/sse4_1-pmulld.c: Same.
            * gcc.target/powerpc/sse4_2-pcmpgtq.c: Same.
            * gcc.target/powerpc/sse4_2-check.h: Copy from gcc.target/i386,
            tweak to suit.

Diff:
---
 gcc/config/rs6000/nmmintrin.h                      | 40 ++++++++++++
 gcc/config/rs6000/smmintrin.h                      | 41 ++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr78102.c         | 23 +++++++
 gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c | 73 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c  | 46 ++++++++++++++
 gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c   | 51 +++++++++++++++
 gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c   | 46 ++++++++++++++
 gcc/testsuite/gcc.target/powerpc/sse4_2-check.h    | 18 ++++++
 gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c  | 46 ++++++++++++++
 9 files changed, 384 insertions(+)

diff --git a/gcc/config/rs6000/nmmintrin.h b/gcc/config/rs6000/nmmintrin.h
new file mode 100644
index 00000000000..20a70bee377
--- /dev/null
+++ b/gcc/config/rs6000/nmmintrin.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#endif
+
+#ifndef _NMMINTRIN_H_INCLUDED
+#define _NMMINTRIN_H_INCLUDED
+
+/* We just include SSE4.1 header file.  */
+#include <smmintrin.h>
+
+#endif /* _NMMINTRIN_H_INCLUDED */
diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index ad6b68e13cc..90ce03d2270 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -274,6 +274,15 @@ _mm_floor_ss (__m128 __A, __m128 __B)
   return __r;
 }
 
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
+}
+#endif
+
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_epi8 (__m128i __X, __m128i __Y)
@@ -332,6 +341,22 @@ _mm_max_epu32 (__m128i __X, __m128i __Y)
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
+}
+
+#ifdef _ARCH_PWR8
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
+}
+#endif
+
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtepi8_epi16 (__m128i __A)
 {
   return (__m128i) vec_unpackh ((__v16qi) __A);
@@ -495,4 +520,20 @@ _mm_minpos_epu16 (__m128i __A)
   return __r.__m;
 }
 
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
+}
+
+#ifdef _ARCH_PWR8
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
+}
+#endif
+
 #endif
diff --git a/gcc/testsuite/gcc.target/powerpc/pr78102.c b/gcc/testsuite/gcc.target/powerpc/pr78102.c
new file mode 100644
index 00000000000..68898c7f942
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr78102.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_hw } */
+
+#include <x86intrin.h>
+
+__m128i
+foo (const __m128i x, const __m128i y)
+{
+  return _mm_cmpeq_epi64 (x, y);
+}
+
+__v2di
+bar (const __v2di x, const __v2di y)
+{
+  return x == y;
+}
+
+__v2di
+baz (const __v2di x, const __v2di y)
+{
+  return x != y;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
new file mode 100644
index 00000000000..8b757a26746
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
@@ -0,0 +1,73 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static unsigned short
+int_to_ushort (int iVal)
+{
+  unsigned short sVal;
+
+  if (iVal < 0)
+    sVal = 0;
+  else if (iVal > 0xffff)
+    sVal = 0xffff;
+  else sVal = iVal;
+
+  return sVal;
+}
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } src1, src2;
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned short s[NUM * 2];
+    } dst;
+  int i, sign = 1;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);
+
+  for (i = 0; i < NUM; i ++)
+    {
+      int dstIndex;
+      unsigned short sVal;
+
+      sVal = int_to_ushort (src1.i[i]);
+      dstIndex = (i % 4) + (i / 4) * 8;
+      if (sVal != dst.s[dstIndex])
+	abort ();
+
+      sVal = int_to_ushort (src2.i[i]);
+      dstIndex += 4;
+      if (sVal != dst.s[dstIndex])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
new file mode 100644
index 00000000000..39b9f01d64a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int i, sign=1;
+  long long is_eq;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.ll[i] = i * i * sign;
+      src2.ll[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x [i / 2] = _mm_cmpeq_epi64(src1.x [i / 2], src2.x [i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      is_eq = src1.ll[i] == src2.ll[i] ? 0xffffffffffffffffLL : 0LL;
+      if (is_eq != dst.ll[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
new file mode 100644
index 00000000000..6a884f46235
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst;
+  union
+    {
+      __m128i x[NUM / 2];
+      int i[NUM * 2];
+    } src1, src2;
+  int i, sign = 1;
+  long long value;
+
+  for (i = 0; i < NUM * 2; i += 2)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];
+      if (value != dst.ll[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
new file mode 100644
index 00000000000..73033436642
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } dst, src1, src2;
+  int i, sign = 1;
+  int value;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x[i / 4] = _mm_mullo_epi32 (src1.x[i / 4], src2.x[i / 4]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      value = src1.i[i] * src2.i[i];
+      if (value != dst.i[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
new file mode 100644
index 00000000000..f6264e5a108
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
@@ -0,0 +1,18 @@
+#define NO_WARN_X86_INTRINSICS 1
+
+static void sse4_2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+  sse4_2_test ();
+}
+
+int
+main ()
+{
+  do_test ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
new file mode 100644
index 00000000000..a8a6a2010f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_2_test
+#endif
+
+#include CHECK_H
+
+#include <nmmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int i, sign = 1;
+  long long is_eq;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.ll[i] = i * i * sign;
+      src2.ll[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x[i / 2] = _mm_cmpgt_epi64 (src1.x[i / 2], src2.x[i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      is_eq = src1.ll[i] > src2.ll[i] ? 0xFFFFFFFFFFFFFFFFLL : 0LL;
+      if (is_eq != dst.ll[i])
+	abort ();
+    }
+}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-10-12  1:32 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-12  1:32 [gcc r12-4336] rs6000: Support more SSE4 "cmp", "mul", "pack" intrinsics Paul Clarke

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).