From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <hongtao.liu@intel.com>
Received: from mga01.intel.com (mga01.intel.com [192.55.52.88])
 by sourceware.org (Postfix) with ESMTPS id C62003858C27
 for <gcc-patches@gcc.gnu.org>; Mon, 27 Sep 2021 08:54:12 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org C62003858C27
X-IronPort-AV: E=McAfee;i="6200,9189,10119"; a="246924220"
X-IronPort-AV: E=Sophos;i="5.85,326,1624345200"; d="scan'208";a="246924220"
Received: from fmsmga003.fm.intel.com ([10.253.24.29])
 by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;
 27 Sep 2021 01:54:10 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="5.85,326,1624345200"; d="scan'208";a="553113669"
Received: from scymds01.sc.intel.com ([10.148.94.138])
 by FMSMGA003.fm.intel.com with ESMTP; 27 Sep 2021 01:54:07 -0700
Received: from shliclel219.sh.intel.com (shliclel219.sh.intel.com
 [10.239.236.219]) by scymds01.sc.intel.com
 with ESMTP id 18R8s5mY009194; Mon, 27 Sep 2021 01:54:06 -0700
From: liuhongt <hongtao.liu@intel.com>
To: gcc-patches@gcc.gnu.org
Subject: [PATCH] Support 128/256/512-bit vector _Float16 plus/smin/smax reduce.
Date: Mon, 27 Sep 2021 16:54:05 +0800
Message-Id: <20210927085405.3420838-1-hongtao.liu@intel.com>
X-Mailer: git-send-email 2.27.0
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-11.5 required=5.0 tests=BAYES_00, GIT_PATCH_0,
 KAM_DMARC_NONE, KAM_DMARC_STATUS, KAM_LAZY_DOMAIN_SECURITY, KAM_SHORT,
 RCVD_IN_MSPIKE_H3, RCVD_IN_MSPIKE_WL, SPF_HELO_NONE, SPF_NONE,
 TXREP autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: gcc-patches@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-patches mailing list <gcc-patches.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Mon, 27 Sep 2021 08:54:16 -0000

Hi:
  Add expanders for reduc_{smin,smax,plus}_scal_{v8hf,v16hf,v32hf}.
  Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
  
gcc/ChangeLog:

	* config/i386/i386-expand.c (emit_reduc_half): Handle
	V8HF/V16HF/V32HFmode.
	* config/i386/sse.md (REDUC_SSE_PLUS_MODE): Add V8HF.
	(REDUC_SSE_SMINMAX_MODE): Ditto.
	(REDUC_PLUS_MODE): Add V16HF and V32HF.
	(REDUC_SMINMAX_MODE): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16-reduce-op-2.c: New test.
	* gcc.target/i386/avx512fp16-reduce-op-3.c: New test.
---
 gcc/config/i386/i386-expand.c                 |  3 +
 gcc/config/i386/sse.md                        | 10 +-
 .../gcc.target/i386/avx512fp16-reduce-op-2.c  | 96 +++++++++++++++++++
 .../gcc.target/i386/avx512fp16-reduce-op-3.c  | 91 ++++++++++++++++++
 4 files changed, 198 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 94ac303585e..4780b993917 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -16045,6 +16045,7 @@ emit_reduc_half (rtx dest, rtx src, int i)
       break;
     case E_V16QImode:
     case E_V8HImode:
+    case E_V8HFmode:
     case E_V4SImode:
     case E_V2DImode:
       d = gen_reg_rtx (V1TImode);
@@ -16066,6 +16067,7 @@ emit_reduc_half (rtx dest, rtx src, int i)
       break;
     case E_V32QImode:
     case E_V16HImode:
+    case E_V16HFmode:
     case E_V8SImode:
     case E_V4DImode:
       if (i == 256)
@@ -16085,6 +16087,7 @@ emit_reduc_half (rtx dest, rtx src, int i)
       break;
     case E_V64QImode:
     case E_V32HImode:
+    case E_V32HFmode:
       if (i < 64)
 	{
 	  d = gen_reg_rtx (V4TImode);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb7600edbab..4559b0ce9c9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3157,7 +3157,8 @@ (define_insn "sse3_h<insn>v4sf3"
    (set_attr "mode" "V4SF")])
 
 (define_mode_iterator REDUC_SSE_PLUS_MODE
- [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")])
+ [(V2DF "TARGET_SSE") (V4SF "TARGET_SSE")
+  (V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")])
 
 (define_expand "reduc_plus_scal_<mode>"
  [(plus:REDUC_SSE_PLUS_MODE
@@ -3194,7 +3195,9 @@ (define_expand "reduc_plus_scal_v16qi"
 
 (define_mode_iterator REDUC_PLUS_MODE
  [(V4DF "TARGET_AVX") (V8SF "TARGET_AVX")
+  (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
   (V8DF "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
+  (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
   (V32QI "TARGET_AVX") (V64QI "TARGET_AVX512F")])
 
 (define_expand "reduc_plus_scal_<mode>"
@@ -3214,7 +3217,8 @@ (define_expand "reduc_plus_scal_<mode>"
 
 ;; Modes handled by reduc_sm{in,ax}* patterns.
 (define_mode_iterator REDUC_SSE_SMINMAX_MODE
-  [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE")
+  [(V8HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
+   (V4SF "TARGET_SSE") (V2DF "TARGET_SSE")
    (V4SI "TARGET_SSE2") (V8HI "TARGET_SSE2") (V16QI "TARGET_SSE2")
    (V2DI "TARGET_SSE4_2")])
 
@@ -3233,9 +3237,11 @@ (define_expand "reduc_<code>_scal_<mode>"
 
 (define_mode_iterator REDUC_SMINMAX_MODE
   [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
+   (V16HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
    (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
    (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
    (V64QI "TARGET_AVX512BW")
+   (V32HF "TARGET_AVX512FP16 && TARGET_AVX512VL")
    (V32HI "TARGET_AVX512BW") (V16SI "TARGET_AVX512F")
    (V8DI "TARGET_AVX512F") (V16SF "TARGET_AVX512F")
    (V8DF "TARGET_AVX512F")])
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c
new file mode 100644
index 00000000000..593340e4afa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-2.c
@@ -0,0 +1,96 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mprefer-vector-width=512 -fdump-tree-optimized" } */
+
+/* { dg-final { scan-tree-dump-times "\.REDUC_PLUS" 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.REDUC_MIN" 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.REDUC_MAX" 3 "optimized" } } */
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_add_128 (_Float16* p)
+{
+  _Float16 sum = 0;
+  for (int i = 0; i != 8; i++)
+    sum += p[i];
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_add_256 (_Float16* p)
+{
+  _Float16 sum = 0;
+  for (int i = 0; i != 16; i++)
+    sum += p[i];
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_add_512 (_Float16* p)
+{
+  _Float16 sum = 0;
+  for (int i = 0; i != 32; i++)
+    sum += p[i];
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_min_128 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 8; i++)
+    sum = sum > p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_min_256 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 16; i++)
+    sum = sum > p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_min_512 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 32; i++)
+    sum = sum > p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_max_128 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 8; i++)
+    sum = sum < p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_max_256 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 16; i++)
+    sum = sum < p[i] ? p[i] : sum;
+  return sum;
+}
+
+_Float16
+__attribute__((noipa, target("avx512fp16,avx512vl"), optimize("Ofast")))
+reduc_max_512 (_Float16* p)
+{
+  _Float16 sum = p[0];
+  for (int i = 0; i != 32; i++)
+    sum = sum < p[i] ? p[i] : sum;
+  return sum;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c
new file mode 100644
index 00000000000..9281a3be248
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16-reduce-op-3.c
@@ -0,0 +1,91 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16" } */
+/* { dg-require-effective-target avx512fp16 } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512FP16
+#define AVX512VL
+
+#include "avx512f-helper.h"
+
+#include "avx512fp16-reduce-op-2.c"
+
+void
+test_256 (void)
+{
+  _Float16 a[32];
+  int sign = 1;
+  _Float16 res1 = 0, exp1;
+  _Float16 res2 = 0, exp2;
+  _Float16 res3 = 0, exp3;
+
+  for (int i = 0; i != 32; i++)
+    {
+      a[i] = sign * (4.0 * i);
+      sign *= -1;
+      if (i < 8)
+	res1 += a[i];
+      if (i < 16)
+	res2 += a[i];
+      res3 += a[i];
+    }
+
+  exp1 = reduc_add_128 (a);
+  exp2 = reduc_add_256 (a);
+  exp3 = reduc_add_512 (a);
+  if (exp1 != res1 || exp2 != res2 || exp3 != res3)
+    abort();
+}
+
+#define MAX(A, B) ((A) > (B) ? (A) : (B))
+#define MIN(A, B) ((A) < (B) ? (A) : (B))
+
+void
+test_128 ()
+{
+  _Float16 a[32];
+  int sign = 1;
+  _Float16 min_res1, min_exp1, max_res1, max_exp1;
+  _Float16 min_res2, min_exp2, max_res2, max_exp2;
+  _Float16 min_res3, min_exp3, max_res3, max_exp3;
+
+  for (int i = 0; i != 32; i++)
+    {
+      a[i] = sign * (4.9 * i * i - 8.3 * i + 14.8);
+      sign *= -1;
+    }
+
+  min_res1 = max_res1 = a[0];
+  for (int i = 0 ; i != 8; i++)
+    {
+      min_res1 = MIN (min_res1, a[i]);
+      max_res1 = MAX (max_res1, a[i]);
+    }
+
+  min_res2 = min_res1;
+  max_res2 = max_res1;
+  for (int i = 8 ; i != 16; i++)
+    {
+      min_res2 = MIN (min_res2, a[i]);
+      max_res2 = MAX (max_res2, a[i]);
+    }
+
+  min_res3 = min_res2;
+  max_res3 = max_res2;
+  for (int i = 16 ; i != 32; i++)
+    {
+      min_res3 = MIN (min_res3, a[i]);
+      max_res3 = MAX (max_res3, a[i]);
+    }
+
+  min_exp1 = reduc_min_128 (a);
+  min_exp2 = reduc_min_256 (a);
+  min_exp3 = reduc_min_512 (a);
+  max_exp1 = reduc_max_128 (a);
+  max_exp2 = reduc_max_256 (a);
+  max_exp3 = reduc_max_512 (a);
+
+  if (min_exp1 != min_res1 || min_exp2 != min_res2 || min_exp3 != min_res3
+      || max_exp1 != max_res1 || max_exp2 != max_res2 || max_exp3 != max_res3)
+    abort();
+}
-- 
2.27.0