public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] i386: Optimization for mm512_set1_pch.
       [not found] <20211105071827.7258-1-lingling.kong@intel.com>
@ 2021-11-05  7:19 ` Kong, Lingling
  2021-11-08  1:48   ` Hongtao Liu
  0 siblings, 1 reply; 2+ messages in thread
From: Kong, Lingling @ 2021-11-05  7:19 UTC (permalink / raw)
  To: gcc-patches; +Cc: Kong, Lingling, Liu, Hongtao

Hi,

This patch adds support for folding _mm512_fmadd_pch (a, _mm512_set1_pch(*(b)), c) into a single instruction: vfmaddcph (%rsp){1to16}, %zmm1, %zmm2.
OK for master?

gcc/ChangeLog:

	* config/i386/sse.md (fma_<complexpairopname>_<mode>_pair):
	Add new define_insn.
	(fma_<mode>_fmaddc_bcst): Add new define_insn_and_split.
	(fma_<mode>_fcmaddc_bcst): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512fp16vl-complex-broadcast-1.c: New test.
---
 gcc/config/i386/sse.md                        | 62 +++++++++++++++++++
 .../i386/avx512fp16vl-complex-broadcast-1.c   | 25 ++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0a7f5b178f9..eba8e77515f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -193,7 +193,9 @@
 
   ;; For AVX512FP16 suppport
   UNSPEC_COMPLEX_FMA
+  UNSPEC_COMPLEX_FMA_PAIR
   UNSPEC_COMPLEX_FCMA
+  UNSPEC_COMPLEX_FCMA_PAIR
   UNSPEC_COMPLEX_FMUL
   UNSPEC_COMPLEX_FCMUL
   UNSPEC_COMPLEX_MASK
@@ -5913,6 +5915,9 @@
 (define_int_iterator UNSPEC_COMPLEX_F_C_MA
 	[UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA])
 
+(define_int_iterator UNSPEC_COMPLEX_F_C_MA_PAIR
+	[UNSPEC_COMPLEX_FMA_PAIR UNSPEC_COMPLEX_FCMA_PAIR])
+
 (define_int_iterator UNSPEC_COMPLEX_F_C_MUL
 	[UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL])
 
@@ -5922,6 +5927,10 @@
 	 (UNSPEC_COMPLEX_FMUL "fmulc")
 	 (UNSPEC_COMPLEX_FCMUL "fcmulc")])
 
+(define_int_attr complexpairopname
+	[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
+	 (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
+
 (define_mode_attr complexmove
   [(V32HF "avx512f_loadv16sf")
    (V16HF "avx512vl_loadv8sf")
@@ -6067,6 +6076,59 @@
 	  [(match_dup 1) (match_dup 2) (match_dup 4)]
 	   UNSPEC_COMPLEX_F_C_MA))])
 
+(define_insn "fma_<complexpairopname>_<mode>_pair"
+ [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
+       (unspec:VF1_AVX512VL
+	 [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+	  (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
+	  (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
+	  UNSPEC_COMPLEX_F_C_MA_PAIR))]
+ "TARGET_AVX512FP16"
+ "v<complexpairopname>ph\t{%2, %1, %0|%0, %1, %2}"
+ [(set_attr "type" "ssemuladd")])
+
+(define_insn_and_split "fma_<mode>_fmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+	(unspec:VF_AVX512FP16VL
+	  [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+	   (subreg:VF_AVX512FP16VL
+	     (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0)
+	   (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+	   UNSPEC_COMPLEX_FMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<ssePSmode>
+	  [(match_dup 1) (match_dup 2) (match_dup 3)]
+	   UNSPEC_COMPLEX_FMA_PAIR))]
+  {
+    operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
+    operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
+    operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode);
+  })
+
+(define_insn_and_split "fma_<mode>_fcmaddc_bcst"
+  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
+	(unspec:VF_AVX512FP16VL
+	  [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
+	   (subreg:VF_AVX512FP16VL
+	     (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0)
+	   (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
+	   UNSPEC_COMPLEX_FCMA))]
+  "TARGET_AVX512FP16"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(unspec:<ssePSmode>
+	  [(match_dup 1) (match_dup 2) (match_dup 3)]
+	   UNSPEC_COMPLEX_FCMA_PAIR))]
+  {
+    operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
+    operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
+    operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode);
+  })
+
 (define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>"
   [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
 	(vec_merge:VF_AVX512FP16VL
diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
new file mode 100644
index 00000000000..3c8e84230f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } }  */
+
+#include <immintrin.h>
+
+volatile __m512h res0, a0, c0;
+volatile __m256h res1, a1, c1;
+volatile __m128h res2, a2, c2;
+volatile _Float16 *b;
+
+void extern
+avx_test(void)
+{
+  res0 = _mm512_fmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+  res0 = _mm512_fcmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
+
+  res1 = _mm256_fmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+  res1 = _mm256_fcmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
+
+  res2 =  _mm_fmadd_pch (a2, _mm_set1_pch(*(b + 2 * 6)), c2);
+  res2 =  _mm_fcmadd_pch (a2, _mm_set1_pch(*(b + 2 * 6)), c2);
+}
--
2.18.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] i386: Optimization for mm512_set1_pch.
  2021-11-05  7:19 ` [PATCH] i386: Optimization for mm512_set1_pch Kong, Lingling
@ 2021-11-08  1:48   ` Hongtao Liu
  0 siblings, 0 replies; 2+ messages in thread
From: Hongtao Liu @ 2021-11-08  1:48 UTC (permalink / raw)
  To: Kong, Lingling; +Cc: gcc-patches, Liu, Hongtao

On Fri, Nov 5, 2021 at 3:20 PM Kong, Lingling via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> This patch is to support fold _mm512_fmadd_pch (a, _mm512_set1_pch(*(b)), c) to 1 instruction vfmaddcph (%rsp){1to16}, %zmm1, %zmm2.
> OK for master?
>
LGTM.
> gcc/ChangeLog:
>
>         * config/i386/sse.md (fma_<complexpairopname>_<mode>_pair):
>         Add new define_insn.
>         (fma_<mode>_fmaddc_bcst): Add new define_insn_and_split.
>         (fma_<mode>_fcmaddc_bcst): Likewise
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx512fp16vl-complex-broadcast-1.c: New test.
> ---
>  gcc/config/i386/sse.md                        | 62 +++++++++++++++++++
>  .../i386/avx512fp16vl-complex-broadcast-1.c   | 25 ++++++++
>  2 files changed, 87 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index 0a7f5b178f9..eba8e77515f 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -193,7 +193,9 @@
>
>    ;; For AVX512FP16 suppport
>    UNSPEC_COMPLEX_FMA
> +  UNSPEC_COMPLEX_FMA_PAIR
>    UNSPEC_COMPLEX_FCMA
> +  UNSPEC_COMPLEX_FCMA_PAIR
>    UNSPEC_COMPLEX_FMUL
>    UNSPEC_COMPLEX_FCMUL
>    UNSPEC_COMPLEX_MASK
> @@ -5913,6 +5915,9 @@
>  (define_int_iterator UNSPEC_COMPLEX_F_C_MA
>         [UNSPEC_COMPLEX_FMA UNSPEC_COMPLEX_FCMA])
>
> +(define_int_iterator UNSPEC_COMPLEX_F_C_MA_PAIR
> +       [UNSPEC_COMPLEX_FMA_PAIR UNSPEC_COMPLEX_FCMA_PAIR])
> +
>  (define_int_iterator UNSPEC_COMPLEX_F_C_MUL
>         [UNSPEC_COMPLEX_FMUL UNSPEC_COMPLEX_FCMUL])
>
> @@ -5922,6 +5927,10 @@
>          (UNSPEC_COMPLEX_FMUL "fmulc")
>          (UNSPEC_COMPLEX_FCMUL "fcmulc")])
>
> +(define_int_attr complexpairopname
> +       [(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
> +        (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
> +
>  (define_mode_attr complexmove
>    [(V32HF "avx512f_loadv16sf")
>     (V16HF "avx512vl_loadv8sf")
> @@ -6067,6 +6076,59 @@
>           [(match_dup 1) (match_dup 2) (match_dup 4)]
>            UNSPEC_COMPLEX_F_C_MA))])
>
> +(define_insn "fma_<complexpairopname>_<mode>_pair"
> + [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=&v")
> +       (unspec:VF1_AVX512VL
> +        [(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
> +         (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
> +         (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
> +         UNSPEC_COMPLEX_F_C_MA_PAIR))]
> + "TARGET_AVX512FP16"
> + "v<complexpairopname>ph\t{%2, %1, %0|%0, %1, %2}"
> + [(set_attr "type" "ssemuladd")])
> +
> +(define_insn_and_split "fma_<mode>_fmaddc_bcst"
> +  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
> +       (unspec:VF_AVX512FP16VL
> +         [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
> +          (subreg:VF_AVX512FP16VL
> +            (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0)
> +          (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
> +          UNSPEC_COMPLEX_FMA))]
> +  "TARGET_AVX512FP16"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (unspec:<ssePSmode>
> +         [(match_dup 1) (match_dup 2) (match_dup 3)]
> +          UNSPEC_COMPLEX_FMA_PAIR))]
> +  {
> +    operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
> +    operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
> +    operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode);
> +  })
> +
> +(define_insn_and_split "fma_<mode>_fcmaddc_bcst"
> +  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
> +       (unspec:VF_AVX512FP16VL
> +         [(match_operand:VF_AVX512FP16VL 1 "vector_operand")
> +          (subreg:VF_AVX512FP16VL
> +            (match_operand:<ssePSmode> 2 "bcst_vector_operand") 0)
> +          (match_operand:VF_AVX512FP16VL 3 "vector_operand")]
> +          UNSPEC_COMPLEX_FCMA))]
> +  "TARGET_AVX512FP16"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 0)
> +       (unspec:<ssePSmode>
> +         [(match_dup 1) (match_dup 2) (match_dup 3)]
> +          UNSPEC_COMPLEX_FCMA_PAIR))]
> +  {
> +    operands[0] = lowpart_subreg (<ssePSmode>mode, operands[0], <MODE>mode);
> +    operands[1] = lowpart_subreg (<ssePSmode>mode, operands[1], <MODE>mode);
> +    operands[3] = lowpart_subreg (<ssePSmode>mode, operands[3], <MODE>mode);
> +  })
> +
>  (define_insn "<avx512>_<complexopname>_<mode>_mask<round_name>"
>    [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=&v")
>         (vec_merge:VF_AVX512FP16VL
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
> new file mode 100644
> index 00000000000..3c8e84230f3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16vl-complex-broadcast-1.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
> +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 2 } }  */
> +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } }  */
> +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } }  */
> +
> +#include <immintrin.h>
> +
> +volatile __m512h res0, a0, c0;
> +volatile __m256h res1, a1, c1;
> +volatile __m128h res2, a2, c2;
> +volatile _Float16 *b;
> +
> +void extern
> +avx_test(void)
> +{
> +  res0 = _mm512_fmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
> +  res0 = _mm512_fcmadd_pch (a0, _mm512_set1_pch(*(b + 2 * 6)), c0);
> +
> +  res1 = _mm256_fmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
> +  res1 = _mm256_fcmadd_pch (a1, _mm256_set1_pch(*(b + 2 * 6)), c1);
> +
> +  res2 =  _mm_fmadd_pch (a2, _mm_set1_pch(*(b + 2 * 6)), c2);
> +  res2 =  _mm_fcmadd_pch (a2, _mm_set1_pch(*(b + 2 * 6)), c2);
> +}
> --
> 2.18.1
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-11-08  1:42 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20211105071827.7258-1-lingling.kong@intel.com>
2021-11-05  7:19 ` [PATCH] i386: Optimization for mm512_set1_pch Kong, Lingling
2021-11-08  1:48   ` Hongtao Liu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).