public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16
@ 2019-11-26 17:25 Dennis Zhang
  2019-11-29 13:02 ` Richard Sandiford
  0 siblings, 1 reply; 23+ messages in thread
From: Dennis Zhang @ 2019-11-26 17:25 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 1308 bytes --]

Hi all,

This patch is part of a series adding support for Armv8.6-A features.
It enables options including -march=armv8.6-a, +i8mm and +bf16.
The +i8mm and +bf16 features are mandatory for Armv8.6-a and optional 
for Armv8.2-a and onward.
Documents are at https://developer.arm.com/docs/ddi0596/latest

Regtested for aarch64-none-linux-gnu.

Please help to check if it's ready for trunk.

Many thanks!
Dennis

gcc/ChangeLog:

2019-11-26  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options.

gcc/testsuite/ChangeLog:

2019-11-26  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
	and bf16 features.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: cli-aarch-armv8.6-a+i8mm+bf16-20191126.patch --]
[-- Type: text/x-patch; name="cli-aarch-armv8.6-a+i8mm+bf16-20191126.patch", Size: 9168 bytes --]

diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def
index d258bd49244..e464d329c1a 100644
--- a/gcc/config/aarch64/aarch64-arches.def
+++ b/gcc/config/aarch64/aarch64-arches.def
@@ -36,5 +36,6 @@ AARCH64_ARCH("armv8.2-a",     generic,	     8_2A,	8,  AARCH64_FL_FOR_ARCH8_2)
 AARCH64_ARCH("armv8.3-a",     generic,	     8_3A,	8,  AARCH64_FL_FOR_ARCH8_3)
 AARCH64_ARCH("armv8.4-a",     generic,	     8_4A,	8,  AARCH64_FL_FOR_ARCH8_4)
 AARCH64_ARCH("armv8.5-a",     generic,	     8_5A,	8,  AARCH64_FL_FOR_ARCH8_5)
+AARCH64_ARCH("armv8.6-a",     generic,	     8_6A,	8,  AARCH64_FL_FOR_ARCH8_6)
 
 #undef AARCH64_ARCH
diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c
index f3da07fd28a..20d1e00552b 100644
--- a/gcc/config/aarch64/aarch64-c.c
+++ b/gcc/config/aarch64/aarch64-c.c
@@ -165,6 +165,12 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
   aarch64_def_or_undef (TARGET_RNG, "__ARM_FEATURE_RNG", pfile);
   aarch64_def_or_undef (TARGET_MEMTAG, "__ARM_FEATURE_MEMORY_TAGGING", pfile);
 
+  aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile);
+  aarch64_def_or_undef (TARGET_BF16_SIMD,
+			"__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile);
+  aarch64_def_or_undef (TARGET_BF16_FP,
+			"__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", pfile);
+
   /* Not for ACLE, but required to keep "float.h" correct if we switch
      target between implementations that do or do not support ARMv8.2-A
      16-bit floating-point extensions.  */
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index d3ae1b2431b..5b7c3b8a213 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -198,4 +198,14 @@ AARCH64_OPT_EXTENSION("sve2-bitperm", AARCH64_FL_SVE2_BITPERM, AARCH64_FL_SIMD |
 /* Enabling or disabling "tme" only changes "tme".  */
 AARCH64_OPT_EXTENSION("tme", AARCH64_FL_TME, 0, 0, false, "")
 
+/* Enabling "i8mm" also enables "simd".
+   Disabling "i8mm" only disables "i8mm".  */
+AARCH64_OPT_EXTENSION("i8mm", AARCH64_FL_I8MM, AARCH64_FL_SIMD, \
+		      0, false, "i8mm")
+
+/* Enabling "bf16" also enables "simd" and "fp".
+   Disabling "bf16" only disables "bf16".  */
+AARCH64_OPT_EXTENSION("bf16", AARCH64_FL_BF16, AARCH64_FL_SIMD | AARCH64_FL_FP,
+		      0, false, "bf16")
+
 #undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index ee01909abb9..7de99285e8a 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -202,6 +202,15 @@ extern unsigned aarch64_architecture_version;
 /* Transactional Memory Extension.  */
 #define AARCH64_FL_TME	      (1ULL << 33)  /* Has TME instructions.  */
 
+/* Armv8.6-A architecture extensions.  */
+#define AARCH64_FL_V8_6	      (1ULL << 34)
+
+/* 8-bit Integer Matrix Multiply (I8MM) extensions.  */
+#define AARCH64_FL_I8MM       (1ULL << 35)
+
+/* Brain half-precision floating-point (BFloat16) Extension.  */
+#define AARCH64_FL_BF16	      (1ULL << 36)
+
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
 
@@ -223,6 +232,9 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_FOR_ARCH8_5			\
   (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_5	\
    | AARCH64_FL_SB | AARCH64_FL_SSBS | AARCH64_FL_PREDRES)
+#define AARCH64_FL_FOR_ARCH8_6			\
+  (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \
+   | AARCH64_FL_I8MM | AARCH64_FL_BF16)
 
 /* Macros to test ISA flags.  */
 
@@ -249,6 +261,9 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_V8_5	   (aarch64_isa_flags & AARCH64_FL_V8_5)
 #define AARCH64_ISA_TME		   (aarch64_isa_flags & AARCH64_FL_TME)
 #define AARCH64_ISA_MEMTAG	   (aarch64_isa_flags & AARCH64_FL_MEMTAG)
+#define AARCH64_ISA_V8_6	   (aarch64_isa_flags & AARCH64_FL_V8_6)
+#define AARCH64_ISA_I8MM	   (aarch64_isa_flags & AARCH64_FL_I8MM)
+#define AARCH64_ISA_BF16	   (aarch64_isa_flags & AARCH64_FL_BF16)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -308,6 +323,13 @@ extern unsigned aarch64_architecture_version;
 /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
 #define TARGET_MEMTAG (AARCH64_ISA_V8_5 && AARCH64_ISA_MEMTAG)
 
+/* I8MM instructions are enabled through +i8mm.  */
+#define TARGET_I8MM (TARGET_SIMD && AARCH64_ISA_I8MM)
+
+/* BF16 instructions are enabled through +bf16.  */
+#define TARGET_BF16_FP (AARCH64_ISA_BF16 && TARGET_FLOAT)
+#define TARGET_BF16_SIMD (AARCH64_ISA_BF16 && TARGET_SIMD)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 403be47d893..30647e52c07 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16045,7 +16045,10 @@ more feature modifiers.  This option has the form
 
 The permissible values for @var{arch} are @samp{armv8-a},
 @samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
-@samp{armv8.5-a} or @var{native}.
+@samp{armv8.5-a}, @samp{armv8.6-a} or @var{native}.
+
+The value @samp{armv8.6-a} implies @samp{armv8.5-a} and enables compiler
+support for the ARMv8.6-A architecture extensions.
 
 The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
 support for the ARMv8.5-A architecture extensions.
@@ -16276,6 +16279,7 @@ generation.  This option is enabled by default for @option{-march=armv8.5-a}.
 Enable the Armv8-a Execution and Data Prediction Restriction instructions.
 This option is only to enable the extension at the assembler level and does
 not affect code generation.  This option is enabled by default for
+@option{-march=armv8.5-a}.
 @item sve2
 Enable the Armv8-a Scalable Vector Extension 2.  This also enables SVE
 instructions.
@@ -16287,9 +16291,18 @@ Enable SVE2 sm4 instructions.  This also enables SVE2 instructions.
 Enable SVE2 aes instructions.  This also enables SVE2 instructions.
 @item sve2-sha3
 Enable SVE2 sha3 instructions.  This also enables SVE2 instructions.
-@option{-march=armv8.5-a}.
 @item tme
 Enable the Transactional Memory Extension.
+@item i8mm
+Enable 8-bit Integer Matrix Multiply instructions.  This also enables
+Advanced SIMD instructions.  This option is enabled by default for
+@option{-march=armv8.6-a}.  Use of this option with architectures prior to
+Armv8.2-A is not supported.
+@item bf16
+Enable brain half-precision floating-point instructions.  This also enables
+Advanced SIMD and floating-point instructions.  This option is enabled by
+default for @option{-march=armv8.6-a}.  Use of this option with architectures
+prior to Armv8.2-A is not supported.
 
 @end table
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
index 608b89d19ce..2983e271114 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
@@ -13,6 +13,59 @@
 #error "__ARM_FEATURE_TME is defined but should not be!"
 #endif
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a")
+#ifndef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+#ifndef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
+#endif
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a")
+#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
+#endif
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16+nosimd")
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!"
+#endif
+
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+
 int
 foo (int a)
 {

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16
  2019-11-26 17:25 [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16 Dennis Zhang
@ 2019-11-29 13:02 ` Richard Sandiford
  2019-12-05 15:31   ` Dennis Zhang
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Sandiford @ 2019-11-29 13:02 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

Hi Dennis,

Sorry for the slow response.

Dennis Zhang <Dennis.Zhang@arm.com> writes:
> Hi all,
>
> This patch is part of a series adding support for Armv8.6-A features.
> It enables options including -march=armv8.6-a, +i8mm and +bf16.
> The +i8mm and +bf16 features are mandatory for Armv8.6-a and optional 
> for Armv8.2-a and onward.
> Documents are at https://developer.arm.com/docs/ddi0596/latest
>
> Regtested for aarch64-none-linux-gnu.
>
> Please help to check if it's ready for trunk.
>
> Many thanks!
> Dennis
>
> gcc/ChangeLog:
>
> 2019-11-26  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
> 	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
> 	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
> 	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
> 	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
> 	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
> 	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
> 	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
> 	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
> 	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options.
>
> gcc/testsuite/ChangeLog:
>
> 2019-11-26  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
> 	and bf16 features.
>
> diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def
> index d258bd49244..e464d329c1a 100644
> --- a/gcc/config/aarch64/aarch64-arches.def
> +++ b/gcc/config/aarch64/aarch64-arches.def
> @@ -36,5 +36,6 @@ AARCH64_ARCH("armv8.2-a",     generic,	     8_2A,	8,  AARCH64_FL_FOR_ARCH8_2)
>  AARCH64_ARCH("armv8.3-a",     generic,	     8_3A,	8,  AARCH64_FL_FOR_ARCH8_3)
>  AARCH64_ARCH("armv8.4-a",     generic,	     8_4A,	8,  AARCH64_FL_FOR_ARCH8_4)
>  AARCH64_ARCH("armv8.5-a",     generic,	     8_5A,	8,  AARCH64_FL_FOR_ARCH8_5)
> +AARCH64_ARCH("armv8.6-a",     generic,	     8_6A,	8,  AARCH64_FL_FOR_ARCH8_6)
>  
>  #undef AARCH64_ARCH
> diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c
> index f3da07fd28a..20d1e00552b 100644
> --- a/gcc/config/aarch64/aarch64-c.c
> +++ b/gcc/config/aarch64/aarch64-c.c
> @@ -165,6 +165,12 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
>    aarch64_def_or_undef (TARGET_RNG, "__ARM_FEATURE_RNG", pfile);
>    aarch64_def_or_undef (TARGET_MEMTAG, "__ARM_FEATURE_MEMORY_TAGGING", pfile);
>  
> +  aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile);
> +  aarch64_def_or_undef (TARGET_BF16_SIMD,
> +			"__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile);
> +  aarch64_def_or_undef (TARGET_BF16_FP,
> +			"__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", pfile);
> +
>    /* Not for ACLE, but required to keep "float.h" correct if we switch
>       target between implementations that do or do not support ARMv8.2-A
>       16-bit floating-point extensions.  */
> diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
> index d3ae1b2431b..5b7c3b8a213 100644
> --- a/gcc/config/aarch64/aarch64-option-extensions.def
> +++ b/gcc/config/aarch64/aarch64-option-extensions.def
> @@ -198,4 +198,14 @@ AARCH64_OPT_EXTENSION("sve2-bitperm", AARCH64_FL_SVE2_BITPERM, AARCH64_FL_SIMD |
>  /* Enabling or disabling "tme" only changes "tme".  */
>  AARCH64_OPT_EXTENSION("tme", AARCH64_FL_TME, 0, 0, false, "")
>  
> +/* Enabling "i8mm" also enables "simd".
> +   Disabling "i8mm" only disables "i8mm".  */
> +AARCH64_OPT_EXTENSION("i8mm", AARCH64_FL_I8MM, AARCH64_FL_SIMD, \
> +		      0, false, "i8mm")

We have to maintain the transitive closure of features by hand,
so anything that enables AARCH64_FL_SIMD also needs to enable
AARCH64_FL_FP.

We should also add i8mm to the list of things that +nosimd and +nofp
disable.

(It would be better to do this automatically, but that's future work.)

> +/* Enabling "bf16" also enables "simd" and "fp".
> +   Disabling "bf16" only disables "bf16".  */
> +AARCH64_OPT_EXTENSION("bf16", AARCH64_FL_BF16, AARCH64_FL_SIMD | AARCH64_FL_FP,
> +		      0, false, "bf16")

Similarly here we should add bf16 to the list of things that +nofp disables.

> @@ -308,6 +323,13 @@ extern unsigned aarch64_architecture_version;
>  /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
>  #define TARGET_MEMTAG (AARCH64_ISA_V8_5 && AARCH64_ISA_MEMTAG)
>  
> +/* I8MM instructions are enabled through +i8mm.  */
> +#define TARGET_I8MM (TARGET_SIMD && AARCH64_ISA_I8MM)

This should then just be AARCH64_ISA_I8MM (i.e. no need to test
TARGET_SIMD).

> +
> +/* BF16 instructions are enabled through +bf16.  */
> +#define TARGET_BF16_FP (AARCH64_ISA_BF16 && TARGET_FLOAT)

Similarly here we don't need a test for TARGET_FLOAT.

> +#define TARGET_BF16_SIMD (AARCH64_ISA_BF16 && TARGET_SIMD)
> +
>  /* Make sure this is always defined so we don't have to check for ifdefs
>     but rather use normal ifs.  */
>  #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 403be47d893..30647e52c07 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -16045,7 +16045,10 @@ more feature modifiers.  This option has the form
>  
>  The permissible values for @var{arch} are @samp{armv8-a},
>  @samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
> -@samp{armv8.5-a} or @var{native}.
> +@samp{armv8.5-a}, @samp{armv8.6-a} or @var{native}.
> +
> +The value @samp{armv8.6-a} implies @samp{armv8.5-a} and enables compiler
> +support for the ARMv8.6-A architecture extensions.

This then goes on to:

-----------------------------------------------------------------
The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
support for the ARMv8.5-A architecture extensions.

The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler
support for the ARMv8.4-A architecture extensions.

The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler
support for the ARMv8.3-A architecture extensions.

The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler
support for the ARMv8.2-A architecture extensions.

The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler
support for the ARMv8.1-A architecture extension.  In particular, it
enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features.
-----------------------------------------------------------------

I don't think we'd have written it like this if we'd been developing
and submitting Armv8-A to Armv8.6-A (or Armv8.5-A) all in one go.
It's just kind of grown by copying what Armv8.(x-1)-A did.

How about replacing "The permissible values..." onwards with something like:

-----------------------------------------------------------------
The table below summarizes the permissible values for @var{arch}
and the features that they enable by default:

@multitable @columnfractions .25 .25 .50
@headitem @var{arch} value @tab Architecture @tab Includes by default
...
@item @samp{armv8.6-a} @tab Armv8.6
@tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm}
@end multitable
-----------------------------------------------------------------

(Completely untested.)

Or we could put "armv8.5-a" and "bf16, i8mm" (with or without "+"s)
into separate columns, perhaps "Extends" and "Includes by default".

(Any of those options would be fine with me FWIW.)

> @@ -16276,6 +16279,7 @@ generation.  This option is enabled by default for @option{-march=armv8.5-a}.
>  Enable the Armv8-a Execution and Data Prediction Restriction instructions.
>  This option is only to enable the extension at the assembler level and does
>  not affect code generation.  This option is enabled by default for
> +@option{-march=armv8.5-a}.
>  @item sve2
>  Enable the Armv8-a Scalable Vector Extension 2.  This also enables SVE
>  instructions.
> @@ -16287,9 +16291,18 @@ Enable SVE2 sm4 instructions.  This also enables SVE2 instructions.
>  Enable SVE2 aes instructions.  This also enables SVE2 instructions.
>  @item sve2-sha3
>  Enable SVE2 sha3 instructions.  This also enables SVE2 instructions.
> -@option{-march=armv8.5-a}.
>  @item tme
>  Enable the Transactional Memory Extension.
> +@item i8mm
> +Enable 8-bit Integer Matrix Multiply instructions.  This also enables
> +Advanced SIMD instructions.  This option is enabled by default for

After the above: "Advanced SIMD and floating-point"

> +@option{-march=armv8.6-a}.  Use of this option with architectures prior to
> +Armv8.2-A is not supported.
> +@item bf16
> +Enable brain half-precision floating-point instructions.  This also enables
> +Advanced SIMD and floating-point instructions.  This option is enabled by
> +default for @option{-march=armv8.6-a}.  Use of this option with architectures
> +prior to Armv8.2-A is not supported.
>  
>  @end table
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
> index 608b89d19ce..2983e271114 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
> @@ -13,6 +13,59 @@
>  #error "__ARM_FEATURE_TME is defined but should not be!"
>  #endif
>  
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.6-a")
> +#ifndef __ARM_FEATURE_MATMUL_INT8
> +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
> +#endif
> +#pragma GCC pop_options
> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+i8mm")
> +#ifndef __ARM_FEATURE_MATMUL_INT8
> +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
> +#endif
> +#pragma GCC pop_options
> +
> +#ifdef __ARM_FEATURE_MATMUL_INT8
> +#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
> +#endif

Not your bug, but we should start the file with:

#pragma GCC target ("arch=armv8-a")

otherwise anyone running the testsuite with -march=armv8.6-a will
get failures here.

> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.6-a")
> +#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
> +#endif
> +#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
> +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
> +#endif
> +#pragma GCC pop_options
> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
> +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
> +#endif
> +#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
> +#endif
> +#pragma GCC pop_options
> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16+nosimd")
> +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
> +#endif
> +#pragma GCC pop_options
> +

For completeness, we might as well test __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
is defined.

> +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
> +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!"
> +#endif
> +
> +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
> +#endif

Very, very minor, but since the others have no blank line between the
two tests, I think it'd be more consistent not to have one here either.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16
  2019-11-29 13:02 ` Richard Sandiford
@ 2019-12-05 15:31   ` Dennis Zhang
  2019-12-06 10:22     ` Richard Sandiford
  0 siblings, 1 reply; 23+ messages in thread
From: Dennis Zhang @ 2019-12-05 15:31 UTC (permalink / raw)
  To: gcc-patches, nd, Richard Earnshaw, James Greenhalgh,
	Marcus Shawcroft, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 13821 bytes --]

Hi Richard,

On 29/11/2019 13:00, Richard Sandiford wrote:
> Hi Dennis,
> 
> Sorry for the slow response.
> 
> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>> Hi all,
>>
>> This patch is part of a series adding support for Armv8.6-A features.
>> It enables options including -march=armv8.6-a, +i8mm and +bf16.
>> The +i8mm and +bf16 features are mandatory for Armv8.6-a and optional
>> for Armv8.2-a and onward.
>> Documents are at https://developer.arm.com/docs/ddi0596/latest
>>
>> Regtested for aarch64-none-linux-gnu.
>>
>> Please help to check if it's ready for trunk.
>>
>> Many thanks!
>> Dennis
>>
>> gcc/ChangeLog:
>>
>> 2019-11-26  Dennis Zhang  <dennis.zhang@arm.com>
>>
>> 	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
>> 	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
>> 	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
>> 	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
>> 	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
>> 	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
>> 	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
>> 	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
>> 	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
>> 	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-11-26  Dennis Zhang  <dennis.zhang@arm.com>
>>
>> 	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
>> 	and bf16 features.
>>
>> diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def
>> index d258bd49244..e464d329c1a 100644
>> --- a/gcc/config/aarch64/aarch64-arches.def
>> +++ b/gcc/config/aarch64/aarch64-arches.def
>> @@ -36,5 +36,6 @@ AARCH64_ARCH("armv8.2-a",     generic,	     8_2A,	8,  AARCH64_FL_FOR_ARCH8_2)
>>   AARCH64_ARCH("armv8.3-a",     generic,	     8_3A,	8,  AARCH64_FL_FOR_ARCH8_3)
>>   AARCH64_ARCH("armv8.4-a",     generic,	     8_4A,	8,  AARCH64_FL_FOR_ARCH8_4)
>>   AARCH64_ARCH("armv8.5-a",     generic,	     8_5A,	8,  AARCH64_FL_FOR_ARCH8_5)
>> +AARCH64_ARCH("armv8.6-a",     generic,	     8_6A,	8,  AARCH64_FL_FOR_ARCH8_6)
>>   
>>   #undef AARCH64_ARCH
>> diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c
>> index f3da07fd28a..20d1e00552b 100644
>> --- a/gcc/config/aarch64/aarch64-c.c
>> +++ b/gcc/config/aarch64/aarch64-c.c
>> @@ -165,6 +165,12 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
>>     aarch64_def_or_undef (TARGET_RNG, "__ARM_FEATURE_RNG", pfile);
>>     aarch64_def_or_undef (TARGET_MEMTAG, "__ARM_FEATURE_MEMORY_TAGGING", pfile);
>>   
>> +  aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile);
>> +  aarch64_def_or_undef (TARGET_BF16_SIMD,
>> +			"__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile);
>> +  aarch64_def_or_undef (TARGET_BF16_FP,
>> +			"__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", pfile);
>> +
>>     /* Not for ACLE, but required to keep "float.h" correct if we switch
>>        target between implementations that do or do not support ARMv8.2-A
>>        16-bit floating-point extensions.  */
>> diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
>> index d3ae1b2431b..5b7c3b8a213 100644
>> --- a/gcc/config/aarch64/aarch64-option-extensions.def
>> +++ b/gcc/config/aarch64/aarch64-option-extensions.def
>> @@ -198,4 +198,14 @@ AARCH64_OPT_EXTENSION("sve2-bitperm", AARCH64_FL_SVE2_BITPERM, AARCH64_FL_SIMD |
>>   /* Enabling or disabling "tme" only changes "tme".  */
>>   AARCH64_OPT_EXTENSION("tme", AARCH64_FL_TME, 0, 0, false, "")
>>   
>> +/* Enabling "i8mm" also enables "simd".
>> +   Disabling "i8mm" only disables "i8mm".  */
>> +AARCH64_OPT_EXTENSION("i8mm", AARCH64_FL_I8MM, AARCH64_FL_SIMD, \
>> +		      0, false, "i8mm")
> 
> We have to maintain the transitive closure of features by hand,
> so anything that enables AARCH64_FL_SIMD also needs to enable
> AARCH64_FL_FP.
> 
> We should also add i8mm to the list of things that +nosimd and +nofp
> disable.
> 
> (It would be better to do this automatically, but that's future work.)
> 
>> +/* Enabling "bf16" also enables "simd" and "fp".
>> +   Disabling "bf16" only disables "bf16".  */
>> +AARCH64_OPT_EXTENSION("bf16", AARCH64_FL_BF16, AARCH64_FL_SIMD | AARCH64_FL_FP,
>> +		      0, false, "bf16")
> 
> Similarly here we should add bf16 to the list of things that +nofp disables.
> 
>> @@ -308,6 +323,13 @@ extern unsigned aarch64_architecture_version;
>>   /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
>>   #define TARGET_MEMTAG (AARCH64_ISA_V8_5 && AARCH64_ISA_MEMTAG)
>>   
>> +/* I8MM instructions are enabled through +i8mm.  */
>> +#define TARGET_I8MM (TARGET_SIMD && AARCH64_ISA_I8MM)
> 
> This should then just be AARCH64_ISA_I8MM (i.e. no need to test
> TARGET_SIMD).
> 
>> +
>> +/* BF16 instructions are enabled through +bf16.  */
>> +#define TARGET_BF16_FP (AARCH64_ISA_BF16 && TARGET_FLOAT)
> 
> Similarly here we don't need a test for TARGET_FLOAT.
> 

The updated patch has added fp to i8mm 'also enable' list.
And i8mm and bf16 are added to the disabling lists of nofp and nosimd.
Also removed unnecessary checking of flags.

>> +#define TARGET_BF16_SIMD (AARCH64_ISA_BF16 && TARGET_SIMD)
>> +
>>   /* Make sure this is always defined so we don't have to check for ifdefs
>>      but rather use normal ifs.  */
>>   #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index 403be47d893..30647e52c07 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -16045,7 +16045,10 @@ more feature modifiers.  This option has the form
>>   
>>   The permissible values for @var{arch} are @samp{armv8-a},
>>   @samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
>> -@samp{armv8.5-a} or @var{native}.
>> +@samp{armv8.5-a}, @samp{armv8.6-a} or @var{native}.
>> +
>> +The value @samp{armv8.6-a} implies @samp{armv8.5-a} and enables compiler
>> +support for the ARMv8.6-A architecture extensions.
> 
> This then goes on to:
> 
> -----------------------------------------------------------------
> The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
> support for the ARMv8.5-A architecture extensions.
> 
> The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler
> support for the ARMv8.4-A architecture extensions.
> 
> The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler
> support for the ARMv8.3-A architecture extensions.
> 
> The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler
> support for the ARMv8.2-A architecture extensions.
> 
> The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler
> support for the ARMv8.1-A architecture extension.  In particular, it
> enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features.
> -----------------------------------------------------------------
> 
> I don't think we'd have written it like this if we'd been developing
> and submitting Armv8-A to Armv8.6-A (or Armv8.5-A) all in one go.
> It's just kind of grown by copying what Armv8.(x-1)-A did.
> 
> How about replacing "The permissible values..." onwards with something like:
> 
> -----------------------------------------------------------------
> The table below summarizes the permissible values for @var{arch}
> and the features that they enable by default:
> 
> @multitable @columnfractions .25 .25 .50
> @headitem @var{arch} value @tab Architecture @tab Includes by default
> ...
> @item @samp{armv8.6-a} @tab Armv8.6
> @tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm}
> @end multitable
> -----------------------------------------------------------------
> 
> (Completely untested.)
> 
> Or we could put "armv8.5-a" and "bf16, i8mm" (with or without "+"s)
> into separate columns, perhaps "Extends" and "Includes by default".
> 
> (Any of those options would be fine with me FWIW.)
> 

This part is modified according to your comment.
Thanks for the detailed suggestions.

>> @@ -16276,6 +16279,7 @@ generation.  This option is enabled by default for @option{-march=armv8.5-a}.
>>   Enable the Armv8-a Execution and Data Prediction Restriction instructions.
>>   This option is only to enable the extension at the assembler level and does
>>   not affect code generation.  This option is enabled by default for
>> +@option{-march=armv8.5-a}.
>>   @item sve2
>>   Enable the Armv8-a Scalable Vector Extension 2.  This also enables SVE
>>   instructions.
>> @@ -16287,9 +16291,18 @@ Enable SVE2 sm4 instructions.  This also enables SVE2 instructions.
>>   Enable SVE2 aes instructions.  This also enables SVE2 instructions.
>>   @item sve2-sha3
>>   Enable SVE2 sha3 instructions.  This also enables SVE2 instructions.
>> -@option{-march=armv8.5-a}.
>>   @item tme
>>   Enable the Transactional Memory Extension.
>> +@item i8mm
>> +Enable 8-bit Integer Matrix Multiply instructions.  This also enables
>> +Advanced SIMD instructions.  This option is enabled by default for
> 
> After the above: "Advanced SIMD and floating-point"
> 
>> +@option{-march=armv8.6-a}.  Use of this option with architectures prior to
>> +Armv8.2-A is not supported.
>> +@item bf16
>> +Enable brain half-precision floating-point instructions.  This also enables
>> +Advanced SIMD and floating-point instructions.  This option is enabled by
>> +default for @option{-march=armv8.6-a}.  Use of this option with architectures
>> +prior to Armv8.2-A is not supported.
>>   
>>   @end table
>>   
>> diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>> index 608b89d19ce..2983e271114 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>> @@ -13,6 +13,59 @@
>>   #error "__ARM_FEATURE_TME is defined but should not be!"
>>   #endif
>>   
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.6-a")
>> +#ifndef __ARM_FEATURE_MATMUL_INT8
>> +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
>> +#endif
>> +#pragma GCC pop_options
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+i8mm")
>> +#ifndef __ARM_FEATURE_MATMUL_INT8
>> +#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
>> +#endif
>> +#pragma GCC pop_options
>> +
>> +#ifdef __ARM_FEATURE_MATMUL_INT8
>> +#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
>> +#endif
> 
> Not your bug, but we should start the file with:
> 
> #pragma GCC target ("arch=armv8-a")
> 
> otherwise anyone running the testsuite with -march=armv8.6-a will
> get failures here.
> 
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.6-a")
>> +#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
>> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
>> +#endif
>> +#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
>> +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
>> +#endif
>> +#pragma GCC pop_options
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>> +#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
>> +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
>> +#endif
>> +#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
>> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
>> +#endif
>> +#pragma GCC pop_options
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+bf16+nosimd")
>> +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
>> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
>> +#endif
>> +#pragma GCC pop_options
>> +
> 
> For completeness, we might as well test __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
> is defined.
> 
>> +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
>> +#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!"
>> +#endif
>> +
>> +#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
>> +#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
>> +#endif
> 
> Very, very minor, but since the others have no blank line between the
> two tests, I think it'd be more consistent not to have one here either.
> 

The tests are updated with several new cases to show that nofp and nosimd 
work properly for i8mm and bf16.

Finally, the ChangeLog is updated as follows:

gcc/ChangeLog:

2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
	(fp): Disabling fp also disables i8mm and bf16.
	(simd): Disabling simd also disables i8mm.
	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options. Add
	a new table to list permissible values for ARCH.

gcc/testsuite/ChangeLog:

2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
	and bf16 features.


Many thanks!
Dennis


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: cli-aarch-armv8.6-a+i8mm+bf16-20191202-2.patch --]
[-- Type: text/x-patch; name="cli-aarch-armv8.6-a+i8mm+bf16-20191202-2.patch", Size: 13436 bytes --]

diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def
index d258bd49244..e464d329c1a 100644
--- a/gcc/config/aarch64/aarch64-arches.def
+++ b/gcc/config/aarch64/aarch64-arches.def
@@ -36,5 +36,6 @@ AARCH64_ARCH("armv8.2-a",     generic,	     8_2A,	8,  AARCH64_FL_FOR_ARCH8_2)
 AARCH64_ARCH("armv8.3-a",     generic,	     8_3A,	8,  AARCH64_FL_FOR_ARCH8_3)
 AARCH64_ARCH("armv8.4-a",     generic,	     8_4A,	8,  AARCH64_FL_FOR_ARCH8_4)
 AARCH64_ARCH("armv8.5-a",     generic,	     8_5A,	8,  AARCH64_FL_FOR_ARCH8_5)
+AARCH64_ARCH("armv8.6-a",     generic,	     8_6A,	8,  AARCH64_FL_FOR_ARCH8_6)
 
 #undef AARCH64_ARCH
diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c
index f3da07fd28a..20d1e00552b 100644
--- a/gcc/config/aarch64/aarch64-c.c
+++ b/gcc/config/aarch64/aarch64-c.c
@@ -165,6 +165,12 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
   aarch64_def_or_undef (TARGET_RNG, "__ARM_FEATURE_RNG", pfile);
   aarch64_def_or_undef (TARGET_MEMTAG, "__ARM_FEATURE_MEMORY_TAGGING", pfile);
 
+  aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile);
+  aarch64_def_or_undef (TARGET_BF16_SIMD,
+			"__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile);
+  aarch64_def_or_undef (TARGET_BF16_FP,
+			"__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", pfile);
+
   /* Not for ACLE, but required to keep "float.h" correct if we switch
      target between implementations that do or do not support ARMv8.2-A
      16-bit floating-point extensions.  */
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index d3ae1b2431b..52c0fb79f82 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -52,25 +52,27 @@
 
 /* Enabling "fp" just enables "fp".
    Disabling "fp" also disables "simd", "crypto", "fp16", "aes", "sha2",
-   "sha3", sm3/sm4, "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", and
-   "sve2-bitperm".  */
+   "sha3", sm3/sm4, "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4",
+   "sve2-bitperm", "i8mm" and "bf16".  */
 AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | \
 		      AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | \
 		      AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | \
 		      AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_AES | \
 		      AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4 | \
-		      AARCH64_FL_SVE2_BITPERM, false, "fp")
+		      AARCH64_FL_SVE2_BITPERM | AARCH64_FL_I8MM | \
+		      AARCH64_FL_BF16, false, "fp")
 
 /* Enabling "simd" also enables "fp".
    Disabling "simd" also disables "crypto", "dotprod", "aes", "sha2", "sha3",
-   "sm3/sm4", "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", and
-   "sve2-bitperm".  */
+   "sm3/sm4", "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4",
+   "sve2-bitperm", and "i8mm".  */
 AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, \
 		      AARCH64_FL_CRYPTO | AARCH64_FL_DOTPROD | \
 		      AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | \
 		      AARCH64_FL_SM4 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | \
 		      AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \
-		      AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM, false, \
+		      AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM | \
+		      AARCH64_FL_I8MM, false, \
 		      "asimd")
 
 /* Enabling "crypto" also enables "fp", "simd", "aes" and "sha2".
@@ -198,4 +200,14 @@ AARCH64_OPT_EXTENSION("sve2-bitperm", AARCH64_FL_SVE2_BITPERM, AARCH64_FL_SIMD |
 /* Enabling or disabling "tme" only changes "tme".  */
 AARCH64_OPT_EXTENSION("tme", AARCH64_FL_TME, 0, 0, false, "")
 
+/* Enabling "i8mm" also enables "simd" and "fp".
+   Disabling "i8mm" only disables "i8mm".  */
+AARCH64_OPT_EXTENSION("i8mm", AARCH64_FL_I8MM, \
+		      AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "i8mm")
+
+/* Enabling "bf16" also enables "simd" and "fp".
+   Disabling "bf16" only disables "bf16".  */
+AARCH64_OPT_EXTENSION("bf16", AARCH64_FL_BF16, \
+		      AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "bf16")
+
 #undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index ee01909abb9..1408e7989b6 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -202,6 +202,15 @@ extern unsigned aarch64_architecture_version;
 /* Transactional Memory Extension.  */
 #define AARCH64_FL_TME	      (1ULL << 33)  /* Has TME instructions.  */
 
+/* Armv8.6-A architecture extensions.  */
+#define AARCH64_FL_V8_6	      (1ULL << 34)
+
+/* 8-bit Integer Matrix Multiply (I8MM) extensions.  */
+#define AARCH64_FL_I8MM	      (1ULL << 35)
+
+/* Brain half-precision floating-point (BFloat16) Extension.  */
+#define AARCH64_FL_BF16	      (1ULL << 36)
+
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
 
@@ -223,6 +232,9 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_FOR_ARCH8_5			\
   (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_5	\
    | AARCH64_FL_SB | AARCH64_FL_SSBS | AARCH64_FL_PREDRES)
+#define AARCH64_FL_FOR_ARCH8_6			\
+  (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \
+   | AARCH64_FL_I8MM | AARCH64_FL_BF16)
 
 /* Macros to test ISA flags.  */
 
@@ -249,6 +261,9 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_V8_5	   (aarch64_isa_flags & AARCH64_FL_V8_5)
 #define AARCH64_ISA_TME		   (aarch64_isa_flags & AARCH64_FL_TME)
 #define AARCH64_ISA_MEMTAG	   (aarch64_isa_flags & AARCH64_FL_MEMTAG)
+#define AARCH64_ISA_V8_6	   (aarch64_isa_flags & AARCH64_FL_V8_6)
+#define AARCH64_ISA_I8MM	   (aarch64_isa_flags & AARCH64_FL_I8MM)
+#define AARCH64_ISA_BF16	   (aarch64_isa_flags & AARCH64_FL_BF16)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -308,6 +323,13 @@ extern unsigned aarch64_architecture_version;
 /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
 #define TARGET_MEMTAG (AARCH64_ISA_V8_5 && AARCH64_ISA_MEMTAG)
 
+/* I8MM instructions are enabled through +i8mm.  */
+#define TARGET_I8MM (AARCH64_ISA_I8MM)
+
+/* BF16 instructions are enabled through +bf16.  */
+#define TARGET_BF16_FP (AARCH64_ISA_BF16)
+#define TARGET_BF16_SIMD (AARCH64_ISA_BF16 && TARGET_SIMD)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d165f31a865..1192e8f4b06 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16050,25 +16050,22 @@ Specify the name of the target architecture and, optionally, one or
 more feature modifiers.  This option has the form
 @option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}.
 
-The permissible values for @var{arch} are @samp{armv8-a},
-@samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
-@samp{armv8.5-a} or @var{native}.
-
-The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
-support for the ARMv8.5-A architecture extensions.
-
-The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler
-support for the ARMv8.4-A architecture extensions.
-
-The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler
-support for the ARMv8.3-A architecture extensions.
-
-The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler
-support for the ARMv8.2-A architecture extensions.
-
-The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler
-support for the ARMv8.1-A architecture extension.  In particular, it
-enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features.
+The table below summarizes the permissible values for @var{arch}
+and the features that they enable by default:
+
+@multitable @columnfractions 0.20 0.20 0.60
+@headitem @var{arch} value @tab Architecture @tab Includes by default
+@item @samp{armv8.1-a} @tab Armv8.1-A
+@tab @samp{armv8-a}, @samp{+crc}, @samp{+lse}, @samp{+rdma}
+@item @samp{armv8.2-a} @tab Armv8.2-A @tab @samp{armv8.1-a}
+@item @samp{armv8.3-a} @tab Armv8.3-A @tab @samp{armv8.2-a}
+@item @samp{armv8.4-a} @tab Armv8.4-A
+@tab @samp{armv8.3-a}, @samp{+fp16fml}, @samp{+dotprod}
+@item @samp{armv8.5-a} @tab Armv8.5-A
+@tab @samp{armv8.4-a}, @samp{+sb}, @samp{+ssbs}, @samp{+predres}
+@item @samp{armv8.6-a} @tab Armv8.6-A
+@tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm}
+@end multitable
 
 The value @samp{native} is available on native AArch64 GNU/Linux and
 causes the compiler to pick the architecture of the host system.  This
@@ -16283,6 +16280,7 @@ generation.  This option is enabled by default for @option{-march=armv8.5-a}.
 Enable the Armv8-a Execution and Data Prediction Restriction instructions.
 This option is only to enable the extension at the assembler level and does
 not affect code generation.  This option is enabled by default for
+@option{-march=armv8.5-a}.
 @item sve2
 Enable the Armv8-a Scalable Vector Extension 2.  This also enables SVE
 instructions.
@@ -16294,9 +16292,18 @@ Enable SVE2 sm4 instructions.  This also enables SVE2 instructions.
 Enable SVE2 aes instructions.  This also enables SVE2 instructions.
 @item sve2-sha3
 Enable SVE2 sha3 instructions.  This also enables SVE2 instructions.
-@option{-march=armv8.5-a}.
 @item tme
 Enable the Transactional Memory Extension.
+@item i8mm
+Enable 8-bit Integer Matrix Multiply instructions.  This also enables
+Advanced SIMD and floating-point instructions.  This option is enabled by
+default for @option{-march=armv8.6-a}.  Use of this option with architectures
+prior to Armv8.2-A is not supported.
+@item bf16
+Enable brain half-precision floating-point instructions.  This also enables
+Advanced SIMD and floating-point instructions.  This option is enabled by
+default for @option{-march=armv8.6-a}.  Use of this option with architectures
+prior to Armv8.2-A is not supported.
 
 @end table
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
index 608b89d19ce..5ae39bc6cf0 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
@@ -13,6 +13,92 @@
 #error "__ARM_FEATURE_TME is defined but should not be!"
 #endif
 
+/* Test Armv8.6-a features.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8-a")
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
+#endif
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a")
+#ifndef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+#ifndef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a+nosimd")
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a+nofp")
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!"
+#endif
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a")
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16+nosimd")
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a+nofp")
+#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!"
+#endif
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC pop_options
+
 int
 foo (int a)
 {

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16
  2019-12-05 15:31   ` Dennis Zhang
@ 2019-12-06 10:22     ` Richard Sandiford
  2019-12-12 17:01       ` Dennis Zhang
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Sandiford @ 2019-12-06 10:22 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

Dennis Zhang <Dennis.Zhang@arm.com> writes:
> 2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
> 	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
> 	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
> 	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
> 	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
> 	(fp): Disabling fp also disables i8mm and bf16.
> 	(simd): Disabling simd also disables i8mm.
> 	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
> 	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
> 	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
> 	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
> 	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options. Add
> 	a new table to list permissible values for ARCH.
>
> gcc/testsuite/ChangeLog:
>
> 2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
> 	and bf16 features.

Thanks for the update, looks great.  A couple of comments below.

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index d165f31a865..1192e8f4b06 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -16050,25 +16050,22 @@ Specify the name of the target architecture and, optionally, one or
>  more feature modifiers.  This option has the form
>  @option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}.
>  
> -The permissible values for @var{arch} are @samp{armv8-a},
> -@samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
> -@samp{armv8.5-a} or @var{native}.
> -
> -The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
> -support for the ARMv8.5-A architecture extensions.
> -
> -The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler
> -support for the ARMv8.4-A architecture extensions.
> -
> -The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler
> -support for the ARMv8.3-A architecture extensions.
> -
> -The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler
> -support for the ARMv8.2-A architecture extensions.
> -
> -The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler
> -support for the ARMv8.1-A architecture extension.  In particular, it
> -enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features.
> +The table below summarizes the permissible values for @var{arch}
> +and the features that they enable by default:
> +
> +@multitable @columnfractions 0.20 0.20 0.60
> +@headitem @var{arch} value @tab Architecture @tab Includes by default

We should have an armv8-a entry here, something like:

@item @samp{armv8-a} @tab Armv8-A @tab @samp{+fp}, @samp{+simd}

> +@item @samp{armv8.1-a} @tab Armv8.1-A
> +@tab @samp{armv8-a}, @samp{+crc}, @samp{+lse}, @samp{+rdma}
> +@item @samp{armv8.2-a} @tab Armv8.2-A @tab @samp{armv8.1-a}
> +@item @samp{armv8.3-a} @tab Armv8.3-A @tab @samp{armv8.2-a}
> +@item @samp{armv8.4-a} @tab Armv8.4-A
> +@tab @samp{armv8.3-a}, @samp{+fp16fml}, @samp{+dotprod}
> +@item @samp{armv8.5-a} @tab Armv8.5-A
> +@tab @samp{armv8.4-a}, @samp{+sb}, @samp{+ssbs}, @samp{+predres}
> +@item @samp{armv8.6-a} @tab Armv8.6-A
> +@tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm}
> +@end multitable

I should have tried a proof of concept of this before suggesting it, sorry.
Trying the patch locally I get:

gcc.pod around line 18643: You can't have =items (as at line 18649) unless the first thing after the =over is an =item
POD document had syntax errors at /usr/bin/pod2man line 71.
Makefile:3363: recipe for target 'doc/gcc.1' failed
make: [doc/gcc.1] Error 1 (ignored)

(Odd that this is an ignored error, since we end up with an empty man page.)

I've posted a texi2pod.pl patch for that:

    https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00407.html

However, even with that patch, the script needs the full table row to be
on a single line, so I think we need to do that and live with the long lines.

> [...]
> diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
> index 608b89d19ce..5ae39bc6cf0 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
> @@ -13,6 +13,92 @@
>  #error "__ARM_FEATURE_TME is defined but should not be!"
>  #endif
>  
> +/* Test Armv8.6-a features.  */
> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8-a")

These two pragmas should be at the beginning of the file, so that we
start with base armv8-a for all the tests.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16
  2019-12-06 10:22     ` Richard Sandiford
@ 2019-12-12 17:01       ` Dennis Zhang
  2019-12-13 10:23         ` Richard Sandiford
  2020-10-29 12:19         ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Dennis Zhang
  0 siblings, 2 replies; 23+ messages in thread
From: Dennis Zhang @ 2019-12-12 17:01 UTC (permalink / raw)
  To: gcc-patches, nd, Richard Earnshaw, James Greenhalgh,
	Marcus Shawcroft, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 6229 bytes --]

Hi Richard,

On 06/12/2019 10:22, Richard Sandiford wrote:
> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>> 2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>
>>
>> 	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
>> 	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
>> 	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
>> 	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
>> 	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
>> 	(fp): Disabling fp also disables i8mm and bf16.
>> 	(simd): Disabling simd also disables i8mm.
>> 	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
>> 	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
>> 	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
>> 	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
>> 	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options. Add
>> 	a new table to list permissible values for ARCH.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>
>>
>> 	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
>> 	and bf16 features.
> 
> Thanks for the update, looks great.  A couple of comments below.
> 
>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> index d165f31a865..1192e8f4b06 100644
>> --- a/gcc/doc/invoke.texi
>> +++ b/gcc/doc/invoke.texi
>> @@ -16050,25 +16050,22 @@ Specify the name of the target architecture and, optionally, one or
>>   more feature modifiers.  This option has the form
>>   @option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}.
>>   
>> -The permissible values for @var{arch} are @samp{armv8-a},
>> -@samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
>> -@samp{armv8.5-a} or @var{native}.
>> -
>> -The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
>> -support for the ARMv8.5-A architecture extensions.
>> -
>> -The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler
>> -support for the ARMv8.4-A architecture extensions.
>> -
>> -The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler
>> -support for the ARMv8.3-A architecture extensions.
>> -
>> -The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler
>> -support for the ARMv8.2-A architecture extensions.
>> -
>> -The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler
>> -support for the ARMv8.1-A architecture extension.  In particular, it
>> -enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features.
>> +The table below summarizes the permissible values for @var{arch}
>> +and the features that they enable by default:
>> +
>> +@multitable @columnfractions 0.20 0.20 0.60
>> +@headitem @var{arch} value @tab Architecture @tab Includes by default
> 
> We should have an armv8-a entry here, something like:
> 
> @item @samp{armv8-a} @tab Armv8-A @tab @samp{+fp}, @samp{+simd}
> 

The armv8-a entry is added.

>> +@item @samp{armv8.1-a} @tab Armv8.1-A
>> +@tab @samp{armv8-a}, @samp{+crc}, @samp{+lse}, @samp{+rdma}
>> +@item @samp{armv8.2-a} @tab Armv8.2-A @tab @samp{armv8.1-a}
>> +@item @samp{armv8.3-a} @tab Armv8.3-A @tab @samp{armv8.2-a}
>> +@item @samp{armv8.4-a} @tab Armv8.4-A
>> +@tab @samp{armv8.3-a}, @samp{+fp16fml}, @samp{+dotprod}
>> +@item @samp{armv8.5-a} @tab Armv8.5-A
>> +@tab @samp{armv8.4-a}, @samp{+sb}, @samp{+ssbs}, @samp{+predres}
>> +@item @samp{armv8.6-a} @tab Armv8.6-A
>> +@tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm}
>> +@end multitable
> 
> I should have tried a proof of concept of this before suggesting it, sorry.
> Trying the patch locally I get:
> 
> gcc.pod around line 18643: You can't have =items (as at line 18649) unless the first thing after the =over is an =item
> POD document had syntax errors at /usr/bin/pod2man line 71.
> Makefile:3363: recipe for target 'doc/gcc.1' failed
> make: [doc/gcc.1] Error 1 (ignored)
> 
> (Odd that this is an ignored error, since we end up with an empty man page.)
> 
> I've posted a texi2pod.pl patch for that:
> 
>      https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00407.html
> 
> However, even with that patch, the script needs the full table row to be
> on a single line, so I think we need to do that and live with the long lines.
> 

The items are kept in a single line for each.

>> [...]
>> diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>> index 608b89d19ce..5ae39bc6cf0 100644
>> --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>> +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>> @@ -13,6 +13,92 @@
>>   #error "__ARM_FEATURE_TME is defined but should not be!"
>>   #endif
>>   
>> +/* Test Armv8.6-a features.  */
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8-a")
> 
> These two pragmas should be at the beginning of the file, so that we
> start with base armv8-a for all the tests.

The pragmas are moved to the top.

The ChangeLog is updated as below:

gcc/ChangeLog:

2019-12-12  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
	(fp): Disabling fp also disables i8mm and bf16.
	(simd): Disabling simd also disables i8mm.
	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options
	and add a new table to list permissible values for ARCH.

gcc/testsuite/ChangeLog:

2019-12-12  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
	and bf16 features.


Many thanks!
Dennis

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: cli-aarch-armv8.6-a+i8mm+bf16-20191210.patch --]
[-- Type: text/x-patch; name="cli-aarch-armv8.6-a+i8mm+bf16-20191210.patch", Size: 13666 bytes --]

diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def
index d258bd49244..e464d329c1a 100644
--- a/gcc/config/aarch64/aarch64-arches.def
+++ b/gcc/config/aarch64/aarch64-arches.def
@@ -36,5 +36,6 @@ AARCH64_ARCH("armv8.2-a",     generic,	     8_2A,	8,  AARCH64_FL_FOR_ARCH8_2)
 AARCH64_ARCH("armv8.3-a",     generic,	     8_3A,	8,  AARCH64_FL_FOR_ARCH8_3)
 AARCH64_ARCH("armv8.4-a",     generic,	     8_4A,	8,  AARCH64_FL_FOR_ARCH8_4)
 AARCH64_ARCH("armv8.5-a",     generic,	     8_5A,	8,  AARCH64_FL_FOR_ARCH8_5)
+AARCH64_ARCH("armv8.6-a",     generic,	     8_6A,	8,  AARCH64_FL_FOR_ARCH8_6)
 
 #undef AARCH64_ARCH
diff --git a/gcc/config/aarch64/aarch64-c.c b/gcc/config/aarch64/aarch64-c.c
index f3da07fd28a..20d1e00552b 100644
--- a/gcc/config/aarch64/aarch64-c.c
+++ b/gcc/config/aarch64/aarch64-c.c
@@ -165,6 +165,12 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
   aarch64_def_or_undef (TARGET_RNG, "__ARM_FEATURE_RNG", pfile);
   aarch64_def_or_undef (TARGET_MEMTAG, "__ARM_FEATURE_MEMORY_TAGGING", pfile);
 
+  aarch64_def_or_undef (TARGET_I8MM, "__ARM_FEATURE_MATMUL_INT8", pfile);
+  aarch64_def_or_undef (TARGET_BF16_SIMD,
+			"__ARM_FEATURE_BF16_VECTOR_ARITHMETIC", pfile);
+  aarch64_def_or_undef (TARGET_BF16_FP,
+			"__ARM_FEATURE_BF16_SCALAR_ARITHMETIC", pfile);
+
   /* Not for ACLE, but required to keep "float.h" correct if we switch
      target between implementations that do or do not support ARMv8.2-A
      16-bit floating-point extensions.  */
diff --git a/gcc/config/aarch64/aarch64-option-extensions.def b/gcc/config/aarch64/aarch64-option-extensions.def
index d3ae1b2431b..52c0fb79f82 100644
--- a/gcc/config/aarch64/aarch64-option-extensions.def
+++ b/gcc/config/aarch64/aarch64-option-extensions.def
@@ -52,25 +52,27 @@
 
 /* Enabling "fp" just enables "fp".
    Disabling "fp" also disables "simd", "crypto", "fp16", "aes", "sha2",
-   "sha3", sm3/sm4, "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", and
-   "sve2-bitperm".  */
+   "sha3", sm3/sm4, "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4",
+   "sve2-bitperm", "i8mm" and "bf16".  */
 AARCH64_OPT_EXTENSION("fp", AARCH64_FL_FP, 0, AARCH64_FL_SIMD | \
 		      AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | \
 		      AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | AARCH64_FL_SM4 | \
 		      AARCH64_FL_SVE | AARCH64_FL_SVE2 | AARCH64_FL_SVE2_AES | \
 		      AARCH64_FL_SVE2_SHA3 | AARCH64_FL_SVE2_SM4 | \
-		      AARCH64_FL_SVE2_BITPERM, false, "fp")
+		      AARCH64_FL_SVE2_BITPERM | AARCH64_FL_I8MM | \
+		      AARCH64_FL_BF16, false, "fp")
 
 /* Enabling "simd" also enables "fp".
    Disabling "simd" also disables "crypto", "dotprod", "aes", "sha2", "sha3",
-   "sm3/sm4", "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4", and
-   "sve2-bitperm".  */
+   "sm3/sm4", "sve", "sve2", "sve2-aes", "sve2-sha3", "sve2-sm4",
+   "sve2-bitperm", and "i8mm".  */
 AARCH64_OPT_EXTENSION("simd", AARCH64_FL_SIMD, AARCH64_FL_FP, \
 		      AARCH64_FL_CRYPTO | AARCH64_FL_DOTPROD | \
 		      AARCH64_FL_AES | AARCH64_FL_SHA2 | AARCH64_FL_SHA3 | \
 		      AARCH64_FL_SM4 | AARCH64_FL_SVE | AARCH64_FL_SVE2 | \
 		      AARCH64_FL_SVE2_AES | AARCH64_FL_SVE2_SHA3 | \
-		      AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM, false, \
+		      AARCH64_FL_SVE2_SM4 | AARCH64_FL_SVE2_BITPERM | \
+		      AARCH64_FL_I8MM, false, \
 		      "asimd")
 
 /* Enabling "crypto" also enables "fp", "simd", "aes" and "sha2".
@@ -198,4 +200,14 @@ AARCH64_OPT_EXTENSION("sve2-bitperm", AARCH64_FL_SVE2_BITPERM, AARCH64_FL_SIMD |
 /* Enabling or disabling "tme" only changes "tme".  */
 AARCH64_OPT_EXTENSION("tme", AARCH64_FL_TME, 0, 0, false, "")
 
+/* Enabling "i8mm" also enables "simd" and "fp".
+   Disabling "i8mm" only disables "i8mm".  */
+AARCH64_OPT_EXTENSION("i8mm", AARCH64_FL_I8MM, \
+		      AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "i8mm")
+
+/* Enabling "bf16" also enables "simd" and "fp".
+   Disabling "bf16" only disables "bf16".  */
+AARCH64_OPT_EXTENSION("bf16", AARCH64_FL_BF16, \
+		      AARCH64_FL_SIMD | AARCH64_FL_FP, 0, false, "bf16")
+
 #undef AARCH64_OPT_EXTENSION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index ee01909abb9..2bb5a208720 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -202,6 +202,15 @@ extern unsigned aarch64_architecture_version;
 /* Transactional Memory Extension.  */
 #define AARCH64_FL_TME	      (1ULL << 33)  /* Has TME instructions.  */
 
+/* Armv8.6-A architecture extensions.  */
+#define AARCH64_FL_V8_6	      (1ULL << 34)
+
+/* 8-bit Integer Matrix Multiply (I8MM) extensions.  */
+#define AARCH64_FL_I8MM	      (1ULL << 35)
+
+/* Brain half-precision floating-point (BFloat16) Extension.  */
+#define AARCH64_FL_BF16	      (1ULL << 36)
+
 /* Has FP and SIMD.  */
 #define AARCH64_FL_FPSIMD     (AARCH64_FL_FP | AARCH64_FL_SIMD)
 
@@ -223,6 +232,9 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_FL_FOR_ARCH8_5			\
   (AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_V8_5	\
    | AARCH64_FL_SB | AARCH64_FL_SSBS | AARCH64_FL_PREDRES)
+#define AARCH64_FL_FOR_ARCH8_6			\
+  (AARCH64_FL_FOR_ARCH8_5 | AARCH64_FL_V8_6 | AARCH64_FL_FPSIMD \
+   | AARCH64_FL_I8MM | AARCH64_FL_BF16)
 
 /* Macros to test ISA flags.  */
 
@@ -249,6 +261,9 @@ extern unsigned aarch64_architecture_version;
 #define AARCH64_ISA_V8_5	   (aarch64_isa_flags & AARCH64_FL_V8_5)
 #define AARCH64_ISA_TME		   (aarch64_isa_flags & AARCH64_FL_TME)
 #define AARCH64_ISA_MEMTAG	   (aarch64_isa_flags & AARCH64_FL_MEMTAG)
+#define AARCH64_ISA_V8_6	   (aarch64_isa_flags & AARCH64_FL_V8_6)
+#define AARCH64_ISA_I8MM	   (aarch64_isa_flags & AARCH64_FL_I8MM)
+#define AARCH64_ISA_BF16	   (aarch64_isa_flags & AARCH64_FL_BF16)
 
 /* Crypto is an optional extension to AdvSIMD.  */
 #define TARGET_CRYPTO (TARGET_SIMD && AARCH64_ISA_CRYPTO)
@@ -308,6 +323,13 @@ extern unsigned aarch64_architecture_version;
 /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
 #define TARGET_MEMTAG (AARCH64_ISA_V8_5 && AARCH64_ISA_MEMTAG)
 
+/* I8MM instructions are enabled through +i8mm.  */
+#define TARGET_I8MM (AARCH64_ISA_I8MM)
+
+/* BF16 instructions are enabled through +bf16.  */
+#define TARGET_BF16_FP (AARCH64_ISA_BF16)
+#define TARGET_BF16_SIMD (AARCH64_ISA_BF16 && TARGET_SIMD)
+
 /* Make sure this is always defined so we don't have to check for ifdefs
    but rather use normal ifs.  */
 #ifndef TARGET_FIX_ERR_A53_835769_DEFAULT
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index af3c7f2b910..f4619876318 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16063,25 +16063,19 @@ Specify the name of the target architecture and, optionally, one or
 more feature modifiers.  This option has the form
 @option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}.
 
-The permissible values for @var{arch} are @samp{armv8-a},
-@samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
-@samp{armv8.5-a} or @var{native}.
-
-The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
-support for the ARMv8.5-A architecture extensions.
-
-The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler
-support for the ARMv8.4-A architecture extensions.
-
-The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler
-support for the ARMv8.3-A architecture extensions.
-
-The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler
-support for the ARMv8.2-A architecture extensions.
-
-The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler
-support for the ARMv8.1-A architecture extension.  In particular, it
-enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features.
+The table below summarizes the permissible values for @var{arch}
+and the features that they enable by default:
+
+@multitable @columnfractions 0.20 0.20 0.60
+@headitem @var{arch} value @tab Architecture @tab Includes by default
+@item @samp{armv8-a} @tab Armv8-A @tab @samp{+fp}, @samp{+simd}
+@item @samp{armv8.1-a} @tab Armv8.1-A @tab @samp{armv8-a}, @samp{+crc}, @samp{+lse}, @samp{+rdma}
+@item @samp{armv8.2-a} @tab Armv8.2-A @tab @samp{armv8.1-a}
+@item @samp{armv8.3-a} @tab Armv8.3-A @tab @samp{armv8.2-a}
+@item @samp{armv8.4-a} @tab Armv8.4-A @tab @samp{armv8.3-a}, @samp{+fp16fml}, @samp{+dotprod}
+@item @samp{armv8.5-a} @tab Armv8.5-A @tab @samp{armv8.4-a}, @samp{+sb}, @samp{+ssbs}, @samp{+predres}
+@item @samp{armv8.6-a} @tab Armv8.6-A @tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm}
+@end multitable
 
 The value @samp{native} is available on native AArch64 GNU/Linux and
 causes the compiler to pick the architecture of the host system.  This
@@ -16296,6 +16290,7 @@ generation.  This option is enabled by default for @option{-march=armv8.5-a}.
 Enable the Armv8-a Execution and Data Prediction Restriction instructions.
 This option is only to enable the extension at the assembler level and does
 not affect code generation.  This option is enabled by default for
+@option{-march=armv8.5-a}.
 @item sve2
 Enable the Armv8-a Scalable Vector Extension 2.  This also enables SVE
 instructions.
@@ -16307,9 +16302,18 @@ Enable SVE2 sm4 instructions.  This also enables SVE2 instructions.
 Enable SVE2 aes instructions.  This also enables SVE2 instructions.
 @item sve2-sha3
 Enable SVE2 sha3 instructions.  This also enables SVE2 instructions.
-@option{-march=armv8.5-a}.
 @item tme
 Enable the Transactional Memory Extension.
+@item i8mm
+Enable 8-bit Integer Matrix Multiply instructions.  This also enables
+Advanced SIMD and floating-point instructions.  This option is enabled by
+default for @option{-march=armv8.6-a}.  Use of this option with architectures
+prior to Armv8.2-A is not supported.
+@item bf16
+Enable brain half-precision floating-point instructions.  This also enables
+Advanced SIMD and floating-point instructions.  This option is enabled by
+default for @option{-march=armv8.6-a}.  Use of this option with architectures
+prior to Armv8.2-A is not supported.
 
 @end table
 
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
index 608b89d19ce..f61d635bd2a 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
@@ -1,6 +1,9 @@
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
 
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8-a")
+
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8-a+tme")
 #ifndef __ARM_FEATURE_TME
@@ -13,6 +16,89 @@
 #error "__ARM_FEATURE_TME is defined but should not be!"
 #endif
 
+/* Test Armv8.6-A features.  */
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
+#endif
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a")
+#ifndef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+#ifndef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a+nosimd")
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a+nofp")
+#ifdef __ARM_FEATURE_MATMUL_INT8
+#error "__ARM_FEATURE_MATMUL_INT8 is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!"
+#endif
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a")
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#ifndef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is not defined but should be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16+nosimd")
+#ifndef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is not defined but should be!"
+#endif
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.6-a+nofp")
+#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_SCALAR_ARITHMETIC is defined but should not be!"
+#endif
+#ifdef __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
+#error "__ARM_FEATURE_BF16_VECTOR_ARITHMETIC is defined but should not be!"
+#endif
+#pragma GCC pop_options
+
+#pragma GCC pop_options
+
 int
 foo (int a)
 {

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16
  2019-12-12 17:01       ` Dennis Zhang
@ 2019-12-13 10:23         ` Richard Sandiford
  2020-10-29 12:19         ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Dennis Zhang
  1 sibling, 0 replies; 23+ messages in thread
From: Richard Sandiford @ 2019-12-13 10:23 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, James Greenhalgh, Marcus Shawcroft

Dennis Zhang <Dennis.Zhang@arm.com> writes:
> Hi Richard,
>
> On 06/12/2019 10:22, Richard Sandiford wrote:
>> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>>> 2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>
>>>
>>> 	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
>>> 	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
>>> 	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
>>> 	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
>>> 	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
>>> 	(fp): Disabling fp also disables i8mm and bf16.
>>> 	(simd): Disabling simd also disables i8mm.
>>> 	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
>>> 	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
>>> 	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
>>> 	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
>>> 	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options. Add
>>> 	a new table to list permissible values for ARCH.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> 2019-12-04  Dennis Zhang  <dennis.zhang@arm.com>
>>>
>>> 	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
>>> 	and bf16 features.
>> 
>> Thanks for the update, looks great.  A couple of comments below.
>> 
>>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>>> index d165f31a865..1192e8f4b06 100644
>>> --- a/gcc/doc/invoke.texi
>>> +++ b/gcc/doc/invoke.texi
>>> @@ -16050,25 +16050,22 @@ Specify the name of the target architecture and, optionally, one or
>>>   more feature modifiers.  This option has the form
>>>   @option{-march=@var{arch}@r{@{}+@r{[}no@r{]}@var{feature}@r{@}*}}.
>>>   
>>> -The permissible values for @var{arch} are @samp{armv8-a},
>>> -@samp{armv8.1-a}, @samp{armv8.2-a}, @samp{armv8.3-a}, @samp{armv8.4-a},
>>> -@samp{armv8.5-a} or @var{native}.
>>> -
>>> -The value @samp{armv8.5-a} implies @samp{armv8.4-a} and enables compiler
>>> -support for the ARMv8.5-A architecture extensions.
>>> -
>>> -The value @samp{armv8.4-a} implies @samp{armv8.3-a} and enables compiler
>>> -support for the ARMv8.4-A architecture extensions.
>>> -
>>> -The value @samp{armv8.3-a} implies @samp{armv8.2-a} and enables compiler
>>> -support for the ARMv8.3-A architecture extensions.
>>> -
>>> -The value @samp{armv8.2-a} implies @samp{armv8.1-a} and enables compiler
>>> -support for the ARMv8.2-A architecture extensions.
>>> -
>>> -The value @samp{armv8.1-a} implies @samp{armv8-a} and enables compiler
>>> -support for the ARMv8.1-A architecture extension.  In particular, it
>>> -enables the @samp{+crc}, @samp{+lse}, and @samp{+rdma} features.
>>> +The table below summarizes the permissible values for @var{arch}
>>> +and the features that they enable by default:
>>> +
>>> +@multitable @columnfractions 0.20 0.20 0.60
>>> +@headitem @var{arch} value @tab Architecture @tab Includes by default
>> 
>> We should have an armv8-a entry here, something like:
>> 
>> @item @samp{armv8-a} @tab Armv8-A @tab @samp{+fp}, @samp{+simd}
>> 
>
> The armv8-a entry is added.
>
>>> +@item @samp{armv8.1-a} @tab Armv8.1-A
>>> +@tab @samp{armv8-a}, @samp{+crc}, @samp{+lse}, @samp{+rdma}
>>> +@item @samp{armv8.2-a} @tab Armv8.2-A @tab @samp{armv8.1-a}
>>> +@item @samp{armv8.3-a} @tab Armv8.3-A @tab @samp{armv8.2-a}
>>> +@item @samp{armv8.4-a} @tab Armv8.4-A
>>> +@tab @samp{armv8.3-a}, @samp{+fp16fml}, @samp{+dotprod}
>>> +@item @samp{armv8.5-a} @tab Armv8.5-A
>>> +@tab @samp{armv8.4-a}, @samp{+sb}, @samp{+ssbs}, @samp{+predres}
>>> +@item @samp{armv8.6-a} @tab Armv8.6-A
>>> +@tab @samp{armv8.5-a}, @samp{+bf16}, @samp{+i8mm}
>>> +@end multitable
>> 
>> I should have tried a proof of concept of this before suggesting it, sorry.
>> Trying the patch locally I get:
>> 
>> gcc.pod around line 18643: You can't have =items (as at line 18649) unless the first thing after the =over is an =item
>> POD document had syntax errors at /usr/bin/pod2man line 71.
>> Makefile:3363: recipe for target 'doc/gcc.1' failed
>> make: [doc/gcc.1] Error 1 (ignored)
>> 
>> (Odd that this is an ignored error, since we end up with an empty man page.)
>> 
>> I've posted a texi2pod.pl patch for that:
>> 
>>      https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00407.html
>> 
>> However, even with that patch, the script needs the full table row to be
>> on a single line, so I think we need to do that and live with the long lines.
>> 
>
> The items are kept in a single line for each.
>
>>> [...]
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>>> index 608b89d19ce..5ae39bc6cf0 100644
>>> --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>>> +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
>>> @@ -13,6 +13,92 @@
>>>   #error "__ARM_FEATURE_TME is defined but should not be!"
>>>   #endif
>>>   
>>> +/* Test Armv8.6-a features.  */
>>> +
>>> +#pragma GCC push_options
>>> +#pragma GCC target ("arch=armv8-a")
>> 
>> These two pragmas should be at the beginning of the file, so that we
>> start with base armv8-a for all the tests.
>
> The pragmas are moved to the top.
>
> The ChangeLog is updated as below:
>
> gcc/ChangeLog:
>
> 2019-12-12  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* config/aarch64/aarch64-arches.def (armv8.6-a): New.
> 	* config/aarch64/aarch64-c.c (aarch64_update_cpp_builtins): Define
> 	__ARM_FEATURE_MATMUL_INT8, __ARM_FEATURE_BF16_VECTOR_ARITHMETIC and
> 	__ARM_FEATURE_BF16_SCALAR_ARITHMETIC when enabled.
> 	* config/aarch64/aarch64-option-extensions.def (i8mm, bf16): New.
> 	(fp): Disabling fp also disables i8mm and bf16.
> 	(simd): Disabling simd also disables i8mm.
> 	* config/aarch64/aarch64.h (AARCH64_FL_V8_6): New macro.
> 	(AARCH64_FL_I8MM, AARCH64_FL_BF16, AARCH64_FL_FOR_ARCH8_6): Likewise.
> 	(AARCH64_ISA_V8_6, AARCH64_ISA_I8MM, AARCH64_ISA_BF16): Likewise.
> 	(TARGET_I8MM, TARGET_BF16_FP, TARGET_BF16_SIMD): Likewise.
> 	* doc/invoke.texi (armv8.6-a, i8mm, bf16): Document new options
> 	and add a new table to list permissible values for ARCH.
>
> gcc/testsuite/ChangeLog:
>
> 2019-12-12  Dennis Zhang  <dennis.zhang@arm.com>
>
> 	* gcc.target/aarch64/pragma_cpp_predefs_2.c: Add tests for i8mm
> 	and bf16 features.

Thanks, applied as r279370.

Richard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32
  2019-12-12 17:01       ` Dennis Zhang
  2019-12-13 10:23         ` Richard Sandiford
@ 2020-10-29 12:19         ` Dennis Zhang
  2020-10-29 12:28           ` [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector Dennis Zhang
  2020-10-29 17:48           ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Richard Sandiford
  1 sibling, 2 replies; 23+ messages in thread
From: Dennis Zhang @ 2020-10-29 12:19 UTC (permalink / raw)
  To: gcc-patches
  Cc: Richard Sandiford, nd, Richard Earnshaw, Marcus Shawcroft,
	Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 1213 bytes --]

Hi all,

This patch enables intrinsics to convert BFloat16 scalar and vector operands to Float32 modes.
The intrinsics are implemented by shifting each BFloat16 item 16 bits to the left using shl/shll/shll2 instructions.

Intrinsics are documented at https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
ISA is documented at https://developer.arm.com/docs/ddi0596/latest

Regtested and bootstrapped.

Is it OK for trunk please?

Thanks
Dennis

gcc/ChangeLog:

2020-10-29  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-simd-builtins.def(vbfcvt): New entry.
	(vbfcvt_high, bfcvt): Likewise.
	* config/aarch64/aarch64-simd.md(aarch64_vbfcvt<mode>): New entry.
	(aarch64_vbfcvt_highv8bf, aarch64_bfcvtsf): Likewise.
	* config/aarch64/arm_bf16.h (vcvtah_f32_bf16): New intrinsic.
	* config/aarch64/arm_neon.h (vcvt_f32_bf16): Likewise.
	(vcvtq_low_f32_bf16, vcvtq_high_f32_bf16): Likewise.

gcc/testsuite/ChangeLog

2020-10-29  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
	(test_vcvt_f32_bf16, test_vcvtq_low_f32_bf16): New tests.
	(test_vcvtq_high_f32_bf16, test_vcvth_f32_bf16): Likewise.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: a64-bfcvt-trunk-20201026.patch --]
[-- Type: text/x-patch; name="a64-bfcvt-trunk-20201026.patch", Size: 4393 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 5bc596dbffc..b68c3ca7f4b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -732,3 +732,8 @@
   VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
   VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
   VAR1 (UNOP, bfcvt, 0, ALL, bf)
+
+  /* Implemented by aarch64_{v}bfcvt{_high}<mode>.  */
+  VAR2 (UNOP, vbfcvt, 0, ALL, v4bf, v8bf)
+  VAR1 (UNOP, vbfcvt_high, 0, ALL, v8bf)
+  VAR1 (UNOP, bfcvt, 0, ALL, sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 381a702eba0..5ae79d67981 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7238,3 +7238,31 @@
   "bfcvt\\t%h0, %s1"
   [(set_attr "type" "f_cvt")]
 )
+
+;; Use shl/shll/shll2 to convert BF scalar/vector modes to SF modes.
+(define_insn "aarch64_vbfcvt<mode>"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:VBF 1 "register_operand" "w")]
+		      UNSPEC_BFCVTN))]
+  "TARGET_BF16_SIMD"
+  "shll\\t%0.4s, %1.4h, #16"
+  [(set_attr "type" "neon_shift_imm_long")]
+)
+
+(define_insn "aarch64_vbfcvt_highv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")]
+		      UNSPEC_BFCVTN2))]
+  "TARGET_BF16_SIMD"
+  "shll2\\t%0.4s, %1.8h, #16"
+  [(set_attr "type" "neon_shift_imm_long")]
+)
+
+(define_insn "aarch64_bfcvtsf"
+  [(set (match_operand:SF 0 "register_operand" "=w")
+	(unspec:SF [(match_operand:BF 1 "register_operand" "w")]
+		    UNSPEC_BFCVT))]
+  "TARGET_BF16_FP"
+  "shl\\t%d0, %d1, #16"
+  [(set_attr "type" "neon_shift_reg")]
+)
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
index 984875dcc01..881615498d3 100644
--- a/gcc/config/aarch64/arm_bf16.h
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -40,6 +40,13 @@ vcvth_bf16_f32 (float32_t __a)
   return __builtin_aarch64_bfcvtbf (__a);
 }
 
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtah_f32_bf16 (bfloat16_t __a)
+{
+  return __builtin_aarch64_bfcvtsf (__a);
+}
+
 #pragma GCC pop_options
 
 #endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 85c0d62ca12..9c0386ed7b1 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35716,6 +35716,27 @@ vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a)
   return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a);
 }
 
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_bf16 (bfloat16x4_t __a)
+{
+  return __builtin_aarch64_vbfcvtv4bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvtv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvt_highv8bf (__a);
+}
+
 #pragma GCC pop_options
 
 /* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
index bbea630b182..47af7c494d9 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
@@ -46,3 +46,43 @@ bfloat16_t test_bfcvt (float32_t a)
 {
   return vcvth_bf16_f32 (a);
 }
+
+/*
+**test_vcvt_f32_bf16:
+**     shll	v0.4s, v0.4h, #16
+**     ret
+*/
+float32x4_t test_vcvt_f32_bf16 (bfloat16x4_t a)
+{
+  return vcvt_f32_bf16 (a);
+}
+
+/*
+**test_vcvtq_low_f32_bf16:
+**     shll	v0.4s, v0.4h, #16
+**     ret
+*/
+float32x4_t test_vcvtq_low_f32_bf16 (bfloat16x8_t a)
+{
+  return vcvtq_low_f32_bf16 (a);
+}
+
+/*
+**test_vcvtq_high_f32_bf16:
+**     shll2	v0.4s, v0.8h, #16
+**     ret
+*/
+float32x4_t test_vcvtq_high_f32_bf16 (bfloat16x8_t a)
+{
+  return vcvtq_high_f32_bf16 (a);
+}
+
+/*
+**test_vcvtah_f32_bf16:
+**     shl	d0, d0, #16
+**     ret
+*/
+float32_t test_vcvtah_f32_bf16 (bfloat16_t a)
+{
+  return vcvtah_f32_bf16 (a);
+}

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector
  2020-10-29 12:19         ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Dennis Zhang
@ 2020-10-29 12:28           ` Dennis Zhang
  2020-10-30 14:07             ` Richard Sandiford
  2020-10-29 17:48           ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Richard Sandiford
  1 sibling, 1 reply; 23+ messages in thread
From: Dennis Zhang @ 2020-10-29 12:28 UTC (permalink / raw)
  To: gcc-patches
  Cc: Richard Sandiford, nd, Richard Earnshaw, Marcus Shawcroft,
	Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 1282 bytes --]

Hi all,

This patch implements ACLE intrinsics vget_low_bf16 and vget_high_bf16 to extract lower or higher half from a bfloat16x8 vector.
The vget_high_bf16 is done by 'dup' instruction. The vget_low_bf16 could be done by a 'dup' or 'mov', or it's mostly optimized out by just using the lower half of a vector register.
The test for vget_low_bf16 only checks that the interface can be compiled but no instruction is checked since none is generated in the test case.

Arm ACLE document at https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics

Regtested and bootstrapped.

Is it OK for trunk please?

Thanks
Dennis

gcc/ChangeLog:

2020-10-29  Dennis Zhang  <dennis.zhang@arm.com>

	* config/aarch64/aarch64-simd-builtins.def (vget_half): New entry.
	* config/aarch64/aarch64-simd.md (aarch64_vget_halfv8bf): New entry.
	* config/aarch64/arm_neon.h (vget_low_bf16): New intrinsic.
	(vget_high_bf16): Likewise.
	* config/aarch64/predicates.md (aarch64_zero_or_1): New predicate
	for zero or one immediate to indicate the lower or higher half.

gcc/testsuite/ChangeLog

2020-10-29  Dennis Zhang  <dennis.zhang@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
	(test_vget_low_bf16, test_vget_high_bf16): New tests.

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: vgetbf-trunk-20201029-2.patch --]
[-- Type: text/x-patch; name="vgetbf-trunk-20201029-2.patch", Size: 3669 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 332a0b6b1ea..39ebb776d1d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -719,6 +719,9 @@
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
 
+  /* Implemented by aarch64_vget_halfv8bf.  */
+  VAR1 (GETREG, vget_half, 0, ALL, v8bf)
+
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
   VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
   VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd1e6f..f62c52ca327 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,19 @@
   [(set_attr "type" "neon_dot<VDQSF:q>")]
 )
 
+;; vget_low/high_bf16
+(define_expand "aarch64_vget_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")
+   (match_operand:SI 2 "aarch64_zero_or_1")]
+  "TARGET_BF16_SIMD"
+{
+  int hbase = INTVAL (operands[2]);
+  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
+  DONE;
+})
+
 ;; bfmmla
 (define_insn "aarch64_bfmmlaqv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 50f8b23bc17..c6ac0b8dd17 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35530,6 +35530,20 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_halfv8bf (__a, 0);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_halfv8bf (__a, 1);
+}
+
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcvt_bf16_f32 (float32x4_t __a)
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 215fcec5955..0c8bc2b0c73 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -84,6 +84,10 @@
 		 (ior (match_test "op == constm1_rtx")
 		      (match_test "op == const1_rtx"))))))
 
+(define_predicate "aarch64_zero_or_1"
+  (and (match_code "const_int")
+       (match_test "op == const0_rtx || op == const1_rtx")))
+
 (define_predicate "aarch64_reg_or_orr_imm"
    (ior (match_operand 0 "register_operand")
 	(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
index c42c7acbbe9..35f4cb864f2 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
@@ -83,3 +83,14 @@ bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
   return vduph_laneq_bf16 (a, 7);
 }
 /* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 2 } } */
+
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}
+/* { dg-final { scan-assembler-times "dup\\td\[0-9\]+, v\[0-9\]+\.d\\\[1\\\]" 1 } } */

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32
  2020-10-29 12:19         ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Dennis Zhang
  2020-10-29 12:28           ` [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector Dennis Zhang
@ 2020-10-29 17:48           ` Richard Sandiford
  2020-11-02 17:27             ` Dennis Zhang
  1 sibling, 1 reply; 23+ messages in thread
From: Richard Sandiford @ 2020-10-29 17:48 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Dennis Zhang <Dennis.Zhang@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 5bc596dbffc..b68c3ca7f4b 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -732,3 +732,8 @@
>    VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
>    VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
>    VAR1 (UNOP, bfcvt, 0, ALL, bf)
> +
> +  /* Implemented by aarch64_{v}bfcvt{_high}<mode>.  */
> +  VAR2 (UNOP, vbfcvt, 0, ALL, v4bf, v8bf)
> +  VAR1 (UNOP, vbfcvt_high, 0, ALL, v8bf)
> +  VAR1 (UNOP, bfcvt, 0, ALL, sf)

New intrinsics should use something more specific than “ALL”.
Since these functions are pure non-trapping integer operations,
I think they should use “AUTO_FP” instead.  (On reflection,
we should probably change the name.)

> +(define_insn "aarch64_bfcvtsf"
> +  [(set (match_operand:SF 0 "register_operand" "=w")
> +	(unspec:SF [(match_operand:BF 1 "register_operand" "w")]
> +		    UNSPEC_BFCVT))]
> +  "TARGET_BF16_FP"
> +  "shl\\t%d0, %d1, #16"
> +  [(set_attr "type" "neon_shift_reg")]

I think this should be neon_shift_imm instead.

OK with those changes, thanks.

Richard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector
  2020-10-29 12:28           ` [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector Dennis Zhang
@ 2020-10-30 14:07             ` Richard Sandiford
  2020-11-03 11:16               ` Dennis Zhang
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Sandiford @ 2020-10-30 14:07 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Dennis Zhang <Dennis.Zhang@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 332a0b6b1ea..39ebb776d1d 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -719,6 +719,9 @@
>    VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
>    VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
>  
> +  /* Implemented by aarch64_vget_halfv8bf.  */
> +  VAR1 (GETREG, vget_half, 0, ALL, v8bf)

This should be AUTO_FP, since it doesn't have any side-effects.
(As before, we should probably rename the flag, but that's separate work.)

> +
>    /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
>    VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
>    VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 9f0e2bd1e6f..f62c52ca327 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -7159,6 +7159,19 @@
>    [(set_attr "type" "neon_dot<VDQSF:q>")]
>  )
>  
> +;; vget_low/high_bf16
> +(define_expand "aarch64_vget_halfv8bf"
> +  [(match_operand:V4BF 0 "register_operand")
> +   (match_operand:V8BF 1 "register_operand")
> +   (match_operand:SI 2 "aarch64_zero_or_1")]
> +  "TARGET_BF16_SIMD"
> +{
> +  int hbase = INTVAL (operands[2]);
> +  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);

I think this needs to be:

  aarch64_simd_vect_par_cnst_half

instead.  The issue is that on big-endian targets, GCC assumes vector
lane 0 is in the high part of the register, whereas for AArch64 it's
always in the low part of the register.  So we convert from AArch64
numbering to GCC numbering when generating the rtx and then take
endianness into account when matching the rtx later.

It would be good to have -mbig-endian tests that make sure we generate
the right instruction for each function (i.e. we get them the right way
round).  I guess it would be good to test that for little-endian too.

> +  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
> +  DONE;
> +})
> +
>  ;; bfmmla
>  (define_insn "aarch64_bfmmlaqv4sf"
>    [(set (match_operand:V4SF 0 "register_operand" "=w")
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index 215fcec5955..0c8bc2b0c73 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -84,6 +84,10 @@
>  		 (ior (match_test "op == constm1_rtx")
>  		      (match_test "op == const1_rtx"))))))
>  
> +(define_predicate "aarch64_zero_or_1"
> +  (and (match_code "const_int")
> +       (match_test "op == const0_rtx || op == const1_rtx")))

zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
so let's keep it as-is.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32
  2020-10-29 17:48           ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Richard Sandiford
@ 2020-11-02 17:27             ` Dennis Zhang
  2020-11-02 19:05               ` Richard Sandiford
  0 siblings, 1 reply; 23+ messages in thread
From: Dennis Zhang @ 2020-11-02 17:27 UTC (permalink / raw)
  To: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft,
	Kyrylo Tkachov, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 1447 bytes --]

Hi Richard,

On 10/29/20 5:48 PM, Richard Sandiford wrote:
> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index 5bc596dbffc..b68c3ca7f4b 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -732,3 +732,8 @@
>>     VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
>>     VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
>>     VAR1 (UNOP, bfcvt, 0, ALL, bf)
>> +
>> +  /* Implemented by aarch64_{v}bfcvt{_high}<mode>.  */
>> +  VAR2 (UNOP, vbfcvt, 0, ALL, v4bf, v8bf)
>> +  VAR1 (UNOP, vbfcvt_high, 0, ALL, v8bf)
>> +  VAR1 (UNOP, bfcvt, 0, ALL, sf)
> 
> New intrinsics should use something more specific than “ALL”.
> Since these functions are pure non-trapping integer operations,
> I think they should use “AUTO_FP” instead.  (On reflection,
> we should probably change the name.)
> 
>> +(define_insn "aarch64_bfcvtsf"
>> +  [(set (match_operand:SF 0 "register_operand" "=w")
>> +	(unspec:SF [(match_operand:BF 1 "register_operand" "w")]
>> +		    UNSPEC_BFCVT))]
>> +  "TARGET_BF16_FP"
>> +  "shl\\t%d0, %d1, #16"
>> +  [(set_attr "type" "neon_shift_reg")]
> 
> I think this should be neon_shift_imm instead.
> 
> OK with those changes, thanks.
> 
> Richard
> 

I've fixed the Flag and the insn attribute.
I will commit it if no further issues.
Thanks for the review.

Regards
Dennis

[-- Attachment #2: a64-bfcvt-trunk-20201102.patch --]
[-- Type: text/x-patch, Size: 4481 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index eb8e6f7b3d8..f494b535a30 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -732,3 +732,8 @@
   VAR1 (UNOP, bfcvtn_q, 0, FP, v8bf)
   VAR1 (BINOP, bfcvtn2, 0, FP, v8bf)
   VAR1 (UNOP, bfcvt, 0, FP, bf)
+
+  /* Implemented by aarch64_{v}bfcvt{_high}<mode>.  */
+  VAR2 (UNOP, vbfcvt, 0, AUTO_FP, v4bf, v8bf)
+  VAR1 (UNOP, vbfcvt_high, 0, AUTO_FP, v8bf)
+  VAR1 (UNOP, bfcvt, 0, AUTO_FP, sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 381a702eba0..030a086d31c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7238,3 +7238,31 @@
   "bfcvt\\t%h0, %s1"
   [(set_attr "type" "f_cvt")]
 )
+
+;; Use shl/shll/shll2 to convert BF scalar/vector modes to SF modes.
+(define_insn "aarch64_vbfcvt<mode>"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:VBF 1 "register_operand" "w")]
+		      UNSPEC_BFCVTN))]
+  "TARGET_BF16_SIMD"
+  "shll\\t%0.4s, %1.4h, #16"
+  [(set_attr "type" "neon_shift_imm_long")]
+)
+
+(define_insn "aarch64_vbfcvt_highv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")]
+		      UNSPEC_BFCVTN2))]
+  "TARGET_BF16_SIMD"
+  "shll2\\t%0.4s, %1.8h, #16"
+  [(set_attr "type" "neon_shift_imm_long")]
+)
+
+(define_insn "aarch64_bfcvtsf"
+  [(set (match_operand:SF 0 "register_operand" "=w")
+	(unspec:SF [(match_operand:BF 1 "register_operand" "w")]
+		    UNSPEC_BFCVT))]
+  "TARGET_BF16_FP"
+  "shl\\t%d0, %d1, #16"
+  [(set_attr "type" "neon_shift_imm")]
+)
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
index 984875dcc01..881615498d3 100644
--- a/gcc/config/aarch64/arm_bf16.h
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -40,6 +40,13 @@ vcvth_bf16_f32 (float32_t __a)
   return __builtin_aarch64_bfcvtbf (__a);
 }
 
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtah_f32_bf16 (bfloat16_t __a)
+{
+  return __builtin_aarch64_bfcvtsf (__a);
+}
+
 #pragma GCC pop_options
 
 #endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 95bfa5ebba2..69cccd32786 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35680,6 +35680,27 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_bf16 (bfloat16x4_t __a)
+{
+  return __builtin_aarch64_vbfcvtv4bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvtv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvt_highv8bf (__a);
+}
+
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcvt_bf16_f32 (float32x4_t __a)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
index bbea630b182..47af7c494d9 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
@@ -46,3 +46,43 @@ bfloat16_t test_bfcvt (float32_t a)
 {
   return vcvth_bf16_f32 (a);
 }
+
+/*
+**test_vcvt_f32_bf16:
+**     shll	v0.4s, v0.4h, #16
+**     ret
+*/
+float32x4_t test_vcvt_f32_bf16 (bfloat16x4_t a)
+{
+  return vcvt_f32_bf16 (a);
+}
+
+/*
+**test_vcvtq_low_f32_bf16:
+**     shll	v0.4s, v0.4h, #16
+**     ret
+*/
+float32x4_t test_vcvtq_low_f32_bf16 (bfloat16x8_t a)
+{
+  return vcvtq_low_f32_bf16 (a);
+}
+
+/*
+**test_vcvtq_high_f32_bf16:
+**     shll2	v0.4s, v0.8h, #16
+**     ret
+*/
+float32x4_t test_vcvtq_high_f32_bf16 (bfloat16x8_t a)
+{
+  return vcvtq_high_f32_bf16 (a);
+}
+
+/*
+**test_vcvtah_f32_bf16:
+**     shl	d0, d0, #16
+**     ret
+*/
+float32_t test_vcvtah_f32_bf16 (bfloat16_t a)
+{
+  return vcvtah_f32_bf16 (a);
+}

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32
  2020-11-02 17:27             ` Dennis Zhang
@ 2020-11-02 19:05               ` Richard Sandiford
  2020-11-03 13:06                 ` Dennis Zhang
  0 siblings, 1 reply; 23+ messages in thread
From: Richard Sandiford @ 2020-11-02 19:05 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Dennis Zhang <dennis.zhang@arm.com> writes:
> Hi Richard,
>
> On 10/29/20 5:48 PM, Richard Sandiford wrote:
>> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>>> index 5bc596dbffc..b68c3ca7f4b 100644
>>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>>> @@ -732,3 +732,8 @@
>>>     VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
>>>     VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
>>>     VAR1 (UNOP, bfcvt, 0, ALL, bf)
>>> +
>>> +  /* Implemented by aarch64_{v}bfcvt{_high}<mode>.  */
>>> +  VAR2 (UNOP, vbfcvt, 0, ALL, v4bf, v8bf)
>>> +  VAR1 (UNOP, vbfcvt_high, 0, ALL, v8bf)
>>> +  VAR1 (UNOP, bfcvt, 0, ALL, sf)
>> 
>> New intrinsics should use something more specific than “ALL”.
>> Since these functions are pure non-trapping integer operations,
>> I think they should use “AUTO_FP” instead.  (On reflection,
>> we should probably change the name.)
>> 
>>> +(define_insn "aarch64_bfcvtsf"
>>> +  [(set (match_operand:SF 0 "register_operand" "=w")
>>> +	(unspec:SF [(match_operand:BF 1 "register_operand" "w")]
>>> +		    UNSPEC_BFCVT))]
>>> +  "TARGET_BF16_FP"
>>> +  "shl\\t%d0, %d1, #16"
>>> +  [(set_attr "type" "neon_shift_reg")]
>> 
>> I think this should be neon_shift_imm instead.
>> 
>> OK with those changes, thanks.
>> 
>> Richard
>> 
>
> I've fixed the Flag and the insn attribute.
> I will commit it if no further issues.

LGTM, thanks.

Richard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector
  2020-10-30 14:07             ` Richard Sandiford
@ 2020-11-03 11:16               ` Dennis Zhang
  2020-11-03 14:05                 ` Richard Sandiford
  2020-11-05 20:07                 ` Christophe Lyon
  0 siblings, 2 replies; 23+ messages in thread
From: Dennis Zhang @ 2020-11-03 11:16 UTC (permalink / raw)
  To: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft,
	Kyrylo Tkachov, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 3503 bytes --]

Hi Richard,

On 10/30/20 2:07 PM, Richard Sandiford wrote:
> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>> index 332a0b6b1ea..39ebb776d1d 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>> @@ -719,6 +719,9 @@
>>     VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
>>     VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
>>   
>> +  /* Implemented by aarch64_vget_halfv8bf.  */
>> +  VAR1 (GETREG, vget_half, 0, ALL, v8bf)
> 
> This should be AUTO_FP, since it doesn't have any side-effects.
> (As before, we should probably rename the flag, but that's separate work.)
> 
>> +
>>     /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
>>     VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
>>     VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>> index 9f0e2bd1e6f..f62c52ca327 100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -7159,6 +7159,19 @@
>>     [(set_attr "type" "neon_dot<VDQSF:q>")]
>>   )
>>   
>> +;; vget_low/high_bf16
>> +(define_expand "aarch64_vget_halfv8bf"
>> +  [(match_operand:V4BF 0 "register_operand")
>> +   (match_operand:V8BF 1 "register_operand")
>> +   (match_operand:SI 2 "aarch64_zero_or_1")]
>> +  "TARGET_BF16_SIMD"
>> +{
>> +  int hbase = INTVAL (operands[2]);
>> +  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
> 
> I think this needs to be:
> 
>    aarch64_simd_vect_par_cnst_half
> 
> instead.  The issue is that on big-endian targets, GCC assumes vector
> lane 0 is in the high part of the register, whereas for AArch64 it's
> always in the low part of the register.  So we convert from AArch64
> numbering to GCC numbering when generating the rtx and then take
> endianness into account when matching the rtx later.
> 
> It would be good to have -mbig-endian tests that make sure we generate
> the right instruction for each function (i.e. we get them the right way
> round).  I guess it would be good to test that for little-endian too.
> 

I've updated the expander using aarch64_simd_vect_par_cnst_half.
And the expander is divided into two for getting low and high half 
separately.
It's tested for aarch64-none-linux-gnu and aarch64_be-none-linux-gnu 
targets with new tests including -mbig-endian option.

>> +  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
>> +  DONE;
>> +})
>> +
>>   ;; bfmmla
>>   (define_insn "aarch64_bfmmlaqv4sf"
>>     [(set (match_operand:V4SF 0 "register_operand" "=w")
>> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
>> index 215fcec5955..0c8bc2b0c73 100644
>> --- a/gcc/config/aarch64/predicates.md
>> +++ b/gcc/config/aarch64/predicates.md
>> @@ -84,6 +84,10 @@
>>   		 (ior (match_test "op == constm1_rtx")
>>   		      (match_test "op == const1_rtx"))))))
>>   
>> +(define_predicate "aarch64_zero_or_1"
>> +  (and (match_code "const_int")
>> +       (match_test "op == const0_rtx || op == const1_rtx")))
> 
> zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
> But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
> so let's keep it as-is.
> 

This predicate is removed since there is no need of the imm operand in 
the new expanders.

Thanks for the reviews.
Is it OK for trunk now?

Cheers
Dennis



[-- Attachment #2: a64-bfget-trunk-20201102-3.patch --]
[-- Type: text/x-patch, Size: 4519 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index eb8e6f7b3d8..f26a96042bc 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -722,6 +722,10 @@
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
 
+  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
+  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
+  VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
+
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
   VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
   VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 381a702eba0..af29a2f26f5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,27 @@
   [(set_attr "type" "neon_dot<VDQSF:q>")]
 )
 
+;; vget_low/high_bf16
+(define_expand "aarch64_vget_lo_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")]
+  "TARGET_BF16_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
+  DONE;
+})
+
+(define_expand "aarch64_vget_hi_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")]
+  "TARGET_BF16_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, true);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
+  DONE;
+})
+
 ;; bfmmla
 (define_insn "aarch64_bfmmlaqv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 95bfa5ebba2..0fd78a6fd07 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35680,6 +35680,20 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_lo_halfv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_hi_halfv8bf (__a);
+}
+
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcvt_bf16_f32 (float32x4_t __a)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get-be.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get-be.c
new file mode 100644
index 00000000000..bd9bb110974
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get-be.c
@@ -0,0 +1,27 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-mbig-endian -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**test_vget_low_bf16:
+**     ret
+*/
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+/*
+**test_vget_high_bf16:
+**     dup	d0, v0.d\[1\]
+**     ret
+*/
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get.c
new file mode 100644
index 00000000000..2193753ffbb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get.c
@@ -0,0 +1,27 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**test_vget_low_bf16:
+**     ret
+*/
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+/*
+**test_vget_high_bf16:
+**     dup	d0, v0.d\[1\]
+**     ret
+*/
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32
  2020-11-02 19:05               ` Richard Sandiford
@ 2020-11-03 13:06                 ` Dennis Zhang
  2020-12-10 14:26                   ` [backport gcc-10][AArch64] ACLE bf16 convert Dennis Zhang
  0 siblings, 1 reply; 23+ messages in thread
From: Dennis Zhang @ 2020-11-03 13:06 UTC (permalink / raw)
  To: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft,
	Kyrylo Tkachov, richard.sandiford


On 11/2/20 7:05 PM, Richard Sandiford wrote:
> Dennis Zhang <dennis.zhang@arm.com> writes:
>> Hi Richard,
>>
>> On 10/29/20 5:48 PM, Richard Sandiford wrote:
>>> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>>>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>>>> index 5bc596dbffc..b68c3ca7f4b 100644
>>>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>>>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>>>> @@ -732,3 +732,8 @@
>>>>      VAR1 (UNOP, bfcvtn_q, 0, ALL, v8bf)
>>>>      VAR1 (BINOP, bfcvtn2, 0, ALL, v8bf)
>>>>      VAR1 (UNOP, bfcvt, 0, ALL, bf)
>>>> +
>>>> +  /* Implemented by aarch64_{v}bfcvt{_high}<mode>.  */
>>>> +  VAR2 (UNOP, vbfcvt, 0, ALL, v4bf, v8bf)
>>>> +  VAR1 (UNOP, vbfcvt_high, 0, ALL, v8bf)
>>>> +  VAR1 (UNOP, bfcvt, 0, ALL, sf)
>>>
>>> New intrinsics should use something more specific than “ALL”.
>>> Since these functions are pure non-trapping integer operations,
>>> I think they should use “AUTO_FP” instead.  (On reflection,
>>> we should probably change the name.)
>>>
>>>> +(define_insn "aarch64_bfcvtsf"
>>>> +  [(set (match_operand:SF 0 "register_operand" "=w")
>>>> +	(unspec:SF [(match_operand:BF 1 "register_operand" "w")]
>>>> +		    UNSPEC_BFCVT))]
>>>> +  "TARGET_BF16_FP"
>>>> +  "shl\\t%d0, %d1, #16"
>>>> +  [(set_attr "type" "neon_shift_reg")]
>>>
>>> I think this should be neon_shift_imm instead.
>>>
>>> OK with those changes, thanks.
>>>
>>> Richard
>>>
>>
>> I've fixed the Flag and the insn attribute.
>> I will commit it if no further issues.
> 
> LGTM, thanks.
> 
> Richard
> 
Thanks Richard!
This patch is committed as f7d6961126a7f06c8089d8a58bd21be43bc16806.

Bests
Dennis

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector
  2020-11-03 11:16               ` Dennis Zhang
@ 2020-11-03 14:05                 ` Richard Sandiford
  2020-11-03 17:00                   ` Dennis Zhang
  2020-11-05 20:07                 ` Christophe Lyon
  1 sibling, 1 reply; 23+ messages in thread
From: Richard Sandiford @ 2020-11-03 14:05 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Dennis Zhang <dennis.zhang@arm.com> writes:
> Hi Richard,
>
> On 10/30/20 2:07 PM, Richard Sandiford wrote:
>> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>>> index 332a0b6b1ea..39ebb776d1d 100644
>>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>>> @@ -719,6 +719,9 @@
>>>     VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
>>>     VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
>>>   
>>> +  /* Implemented by aarch64_vget_halfv8bf.  */
>>> +  VAR1 (GETREG, vget_half, 0, ALL, v8bf)
>> 
>> This should be AUTO_FP, since it doesn't have any side-effects.
>> (As before, we should probably rename the flag, but that's separate work.)
>> 
>>> +
>>>     /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
>>>     VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
>>>     VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>> index 9f0e2bd1e6f..f62c52ca327 100644
>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>> @@ -7159,6 +7159,19 @@
>>>     [(set_attr "type" "neon_dot<VDQSF:q>")]
>>>   )
>>>   
>>> +;; vget_low/high_bf16
>>> +(define_expand "aarch64_vget_halfv8bf"
>>> +  [(match_operand:V4BF 0 "register_operand")
>>> +   (match_operand:V8BF 1 "register_operand")
>>> +   (match_operand:SI 2 "aarch64_zero_or_1")]
>>> +  "TARGET_BF16_SIMD"
>>> +{
>>> +  int hbase = INTVAL (operands[2]);
>>> +  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
>> 
>> I think this needs to be:
>> 
>>    aarch64_simd_vect_par_cnst_half
>> 
>> instead.  The issue is that on big-endian targets, GCC assumes vector
>> lane 0 is in the high part of the register, whereas for AArch64 it's
>> always in the low part of the register.  So we convert from AArch64
>> numbering to GCC numbering when generating the rtx and then take
>> endianness into account when matching the rtx later.
>> 
>> It would be good to have -mbig-endian tests that make sure we generate
>> the right instruction for each function (i.e. we get them the right way
>> round).  I guess it would be good to test that for little-endian too.
>> 
>
> I've updated the expander using aarch64_simd_vect_par_cnst_half.
> And the expander is divided into two for getting low and high half 
> seperately.
> It's tested for aarch64-none-linux-gnu and aarch64_be-none-linux-gnu 
> targets with new tests including -mbig-endian option.
>
>>> +  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
>>> +  DONE;
>>> +})
>>> +
>>>   ;; bfmmla
>>>   (define_insn "aarch64_bfmmlaqv4sf"
>>>     [(set (match_operand:V4SF 0 "register_operand" "=w")
>>> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
>>> index 215fcec5955..0c8bc2b0c73 100644
>>> --- a/gcc/config/aarch64/predicates.md
>>> +++ b/gcc/config/aarch64/predicates.md
>>> @@ -84,6 +84,10 @@
>>>   		 (ior (match_test "op == constm1_rtx")
>>>   		      (match_test "op == const1_rtx"))))))
>>>   
>>> +(define_predicate "aarch64_zero_or_1"
>>> +  (and (match_code "const_int")
>>> +       (match_test "op == const0_rtx || op == const1_rtx")))
>> 
>> zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
>> But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
>> so let's keep it as-is.
>> 
>
> This predicate is removed since there is no need of the imm operand in 
> the new expanders.
>
> Thanks for the reviews.
> Is it OK for trunk now?

Looks good.  OK for trunk and branches, thanks.

Richard

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector
  2020-11-03 14:05                 ` Richard Sandiford
@ 2020-11-03 17:00                   ` Dennis Zhang
  0 siblings, 0 replies; 23+ messages in thread
From: Dennis Zhang @ 2020-11-03 17:00 UTC (permalink / raw)
  To: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft,
	Kyrylo Tkachov, richard.sandiford

On 11/3/20 2:05 PM, Richard Sandiford wrote:
> Dennis Zhang <dennis.zhang@arm.com> writes:
>> Hi Richard,
>>
>> On 10/30/20 2:07 PM, Richard Sandiford wrote:
>>> Dennis Zhang <Dennis.Zhang@arm.com> writes:
>>>> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
>>>> index 332a0b6b1ea..39ebb776d1d 100644
>>>> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
>>>> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
>>>> @@ -719,6 +719,9 @@
>>>>      VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
>>>>      VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
>>>>    
>>>> +  /* Implemented by aarch64_vget_halfv8bf.  */
>>>> +  VAR1 (GETREG, vget_half, 0, ALL, v8bf)
>>>
>>> This should be AUTO_FP, since it doesn't have any side-effects.
>>> (As before, we should probably rename the flag, but that's separate work.)
>>>
>>>> +
>>>>      /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
>>>>      VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
>>>>      VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
>>>> index 9f0e2bd1e6f..f62c52ca327 100644
>>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>>> @@ -7159,6 +7159,19 @@
>>>>      [(set_attr "type" "neon_dot<VDQSF:q>")]
>>>>    )
>>>>    
>>>> +;; vget_low/high_bf16
>>>> +(define_expand "aarch64_vget_halfv8bf"
>>>> +  [(match_operand:V4BF 0 "register_operand")
>>>> +   (match_operand:V8BF 1 "register_operand")
>>>> +   (match_operand:SI 2 "aarch64_zero_or_1")]
>>>> +  "TARGET_BF16_SIMD"
>>>> +{
>>>> +  int hbase = INTVAL (operands[2]);
>>>> +  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
>>>
>>> I think this needs to be:
>>>
>>>     aarch64_simd_vect_par_cnst_half
>>>
>>> instead.  The issue is that on big-endian targets, GCC assumes vector
>>> lane 0 is in the high part of the register, whereas for AArch64 it's
>>> always in the low part of the register.  So we convert from AArch64
>>> numbering to GCC numbering when generating the rtx and then take
>>> endianness into account when matching the rtx later.
>>>
>>> It would be good to have -mbig-endian tests that make sure we generate
>>> the right instruction for each function (i.e. we get them the right way
>>> round).  I guess it would be good to test that for little-endian too.
>>>
>>
>> I've updated the expander using aarch64_simd_vect_par_cnst_half.
>> And the expander is divided into two for getting low and high half
>> separately.
>> It's tested for aarch64-none-linux-gnu and aarch64_be-none-linux-gnu
>> targets with new tests including -mbig-endian option.
>>
>>>> +  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
>>>> +  DONE;
>>>> +})
>>>> +
>>>>    ;; bfmmla
>>>>    (define_insn "aarch64_bfmmlaqv4sf"
>>>>      [(set (match_operand:V4SF 0 "register_operand" "=w")
>>>> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
>>>> index 215fcec5955..0c8bc2b0c73 100644
>>>> --- a/gcc/config/aarch64/predicates.md
>>>> +++ b/gcc/config/aarch64/predicates.md
>>>> @@ -84,6 +84,10 @@
>>>>    		 (ior (match_test "op == constm1_rtx")
>>>>    		      (match_test "op == const1_rtx"))))))
>>>>    
>>>> +(define_predicate "aarch64_zero_or_1"
>>>> +  (and (match_code "const_int")
>>>> +       (match_test "op == const0_rtx || op == const1_rtx")))
>>>
>>> zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
>>> But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
>>> so let's keep it as-is.
>>>
>>
>> This predicate is removed since there is no need of the imm operand in
>> the new expanders.
>>
>> Thanks for the reviews.
>> Is it OK for trunk now?
> 
> Looks good.  OK for trunk and branches, thanks.
> 
> Richard
> 

Thanks for approval, Richard!
This patch is committed at 3553c658533e430b232997bdfd97faf6606fb102

Bests
Dennis

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector
  2020-11-03 11:16               ` Dennis Zhang
  2020-11-03 14:05                 ` Richard Sandiford
@ 2020-11-05 20:07                 ` Christophe Lyon
  1 sibling, 0 replies; 23+ messages in thread
From: Christophe Lyon @ 2020-11-05 20:07 UTC (permalink / raw)
  To: Dennis Zhang
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft,
	Kyrylo Tkachov, Richard Sandiford

On Tue, 3 Nov 2020 at 12:17, Dennis Zhang via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi Richard,
>
> On 10/30/20 2:07 PM, Richard Sandiford wrote:
> > Dennis Zhang <Dennis.Zhang@arm.com> writes:
> >> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> >> index 332a0b6b1ea..39ebb776d1d 100644
> >> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> >> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> >> @@ -719,6 +719,9 @@
> >>     VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
> >>     VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
> >>
> >> +  /* Implemented by aarch64_vget_halfv8bf.  */
> >> +  VAR1 (GETREG, vget_half, 0, ALL, v8bf)
> >
> > This should be AUTO_FP, since it doesn't have any side-effects.
> > (As before, we should probably rename the flag, but that's separate work.)
> >
> >> +
> >>     /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
> >>     VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
> >>     VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
> >> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> >> index 9f0e2bd1e6f..f62c52ca327 100644
> >> --- a/gcc/config/aarch64/aarch64-simd.md
> >> +++ b/gcc/config/aarch64/aarch64-simd.md
> >> @@ -7159,6 +7159,19 @@
> >>     [(set_attr "type" "neon_dot<VDQSF:q>")]
> >>   )
> >>
> >> +;; vget_low/high_bf16
> >> +(define_expand "aarch64_vget_halfv8bf"
> >> +  [(match_operand:V4BF 0 "register_operand")
> >> +   (match_operand:V8BF 1 "register_operand")
> >> +   (match_operand:SI 2 "aarch64_zero_or_1")]
> >> +  "TARGET_BF16_SIMD"
> >> +{
> >> +  int hbase = INTVAL (operands[2]);
> >> +  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
> >
> > I think this needs to be:
> >
> >    aarch64_simd_vect_par_cnst_half
> >
> > instead.  The issue is that on big-endian targets, GCC assumes vector
> > lane 0 is in the high part of the register, whereas for AArch64 it's
> > always in the low part of the register.  So we convert from AArch64
> > numbering to GCC numbering when generating the rtx and then take
> > endianness into account when matching the rtx later.
> >
> > It would be good to have -mbig-endian tests that make sure we generate
> > the right instruction for each function (i.e. we get them the right way
> > round).  I guess it would be good to test that for little-endian too.
> >
>
> I've updated the expander using aarch64_simd_vect_par_cnst_half.
> And the expander is divided into two for getting low and high half
> separately.
> It's tested for aarch64-none-linux-gnu and aarch64_be-none-linux-gnu
> targets with new tests including -mbig-endian option.
>

Hi,

When testing with a cross x86_64 -> aarch64-none-linux-gnu, the new
big-endian test fails:
FAIL: gcc.target/aarch64/advsimd-intrinsics/bf16_get-be.c   -O0  (test
for excess errors)
Excess errors:
/aci-gcc-fsf/builds/gcc-fsf-gccsrc/sysroot-aarch64-none-linux-gnu/usr/include/gnu/stubs.h:11:11:
fatal error: gnu/stubs-lp64_be.h: No such file or directory
compilation terminated.

What am I missing, since it works for you?

Thanks

Christophe

> >> +  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
> >> +  DONE;
> >> +})
> >> +
> >>   ;; bfmmla
> >>   (define_insn "aarch64_bfmmlaqv4sf"
> >>     [(set (match_operand:V4SF 0 "register_operand" "=w")
> >> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> >> index 215fcec5955..0c8bc2b0c73 100644
> >> --- a/gcc/config/aarch64/predicates.md
> >> +++ b/gcc/config/aarch64/predicates.md
> >> @@ -84,6 +84,10 @@
> >>               (ior (match_test "op == constm1_rtx")
> >>                    (match_test "op == const1_rtx"))))))
> >>
> >> +(define_predicate "aarch64_zero_or_1"
> >> +  (and (match_code "const_int")
> >> +       (match_test "op == const0_rtx || op == const1_rtx")))
> >
> > zero_or_1 looked odd to me, feels like it should be 0_or_1 or zero_or_one.
> > But I see that it's for consistency with aarch64_reg_zero_or_m1_or_1,
> > so let's keep it as-is.
> >
>
> This predicate is removed since there is no need of the imm operand in
> the new expanders.
>
> Thanks for the reviews.
> Is it OK for trunk now?
>
> Cheers
> Dennis
>
>

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [backport gcc-10][AArch64] ACLE bf16 convert
  2020-11-03 13:06                 ` Dennis Zhang
@ 2020-12-10 14:26                   ` Dennis Zhang
  2020-12-10 14:34                     ` [backport gcc-10][AArch64] ACLE bf16 get Dennis Zhang
  2020-12-11 11:23                     ` [backport gcc-10][AArch64] ACLE bf16 convert Kyrylo Tkachov
  0 siblings, 2 replies; 23+ messages in thread
From: Dennis Zhang @ 2020-12-10 14:26 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov,
	Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 388 bytes --]

Hi all,

This patch backports the commit f7d6961126a7f06c8089d8a58bd21be43bc16806.
The original is approved at https://gcc.gnu.org/pipermail/gcc-patches/2020-November/557859.html
The only change is to remove FPCR-reading flags for builtin definition since it's not supported in gcc-10.
Regtested and bootstrapped for aarch64-none-linux-gnu.

Is it OK to backport?

Cheers
Dennis

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: rb13902.patch --]
[-- Type: text/x-patch; name="rb13902.patch", Size: 4732 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index ba2bda26dcdd4947dc724851433451433d378724..7192f3954d311d89064707cfcb735efad4377c12 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -728,3 +728,8 @@
   VAR1 (UNOP, bfcvtn_q, 0, v8bf)
   VAR1 (BINOP, bfcvtn2, 0, v8bf)
   VAR1 (UNOP, bfcvt, 0, bf)
+
+  /* Implemented by aarch64_{v}bfcvt{_high}<mode>.  */
+  VAR2 (UNOP, vbfcvt, 0, v4bf, v8bf)
+  VAR1 (UNOP, vbfcvt_high, 0, v8bf)
+  VAR1 (UNOP, bfcvt, 0, sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd1e6ff5246f84e919402c687687a84beb8..2e8aa668b107f039e4958b6998da180a6d11b881 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7238,3 +7238,31 @@
   "bfcvt\\t%h0, %s1"
   [(set_attr "type" "f_cvt")]
 )
+
+;; Use shl/shll/shll2 to convert BF scalar/vector modes to SF modes.
+(define_insn "aarch64_vbfcvt<mode>"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:VBF 1 "register_operand" "w")]
+		      UNSPEC_BFCVTN))]
+  "TARGET_BF16_SIMD"
+  "shll\\t%0.4s, %1.4h, #16"
+  [(set_attr "type" "neon_shift_imm_long")]
+)
+
+(define_insn "aarch64_vbfcvt_highv8bf"
+  [(set (match_operand:V4SF 0 "register_operand" "=w")
+	(unspec:V4SF [(match_operand:V8BF 1 "register_operand" "w")]
+		      UNSPEC_BFCVTN2))]
+  "TARGET_BF16_SIMD"
+  "shll2\\t%0.4s, %1.8h, #16"
+  [(set_attr "type" "neon_shift_imm_long")]
+)
+
+(define_insn "aarch64_bfcvtsf"
+  [(set (match_operand:SF 0 "register_operand" "=w")
+	(unspec:SF [(match_operand:BF 1 "register_operand" "w")]
+		    UNSPEC_BFCVT))]
+  "TARGET_BF16_FP"
+  "shl\\t%d0, %d1, #16"
+  [(set_attr "type" "neon_shift_imm")]
+)
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
index 984875dcc014300c489209c11abf41b1c47b7fbe..881615498d3d52662d7ebb3ab1e8d52d5a40cab8 100644
--- a/gcc/config/aarch64/arm_bf16.h
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -40,6 +40,13 @@ vcvth_bf16_f32 (float32_t __a)
   return __builtin_aarch64_bfcvtbf (__a);
 }
 
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtah_f32_bf16 (bfloat16_t __a)
+{
+  return __builtin_aarch64_bfcvtsf (__a);
+}
+
 #pragma GCC pop_options
 
 #endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 95bfa5ebba21b739ee3c84e3971337646f8881d4..69cccd3278642814f3961c5bf52be5639f5ef3f3 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35680,6 +35680,27 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_bf16 (bfloat16x4_t __a)
+{
+  return __builtin_aarch64_vbfcvtv4bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvtv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvt_highv8bf (__a);
+}
+
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcvt_bf16_f32 (float32x4_t __a)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
index bbea630b1820d578bdf1619834f29b919f5c3f32..47af7c494d9b9d1f4b63e802efc293348a40e270 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfcvt-compile.c
@@ -46,3 +46,43 @@ bfloat16_t test_bfcvt (float32_t a)
 {
   return vcvth_bf16_f32 (a);
 }
+
+/*
+**test_vcvt_f32_bf16:
+**     shll	v0.4s, v0.4h, #16
+**     ret
+*/
+float32x4_t test_vcvt_f32_bf16 (bfloat16x4_t a)
+{
+  return vcvt_f32_bf16 (a);
+}
+
+/*
+**test_vcvtq_low_f32_bf16:
+**     shll	v0.4s, v0.4h, #16
+**     ret
+*/
+float32x4_t test_vcvtq_low_f32_bf16 (bfloat16x8_t a)
+{
+  return vcvtq_low_f32_bf16 (a);
+}
+
+/*
+**test_vcvtq_high_f32_bf16:
+**     shll2	v0.4s, v0.8h, #16
+**     ret
+*/
+float32x4_t test_vcvtq_high_f32_bf16 (bfloat16x8_t a)
+{
+  return vcvtq_high_f32_bf16 (a);
+}
+
+/*
+**test_vcvtah_f32_bf16:
+**     shl	d0, d0, #16
+**     ret
+*/
+float32_t test_vcvtah_f32_bf16 (bfloat16_t a)
+{
+  return vcvtah_f32_bf16 (a);
+}

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [backport gcc-10][AArch64] ACLE bf16 get
  2020-12-10 14:26                   ` [backport gcc-10][AArch64] ACLE bf16 convert Dennis Zhang
@ 2020-12-10 14:34                     ` Dennis Zhang
  2020-12-11 11:58                       ` Kyrylo Tkachov
  2020-12-11 11:23                     ` [backport gcc-10][AArch64] ACLE bf16 convert Kyrylo Tkachov
  1 sibling, 1 reply; 23+ messages in thread
From: Dennis Zhang @ 2020-12-10 14:34 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov,
	Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 700 bytes --]

Hi all,

This patch backports the commit 3553c658533e430b232997bdfd97faf6606fb102.
The original is approved at https://gcc.gnu.org/pipermail/gcc-patches/2020-November/557871.html
There is a change to remove FPCR-reading flag for builtin declaration since it's not supported in gcc-10.

Another change is to remove a test (bf16_get-be.c) that fails compiling on aarch64-none-linux-gnu in the original patch.
This is reported at https://gcc.gnu.org/pipermail/gcc-patches/2020-November/558195.html
The failure happens for several bf16 big-endian tests so the bug would be fixed in a separate patch.
And the test should be added after the bug is fixed.

Is it OK to backport?

Cheers
Dennis

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: rb13903.patch --]
[-- Type: text/x-patch; name="rb13903.patch", Size: 3748 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index ba2bda26dcdd4947dc724851433451433d378724..05726db1f6137f9ab29fcdd51f804199e24bbfcf 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -718,6 +718,10 @@
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf)
 
+  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
+  VAR1 (UNOP, vget_lo_half, 0, v8bf)
+  VAR1 (UNOP, vget_hi_half, 0, v8bf)
+
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
   VAR1 (TERNOP, simd_smmla, 0, v16qi)
   VAR1 (TERNOPU, simd_ummla, 0, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd1e6ff5246f84e919402c687687a84beb8..43ac3cd40fe8379567b7a60772f360d37818e8e9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,27 @@
   [(set_attr "type" "neon_dot<VDQSF:q>")]
 )
 
+;; vget_low/high_bf16
+(define_expand "aarch64_vget_lo_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")]
+  "TARGET_BF16_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
+  DONE;
+})
+
+(define_expand "aarch64_vget_hi_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")]
+  "TARGET_BF16_SIMD"
+{
+  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, true);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
+  DONE;
+})
+
 ;; bfmmla
 (define_insn "aarch64_bfmmlaqv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 95bfa5ebba21b739ee3c84e3971337646f8881d4..0fd78a6fd076f788d2618c492a026246e61e438c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35680,6 +35680,20 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_lo_halfv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_hi_halfv8bf (__a);
+}
+
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcvt_bf16_f32 (float32x4_t __a)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get.c
new file mode 100644
index 0000000000000000000000000000000000000000..2193753ffbb6246aa16eb5033559b21266a556a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_get.c
@@ -0,0 +1,27 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+**test_vget_low_bf16:
+**     ret
+*/
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+/*
+**test_vget_high_bf16:
+**     dup	d0, v0.d\[1\]
+**     ret
+*/
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [backport gcc-10][AArch64] ACLE bf16 convert
  2020-12-10 14:26                   ` [backport gcc-10][AArch64] ACLE bf16 convert Dennis Zhang
  2020-12-10 14:34                     ` [backport gcc-10][AArch64] ACLE bf16 get Dennis Zhang
@ 2020-12-11 11:23                     ` Kyrylo Tkachov
  2020-12-11 16:35                       ` Dennis Zhang
  1 sibling, 1 reply; 23+ messages in thread
From: Kyrylo Tkachov @ 2020-12-11 11:23 UTC (permalink / raw)
  To: Dennis Zhang, gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Richard Sandiford



> -----Original Message-----
> From: Dennis Zhang <Dennis.Zhang@arm.com>
> Sent: 10 December 2020 14:27
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: [backport gcc-10][AArch64] ACLE bf16 convert
> 
> Hi all,
> 
> This patch backports the commit
> f7d6961126a7f06c8089d8a58bd21be43bc16806.
> The original is approved at https://gcc.gnu.org/pipermail/gcc-patches/2020-
> November/557859.html
> The only change is to remove FPCR-reading flags for builtin definition since
> it's not supported in gcc-10.
> Regtested and bootstrapped for aarch64-none-linux-gnu.
> 
> Is it OK to backport?

Ok.
Thanks,
Kyrill

> 
> Cheers
> Dennis

^ permalink raw reply	[flat|nested] 23+ messages in thread

* RE: [backport gcc-10][AArch64] ACLE bf16 get
  2020-12-10 14:34                     ` [backport gcc-10][AArch64] ACLE bf16 get Dennis Zhang
@ 2020-12-11 11:58                       ` Kyrylo Tkachov
  2020-12-11 16:31                         ` Dennis Zhang
  0 siblings, 1 reply; 23+ messages in thread
From: Kyrylo Tkachov @ 2020-12-11 11:58 UTC (permalink / raw)
  To: Dennis Zhang, gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Richard Sandiford



> -----Original Message-----
> From: Dennis Zhang <Dennis.Zhang@arm.com>
> Sent: 10 December 2020 14:35
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: [backport gcc-10][AArch64] ACLE bf16 get
> 
> Hi all,
> 
> This patch backports the commit
> 3553c658533e430b232997bdfd97faf6606fb102.
> The original is approved at https://gcc.gnu.org/pipermail/gcc-patches/2020-
> November/557871.html
> There is a change to remove FPCR-reading flag for builtin declaration since
> it's not supported in gcc-10.
> 
> Another change is to remove a test (bf16_get-be.c) that fails compiling on
> aarch64-none-linux-gnu in the original patch.
> This is reported at https://gcc.gnu.org/pipermail/gcc-patches/2020-
> November/558195.html
> The failure happens for several bf16 big-endian tests so the bug would be
> fixed in a separate patch.
> And the test should be added after the bug is fixed.
> 
> Is it OK to backport?

But do the tests added here work for big-endian?
Ok if they do.
Thanks,
Kyrill

> 
> Cheers
> Dennis

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [backport gcc-10][AArch64] ACLE bf16 get
  2020-12-11 11:58                       ` Kyrylo Tkachov
@ 2020-12-11 16:31                         ` Dennis Zhang
  0 siblings, 0 replies; 23+ messages in thread
From: Dennis Zhang @ 2020-12-11 16:31 UTC (permalink / raw)
  To: Kyrylo Tkachov, gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Richard Sandiford

Hi Kyrylo,

> ________________________________________
> From: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Sent: Friday, December 11, 2020 11:58 AM
> To: Dennis Zhang; gcc-patches@gcc.gnu.org
> Cc: nd; Richard Earnshaw; Marcus Shawcroft; Richard Sandiford
> Subject: RE: [backport gcc-10][AArch64] ACLE bf16 get
> 
> > -----Original Message-----
> > From: Dennis Zhang <Dennis.Zhang@arm.com>
> > Sent: 10 December 2020 14:35
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> > <Richard.Sandiford@arm.com>
> > Subject: [backport gcc-10][AArch64] ACLE bf16 get
> >
> > Hi all,
> >
> > This patch backports the commit
> > 3553c658533e430b232997bdfd97faf6606fb102.
> > The original is approved at https://gcc.gnu.org/pipermail/gcc-patches/2020-
> > November/557871.html
> > There is a change to remove FPCR-reading flag for builtin declaration since
> > it's not supported in gcc-10.
> >
> > Another change is to remove a test (bf16_get-be.c) that fails compiling on
> > aarch64-none-linux-gnu in the original patch.
> > This is reported at https://gcc.gnu.org/pipermail/gcc-patches/2020-
> > November/558195.html
> > The failure happens for several bf16 big-endian tests so the bug would be
> > fixed in a separate patch.
> > And the test should be added after the bug is fixed.
> >
> > Is it OK to backport?
> 
> But do the tests added here work for big-endian?
> Ok if they do.
> Thanks,
> Kyrill

Thanks for asking. The added test (bf16_get.c) works for both aarch64-none-linux-gnu and aarch64_be-none-linux-gnu.
The patch is committed as c25f7eac6555d67523f0520c7e93bbc398d0da84.

Cheers
Dennis

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [backport gcc-10][AArch64] ACLE bf16 convert
  2020-12-11 11:23                     ` [backport gcc-10][AArch64] ACLE bf16 convert Kyrylo Tkachov
@ 2020-12-11 16:35                       ` Dennis Zhang
  0 siblings, 0 replies; 23+ messages in thread
From: Dennis Zhang @ 2020-12-11 16:35 UTC (permalink / raw)
  To: Kyrylo Tkachov, gcc-patches
  Cc: nd, Richard Earnshaw, Marcus Shawcroft, Richard Sandiford

> ________________________________________
> From: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Sent: Friday, December 11, 2020 11:23 AM
> To: Dennis Zhang; gcc-patches@gcc.gnu.org
> Cc: nd; Richard Earnshaw; Marcus Shawcroft; Richard Sandiford
> Subject: RE: [backport gcc-10][AArch64] ACLE bf16 convert
> 
> > -----Original Message-----
> > From: Dennis Zhang <Dennis.Zhang@arm.com>
> > Sent: 10 December 2020 14:27
> > To: gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> > <Richard.Sandiford@arm.com>
> > Subject: [backport gcc-10][AArch64] ACLE bf16 convert
> >
> > Hi all,
> >
> > This patch backports the commit
> > f7d6961126a7f06c8089d8a58bd21be43bc16806.
> > The original is approved at https://gcc.gnu.org/pipermail/gcc-patches/2020-
> > November/557859.html
> > The only change is to remove FPCR-reading flags for builtin definition since
> > it's not supported in gcc-10.
> > Regtested and bootstrapped for aarch64-none-linux-gnu.
> >
> > Is it OK to backport?
> 
> Ok.
> Thanks,
> Kyrill

Thanks Kyrill!
The patch is committed as 702e45ee471422dee86d32fc84f617d341d33175.

Bests
Dennis

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2020-12-11 16:35 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-26 17:25 [PATCH][AArch64] Enable CLI for Armv8.6-a: armv8.6-a, i8mm and bf16 Dennis Zhang
2019-11-29 13:02 ` Richard Sandiford
2019-12-05 15:31   ` Dennis Zhang
2019-12-06 10:22     ` Richard Sandiford
2019-12-12 17:01       ` Dennis Zhang
2019-12-13 10:23         ` Richard Sandiford
2020-10-29 12:19         ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Dennis Zhang
2020-10-29 12:28           ` [PATCH][AArch64] ACLE intrinsics: get low/high half from BFloat16 vector Dennis Zhang
2020-10-30 14:07             ` Richard Sandiford
2020-11-03 11:16               ` Dennis Zhang
2020-11-03 14:05                 ` Richard Sandiford
2020-11-03 17:00                   ` Dennis Zhang
2020-11-05 20:07                 ` Christophe Lyon
2020-10-29 17:48           ` [PATCH][AArch64] ACLE intrinsics: convert from BFloat16 to Float32 Richard Sandiford
2020-11-02 17:27             ` Dennis Zhang
2020-11-02 19:05               ` Richard Sandiford
2020-11-03 13:06                 ` Dennis Zhang
2020-12-10 14:26                   ` [backport gcc-10][AArch64] ACLE bf16 convert Dennis Zhang
2020-12-10 14:34                     ` [backport gcc-10][AArch64] ACLE bf16 get Dennis Zhang
2020-12-11 11:58                       ` Kyrylo Tkachov
2020-12-11 16:31                         ` Dennis Zhang
2020-12-11 11:23                     ` [backport gcc-10][AArch64] ACLE bf16 convert Kyrylo Tkachov
2020-12-11 16:35                       ` Dennis Zhang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).