public inbox for gcc-patches@gcc.gnu.org
* [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
@ 2019-12-18 16:35 Stam Markianos-Wright
  2019-12-19 10:07 ` Richard Sandiford
  0 siblings, 1 reply; 9+ messages in thread
From: Stam Markianos-Wright @ 2019-12-18 16:35 UTC (permalink / raw)
  To: gcc-patches
  Cc: Richard Earnshaw, Richard Sandiford, Kyrylo Tkachov, Marcus Shawcroft

[-- Attachment #1: Type: text/plain, Size: 2265 bytes --]

Hi all,

This patch adds Bfloat16 type support to the AArch64 back-end.
It also adds a new machine_mode (BFmode) for this type and the accompanying
vector modes V4BFmode and V8BFmode.

The second patch in this series uses existing target hooks to restrict type use.

Regression testing on aarch64-none-elf passed successfully.

This patch depends on:

https://gcc.gnu.org/ml/gcc-patches/2019-12/msg00857.html

for the testsuite effective_target update.

Ok for trunk?

Cheers,
Stam


ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

Details on ARM Bfloat can be found here:
https://community.arm.com/developer/ip-products/processors/b/ml-ip-blog/posts/bfloat16-processing-for-neural-networks-on-armv8_2d00_a 


PS. I don't have commit rights, so if someone could commit on my behalf,
that would be great :)



gcc/ChangeLog:

2019-12-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>

	* config.gcc: Add arm_bf16.h.
	* config/aarch64/aarch64-builtins.c
	(aarch64_simd_builtin_std_type): Add BFmode.
	(aarch64_init_simd_builtin_types): Add element types for the Bfloat
	vector types.
	(aarch64_init_bf16_types): New function.
	(aarch64_general_init_builtins): Add aarch64_init_bf16_types call.
	* config/aarch64/aarch64-modes.def: Add BFmode and vector modes.
	* config/aarch64/aarch64-simd-builtin-types.def (Bfloat16x4_t)
	(Bfloat16x8_t): New entries.
	* config/aarch64/aarch64-simd.md: Add BF types to NEON move patterns.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Add BF modes.
	(aarch64_gimplify_va_arg_expr): Add BFmode.
	* config/aarch64/aarch64.h (AARCH64_VALID_SIMD_DREG_MODE): Add V4BF.
	(AARCH64_VALID_SIMD_QREG_MODE): Add V8BF.
	* config/aarch64/aarch64.md (arches): Add fp16_notbf16 and notbf16
	values.  Extend the movhf pattern to BFmode.
	* config/aarch64/arm_bf16.h: New file.
	* config/aarch64/arm_neon.h: Include arm_bf16.h and add Bfloat vector
	types.
	* config/aarch64/iterators.md (HFBF, GPF_TF_F16_MOV, VDMOV, VQMOV)
	(VALL_F16MOV): New.



gcc/testsuite/ChangeLog:

2019-12-16  Stam Markianos-Wright  <stam.markianos-wright@arm.com>

	* gcc.target/aarch64/bfloat16_compile.c: New test.


[-- Attachment #2: BFmode1of2.patch --]
[-- Type: text/x-patch; name="BFmode1of2.patch", Size: 17876 bytes --]

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 9802f436e06..b49c110ccaf 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -315,7 +315,7 @@ m32c*-*-*)
         ;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_acle.h arm_sve.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index c35a1b1f029..3ba2f12166f 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -68,6 +68,9 @@
 #define hi_UP    E_HImode
 #define hf_UP    E_HFmode
 #define qi_UP    E_QImode
+#define bf_UP    E_BFmode
+#define v4bf_UP  E_V4BFmode
+#define v8bf_UP  E_V8BFmode
 #define UP(X) X##_UP
 
 #define SIMD_MAX_BUILTIN_ARGS 5
@@ -568,6 +571,10 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE;
 tree aarch64_fp16_type_node = NULL_TREE;
 tree aarch64_fp16_ptr_type_node = NULL_TREE;
 
+/* Back-end node type for brain float (bfloat) types.  */
+tree aarch64_bf16_type_node = NULL_TREE;
+tree aarch64_bf16_ptr_type_node = NULL_TREE;
+
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
    (relative to AARCH64_BUILTIN_GENERAL).  */
@@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
       return float_type_node;
     case E_DFmode:
       return double_type_node;
+    case E_BFmode:
+      return aarch64_bf16_type_node;
     default:
       gcc_unreachable ();
     }
@@ -750,6 +759,11 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
   aarch64_simd_types[Float64x2_t].eltype = double_type_node;
 
+
+/* Init Bfloat vector types with underlying uint types.  */
+  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = aarch64_simd_types[i].eltype;
@@ -1059,6 +1073,19 @@ aarch64_init_fp16_types (void)
   aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
 }
 
+/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
+static void
+aarch64_init_bf16_types (void)
+{
+  aarch64_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
+  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
+  layout_type (aarch64_bf16_type_node);
+
+  (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, "__bf16");
+  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
+}
+
 /* Pointer authentication builtins that will become NOP on legacy platform.
    Currently, these builtins are for internal use only (libgcc EH unwinder).  */
 
@@ -1214,6 +1241,8 @@ aarch64_general_init_builtins (void)
 
   aarch64_init_fp16_types ();
 
+  aarch64_init_bf16_types ();
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
 
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 3c698b620cd..59f2ec4eaec 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -69,6 +69,13 @@ VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
+/* Bfloat16 modes.  */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
+VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
+VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
+
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
 
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index b015694293c..3b387377f38 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -50,3 +50,5 @@
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
   ENTRY (Float64x2_t, V2DF, none, 13)
+  ENTRY (Bfloat16x4_t, V4BF, none, 15)
+  ENTRY (Bfloat16x8_t, V8BF, none, 15)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ad4676bc167..c4858ab7cff 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand")
+	(match_operand:VALL_F16MOV 1 "general_operand"))]
   "TARGET_SIMD"
   "
   /* Force the operand into a register if it is not an
@@ -101,10 +101,10 @@
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VD:mode>"
-  [(set (match_operand:VD 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VDMOV:mode>"
+  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VD 1 "general_operand"
+	(match_operand:VDMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -126,13 +126,14 @@
 }
   [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
 		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
-		     mov_reg, neon_move<q>")]
+		     mov_reg, neon_move<q>")
+    (set_attr "arch" "*,notbf16,*,*,*,*,*,notbf16")]
 )
 
-(define_insn "*aarch64_simd_mov<VQ:mode>"
-  [(set (match_operand:VQ 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VQMOV:mode>"
+  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VQ 1 "general_operand"
+	(match_operand:VQMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -161,7 +162,8 @@
   [(set_attr "type" "neon_load1_1reg<q>, store_16, neon_store1_1reg<q>,\
 		     neon_logic<q>, multiple, multiple,\
 		     multiple, neon_move<q>")
-   (set_attr "length" "4,4,4,4,8,8,8,4")]
+   (set_attr "length" "4,4,4,4,8,8,8,4")
+   (set_attr "arch" "*,notbf16,*,*,*,*,*,notbf16")]
 )
 
 ;; When storing lane zero we can use the normal STR and its more permissive
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b0aca03bcb4..f57469b6e23 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1692,6 +1692,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V2SImode:
     /* ...E_V1DImode doesn't exist.  */
     case E_V4HFmode:
+    case E_V4BFmode:
     case E_V2SFmode:
     case E_V1DFmode:
     /* 128-bit Advanced SIMD vectors.  */
@@ -1700,6 +1701,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V4SImode:
     case E_V2DImode:
     case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
@@ -15548,6 +15550,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
+	case E_BFmode:
+	  field_t = aarch64_bf16_type_node;
+	  field_ptr_t = aarch64_bf16_ptr_type_node;
+	  break;
 	case E_V2SImode:
 	case E_V4SImode:
 	    {
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2bb5a208720..857e2b8f90e 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1120,13 +1120,13 @@ extern enum aarch64_code_model aarch64_cmodel;
 #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \
   ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \
    || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \
-   || (MODE) == DFmode)
+   || (MODE) == DFmode || (MODE) == V4BFmode)
 
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
    || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
-   || (MODE) == V2DFmode)
+   || (MODE) == V2DFmode || (MODE) == V8BFmode)
 
 #define ENDIAN_LANE_N(NUNITS, N) \
   (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N)
@@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* This type is the user-visible __bf16, and a pointer to that type.  We
+   need it in many places in the backend.  Defined in aarch64-builtins.c.  */
+extern tree aarch64_bf16_type_node;
+extern tree aarch64_bf16_ptr_type_node;
+
 /* The generic unwind code in libgcc does not initialize the frame pointer.
    So in order to unwind a function using a frame pointer, the very first
    function that is unwound must save the frame pointer.  That way the frame
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b11ead7ab23..6c1cd76bb16 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -344,7 +344,7 @@
 ;; alternative). This attribute is used to compute attribute "enabled", use type
 ;; "any" to enable an alternative in all cases.
 
-(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
+(define_enum "arches" [ any rcpc8_4 fp simd sve fp16 fp16_notbf16 notbf16])
 
 (define_enum_attr "arch" "arches" (const_string "any"))
 
@@ -378,6 +378,12 @@
 	(and (eq_attr "arch" "fp16")
 	     (match_test "TARGET_FP_F16INST"))
 
+	(and (eq_attr "arch" "fp16_notbf16")
+	     (match_test "TARGET_FP_F16INST && !TARGET_BF16_FP"))
+
+	(and (eq_attr "arch" "notbf16")
+	     (match_test "!TARGET_BF16_SIMD"))
+
 	(and (eq_attr "arch" "sve")
 	     (match_test "TARGET_SVE")))
     (const_string "yes")
@@ -1304,8 +1310,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand")
-	(match_operand:GPF_TF_F16 1 "general_operand"))]
+  [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand")
+	(match_operand:GPF_TF_F16_MOV 1 "general_operand"))]
   ""
   {
     if (!TARGET_FLOAT)
@@ -1321,11 +1327,11 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
-  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
@@ -1341,7 +1347,7 @@
    mov\\t%w0, %w1"
   [(set_attr "type" "neon_move,f_mcr,neon_move,neon_to_gp, neon_move,fconsts, \
 		     neon_move,f_loads,f_stores,load_4,store_4,mov_reg")
-   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")]
+   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16_notbf16,simd,*,*,*,*,*")]
 )
 
 (define_insn "*movsf_aarch64"
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
new file mode 100644
index 00000000000..aedb0972735
--- /dev/null
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -0,0 +1,42 @@
+/* Arm BF16 intrinsics include file.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+#include <stdint.h>
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
+
+typedef __bf16 bfloat16_t;
+
+
+#endif
+#pragma GCC pop_options
+
+#endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 8b861601a48..5996df0a612 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -73,6 +73,10 @@ typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
+typedef __bf16 bfloat16_t;
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
 typedef struct int8x8x2_t
 {
   int8x8_t val[2];
@@ -34606,6 +34610,8 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+#include "arm_bf16.h"
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 1ca5ed1ef1b..9480efef47c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -57,9 +57,17 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_HF [HF SF DF])
 
+;; Iterator for all 16-bit scalar floating point modes (HF, BF)
+(define_mode_iterator HFBF [HF BF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
+;; Iterator for all scalar floating point modes suitable for moving, including
+;; special BF type.(HF, SF, DF, TF and BF)
+(define_mode_iterator GPF_TF_F16_MOV [(HF "") (BF "TARGET_BF16_FP") (SF "")
+				      (DF "") (TF "")])
+
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
 
@@ -79,6 +87,9 @@
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
+;; Double vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; All modes stored in registers d0-d31.
 (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 
@@ -94,6 +105,9 @@
 ;; Quad vector modes.
 (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Quad vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
+
 ;; Copy of the above.
 (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
@@ -160,6 +174,15 @@
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; including special Bfloat vector types.
+(define_mode_iterator VALL_F16MOV [(V8QI "") (V16QI "") (V4HI "") (V8HI "")
+				   (V2SI "") (V4SI "") (V2DI "")
+				   (V4HF "") (V8HF "")
+				   (V4BF "TARGET_BF16_SIMD")
+				   (V8BF "TARGET_BF16_SIMD")
+				   (V2SF "") (V4SF "") (V2DF "")])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -885,7 +908,8 @@
 			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF  "16b")
 			  (DI   "8b")  (DF    "8b")
-			  (SI   "8b")  (SF    "8b")])
+			  (SI   "8b")  (SF    "8b")
+			  (V4BF "8b")  (V8BF  "16b")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
@@ -1265,6 +1289,7 @@
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
 		     (V4HF "") (V8HF "_q")
+		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
 			       (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")])
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
new file mode 100644
index 00000000000..f2bef671deb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
@@ -0,0 +1,51 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, [0-9]+\]
+**	ldr	h0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	...
+**	str	d0, \[sp, [0-9]+\]
+**	ldr	d0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	...
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	...
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+



* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2019-12-18 16:35 [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2] Stam Markianos-Wright
@ 2019-12-19 10:07 ` Richard Sandiford
  2019-12-23 16:57   ` Stam Markianos-Wright
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Sandiford @ 2019-12-19 10:07 UTC (permalink / raw)
  To: Stam Markianos-Wright
  Cc: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft

Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> [...]
> @@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
>        return float_type_node;
>      case E_DFmode:
>        return double_type_node;
> +    case E_BFmode:
> +      return aarch64_bf16_type_node;
>      default:
>        gcc_unreachable ();
>      }
> @@ -750,6 +759,11 @@ aarch64_init_simd_builtin_types (void)
>    aarch64_simd_types[Float64x1_t].eltype = double_type_node;
>    aarch64_simd_types[Float64x2_t].eltype = double_type_node;
>  
> +
> +/* Init Bfloat vector types with underlying uint types.  */
> +  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
> +  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;

Formatting nits: too many blank lines, comment should be indented
to match the code.

> +
>    for (i = 0; i < nelts; i++)
>      {
>        tree eltype = aarch64_simd_types[i].eltype;
> @@ -1059,6 +1073,19 @@ aarch64_init_fp16_types (void)
>    aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
>  }
>  
> +/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
> +static void
> +aarch64_init_bf16_types (void)
> +{
> +  aarch64_bf16_type_node = make_node (REAL_TYPE);
> +  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
> +  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
> +  layout_type (aarch64_bf16_type_node);
> +
> +  (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, "__bf16");

This style is mostly a carry-over from pre-ANSI days.  New code
can just use "lang_hooks.types.register_builtin_type (...)".

> +  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
> +}
> +
>  /* Pointer authentication builtins that will become NOP on legacy platform.
>     Currently, these builtins are for internal use only (libgcc EH unwinder).  */
>  
> [...]
> diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
> index b015694293c..3b387377f38 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
> @@ -50,3 +50,5 @@
>    ENTRY (Float32x4_t, V4SF, none, 13)
>    ENTRY (Float64x1_t, V1DF, none, 13)
>    ENTRY (Float64x2_t, V2DF, none, 13)
> +  ENTRY (Bfloat16x4_t, V4BF, none, 15)
> +  ENTRY (Bfloat16x8_t, V8BF, none, 15)

Should be 14 (number of characters + 2 for "__").  Would be good to have
a test for correct C++ mangling.
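
(For illustration, such a test might look like the sketch below; the mangled
strings assume the usual vendor-extended-type rules and the 14-character
names above, so treat them as guesses to be checked rather than known-good
output.)

/* Hypothetical g++.target/aarch64 test, compiled as C++ with bf16 enabled.  */
#include <arm_neon.h>

void f (bfloat16x4_t a) {}   /* expect _Z1f14__Bfloat16x4_t */
void g (bfloat16x8_t a) {}   /* expect _Z1g14__Bfloat16x8_t */

/* { dg-final { scan-assembler "_Z1f14__Bfloat16x4_t" } } */
/* { dg-final { scan-assembler "_Z1g14__Bfloat16x8_t" } } */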

> [...]
> @@ -101,10 +101,10 @@
>    [(set_attr "type" "neon_dup<q>")]
>  )
>  
> -(define_insn "*aarch64_simd_mov<VD:mode>"
> -  [(set (match_operand:VD 0 "nonimmediate_operand"
> +(define_insn "*aarch64_simd_mov<VDMOV:mode>"
> +  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
>  		"=w, m,  m,  w, ?r, ?w, ?r, w")
> -	(match_operand:VD 1 "general_operand"
> +	(match_operand:VDMOV 1 "general_operand"
>  		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>    "TARGET_SIMD
>     && (register_operand (operands[0], <MODE>mode)
> @@ -126,13 +126,14 @@
>  }
>    [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
>  		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
> -		     mov_reg, neon_move<q>")]
> +		     mov_reg, neon_move<q>")
> +    (set_attr "arch" "*,notbf16,*,*,*,*,*,notbf16")]
>  )

Together with the changes to the arch attribute:

> @@ -378,6 +378,12 @@
>  	(and (eq_attr "arch" "fp16")
>  	     (match_test "TARGET_FP_F16INST"))
>  
> +	(and (eq_attr "arch" "fp16_notbf16")
> +	     (match_test "TARGET_FP_F16INST && !TARGET_BF16_FP"))
> +
> +	(and (eq_attr "arch" "notbf16")
> +	     (match_test "!TARGET_BF16_SIMD"))
> +
>  	(and (eq_attr "arch" "sve")
>  	     (match_test "TARGET_SVE")))
>      (const_string "yes")

this will disable the second and final alternatives for all VDMOV modes
when bf16 is enabled.  E.g. enabling bf16 will disable those alternatives
for V4HI as well as V4BF.

If you want to disable some alternatives for V4BF then it'd be better to
use define_mode_attr instead.  But are you sure we need to disable them?
The m<-Dz alternative should work for V4BF as well.  The w<-Dn alternative
should work too -- it's up to aarch64_simd_valid_immediate to decide
which immediates are valid.

> [...]
> @@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
>  extern tree aarch64_fp16_type_node;
>  extern tree aarch64_fp16_ptr_type_node;
>  
> +/* This type is the user-visible __bf16, and a pointer to that type.  We
> +   need it in many places in the backend.  Defined in aarch64-builtins.c.  */

Not sure the number of places in this patch counts as "many" :-)
Probably best just to drop that sentence.

> +extern tree aarch64_bf16_type_node;
> +extern tree aarch64_bf16_ptr_type_node;
> +
>  /* The generic unwind code in libgcc does not initialize the frame pointer.
>     So in order to unwind a function using a frame pointer, the very first
>     function that is unwound must save the frame pointer.  That way the frame
> [...]
> @@ -1321,11 +1327,11 @@
>    }
>  )
>  
> -(define_insn "*movhf_aarch64"
> -  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
> -	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
> -  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
> -    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
> +(define_insn "*mov<mode>_aarch64"
> +  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
> +	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
> +  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
> +    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
>    "@
>     movi\\t%0.4h, #0
>     fmov\\t%h0, %w1
> @@ -1341,7 +1347,7 @@
>     mov\\t%w0, %w1"
>    [(set_attr "type" "neon_move,f_mcr,neon_move,neon_to_gp, neon_move,fconsts, \
>  		     neon_move,f_loads,f_stores,load_4,store_4,mov_reg")
> -   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")]
> +   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16_notbf16,simd,*,*,*,*,*")]
>  )

Here too we should avoid changing "arch" if possible.  Why do you need
to exclude the FMOV alternative for bf16?

> diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
> new file mode 100644
> index 00000000000..aedb0972735
> --- /dev/null
> +++ b/gcc/config/aarch64/arm_bf16.h
> @@ -0,0 +1,42 @@
> +/* Arm BF16 intrinsics include file.
> +
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   Contributed by Arm.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published
> +   by the Free Software Foundation; either version 3, or (at your
> +   option) any later version.
> +
> +   GCC is distributed in the hope that it will be useful, but WITHOUT
> +   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
> +   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
> +   License for more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _AARCH64_BF16_H_
> +#define _AARCH64_BF16_H_
> +
> +#include <stdint.h>

Are we supposed to include stdint.h?  The ACLE spec doesn't seem
to require it.

> +
> +#pragma GCC push_options
> +#pragma GCC target ("arch=armv8.2-a+bf16")
> +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
> +
> +typedef __bf16 bfloat16_t;
> +
> +
> +#endif
> +#pragma GCC pop_options
> +
> +#endif

Are you sure we need the #ifdef?  The target pragma should guarantee
that the macro's defined.

But the validity of the typedef shouldn't depend on target options,
so AFAICT this should just be:

typedef __bf16 bfloat16_t;

> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
> new file mode 100644
> index 00000000000..f2bef671deb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
> @@ -0,0 +1,51 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-O3 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**stacktest1:
> +**	...
> +**	str	h0, \[sp, [0-9]+\]
> +**	ldr	h0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16_t stacktest1 (bfloat16_t __a)
> +{
> +  volatile bfloat16_t b = __a;
> +  return b;
> +}
> +
> +/*
> +**stacktest2:
> +**	...
> +**	str	d0, \[sp, [0-9]+\]
> +**	ldr	d0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16x4_t stacktest2 (bfloat16x4_t __a)
> +{
> +  volatile bfloat16x4_t b = __a;
> +  return b;
> +}
> +
> +/*
> +**stacktest3:
> +**	...
> +**	str	q0, \[sp\]
> +**	ldr	q0, \[sp\]
> +**	...
> +**	ret
> +*/
> +bfloat16x8_t stacktest3 (bfloat16x8_t __a)
> +{
> +  volatile bfloat16x8_t b = __a;
> +  return b;
> +}
> +
> +

It would be good to have more test coverage than this.  E.g.:

- a test that includes arm_bf16.h, with just scalar tests.

- a test that includes arm_bf16.h without bf16 enabled, switches bf16 on,
  and then uses bfloat16_t.

- a test that includes arm_bf16.h without bf16 enabled and tries to use
  bfloat16_t without turning bf16 on.

- a test for _Complex bfloat16_t.

- a test for moves involving:

    typedef bfloat16_t v16bf __attribute__((vector_size(32)));

- a test that involves moving constants, for both scalars and vectors.
  You can create zero scalar constants in C++ using bfloat16_t() etc.
  For vectors it's possible to do things like:

    typedef short v2hi __attribute__((vector_size(4)));
    v2hi foo (void) { return (v2hi) 0x12345678; }

  The same sort of things should work for bfloat16x4_t and bfloat16x8_t.
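
  For the bfloat analogue of that last example, a minimal sketch (constant
  value chosen arbitrarily) could be:

    #include <arm_neon.h>

    bfloat16x4_t foo (void)
    {
      /* Reinterpret a 64-bit integer constant as a 64-bit bfloat vector.  */
      return (bfloat16x4_t) 0x1234567812345678ULL;
    }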

Thanks,
Richard


* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2019-12-19 10:07 ` Richard Sandiford
@ 2019-12-23 16:57   ` Stam Markianos-Wright
  2019-12-23 17:07     ` Richard Sandiford
  0 siblings, 1 reply; 9+ messages in thread
From: Stam Markianos-Wright @ 2019-12-23 16:57 UTC (permalink / raw)
  To: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft,
	Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 18159 bytes --]



On 12/19/19 10:01 AM, Richard Sandiford wrote:
> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>> [...]
>> @@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
>>         return float_type_node;
>>       case E_DFmode:
>>         return double_type_node;
>> +    case E_BFmode:
>> +      return aarch64_bf16_type_node;
>>       default:
>>         gcc_unreachable ();
>>       }
>> @@ -750,6 +759,11 @@ aarch64_init_simd_builtin_types (void)
>>     aarch64_simd_types[Float64x1_t].eltype = double_type_node;
>>     aarch64_simd_types[Float64x2_t].eltype = double_type_node;
>>   
>> +
>> +/* Init Bfloat vector types with underlying uint types.  */
>> +  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
>> +  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
> 
> Formatting nits: too many blank lines, comment should be indented
> to match the code.

Done :)

> 
>> +
>>     for (i = 0; i < nelts; i++)
>>       {
>>         tree eltype = aarch64_simd_types[i].eltype;
>> @@ -1059,6 +1073,19 @@ aarch64_init_fp16_types (void)
>>     aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
>>   }
>>   
>> +/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
>> +static void
>> +aarch64_init_bf16_types (void)
>> +{
>> +  aarch64_bf16_type_node = make_node (REAL_TYPE);
>> +  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
>> +  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
>> +  layout_type (aarch64_bf16_type_node);
>> +
>> +  (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, "__bf16");
> 
> This style is mostly a carry-over from pre-ANSI days.  New code
> can just use "lang_hooks.types.register_builtin_type (...)".

Ahh good to know, thanks! Done

> 
>> +  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
>> +}
>> +
>>   /* Pointer authentication builtins that will become NOP on legacy platform.
>>      Currently, these builtins are for internal use only (libgcc EH unwinder).  */
>>   
>> [...]
>> diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
>> index b015694293c..3b387377f38 100644
>> --- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
>> +++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
>> @@ -50,3 +50,5 @@
>>     ENTRY (Float32x4_t, V4SF, none, 13)
>>     ENTRY (Float64x1_t, V1DF, none, 13)
>>     ENTRY (Float64x2_t, V2DF, none, 13)
>> +  ENTRY (Bfloat16x4_t, V4BF, none, 15)
>> +  ENTRY (Bfloat16x8_t, V8BF, none, 15)
> 
> Should be 14 (number of characters + 2 for "__").  Would be good to have
> a test for correct C++ mangling.

Done, thank you for pointing it out!!

> 
>> [...]
>> @@ -101,10 +101,10 @@
>>     [(set_attr "type" "neon_dup<q>")]
>>   )
>>   
>> -(define_insn "*aarch64_simd_mov<VD:mode>"
>> -  [(set (match_operand:VD 0 "nonimmediate_operand"
>> +(define_insn "*aarch64_simd_mov<VDMOV:mode>"
>> +  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
>>   		"=w, m,  m,  w, ?r, ?w, ?r, w")
>> -	(match_operand:VD 1 "general_operand"
>> +	(match_operand:VDMOV 1 "general_operand"
>>   		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
>>     "TARGET_SIMD
>>      && (register_operand (operands[0], <MODE>mode)
>> @@ -126,13 +126,14 @@
>>   }
>>     [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
>>   		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
>> -		     mov_reg, neon_move<q>")]
>> +		     mov_reg, neon_move<q>")
>> +    (set_attr "arch" "*,notbf16,*,*,*,*,*,notbf16")]
>>   )
> 
> Together with the changes to the arch attribute:
> 
>> @@ -378,6 +378,12 @@
>>   	(and (eq_attr "arch" "fp16")
>>   	     (match_test "TARGET_FP_F16INST"))
>>   
>> +	(and (eq_attr "arch" "fp16_notbf16")
>> +	     (match_test "TARGET_FP_F16INST && !TARGET_BF16_FP"))
>> +
>> +	(and (eq_attr "arch" "notbf16")
>> +	     (match_test "!TARGET_BF16_SIMD"))
>> +
>>   	(and (eq_attr "arch" "sve")
>>   	     (match_test "TARGET_SVE")))
>>       (const_string "yes")
> 
> this will disable the second and final alternatives for all VDMOV modes
> when bf16 is enabled.  E.g. enabling bf16 will disable those alternatives
> for V4HI as well as V4BF.
> 
> If you want to disable some alternatives for V4BF then it'd be better to
> use define_mode_attr instead.  But are you sure we need to disable them?
> The m<-Dz alternative should work for V4BF as well.  The w<-Dn alternative
> should work too -- it's up to aarch64_simd_valid_immediate to decide
> which immediates are valid.

Oh yes, I see what you mean about blocking it for V4HI and everything else under 
VDMOV as well...
Yea, it was on the principle of doing what we can to block any internal Bfloat 
processing, Bfloat immediates, Bfloat constants, etc., but I wasn't sure what 
should/shouldn't be allowed, so I was blocking anything that might allow 
unintended operations to happen in BFmode. But I've got a better idea of this 
now, so, agreed, these look basically fine to be enabled :)

This does make this patch look cleaner!
> 
>> [...]
>> @@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
>>   extern tree aarch64_fp16_type_node;
>>   extern tree aarch64_fp16_ptr_type_node;
>>   
>> +/* This type is the user-visible __bf16, and a pointer to that type.  We
>> +   need it in many places in the backend.  Defined in aarch64-builtins.c.  */
> 
> Not sure the number of places in this patch counts as "many" :-)
> Probably best just to drop that sentence.

Haha, fair!! Dropped it.

> 
>> +extern tree aarch64_bf16_type_node;
>> +extern tree aarch64_bf16_ptr_type_node;
>> +
>>   /* The generic unwind code in libgcc does not initialize the frame pointer.
>>      So in order to unwind a function using a frame pointer, the very first
>>      function that is unwound must save the frame pointer.  That way the frame
>> [...]
>> @@ -1321,11 +1327,11 @@
>>     }
>>   )
>>   
>> -(define_insn "*movhf_aarch64"
>> -  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
>> -	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
>> -  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
>> -    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
>> +(define_insn "*mov<mode>_aarch64"
>> +  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
>> +	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
>> +  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
>> +    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
>>     "@
>>      movi\\t%0.4h, #0
>>      fmov\\t%h0, %w1
>> @@ -1341,7 +1347,7 @@
>>      mov\\t%w0, %w1"
>>     [(set_attr "type" "neon_move,f_mcr,neon_move,neon_to_gp, neon_move,fconsts, \
>>   		     neon_move,f_loads,f_stores,load_4,store_4,mov_reg")
>> -   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16,simd,*,*,*,*,*")]
>> +   (set_attr "arch" "simd,fp16,simd,simd,simd,fp16_notbf16,simd,*,*,*,*,*")]
>>   )
> 
> Here too we should avoid changing "arch" if possible.  Why do you need
> to exclude the FMOV alternative for bf16?

Same as above. but as you say these should work regardless.

> 
>> diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
>> new file mode 100644
>> index 00000000000..aedb0972735
>> --- /dev/null
>> +++ b/gcc/config/aarch64/arm_bf16.h
>> @@ -0,0 +1,42 @@
>> +/* Arm BF16 intrinsics include file.
>> +
>> +   Copyright (C) 2019 Free Software Foundation, Inc.
>> +   Contributed by Arm.
>> +
>> +   This file is part of GCC.
>> +
>> +   GCC is free software; you can redistribute it and/or modify it
>> +   under the terms of the GNU General Public License as published
>> +   by the Free Software Foundation; either version 3, or (at your
>> +   option) any later version.
>> +
>> +   GCC is distributed in the hope that it will be useful, but WITHOUT
>> +   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
>> +   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
>> +   License for more details.
>> +
>> +   Under Section 7 of GPL version 3, you are granted additional
>> +   permissions described in the GCC Runtime Library Exception, version
>> +   3.1, as published by the Free Software Foundation.
>> +
>> +   You should have received a copy of the GNU General Public License and
>> +   a copy of the GCC Runtime Library Exception along with this program;
>> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef _AARCH64_BF16_H_
>> +#define _AARCH64_BF16_H_
>> +
>> +#include <stdint.h>
> 
> Are we supposed to include stdint.h?  The ACLE spec doesn't seem
> to require it.

Hmm, agreed, I included it only because arm_fp16 did, too.
As far as I can tell everything works without it, so removed it :)

> 
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>> +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
>> +
>> +typedef __bf16 bfloat16_t;
>> +
>> +
>> +#endif
>> +#pragma GCC pop_options
>> +
>> +#endif
> 
> Are you sure we need the #ifdef?  The target pragma should guarantee
> that the macro's defined.
> 
> But the validity of the typedef shouldn't depend on target options,
> so AFAICT this should just be:
> 
> typedef __bf16 bfloat16_t;

Ok so it's a case of "what do we want to happen if the user tries to use bfloats 
without +bf16 enabled?"

So the intent of the ifdef was to not have bfloat16_t be visible if the macro 
wasn't defined (i.e. not having any bf16 support), but I see now that this was 
being negated by the target pragma anyway! Oops, my bad for not really 
understanding that, sorry!

If we have the types always visible, then the user may use them, resulting in an 
ICE.

But even if the #ifdef worked, this still wouldn't stop the user from trying to 
use __bf16, __Bfloat16x4_t or __Bfloat16x8_t, which would still produce an 
ICE, so it's not a perfect solution anyway...

One other thing I tried was the below change to aarch64-builtins.c which stops 
__bf16 or the vector types from being registered at all:

--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -759,26 +759,32 @@ aarch64_init_simd_builtin_types (void)
     aarch64_simd_types[Float64x1_t].eltype = double_type_node;
     aarch64_simd_types[Float64x2_t].eltype = double_type_node;

-  /* Init Bfloat vector types with underlying __bf16 type.  */
-  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
-  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+  if (TARGET_BF16_SIMD)
+    {
+      /* Init Bfloat vector types with underlying __bf16 type.  */
+      aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+      aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+    }

     for (i = 0; i < nelts; i++)
       {
         tree eltype = aarch64_simd_types[i].eltype;
         machine_mode mode = aarch64_simd_types[i].mode;

-      if (aarch64_simd_types[i].itype == NULL)
+      if (eltype != NULL)
          {
-         aarch64_simd_types[i].itype
-           = build_distinct_type_copy
-             (build_vector_type (eltype, GET_MODE_NUNITS (mode)));
-         SET_TYPE_STRUCTURAL_EQUALITY (aarch64_simd_types[i].itype);
-       }
+         if (aarch64_simd_types[i].itype == NULL)
+           {
+             aarch64_simd_types[i].itype
+               = build_distinct_type_copy
+               (build_vector_type (eltype, GET_MODE_NUNITS (mode)));
+             SET_TYPE_STRUCTURAL_EQUALITY (aarch64_simd_types[i].itype);
+           }

-      tdecl = add_builtin_type (aarch64_simd_types[i].name,
-                               aarch64_simd_types[i].itype);
-      TYPE_NAME (aarch64_simd_types[i].itype) = tdecl;
+         tdecl = add_builtin_type (aarch64_simd_types[i].name,
+                                   aarch64_simd_types[i].itype);
+         TYPE_NAME (aarch64_simd_types[i].itype) = tdecl;
+       }
       }

   #define AARCH64_BUILD_SIGNED_TYPE(mode)  \
@@ -1240,7 +1246,8 @@ aarch64_general_init_builtins (void)

     aarch64_init_fp16_types ();

-  aarch64_init_bf16_types ();
+  if (TARGET_BF16_FP)
+    aarch64_init_bf16_types ();

     if (TARGET_SIMD)
       aarch64_init_simd_builtins ();



But the problem in that case was that the types could not be re-enabled using 
a target pragma like:

#pragma GCC push_options
#pragma GCC target ("+bf16")

Inside the test.

(i.e. the pragma caused the ifdef to be TRUE, but __bf16 was still not being 
enabled afaict?)

So I'm not sure what to do, presumably we do want some guard around the type so 
as not to just ICE if the type is used without +bf16?

> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
>> new file mode 100644
>> index 00000000000..f2bef671deb
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile.c
>> @@ -0,0 +1,51 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "-O3 --save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**stacktest1:
>> +**	...
>> +**	str	h0, \[sp, [0-9]+\]
>> +**	ldr	h0, \[sp, [0-9]+\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16_t stacktest1 (bfloat16_t __a)
>> +{
>> +  volatile bfloat16_t b = __a;
>> +  return b;
>> +}
>> +
>> +/*
>> +**stacktest2:
>> +**	...
>> +**	str	d0, \[sp, [0-9]+\]
>> +**	ldr	d0, \[sp, [0-9]+\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16x4_t stacktest2 (bfloat16x4_t __a)
>> +{
>> +  volatile bfloat16x4_t b = __a;
>> +  return b;
>> +}
>> +
>> +/*
>> +**stacktest3:
>> +**	...
>> +**	str	q0, \[sp\]
>> +**	ldr	q0, \[sp\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16x8_t stacktest3 (bfloat16x8_t __a)
>> +{
>> +  volatile bfloat16x8_t b = __a;
>> +  return b;
>> +}
>> +
>> +
> 
> It would be good to have more test coverage than this.  E.g.:
> 
> - a test that includes arm_bf16.h, with just scalar tests.

Done as test 2, but it is a small test. Is there anything I could add to it?
(I feel like ideally I'd want to try and force it down every alternative of the 
RTL pattern)
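
For reference, the core of it is just moves through memory, along these lines 
(a sketch; the directives and function name here are illustrative):

/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-additional-options "-march=armv8.2-a+bf16 -O3 --save-temps" } */

#include <arm_bf16.h>

bfloat16_t scalartest (bfloat16_t __a)
{
  volatile bfloat16_t b = __a;
  return b;
}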

> 
> - a test that includes arm_bf16.h without bf16 enabled, switches bf16 on,
>    and then uses bfloat16_t.

Done as test 3. Same question as above, lmk if you have any ideas of things to 
add to it.
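
Roughly, it has no bf16 flags on the command line and enables the extension 
purely through the pragma, along the lines of this sketch (function name 
illustrative):

#include <arm_bf16.h>

#pragma GCC push_options
#pragma GCC target ("+bf16")

bfloat16_t pragmatest (bfloat16_t __a)
{
  volatile bfloat16_t b = __a;
  return b;
}

#pragma GCC pop_options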

> 
> - a test that includes arm_bf16.h without bf16 enabled and tries to use
>    bfloat16_t without turning bf16 on.

Would have been test 4, but depends on what sort of behaviour we want and where 
the error message will come from.

> 
> - a test for _Complex bfloat16_t.

I don't think we currently have a decision on whether this should be supported 
or not.
AFAICT we don't have complex __fp16 support either. I'm getting the same 
error messages attempting to compile a _Complex __fp16, but it's quite possible 
I'm going about this wrong!

Added test 5 to show you what I was trying to do and to catch the error messages 
in their current form, but I'm not sure if I've done this right either, tbh!
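
The attempt is essentially the sketch below; since the expected diagnostics 
are still an open question, there are no dg-error patterns yet:

#include <arm_bf16.h>

/* Currently rejected; exact error wording to be decided.  */
_Complex bfloat16_t complextest (_Complex bfloat16_t __a)
{
  return __a;
}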

> 
> - a test for moves involving:
> 
>      typedef bfloat16_t v16bf __attribute__((vector_size(32)));

Oh that's a good idea, thank you for pointing it out!

See test 6 for reference.
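
Reconstructed from the stacktest pattern above, the vector_size(32) version is 
essentially:

typedef bfloat16_t v16bf __attribute__((vector_size(32)));

v16bf stacktest2 (v16bf __a)
{
  volatile v16bf b = __a;
  return b;
}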

So for vector size 16, 128 bits, this looks fine, loading and storing from q 
registers (using aarch64_simd_movv8bf).

For vector size 32, 256 bits, the compiler chooses to use four x-registers 
instead, resulting in this piece of assembler:

stacktest2:
          sub     sp, sp, #64
          ldp     x2, x3, [x0]
          stp     x2, x3, [sp]
          ldp     x0, x1, [x0, 16]
          stp     x0, x1, [sp, 16]
          ldp     x0, x1, [sp]
          stp     x0, x1, [sp, 32]
          ldp     x2, x3, [sp, 16]
          stp     x2, x3, [sp, 48]
          stp     x0, x1, [x8]
          ldp     x0, x1, [sp, 48]
          stp     x0, x1, [x8, 16]
          add     sp, sp, 64
          ret

This looks strange, using general registers in TImode moves, but I tested it 
with float16 and float32 vectors and they give the same result.

However, using an integer vector generates:

stacktest2:
          ld1     {v0.16b - v1.16b}, [x0]
          sub     sp, sp, #32
          st1     {v0.16b - v1.16b}, [sp]
          ld1     {v0.16b - v1.16b}, [sp]
          st1     {v0.16b - v1.16b}, [x8]
          add     sp, sp, 32
          ret

from the aarch64_movoi pattern. So now I'm unsure whether to leave this as is or 
to look into why all float modes are not being used through the seemingly more 
efficient movoi pattern. What do you think?
(I intend to look into this further.)

> 
> - a test that involves moving constants, for both scalars and vectors.
>    You can create zero scalar constants in C++ using bfloat16_t() etc.
>    For vectors it's possible to do things like:
> 
>      typedef short v2hi __attribute__((vector_size(4)));
>      v2hi foo (void) { return (v2hi) 0x12345678; }
> 
>    The same sort of things should work for bfloat16x4_t and bfloat16x8_t.

Leaving this as an open issue for now because I'm not 100% sure what we 
should/shouldn't be allowing past the tree-level target hooks.

If we do want to block this, we would do it in the [2/2] patch.
I will come back to it and create a scan-assembler test when I'm clearer on 
what we should and shouldn't allow at the higher level :)
> 
> Thanks,
> Richard
> 


[-- Attachment #2: BFmode1of2-rev2.patch --]
[-- Type: text/x-patch; name="BFmode1of2-rev2.patch", Size: 19803 bytes --]

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 9802f436e06..b49c110ccaf 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -315,7 +315,7 @@ m32c*-*-*)
         ;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_acle.h arm_sve.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index c35a1b1f029..7512f8cf01d 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -68,6 +68,9 @@
 #define hi_UP    E_HImode
 #define hf_UP    E_HFmode
 #define qi_UP    E_QImode
+#define bf_UP    E_BFmode
+#define v4bf_UP  E_V4BFmode
+#define v8bf_UP  E_V8BFmode
 #define UP(X) X##_UP
 
 #define SIMD_MAX_BUILTIN_ARGS 5
@@ -568,6 +571,10 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE;
 tree aarch64_fp16_type_node = NULL_TREE;
 tree aarch64_fp16_ptr_type_node = NULL_TREE;
 
+/* Back-end node type for brain float (bfloat) types.  */
+tree aarch64_bf16_type_node = NULL_TREE;
+tree aarch64_bf16_ptr_type_node = NULL_TREE;
+
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
    (relative to AARCH64_BUILTIN_GENERAL).  */
@@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
       return float_type_node;
     case E_DFmode:
       return double_type_node;
+    case E_BFmode:
+      return aarch64_bf16_type_node;
     default:
       gcc_unreachable ();
     }
@@ -750,6 +759,10 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
   aarch64_simd_types[Float64x2_t].eltype = double_type_node;
 
+  /* Init Bfloat vector types with underlying __bf16 type.  */
+  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = aarch64_simd_types[i].eltype;
@@ -1059,6 +1072,19 @@ aarch64_init_fp16_types (void)
   aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
 }
 
+/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
+static void
+aarch64_init_bf16_types (void)
+{
+  aarch64_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
+  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
+  layout_type (aarch64_bf16_type_node);
+
+  lang_hooks.types.register_builtin_type (aarch64_bf16_type_node, "__bf16");
+  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
+}
+
 /* Pointer authentication builtins that will become NOP on legacy platform.
    Currently, these builtins are for internal use only (libgcc EH unwinder).  */
 
@@ -1214,6 +1240,8 @@ aarch64_general_init_builtins (void)
 
   aarch64_init_fp16_types ();
 
+  aarch64_init_bf16_types ();
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
 
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 3c698b620cd..59f2ec4eaec 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -69,6 +69,13 @@ VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
+/* Bfloat16 modes.  */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
+VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
+VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
+
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
 
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index b015694293c..2be0ce82445 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -50,3 +50,5 @@
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
   ENTRY (Float64x2_t, V2DF, none, 13)
+  ENTRY (Bfloat16x4_t, V4BF, none, 14)
+  ENTRY (Bfloat16x8_t, V8BF, none, 14)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ad4676bc167..7dd28b31547 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand")
+	(match_operand:VALL_F16MOV 1 "general_operand"))]
   "TARGET_SIMD"
   "
   /* Force the operand into a register if it is not an
@@ -101,10 +101,10 @@
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VD:mode>"
-  [(set (match_operand:VD 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VDMOV:mode>"
+  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VD 1 "general_operand"
+	(match_operand:VDMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -129,10 +129,10 @@
 		     mov_reg, neon_move<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VQ:mode>"
-  [(set (match_operand:VQ 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VQMOV:mode>"
+  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VQ 1 "general_operand"
+	(match_operand:VQMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index a85f8b04c20..0d08382ebbe 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1692,6 +1692,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V2SImode:
     /* ...E_V1DImode doesn't exist.  */
     case E_V4HFmode:
+    case E_V4BFmode:
     case E_V2SFmode:
     case E_V1DFmode:
     /* 128-bit Advanced SIMD vectors.  */
@@ -1700,6 +1701,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V4SImode:
     case E_V2DImode:
     case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
@@ -15603,6 +15605,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
+	case E_BFmode:
+	  field_t = aarch64_bf16_type_node;
+	  field_ptr_t = aarch64_bf16_ptr_type_node;
+	  break;
 	case E_V2SImode:
 	case E_V4SImode:
 	    {
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2bb5a208720..68121a16072 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1120,13 +1120,13 @@ extern enum aarch64_code_model aarch64_cmodel;
 #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \
   ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \
    || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \
-   || (MODE) == DFmode)
+   || (MODE) == DFmode || (MODE) == V4BFmode)
 
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
    || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
-   || (MODE) == V2DFmode)
+   || (MODE) == V2DFmode || (MODE) == V8BFmode)
 
 #define ENDIAN_LANE_N(NUNITS, N) \
   (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N)
@@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* This type is the user-visible __bf16, and a pointer to that type.  Defined
+   in aarch64-builtins.c.  */
+extern tree aarch64_bf16_type_node;
+extern tree aarch64_bf16_ptr_type_node;
+
 /* The generic unwind code in libgcc does not initialize the frame pointer.
    So in order to unwind a function using a frame pointer, the very first
    function that is unwound must save the frame pointer.  That way the frame
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index b11ead7ab23..d48d67ea7ec 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1304,8 +1304,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand")
-	(match_operand:GPF_TF_F16 1 "general_operand"))]
+  [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand")
+	(match_operand:GPF_TF_F16_MOV 1 "general_operand"))]
   ""
   {
     if (!TARGET_FLOAT)
@@ -1321,11 +1321,11 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
-  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
new file mode 100644
index 00000000000..884b6f3bc7a
--- /dev/null
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -0,0 +1,32 @@
+/* Arm BF16 intrinsics include file.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+typedef __bf16 bfloat16_t;
+
+#endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 8b861601a48..ee4bb76bcd4 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -73,6 +73,9 @@ typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
 typedef struct int8x8x2_t
 {
   int8x8_t val[2];
@@ -34606,6 +34609,8 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+#include "arm_bf16.h"
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 1ca5ed1ef1b..9480efef47c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -57,9 +57,17 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_HF [HF SF DF])
 
+;; Iterator for all 16-bit scalar floating point modes (HF, BF)
+(define_mode_iterator HFBF [HF BF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
+;; Iterator for all scalar floating point modes suitable for moving, including
+;; special BF type.(HF, SF, DF, TF and BF)
+(define_mode_iterator GPF_TF_F16_MOV [(HF "") (BF "TARGET_BF16_FP") (SF "")
+				      (DF "") (TF "")])
+
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
 
@@ -79,6 +87,9 @@
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
+;; Double vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; All modes stored in registers d0-d31.
 (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 
@@ -94,6 +105,9 @@
 ;; Quad vector modes.
 (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Quad vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
+
 ;; Copy of the above.
 (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
@@ -160,6 +174,15 @@
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; including special Bfloat vector types.
+(define_mode_iterator VALL_F16MOV [(V8QI "") (V16QI "") (V4HI "") (V8HI "")
+				   (V2SI "") (V4SI "") (V2DI "")
+				   (V4HF "") (V8HF "")
+				   (V4BF "TARGET_BF16_SIMD")
+				   (V8BF "TARGET_BF16_SIMD")
+				   (V2SF "") (V4SF "") (V2DF "")])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -885,7 +908,8 @@
 			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF  "16b")
 			  (DI   "8b")  (DF    "8b")
-			  (SI   "8b")  (SF    "8b")])
+			  (SI   "8b")  (SF    "8b")
+			  (V4BF "8b")  (V8BF  "16b")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
@@ -1265,6 +1289,7 @@
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
 		     (V4HF "") (V8HF "_q")
+		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
 			       (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")])
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
new file mode 100644
index 00000000000..f2bef671deb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
@@ -0,0 +1,51 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, [0-9]+\]
+**	ldr	h0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	...
+**	str	d0, \[sp, [0-9]+\]
+**	ldr	d0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	...
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	...
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_2.c
new file mode 100644
index 00000000000..c3c3a951111
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_2.c
@@ -0,0 +1,21 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, [0-9]+\]
+**	ldr	h0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c
new file mode 100644
index 00000000000..9bcb53b32d8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-march=armv8.2-a -O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, [0-9]+\]
+**	ldr	h0, \[sp, [0-9]+\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+#pragma GCC pop_options
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_5.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_5.c
new file mode 100644
index 00000000000..b812011c223
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_5.c
@@ -0,0 +1,16 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-std=c99 -pedantic-errors -O3 --save-temps" } */
+
+#include <arm_bf16.h>
+
+_Complex bfloat16_t stacktest1 (_Complex bfloat16_t __a)
+{
+  volatile _Complex bfloat16_t b = __a;
+  return b;
+}
+
+/* { dg-error {ISO C does not support plain 'complex' meaning 'double complex'} "" { target *-*-* } 8 } */
+/* { dg-error {expected '=', ',', ';', 'asm' or '__attribute__' before 'stacktest1'} "" { target *-*-* } 8 } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_6.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_6.c
new file mode 100644
index 00000000000..9a967de439b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_6.c
@@ -0,0 +1,49 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*  Create vectors of 8 and 16 BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+
+/*
+**stacktest1:
+**	...
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	...
+**	ret
+*/
+v8bf stacktest1 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+
+/*
+**stacktest2:
+**	...
+**	ldp	x[0-9]+, x[0-9]+, \[x[0-9]+\]
+**	stp	x[0-9]+, x[0-9]+, \[sp\]
+**	ldp	x[0-9]+, x[0-9]+, \[x0, [0-9]+\]
+**	stp	x[0-9]+, x[0-9]+, \[sp, [0-9]+\]
+**	ldp	x[0-9]+, x[0-9]+, \[sp\]
+**	stp	x[0-9]+, x[0-9]+, \[sp, [0-9]+\]
+**	ldp	x[0-9]+, x[0-9]+, \[sp, [0-9]+\]
+**	stp	x[0-9]+, x[0-9]+, \[sp, [0-9]+\]
+**	stp	x[0-9]+, x[0-9]+, \[x[0-9]+\]
+**	ldp	x[0-9]+, x[0-9]+, \[sp, [0-9]+\]
+**	stp	x[0-9]+, x[0-9]+, \[x[0-9]+, [0-9]+\]
+**	...
+**	ret
+*/
+v16bf stacktest2 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2019-12-23 16:57   ` Stam Markianos-Wright
@ 2019-12-23 17:07     ` Richard Sandiford
  2020-01-07 11:42       ` Stam Markianos-Wright
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Sandiford @ 2019-12-23 17:07 UTC (permalink / raw)
  To: Stam Markianos-Wright
  Cc: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft

Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> On 12/19/19 10:01 AM, Richard Sandiford wrote:
>>> +
>>> +#pragma GCC push_options
>>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>>> +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
>>> +
>>> +typedef __bf16 bfloat16_t;
>>> +
>>> +
>>> +#endif
>>> +#pragma GCC pop_options
>>> +
>>> +#endif
>> 
>> Are you sure we need the #ifdef?  The target pragma should guarantee
>> that the macro's defined.
>> 
>> But the validity of the typedef shouldn't depend on target options,
>> so AFAICT this should just be:
>> 
>> typedef __bf16 bfloat16_t;
>
> Ok so it's a case of "what do we want to happen if the user tries to use bfloats 
> without +bf16 enabled".
>
> So the intent of the ifdef was to not have bfloat16_t be visible if the macro 
> wasn't defined (i.e. not having any bf16 support), but I see now that this was 
> being negated by the target pragma, anyway! Oops, my bad for not really 
> understanding that, sorry!
>
> If we have the types always visible, then the user may use them, resulting in an 
> ICE.
>
> But even if the #ifdef worked this still doesn't stop the user from trying to 
> use __bf16 or __Bfloat16x4_t, __Bfloat16x8_t, which would still produce an 
> ICE, so it's not a perfect solution anyway...

Right.  Or they could use #pragma GCC target to switch to a different
non-bf16 target after including arm_bf16.h.

> One other thing I tried was the below change to aarch64-builtins.c which stops 
> __bf16 or the vector types from being registered at all:
>
> --- a/gcc/config/aarch64/aarch64-builtins.c
> +++ b/gcc/config/aarch64/aarch64-builtins.c
> @@ -759,26 +759,32 @@ aarch64_init_simd_builtin_types (void)
>      aarch64_simd_types[Float64x1_t].eltype = double_type_node;
>      aarch64_simd_types[Float64x2_t].eltype = double_type_node;
>
> -  /* Init Bfloat vector types with underlying __bf16 type.  */
> -  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
> -  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
> +  if (TARGET_BF16_SIMD)
> +    {
> +      /* Init Bfloat vector types with underlying __bf16 type.  */
> +      aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
> +      aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
> +    }
>
>      for (i = 0; i < nelts; i++)
>        {
>          tree eltype = aarch64_simd_types[i].eltype;
>          machine_mode mode = aarch64_simd_types[i].mode;
>
> -      if (aarch64_simd_types[i].itype == NULL)
> +      if (eltype != NULL)
>           {
> -         aarch64_simd_types[i].itype
> -           = build_distinct_type_copy
> -             (build_vector_type (eltype, GET_MODE_NUNITS (mode)));
> -         SET_TYPE_STRUCTURAL_EQUALITY (aarch64_simd_types[i].itype);
> -       }
> +         if (aarch64_simd_types[i].itype == NULL)
> +           {
> +             aarch64_simd_types[i].itype
> +               = build_distinct_type_copy
> +               (build_vector_type (eltype, GET_MODE_NUNITS (mode)));
> +             SET_TYPE_STRUCTURAL_EQUALITY (aarch64_simd_types[i].itype);
> +           }
>
> -      tdecl = add_builtin_type (aarch64_simd_types[i].name,
> -                               aarch64_simd_types[i].itype);
> -      TYPE_NAME (aarch64_simd_types[i].itype) = tdecl;
> +         tdecl = add_builtin_type (aarch64_simd_types[i].name,
> +                                   aarch64_simd_types[i].itype);
> +         TYPE_NAME (aarch64_simd_types[i].itype) = tdecl;
> +       }
>        }
>
>    #define AARCH64_BUILD_SIGNED_TYPE(mode)  \
> @@ -1240,7 +1246,8 @@ aarch64_general_init_builtins (void)
>
>      aarch64_init_fp16_types ();
>
> -  aarch64_init_bf16_types ();
> +  if (TARGET_BF16_FP)
> +    aarch64_init_bf16_types ();
>
>      if (TARGET_SIMD)
>        aarch64_init_simd_builtins ();
>
>
>
> But the problem in that case was that the types could not be re-enabled using 
> a target pragma like:
>
> #pragma GCC push_options
> #pragma GCC target ("+bf16")
>
> Inside the test.
>
> (i.e. the pragma caused the ifdef to be TRUE, but __bf16 was still not being 
> enabled afaict?)
>
> So I'm not sure what to do, presumably we do want some guard around the type so 
> as not to just ICE if the type is used without +bf16?

Other header files work both ways: you get the same definitions regardless
of what the target was when the header file was included.  Then we need
to raise an error if the user tries to do something that the current
target doesn't support.

I suppose for bf16 we could either (a) try to raise an error whenever
BF-related moves are emitted without the required target feature or
(b) handle __bf16 types like __fp16 types.  The justification for
(b) is that there aren't really any new instructions for moves;
__bf16 is mostly a software construct as far as this specific
patch goes.  (It's a different story for the intrinsics patch
of course.)

I don't know which of (a) or (b) is better.  Whichever we go for,
it would be good if clang and GCC were consistent here.
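
For the record, a rough sketch of what (a) might look like in the
BF-capable mov expanders (untested; the diagnostic wording is
illustrative):

  /* Hypothetical guard in the mov<mode> expander.  */
  if (<MODE>mode == BFmode && !TARGET_BF16_FP)
    error ("%<__bf16%> requires the %<+bf16%> extension");

(b) would instead mean keeping the patterns unconditional, as we
already do for HFmode.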

>> It would be good to have more test coverage than this.  E.g.:
>> 
>> - a test that includes arm_bf16.h, with just scalar tests.
>
> Done as test 2, but it is a small test. Is there anything I could add to it?
> (I feel like ideally I'd want to try and force it down every alternative of the 
> RTL pattern)

register asms are one way of doing that, see e.g
gcc.target/aarch64/sve/struct_move_1.c
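
E.g. something along these lines (register choices illustrative):

  void
  bfloat_mov (void)
  {
    register bfloat16_t x asm ("h1");
    register bfloat16_t y asm ("h2");
    asm volatile ("" : "=w" (x));  /* x becomes live in h1.  */
    y = x;                         /* forces an FPR-to-FPR move */
    asm volatile ("" :: "w" (y));  /* keep y live in h2 */
  }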

>> 
>> - a test for _Complex bfloat16_t.
>
> I don't think we currently have a decision on whether this should be supported 
> or not.
> AFAICT we also don't have complex __fp16 support either. I'm getting the same 
> error messages attempting to compile a _Complex __fp16, but it's quite likely 
> I'm going about this wrong!
>
> Added test 5 to show you what I was trying to do and to catch the error messages 
> in their current form, but I'm not sure if I've done this right either, tbh!

Testing for an error is a good option if we don't intend to support this.
The main reason for having a test is to make sure that there's no ICE.

So the test in the new patch LGTM, thanks.

>> - a test for moves involving:
>> 
>>      typedef bfloat16_t v16bf __attribute__((vector_size(32)));
>
> Oh that's a good idea, thank you for pointing it out!
>
> See test 6 for reference.
>
> So for vector size 16, 128bits, this looks fine, loading and storing from q 
> registers (using aarch64_simd_movv8bf).
>
> For vector size 32, 256 bits, the compiler chooses to use 4*x-registers instead, 
> resulting in this piece of assembler
>
> stacktest2:
>           sub     sp, sp, #64
>           ldp     x2, x3, [x0]
>           stp     x2, x3, [sp]
>           ldp     x0, x1, [x0, 16]
>           stp     x0, x1, [sp, 16]
>           ldp     x0, x1, [sp]
>           stp     x0, x1, [sp, 32]
>           ldp     x2, x3, [sp, 16]
>           stp     x2, x3, [sp, 48]
>           stp     x0, x1, [x8]
>           ldp     x0, x1, [sp, 48]
>           stp     x0, x1, [x8, 16]
>           add     sp, sp, 64
>           ret
>
> Which looks strange using regular registers in movti mode, but I tested it with 
> float16 and float32 vectors and they also give the same result.
>
> However, using an integer vector generates:
>
> stacktest2:
>           ld1     {v0.16b - v1.16b}, [x0]
>           sub     sp, sp, #32
>           st1     {v0.16b - v1.16b}, [sp]
>           ld1     {v0.16b - v1.16b}, [sp]
>           st1     {v0.16b - v1.16b}, [x8]
>           add     sp, sp, 32
>           ret
>
> from the aarch64_movoi pattern. So now I'm unsure whether to leave this as is or 
> to look into why all float modes are not being used through the seemingly more 
> efficient movoi pattern. What do you think?
> (I intend to look into this further)

Haven't tried, but is this affected by -fno-split-wide-types?

But here too the main thing is to make sure that there's no ICE when
using the vectors.  Making it efficient can be (very low priority)
follow-on work.

So it's probably best not to match any specific output here.
Just testing that the moves compile is OK.

>> - a test that involves moving constants, for both scalars and vectors.
>>    You can create zero scalar constants in C++ using bfloat16_t() etc.
>>    For vectors it's possible to do things like:
>> 
>>      typedef short v2hi __attribute__((vector_size(4)));
>>      v2hi foo (void) { return (v2hi) 0x12345678; }
>> 
>>    The same sort of things should work for bfloat16x4_t and bfloat16x8_t.
>
> Leaving this as an open issue for now because I'm not 100% sure what we 
> should/shouldn't be allowing past the tree-level target hooks.
>
> If we do want to block this we would do this in the [2/2] patch.
> I will come back to it and create a scan-assembler test when I'm more clear on 
> what we should and shouldn't allow at the higher level :)

FWIW, I'm not sure we should go out of our way to disallow this.
Preventing bfloat16_t() in C++ would IMO be unnatural.  And the
"(vector) vector-sized-integer" syntax specifically treats the vector
as a bundle of bits without really caring what the element type is.
Even if we did manage to forbid the conversion in that context,
it would still be possible to achieve the same thing using:

   v2hi
   foo (void)
   {
     union { v2hi v; unsigned int i; } u;
     u.i = 0x12345678;
     return u.v;
   }

Thanks for the new patch, looks good apart from the points above and:

> +;; Iterator for all scalar floating point modes suitable for moving, including
> +;; special BF type.(HF, SF, DF, TF and BF)

Nit: should be space rather than "." before "(".

> +(define_mode_iterator GPF_TF_F16_MOV [(HF "") (BF "TARGET_BF16_FP") (SF "")
> +				      (DF "") (TF "")])
> +
>  ;; Double vector modes.
>  (define_mode_iterator VDF [V2SF V4HF])
>  
> @@ -79,6 +87,9 @@
>  ;; Double vector modes.
>  (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
>  
> +;; Double vector modes suitable for moving.  Includes BFmode.
> +(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
> +
>  ;; All modes stored in registers d0-d31.
>  (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
>  
> @@ -94,6 +105,9 @@
>  ;; Quad vector modes.
>  (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
>  
> +;; Quad vector modes suitable for moving.  Includes BFmode.
> +(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
> +
>  ;; Copy of the above.
>  (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
>  

This looks a bit inconsistent: the scalar iterator requires
TARGET_BF16_FP for bf16 modes, but the vector iterator doesn't.

> @@ -160,6 +174,15 @@
>  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>  				V4HF V8HF V2SF V4SF V2DF])
>  
> +;; All Advanced SIMD modes suitable for moving, loading, and storing,
> +;; including special Bfloat vector types.
> +(define_mode_iterator VALL_F16MOV [(V8QI "") (V16QI "") (V4HI "") (V8HI "")
> +				   (V2SI "") (V4SI "") (V2DI "")
> +				   (V4HF "") (V8HF "")
> +				   (V4BF "TARGET_BF16_SIMD")
> +				   (V8BF "TARGET_BF16_SIMD")
> +				   (V2SF "") (V4SF "") (V2DF "")])
> +
>  ;; The VALL_F16 modes except the 128-bit 2-element ones.
>  (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
>  				V4HF V8HF V2SF V4SF])

whereas here we do check.  But that comes back to the (a)/(b) choice above.

> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
> new file mode 100644
> index 00000000000..f2bef671deb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
> @@ -0,0 +1,51 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-O3 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**stacktest1:
> +**	...
> +**	str	h0, \[sp, [0-9]+\]
> +**	ldr	h0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16_t stacktest1 (bfloat16_t __a)
> +{
> +  volatile bfloat16_t b = __a;
> +  return b;
> +}
> +
> +/*
> +**stacktest2:
> +**	...
> +**	str	d0, \[sp, [0-9]+\]
> +**	ldr	d0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16x4_t stacktest2 (bfloat16x4_t __a)
> +{
> +  volatile bfloat16x4_t b = __a;
> +  return b;
> +}
> +
> +/*
> +**stacktest3:
> +**	...
> +**	str	q0, \[sp\]
> +**	ldr	q0, \[sp\]
> +**	...
> +**	ret
> +*/
> +bfloat16x8_t stacktest3 (bfloat16x8_t __a)
> +{
> +  volatile bfloat16x8_t b = __a;
> +  return b;
> +}

Might be a daft question, but why do we have an offset for the first
two and not for the last one?  Might be worth hard-coding whatever
offset we use.

If we use -fomit-frame-pointer then the whole function body should
be stable: sub, str, ldr, add, ret.

> @@ -0,0 +1,21 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
> +/* { dg-additional-options "-O3 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <arm_bf16.h>
> +
> +/*
> +**stacktest1:
> +**	...
> +**	str	h0, \[sp, [0-9]+\]
> +**	ldr	h0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16_t stacktest1 (bfloat16_t __a)
> +{
> +  volatile bfloat16_t b = __a;
> +  return b;
> +}

Same comment here.

> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c
> new file mode 100644
> index 00000000000..9bcb53b32d8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=armv8.2-a -O2" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#pragma GCC push_options
> +#pragma GCC target ("+bf16")
> +
> +#include <arm_bf16.h>
> +
> +/*
> +**stacktest1:
> +**	...
> +**	str	h0, \[sp, [0-9]+\]
> +**	ldr	h0, \[sp, [0-9]+\]
> +**	...
> +**	ret
> +*/
> +bfloat16_t stacktest1 (bfloat16_t __a)
> +{
> +  volatile bfloat16_t b = __a;
> +  return b;
> +}
> +
> +#pragma GCC pop_options

Here too.  No real need for the push & pop, but keeping them is fine
if that seems more obvious.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2019-12-23 17:07     ` Richard Sandiford
@ 2020-01-07 11:42       ` Stam Markianos-Wright
  2020-01-07 17:15         ` Richard Sandiford
  0 siblings, 1 reply; 9+ messages in thread
From: Stam Markianos-Wright @ 2020-01-07 11:42 UTC (permalink / raw)
  To: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft,
	Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 18713 bytes --]

On 23/12/2019 16:57, Richard Sandiford wrote:
> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>> On 12/19/19 10:01 AM, Richard Sandiford wrote:
>>>> +
>>>> +#pragma GCC push_options
>>>> +#pragma GCC target ("arch=armv8.2-a+bf16")
>>>> +#ifdef __ARM_FEATURE_BF16_SCALAR_ARITHMETIC
>>>> +
>>>> +typedef __bf16 bfloat16_t;
>>>> +
>>>> +
>>>> +#endif
>>>> +#pragma GCC pop_options
>>>> +
>>>> +#endif
>>>
>>> Are you sure we need the #ifdef?  The target pragma should guarantee
>>> that the macro's defined.
>>>
>>> But the validity of the typedef shouldn't depend on target options,
>>> so AFAICT this should just be:
>>>
>>> typedef __bf16 bfloat16_t;
>>
>> Ok so it's a case of "what do we want to happen if the user tries to use bfloats
>> without +bf16 enabled".
>>
>> So the intent of the ifdef was to not have bfloat16_t be visible if the macro
>> wasn't defined (i.e. not having any bf16 support), but I see now that this was
>> being negated by the target pragma, anyway! Oops, my bad for not really
>> understanding that, sorry!
>>
>> If we have the types always visible, then the user may use them, resulting in an
>> ICE.
>>
>> But even if the #ifdef worked this still doesn't stop the user from trying to
>> use __bf16 or __Bfloat16x4_t, __Bfloat16x8_t, which would still produce an
>> ICE, so it's not a perfect solution anyway...
> 
> Right.  Or they could use #pragma GCC target to switch to a different
> non-bf16 target after including arm_bf16.h.
> 
>> One other thing I tried was the below change to aarch64-builtins.c which stops
>> __bf16 or the vector types from being registered at all:
>>
>> --- a/gcc/config/aarch64/aarch64-builtins.c
>> +++ b/gcc/config/aarch64/aarch64-builtins.c
>> @@ -759,26 +759,32 @@ aarch64_init_simd_builtin_types (void)
>>       aarch64_simd_types[Float64x1_t].eltype = double_type_node;
>>       aarch64_simd_types[Float64x2_t].eltype = double_type_node;
>>
>> -  /* Init Bfloat vector types with underlying __bf16 type.  */
>> -  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
>> -  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
>> +  if (TARGET_BF16_SIMD)
>> +    {
>> +      /* Init Bfloat vector types with underlying __bf16 type.  */
>> +      aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
>> +      aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
>> +    }
>>
>>       for (i = 0; i < nelts; i++)
>>         {
>>           tree eltype = aarch64_simd_types[i].eltype;
>>           machine_mode mode = aarch64_simd_types[i].mode;
>>
>> -      if (aarch64_simd_types[i].itype == NULL)
>> +      if (eltype != NULL)
>>            {
>> -         aarch64_simd_types[i].itype
>> -           = build_distinct_type_copy
>> -             (build_vector_type (eltype, GET_MODE_NUNITS (mode)));
>> -         SET_TYPE_STRUCTURAL_EQUALITY (aarch64_simd_types[i].itype);
>> -       }
>> +         if (aarch64_simd_types[i].itype == NULL)
>> +           {
>> +             aarch64_simd_types[i].itype
>> +               = build_distinct_type_copy
>> +               (build_vector_type (eltype, GET_MODE_NUNITS (mode)));
>> +             SET_TYPE_STRUCTURAL_EQUALITY (aarch64_simd_types[i].itype);
>> +           }
>>
>> -      tdecl = add_builtin_type (aarch64_simd_types[i].name,
>> -                               aarch64_simd_types[i].itype);
>> -      TYPE_NAME (aarch64_simd_types[i].itype) = tdecl;
>> +         tdecl = add_builtin_type (aarch64_simd_types[i].name,
>> +                                   aarch64_simd_types[i].itype);
>> +         TYPE_NAME (aarch64_simd_types[i].itype) = tdecl;
>> +       }
>>         }
>>
>>     #define AARCH64_BUILD_SIGNED_TYPE(mode)  \
>> @@ -1240,7 +1246,8 @@ aarch64_general_init_builtins (void)
>>
>>       aarch64_init_fp16_types ();
>>
>> -  aarch64_init_bf16_types ();
>> +  if (TARGET_BF16_FP)
>> +    aarch64_init_bf16_types ();
>>
>>       if (TARGET_SIMD)
>>         aarch64_init_simd_builtins ();
>>
>>
>>
>> But the problem in that case was that the types could not be re-enabled using
>> a target pragma like:
>>
>> #pragma GCC push_options
>> #pragma GCC target ("+bf16")
>>
>> Inside the test.
>>
>> (i.e. the pragma caused the ifdef to be TRUE, but __bf16 was still not being
>> enabled afaict?)
>>
>> So I'm not sure what to do, presumably we do want some guard around the type so
>> as not to just ICE if the type is used without +bf16?
> 
> Other header files work both ways: you get the same definitions regardless
> of what the target was when the header file was included.  Then we need
> to raise an error if the user tries to do something that the current
> target doesn't support.
> 
> I suppose for bf16 we could either (a) try to raise an error whenever
> BF-related moves are emitted without the required target feature or
> (b) handle __bf16 types like __fp16 types.  The justification for
> (b) is that there aren't really any new instructions for moves;
> __bf16 is mostly a software construct as far as this specific
> patch goes.  (It's a different story for the intrinsics patch
> of course.)
> 
> I don't know which of (a) or (b) is better.  Whichever we go for,
> it would be good if clang and GCC were consistent here.

Following our downstream discussions we have implemented (b) by removing 
TARGET_xx restrictions on BFmode MOVs.

I also noticed an ICE when trying to move BF vector types to X registers ("could 
not split insn"), so I have added the BF-enabled iterators to the other patterns 
needed for this.

Lmk if you spot any issues!
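
For reference, a reduced sketch of the kind of code that triggered it
(the "+r" forces the 128-bit vector through a GPR pair, which is what
needed the extra split patterns):

bfloat16x8_t
via_gprs (bfloat16x8_t a)
{
  asm volatile ("" : "+r" (a));  /* demand general registers */
  return a;
}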

Also I've updated the filenames of all our tests to make them a bit clearer:

C tests:

__ bfloat16_scalar_compile_1.c to bfloat16_scalar_compile_3.c: Compilation of 
scalar moves/loads/stores with "-march=armv8.2-a+bf16", "-march=armv8.2-a and 
+bf16 target pragma", and "-march=armv8.2-a" (now does not error out at all). 
These now include register asms to check more MOV alternatives.

__ bfloat16_scalar_compile_4.c: The _Complex error test.

__ bfloat16_simd_compile_1.c to bfloat16_simd_compile_3.c: Likewise to the 
scalar tests, but these also include (vector) 0x1234.. compilation (no assembler 
scan); see the sketch just below.
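
For reference, the vector-constant cases are along these lines (typedef
and constant illustrative; bfloat16_t comes from arm_bf16.h):

typedef bfloat16_t v4bf __attribute__ ((vector_size (8)));

v4bf
from_bits (void)
{
  /* Treat the vector as a bundle of bits, per the earlier discussion.  */
  return (v4bf) 0x1234567812345678ULL;
}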

I had also done a small c++ test, but have chosen to shift that to the [2/2] 
patch because it is currently being blocked by target_invalid_conversion.

Let me know if anything is missing!

> 
>>> It would be good to have more test coverage than this.  E.g.:
>>>
>>> - a test that includes arm_bf16.h, with just scalar tests.
>>
>> Done as test 2, but it is a small test. Is there anything I could add to it?
>> (I feel like ideally I'd want to try and force it down every alternative of the
>> RTL pattern)
> 
> register asms are one way of doing that, see e.g
> gcc.target/aarch64/sve/struct_move_1.c
> 
Added some to check the movs in/out of GPRs, too.

>>>
>>> - a test for _Complex bfloat16_t.
>>
>> I don't think we currently have a decision on whether this should be supported
>> or not.
>> AFAICT we also don't have complex __fp16 support either. I'm getting the same
>> error messages attempting to compile a _Complex __fp16, but it's quite likely
>> I'm going about this wrong!
>>
>> Added test 5 to show you what I was trying to do and to catch the error messages
>> in their current form, but I'm not sure if I've done this right either, tbh!
> 
> Testing for an error is a good option if we don't intend to support this.
> The main reason for having a test is to make sure that there's no ICE.
> 
> So the test in the new patch LGTM, thanks.
Cheers!

> 
>>> - a test for moves involving:
>>>
>>>       typedef bfloat16_t v16bf __attribute__((vector_size(32)));
>>
>> Oh that's a good idea, thank you for pointing it out!
>>
>> See test 6 for reference.
>>
>> So for vector size 16, 128bits, this looks fine, loading and storing from q
>> registers (using aarch64_simd_movv8bf).
>>
>> For vector size 32, 256 bits, the compiler chooses to use 4*x-registers instead,
>> resulting in this piece of assembler
>>
>> stacktest2:
>>            sub     sp, sp, #64
>>            ldp     x2, x3, [x0]
>>            stp     x2, x3, [sp]
>>            ldp     x0, x1, [x0, 16]
>>            stp     x0, x1, [sp, 16]
>>            ldp     x0, x1, [sp]
>>            stp     x0, x1, [sp, 32]
>>            ldp     x2, x3, [sp, 16]
>>            stp     x2, x3, [sp, 48]
>>            stp     x0, x1, [x8]
>>            ldp     x0, x1, [sp, 48]
>>            stp     x0, x1, [x8, 16]
>>            add     sp, sp, 64
>>            ret
>>
>> Which looks strange using regular registers in movti mode, but I tested it with
>> float16 and float32 vectors and they also give the same result.
>>
>> However, using an integer vector generates:
>>
>> stacktest2:
>>            ld1     {v0.16b - v1.16b}, [x0]
>>            sub     sp, sp, #32
>>            st1     {v0.16b - v1.16b}, [sp]
>>            ld1     {v0.16b - v1.16b}, [sp]
>>            st1     {v0.16b - v1.16b}, [x8]
>>            add     sp, sp, 32
>>            ret
>>
>> from the aarch64_movoi pattern. So now I'm unsure whether to leave this as is or
>> to look into why all float modes are not being used through the seemingly more
>> efficient movoi pattern. What do you think?
>> (I intend to look into this further)
> 
> Haven't tried, but is this affected by -fno-split-wide-types?
Apparently not! I seem to be getting the same assembler in both cases. In 
investigating, I got as far as finding that for float types the ld/str was going 
through aarch64_expand_cpymem, which limits them to TImode for some reason (and 
removing the limit allowed them to use OImode, XImode, etc.), but I stopped there.
> 
> But here too the main thing is to make sure that there's no ICE when
> using the vectors.  Making it efficient can be (very low priority)
> follow-on work.
> 
> So it's probably best not to match any specific output here.
> Just testing that the moves compile is OK.
Done :) And integrated into the vector tests.
> 
>>> - a test that involves moving constants, for both scalars and vectors.
>>>     You can create zero scalar constants in C++ using bfloat16_t() etc.
>>>     For vectors it's possible to do things like:
>>>
>>>       typedef short v2hi __attribute__((vector_size(4)));
>>>       v2hi foo (void) { return (v2hi) 0x12345678; }
>>>
>>>     The same sort of things should work for bfloat16x4_t and bfloat16x8_t.
>>
>> Leaving this as an open issue for now because I'm not 100% sure what we
>> should/shouldn't be allowing past the tree-level target hooks.
>>
>> If we do want to block this we would do this in the [2/2] patch.
>> I will come back to it and create a scan-assembler test when I'm more clear on
>> what we should and shouldn't allow at the higher level :)
> 
> FWIW, I'm not sure we should go out of our way to disallow this.
> Preventing bfloat16_t() in C++ would IMO be unnatural.  And the
> "(vector) vector-sized-integer" syntax specifically treats the vector
> as a bundle of bits without really caring what the element type is.
> Even if we did manage to forbid the conversion in that context,
> it would still be possible to achieve the same thing using:
> 
>     v2hi
>     foo (void)
>     {
>       union { v2hi v; unsigned int i; } u;
>       u.i = 0x12345678;
>       return u.v;
>     }
> 
Added the compilation of "(vector) vector-sized-integer" in the vector tests.

But target_invalid_conversion in the [2/2] patch is a complication to this (as 
with bfloat16_t() in C++).

I was under the impression that the original intent of bfloat was for it to be 
storage only, with any initialisation happening through the float32 convert 
intrinsic.

Either way I'd be happy to allow it, but it does feel like we'd be going 
slightly against what the ACLE currently says.
However, looking back at it now, it only mentions preferring ACLE intrinsics 
over C operators, so I'd be happy to allow this for vectors.

For scalars though, if we e.g. were to allow:

bfloat16_t (0x1234);

on a single bfloat, I don't see how we could still block conversions like:

bfloat16_t scalar1 = 0.1;
bfloat16_t scalar2 = 0;
bfloat16_t scalar3 = is_a_float;

Agreed that the union {} would still always slip through, though.

I'll also reply to the 2/2 email to show you what I currently have there.
Let me know of your thoughts on that!

Cheers,
Stam

> Thanks for the new patch, looks good apart from the points above and:
> 
>> +;; Iterator for all scalar floating point modes suitable for moving, including
>> +;; special BF type.(HF, SF, DF, TF and BF)
> 
> Nit: should be space rather than "." before "(".
Done
> 
>> +(define_mode_iterator GPF_TF_F16_MOV [(HF "") (BF "TARGET_BF16_FP") (SF "")
>> +				      (DF "") (TF "")])
>> +
>>   ;; Double vector modes.
>>   (define_mode_iterator VDF [V2SF V4HF])
>>   
>> @@ -79,6 +87,9 @@
>>   ;; Double vector modes.
>>   (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
>>   
>> +;; Double vector modes suitable for moving.  Includes BFmode.
>> +(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
>> +
>>   ;; All modes stored in registers d0-d31.
>>   (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
>>   
>> @@ -94,6 +105,9 @@
>>   ;; Quad vector modes.
>>   (define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
>>   
>> +;; Quad vector modes suitable for moving.  Includes BFmode.
>> +(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
>> +
>>   ;; Copy of the above.
>>   (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
>>   
> 
> This looks a bit inconsistent: the scalar iterator requires
> TARGET_BF16_FP for bf16 modes, but the vector iterator doesn't.

Ah yes this was because I chose to put the TARGET_xx only on the define_expands. 
But all this has been removed for now (all the BFmode movs are unrestricted).

> 
>> @@ -160,6 +174,15 @@
>>   (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>>   				V4HF V8HF V2SF V4SF V2DF])
>>   
>> +;; All Advanced SIMD modes suitable for moving, loading, and storing,
>> +;; including special Bfloat vector types.
>> +(define_mode_iterator VALL_F16MOV [(V8QI "") (V16QI "") (V4HI "") (V8HI "")
>> +				   (V2SI "") (V4SI "") (V2DI "")
>> +				   (V4HF "") (V8HF "")
>> +				   (V4BF "TARGET_BF16_SIMD")
>> +				   (V8BF "TARGET_BF16_SIMD")
>> +				   (V2SF "") (V4SF "") (V2DF "")])
>> +
>>   ;; The VALL_F16 modes except the 128-bit 2-element ones.
>>   (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
>>   				V4HF V8HF V2SF V4SF])
> 
> whereas here we do check.  But that comes back to the (a)/(b) choice above.

Agreed. Since we implemented (b) in this revision all these restrictions on the 
iterators have been removed.

> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
>> new file mode 100644
>> index 00000000000..f2bef671deb
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
>> @@ -0,0 +1,51 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "-O3 --save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**stacktest1:
>> +**	...
>> +**	str	h0, \[sp, [0-9]+\]
>> +**	ldr	h0, \[sp, [0-9]+\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16_t stacktest1 (bfloat16_t __a)
>> +{
>> +  volatile bfloat16_t b = __a;
>> +  return b;
>> +}
>> +
>> +/*
>> +**stacktest2:
>> +**	...
>> +**	str	d0, \[sp, [0-9]+\]
>> +**	ldr	d0, \[sp, [0-9]+\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16x4_t stacktest2 (bfloat16x4_t __a)
>> +{
>> +  volatile bfloat16x4_t b = __a;
>> +  return b;
>> +}
>> +
>> +/*
>> +**stacktest3:
>> +**	...
>> +**	str	q0, \[sp\]
>> +**	ldr	q0, \[sp\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16x8_t stacktest3 (bfloat16x8_t __a)
>> +{
>> +  volatile bfloat16x8_t b = __a;
>> +  return b;
>> +}
> 
> Might be a daft question, but why do we have an offset for the first
> two and not for the last one?  Might be worth hard-coding whatever
> offset we use.
> 
> If we use -fomit-frame-pointer then the whole function body should
> be stable: sub, str, ldr, add, ret.

Oh I don't know why to be honest, it just seemed to be how they were compiled.
In this case -fomit-frame-pointer doesn't seem to do anything to remove the 
offset (apparently this flag is enabled by default from -O, anyway).
So I've hard-coded the offset into the test for now. Lmk if this is what you meant!
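
i.e. each body is now matched in full, roughly like this (the exact
offset is just what I see locally):

/*
**stacktest1:
**	sub	sp, sp, #16
**	str	h0, \[sp, 14\]
**	ldr	h0, \[sp, 14\]
**	add	sp, sp, 16
**	ret
*/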



> 
>> @@ -0,0 +1,21 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>> +/* { dg-additional-options "-O3 --save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +#include <arm_bf16.h>
>> +
>> +/*
>> +**stacktest1:
>> +**	...
>> +**	str	h0, \[sp, [0-9]+\]
>> +**	ldr	h0, \[sp, [0-9]+\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16_t stacktest1 (bfloat16_t __a)
>> +{
>> +  volatile bfloat16_t b = __a;
>> +  return b;
>> +}
> 
> Same comment here.
Done
> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c
>> new file mode 100644
>> index 00000000000..9bcb53b32d8
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_3.c
>> @@ -0,0 +1,25 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-march=armv8.2-a -O2" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("+bf16")
>> +
>> +#include <arm_bf16.h>
>> +
>> +/*
>> +**stacktest1:
>> +**	...
>> +**	str	h0, \[sp, [0-9]+\]
>> +**	ldr	h0, \[sp, [0-9]+\]
>> +**	...
>> +**	ret
>> +*/
>> +bfloat16_t stacktest1 (bfloat16_t __a)
>> +{
>> +  volatile bfloat16_t b = __a;
>> +  return b;
>> +}
>> +
>> +#pragma GCC pop_options
> 
> Here too.  No real need for the push & pop, but keeping them is fine
> if that seems more obvious.
Same as above: offset hard-coded here too.
Oh good to know! Yes I left the push & pop in, if only for the sake of clarity.

Cheers,
Stam
> 
> Thanks,
> Richard
> 


[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: BFmode1of2-rev4.patch --]
[-- Type: text/x-patch; name="BFmode1of2-rev4.patch", Size: 36118 bytes --]

diff --git a/gcc/config.gcc b/gcc/config.gcc
index c3d6464f3e6..075e46072d1 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -315,7 +315,7 @@ m32c*-*-*)
         ;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_acle.h arm_sve.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 1bd2640a1ce..b2d6b761489 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -68,6 +68,9 @@
 #define hi_UP    E_HImode
 #define hf_UP    E_HFmode
 #define qi_UP    E_QImode
+#define bf_UP    E_BFmode
+#define v4bf_UP  E_V4BFmode
+#define v8bf_UP  E_V8BFmode
 #define UP(X) X##_UP
 
 #define SIMD_MAX_BUILTIN_ARGS 5
@@ -568,6 +571,10 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE;
 tree aarch64_fp16_type_node = NULL_TREE;
 tree aarch64_fp16_ptr_type_node = NULL_TREE;
 
+/* Back-end node type for brain float (bfloat) types.  */
+tree aarch64_bf16_type_node = NULL_TREE;
+tree aarch64_bf16_ptr_type_node = NULL_TREE;
+
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
    (relative to AARCH64_BUILTIN_GENERAL).  */
@@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
       return float_type_node;
     case E_DFmode:
       return double_type_node;
+    case E_BFmode:
+      return aarch64_bf16_type_node;
     default:
       gcc_unreachable ();
     }
@@ -750,6 +759,10 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
   aarch64_simd_types[Float64x2_t].eltype = double_type_node;
 
+  /* Init Bfloat vector types with underlying __bf16 type.  */
+  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = aarch64_simd_types[i].eltype;
@@ -1059,6 +1072,19 @@ aarch64_init_fp16_types (void)
   aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
 }
 
+/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
+static void
+aarch64_init_bf16_types (void)
+{
+  aarch64_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
+  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
+  layout_type (aarch64_bf16_type_node);
+
+  lang_hooks.types.register_builtin_type (aarch64_bf16_type_node, "__bf16");
+  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
+}
+
 /* Pointer authentication builtins that will become NOP on legacy platform.
    Currently, these builtins are for internal use only (libgcc EH unwinder).  */
 
@@ -1214,6 +1240,8 @@ aarch64_general_init_builtins (void)
 
   aarch64_init_fp16_types ();
 
+  aarch64_init_bf16_types ();
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
 
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 6cd8ed0972a..1eeb8d88452 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -69,6 +69,13 @@ VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
+/* Bfloat16 modes.  */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
+VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
+VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
+
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
 
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index 76d4d130013..e885755bc92 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -50,3 +50,5 @@
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
   ENTRY (Float64x2_t, V2DF, none, 13)
+  ENTRY (Bfloat16x4_t, V4BF, none, 14)
+  ENTRY (Bfloat16x8_t, V8BF, none, 14)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4e28cf97516..cea9592695a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand")
+	(match_operand:VALL_F16MOV 1 "general_operand"))]
   "TARGET_SIMD"
   "
   /* Force the operand into a register if it is not an
@@ -101,10 +101,10 @@
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VD:mode>"
-  [(set (match_operand:VD 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VDMOV:mode>"
+  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VD 1 "general_operand"
+	(match_operand:VDMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -129,10 +129,10 @@
 		     mov_reg, neon_move<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VQ:mode>"
-  [(set (match_operand:VQ 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VQMOV:mode>"
+  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VQ 1 "general_operand"
+	(match_operand:VQMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -234,8 +234,8 @@
 
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-      (match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+      (match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && GP_REGNUM_P (REGNO (operands[0]))
    && GP_REGNUM_P (REGNO (operands[1]))"
@@ -246,8 +246,8 @@
 })
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-        (match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+        (match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && ((FP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1])))
        || (GP_REGNUM_P (REGNO (operands[0])) && FP_REGNUM_P (REGNO (operands[1]))))"
@@ -258,8 +258,8 @@
 })
 
 (define_expand "@aarch64_split_simd_mov<mode>"
-  [(set (match_operand:VQ 0)
-        (match_operand:VQ 1))]
+  [(set (match_operand:VQMOV 0)
+        (match_operand:VQMOV 1))]
   "TARGET_SIMD"
   {
     rtx dst = operands[0];
@@ -295,8 +295,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>low"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_lo_half" "")))]
+          (match_operand:VQMOV 1 "register_operand" "w")
+          (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[0]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -306,8 +306,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>high"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_hi_half" "")))]
+          (match_operand:VQMOV 1 "register_operand" "w")
+          (match_operand:VQMOV 2 "vect_par_cnst_hi_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[1]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -1471,8 +1471,8 @@
 ;; On big-endian this is { zeroes, operand }
 
 (define_insn "move_lo_quad_internal_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")
 	  (vec_duplicate:<VHALF> (const_int 0))))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
@@ -1501,8 +1501,8 @@
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (vec_duplicate:<VHALF> (const_int 0))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
@@ -1531,8 +1531,8 @@
 )
 
 (define_expand "move_lo_quad_<mode>"
-  [(match_operand:VQ 0 "register_operand")
-   (match_operand:VQ 1 "register_operand")]
+  [(match_operand:VQMOV 0 "register_operand")
+   (match_operand:VQMOV 1 "register_operand")]
   "TARGET_SIMD"
 {
   if (BYTES_BIG_ENDIAN)
@@ -1549,11 +1549,11 @@
 ;; For big-endian this is { operand1, operand2 }
 
 (define_insn "aarch64_simd_move_hi_quad_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
   "@
@@ -1563,12 +1563,12 @@
 )
 
 (define_insn "aarch64_simd_move_hi_quad_be_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
 	  (match_operand:<VHALF> 1 "register_operand" "w,r")
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))))]
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "@
    ins\\t%0.d[1], %1.d[0]
@@ -1577,7 +1577,7 @@
 )
 
 (define_expand "move_hi_quad_<mode>"
- [(match_operand:VQ 0 "register_operand")
+ [(match_operand:VQMOV 0 "register_operand")
   (match_operand:<VHALF> 1 "register_operand")]
  "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 85cadef1be8..ddf5a84a3b5 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1692,6 +1692,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V2SImode:
     /* ...E_V1DImode doesn't exist.  */
     case E_V4HFmode:
+    case E_V4BFmode:
     case E_V2SFmode:
     case E_V1DFmode:
     /* 128-bit Advanced SIMD vectors.  */
@@ -1700,6 +1701,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V4SImode:
     case E_V2DImode:
     case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
@@ -15603,6 +15605,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
+	case E_BFmode:
+	  field_t = aarch64_bf16_type_node;
+	  field_ptr_t = aarch64_bf16_ptr_type_node;
+	  break;
 	case E_V2SImode:
 	case E_V4SImode:
 	    {
@@ -16116,6 +16122,8 @@ aarch64_vq_mode (scalar_mode mode)
       return V4SFmode;
     case E_HFmode:
       return V8HFmode;
+    case E_BFmode:
+      return V8BFmode;
     case E_SImode:
       return V4SImode;
     case E_HImode:
@@ -16149,6 +16157,8 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
 	    return V2SFmode;
 	  case E_HFmode:
 	    return V4HFmode;
+	  case E_BFmode:
+	    return V4BFmode;
 	  case E_SImode:
 	    return V2SImode;
 	  case E_HImode:
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 04dabd46437..b0492205610 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1120,13 +1120,13 @@ extern enum aarch64_code_model aarch64_cmodel;
 #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \
   ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \
    || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \
-   || (MODE) == DFmode)
+   || (MODE) == DFmode || (MODE) == V4BFmode)
 
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
    || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
-   || (MODE) == V2DFmode)
+   || (MODE) == V2DFmode || (MODE) == V8BFmode)
 
 #define ENDIAN_LANE_N(NUNITS, N) \
   (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N)
@@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* This type is the user-visible __bf16, and a pointer to that type.  Defined
+   in aarch64-builtins.c.  */
+extern tree aarch64_bf16_type_node;
+extern tree aarch64_bf16_ptr_type_node;
+
 /* The generic unwind code in libgcc does not initialize the frame pointer.
    So in order to unwind a function using a frame pointer, the very first
    function that is unwound must save the frame pointer.  That way the frame
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 34cb99e2897..85106910f74 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1304,8 +1304,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand")
-	(match_operand:GPF_TF_F16 1 "general_operand"))]
+  [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand")
+	(match_operand:GPF_TF_F16_MOV 1 "general_operand"))]
   ""
   {
     if (!TARGET_FLOAT)
@@ -1321,11 +1321,11 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
-  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
new file mode 100644
index 00000000000..884b6f3bc7a
--- /dev/null
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -0,0 +1,32 @@
+/* Arm BF16 intrinsics include file.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+typedef __bf16 bfloat16_t;
+
+#endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c7425346b86..eaba156e26c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -73,6 +73,9 @@ typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
 typedef struct int8x8x2_t
 {
   int8x8_t val[2];
@@ -34606,6 +34609,8 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+#include "arm_bf16.h"
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e5fa31f6748..9fd05abe3b6 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -57,9 +57,16 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_HF [HF SF DF])
 
+;; Iterator for all 16-bit scalar floating point modes (HF, BF)
+(define_mode_iterator HFBF [HF BF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
+;; Iterator for all scalar floating point modes suitable for moving, including
+;; special BF type (HF, SF, DF, TF and BF)
+(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF])
+
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
 
@@ -79,6 +86,9 @@
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
+;; Double vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; All modes stored in registers d0-d31.
 (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 
@@ -97,6 +107,12 @@
 ;; Copy of the above.
 (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Quad vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
+
+;; Quad vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])
+
 ;; Quad integer vector modes.
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 
@@ -160,6 +176,11 @@
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; including special Bfloat vector types.
+(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -226,6 +247,9 @@
 ;; Advanced SIMD modes for Q and H types.
 (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
 
+;; Advanced SIMD modes for BF vector types.
+(define_mode_iterator VBF [V4BF V8BF])
+
 ;; Advanced SIMD modes for H and S types.
 (define_mode_iterator VDQHS [V4HI V8HI V2SI V4SI])
 
@@ -745,6 +769,7 @@
 			  (V2SI "2") (V4SI "4")
 				     (V2DI "2")
 			  (V4HF "4") (V8HF "8")
+			  (V4BF "4") (V8BF "8")
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")])
@@ -885,7 +910,8 @@
 			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF  "16b")
 			  (DI   "8b")  (DF    "8b")
-			  (SI   "8b")  (SF    "8b")])
+			  (SI   "8b")  (SF    "8b")
+			  (V4BF "8b")  (V8BF  "16b")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
@@ -965,12 +991,13 @@
 			 (V2SI "SI")    (V4SI  "V2SI")
 			 (V2DI "DI")    (V2SF  "SF")
 			 (V4SF "V2SF")  (V4HF "V2HF")
-			 (V8HF "V4HF")  (V2DF  "DF")])
+			 (V8HF "V4HF")  (V2DF  "DF")
+			 (V8BF "V4BF")])
 
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V4HI "v2hi")  (V8HI  "v4hi")
-			 (V8HF  "v4hf")
+			 (V8HF  "v4hf") (V8BF  "v4bf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
 			 (V4SF "v2sf")  (V2DF  "df")])
@@ -1265,6 +1292,7 @@
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
 		     (V4HF "") (V8HF "_q")
+		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
 			       (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")])
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c
new file mode 100644
index 00000000000..5186d0e3d24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c
@@ -0,0 +1,118 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	...
+**	mov	v1.h\[0\], v2.h\[0\]
+**	...
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("#foo" : "=w" (x));
+  y = x;
+  asm volatile ("#foo" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	...
+**	dup	v1.4h, w1
+**	...
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	...
+**	umov	w1, v1.h\[0\]
+**	...
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("#foo" : "=w" (x));
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	...
+**	mov	w1, w2
+**	...
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	...
+**	strh	w2, \[sp, 14\]
+**	...
+**	ret
+*/
+void bfloat_mov_rm (void)
+{
+  register bfloat16_t x asm ("w2");
+  volatile bfloat16_t y;
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" : : : "memory");
+}
+
+/*
+**bfloat_mov_mr:
+**	...
+**	ldrh	w2, \[sp, 14\]
+**	...
+**	ret
+*/
+void bfloat_mov_mr (void)
+{
+  volatile bfloat16_t x;
+  register bfloat16_t y asm ("w2");
+  asm volatile ("#foo" : : : "memory");
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c
new file mode 100644
index 00000000000..02656d32f14
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c
@@ -0,0 +1,122 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	...
+**	mov	v1.h\[0\], v2.h\[0\]
+**	...
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("#foo" : "=w" (x));
+  y = x;
+  asm volatile ("#foo" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	...
+**	dup	v1.4h, w1
+**	...
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	...
+**	umov	w1, v1.h\[0\]
+**	...
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("#foo" : "=w" (x));
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	...
+**	mov	w1, w2
+**	...
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	...
+**	strh	w2, \[sp, 14\]
+**	...
+**	ret
+*/
+void bfloat_mov_rm (void)
+{
+  register bfloat16_t x asm ("w2");
+  volatile bfloat16_t y;
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" : : : "memory");
+}
+
+/*
+**bfloat_mov_mr:
+**	...
+**	ldrh	w2, \[sp, 14\]
+**	...
+**	ret
+*/
+void bfloat_mov_mr (void)
+{
+  volatile bfloat16_t x;
+  register bfloat16_t y asm ("w2");
+  asm volatile ("#foo" : : : "memory");
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
+#pragma GCC pop_options
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_3.c
new file mode 100644
index 00000000000..6170c85b196
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_3.c
@@ -0,0 +1,116 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	...
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	...
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	...
+**	mov	v1.h\[0\], v2.h\[0\]
+**	...
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("#foo" : "=w" (x));
+  y = x;
+  asm volatile ("#foo" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	...
+**	dup	v1.4h, w1
+**	...
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	...
+**	umov	w1, v1.h\[0\]
+**	...
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("#foo" : "=w" (x));
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	...
+**	mov	w1, w2
+**	...
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	...
+**	strh	w2, \[sp, 14\]
+**	...
+**	ret
+*/
+void bfloat_mov_rm (void)
+{
+  register bfloat16_t x asm ("w2");
+  volatile bfloat16_t y;
+  asm volatile ("#foo" : "=r" (x));
+  y = x;
+  asm volatile ("#foo" : : : "memory");
+}
+
+/*
+**bfloat_mov_mr:
+**	...
+**	ldrh	w2, \[sp, 14\]
+**	...
+**	ret
+*/
+void bfloat_mov_mr (void)
+{
+  volatile bfloat16_t x;
+  register bfloat16_t y asm ("w2");
+  asm volatile ("#foo" : : : "memory");
+  y = x;
+  asm volatile ("#foo" :: "r" (y));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_4.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_4.c
new file mode 100644
index 00000000000..b812011c223
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_4.c
@@ -0,0 +1,16 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-std=c99 -pedantic-errors -O3 --save-temps" } */
+
+#include <arm_bf16.h>
+
+_Complex bfloat16_t stacktest1 (_Complex bfloat16_t __a)
+{
+  volatile _Complex bfloat16_t b = __a;
+  return b;
+}
+
+/* { dg-error {ISO C does not support plain 'complex' meaning 'double complex'} "" { target *-*-* } 8 } */
+/* { dg-error {expected '=', ',', ';', 'asm' or '__attribute__' before 'stacktest1'} "" { target *-*-* } 8 } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_1.c
new file mode 100644
index 00000000000..1db85fb9ba0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_1.c
@@ -0,0 +1,93 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub.*
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add.*
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub.*
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add.*
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub.*
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add.*
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/* Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_2.c
new file mode 100644
index 00000000000..660a02d0f03
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_2.c
@@ -0,0 +1,97 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+/*
+**stacktest1:
+**	sub.*
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add.*
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub.*
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add.*
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub.*
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add.*
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_3.c
new file mode 100644
index 00000000000..6b22bae59af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_compile_3.c
@@ -0,0 +1,92 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub.*
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add.*
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub.*
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add.*
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub.*
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add.*
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }


* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2020-01-07 11:42       ` Stam Markianos-Wright
@ 2020-01-07 17:15         ` Richard Sandiford
  2020-01-09 15:12           ` Stam Markianos-Wright
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Sandiford @ 2020-01-07 17:15 UTC (permalink / raw)
  To: Stam Markianos-Wright
  Cc: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft

Thanks for the update.  The new patch looks really good, just some
minor comments.

Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> [...]
> Also I've updated the filenames of all our tests to make them a bit clearer:
>
> C tests:
>
> __ bfloat16_scalar_compile_1.c to bfloat16_scalar_compile_3.c: Compilation of
> scalar moves/loads/stores with "-march=armv8.2-a+bf16", "-march=armv8.2-a and
> +bf16 target pragma", "-march=armv8.2-a" (now does not error out at all). These
> now include register asms to check more MOV alternatives.
>
> __ bfloat16_scalar_compile_4.c: The _Complex error test.
>
> __ bfloat16_simd_compile_1.c to bfloat16_simd_compile_3.c: Likewise to the
> _scalar_ tests, but also include (vector) 0x1234.. compilation (no assembler scan).

Sounds good to me, although TBH the "_compile" feels a bit redundant.

> I had also done a small c++ test, but have chosen to shift that to the [2/2] 
> patch because it is currently being blocked by target_invalid_conversion.

OK.  Does that include the mangling test?

> [...]
>>>> - a test that involves moving constants, for both scalars and vectors.
>>>>     You can create zero scalar constants in C++ using bfloat16_t() etc.
>>>>     For vectors it's possible to do things like:
>>>>
>>>>       typedef short v2hi __attribute__((vector_size(4)));
>>>>       v2hi foo (void) { return (v2hi) 0x12345678; }
>>>>
>>>>     The same sort of things should work for bfloat16x4_t and bfloat16x8_t.
>>>
>>> Leaving this as an open issue for now because I'm not 100% sure what we
>>> should/shouldn't be allowing past the tree-level target hooks.
>>>
>>> If we do want to block this we would do this in the [2/2] patch.
>>> I will come back to it and create a scan-assembler test when I'm more clear on
>>> what we should and shouldn't allow at the higher level :)
>> 
>> FWIW, I'm not sure we should go out of our way to disallow this.
>> Preventing bfloat16_t() in C++ would IMO be unnatural.  And the
>> "(vector) vector-sized-integer" syntax specifically treats the vector
>> as a bundle of bits without really caring what the element type is.
>> Even if we did manage to forbid the conversion in that context,
>> it would still be possible to achieve the same thing using:
>> 
>>     v2hi
>>     foo (void)
>>     {
>>       union { v2hi v; unsigned int i; } u;
>>       u.i = 0x12345678;
>>       return u.v;
>>     }
>> 
> Added the compilation of "(vector) vector-sized-integer" in the vector tests.
>
> But target_invalid_conversion in the [2/2] patch is a complication to this (as
> with bfloat16_t() in C++).
>
> I was under the impression that the original intent of bfloat was for it to be 
> storage only, with any initialisation happening through the float32 convert 
> intrinsic.
>
> Either way, I'd be happy to allow it, but it does feel like we'd be going
> slightly against what the ACLE currently says.
> However, looking back at it now, it only mentions using ACLE intrinsics over C 
> operators, so I'd be happy to allow this for vectors.
>
> For scalars though, if we e.g. were to allow:
>
> bfloat16_t (0x1234);
>
> on a single bfloat, I don't see how we could still block conversions like:
>
> bfloat16_t scalar1 = 0.1;
> bfloat16_t scalar2 = 0;
> bfloat16_t scalar3 = is_a_float;
>
> Agreed that the union {} would still always slip through, though.

It wasn't clear, sorry, but I meant literally "bfloat16_t()", i.e.
construction with zero initialisation.  I agree we don't want to
support "bfloat16_t(0.25)" etc.

> [...]
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
>>> new file mode 100644
>>> index 00000000000..f2bef671deb
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
>>> @@ -0,0 +1,51 @@
>>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>>> +/* { dg-additional-options "-O3 --save-temps" } */
>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>> +
>>> +#include <arm_neon.h>
>>> +
>>> +/*
>>> +**stacktest1:
>>> +**	...
>>> +**	str	h0, \[sp, [0-9]+\]
>>> +**	ldr	h0, \[sp, [0-9]+\]
>>> +**	...
>>> +**	ret
>>> +*/
>>> +bfloat16_t stacktest1 (bfloat16_t __a)
>>> +{
>>> +  volatile bfloat16_t b = __a;
>>> +  return b;
>>> +}
>>> +
>>> +/*
>>> +**stacktest2:
>>> +**	...
>>> +**	str	d0, \[sp, [0-9]+\]
>>> +**	ldr	d0, \[sp, [0-9]+\]
>>> +**	...
>>> +**	ret
>>> +*/
>>> +bfloat16x4_t stacktest2 (bfloat16x4_t __a)
>>> +{
>>> +  volatile bfloat16x4_t b = __a;
>>> +  return b;
>>> +}
>>> +
>>> +/*
>>> +**stacktest3:
>>> +**	...
>>> +**	str	q0, \[sp\]
>>> +**	ldr	q0, \[sp\]
>>> +**	...
>>> +**	ret
>>> +*/
>>> +bfloat16x8_t stacktest3 (bfloat16x8_t __a)
>>> +{
>>> +  volatile bfloat16x8_t b = __a;
>>> +  return b;
>>> +}
>> 
>> Might be a daft question, but why do we have an offset for the first
>> two and not for the last one?  Might be worth hard-coding whatever
>> offset we use.

I should have realised first time, but it's because we allocate the
local variable area downwards from the soft frame pointer.  So the
area gets padded downwards rather than upwards.
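
Concretely: each of these functions gets a 16-byte local variable area
(the stack pointer stays 16-byte aligned), so the 2-byte bfloat16_t slot
is padded down to offset 16 - 2 = 14 (hence [sp, 14]), the 8-byte
bfloat16x4_t slot to 16 - 8 = 8, and the 16-byte bfloat16x8_t slot fills
the whole area (hence [sp] with no offset).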

> [...]
> @@ -97,6 +107,12 @@
>  ;; Copy of the above.
>  (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
>  
> +;; Quad vector modes suitable for moving.  Includes BFmode.
> +(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
> +
> +;; Quad vector modes suitable for moving.  Includes BFmode.
> +(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])

Comment pasto for VQMOV_NO2E.  Think it should be:

;; VQMOV without 2-element modes.

>  ;; Quad integer vector modes.
>  (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
>  
> @@ -160,6 +176,11 @@
>  (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>  				V4HF V8HF V2SF V4SF V2DF])
>  
> +;; All Advanced SIMD modes suitable for moving, loading, and storing,
> +;; including special Bfloat vector types.
> +(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
> +				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])

Nit: line should be indented below "V8QI".

> @@ -226,6 +247,9 @@
>  ;; Advanced SIMD modes for Q and H types.
>  (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
>  
> +;; Advanced SIMD modes for BF vector types.
> +(define_mode_iterator VBF [V4BF V8BF])

Nothing in this patch uses VBF, so probably best to leave it until later.

> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c
> new file mode 100644
> index 00000000000..5186d0e3d24
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c
> @@ -0,0 +1,118 @@
> [...]
> +/*
> +**bfloat_mov_rm:
> +**	...
> +**	strh	w2, \[sp, 14\]
> +**	...
> +**	ret
> +*/
> +void bfloat_mov_rm (void)
> +{
> +  register bfloat16_t x asm ("w2");
> +  volatile bfloat16_t y;
> +  asm volatile ("#foo" : "=r" (x));
> +  y = x;
> +  asm volatile ("#foo" : : : "memory");
> +}

Probably simpler as:

/*
**bfloat_mov_rm:
**	strh	w2, \[x0\]
**	ret
*/
void bfloat_mov_rm (bfloat16_t *ptr)
{
  register bfloat16_t x asm ("w2");
  asm volatile ("#foo" : "=r" (x));
  *ptr = x;
}

> +/*
> +**bfloat_mov_mr:
> +**	...
> +**	ldrh	w2, \[sp, 14\]
> +**	...
> +**	ret
> +*/
> +void bfloat_mov_mr (void)
> +{
> +  volatile bfloat16_t x;
> +  register bfloat16_t y asm ("w2");
> +  asm volatile ("#foo" : : : "memory");
> +  y = x;
> +  asm volatile ("#foo" :: "r" (y));
> +}

Similarly here:

/*
**bfloat_mov_mr:
**	ldrh	w2, \[x0\]
**	ret
*/
void bfloat_mov_mr (bfloat16_t *ptr)
{
  register bfloat16_t y asm ("w2");
  y = *ptr;
  asm volatile ("#foo" :: "r" (y));
}

Same for _2.c and _3.c

> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c
> new file mode 100644
> index 00000000000..02656d32f14
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c
> @@ -0,0 +1,122 @@
> +/* { dg-do assemble { target { aarch64*-*-* } } } */
> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
> +/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#pragma GCC push_options
> +#pragma GCC target ("+bf16")
> +
> +#include <arm_bf16.h>

This effectively tests the same thing as bfloat16_scalar_compile_1.c.
IMO the more interesting way round is:

#include <arm_bf16.h>

#pragma GCC push_options
#pragma GCC target ("+bf16")

like for the simd tests.  So _1.c is the normal "enable before include"
case, _2.c is "enable after include" and _3.c is "don't enable at all".

Thanks,
Richard


* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2020-01-07 17:15         ` Richard Sandiford
@ 2020-01-09 15:12           ` Stam Markianos-Wright
  2020-01-09 15:48             ` Richard Sandiford
  0 siblings, 1 reply; 9+ messages in thread
From: Stam Markianos-Wright @ 2020-01-09 15:12 UTC (permalink / raw)
  To: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft,
	Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 10231 bytes --]



On 1/7/20 5:14 PM, Richard Sandiford wrote:
> Thanks for the update.  The new patch looks really good, just some
> minor comments.
> 
> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>> [...]
>> Also I've updated the filenames of all our tests to make them a bit clearer:
>>
>> C tests:
>>
>> __ bfloat16_scalar_compile_1.c to bfloat16_scalar_compile_3.c: Compilation of
>> scalar moves/loads/stores with "-march=armv8.2-a+bf16", "-march=armv8.2-a and
>> +bf16 target pragma", "-march=armv8.2-a" (now does not error out at all). These
>> now include register asms to check more MOV alternatives.
>>
>> __ bfloat16_scalar_compile_4.c: The _Complex error test.
>>
>> __ bfloat16_simd_compile_1.c to bfloat16_simd_compile_3.c: Likewise to the
>> _scalar_ tests, but also include (vector) 0x1234.. compilation (no assembler scan).
> 
> Sounds good to me, although TBH the "_compile" feels a bit redundant.

Yes, true that! Removed it.

> 
>> I had also done a small c++ test, but have chosen to shift that to the [2/2]
>> patch because it is currently being blocked by target_invalid_conversion.
> 
> OK.  Does that include the mangling test?

Aaah no, this is the test checking for bfloat16_t(), bfloat16_t (0x1234),
bfloat16_t(0.25), etc. (which are more language-level checks).

Oh! I had forgotten about the mangling, so I've added it in this revision.
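
For reference, the mangling check is along these lines (a sketch; the
exact test file in the revision may differ):

/* C++: aarch64_mangle_type gives __bf16 the "u6__bf16" mangling, so
   foo (__bf16) should assemble to the symbol _Z3foou6__bf16.  */
void foo (__bf16) {}
/* { dg-final { scan-assembler "_Z3foou6__bf16" } } */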

> 
>> [...]
>>>>> - a test that involves moving constants, for both scalars and vectors.
>>>>>      You can create zero scalar constants in C++ using bfloat16_t() etc.
>>>>>      For vectors it's possible to do things like:
>>>>>
>>>>>        typedef short v2hi __attribute__((vector_size(4)));
>>>>>        v2hi foo (void) { return (v2hi) 0x12345678; }
>>>>>
>>>>>      The same sort of things should work for bfloat16x4_t and bfloat16x8_t.
>>>>
>>>> Leaving this as an open issue for now because I'm not 100% sure what we
>>>> should/shouldn't be allowing past the tree-level target hooks.
>>>>
>>>> If we do want to block this we would do this in the [2/2] patch.
>>>> I will come back to it and create a scan-assembler test when I'm more clear on
>>>> what we should and shouldn't allow at the higher level :)
>>>
>>> FWIW, I'm not sure we should go out of our way to disallow this.
>>> Preventing bfloat16_t() in C++ would IMO be unnatural.  And the
>>> "(vector) vector-sized-integer" syntax specifically treats the vector
>>> as a bundle of bits without really caring what the element type is.
>>> Even if we did manage to forbid the conversion in that context,
>>> it would still be possible to achieve the same thing using:
>>>
>>>      v2hi
>>>      foo (void)
>>>      {
>>>        union { v2hi v; unsigned int i; } u;
>>>        u.i = 0x12345678;
>>>        return u.v;
>>>      }
>>>
>> Added the compilation of "(vector) vector-sized-integer" in the vector tests.
>>
>> But target_invalid_conversion in the [2/2] patch is a complication to this (as
>> with bfloat16_t() in C++).
>>
>> I was under the impression that the original intent of bfloat was for it to be
>> storage only, with any initialisation happening through the float32 convert
>> intrinsic.
>>
>> Either way, I'd be happy to allow it, but it does feel like we'd be going
>> slightly against what the ACLE currently says.
>> However, looking back at it now, it only mentions using ACLE intrinsics over C
>> operators, so I'd be happy to allow this for vectors.
>>
>> For scalars though, if we e.g. were to allow:
>>
>> bfloat16_t (0x1234);
>>
>> on a single bfloat, I don't see how we could still block conversions like:
>>
>> bfloat16_t scalar1 = 0.1;
>> bfloat16_t scalar2 = 0;
>> bfloat16_t scalar3 = is_a_float;
>>
>> Agreed that the union {} would still always slip through, though.
> 
> It wasn't clear, sorry, but I meant literally "bfloat16_t()", i.e.
> construction with zero initialisation.  I agree we don't want to
> support "bfloat16_t(0.25)" etc.

Added to [2/2] as mentioned above.

> 
>> [...]
>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
>>>> new file mode 100644
>>>> index 00000000000..f2bef671deb
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_compile_1.c
>>>> @@ -0,0 +1,51 @@
>>>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>>>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>>>> +/* { dg-add-options arm_v8_2a_bf16_neon }  */
>>>> +/* { dg-additional-options "-O3 --save-temps" } */
>>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>>> +
>>>> +#include <arm_neon.h>
>>>> +
>>>> +/*
>>>> +**stacktest1:
>>>> +**	...
>>>> +**	str	h0, \[sp, [0-9]+\]
>>>> +**	ldr	h0, \[sp, [0-9]+\]
>>>> +**	...
>>>> +**	ret
>>>> +*/
>>>> +bfloat16_t stacktest1 (bfloat16_t __a)
>>>> +{
>>>> +  volatile bfloat16_t b = __a;
>>>> +  return b;
>>>> +}
>>>> +
>>>> +/*
>>>> +**stacktest2:
>>>> +**	...
>>>> +**	str	d0, \[sp, [0-9]+\]
>>>> +**	ldr	d0, \[sp, [0-9]+\]
>>>> +**	...
>>>> +**	ret
>>>> +*/
>>>> +bfloat16x4_t stacktest2 (bfloat16x4_t __a)
>>>> +{
>>>> +  volatile bfloat16x4_t b = __a;
>>>> +  return b;
>>>> +}
>>>> +
>>>> +/*
>>>> +**stacktest3:
>>>> +**	...
>>>> +**	str	q0, \[sp\]
>>>> +**	ldr	q0, \[sp\]
>>>> +**	...
>>>> +**	ret
>>>> +*/
>>>> +bfloat16x8_t stacktest3 (bfloat16x8_t __a)
>>>> +{
>>>> +  volatile bfloat16x8_t b = __a;
>>>> +  return b;
>>>> +}
>>>
>>> Might be a daft question, but why do we have an offset for the first
>>> two and not for the last one?  Might be worth hard-coding whatever
>>> offset we use.
> 
> I should have realised first time, but it's because we allocate the
> local variable area downwards from the soft frame pointer.  So the
> area gets padded downwards rather than upwards.

Ahh ok thank you!

Also, regarding these tests: I removed the #foo markers because they were
tripping up check-function-bodies (I realised they weren't being ignored like
other comments, so after removing them I no longer need the "**	..." lines
before/after the MOVs).

> 
>> [...]
>> @@ -97,6 +107,12 @@
>>   ;; Copy of the above.
>>   (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
>>   
>> +;; Quad vector modes suitable for moving.  Includes BFmode.
>> +(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
>> +
>> +;; Quad vector modes suitable for moving.  Includes BFmode.
>> +(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])
> 
> Comment pasto for VQMOV_NO2E.  Think it should be:
> 
> ;; VQMOV without 2-element modes.

Yes, correct!

> 
>>   ;; Quad integer vector modes.
>>   (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
>>   
>> @@ -160,6 +176,11 @@
>>   (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>>   				V4HF V8HF V2SF V4SF V2DF])
>>   
>> +;; All Advanced SIMD modes suitable for moving, loading, and storing,
>> +;; including special Bfloat vector types.
>> +(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
>> +				V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
> 
> Nit: line should be indented below "V8QI".

Done!

> 
>> @@ -226,6 +247,9 @@
>>   ;; Advanced SIMD modes for Q and H types.
>>   (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
>>   
>> +;; Advanced SIMD modes for BF vector types.
>> +(define_mode_iterator VBF [V4BF V8BF])
> 
> Nothing in this patch uses VBF, so probably best to leave it until later.

Yep, removed it.

> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c
>> new file mode 100644
>> index 00000000000..5186d0e3d24
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_1.c
>> @@ -0,0 +1,118 @@
>> [...]
>> +/*
>> +**bfloat_mov_rm:
>> +**	...
>> +**	strh	w2, \[sp, 14\]
>> +**	...
>> +**	ret
>> +*/
>> +void bfloat_mov_rm (void)
>> +{
>> +  register bfloat16_t x asm ("w2");
>> +  volatile bfloat16_t y;
>> +  asm volatile ("#foo" : "=r" (x));
>> +  y = x;
>> +  asm volatile ("#foo" : : : "memory");
>> +}
> 
> Probably simpler as:
> 
> /*
> **bfloat_mov_rm:
> **	strh	w2, \[x0\]
> **	ret
> */
> void bfloat_mov_rm (bfloat16_t *ptr)
> {
>    register bfloat16_t x asm ("w2");
>    asm volatile ("#foo" : "=r" (x));
>    *ptr = x;
> }

Done

> 
>> +/*
>> +**bfloat_mov_mr:
>> +**	...
>> +**	ldrh	w2, \[sp, 14\]
>> +**	...
>> +**	ret
>> +*/
>> +void bfloat_mov_mr (void)
>> +{
>> +  volatile bfloat16_t x;
>> +  register bfloat16_t y asm ("w2");
>> +  asm volatile ("#foo" : : : "memory");
>> +  y = x;
>> +  asm volatile ("#foo" :: "r" (y));
>> +}
> 
> Similarly here:
> 
> /*
> **bfloat_mov_mr:
> **	ldrh	w2, \[x0\]
> **	ret
> */
> void bfloat_mov_mr (bfloat16_t *ptr)
> {
>    register bfloat16_t y asm ("w2");
>    y = *ptr;
>    asm volatile ("#foo" :: "r" (y));
> }
> 
> Same for _2.c and _3.c

Done

> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c
>> new file mode 100644
>> index 00000000000..02656d32f14
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_compile_2.c
>> @@ -0,0 +1,122 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +
>> +#pragma GCC push_options
>> +#pragma GCC target ("+bf16")
>> +
>> +#include <arm_bf16.h>
> 
> This effectively tests the same thing as bfloat16_scalar_compile_1.c.
> IMO the more interesting way round is:
> 
> #include <arm_bf16.h>

Yes, I changed it in the simd test but not here. Good catch!

> 
> #pragma GCC push_options
> #pragma GCC target ("+bf16")
> 
> like for the simd tests.  So _1.c is the normal "enable before include"
> case, _2.c is "enable after include" and _3.c is "don't enable at all".
> 
> Thanks,
> Richard
> 

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: BFmode1of2-rev4.patch --]
[-- Type: text/x-patch; name="BFmode1of2-rev4.patch", Size: 39277 bytes --]

diff --git a/gcc/config.gcc b/gcc/config.gcc
index c3d6464f3e6adaa1db818a61de00cff8e00ae08e..075e46072d1643302b9587d4e3f14f2e29b4ec8d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -315,7 +315,7 @@ m32c*-*-*)
         ;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_acle.h arm_sve.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 1bd2640a1ced352de232fed1cf134b46c69b80f7..b2d6b761489183c262320d62293bec343b315c11 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -68,6 +68,9 @@
 #define hi_UP    E_HImode
 #define hf_UP    E_HFmode
 #define qi_UP    E_QImode
+#define bf_UP    E_BFmode
+#define v4bf_UP  E_V4BFmode
+#define v8bf_UP  E_V8BFmode
 #define UP(X) X##_UP
 
 #define SIMD_MAX_BUILTIN_ARGS 5
@@ -568,6 +571,10 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE;
 tree aarch64_fp16_type_node = NULL_TREE;
 tree aarch64_fp16_ptr_type_node = NULL_TREE;
 
+/* Back-end node type for brain float (bfloat) types.  */
+tree aarch64_bf16_type_node = NULL_TREE;
+tree aarch64_bf16_ptr_type_node = NULL_TREE;
+
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
    (relative to AARCH64_BUILTIN_GENERAL).  */
@@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
       return float_type_node;
     case E_DFmode:
       return double_type_node;
+    case E_BFmode:
+      return aarch64_bf16_type_node;
     default:
       gcc_unreachable ();
     }
@@ -750,6 +759,10 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
   aarch64_simd_types[Float64x2_t].eltype = double_type_node;
 
+  /* Init Bfloat vector types with underlying __bf16 type.  */
+  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = aarch64_simd_types[i].eltype;
@@ -1059,6 +1072,19 @@ aarch64_init_fp16_types (void)
   aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
 }
 
+/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
+static void
+aarch64_init_bf16_types (void)
+{
+  aarch64_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
+  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
+  layout_type (aarch64_bf16_type_node);
+
+  lang_hooks.types.register_builtin_type (aarch64_bf16_type_node, "__bf16");
+  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
+}
+
 /* Pointer authentication builtins that will become NOP on legacy platform.
    Currently, these builtins are for internal use only (libgcc EH unwinder).  */
 
@@ -1214,6 +1240,8 @@ aarch64_general_init_builtins (void)
 
   aarch64_init_fp16_types ();
 
+  aarch64_init_bf16_types ();
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
 
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 6cd8ed0972ad7029e0319aad71d3afbda5684a4f..1eeb8d884520b1a53b8a580f165d42858c03228c 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -69,6 +69,13 @@ VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
+/* Bfloat16 modes.  */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
+VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
+VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
+
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
 
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index 76d4d130013d7498a23728337b63875958273a54..e885755bc927d1174dce8d490636df463b76d2f8 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -50,3 +50,5 @@
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
   ENTRY (Float64x2_t, V2DF, none, 13)
+  ENTRY (Bfloat16x4_t, V4BF, none, 14)
+  ENTRY (Bfloat16x8_t, V8BF, none, 14)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4e28cf97516df19e1d502e56c776f6b34f15c116..cea9592695ac8bd2f4e625f8b769ddaf716e9091 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand")
+	(match_operand:VALL_F16MOV 1 "general_operand"))]
   "TARGET_SIMD"
   "
   /* Force the operand into a register if it is not an
@@ -101,10 +101,10 @@
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VD:mode>"
-  [(set (match_operand:VD 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VDMOV:mode>"
+  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VD 1 "general_operand"
+	(match_operand:VDMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -129,10 +129,10 @@
 		     mov_reg, neon_move<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VQ:mode>"
-  [(set (match_operand:VQ 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VQMOV:mode>"
+  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VQ 1 "general_operand"
+	(match_operand:VQMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -234,8 +234,8 @@
 
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-      (match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+      (match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && GP_REGNUM_P (REGNO (operands[0]))
    && GP_REGNUM_P (REGNO (operands[1]))"
@@ -246,8 +246,8 @@
 })
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-        (match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+        (match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && ((FP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1])))
        || (GP_REGNUM_P (REGNO (operands[0])) && FP_REGNUM_P (REGNO (operands[1]))))"
@@ -258,8 +258,8 @@
 })
 
 (define_expand "@aarch64_split_simd_mov<mode>"
-  [(set (match_operand:VQ 0)
-        (match_operand:VQ 1))]
+  [(set (match_operand:VQMOV 0)
+        (match_operand:VQMOV 1))]
   "TARGET_SIMD"
   {
     rtx dst = operands[0];
@@ -295,8 +295,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>low"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_lo_half" "")))]
+          (match_operand:VQMOV 1 "register_operand" "w")
+          (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[0]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -306,8 +306,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>high"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_hi_half" "")))]
+          (match_operand:VQMOV 1 "register_operand" "w")
+          (match_operand:VQMOV 2 "vect_par_cnst_hi_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[1]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -1471,8 +1471,8 @@
 ;; On big-endian this is { zeroes, operand }
 
 (define_insn "move_lo_quad_internal_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")
 	  (vec_duplicate:<VHALF> (const_int 0))))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
@@ -1501,8 +1501,8 @@
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (vec_duplicate:<VHALF> (const_int 0))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
@@ -1531,8 +1531,8 @@
 )
 
 (define_expand "move_lo_quad_<mode>"
-  [(match_operand:VQ 0 "register_operand")
-   (match_operand:VQ 1 "register_operand")]
+  [(match_operand:VQMOV 0 "register_operand")
+   (match_operand:VQMOV 1 "register_operand")]
   "TARGET_SIMD"
 {
   if (BYTES_BIG_ENDIAN)
@@ -1549,11 +1549,11 @@
 ;; For big-endian this is { operand1, operand2 }
 
 (define_insn "aarch64_simd_move_hi_quad_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
   "@
@@ -1563,12 +1563,12 @@
 )
 
 (define_insn "aarch64_simd_move_hi_quad_be_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
 	  (match_operand:<VHALF> 1 "register_operand" "w,r")
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))))]
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "@
    ins\\t%0.d[1], %1.d[0]
@@ -1577,7 +1577,7 @@
 )
 
 (define_expand "move_hi_quad_<mode>"
- [(match_operand:VQ 0 "register_operand")
+ [(match_operand:VQMOV 0 "register_operand")
   (match_operand:<VHALF> 1 "register_operand")]
  "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 04dabd46437cf650e38e085d219c4e629b537e67..b04922056106ad2060d72b99fb49d57fd2b50f4b 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1120,13 +1120,13 @@ extern enum aarch64_code_model aarch64_cmodel;
 #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \
   ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \
    || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \
-   || (MODE) == DFmode)
+   || (MODE) == DFmode || (MODE) == V4BFmode)
 
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
    || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
-   || (MODE) == V2DFmode)
+   || (MODE) == V2DFmode || (MODE) == V8BFmode)
 
 #define ENDIAN_LANE_N(NUNITS, N) \
   (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N)
@@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* This type is the user-visible __bf16, and a pointer to that type.  Defined
+   in aarch64-builtins.c.  */
+extern tree aarch64_bf16_type_node;
+extern tree aarch64_bf16_ptr_type_node;
+
 /* The generic unwind code in libgcc does not initialize the frame pointer.
    So in order to unwind a function using a frame pointer, the very first
    function that is unwound must save the frame pointer.  That way the frame
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 85cadef1be819b3c1ad68ae70e755e0150ad6469..ebd3f6cf45bc0b5118c4c39e323e6380d64c885e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1692,6 +1692,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V2SImode:
     /* ...E_V1DImode doesn't exist.  */
     case E_V4HFmode:
+    case E_V4BFmode:
     case E_V2SFmode:
     case E_V1DFmode:
     /* 128-bit Advanced SIMD vectors.  */
@@ -1700,6 +1701,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V4SImode:
     case E_V2DImode:
     case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
@@ -15603,6 +15605,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
+	case E_BFmode:
+	  field_t = aarch64_bf16_type_node;
+	  field_ptr_t = aarch64_bf16_ptr_type_node;
+	  break;
 	case E_V2SImode:
 	case E_V4SImode:
 	    {
@@ -16116,6 +16122,8 @@ aarch64_vq_mode (scalar_mode mode)
       return V4SFmode;
     case E_HFmode:
       return V8HFmode;
+    case E_BFmode:
+      return V8BFmode;
     case E_SImode:
       return V4SImode;
     case E_HImode:
@@ -16149,6 +16157,8 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
 	    return V2SFmode;
 	  case E_HFmode:
 	    return V4HFmode;
+	  case E_BFmode:
+	    return V4BFmode;
 	  case E_SImode:
 	    return V2SImode;
 	  case E_HImode:
@@ -16263,9 +16273,14 @@ aarch64_mangle_type (const_tree type)
   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
     return "St9__va_list";
 
-  /* Half-precision float.  */
+  /* Half-precision floating point types.  */
   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
-    return "Dh";
+    {
+      if (TYPE_MODE (type) == BFmode)
+	return "u6__bf16";
+      else
+	return "Dh";
+    }
 
   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
      builtin types.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 34cb99e28975de2ef10d7f4202417e2f05a870a2..85106910f7459d1211e729c73f222f99f04e6d7f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1304,8 +1304,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand")
-	(match_operand:GPF_TF_F16 1 "general_operand"))]
+  [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand")
+	(match_operand:GPF_TF_F16_MOV 1 "general_operand"))]
   ""
   {
     if (!TARGET_FLOAT)
@@ -1321,11 +1321,11 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
-  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
new file mode 100644
index 0000000000000000000000000000000000000000..884b6f3bc7a28c516e54c26a71b1b769f55867a7
--- /dev/null
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -0,0 +1,32 @@
+/* Arm BF16 intrinsics include file.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+typedef __bf16 bfloat16_t;
+
+#endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c7425346b86b5f5310a7148c465497b53ac75bf5..eaba156e26cf35b07b96972fe2741a9c00d6caa9 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -73,6 +73,9 @@ typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
 typedef struct int8x8x2_t
 {
   int8x8_t val[2];
@@ -34606,6 +34609,8 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+#include "arm_bf16.h"
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e5fa31f6748ee81d4323f11544fd8edb19d9af43..b0be5492e5e928daae93fde08c046150eab631e2 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -57,9 +57,16 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_HF [HF SF DF])
 
+;; Iterator for all 16-bit scalar floating point modes (HF, BF)
+(define_mode_iterator HFBF [HF BF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
+;; Iterator for all scalar floating point modes suitable for moving, including
+;; special BF type (HF, SF, DF, TF and BF)
+(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF])
+
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
 
@@ -79,6 +86,9 @@
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
+;; Double vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; All modes stored in registers d0-d31.
 (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 
@@ -97,6 +107,12 @@
 ;; Copy of the above.
 (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Quad vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
+
+;; VQMOV without 2-element modes.
+(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])
+
 ;; Quad integer vector modes.
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 
@@ -160,6 +176,11 @@
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; including special Bfloat vector types.
+(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				   V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -745,6 +766,7 @@
 			  (V2SI "2") (V4SI "4")
 				     (V2DI "2")
 			  (V4HF "4") (V8HF "8")
+			  (V4BF "4") (V8BF "8")
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")])
@@ -885,7 +907,8 @@
 			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF  "16b")
 			  (DI   "8b")  (DF    "8b")
-			  (SI   "8b")  (SF    "8b")])
+			  (SI   "8b")  (SF    "8b")
+			  (V4BF "8b")  (V8BF  "16b")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
@@ -965,12 +988,13 @@
 			 (V2SI "SI")    (V4SI  "V2SI")
 			 (V2DI "DI")    (V2SF  "SF")
 			 (V4SF "V2SF")  (V4HF "V2HF")
-			 (V8HF "V4HF")  (V2DF  "DF")])
+			 (V8HF "V4HF")  (V2DF  "DF")
+			 (V8BF "V4BF")])
 
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V4HI "v2hi")  (V8HI  "v4hi")
-			 (V8HF  "v4hf")
+			 (V8HF  "v4hf") (V8BF  "v4bf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
 			 (V4SF "v2sf")  (V2DF  "df")])
@@ -1265,6 +1289,7 @@
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
 		     (V4HF "") (V8HF "_q")
+		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
 			       (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")])
diff --git a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
index 5740c0281b2fdf8bbc11d9428ca2f6ba8f1760a0..50c1452ed83c8a2f4ad3b162931292db328813c6 100644
--- a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
+++ b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
@@ -14,6 +14,7 @@ void f4 (uint16x4_t a) {}
 void f5 (uint32x2_t a) {}
 void f23 (uint64x1_t a) {}
 void f61 (float16x4_t a) {}
+void f62 (bfloat16x4_t a) {}
 void f6 (float32x2_t a) {}
 void f7 (poly8x8_t a) {}
 void f8 (poly16x4_t a) {}
@@ -27,6 +28,7 @@ void f14 (uint16x8_t a) {}
 void f15 (uint32x4_t a) {}
 void f16 (uint64x2_t a) {}
 void f171 (float16x8_t a) {}
+void f172 (bfloat16x8_t a) {}
 void f17 (float32x4_t a) {}
 void f18 (float64x2_t a) {}
 void f19 (poly8x16_t a) {}
@@ -45,6 +47,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z2f512__Uint32x2_t:" } }
 // { dg-final { scan-assembler "_Z3f2312__Uint64x1_t:" } }
 // { dg-final { scan-assembler "_Z3f6113__Float16x4_t:" } }
+// { dg-final { scan-assembler "_Z3f6214__Bfloat16x4_t:" } }
 // { dg-final { scan-assembler "_Z2f613__Float32x2_t:" } }
 // { dg-final { scan-assembler "_Z2f711__Poly8x8_t:" } }
 // { dg-final { scan-assembler "_Z2f812__Poly16x4_t:" } }
@@ -57,6 +60,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z3f1512__Uint32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1612__Uint64x2_t:" } }
 // { dg-final { scan-assembler "_Z4f17113__Float16x8_t:" } }
+// { dg-final { scan-assembler "_Z4f17214__Bfloat16x8_t:" } }
 // { dg-final { scan-assembler "_Z3f1713__Float32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1813__Float64x2_t:" } }
 // { dg-final { scan-assembler "_Z3f1912__Poly8x16_t:" } }
diff --git a/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C
new file mode 100644
index 0000000000000000000000000000000000000000..5426a1814b842db9d73d556bcc228d19f970f466
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C
@@ -0,0 +1,13 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+
+/* Test mangling.  */
+
+/* { dg-final { scan-assembler "\t.global\t_Z1fPu6__bf16" } } */
+void f (__bf16 *x) { }
+
+/* { dg-final { scan-assembler "\t.global\t_Z1gPu6__bf16S_" } } */
+void g (__bf16 *x, __bf16 *y) { }
+
+/* { dg-final { scan-assembler "\t.global\t_ZN1SIu6__bf16u6__bf16E1iE" } } */
+template <typename T, typename U> struct S { static int i; };
+template <> int S<__bf16, __bf16>::i = 3;
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ef43766495c8f7bc628e658b2818bdc5b8bea247
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c
@@ -0,0 +1,102 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+   register bfloat16_t x asm ("w2");
+   asm volatile ("" : "=r" (x));
+   *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+   register bfloat16_t y asm ("w2");
+   y = *ptr;
+   asm volatile ("" :: "r" (y));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..df8e7518c24c6534f04f1e1b3c50e2655f69bf95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c
@@ -0,0 +1,106 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+   register bfloat16_t x asm ("w2");
+   asm volatile ("" : "=r" (x));
+   *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+   register bfloat16_t y asm ("w2");
+   y = *ptr;
+   asm volatile ("" :: "r" (y));
+}
+
+#pragma GCC pop_options
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d7a4317ceefbdd411062fe506e3bf9461d98bf8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c
@@ -0,0 +1,101 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+   register bfloat16_t x asm ("w2");
+   asm volatile ("" : "=r" (x));
+   *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+   register bfloat16_t y asm ("w2");
+   y = *ptr;
+   asm volatile ("" :: "r" (y));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..b812011c223b257fe405ef210d24bf5edc3535c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c
@@ -0,0 +1,16 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-std=c99 -pedantic-errors -O3 --save-temps" } */
+
+#include <arm_bf16.h>
+
+_Complex bfloat16_t stacktest1 (_Complex bfloat16_t __a)
+{
+  volatile _Complex bfloat16_t b = __a;
+  return b;
+}
+
+/* { dg-error {ISO C does not support plain 'complex' meaning 'double complex'} "" { target *-*-* } 8 } */
+/* { dg-error {expected '=', ',', ';', 'asm' or '__attribute__' before 'stacktest1'} "" { target *-*-* } 8 } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cad557ebf2cd8e9b2f063d1cc7e9ad4a3e6ac31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c
@@ -0,0 +1,93 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..3891dcfc900ab942bf29eb638d16660a194597e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c
@@ -0,0 +1,97 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b35f5e527be1fe7a6fd928bcd326b57fb376596a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c
@@ -0,0 +1,92 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }

^ permalink raw reply	[flat|nested] 9+ messages in thread
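
As a rough illustration of what this first patch wires up at the source level,
here is a minimal sketch (assuming a toolchain with the patch applied and the
bf16 extension enabled, e.g. -march=armv8.2-a+bf16; the function names are
made up for the example, and only moves are shown, since the patch adds move
patterns but no arithmetic):

#include <arm_neon.h>  /* also pulls in arm_bf16.h, per the patch */

/* Scalar bfloat16_t (__bf16) values are moved through h-registers.  */
bfloat16_t copy_scalar (bfloat16_t x)
{
  volatile bfloat16_t t = x;  /* forces a real store/load round trip */
  return t;
}

/* The 64-bit and 128-bit vector types map to V4BFmode and V8BFmode.  */
bfloat16x4_t copy_d (bfloat16x4_t v) { return v; }
bfloat16x8_t copy_q (bfloat16x8_t v) { return v; }

With the matching mangling change, g++ emits u6__bf16 for the scalar type, so
void f (__bf16 *) mangles to _Z1fPu6__bf16, as the new tests check.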

* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2020-01-09 15:12           ` Stam Markianos-Wright
@ 2020-01-09 15:48             ` Richard Sandiford
  2020-01-10 19:31               ` Stam Markianos-Wright
  0 siblings, 1 reply; 9+ messages in thread
From: Richard Sandiford @ 2020-01-09 15:48 UTC (permalink / raw)
  To: Stam Markianos-Wright
  Cc: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft

Thanks for the update, looks great.

Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
> diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
> new file mode 100644
> index 0000000000000000000000000000000000000000..884b6f3bc7a28c516e54c26a71b1b769f55867a7
> --- /dev/null
> +++ b/gcc/config/aarch64/arm_bf16.h
> @@ -0,0 +1,32 @@
> +/* Arm BF16 intrinsics include file.
> +
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   Contributed by Arm.

Needs to include 2020 now :-)  Maybe 2019-2020 since it was posted
in 2019 and would have been changed to 2019-2020 in the automatic update.
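
That is, the header's copyright line would then read:

   Copyright (C) 2019-2020 Free Software Foundation, Inc.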

Which reminds me to update my patches too...

OK for trunk with that change, thanks.

Richard

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2]
  2020-01-09 15:48             ` Richard Sandiford
@ 2020-01-10 19:31               ` Stam Markianos-Wright
  0 siblings, 0 replies; 9+ messages in thread
From: Stam Markianos-Wright @ 2020-01-10 19:31 UTC (permalink / raw)
  To: gcc-patches, Richard Earnshaw, Kyrylo Tkachov, Marcus Shawcroft,
	Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 996 bytes --]



On 1/9/20 3:42 PM, Richard Sandiford wrote:
> Thanks for the update, looks great.
> 
> Stam Markianos-Wright <Stam.Markianos-Wright@arm.com> writes:
>> diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..884b6f3bc7a28c516e54c26a71b1b769f55867a7
>> --- /dev/null
>> +++ b/gcc/config/aarch64/arm_bf16.h
>> @@ -0,0 +1,32 @@
>> +/* Arm BF16 intrinsics include file.
>> +
>> +   Copyright (C) 2019 Free Software Foundation, Inc.
>> +   Contributed by Arm.
> 
> Needs to include 2020 now :-)  Maybe 2019-2020 since it was posted
> in 2019 and would have been changed to 2019-2020 in the automatic update.
> 
> Which reminds me to update my patches too...
> 
> OK for trunk with that change, thanks.

Done and committed as 280129.

Diff attached for reference (and as an attempt to keep myself sane and not mix 
it all up!)

Cheers,
Stam

> 
> Richard
> 



[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: BFmode1of2-final.patch --]
[-- Type: text/x-patch; name="BFmode1of2-final.patch", Size: 39286 bytes --]

diff --git a/gcc/config.gcc b/gcc/config.gcc
index c3d6464f3e6adaa1db818a61de00cff8e00ae08e..075e46072d1643302b9587d4e3f14f2e29b4ec8d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -315,7 +315,7 @@ m32c*-*-*)
         ;;
 aarch64*-*-*)
 	cpu_type=aarch64
-	extra_headers="arm_fp16.h arm_neon.h arm_acle.h arm_sve.h"
+	extra_headers="arm_fp16.h arm_neon.h arm_bf16.h arm_acle.h arm_sve.h"
 	c_target_objs="aarch64-c.o"
 	cxx_target_objs="aarch64-c.o"
 	d_target_objs="aarch64-d.o"
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 1bd2640a1ced352de232fed1cf134b46c69b80f7..b2d6b761489183c262320d62293bec343b315c11 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -68,6 +68,9 @@
 #define hi_UP    E_HImode
 #define hf_UP    E_HFmode
 #define qi_UP    E_QImode
+#define bf_UP    E_BFmode
+#define v4bf_UP  E_V4BFmode
+#define v8bf_UP  E_V8BFmode
 #define UP(X) X##_UP
 
 #define SIMD_MAX_BUILTIN_ARGS 5
@@ -568,6 +571,10 @@ static tree aarch64_simd_intXI_type_node = NULL_TREE;
 tree aarch64_fp16_type_node = NULL_TREE;
 tree aarch64_fp16_ptr_type_node = NULL_TREE;
 
+/* Back-end node type for brain float (bfloat) types.  */
+tree aarch64_bf16_type_node = NULL_TREE;
+tree aarch64_bf16_ptr_type_node = NULL_TREE;
+
 /* Wrapper around add_builtin_function.  NAME is the name of the built-in
    function, TYPE is the function type, and CODE is the function subcode
    (relative to AARCH64_BUILTIN_GENERAL).  */
@@ -659,6 +666,8 @@ aarch64_simd_builtin_std_type (machine_mode mode,
       return float_type_node;
     case E_DFmode:
       return double_type_node;
+    case E_BFmode:
+      return aarch64_bf16_type_node;
     default:
       gcc_unreachable ();
     }
@@ -750,6 +759,10 @@ aarch64_init_simd_builtin_types (void)
   aarch64_simd_types[Float64x1_t].eltype = double_type_node;
   aarch64_simd_types[Float64x2_t].eltype = double_type_node;
 
+  /* Init Bfloat vector types with underlying __bf16 type.  */
+  aarch64_simd_types[Bfloat16x4_t].eltype = aarch64_bf16_type_node;
+  aarch64_simd_types[Bfloat16x8_t].eltype = aarch64_bf16_type_node;
+
   for (i = 0; i < nelts; i++)
     {
       tree eltype = aarch64_simd_types[i].eltype;
@@ -1059,6 +1072,19 @@ aarch64_init_fp16_types (void)
   aarch64_fp16_ptr_type_node = build_pointer_type (aarch64_fp16_type_node);
 }
 
+/* Initialize the backend REAL_TYPE type supporting bfloat types.  */
+static void
+aarch64_init_bf16_types (void)
+{
+  aarch64_bf16_type_node = make_node (REAL_TYPE);
+  TYPE_PRECISION (aarch64_bf16_type_node) = 16;
+  SET_TYPE_MODE (aarch64_bf16_type_node, BFmode);
+  layout_type (aarch64_bf16_type_node);
+
+  lang_hooks.types.register_builtin_type (aarch64_bf16_type_node, "__bf16");
+  aarch64_bf16_ptr_type_node = build_pointer_type (aarch64_bf16_type_node);
+}
+
 /* Pointer authentication builtins that will become NOP on legacy platform.
    Currently, these builtins are for internal use only (libgcc EH unwinder).  */
 
@@ -1214,6 +1240,8 @@ aarch64_general_init_builtins (void)
 
   aarch64_init_fp16_types ();
 
+  aarch64_init_bf16_types ();
+
   if (TARGET_SIMD)
     aarch64_init_simd_builtins ();
 
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def
index 6cd8ed0972ad7029e0319aad71d3afbda5684a4f..1eeb8d884520b1a53b8a580f165d42858c03228c 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -69,6 +69,13 @@ VECTOR_MODES (FLOAT, 16);     /*            V4SF V2DF.  */
 VECTOR_MODE (FLOAT, DF, 1);   /*                 V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF.  */
 
+/* Bfloat16 modes.  */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
+VECTOR_MODE (FLOAT, BF, 4);   /*		 V4BF.  */
+VECTOR_MODE (FLOAT, BF, 8);   /*		 V8BF.  */
+
 /* Oct Int: 256-bit integer mode needed for 32-byte vector arguments.  */
 INT_MODE (OI, 32);
 
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index 76d4d130013d7498a23728337b63875958273a54..e885755bc927d1174dce8d490636df463b76d2f8 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -50,3 +50,5 @@
   ENTRY (Float32x4_t, V4SF, none, 13)
   ENTRY (Float64x1_t, V1DF, none, 13)
   ENTRY (Float64x2_t, V2DF, none, 13)
+  ENTRY (Bfloat16x4_t, V4BF, none, 14)
+  ENTRY (Bfloat16x8_t, V8BF, none, 14)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 4e28cf97516df19e1d502e56c776f6b34f15c116..cea9592695ac8bd2f4e625f8b769ddaf716e9091 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -19,8 +19,8 @@
 ;; <http://www.gnu.org/licenses/>.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
-	(match_operand:VALL_F16 1 "general_operand"))]
+  [(set (match_operand:VALL_F16MOV 0 "nonimmediate_operand")
+	(match_operand:VALL_F16MOV 1 "general_operand"))]
   "TARGET_SIMD"
   "
   /* Force the operand into a register if it is not an
@@ -101,10 +101,10 @@
   [(set_attr "type" "neon_dup<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VD:mode>"
-  [(set (match_operand:VD 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VDMOV:mode>"
+  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
 		"=w, m,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VD 1 "general_operand"
+	(match_operand:VDMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -129,10 +129,10 @@
 		     mov_reg, neon_move<q>")]
 )
 
-(define_insn "*aarch64_simd_mov<VQ:mode>"
-  [(set (match_operand:VQ 0 "nonimmediate_operand"
+(define_insn "*aarch64_simd_mov<VQMOV:mode>"
+  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
 		"=w, Umn,  m,  w, ?r, ?w, ?r, w")
-	(match_operand:VQ 1 "general_operand"
+	(match_operand:VQMOV 1 "general_operand"
 		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
@@ -234,8 +234,8 @@
 
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-      (match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+      (match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && GP_REGNUM_P (REGNO (operands[0]))
    && GP_REGNUM_P (REGNO (operands[1]))"
@@ -246,8 +246,8 @@
 })
 
 (define_split
-  [(set (match_operand:VQ 0 "register_operand" "")
-        (match_operand:VQ 1 "register_operand" ""))]
+  [(set (match_operand:VQMOV 0 "register_operand" "")
+        (match_operand:VQMOV 1 "register_operand" ""))]
   "TARGET_SIMD && reload_completed
    && ((FP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1])))
        || (GP_REGNUM_P (REGNO (operands[0])) && FP_REGNUM_P (REGNO (operands[1]))))"
@@ -258,8 +258,8 @@
 })
 
 (define_expand "@aarch64_split_simd_mov<mode>"
-  [(set (match_operand:VQ 0)
-        (match_operand:VQ 1))]
+  [(set (match_operand:VQMOV 0)
+        (match_operand:VQMOV 1))]
   "TARGET_SIMD"
   {
     rtx dst = operands[0];
@@ -295,8 +295,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>low"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_lo_half" "")))]
+          (match_operand:VQMOV 1 "register_operand" "w")
+          (match_operand:VQMOV 2 "vect_par_cnst_lo_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[0]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -306,8 +306,8 @@
 (define_insn "aarch64_simd_mov_from_<mode>high"
   [(set (match_operand:<VHALF> 0 "register_operand" "=r")
         (vec_select:<VHALF>
-          (match_operand:VQ 1 "register_operand" "w")
-          (match_operand:VQ 2 "vect_par_cnst_hi_half" "")))]
+          (match_operand:VQMOV 1 "register_operand" "w")
+          (match_operand:VQMOV 2 "vect_par_cnst_hi_half" "")))]
   "TARGET_SIMD && reload_completed"
   "umov\t%0, %1.d[1]"
   [(set_attr "type" "neon_to_gp<q>")
@@ -1471,8 +1471,8 @@
 ;; On big-endian this is { zeroes, operand }
 
 (define_insn "move_lo_quad_internal_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")
 	  (vec_duplicate:<VHALF> (const_int 0))))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
@@ -1501,8 +1501,8 @@
 )
 
 (define_insn "move_lo_quad_internal_be_<mode>"
-  [(set (match_operand:VQ_NO2E 0 "register_operand" "=w,w,w")
-	(vec_concat:VQ_NO2E
+  [(set (match_operand:VQMOV_NO2E 0 "register_operand" "=w,w,w")
+	(vec_concat:VQMOV_NO2E
 	  (vec_duplicate:<VHALF> (const_int 0))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r,r")))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
@@ -1531,8 +1531,8 @@
 )
 
 (define_expand "move_lo_quad_<mode>"
-  [(match_operand:VQ 0 "register_operand")
-   (match_operand:VQ 1 "register_operand")]
+  [(match_operand:VQMOV 0 "register_operand")
+   (match_operand:VQMOV 1 "register_operand")]
   "TARGET_SIMD"
 {
   if (BYTES_BIG_ENDIAN)
@@ -1549,11 +1549,11 @@
 ;; For big-endian this is { operand1, operand2 }
 
 (define_insn "aarch64_simd_move_hi_quad_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))
 	  (match_operand:<VHALF> 1 "register_operand" "w,r")))]
   "TARGET_SIMD && !BYTES_BIG_ENDIAN"
   "@
@@ -1563,12 +1563,12 @@
 )
 
 (define_insn "aarch64_simd_move_hi_quad_be_<mode>"
-  [(set (match_operand:VQ 0 "register_operand" "+w,w")
-        (vec_concat:VQ
+  [(set (match_operand:VQMOV 0 "register_operand" "+w,w")
+        (vec_concat:VQMOV
 	  (match_operand:<VHALF> 1 "register_operand" "w,r")
           (vec_select:<VHALF>
                 (match_dup 0)
-                (match_operand:VQ 2 "vect_par_cnst_lo_half" ""))))]
+                (match_operand:VQMOV 2 "vect_par_cnst_lo_half" ""))))]
   "TARGET_SIMD && BYTES_BIG_ENDIAN"
   "@
    ins\\t%0.d[1], %1.d[0]
@@ -1577,7 +1577,7 @@
 )
 
 (define_expand "move_hi_quad_<mode>"
- [(match_operand:VQ 0 "register_operand")
+ [(match_operand:VQMOV 0 "register_operand")
   (match_operand:<VHALF> 1 "register_operand")]
  "TARGET_SIMD"
 {
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 04dabd46437cf650e38e085d219c4e629b537e67..b04922056106ad2060d72b99fb49d57fd2b50f4b 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -1120,13 +1120,13 @@ extern enum aarch64_code_model aarch64_cmodel;
 #define AARCH64_VALID_SIMD_DREG_MODE(MODE) \
   ((MODE) == V2SImode || (MODE) == V4HImode || (MODE) == V8QImode \
    || (MODE) == V2SFmode || (MODE) == V4HFmode || (MODE) == DImode \
-   || (MODE) == DFmode)
+   || (MODE) == DFmode || (MODE) == V4BFmode)
 
 /* Modes valid for AdvSIMD Q registers.  */
 #define AARCH64_VALID_SIMD_QREG_MODE(MODE) \
   ((MODE) == V4SImode || (MODE) == V8HImode || (MODE) == V16QImode \
    || (MODE) == V4SFmode || (MODE) == V8HFmode || (MODE) == V2DImode \
-   || (MODE) == V2DFmode)
+   || (MODE) == V2DFmode || (MODE) == V8BFmode)
 
 #define ENDIAN_LANE_N(NUNITS, N) \
   (BYTES_BIG_ENDIAN ? NUNITS - 1 - N : N)
@@ -1174,6 +1174,11 @@ extern const char *host_detect_local_cpu (int argc, const char **argv);
 extern tree aarch64_fp16_type_node;
 extern tree aarch64_fp16_ptr_type_node;
 
+/* This type is the user-visible __bf16, and a pointer to that type.  Defined
+   in aarch64-builtins.c.  */
+extern tree aarch64_bf16_type_node;
+extern tree aarch64_bf16_ptr_type_node;
+
 /* The generic unwind code in libgcc does not initialize the frame pointer.
    So in order to unwind a function using a frame pointer, the very first
    function that is unwound must save the frame pointer.  That way the frame
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 85cadef1be819b3c1ad68ae70e755e0150ad6469..ebd3f6cf45bc0b5118c4c39e323e6380d64c885e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1692,6 +1692,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V2SImode:
     /* ...E_V1DImode doesn't exist.  */
     case E_V4HFmode:
+    case E_V4BFmode:
     case E_V2SFmode:
     case E_V1DFmode:
     /* 128-bit Advanced SIMD vectors.  */
@@ -1700,6 +1701,7 @@ aarch64_classify_vector_mode (machine_mode mode)
     case E_V4SImode:
     case E_V2DImode:
     case E_V8HFmode:
+    case E_V8BFmode:
     case E_V4SFmode:
     case E_V2DFmode:
       return TARGET_SIMD ? VEC_ADVSIMD : 0;
@@ -15603,6 +15605,10 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
 	  field_t = aarch64_fp16_type_node;
 	  field_ptr_t = aarch64_fp16_ptr_type_node;
 	  break;
+	case E_BFmode:
+	  field_t = aarch64_bf16_type_node;
+	  field_ptr_t = aarch64_bf16_ptr_type_node;
+	  break;
 	case E_V2SImode:
 	case E_V4SImode:
 	    {
@@ -16116,6 +16122,8 @@ aarch64_vq_mode (scalar_mode mode)
       return V4SFmode;
     case E_HFmode:
       return V8HFmode;
+    case E_BFmode:
+      return V8BFmode;
     case E_SImode:
       return V4SImode;
     case E_HImode:
@@ -16149,6 +16157,8 @@ aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
 	    return V2SFmode;
 	  case E_HFmode:
 	    return V4HFmode;
+	  case E_BFmode:
+	    return V4BFmode;
 	  case E_SImode:
 	    return V2SImode;
 	  case E_HImode:
@@ -16263,9 +16273,14 @@ aarch64_mangle_type (const_tree type)
   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
     return "St9__va_list";
 
-  /* Half-precision float.  */
+  /* Half-precision floating point types.  */
   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
-    return "Dh";
+    {
+      if (TYPE_MODE (type) == BFmode)
+	return "u6__bf16";
+      else
+	return "Dh";
+    }
 
   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
      builtin types.  */
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 34cb99e28975de2ef10d7f4202417e2f05a870a2..85106910f7459d1211e729c73f222f99f04e6d7f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1304,8 +1304,8 @@
 })
 
 (define_expand "mov<mode>"
-  [(set (match_operand:GPF_TF_F16 0 "nonimmediate_operand")
-	(match_operand:GPF_TF_F16 1 "general_operand"))]
+  [(set (match_operand:GPF_TF_F16_MOV 0 "nonimmediate_operand")
+	(match_operand:GPF_TF_F16_MOV 1 "general_operand"))]
   ""
   {
     if (!TARGET_FLOAT)
@@ -1321,11 +1321,11 @@
   }
 )
 
-(define_insn "*movhf_aarch64"
-  [(set (match_operand:HF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
-	(match_operand:HF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
-  "TARGET_FLOAT && (register_operand (operands[0], HFmode)
-    || aarch64_reg_or_fp_zero (operands[1], HFmode))"
+(define_insn "*mov<mode>_aarch64"
+  [(set (match_operand:HFBF 0 "nonimmediate_operand" "=w,w  , w,?r,w,w  ,w  ,w,m,r,m ,r")
+	(match_operand:HFBF 1 "general_operand"      "Y ,?rY,?r, w,w,Ufc,Uvi,m,w,m,rY,r"))]
+  "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
+    || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
   "@
    movi\\t%0.4h, #0
    fmov\\t%h0, %w1
diff --git a/gcc/config/aarch64/arm_bf16.h b/gcc/config/aarch64/arm_bf16.h
new file mode 100644
index 0000000000000000000000000000000000000000..3759c0d1cb449a7f0125cc2a1433127564d66622
--- /dev/null
+++ b/gcc/config/aarch64/arm_bf16.h
@@ -0,0 +1,32 @@
+/* Arm BF16 intrinsics include file.
+
+   Copyright (C) 2019-2020 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+typedef __bf16 bfloat16_t;
+
+#endif
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c7425346b86b5f5310a7148c465497b53ac75bf5..eaba156e26cf35b07b96972fe2741a9c00d6caa9 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -73,6 +73,9 @@ typedef __fp16 float16_t;
 typedef float float32_t;
 typedef double float64_t;
 
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
 typedef struct int8x8x2_t
 {
   int8x8_t val[2];
@@ -34606,6 +34609,8 @@ vrnd64xq_f64 (float64x2_t __a)
 
 #pragma GCC pop_options
 
+#include "arm_bf16.h"
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e5fa31f6748ee81d4323f11544fd8edb19d9af43..b0be5492e5e928daae93fde08c046150eab631e2 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -57,9 +57,16 @@
 ;; Iterator for all scalar floating point modes (HF, SF, DF)
 (define_mode_iterator GPF_HF [HF SF DF])
 
+;; Iterator for all 16-bit scalar floating point modes (HF, BF)
+(define_mode_iterator HFBF [HF BF])
+
 ;; Iterator for all scalar floating point modes (HF, SF, DF and TF)
 (define_mode_iterator GPF_TF_F16 [HF SF DF TF])
 
+;; Iterator for all scalar floating point modes suitable for moving, including
+;; special BF type (HF, SF, DF, TF and BF)
+(define_mode_iterator GPF_TF_F16_MOV [HF BF SF DF TF])
+
 ;; Double vector modes.
 (define_mode_iterator VDF [V2SF V4HF])
 
@@ -79,6 +86,9 @@
 ;; Double vector modes.
 (define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF])
 
+;; Double vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF])
+
 ;; All modes stored in registers d0-d31.
 (define_mode_iterator DREG [V8QI V4HI V4HF V2SI V2SF DF])
 
@@ -97,6 +107,12 @@
 ;; Copy of the above.
 (define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF])
 
+;; Quad vector modes suitable for moving.  Includes BFmode.
+(define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF])
+
+;; VQMOV without 2-element modes.
+(define_mode_iterator VQMOV_NO2E [V16QI V8HI V4SI V8HF V8BF V4SF])
+
 ;; Quad integer vector modes.
 (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI])
 
@@ -160,6 +176,11 @@
 (define_mode_iterator VALL_F16 [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
 				V4HF V8HF V2SF V4SF V2DF])
 
+;; All Advanced SIMD modes suitable for moving, loading, and storing,
+;; including special Bfloat vector types.
+(define_mode_iterator VALL_F16MOV [V8QI V16QI V4HI V8HI V2SI V4SI V2DI
+				   V4HF V8HF V4BF V8BF V2SF V4SF V2DF])
+
 ;; The VALL_F16 modes except the 128-bit 2-element ones.
 (define_mode_iterator VALL_F16_NO_V2Q [V8QI V16QI V4HI V8HI V2SI V4SI
 				V4HF V8HF V2SF V4SF])
@@ -745,6 +766,7 @@
 			  (V2SI "2") (V4SI "4")
 				     (V2DI "2")
 			  (V4HF "4") (V8HF "8")
+			  (V4BF "4") (V8BF "8")
 			  (V2SF "2") (V4SF "4")
 			  (V1DF "1") (V2DF "2")
 			  (DI "1") (DF "1")])
@@ -885,7 +907,8 @@
 			  (V8HF "16b") (V2SF  "8b")
 			  (V4SF "16b") (V2DF  "16b")
 			  (DI   "8b")  (DF    "8b")
-			  (SI   "8b")  (SF    "8b")])
+			  (SI   "8b")  (SF    "8b")
+			  (V4BF "8b")  (V8BF  "16b")])
 
 ;; Define element mode for each vector mode.
 (define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
@@ -965,12 +988,13 @@
 			 (V2SI "SI")    (V4SI  "V2SI")
 			 (V2DI "DI")    (V2SF  "SF")
 			 (V4SF "V2SF")  (V4HF "V2HF")
-			 (V8HF "V4HF")  (V2DF  "DF")])
+			 (V8HF "V4HF")  (V2DF  "DF")
+			 (V8BF "V4BF")])
 
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V4HI "v2hi")  (V8HI  "v4hi")
-			 (V8HF  "v4hf")
+			 (V8HF  "v4hf") (V8BF  "v4bf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
 			 (V4SF "v2sf")  (V2DF  "df")])
@@ -1265,6 +1289,7 @@
 		     (V2SI "") (V4SI  "_q")
 		     (DI   "") (V2DI  "_q")
 		     (V4HF "") (V8HF "_q")
+		     (V4BF "") (V8BF "_q")
 		     (V2SF "") (V4SF  "_q")
 			       (V2DF  "_q")
 		     (QI "") (HI "") (SI "") (DI "") (HF "") (SF "") (DF "")])
diff --git a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
index 5740c0281b2fdf8bbc11d9428ca2f6ba8f1760a0..50c1452ed83c8a2f4ad3b162931292db328813c6 100644
--- a/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
+++ b/gcc/testsuite/g++.dg/abi/mangle-neon-aarch64.C
@@ -14,6 +14,7 @@ void f4 (uint16x4_t a) {}
 void f5 (uint32x2_t a) {}
 void f23 (uint64x1_t a) {}
 void f61 (float16x4_t a) {}
+void f62 (bfloat16x4_t a) {}
 void f6 (float32x2_t a) {}
 void f7 (poly8x8_t a) {}
 void f8 (poly16x4_t a) {}
@@ -27,6 +28,7 @@ void f14 (uint16x8_t a) {}
 void f15 (uint32x4_t a) {}
 void f16 (uint64x2_t a) {}
 void f171 (float16x8_t a) {}
+void f172 (bfloat16x8_t a) {}
 void f17 (float32x4_t a) {}
 void f18 (float64x2_t a) {}
 void f19 (poly8x16_t a) {}
@@ -45,6 +47,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z2f512__Uint32x2_t:" } }
 // { dg-final { scan-assembler "_Z3f2312__Uint64x1_t:" } }
 // { dg-final { scan-assembler "_Z3f6113__Float16x4_t:" } }
+// { dg-final { scan-assembler "_Z3f6214__Bfloat16x4_t:" } }
 // { dg-final { scan-assembler "_Z2f613__Float32x2_t:" } }
 // { dg-final { scan-assembler "_Z2f711__Poly8x8_t:" } }
 // { dg-final { scan-assembler "_Z2f812__Poly16x4_t:" } }
@@ -57,6 +60,7 @@ void g1 (int8x16_t, int8x16_t) {}
 // { dg-final { scan-assembler "_Z3f1512__Uint32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1612__Uint64x2_t:" } }
 // { dg-final { scan-assembler "_Z4f17113__Float16x8_t:" } }
+// { dg-final { scan-assembler "_Z4f17214__Bfloat16x8_t:" } }
 // { dg-final { scan-assembler "_Z3f1713__Float32x4_t:" } }
 // { dg-final { scan-assembler "_Z3f1813__Float64x2_t:" } }
 // { dg-final { scan-assembler "_Z3f1912__Poly8x16_t:" } }
diff --git a/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C
new file mode 100644
index 0000000000000000000000000000000000000000..5426a1814b842db9d73d556bcc228d19f970f466
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/arm-bf16/bf16-mangle-aarch64-1.C
@@ -0,0 +1,13 @@
+/* { dg-do compile { target aarch64*-*-* } } */
+
+/* Test mangling.  */
+
+/* { dg-final { scan-assembler "\t.global\t_Z1fPu6__bf16" } } */
+void f (__bf16 *x) { }
+
+/* { dg-final { scan-assembler "\t.global\t_Z1gPu6__bf16S_" } } */
+void g (__bf16 *x, __bf16 *y) { }
+
+/* { dg-final { scan-assembler "\t.global\t_ZN1SIu6__bf16u6__bf16E1iE" } } */
+template <typename T, typename U> struct S { static int i; };
+template <> int S<__bf16, __bf16>::i = 3;
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..ef43766495c8f7bc628e658b2818bdc5b8bea247
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_1.c
@@ -0,0 +1,102 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+   register bfloat16_t x asm ("w2");
+   asm volatile ("" : "=r" (x));
+   *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+   register bfloat16_t y asm ("w2");
+   y = *ptr;
+   asm volatile ("" :: "r" (y));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..df8e7518c24c6534f04f1e1b3c50e2655f69bf95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_2.c
@@ -0,0 +1,106 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+   register bfloat16_t x asm ("w2");
+   asm volatile ("" : "=r" (x));
+   *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+   register bfloat16_t y asm ("w2");
+   y = *ptr;
+   asm volatile ("" :: "r" (y));
+}
+
+#pragma GCC pop_options
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..5d7a4317ceefbdd411062fe506e3bf9461d98bf8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_3.c
@@ -0,0 +1,101 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps -std=gnu90" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_bf16.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**bfloat_mov_ww:
+**	mov	v1.h\[0\], v2.h\[0\]
+**	ret
+*/
+void bfloat_mov_ww (void)
+{
+  register bfloat16_t x asm ("h2");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_rw:
+**	dup	v1.4h, w1
+**	ret
+*/
+void bfloat_mov_rw (void)
+{
+  register bfloat16_t x asm ("w1");
+  register bfloat16_t y asm ("h1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "w" (y));
+}
+
+/*
+**bfloat_mov_wr:
+**	umov	w1, v1.h\[0\]
+**	ret
+*/
+void bfloat_mov_wr (void)
+{
+  register bfloat16_t x asm ("h1");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=w" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rr:
+**	mov	w1, w2
+**	ret
+*/
+void bfloat_mov_rr (void)
+{
+  register bfloat16_t x asm ("w2");
+  register bfloat16_t y asm ("w1");
+  asm volatile ("" : "=r" (x));
+  y = x;
+  asm volatile ("" :: "r" (y));
+}
+
+/*
+**bfloat_mov_rm:
+**	strh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_rm (bfloat16_t *ptr)
+{
+   register bfloat16_t x asm ("w2");
+   asm volatile ("" : "=r" (x));
+   *ptr = x;
+}
+
+/*
+**bfloat_mov_mr:
+**	ldrh	w2, \[x0\]
+**	ret
+*/
+void bfloat_mov_mr (bfloat16_t *ptr)
+{
+   register bfloat16_t y asm ("w2");
+   y = *ptr;
+   asm volatile ("" :: "r" (y));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..b812011c223b257fe405ef210d24bf5edc3535c0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_scalar_4.c
@@ -0,0 +1,16 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-std=c99 -pedantic-errors -O3 --save-temps" } */
+
+#include <arm_bf16.h>
+
+_Complex bfloat16_t stacktest1 (_Complex bfloat16_t __a)
+{
+  volatile _Complex bfloat16_t b = __a;
+  return b;
+}
+
+/* { dg-error {ISO C does not support plain 'complex' meaning 'double complex'} "" { target *-*-* } 8 } */
+/* { dg-error {expected '=', ',', ';', 'asm' or '__attribute__' before 'stacktest1'} "" { target *-*-* } 8 } */
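+
+/* _Complex may only be combined with the standard floating types, so the
+   declaration on line 8 is diagnosed twice: plain '_Complex' is an ISO C
+   violation under -pedantic-errors, and 'bfloat16_t stacktest1' then
+   fails to parse as the rest of the declaration.  */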
+
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6cad557ebf2cd8e9b2f063d1cc7e9ad4a3e6ac31
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_1.c
@@ -0,0 +1,93 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */
+/* { dg-additional-options "-O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
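+/* Only the 64-bit and 128-bit types fit in a single SIMD register; GCC's
+   generic vector support lowers the wider ones into multiple operations.
+   These functions carry no assembly checks and only need to compile.  */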
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
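+/* Casting an integer constant to a same-sized vector type reinterprets
+   its bits, filling the two bfloat16 lanes of v2bf and the four lanes of
+   bfloat16x4_t respectively.  */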
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..3891dcfc900ab942bf29eb638d16660a194597e4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_2.c
@@ -0,0 +1,97 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+#pragma GCC push_options
+#pragma GCC target ("+bf16")
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }
+
+#pragma GCC pop_options
diff --git a/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b35f5e527be1fe7a6fd928bcd326b57fb376596a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bfloat16_simd_3.c
@@ -0,0 +1,92 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-additional-options "-march=armv8.2-a -O3 --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+
+/*
+**stacktest1:
+**	sub	sp, sp, #16
+**	str	h0, \[sp, 14\]
+**	ldr	h0, \[sp, 14\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16_t stacktest1 (bfloat16_t __a)
+{
+  volatile bfloat16_t b = __a;
+  return b;
+}
+
+/*
+**stacktest2:
+**	sub	sp, sp, #16
+**	str	d0, \[sp, 8\]
+**	ldr	d0, \[sp, 8\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x4_t stacktest2 (bfloat16x4_t __a)
+{
+  volatile bfloat16x4_t b = __a;
+  return b;
+}
+
+/*
+**stacktest3:
+**	sub	sp, sp, #16
+**	str	q0, \[sp\]
+**	ldr	q0, \[sp\]
+**	add	sp, sp, 16
+**	ret
+*/
+bfloat16x8_t stacktest3 (bfloat16x8_t __a)
+{
+  volatile bfloat16x8_t b = __a;
+  return b;
+}
+
+/*  Test compilation of __attribute__ vectors of 8, 16, 32, etc. BFloats.  */
+typedef bfloat16_t v8bf __attribute__((vector_size(16)));
+typedef bfloat16_t v16bf __attribute__((vector_size(32)));
+typedef bfloat16_t v32bf __attribute__((vector_size(64)));
+typedef bfloat16_t v64bf __attribute__((vector_size(128)));
+typedef bfloat16_t v128bf __attribute__((vector_size(256)));
+
+v8bf stacktest4 (v8bf __a)
+{
+  volatile v8bf b = __a;
+  return b;
+}
+
+v16bf stacktest5 (v16bf __a)
+{
+  volatile v16bf b = __a;
+  return b;
+}
+
+v32bf stacktest6 (v32bf __a)
+{
+  volatile v32bf b = __a;
+  return b;
+}
+
+v64bf stacktest7 (v64bf __a)
+{
+  volatile v64bf b = __a;
+  return b;
+}
+
+v128bf stacktest8 (v128bf __a)
+{
+  volatile v128bf b = __a;
+  return b;
+}
+
+/* Test use of constant values to assign values to vectors.  */
+
+typedef bfloat16_t v2bf __attribute__((vector_size(4)));
+v2bf c2 (void) { return (v2bf) 0x12345678; }
+
+bfloat16x4_t c3 (void) { return (bfloat16x4_t) 0x1234567812345678; }



Thread overview: 9+ messages
2019-12-18 16:35 [GCC][PATCH][Aarch64] Add Bfloat16_t scalar type, vector types and machine modes to Aarch64 back-end [1/2] Stam Markianos-Wright
2019-12-19 10:07 ` Richard Sandiford
2019-12-23 16:57   ` Stam Markianos-Wright
2019-12-23 17:07     ` Richard Sandiford
2020-01-07 11:42       ` Stam Markianos-Wright
2020-01-07 17:15         ` Richard Sandiford
2020-01-09 15:12           ` Stam Markianos-Wright
2020-01-09 15:48             ` Richard Sandiford
2020-01-10 19:31               ` Stam Markianos-Wright
