public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [AArch64, 2/6] Reimplement vector fixed-point intrinsics
       [not found] ` <57430271.3070504@foss.arm.com>
@ 2016-05-24  8:24   ` Jiong Wang
       [not found]   ` <5743029C.60208@foss.arm.com>
  1 sibling, 0 replies; 28+ messages in thread
From: Jiong Wang @ 2016-05-24  8:24 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1320 bytes --]

Based on top of [1/6], this patch reimplements the vector intrinsics for
conversion between floating-point and fixed-point.

gcc/
2016-05-23  Jiong Wang <jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.def (scvtf): New builtins for 
vector types.
         (ucvtf): Likewise.
         (fcvtzs): Likewise.
         (fcvtzu): Likewise.
         * config/aarch64/aarch64-simd.md
         (<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3): Extend to more modes.
         Rename to <FCVT_F2FIXED:fcvt_fixed_insn><VALLF:mode>3.
         (<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3): Likewise and 
rename to
<FCVT_FIXED2F:fcvt_fixed_insn><VALLI:mode>3.
         * config/aarch64/arm_neon.h (vcvt_n_f32_s32): Remove inline 
assembly.
         Use builtin.
         (vcvt_n_f32_u32): Likewise.
         (vcvt_n_s32_f32): Likewise.
         (vcvt_n_u32_f32): Likewise.
         (vcvtq_n_f32_s32): Likewise.
         (vcvtq_n_f32_u32): Likewise.
         (vcvtq_n_f64_s64): Likewise.
         (vcvtq_n_f64_u64): Likewise.
         (vcvtq_n_s32_f32): Likewise.
         (vcvtq_n_s64_f64): Likewise.
         (vcvtq_n_u32_f32): Likewise.
         (vcvtq_n_u64_f64): Likewise.
         * config/aarch64/iterators.md (VALLI): New mode iterator.
         (fcvt_target): Support V2DI, V4SI and V2SI.
         (FCVT_TARGET): Likewise.

[-- Attachment #2: 0002-2.patch --]
[-- Type: text/x-patch, Size: 16218 bytes --]

From 63e8362e7d0afc2f4dd4288d38d3f64b62bfd657 Mon Sep 17 00:00:00 2001
From: "Jiong.Wang" <jiong.wang@arm.com>
Date: Mon, 23 May 2016 12:12:04 +0100
Subject: [PATCH 2/6] 2

---
 gcc/config/aarch64/aarch64-builtins.def |   4 +
 gcc/config/aarch64/aarch64-simd.md      |  22 ++--
 gcc/config/aarch64/arm_neon.h           | 216 +++++++++++---------------------
 gcc/config/aarch64/iterators.md         |   5 +
 4 files changed, 92 insertions(+), 155 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.def b/gcc/config/aarch64/aarch64-builtins.def
index 4528db3..5e6280c 100644
--- a/gcc/config/aarch64/aarch64-builtins.def
+++ b/gcc/config/aarch64/aarch64-builtins.def
@@ -455,3 +455,7 @@
   BUILTIN_GPI (BINOP, fcvtzsdf, 3)
   BUILTIN_GPI (BINOP_USS, fcvtzusf, 3)
   BUILTIN_GPI (BINOP_USS, fcvtzudf, 3)
+  BUILTIN_VALLI (BINOP, scvtf, 3)
+  BUILTIN_VALLI (BINOP_SUS, ucvtf, 3)
+  BUILTIN_VALLF (BINOP, fcvtzs, 3)
+  BUILTIN_VALLF (BINOP_USS, fcvtzu, 3)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 670c690..66ca2de 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1778,26 +1778,26 @@
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
-;; Convert between fixed-point and floating-point (scalar variant from SIMD)
+;; Convert between fixed-point and floating-point (SIMD)
 
-(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3"
-  [(set (match_operand:<GPF:FCVT_TARGET> 0 "register_operand" "=w")
-	(unspec:<GPF:FCVT_TARGET> [(match_operand:GPF 1 "register_operand" "w")
-				   (match_operand:SI 2 "immediate_operand" "i")]
+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><VALLF:mode>3"
+  [(set (match_operand:<VALLF:FCVT_TARGET> 0 "register_operand" "=w")
+	(unspec:<VALLF:FCVT_TARGET> [(match_operand:VALLF 1 "register_operand" "w")
+				     (match_operand:SI 2 "immediate_operand" "i")]
 	 FCVT_F2FIXED))]
   "TARGET_SIMD"
   "<FCVT_F2FIXED:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
-  [(set_attr "type" "neon_fp_to_int_<GPF:Vetype><q>")]
+  [(set_attr "type" "neon_fp_to_int_<VALLF:Vetype><q>")]
 )
 
-(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3"
-  [(set (match_operand:<GPI:FCVT_TARGET> 0 "register_operand" "=w")
-	(unspec:<GPI:FCVT_TARGET> [(match_operand:GPI 1 "register_operand" "w")
-				   (match_operand:SI 2 "immediate_operand" "i")]
+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><VALLI:mode>3"
+  [(set (match_operand:<VALLI:FCVT_TARGET> 0 "register_operand" "=w")
+	(unspec:<VALLI:FCVT_TARGET> [(match_operand:VALLI 1 "register_operand" "w")
+				     (match_operand:SI 2 "immediate_operand" "i")]
 	 FCVT_FIXED2F))]
   "TARGET_SIMD"
   "<FCVT_FIXED2F:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
-  [(set_attr "type" "neon_int_to_fp_<GPI:Vetype><q>")]
+  [(set_attr "type" "neon_int_to_fp_<VALLI:Vetype><q>")]
 )
 
 ;; ??? Note that the vectorizer usage of the vec_unpacks_[lo/hi] patterns
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 012a11a..bd712fc 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6025,150 +6025,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-#define vcvt_n_f32_s32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t a_ = (a);                                              \
-       float32x2_t result;                                              \
-       __asm__ ("scvtf %0.2s, %1.2s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvt_n_f32_u32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t a_ = (a);                                             \
-       float32x2_t result;                                              \
-       __asm__ ("ucvtf %0.2s, %1.2s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvt_n_s32_f32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t a_ = (a);                                            \
-       int32x2_t result;                                                \
-       __asm__ ("fcvtzs %0.2s, %1.2s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvt_n_u32_f32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t a_ = (a);                                            \
-       uint32x2_t result;                                               \
-       __asm__ ("fcvtzu %0.2s, %1.2s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f32_s32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t a_ = (a);                                              \
-       float32x4_t result;                                              \
-       __asm__ ("scvtf %0.4s, %1.4s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f32_u32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t a_ = (a);                                             \
-       float32x4_t result;                                              \
-       __asm__ ("ucvtf %0.4s, %1.4s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f64_s64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x2_t a_ = (a);                                              \
-       float64x2_t result;                                              \
-       __asm__ ("scvtf %0.2d, %1.2d, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f64_u64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x2_t a_ = (a);                                             \
-       float64x2_t result;                                              \
-       __asm__ ("ucvtf %0.2d, %1.2d, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_s32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t a_ = (a);                                            \
-       int32x4_t result;                                                \
-       __asm__ ("fcvtzs %0.4s, %1.4s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_s64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t a_ = (a);                                            \
-       int64x2_t result;                                                \
-       __asm__ ("fcvtzs %0.2d, %1.2d, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_u32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t a_ = (a);                                            \
-       uint32x4_t result;                                               \
-       __asm__ ("fcvtzu %0.4s, %1.4s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_u64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t a_ = (a);                                            \
-       uint64x2_t result;                                               \
-       __asm__ ("fcvtzu %0.2d, %1.2d, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcvtx_f32_f64 (float64x2_t a)
 {
@@ -12760,6 +12616,42 @@ vcvts_n_f32_u32 (uint32_t __a, const int __b)
   return __builtin_aarch64_ucvtfsisf_sus (__a, __b);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_s32 (int32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfv2si (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfv2si_sus (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfv4si (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfv4si_sus (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_f64_s64 (int64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfv2di (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_f64_u64 (uint64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfv2di_sus (__a, __b);
+}
+
 /* vcvt (float -> <u>fixed-point).  */
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
@@ -12786,6 +12678,42 @@ vcvts_n_u32_f32 (float32_t __a, const int __b)
   return __builtin_aarch64_fcvtzusfsi_uss (__a, __b);
 }
 
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_n_s32_f32 (float32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsv2sf (__a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_n_u32_f32 (float32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsv4sf (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuv4sf_uss (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_s64_f64 (float64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsv2df (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_u64_f64 (float64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuv2df_uss (__a, __b);
+}
+
 /* vcvt  (<u>int -> float)  */
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 4ebd6f7..2264459 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -166,6 +166,9 @@
 ;; Vector and scalar integer modes for H and S
 (define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI])
 
+;; Vector and scalar integer modes for S and D
+(define_mode_iterator VALLI [V2SI V4SI V2DI SI DI])
+
 ;; Vector and scalar 64-bit container: 16, 32-bit integer modes
 (define_mode_iterator VSD_HSI [V4HI V2SI HI SI])
 
@@ -653,8 +656,10 @@
   [(QI "b") (HI "h") (SI "") (DI "")])
 
 (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
+			       (V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf")
 			       (SF "si") (DF "di") (SI "sf") (DI "df")])
 (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
+			       (V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF")
 			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")])
 
 
-- 
1.9.1





^ permalink raw reply	[flat|nested] 28+ messages in thread

* [AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns
       [not found]       ` <574302FC.5050701@foss.arm.com>
       [not found]         ` <5743031A.8060307@foss.arm.com>
@ 2016-05-24  8:24         ` Jiong Wang
  2016-05-27 14:41           ` James Greenhalgh
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-05-24  8:24 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 775 bytes --]

These intrinsics were implemented before "fabd<mode>_3" was introduced.
Meanwhile, the patterns "fabd<mode>_3" and "*fabd_scalar<mode>3" can be
merged into a single "fabd<mode>3" using VALLF.

This patch migrates the implementation to builtins backed by this pattern.

gcc/
2016-05-23  Jiong Wang <jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.def (fabd): New builtins for 
modes
         VALLF.
         * config/aarch64/aarch64-simd.md (fabd<mode>_3): Extend modes 
from VDQF
         to VALLF.
         (*fabd_scalar<mode>3): Delete.
         * config/aarch64/arm_neon.h (vabds_f32): Remove inline assembly.
         Use builtin.
         (vabdd_f64): Likewise.
         (vabd_f32): Likewise.
         (vabdq_f32): Likewise.
         (vabdq_f64): Likewise.


[-- Attachment #2: 0005-5.patch --]
[-- Type: text/x-patch, Size: 5615 bytes --]

From 9bafb58055d4e379df7b626acd6aa80bdb0d4b22 Mon Sep 17 00:00:00 2001
From: "Jiong.Wang" <jiong.wang@arm.com>
Date: Mon, 23 May 2016 12:12:53 +0100
Subject: [PATCH 5/6] 5

---
 gcc/config/aarch64/aarch64-builtins.def |  3 ++
 gcc/config/aarch64/aarch64-simd.md      | 23 +++------
 gcc/config/aarch64/arm_neon.h           | 87 ++++++++++++---------------------
 3 files changed, 42 insertions(+), 71 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.def b/gcc/config/aarch64/aarch64-builtins.def
index 1955d17..40baebe 100644
--- a/gcc/config/aarch64/aarch64-builtins.def
+++ b/gcc/config/aarch64/aarch64-builtins.def
@@ -465,3 +465,6 @@
 
   /* Implemented by aarch64_rsqrts<mode>.  */
   BUILTIN_VALLF (BINOP, rsqrts, 0)
+
+  /* Implemented by fabd<mode>_3.  */
+  BUILTIN_VALLF (BINOP, fabd, 3)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cca6c1b..71dd74a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -474,23 +474,14 @@
   [(set_attr "type" "neon_arith_acc<q>")]
 )
 
-(define_insn "fabd<mode>_3"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(abs:VDQF (minus:VDQF
-		   (match_operand:VDQF 1 "register_operand" "w")
-		   (match_operand:VDQF 2 "register_operand" "w"))))]
-  "TARGET_SIMD"
-  "fabd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
-)
-
-(define_insn "*fabd_scalar<mode>3"
-  [(set (match_operand:GPF 0 "register_operand" "=w")
-        (abs:GPF (minus:GPF
-                 (match_operand:GPF 1 "register_operand" "w")
-                 (match_operand:GPF 2 "register_operand" "w"))))]
+(define_insn "fabd<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(abs:VALLF
+	  (minus:VALLF
+	    (match_operand:VALLF 1 "register_operand" "w")
+	    (match_operand:VALLF 2 "register_operand" "w"))))]
   "TARGET_SIMD"
-  "fabd\t%<s>0, %<s>1, %<s>2"
+  "fabd\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
   [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9bbe815..ca29074 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -5440,17 +5440,6 @@ vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabd_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fabd %0.2s, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vabd_s8 (int8x8_t a, int8x8_t b)
 {
@@ -5517,17 +5506,6 @@ vabd_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vabdd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fabd %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vabdl_high_s8 (int8x16_t a, int8x16_t b)
 {
@@ -5660,28 +5638,6 @@ vabdl_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabdq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fabd %0.4s, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vabdq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fabd %0.2d, %1.2d, %2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vabdq_s8 (int8x16_t a, int8x16_t b)
 {
@@ -5748,17 +5704,6 @@ vabdq_u32 (uint32x4_t a, uint32x4_t b)
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vabds_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fabd %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddlv_s8 (int8x8_t a)
 {
@@ -10246,6 +10191,38 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
 
 /* Start of optimal implementations in approved order.  */
 
+/* vabd.  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vabds_f32 (float32_t a, float32_t b)
+{
+  return __builtin_aarch64_fabdsf (a, b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vabdd_f64 (float64_t a, float64_t b)
+{
+  return __builtin_aarch64_fabddf (a, b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vabd_f32 (float32x2_t a, float32x2_t b)
+{
+  return __builtin_aarch64_fabdv2sf (a, b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vabdq_f32 (float32x4_t a, float32x4_t b)
+{
+  return __builtin_aarch64_fabdv4sf (a, b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vabdq_f64 (float64x2_t a, float64x2_t b)
+{
+  return __builtin_aarch64_fabdv2df (a, b);
+}
+
 /* vabs  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-- 
1.9.1




^ permalink raw reply	[flat|nested] 28+ messages in thread

* [AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes
       [not found]         ` <5743031A.8060307@foss.arm.com>
@ 2016-05-24  8:24           ` Jiong Wang
  2016-05-27 14:45             ` James Greenhalgh
  0 siblings, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-05-24  8:24 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 844 bytes --]

These intrinsics were implemented by inline assembly using the "faddp"
instruction.
There was a pattern "aarch64_addpv4sf" which supports the V4SF mode only,
while we can extend this pattern to support the VDQF mode; then we can
reimplement these intrinsics through builtins.

gcc/
2016-05-23  Jiong Wang <jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.def (faddp): New builtins for 
modes in VDQF.
         * config/aarch64/aarch64-simd.md (aarch64_faddp<mode>): New.
         (aarch64_addpv4sf): Delete.
         (reduc_plus_scal_v4sf): Use "gen_aarch64_faddpv4sf" instead of
         "gen_aarch64_addpv4sf".
         * config/aarch64/iterators.md (UNSPEC_FADDP): New.
         * config/aarch64/arm_neon.h (vpadd_f32): Remove inline 
assembly.  Use
         builtin.
         (vpaddq_f32): Likewise.
         (vpaddq_f64): Likewise.


[-- Attachment #2: 0006-6.patch --]
[-- Type: text/x-patch, Size: 5328 bytes --]

From d97a40ac2e69403b64bcf53596581b49b86ef40c Mon Sep 17 00:00:00 2001
From: "Jiong.Wang" <jiong.wang@arm.com>
Date: Mon, 23 May 2016 12:13:13 +0100
Subject: [PATCH 6/6] 6

---
 gcc/config/aarch64/aarch64-builtins.def |  3 ++
 gcc/config/aarch64/aarch64-simd.md      | 23 ++++++++-------
 gcc/config/aarch64/arm_neon.h           | 51 ++++++++++++---------------------
 gcc/config/aarch64/iterators.md         |  1 +
 4 files changed, 34 insertions(+), 44 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.def b/gcc/config/aarch64/aarch64-builtins.def
index 40baebe..37d8183 100644
--- a/gcc/config/aarch64/aarch64-builtins.def
+++ b/gcc/config/aarch64/aarch64-builtins.def
@@ -468,3 +468,6 @@
 
   /* Implemented by fabd<mode>_3.  */
   BUILTIN_VALLF (BINOP, fabd, 3)
+
+  /* Implemented by aarch64_faddp<mode>.  */
+  BUILTIN_VDQF (BINOP, faddp, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 71dd74a..9b9f8df 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1992,6 +1992,16 @@
   }
 )
 
+(define_insn "aarch64_faddp<mode>"
+ [(set (match_operand:VDQF 0 "register_operand" "=w")
+       (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
+		     (match_operand:VDQF 2 "register_operand" "w")]
+		     UNSPEC_FADDP))]
+ "TARGET_SIMD"
+ "faddp\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
+)
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
@@ -2019,15 +2029,6 @@
   [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
 )
 
-(define_insn "aarch64_addpv4sf"
- [(set (match_operand:V4SF 0 "register_operand" "=w")
-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
-		    UNSPEC_FADDV))]
- "TARGET_SIMD"
- "faddp\\t%0.4s, %1.4s, %1.4s"
-  [(set_attr "type" "neon_fp_reduc_add_s_q")]
-)
-
 (define_expand "reduc_plus_scal_v4sf"
  [(set (match_operand:SF 0 "register_operand")
        (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
@@ -2036,8 +2037,8 @@
 {
   rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
   rtx scratch = gen_reg_rtx (V4SFmode);
-  emit_insn (gen_aarch64_addpv4sf (scratch, operands[1]));
-  emit_insn (gen_aarch64_addpv4sf (scratch, scratch));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
   emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
   DONE;
 })
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ae4c429..a37ceeb 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8225,17 +8225,6 @@ vpadalq_u32 (uint64x2_t a, uint32x4_t b)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpadd_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("faddp %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vpaddl_s8 (int8x8_t a)
 {
@@ -8368,28 +8357,6 @@ vpaddlq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpaddq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("faddp %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpaddq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("faddp %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vpaddq_s8 (int8x16_t a, int8x16_t b)
 {
@@ -18629,6 +18596,24 @@ vnegq_s64 (int64x2_t __a)
 
 /* vpadd  */
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpadd_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_faddpv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vpaddq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_faddpv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vpaddq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_faddpv2df (__a, __b);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vpadd_s8 (int8x8_t __a, int8x8_t __b)
 {
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 2264459..7323091 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -219,6 +219,7 @@
     UNSPEC_FMIN		; Used in aarch64-simd.md.
     UNSPEC_FMINNMV	; Used in aarch64-simd.md.
     UNSPEC_FMINV	; Used in aarch64-simd.md.
+    UNSPEC_FADDP	; Used in aarch64-simd.md.
     UNSPEC_FADDV	; Used in aarch64-simd.md.
     UNSPEC_ADDV		; Used in aarch64-simd.md.
     UNSPEC_SCVTF	; Used in aarch64-simd.md.
-- 
1.9.1






^ permalink raw reply	[flat|nested] 28+ messages in thread

* [AArch64, 4/6] Reimplement frsqrts intrinsics
       [not found]     ` <574302DA.6090803@foss.arm.com>
@ 2016-05-24  8:24       ` Jiong Wang
  2016-05-27 14:12         ` James Greenhalgh
       [not found]       ` <574302FC.5050701@foss.arm.com>
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-05-24  8:24 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 778 bytes --]

Similar to [3/6], these intrinsics were implemented before the instruction
pattern "aarch64_rsqrts<mode>" was added, so at that time they had to be
implemented through inline assembly.

This migrates the implementation to builtins.

gcc/
2016-05-23  Jiong Wang <jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.def (rsqrts): New builtins 
for modes
         VALLF.
         * config/aarch64/aarch64-simd.md (aarch64_rsqrts_<mode>3): 
Rename to
"aarch64_rsqrts<mode>".
         * config/aarch64/aarch64.c (get_rsqrts_type): Update gen* name.
         * config/aarch64/arm_neon.h (vrsqrtss_f32): Remove inline 
assembly.  Use
builtin.
         (vrsqrtsd_f64): Likewise.
         (vrsqrts_f32): Likewise.
         (vrsqrtsq_f32): Likewise.
         (vrsqrtsq_f64): Likewise.

[-- Attachment #2: 0004-4.patch --]
[-- Type: text/x-patch, Size: 5314 bytes --]

From ea271deeb19e3a1e611cbc1ddf3abfec06388958 Mon Sep 17 00:00:00 2001
From: "Jiong.Wang" <jiong.wang@arm.com>
Date: Mon, 23 May 2016 12:12:33 +0100
Subject: [PATCH 4/6] 4

---
 gcc/config/aarch64/aarch64-builtins.def |  3 ++
 gcc/config/aarch64/aarch64-simd.md      |  2 +-
 gcc/config/aarch64/aarch64.c            | 10 ++--
 gcc/config/aarch64/arm_neon.h           | 87 ++++++++++++---------------------
 4 files changed, 41 insertions(+), 61 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.def b/gcc/config/aarch64/aarch64-builtins.def
index 32bcd06..1955d17 100644
--- a/gcc/config/aarch64/aarch64-builtins.def
+++ b/gcc/config/aarch64/aarch64-builtins.def
@@ -462,3 +462,6 @@
 
   /* Implemented by aarch64_rsqrte<mode>.  */
   BUILTIN_VALLF (UNOP, rsqrte, 0)
+
+  /* Implemented by aarch64_rsqrts<mode>.  */
+  BUILTIN_VALLF (BINOP, rsqrts, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c34d21e..cca6c1b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -390,7 +390,7 @@
   "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
   [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
 
-(define_insn "aarch64_rsqrts_<mode>3"
+(define_insn "aarch64_rsqrts<mode>"
   [(set (match_operand:VALLF 0 "register_operand" "=w")
 	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
 	       (match_operand:VALLF 2 "register_operand" "w")]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 18a8c1e..ba71d2a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7377,11 +7377,11 @@ get_rsqrts_type (machine_mode mode)
 {
   switch (mode)
   {
-    case DFmode:   return gen_aarch64_rsqrts_df3;
-    case SFmode:   return gen_aarch64_rsqrts_sf3;
-    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
-    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
-    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    case DFmode:   return gen_aarch64_rsqrtsdf;
+    case SFmode:   return gen_aarch64_rsqrtssf;
+    case V2DFmode: return gen_aarch64_rsqrtsv2df;
+    case V2SFmode: return gen_aarch64_rsqrtsv2sf;
+    case V4SFmode: return gen_aarch64_rsqrtsv4sf;
     default: gcc_unreachable ();
   }
 }
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index be48a5e..1971373 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9196,61 +9196,6 @@ vrsqrteq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrts_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("frsqrts %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrtsd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("frsqrts %d0,%d1,%d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrtsq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("frsqrts %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrtsq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("frsqrts %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtss_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("frsqrts %s0,%s1,%s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vshrn_high_n_s16(a, b, c)                                       \
   __extension__                                                         \
     ({                                                                  \
@@ -21481,6 +21426,38 @@ vrsqrteq_f64 (float64x2_t a)
   return __builtin_aarch64_rsqrtev2df (a);
 }
 
+/* vrsqrts.  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vrsqrtss_f32 (float32_t a, float32_t b)
+{
+  return __builtin_aarch64_rsqrtssf (a, b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vrsqrtsd_f64 (float64_t a, float64_t b)
+{
+  return __builtin_aarch64_rsqrtsdf (a, b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrts_f32 (float32x2_t a, float32x2_t b)
+{
+  return __builtin_aarch64_rsqrtsv2sf (a, b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrtsq_f32 (float32x4_t a, float32x4_t b)
+{
+  return __builtin_aarch64_rsqrtsv4sf (a, b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vrsqrtsq_f64 (float64x2_t a, float64x2_t b)
+{
+  return __builtin_aarch64_rsqrtsv2df (a, b);
+}
+
 /* vrsra */
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-- 
1.9.1





^ permalink raw reply	[flat|nested] 28+ messages in thread

* [AArch64, 1/6] Reimplement scalar fixed-point intrinsics
       [not found] <57430251.6060902@foss.arm.com>
       [not found] ` <57430271.3070504@foss.arm.com>
@ 2016-05-24  8:24 ` Jiong Wang
  2016-05-27 13:50   ` James Greenhalgh
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-05-24  8:24 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2629 bytes --]

This patch reimplements the scalar intrinsics for conversion between
floating-point and fixed-point.

Previously, all such intrinsics were implemented through inline assembly.
This patch adds RTL patterns for these operations so that the intrinsics
can be implemented through builtins.

gcc/
2016-05-23  Jiong Wang<jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New
         (TYPES_BINOP_SUS): Likewise.
         (aarch64_simd_builtin_data): Update include file name.
         (aarch64_builtins): Likewise.
         * config/aarch64/aarch64-simd-builtins.def: Rename to
         aarch64-builtins.def.
         (scvtfsi): New entries for conversion between scalar
         float-point and fixed-point.
         (scvtfdi): Likewise.
         (ucvtfsi): Likewise.
         (ucvtfdi): Likewise.
         (fcvtzssf): Likewise.
         (fcvtzsdf): Likewise.
         (fcvtzusf): Likewise.
         (fcvtzudf): Likewise.
         * config/aarch64/aarch64.md
         (<FCVT_F2FIXED_SCALAR:fcvt_fixed_insn><GPF:mode><GPI:mode>3): New
         pattern for conversion from scalar floating-point to fixed-point.
         (<FCVT_FIXED2F_SCALAR:fcvt_fixed_insn><GPI:mode><GPF:mode>3): Likewise.
         (UNSPEC_FCVTZS_SCALAR): New UNSPEC enumeration.
         (UNSPEC_FCVTZU_SCALAR): Likewise.
         (UNSPEC_SCVTF_SCALAR): Likewise.
         (UNSPEC_UCVTF_SCALAR): Likewise.
         * config/aarch64/aarch64-simd.md
         (<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3): New pattern for conversion
         between the scalar variant of SIMD and fixed-point.
         (<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3): Likewise.
         * config/aarch64/arm_neon.h (vcvtd_n_f64_s64): Remove inline assembly.  Use
         builtin.
         (vcvtd_n_f64_u64): Likewise.
         (vcvtd_n_s64_f64): Likewise.
         (vcvtd_n_u64_f64): Likewise.
         (vcvtd_n_f32_s32): Likewise.
         (vcvts_n_f32_u32): Likewise.
         (vcvtd_n_s32_f32): Likewise.
         (vcvts_n_u32_f32): Likewise.
         * config/aarch64/iterators.md (UNSPEC_FCVTZS): New.
         (UNSPEC_FCVTZU): Likewise.
         (UNSPEC_SCVTF): Likewise.
         (UNSPEC_UCVTF): Likewise.
         (fcvt_target): Support integer to float mapping.
         (FCVT_TARGET): Likewise.
         (FCVT_FIXED2F): New iterator.
         (FCVT_F2FIXED): Likewise.
         (FCVT_FIXED2F_SCALAR): Likewise.
         (FCVT_F2FIXED_SCALAR): Likewise.
         (fcvt_fixed_insn): New define_int_attr.
         * config/aarch64/t-aarch64 (aarch64-builtins.o): Change dependency file
         name from "aarch64-simd-builtins.def" to "aarch64-builtins.def".


[-- Attachment #2: 0001-1.patch --]
[-- Type: text/x-patch, Size: 52570 bytes --]

From 91adf34dbcf5a233c3d159e7038256d3f5c7572e Mon Sep 17 00:00:00 2001
From: "Jiong.Wang" <jiong.wang@arm.com>
Date: Mon, 23 May 2016 12:11:53 +0100
Subject: [PATCH 1/6] 1

---
 gcc/config/aarch64/aarch64-builtins.c        |  12 +-
 gcc/config/aarch64/aarch64-builtins.def      | 457 +++++++++++++++++++++++++++
 gcc/config/aarch64/aarch64-simd-builtins.def | 447 --------------------------
 gcc/config/aarch64/aarch64-simd.md           |  22 ++
 gcc/config/aarch64/aarch64.md                |  26 ++
 gcc/config/aarch64/arm_neon.h                | 148 +++------
 gcc/config/aarch64/iterators.md              |  25 +-
 gcc/config/aarch64/t-aarch64                 |   2 +-
 8 files changed, 591 insertions(+), 548 deletions(-)
 create mode 100644 gcc/config/aarch64/aarch64-builtins.def
 delete mode 100644 gcc/config/aarch64/aarch64-simd-builtins.def

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 5573903..d79ba3d 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -139,6 +139,14 @@ aarch64_types_binop_ssu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_unsigned };
 #define TYPES_BINOP_SSU (aarch64_types_binop_ssu_qualifiers)
 static enum aarch64_type_qualifiers
+aarch64_types_binop_uss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_unsigned, qualifier_none, qualifier_none };
+#define TYPES_BINOP_USS (aarch64_types_binop_uss_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_binop_sus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_BINOP_SUS (aarch64_types_binop_sus_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_poly, qualifier_poly, qualifier_poly };
 #define TYPES_BINOPP (aarch64_types_binopp_qualifiers)
@@ -291,7 +299,7 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #include "aarch64-builtin-iterators.h"
 
 static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
-#include "aarch64-simd-builtins.def"
+#include "aarch64-builtins.def"
 };
 
 /* There's only 8 CRC32 builtins.  Probably not worth their own .def file.  */
@@ -336,7 +344,7 @@ enum aarch64_builtins
   AARCH64_BUILTIN_RSQRT_V4SF,
   AARCH64_SIMD_BUILTIN_BASE,
   AARCH64_SIMD_BUILTIN_LANE_CHECK,
-#include "aarch64-simd-builtins.def"
+#include "aarch64-builtins.def"
   /* The first enum element which is based on an insn_data pattern.  */
   AARCH64_SIMD_PATTERN_START = AARCH64_SIMD_BUILTIN_LANE_CHECK + 1,
   AARCH64_SIMD_BUILTIN_MAX = AARCH64_SIMD_PATTERN_START
diff --git a/gcc/config/aarch64/aarch64-builtins.def b/gcc/config/aarch64/aarch64-builtins.def
new file mode 100644
index 0000000..4528db3
--- /dev/null
+++ b/gcc/config/aarch64/aarch64-builtins.def
@@ -0,0 +1,457 @@
+/* Machine description for AArch64 architecture.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   Contributed by ARM Ltd.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* In the list below, the BUILTIN_<ITERATOR> macros expand to create
+   builtins for each of the modes described by <ITERATOR>.  When adding
+   new builtins to this list, a helpful idiom to follow is to add
+   a line for each pattern in the md file.  Thus, ADDP, which has one
+   pattern defined for the VD_BHSI iterator, and one for DImode, has two
+   entries below.
+
+   Parameter 1 is the 'type' of the intrinsic.  This is used to
+   describe the type modifiers (for example; unsigned) applied to
+   each of the parameters to the intrinsic function.
+
+   Parameter 2 is the name of the intrinsic.  This is appended
+   to `__builtin_aarch64_<name><mode>` to give the intrinsic name
+   as exported to the front-ends.
+
+   Parameter 3 describes how to map from the name to the CODE_FOR_
+   macro holding the RTL pattern for the intrinsic.  This mapping is:
+   0 - CODE_FOR_aarch64_<name><mode>
+   1-9 - CODE_FOR_<name><mode><1-9>
+   10 - CODE_FOR_<name><mode>.  */
+
+  BUILTIN_VDC (COMBINE, combine, 0)
+  BUILTIN_VB (BINOP, pmul, 0)
+  BUILTIN_VALLF (BINOP, fmulx, 0)
+  BUILTIN_VDQF_DF (UNOP, sqrt, 2)
+  BUILTIN_VD_BHSI (BINOP, addp, 0)
+  VAR1 (UNOP, addp, 0, di)
+  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
+  BUILTIN_VDQ_BHSI (UNOP, clz, 2)
+  BUILTIN_VS (UNOP, ctz, 2)
+  BUILTIN_VB (UNOP, popcount, 2)
+
+  /* Implemented by aarch64_<sur>q<r>shl<mode>.  */
+  BUILTIN_VSDQ_I (BINOP, sqshl, 0)
+  BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0)
+  BUILTIN_VSDQ_I (BINOP, sqrshl, 0)
+  BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0)
+  /* Implemented by aarch64_<su_optab><optab><mode>.  */
+  BUILTIN_VSDQ_I (BINOP, sqadd, 0)
+  BUILTIN_VSDQ_I (BINOPU, uqadd, 0)
+  BUILTIN_VSDQ_I (BINOP, sqsub, 0)
+  BUILTIN_VSDQ_I (BINOPU, uqsub, 0)
+  /* Implemented by aarch64_<sur>qadd<mode>.  */
+  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0)
+  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0)
+
+  /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>.  */
+  BUILTIN_VDC (GETREG, get_dregoi, 0)
+  BUILTIN_VDC (GETREG, get_dregci, 0)
+  BUILTIN_VDC (GETREG, get_dregxi, 0)
+  /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>.  */
+  BUILTIN_VQ (GETREG, get_qregoi, 0)
+  BUILTIN_VQ (GETREG, get_qregci, 0)
+  BUILTIN_VQ (GETREG, get_qregxi, 0)
+  /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>.  */
+  BUILTIN_VQ (SETREG, set_qregoi, 0)
+  BUILTIN_VQ (SETREG, set_qregci, 0)
+  BUILTIN_VQ (SETREG, set_qregxi, 0)
+  /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>.  */
+  BUILTIN_VDC (LOADSTRUCT, ld2, 0)
+  BUILTIN_VDC (LOADSTRUCT, ld3, 0)
+  BUILTIN_VDC (LOADSTRUCT, ld4, 0)
+  /* Implemented by aarch64_ld<VSTRUCT:nregs><VQ:mode>.  */
+  BUILTIN_VQ (LOADSTRUCT, ld2, 0)
+  BUILTIN_VQ (LOADSTRUCT, ld3, 0)
+  BUILTIN_VQ (LOADSTRUCT, ld4, 0)
+  /* Implemented by aarch64_ld<VSTRUCT:nregs>r<VALLDIF:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0)
+  /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>.  */
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0)
+  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0)
+  /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
+  BUILTIN_VDC (STORESTRUCT, st2, 0)
+  BUILTIN_VDC (STORESTRUCT, st3, 0)
+  BUILTIN_VDC (STORESTRUCT, st4, 0)
+  /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>.  */
+  BUILTIN_VQ (STORESTRUCT, st2, 0)
+  BUILTIN_VQ (STORESTRUCT, st3, 0)
+  BUILTIN_VQ (STORESTRUCT, st4, 0)
+
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0)
+  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0)
+
+  BUILTIN_VQW (BINOP, saddl2, 0)
+  BUILTIN_VQW (BINOP, uaddl2, 0)
+  BUILTIN_VQW (BINOP, ssubl2, 0)
+  BUILTIN_VQW (BINOP, usubl2, 0)
+  BUILTIN_VQW (BINOP, saddw2, 0)
+  BUILTIN_VQW (BINOP, uaddw2, 0)
+  BUILTIN_VQW (BINOP, ssubw2, 0)
+  BUILTIN_VQW (BINOP, usubw2, 0)
+  /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>.  */
+  BUILTIN_VD_BHSI (BINOP, saddl, 0)
+  BUILTIN_VD_BHSI (BINOP, uaddl, 0)
+  BUILTIN_VD_BHSI (BINOP, ssubl, 0)
+  BUILTIN_VD_BHSI (BINOP, usubl, 0)
+  /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>.  */
+  BUILTIN_VD_BHSI (BINOP, saddw, 0)
+  BUILTIN_VD_BHSI (BINOP, uaddw, 0)
+  BUILTIN_VD_BHSI (BINOP, ssubw, 0)
+  BUILTIN_VD_BHSI (BINOP, usubw, 0)
+  /* Implemented by aarch64_<sur>h<addsub><mode>.  */
+  BUILTIN_VDQ_BHSI (BINOP, shadd, 0)
+  BUILTIN_VDQ_BHSI (BINOP, shsub, 0)
+  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0)
+  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0)
+  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0)
+  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0)
+  /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
+  BUILTIN_VQN (BINOP, addhn, 0)
+  BUILTIN_VQN (BINOP, subhn, 0)
+  BUILTIN_VQN (BINOP, raddhn, 0)
+  BUILTIN_VQN (BINOP, rsubhn, 0)
+  /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
+  BUILTIN_VQN (TERNOP, addhn2, 0)
+  BUILTIN_VQN (TERNOP, subhn2, 0)
+  BUILTIN_VQN (TERNOP, raddhn2, 0)
+  BUILTIN_VQN (TERNOP, rsubhn2, 0)
+
+  BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0)
+  /* Implemented by aarch64_<sur>qmovn<mode>.  */
+  BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0)
+  BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0)
+  /* Implemented by aarch64_s<optab><mode>.  */
+  BUILTIN_VSDQ_I (UNOP, sqabs, 0)
+  BUILTIN_VSDQ_I (UNOP, sqneg, 0)
+
+  /* Implemented by aarch64_sqdml<SBINQOPS:as>l<mode>.  */
+  BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0)
+  BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0)
+  /* Implemented by aarch64_sqdml<SBINQOPS:as>l_lane<mode>.  */
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0)
+  /* Implemented by aarch64_sqdml<SBINQOPS:as>l_laneq<mode>.  */
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0)
+  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0)
+  /* Implemented by aarch64_sqdml<SBINQOPS:as>l_n<mode>.  */
+  BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0)
+  BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0)
+
+  BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0)
+  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0)
+  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0)
+
+  BUILTIN_VSD_HSI (BINOP, sqdmull, 0)
+  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0)
+  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0)
+  BUILTIN_VD_HSI (BINOP, sqdmull_n, 0)
+  BUILTIN_VQ_HSI (BINOP, sqdmull2, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0)
+  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0)
+  BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0)
+  /* Implemented by aarch64_sq<r>dmulh<mode>.  */
+  BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0)
+  BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0)
+  /* Implemented by aarch64_sq<r>dmulh_lane<q><mode>.  */
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
+  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
+
+  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
+  /* Implemented by aarch64_<sur>shl<mode>.  */
+  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0)
+  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0)
+  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0)
+
+  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3)
+  VAR1 (SHIFTIMM, ashr_simd, 0, di)
+  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3)
+  VAR1 (USHIFTIMM, lshr_simd, 0, di)
+  /* Implemented by aarch64_<sur>shr_n<mode>.  */
+  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0)
+  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0)
+  /* Implemented by aarch64_<sur>sra_n<mode>.  */
+  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0)
+  /* Implemented by aarch64_<sur>shll_n<mode>.  */
+  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0)
+  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0)
+  /* Implemented by aarch64_<sur>shll2_n<mode>.  */
+  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0)
+  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0)
+  /* Implemented by aarch64_<sur>q<r>shr<u>n_n<mode>.  */
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0)
+  BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0)
+  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0)
+  BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0)
+  /* Implemented by aarch64_<sur>s<lr>i_n<mode>.  */
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0)
+  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0)
+  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0)
+  /* Implemented by aarch64_<sur>qshl<u>_n<mode>.  */
+  BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0)
+  BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0)
+  BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0)
+
+  /* Implemented by aarch64_reduc_plus_<mode>.  */
+  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
+
+  /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
+  BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
+  BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
+  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
+  BUILTIN_VDQF (UNOP, reduc_smax_nan_scal_, 10)
+  BUILTIN_VDQF (UNOP, reduc_smin_nan_scal_, 10)
+
+  /* Implemented by <maxmin><mode>3.
+     smax variants map to fmaxnm,
+     smax_nan variants map to fmax.  */
+  BUILTIN_VDQIF (BINOP, smax, 3)
+  BUILTIN_VDQIF (BINOP, smin, 3)
+  BUILTIN_VDQ_BHSI (BINOP, umax, 3)
+  BUILTIN_VDQ_BHSI (BINOP, umin, 3)
+  BUILTIN_VDQF (BINOP, smax_nan, 3)
+  BUILTIN_VDQF (BINOP, smin_nan, 3)
+
+  /* Implemented by aarch64_<maxmin_uns>p<mode>.  */
+  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0)
+  BUILTIN_VDQ_BHSI (BINOP, sminp, 0)
+  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0)
+  BUILTIN_VDQ_BHSI (BINOP, uminp, 0)
+  BUILTIN_VDQF (BINOP, smaxp, 0)
+  BUILTIN_VDQF (BINOP, sminp, 0)
+  BUILTIN_VDQF (BINOP, smax_nanp, 0)
+  BUILTIN_VDQF (BINOP, smin_nanp, 0)
+
+  /* Implemented by <frint_pattern><mode>2.  */
+  BUILTIN_VDQF (UNOP, btrunc, 2)
+  BUILTIN_VDQF (UNOP, ceil, 2)
+  BUILTIN_VDQF (UNOP, floor, 2)
+  BUILTIN_VDQF (UNOP, nearbyint, 2)
+  BUILTIN_VDQF (UNOP, rint, 2)
+  BUILTIN_VDQF (UNOP, round, 2)
+  BUILTIN_VDQF_DF (UNOP, frintn, 2)
+
+  /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
+  VAR1 (UNOP, lbtruncv2sf, 2, v2si)
+  VAR1 (UNOP, lbtruncv4sf, 2, v4si)
+  VAR1 (UNOP, lbtruncv2df, 2, v2di)
+
+  VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si)
+  VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si)
+  VAR1 (UNOPUS, lbtruncuv2df, 2, v2di)
+
+  VAR1 (UNOP, lroundv2sf, 2, v2si)
+  VAR1 (UNOP, lroundv4sf, 2, v4si)
+  VAR1 (UNOP, lroundv2df, 2, v2di)
+  /* Implemented by l<fcvt_pattern><su_optab><GPF:mode><GPI:mode>2.  */
+  VAR1 (UNOP, lroundsf, 2, si)
+  VAR1 (UNOP, lrounddf, 2, di)
+
+  VAR1 (UNOPUS, lrounduv2sf, 2, v2si)
+  VAR1 (UNOPUS, lrounduv4sf, 2, v4si)
+  VAR1 (UNOPUS, lrounduv2df, 2, v2di)
+  VAR1 (UNOPUS, lroundusf, 2, si)
+  VAR1 (UNOPUS, lroundudf, 2, di)
+
+  VAR1 (UNOP, lceilv2sf, 2, v2si)
+  VAR1 (UNOP, lceilv4sf, 2, v4si)
+  VAR1 (UNOP, lceilv2df, 2, v2di)
+
+  VAR1 (UNOPUS, lceiluv2sf, 2, v2si)
+  VAR1 (UNOPUS, lceiluv4sf, 2, v4si)
+  VAR1 (UNOPUS, lceiluv2df, 2, v2di)
+  VAR1 (UNOPUS, lceilusf, 2, si)
+  VAR1 (UNOPUS, lceiludf, 2, di)
+
+  VAR1 (UNOP, lfloorv2sf, 2, v2si)
+  VAR1 (UNOP, lfloorv4sf, 2, v4si)
+  VAR1 (UNOP, lfloorv2df, 2, v2di)
+
+  VAR1 (UNOPUS, lflooruv2sf, 2, v2si)
+  VAR1 (UNOPUS, lflooruv4sf, 2, v4si)
+  VAR1 (UNOPUS, lflooruv2df, 2, v2di)
+  VAR1 (UNOPUS, lfloorusf, 2, si)
+  VAR1 (UNOPUS, lfloorudf, 2, di)
+
+  VAR1 (UNOP, lfrintnv2sf, 2, v2si)
+  VAR1 (UNOP, lfrintnv4sf, 2, v4si)
+  VAR1 (UNOP, lfrintnv2df, 2, v2di)
+  VAR1 (UNOP, lfrintnsf, 2, si)
+  VAR1 (UNOP, lfrintndf, 2, di)
+
+  VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si)
+  VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si)
+  VAR1 (UNOPUS, lfrintnuv2df, 2, v2di)
+  VAR1 (UNOPUS, lfrintnusf, 2, si)
+  VAR1 (UNOPUS, lfrintnudf, 2, di)
+
+  /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
+  VAR1 (UNOP, floatv2si, 2, v2sf)
+  VAR1 (UNOP, floatv4si, 2, v4sf)
+  VAR1 (UNOP, floatv2di, 2, v2df)
+
+  VAR1 (UNOP, floatunsv2si, 2, v2sf)
+  VAR1 (UNOP, floatunsv4si, 2, v4sf)
+  VAR1 (UNOP, floatunsv2di, 2, v2df)
+
+  VAR5 (UNOPU, bswap, 2, v4hi, v8hi, v2si, v4si, v2di)
+
+  BUILTIN_VB (UNOP, rbit, 0)
+
+  /* Implemented by
+     aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>.  */
+  BUILTIN_VALL (BINOP, zip1, 0)
+  BUILTIN_VALL (BINOP, zip2, 0)
+  BUILTIN_VALL (BINOP, uzp1, 0)
+  BUILTIN_VALL (BINOP, uzp2, 0)
+  BUILTIN_VALL (BINOP, trn1, 0)
+  BUILTIN_VALL (BINOP, trn2, 0)
+
+  /* Implemented by
+     aarch64_frecp<FRECP:frecp_suffix><mode>.  */
+  BUILTIN_GPF (UNOP, frecpe, 0)
+  BUILTIN_GPF (BINOP, frecps, 0)
+  BUILTIN_GPF (UNOP, frecpx, 0)
+
+  BUILTIN_VDQ_SI (UNOP, urecpe, 0)
+
+  BUILTIN_VDQF (UNOP, frecpe, 0)
+  BUILTIN_VDQF (BINOP, frecps, 0)
+
+  /* Implemented by a mixture of abs2 patterns.  Note the DImode builtin is
+     only ever used for the int64x1_t intrinsic, there is no scalar version.  */
+  BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
+  BUILTIN_VDQF (UNOP, abs, 2)
+
+  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
+  VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
+  VAR1 (BINOP, float_truncate_hi_, 0, v8hf)
+
+  VAR1 (UNOP, float_extend_lo_, 0, v2df)
+  VAR1 (UNOP, float_extend_lo_,  0, v4sf)
+  BUILTIN_VDF (UNOP, float_truncate_lo_, 0)
+
+  /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (LOAD1, ld1, 0)
+
+  /* Implemented by aarch64_st1<VALL_F16:mode>.  */
+  BUILTIN_VALL_F16 (STORE1, st1, 0)
+
+  /* Implemented by fma<mode>4.  */
+  BUILTIN_VDQF (TERNOP, fma, 4)
+
+  /* Implemented by aarch64_simd_bsl<mode>.  */
+  BUILTIN_VDQQH (BSL_P, simd_bsl, 0)
+  BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0)
+  BUILTIN_VALLDIF (BSL_S, simd_bsl, 0)
+
+  /* Implemented by aarch64_crypto_aes<op><mode>.  */
+  VAR1 (BINOPU, crypto_aese, 0, v16qi)
+  VAR1 (BINOPU, crypto_aesd, 0, v16qi)
+  VAR1 (UNOPU, crypto_aesmc, 0, v16qi)
+  VAR1 (UNOPU, crypto_aesimc, 0, v16qi)
+
+  /* Implemented by aarch64_crypto_sha1<op><mode>.  */
+  VAR1 (UNOPU, crypto_sha1h, 0, si)
+  VAR1 (BINOPU, crypto_sha1su1, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha1c, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha1m, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha1p, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha1su0, 0, v4si)
+
+  /* Implemented by aarch64_crypto_sha256<op><mode>.  */
+  VAR1 (TERNOPU, crypto_sha256h, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha256h2, 0, v4si)
+  VAR1 (BINOPU, crypto_sha256su0, 0, v4si)
+  VAR1 (TERNOPU, crypto_sha256su1, 0, v4si)
+
+  /* Implemented by aarch64_crypto_pmull<mode>.  */
+  VAR1 (BINOPP, crypto_pmull, 0, di)
+  VAR1 (BINOPP, crypto_pmull, 0, v2di)
+
+  /* Implemented by aarch64_tbl3<mode>.  */
+  VAR1 (BINOP, tbl3, 0, v8qi)
+  VAR1 (BINOP, tbl3, 0, v16qi)
+
+  /* Implemented by aarch64_qtbl3<mode>.  */
+  VAR1 (BINOP, qtbl3, 0, v8qi)
+  VAR1 (BINOP, qtbl3, 0, v16qi)
+
+  /* Implemented by aarch64_qtbl4<mode>.  */
+  VAR1 (BINOP, qtbl4, 0, v8qi)
+  VAR1 (BINOP, qtbl4, 0, v16qi)
+
+  /* Implemented by aarch64_tbx4<mode>.  */
+  VAR1 (TERNOP, tbx4, 0, v8qi)
+  VAR1 (TERNOP, tbx4, 0, v16qi)
+
+  /* Implemented by aarch64_qtbx3<mode>.  */
+  VAR1 (TERNOP, qtbx3, 0, v8qi)
+  VAR1 (TERNOP, qtbx3, 0, v16qi)
+
+  /* Implemented by aarch64_qtbx4<mode>.  */
+  VAR1 (TERNOP, qtbx4, 0, v8qi)
+  VAR1 (TERNOP, qtbx4, 0, v16qi)
+
+  /* Builtins for ARMv8.1 Adv.SIMD instructions.  */
+
+  /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h<mode>.  */
+  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0)
+  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0)
+
+  /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_lane<mode>.  */
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0)
+
+  /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>.  */
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
+  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
+
+  /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3.  */
+  BUILTIN_GPF (BINOP, scvtfsi, 3)
+  BUILTIN_GPF (BINOP, scvtfdi, 3)
+  BUILTIN_GPF (BINOP_SUS, ucvtfsi, 3)
+  BUILTIN_GPF (BINOP_SUS, ucvtfdi, 3)
+  BUILTIN_GPI (BINOP, fcvtzssf, 3)
+  BUILTIN_GPI (BINOP, fcvtzsdf, 3)
+  BUILTIN_GPI (BINOP_USS, fcvtzusf, 3)
+  BUILTIN_GPI (BINOP_USS, fcvtzudf, 3)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
deleted file mode 100644
index dd04579..0000000
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ /dev/null
@@ -1,447 +0,0 @@
-/* Machine description for AArch64 architecture.
-   Copyright (C) 2012-2016 Free Software Foundation, Inc.
-   Contributed by ARM Ltd.
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3, or (at your option)
-   any later version.
-
-   GCC is distributed in the hope that it will be useful, but
-   WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with GCC; see the file COPYING3.  If not see
-   <http://www.gnu.org/licenses/>.  */
-
-/* In the list below, the BUILTIN_<ITERATOR> macros expand to create
-   builtins for each of the modes described by <ITERATOR>.  When adding
-   new builtins to this list, a helpful idiom to follow is to add
-   a line for each pattern in the md file.  Thus, ADDP, which has one
-   pattern defined for the VD_BHSI iterator, and one for DImode, has two
-   entries below.
-
-   Parameter 1 is the 'type' of the intrinsic.  This is used to
-   describe the type modifiers (for example; unsigned) applied to
-   each of the parameters to the intrinsic function.
-
-   Parameter 2 is the name of the intrinsic.  This is appended
-   to `__builtin_aarch64_<name><mode>` to give the intrinsic name
-   as exported to the front-ends.
-
-   Parameter 3 describes how to map from the name to the CODE_FOR_
-   macro holding the RTL pattern for the intrinsic.  This mapping is:
-   0 - CODE_FOR_aarch64_<name><mode>
-   1-9 - CODE_FOR_<name><mode><1-9>
-   10 - CODE_FOR_<name><mode>.  */
-
-  BUILTIN_VDC (COMBINE, combine, 0)
-  BUILTIN_VB (BINOP, pmul, 0)
-  BUILTIN_VALLF (BINOP, fmulx, 0)
-  BUILTIN_VDQF_DF (UNOP, sqrt, 2)
-  BUILTIN_VD_BHSI (BINOP, addp, 0)
-  VAR1 (UNOP, addp, 0, di)
-  BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
-  BUILTIN_VDQ_BHSI (UNOP, clz, 2)
-  BUILTIN_VS (UNOP, ctz, 2)
-  BUILTIN_VB (UNOP, popcount, 2)
-
-  /* Implemented by aarch64_<sur>q<r>shl<mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqshl, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0)
-  BUILTIN_VSDQ_I (BINOP, sqrshl, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0)
-  /* Implemented by aarch64_<su_optab><optab><mode>.  */
-  BUILTIN_VSDQ_I (BINOP, sqadd, 0)
-  BUILTIN_VSDQ_I (BINOPU, uqadd, 0)
-  BUILTIN_VSDQ_I (BINOP, sqsub, 0)
-  BUILTIN_VSDQ_I (BINOPU, uqsub, 0)
-  /* Implemented by aarch64_<sur>qadd<mode>.  */
-  BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0)
-  BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0)
-
-  /* Implemented by aarch64_get_dreg<VSTRUCT:mode><VDC:mode>.  */
-  BUILTIN_VDC (GETREG, get_dregoi, 0)
-  BUILTIN_VDC (GETREG, get_dregci, 0)
-  BUILTIN_VDC (GETREG, get_dregxi, 0)
-  /* Implemented by aarch64_get_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (GETREG, get_qregoi, 0)
-  BUILTIN_VQ (GETREG, get_qregci, 0)
-  BUILTIN_VQ (GETREG, get_qregxi, 0)
-  /* Implemented by aarch64_set_qreg<VSTRUCT:mode><VQ:mode>.  */
-  BUILTIN_VQ (SETREG, set_qregoi, 0)
-  BUILTIN_VQ (SETREG, set_qregci, 0)
-  BUILTIN_VQ (SETREG, set_qregxi, 0)
-  /* Implemented by aarch64_ld<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (LOADSTRUCT, ld2, 0)
-  BUILTIN_VDC (LOADSTRUCT, ld3, 0)
-  BUILTIN_VDC (LOADSTRUCT, ld4, 0)
-  /* Implemented by aarch64_ld<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (LOADSTRUCT, ld2, 0)
-  BUILTIN_VQ (LOADSTRUCT, ld3, 0)
-  BUILTIN_VQ (LOADSTRUCT, ld4, 0)
-  /* Implemented by aarch64_ld<VSTRUCT:nregs>r<VALLDIF:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT, ld2r, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld3r, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT, ld4r, 0)
-  /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>.  */
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld2_lane, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld3_lane, 0)
-  BUILTIN_VALLDIF (LOADSTRUCT_LANE, ld4_lane, 0)
-  /* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>.  */
-  BUILTIN_VDC (STORESTRUCT, st2, 0)
-  BUILTIN_VDC (STORESTRUCT, st3, 0)
-  BUILTIN_VDC (STORESTRUCT, st4, 0)
-  /* Implemented by aarch64_st<VSTRUCT:nregs><VQ:mode>.  */
-  BUILTIN_VQ (STORESTRUCT, st2, 0)
-  BUILTIN_VQ (STORESTRUCT, st3, 0)
-  BUILTIN_VQ (STORESTRUCT, st4, 0)
-
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st2_lane, 0)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st3_lane, 0)
-  BUILTIN_VALLDIF (STORESTRUCT_LANE, st4_lane, 0)
-
-  BUILTIN_VQW (BINOP, saddl2, 0)
-  BUILTIN_VQW (BINOP, uaddl2, 0)
-  BUILTIN_VQW (BINOP, ssubl2, 0)
-  BUILTIN_VQW (BINOP, usubl2, 0)
-  BUILTIN_VQW (BINOP, saddw2, 0)
-  BUILTIN_VQW (BINOP, uaddw2, 0)
-  BUILTIN_VQW (BINOP, ssubw2, 0)
-  BUILTIN_VQW (BINOP, usubw2, 0)
-  /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddl, 0)
-  BUILTIN_VD_BHSI (BINOP, uaddl, 0)
-  BUILTIN_VD_BHSI (BINOP, ssubl, 0)
-  BUILTIN_VD_BHSI (BINOP, usubl, 0)
-  /* Implemented by aarch64_<ANY_EXTEND:su><ADDSUB:optab>w<mode>.  */
-  BUILTIN_VD_BHSI (BINOP, saddw, 0)
-  BUILTIN_VD_BHSI (BINOP, uaddw, 0)
-  BUILTIN_VD_BHSI (BINOP, ssubw, 0)
-  BUILTIN_VD_BHSI (BINOP, usubw, 0)
-  /* Implemented by aarch64_<sur>h<addsub><mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, shadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, shsub, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uhadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uhsub, 0)
-  BUILTIN_VDQ_BHSI (BINOP, srhadd, 0)
-  BUILTIN_VDQ_BHSI (BINOP, urhadd, 0)
-  /* Implemented by aarch64_<sur><addsub>hn<mode>.  */
-  BUILTIN_VQN (BINOP, addhn, 0)
-  BUILTIN_VQN (BINOP, subhn, 0)
-  BUILTIN_VQN (BINOP, raddhn, 0)
-  BUILTIN_VQN (BINOP, rsubhn, 0)
-  /* Implemented by aarch64_<sur><addsub>hn2<mode>.  */
-  BUILTIN_VQN (TERNOP, addhn2, 0)
-  BUILTIN_VQN (TERNOP, subhn2, 0)
-  BUILTIN_VQN (TERNOP, raddhn2, 0)
-  BUILTIN_VQN (TERNOP, rsubhn2, 0)
-
-  BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0)
-  /* Implemented by aarch64_<sur>qmovn<mode>.  */
-  BUILTIN_VSQN_HSDI (UNOP, sqmovn, 0)
-  BUILTIN_VSQN_HSDI (UNOP, uqmovn, 0)
-  /* Implemented by aarch64_s<optab><mode>.  */
-  BUILTIN_VSDQ_I (UNOP, sqabs, 0)
-  BUILTIN_VSDQ_I (UNOP, sqneg, 0)
-
-  /* Implemented by aarch64_sqdml<SBINQOPS:as>l<mode>.  */
-  BUILTIN_VSD_HSI (TERNOP, sqdmlal, 0)
-  BUILTIN_VSD_HSI (TERNOP, sqdmlsl, 0)
-  /* Implemented by aarch64_sqdml<SBINQOPS:as>l_lane<mode>.  */
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_lane, 0)
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_lane, 0)
-  /* Implemented by aarch64_sqdml<SBINQOPS:as>l_laneq<mode>.  */
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlal_laneq, 0)
-  BUILTIN_VSD_HSI (QUADOP_LANE, sqdmlsl_laneq, 0)
-  /* Implemented by aarch64_sqdml<SBINQOPS:as>l_n<mode>.  */
-  BUILTIN_VD_HSI (TERNOP, sqdmlal_n, 0)
-  BUILTIN_VD_HSI (TERNOP, sqdmlsl_n, 0)
-
-  BUILTIN_VQ_HSI (TERNOP, sqdmlal2, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_lane, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_lane, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlal2_laneq, 0)
-  BUILTIN_VQ_HSI (QUADOP_LANE, sqdmlsl2_laneq, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlal2_n, 0)
-  BUILTIN_VQ_HSI (TERNOP, sqdmlsl2_n, 0)
-
-  BUILTIN_VSD_HSI (BINOP, sqdmull, 0)
-  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_lane, 0)
-  BUILTIN_VSD_HSI (TERNOP_LANE, sqdmull_laneq, 0)
-  BUILTIN_VD_HSI (BINOP, sqdmull_n, 0)
-  BUILTIN_VQ_HSI (BINOP, sqdmull2, 0)
-  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_lane, 0)
-  BUILTIN_VQ_HSI (TERNOP_LANE, sqdmull2_laneq, 0)
-  BUILTIN_VQ_HSI (BINOP, sqdmull2_n, 0)
-  /* Implemented by aarch64_sq<r>dmulh<mode>.  */
-  BUILTIN_VSDQ_HSI (BINOP, sqdmulh, 0)
-  BUILTIN_VSDQ_HSI (BINOP, sqrdmulh, 0)
-  /* Implemented by aarch64_sq<r>dmulh_lane<q><mode>.  */
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_lane, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqdmulh_laneq, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_lane, 0)
-  BUILTIN_VSDQ_HSI (TERNOP_LANE, sqrdmulh_laneq, 0)
-
-  BUILTIN_VSDQ_I_DI (BINOP, ashl, 3)
-  /* Implemented by aarch64_<sur>shl<mode>.  */
-  BUILTIN_VSDQ_I_DI (BINOP, sshl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP, srshl, 0)
-  BUILTIN_VSDQ_I_DI (BINOP_UUS, urshl, 0)
-
-  BUILTIN_VDQ_I (SHIFTIMM, ashr, 3)
-  VAR1 (SHIFTIMM, ashr_simd, 0, di)
-  BUILTIN_VDQ_I (SHIFTIMM, lshr, 3)
-  VAR1 (USHIFTIMM, lshr_simd, 0, di)
-  /* Implemented by aarch64_<sur>shr_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTIMM, srshr_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTIMM, urshr_n, 0)
-  /* Implemented by aarch64_<sur>sra_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTACC, ssra_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usra_n, 0)
-  BUILTIN_VSDQ_I_DI (SHIFTACC, srsra_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, ursra_n, 0)
-  /* Implemented by aarch64_<sur>shll_n<mode>.  */
-  BUILTIN_VD_BHSI (SHIFTIMM, sshll_n, 0)
-  BUILTIN_VD_BHSI (USHIFTIMM, ushll_n, 0)
-  /* Implemented by aarch64_<sur>shll2_n<mode>.  */
-  BUILTIN_VQW (SHIFTIMM, sshll2_n, 0)
-  BUILTIN_VQW (SHIFTIMM, ushll2_n, 0)
-  /* Implemented by aarch64_<sur>q<r>shr<u>n_n<mode>.  */
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrun_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrun_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqshrn_n, 0)
-  BUILTIN_VSQN_HSDI (USHIFTIMM, uqshrn_n, 0)
-  BUILTIN_VSQN_HSDI (SHIFTIMM, sqrshrn_n, 0)
-  BUILTIN_VSQN_HSDI (USHIFTIMM, uqrshrn_n, 0)
-  /* Implemented by aarch64_<sur>s<lr>i_n<mode>.  */
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssri_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usri_n, 0)
-  BUILTIN_VSDQ_I_DI (SHIFTINSERT, ssli_n, 0)
-  BUILTIN_VSDQ_I_DI (USHIFTACC, usli_n, 0)
-  /* Implemented by aarch64_<sur>qshl<u>_n<mode>.  */
-  BUILTIN_VSDQ_I (SHIFTIMM_USS, sqshlu_n, 0)
-  BUILTIN_VSDQ_I (SHIFTIMM, sqshl_n, 0)
-  BUILTIN_VSDQ_I (USHIFTIMM, uqshl_n, 0)
-
-  /* Implemented by aarch64_reduc_plus_<mode>.  */
-  BUILTIN_VALL (UNOP, reduc_plus_scal_, 10)
-
-  /* Implemented by reduc_<maxmin_uns>_scal_<mode> (producing scalar).  */
-  BUILTIN_VDQIF (UNOP, reduc_smax_scal_, 10)
-  BUILTIN_VDQIF (UNOP, reduc_smin_scal_, 10)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umax_scal_, 10)
-  BUILTIN_VDQ_BHSI (UNOPU, reduc_umin_scal_, 10)
-  BUILTIN_VDQF (UNOP, reduc_smax_nan_scal_, 10)
-  BUILTIN_VDQF (UNOP, reduc_smin_nan_scal_, 10)
-
-  /* Implemented by <maxmin><mode>3.
-     smax variants map to fmaxnm,
-     smax_nan variants map to fmax.  */
-  BUILTIN_VDQIF (BINOP, smax, 3)
-  BUILTIN_VDQIF (BINOP, smin, 3)
-  BUILTIN_VDQ_BHSI (BINOP, umax, 3)
-  BUILTIN_VDQ_BHSI (BINOP, umin, 3)
-  BUILTIN_VDQF (BINOP, smax_nan, 3)
-  BUILTIN_VDQF (BINOP, smin_nan, 3)
-
-  /* Implemented by aarch64_<maxmin_uns>p<mode>.  */
-  BUILTIN_VDQ_BHSI (BINOP, smaxp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, sminp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, umaxp, 0)
-  BUILTIN_VDQ_BHSI (BINOP, uminp, 0)
-  BUILTIN_VDQF (BINOP, smaxp, 0)
-  BUILTIN_VDQF (BINOP, sminp, 0)
-  BUILTIN_VDQF (BINOP, smax_nanp, 0)
-  BUILTIN_VDQF (BINOP, smin_nanp, 0)
-
-  /* Implemented by <frint_pattern><mode>2.  */
-  BUILTIN_VDQF (UNOP, btrunc, 2)
-  BUILTIN_VDQF (UNOP, ceil, 2)
-  BUILTIN_VDQF (UNOP, floor, 2)
-  BUILTIN_VDQF (UNOP, nearbyint, 2)
-  BUILTIN_VDQF (UNOP, rint, 2)
-  BUILTIN_VDQF (UNOP, round, 2)
-  BUILTIN_VDQF_DF (UNOP, frintn, 2)
-
-  /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
-  VAR1 (UNOP, lbtruncv2sf, 2, v2si)
-  VAR1 (UNOP, lbtruncv4sf, 2, v4si)
-  VAR1 (UNOP, lbtruncv2df, 2, v2di)
-
-  VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si)
-  VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si)
-  VAR1 (UNOPUS, lbtruncuv2df, 2, v2di)
-
-  VAR1 (UNOP, lroundv2sf, 2, v2si)
-  VAR1 (UNOP, lroundv4sf, 2, v4si)
-  VAR1 (UNOP, lroundv2df, 2, v2di)
-  /* Implemented by l<fcvt_pattern><su_optab><GPF:mode><GPI:mode>2.  */
-  VAR1 (UNOP, lroundsf, 2, si)
-  VAR1 (UNOP, lrounddf, 2, di)
-
-  VAR1 (UNOPUS, lrounduv2sf, 2, v2si)
-  VAR1 (UNOPUS, lrounduv4sf, 2, v4si)
-  VAR1 (UNOPUS, lrounduv2df, 2, v2di)
-  VAR1 (UNOPUS, lroundusf, 2, si)
-  VAR1 (UNOPUS, lroundudf, 2, di)
-
-  VAR1 (UNOP, lceilv2sf, 2, v2si)
-  VAR1 (UNOP, lceilv4sf, 2, v4si)
-  VAR1 (UNOP, lceilv2df, 2, v2di)
-
-  VAR1 (UNOPUS, lceiluv2sf, 2, v2si)
-  VAR1 (UNOPUS, lceiluv4sf, 2, v4si)
-  VAR1 (UNOPUS, lceiluv2df, 2, v2di)
-  VAR1 (UNOPUS, lceilusf, 2, si)
-  VAR1 (UNOPUS, lceiludf, 2, di)
-
-  VAR1 (UNOP, lfloorv2sf, 2, v2si)
-  VAR1 (UNOP, lfloorv4sf, 2, v4si)
-  VAR1 (UNOP, lfloorv2df, 2, v2di)
-
-  VAR1 (UNOPUS, lflooruv2sf, 2, v2si)
-  VAR1 (UNOPUS, lflooruv4sf, 2, v4si)
-  VAR1 (UNOPUS, lflooruv2df, 2, v2di)
-  VAR1 (UNOPUS, lfloorusf, 2, si)
-  VAR1 (UNOPUS, lfloorudf, 2, di)
-
-  VAR1 (UNOP, lfrintnv2sf, 2, v2si)
-  VAR1 (UNOP, lfrintnv4sf, 2, v4si)
-  VAR1 (UNOP, lfrintnv2df, 2, v2di)
-  VAR1 (UNOP, lfrintnsf, 2, si)
-  VAR1 (UNOP, lfrintndf, 2, di)
-
-  VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si)
-  VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si)
-  VAR1 (UNOPUS, lfrintnuv2df, 2, v2di)
-  VAR1 (UNOPUS, lfrintnusf, 2, si)
-  VAR1 (UNOPUS, lfrintnudf, 2, di)
-
-  /* Implemented by <optab><fcvt_target><VDQF:mode>2.  */
-  VAR1 (UNOP, floatv2si, 2, v2sf)
-  VAR1 (UNOP, floatv4si, 2, v4sf)
-  VAR1 (UNOP, floatv2di, 2, v2df)
-
-  VAR1 (UNOP, floatunsv2si, 2, v2sf)
-  VAR1 (UNOP, floatunsv4si, 2, v4sf)
-  VAR1 (UNOP, floatunsv2di, 2, v2df)
-
-  VAR5 (UNOPU, bswap, 2, v4hi, v8hi, v2si, v4si, v2di)
-
-  BUILTIN_VB (UNOP, rbit, 0)
-
-  /* Implemented by
-     aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>.  */
-  BUILTIN_VALL (BINOP, zip1, 0)
-  BUILTIN_VALL (BINOP, zip2, 0)
-  BUILTIN_VALL (BINOP, uzp1, 0)
-  BUILTIN_VALL (BINOP, uzp2, 0)
-  BUILTIN_VALL (BINOP, trn1, 0)
-  BUILTIN_VALL (BINOP, trn2, 0)
-
-  /* Implemented by
-     aarch64_frecp<FRECP:frecp_suffix><mode>.  */
-  BUILTIN_GPF (UNOP, frecpe, 0)
-  BUILTIN_GPF (BINOP, frecps, 0)
-  BUILTIN_GPF (UNOP, frecpx, 0)
-
-  BUILTIN_VDQ_SI (UNOP, urecpe, 0)
-
-  BUILTIN_VDQF (UNOP, frecpe, 0)
-  BUILTIN_VDQF (BINOP, frecps, 0)
-
-  /* Implemented by a mixture of abs2 patterns.  Note the DImode builtin is
-     only ever used for the int64x1_t intrinsic, there is no scalar version.  */
-  BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
-  BUILTIN_VDQF (UNOP, abs, 2)
-
-  BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
-  VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
-  VAR1 (BINOP, float_truncate_hi_, 0, v8hf)
-
-  VAR1 (UNOP, float_extend_lo_, 0, v2df)
-  VAR1 (UNOP, float_extend_lo_,  0, v4sf)
-  BUILTIN_VDF (UNOP, float_truncate_lo_, 0)
-
-  /* Implemented by aarch64_ld1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (LOAD1, ld1, 0)
-
-  /* Implemented by aarch64_st1<VALL_F16:mode>.  */
-  BUILTIN_VALL_F16 (STORE1, st1, 0)
-
-  /* Implemented by fma<mode>4.  */
-  BUILTIN_VDQF (TERNOP, fma, 4)
-
-  /* Implemented by aarch64_simd_bsl<mode>.  */
-  BUILTIN_VDQQH (BSL_P, simd_bsl, 0)
-  BUILTIN_VSDQ_I_DI (BSL_U, simd_bsl, 0)
-  BUILTIN_VALLDIF (BSL_S, simd_bsl, 0)
-
-  /* Implemented by aarch64_crypto_aes<op><mode>.  */
-  VAR1 (BINOPU, crypto_aese, 0, v16qi)
-  VAR1 (BINOPU, crypto_aesd, 0, v16qi)
-  VAR1 (UNOPU, crypto_aesmc, 0, v16qi)
-  VAR1 (UNOPU, crypto_aesimc, 0, v16qi)
-
-  /* Implemented by aarch64_crypto_sha1<op><mode>.  */
-  VAR1 (UNOPU, crypto_sha1h, 0, si)
-  VAR1 (BINOPU, crypto_sha1su1, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1c, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1m, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1p, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha1su0, 0, v4si)
-
-  /* Implemented by aarch64_crypto_sha256<op><mode>.  */
-  VAR1 (TERNOPU, crypto_sha256h, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha256h2, 0, v4si)
-  VAR1 (BINOPU, crypto_sha256su0, 0, v4si)
-  VAR1 (TERNOPU, crypto_sha256su1, 0, v4si)
-
-  /* Implemented by aarch64_crypto_pmull<mode>.  */
-  VAR1 (BINOPP, crypto_pmull, 0, di)
-  VAR1 (BINOPP, crypto_pmull, 0, v2di)
-
-  /* Implemented by aarch64_tbl3<mode>.  */
-  VAR1 (BINOP, tbl3, 0, v8qi)
-  VAR1 (BINOP, tbl3, 0, v16qi)
-
-  /* Implemented by aarch64_qtbl3<mode>.  */
-  VAR1 (BINOP, qtbl3, 0, v8qi)
-  VAR1 (BINOP, qtbl3, 0, v16qi)
-
-  /* Implemented by aarch64_qtbl4<mode>.  */
-  VAR1 (BINOP, qtbl4, 0, v8qi)
-  VAR1 (BINOP, qtbl4, 0, v16qi)
-
-  /* Implemented by aarch64_tbx4<mode>.  */
-  VAR1 (TERNOP, tbx4, 0, v8qi)
-  VAR1 (TERNOP, tbx4, 0, v16qi)
-
-  /* Implemented by aarch64_qtbx3<mode>.  */
-  VAR1 (TERNOP, qtbx3, 0, v8qi)
-  VAR1 (TERNOP, qtbx3, 0, v16qi)
-
-  /* Implemented by aarch64_qtbx4<mode>.  */
-  VAR1 (TERNOP, qtbx4, 0, v8qi)
-  VAR1 (TERNOP, qtbx4, 0, v16qi)
-
-  /* Builtins for ARMv8.1 Adv.SIMD instructions.  */
-
-  /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h<mode>.  */
-  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlah, 0)
-  BUILTIN_VSDQ_HSI (TERNOP, sqrdmlsh, 0)
-
-  /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_lane<mode>.  */
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_lane, 0)
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_lane, 0)
-
-  /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>.  */
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
-  BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 59a578f..670c690 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1778,6 +1778,28 @@
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
+;; Convert between fixed-point and floating-point (scalar variant from SIMD)
+
+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3"
+  [(set (match_operand:<GPF:FCVT_TARGET> 0 "register_operand" "=w")
+	(unspec:<GPF:FCVT_TARGET> [(match_operand:GPF 1 "register_operand" "w")
+				   (match_operand:SI 2 "immediate_operand" "i")]
+	 FCVT_F2FIXED))]
+  "TARGET_SIMD"
+  "<FCVT_F2FIXED:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
+  [(set_attr "type" "neon_fp_to_int_<GPF:Vetype><q>")]
+)
+
+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3"
+  [(set (match_operand:<GPI:FCVT_TARGET> 0 "register_operand" "=w")
+	(unspec:<GPI:FCVT_TARGET> [(match_operand:GPI 1 "register_operand" "w")
+				   (match_operand:SI 2 "immediate_operand" "i")]
+	 FCVT_FIXED2F))]
+  "TARGET_SIMD"
+  "<FCVT_FIXED2F:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
+  [(set_attr "type" "neon_int_to_fp_<GPI:Vetype><q>")]
+)
+
 ;; ??? Note that the vectorizer usage of the vec_unpacks_[lo/hi] patterns
 ;; is inconsistent with vector ordering elsewhere in the compiler, in that
 ;; the meaning of HI and LO changes depending on the target endianness.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 223a4cc..d463808 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -75,6 +75,8 @@
     UNSPEC_CRC32H
     UNSPEC_CRC32W
     UNSPEC_CRC32X
+    UNSPEC_FCVTZS_SCALAR
+    UNSPEC_FCVTZU_SCALAR
     UNSPEC_URECPE
     UNSPEC_FRECPE
     UNSPEC_FRECPS
@@ -105,6 +107,7 @@
     UNSPEC_NOP
     UNSPEC_PRLG_STK
     UNSPEC_RBIT
+    UNSPEC_SCVTF_SCALAR
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
     UNSPEC_SISD_USHL
@@ -122,6 +125,7 @@
     UNSPEC_TLSLE24
     UNSPEC_TLSLE32
     UNSPEC_TLSLE48
+    UNSPEC_UCVTF_SCALAR
     UNSPEC_USHL_2S
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
@@ -4626,6 +4630,28 @@
   [(set_attr "type" "f_cvti2f")]
 )
 
+;; Convert between fixed-point and floating-point
+
+(define_insn "<FCVT_F2FIXED_SCALAR:fcvt_fixed_insn><GPF:mode><GPI:mode>3"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(unspec:GPI [(match_operand:GPF 1 "register_operand" "w")
+		     (match_operand:SI 2 "immediate_operand" "i")]
+	 FCVT_F2FIXED_SCALAR))]
+  "TARGET_FLOAT"
+  "<FCVT_F2FIXED_SCALAR:fcvt_fixed_insn>\t%<w1>0, %<s>1, #%2"
+  [(set_attr "type" "f_cvtf2i")]
+)
+
+(define_insn "<FCVT_FIXED2F_SCALAR:fcvt_fixed_insn><GPI:mode><GPF:mode>3"
+  [(set (match_operand:GPF 0 "register_operand" "=w")
+	(unspec:GPF [(match_operand:GPI 1 "register_operand" "r")
+		     (match_operand:SI 2 "immediate_operand" "i")]
+	 FCVT_FIXED2F_SCALAR))]
+  "TARGET_FLOAT"
+  "<FCVT_FIXED2F_SCALAR:fcvt_fixed_insn>\t%<s>0, %<w1>1, #%2"
+  [(set_attr "type" "f_cvti2f")]
+)
+
 ;; -------------------------------------------------------------------
 ;; Floating-point arithmetic
 ;; -------------------------------------------------------------------
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index e563e3d..012a11a 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6073,54 +6073,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-#define vcvtd_n_f64_s64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int64_t a_ = (a);                                                \
-       float64_t result;                                                \
-       __asm__ ("scvtf %d0,%d1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtd_n_f64_u64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64_t a_ = (a);                                               \
-       float64_t result;                                                \
-       __asm__ ("ucvtf %d0,%d1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtd_n_s64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64_t a_ = (a);                                              \
-       int64_t result;                                                  \
-       __asm__ ("fcvtzs %d0,%d1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtd_n_u64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64_t a_ = (a);                                              \
-       uint64_t result;                                                 \
-       __asm__ ("fcvtzu %d0,%d1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 #define vcvtq_n_f32_s32(a, b)                                           \
   __extension__                                                         \
     ({                                                                  \
@@ -6217,54 +6169,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-#define vcvts_n_f32_s32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int32_t a_ = (a);                                                \
-       float32_t result;                                                \
-       __asm__ ("scvtf %s0,%s1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_f32_u32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32_t a_ = (a);                                               \
-       float32_t result;                                                \
-       __asm__ ("ucvtf %s0,%s1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_s32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32_t a_ = (a);                                              \
-       int32_t result;                                                  \
-       __asm__ ("fcvtzs %s0,%s1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_u32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32_t a_ = (a);                                              \
-       uint32_t result;                                                 \
-       __asm__ ("fcvtzu %s0,%s1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcvtx_f32_f64 (float64x2_t a)
 {
@@ -12830,6 +12734,58 @@ vcvt_high_f64_f32 (float32x4_t __a)
   return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
 }
 
+/* vcvt (<u>fixed-point -> float).  */
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_n_f64_s64 (int64_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfdidf (__a, __b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_n_f64_u64 (uint64_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfdidf_sus (__a, __b);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_n_f32_s32 (int32_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfsisf (__a, __b);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_n_f32_u32 (uint32_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfsisf_sus (__a, __b);
+}
+
+/* vcvt (float -> <u>fixed-point).  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtd_n_s64_f64 (float64_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsdfdi (__a, __b);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtd_n_u64_f64 (float64_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzudfdi_uss (__a, __b);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvts_n_s32_f32 (float32_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzssfsi (__a, __b);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvts_n_u32_f32 (float32_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzusfsi_uss (__a, __b);
+}
+
 /* vcvt  (<u>int -> float)  */
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d9bd391..4ebd6f7 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -208,6 +208,8 @@
     UNSPEC_ASHIFT_SIGNED	; Used in aarch-simd.md.
     UNSPEC_ASHIFT_UNSIGNED	; Used in aarch64-simd.md.
     UNSPEC_ABS		; Used in aarch64-simd.md.
+    UNSPEC_FCVTZS	; Used in aarch64-simd.md.
+    UNSPEC_FCVTZU	; Used in aarch64-simd.md.
     UNSPEC_FMAX		; Used in aarch64-simd.md.
     UNSPEC_FMAXNMV	; Used in aarch64-simd.md.
     UNSPEC_FMAXV	; Used in aarch64-simd.md.
@@ -216,8 +218,10 @@
     UNSPEC_FMINV	; Used in aarch64-simd.md.
     UNSPEC_FADDV	; Used in aarch64-simd.md.
     UNSPEC_ADDV		; Used in aarch64-simd.md.
+    UNSPEC_SCVTF	; Used in aarch64-simd.md.
     UNSPEC_SMAXV	; Used in aarch64-simd.md.
     UNSPEC_SMINV	; Used in aarch64-simd.md.
+    UNSPEC_UCVTF	; Used in aarch64-simd.md.
     UNSPEC_UMAXV	; Used in aarch64-simd.md.
     UNSPEC_UMINV	; Used in aarch64-simd.md.
     UNSPEC_SHADD	; Used in aarch64-simd.md.
@@ -648,8 +652,11 @@
 (define_mode_attr atomic_sfx
   [(QI "b") (HI "h") (SI "") (DI "")])
 
-(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si") (SF "si") (DF "di")])
-(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI") (SF "SI") (DF "DI")])
+(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
+			       (SF "si") (DF "di") (SI "sf") (DI "df")])
+(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
+			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")])
+
 
 ;; for the inequal width integer to fp conversions
 (define_mode_attr fcvt_iesize [(SF "di") (DF "si")])
@@ -1001,6 +1008,11 @@
 (define_int_iterator FCVT [UNSPEC_FRINTZ UNSPEC_FRINTP UNSPEC_FRINTM
 			    UNSPEC_FRINTA UNSPEC_FRINTN])
 
+(define_int_iterator FCVT_F2FIXED [UNSPEC_FCVTZS UNSPEC_FCVTZU])
+(define_int_iterator FCVT_FIXED2F [UNSPEC_SCVTF UNSPEC_UCVTF])
+(define_int_iterator FCVT_F2FIXED_SCALAR [UNSPEC_FCVTZS_SCALAR UNSPEC_FCVTZU_SCALAR])
+(define_int_iterator FCVT_FIXED2F_SCALAR [UNSPEC_SCVTF_SCALAR UNSPEC_UCVTF_SCALAR])
+
 (define_int_iterator FRECP [UNSPEC_FRECPE UNSPEC_FRECPX])
 
 (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
@@ -1137,6 +1149,15 @@
 			       (UNSPEC_FRINTP "ceil") (UNSPEC_FRINTM "floor")
 			       (UNSPEC_FRINTN "frintn")])
 
+(define_int_attr fcvt_fixed_insn [(UNSPEC_SCVTF "scvtf")
+				  (UNSPEC_SCVTF_SCALAR "scvtf")
+				  (UNSPEC_UCVTF "ucvtf")
+				  (UNSPEC_UCVTF_SCALAR "ucvtf")
+				  (UNSPEC_FCVTZS "fcvtzs")
+				  (UNSPEC_FCVTZS_SCALAR "fcvtzs")
+				  (UNSPEC_FCVTZU "fcvtzu")
+				  (UNSPEC_FCVTZU_SCALAR "fcvtzu")])
+
 (define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip")
 			    (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
 			    (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])
diff --git a/gcc/config/aarch64/t-aarch64 b/gcc/config/aarch64/t-aarch64
index 778e15c..2473776 100644
--- a/gcc/config/aarch64/t-aarch64
+++ b/gcc/config/aarch64/t-aarch64
@@ -34,7 +34,7 @@ aarch64-builtins.o: $(srcdir)/config/aarch64/aarch64-builtins.c $(CONFIG_H) \
   $(SYSTEM_H) coretypes.h $(TM_H) \
   $(RTL_H) $(TREE_H) expr.h $(TM_P_H) $(RECOG_H) langhooks.h \
   $(DIAGNOSTIC_CORE_H) $(OPTABS_H) \
-  $(srcdir)/config/aarch64/aarch64-simd-builtins.def \
+  $(srcdir)/config/aarch64/aarch64-builtins.def \
   $(srcdir)/config/aarch64/aarch64-simd-builtin-types.def \
   aarch64-builtin-iterators.h
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
-- 
1.9.1






^ permalink raw reply	[flat|nested] 28+ messages in thread

* [AArch64, 3/6] Reimplement frsqrte intrinsics
       [not found]   ` <5743029C.60208@foss.arm.com>
       [not found]     ` <574302DA.6090803@foss.arm.com>
@ 2016-05-24  8:24     ` Jiong Wang
  2016-05-27 14:09       ` James Greenhalgh
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-05-24  8:24 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 760 bytes --]

These intrinsics were implemented before the instruction pattern
"aarch64_rsqrte<mode>" was added, so they were implemented through
inline assembly.

This migrates the implementation to builtins.

gcc/
2016-05-23  Jiong Wang <jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.def (rsqrte): New builtins 
for modes
         VALLF.
         * config/aarch64/aarch64-simd.md (aarch64_rsqrte_<mode>2): 
Rename to
"aarch64_rsqrte<mode>".
         * config/aarch64/aarch64.c (get_rsqrte_type): Update gen* name.
         * config/aarch64/arm_neon.h (vrsqrts_f32): Remove inline 
assembly.  Use
builtin.
         (vrsqrted_f64): Likewise.
         (vrsqrte_f32): Likewise.
         (vrsqrteq_f32): Likewise.
         (vrsqrteq_f64): Likewise.


[-- Attachment #2: 0003-3.patch --]
[-- Type: text/x-patch, Size: 5421 bytes --]

From 4921317940fe69353cd057cc329943350bc45adf Mon Sep 17 00:00:00 2001
From: "Jiong.Wang" <jiong.wang@arm.com>
Date: Mon, 23 May 2016 12:12:19 +0100
Subject: [PATCH 3/6] 3

---
 gcc/config/aarch64/aarch64-builtins.def |  3 ++
 gcc/config/aarch64/aarch64-simd.md      |  2 +-
 gcc/config/aarch64/aarch64.c            | 10 ++--
 gcc/config/aarch64/arm_neon.h           | 87 ++++++++++++---------------------
 4 files changed, 41 insertions(+), 61 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.def b/gcc/config/aarch64/aarch64-builtins.def
index 5e6280c..32bcd06 100644
--- a/gcc/config/aarch64/aarch64-builtins.def
+++ b/gcc/config/aarch64/aarch64-builtins.def
@@ -459,3 +459,6 @@
   BUILTIN_VALLI (BINOP_SUS, ucvtf, 3)
   BUILTIN_VALLF (BINOP, fcvtzs, 3)
   BUILTIN_VALLF (BINOP_USS, fcvtzu, 3)
+
+  /* Implemented by aarch64_rsqrte<mode>.  */
+  BUILTIN_VALLF (UNOP, rsqrte, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 66ca2de..c34d21e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -382,7 +382,7 @@
   [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
 )
 
-(define_insn "aarch64_rsqrte_<mode>2"
+(define_insn "aarch64_rsqrte<mode>"
   [(set (match_operand:VALLF 0 "register_operand" "=w")
 	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
 		     UNSPEC_RSQRTE))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bd45a7d..18a8c1e 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7358,11 +7358,11 @@ get_rsqrte_type (machine_mode mode)
 {
   switch (mode)
   {
-    case DFmode:   return gen_aarch64_rsqrte_df2;
-    case SFmode:   return gen_aarch64_rsqrte_sf2;
-    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
-    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
-    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    case DFmode:   return gen_aarch64_rsqrtedf;
+    case SFmode:   return gen_aarch64_rsqrtesf;
+    case V2DFmode: return gen_aarch64_rsqrtev2df;
+    case V2SFmode: return gen_aarch64_rsqrtev2sf;
+    case V4SFmode: return gen_aarch64_rsqrtev4sf;
     default: gcc_unreachable ();
   }
 }
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index bd712fc..4c9976e 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9163,17 +9163,6 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
        result;                                                          \
      })
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrte_f32 (float32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("frsqrte %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
 vrsqrte_f64 (float64x1_t a)
 {
@@ -9196,39 +9185,6 @@ vrsqrte_u32 (uint32x2_t a)
   return result;
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrted_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("frsqrte %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_f32 (float32x4_t a)
-{
-  float32x4_t result;
-  __asm__ ("frsqrte %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrteq_f64 (float64x2_t a)
-{
-  float64x2_t result;
-  __asm__ ("frsqrte %0.2d,%1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vrsqrteq_u32 (uint32x4_t a)
 {
@@ -9240,17 +9196,6 @@ vrsqrteq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtes_f32 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("frsqrte %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vrsqrts_f32 (float32x2_t a, float32x2_t b)
 {
@@ -21504,6 +21449,38 @@ vrshrd_n_u64 (uint64_t __a, const int __b)
   return __builtin_aarch64_urshr_ndi_uus (__a, __b);
 }
 
+/* vrsqrte.  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vrsqrtes_f32 (float32_t a)
+{
+  return __builtin_aarch64_rsqrtesf (a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vrsqrted_f64 (float64_t a)
+{
+  return __builtin_aarch64_rsqrtedf (a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrte_f32 (float32x2_t a)
+{
+  return __builtin_aarch64_rsqrtev2sf (a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrteq_f32 (float32x4_t a)
+{
+  return __builtin_aarch64_rsqrtev4sf (a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vrsqrteq_f64 (float64x2_t a)
+{
+  return __builtin_aarch64_rsqrtev2df (a);
+}
+
 /* vrsra */
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-- 
1.9.1




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 1/6] Reimplement scalar fixed-point intrinsics
  2016-05-24  8:24 ` [AArch64, 1/6] Reimplement scalar fixed-point intrinsics Jiong Wang
@ 2016-05-27 13:50   ` James Greenhalgh
  2016-05-27 20:01     ` Jiong Wang
  0 siblings, 1 reply; 28+ messages in thread
From: James Greenhalgh @ 2016-05-27 13:50 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Tue, May 24, 2016 at 09:23:36AM +0100, Jiong Wang wrote:
> This patch reimplement scalar intrinsics for conversion between floating-
> point and fixed-point.
> 
> Previously, all such intrinsics are implemented through inline assembly.
> This patch added RTL pattern for these operations that those intrinsics
> can be implemented through builtins.
> 
> gcc/
> 2016-05-23  Jiong Wang<jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New
>         (TYPES_BINOP_SUS): Likewise.
>         (aarch64_simd_builtin_data): Update include file name.
>         (aarch64_builtins): Likewise.
>         * config/aarch64/aarch64-simd-builtins.def: Rename to
>         aarch64-builtins.def.

Why? We already have some number of intrinsics in here that are not
strictly SIMD, but I don't see the value in the rename?

> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 223a4cc..d463808 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -75,6 +75,8 @@
>      UNSPEC_CRC32H
>      UNSPEC_CRC32W
>      UNSPEC_CRC32X
> +    UNSPEC_FCVTZS_SCALAR
> +    UNSPEC_FCVTZU_SCALAR

Why do we need special "scalar" versions of the unspec? The operation is
semantically the same between the scalar and vector versions.

>      UNSPEC_URECPE
>      UNSPEC_FRECPE
>      UNSPEC_FRECPS
> @@ -105,6 +107,7 @@
>      UNSPEC_NOP
>      UNSPEC_PRLG_STK
>      UNSPEC_RBIT
> +    UNSPEC_SCVTF_SCALAR
>      UNSPEC_SISD_NEG
>      UNSPEC_SISD_SSHL
>      UNSPEC_SISD_USHL
> @@ -122,6 +125,7 @@
>      UNSPEC_TLSLE24
>      UNSPEC_TLSLE32
>      UNSPEC_TLSLE48
> +    UNSPEC_UCVTF_SCALAR

> +(define_int_iterator FCVT_F2FIXED [UNSPEC_FCVTZS UNSPEC_FCVTZU])
> +(define_int_iterator FCVT_FIXED2F [UNSPEC_SCVTF UNSPEC_UCVTF])
> +(define_int_iterator FCVT_F2FIXED_SCALAR [UNSPEC_FCVTZS_SCALAR UNSPEC_FCVTZU_SCALAR])
> +(define_int_iterator FCVT_FIXED2F_SCALAR [UNSPEC_SCVTF_SCALAR UNSPEC_UCVTF_SCALAR])

Again, do we need the "SCALAR" versions at all?

Thanks,
James

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 3/6] Reimplement frsqrte intrinsics
  2016-05-24  8:24     ` [AArch64, 3/6] Reimplement frsqrte intrinsics Jiong Wang
@ 2016-05-27 14:09       ` James Greenhalgh
  2016-05-27 19:59         ` Jiong Wang
  0 siblings, 1 reply; 28+ messages in thread
From: James Greenhalgh @ 2016-05-27 14:09 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Tue, May 24, 2016 at 09:23:48AM +0100, Jiong Wang wrote:
> These intrinsics were implemented before the instruction pattern
> "aarch64_rsqrte<mode>" added, that these intrinsics were implemented through
> inline assembly.
> 
> This mirgrate the implementation to builtin.
> 
> gcc/
> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (rsqrte): New builtins
> for modes
>         VALLF.
>         * config/aarch64/aarch64-simd.md (aarch64_rsqrte_<mode>2):
> Rename to
> "aarch64_rsqrte<mode>".
>         * config/aarch64/aarch64.c (get_rsqrte_type): Update gen* name.
>         * config/aarch64/arm_neon.h (vrsqrts_f32): Remove inline
> assembly.  Use
> builtin.
>         (vrsqrted_f64): Likewise.
>         (vrsqrte_f32): Likewise.
>         (vrsqrteq_f32): Likewise.
>         (vrsqrteq_f64): Likewise.

This ChangeLog is not in the correct form. 

It looks like you are missing vrsqrte_f64, could you please add that?

Thanks,
James

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 4/6] Reimplement frsqrts intrinsics
  2016-05-24  8:24       ` [AArch64, 4/6] Reimplement frsqrts intrinsics Jiong Wang
@ 2016-05-27 14:12         ` James Greenhalgh
  2016-05-27 15:12           ` Jiong Wang
  0 siblings, 1 reply; 28+ messages in thread
From: James Greenhalgh @ 2016-05-27 14:12 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Tue, May 24, 2016 at 09:23:53AM +0100, Jiong Wang wrote:
> Similar as [3/6], these intrinsics were implemented before the instruction
> pattern "aarch64_rsqrts<mode>" added, that these intrinsics were implemented
> through inline assembly.
> 
> This mirgrate the implementation to builtin.
> 
> gcc/
> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (rsqrts): New builtins
> for modes
>         VALLF.
>         * config/aarch64/aarch64-simd.md (aarch64_rsqrts_<mode>3):
> Rename to
> "aarch64_rsqrts<mode>".
>         * config/aarch64/aarch64.c (get_rsqrts_type): Update gen* name.
>         * config/aarch64/arm_neon.h (vrsqrtss_f32): Remove inline
> assembly.  Use
> builtin.
>         (vrsqrtsd_f64): Likewise.
>         (vrsqrts_f32): Likewise.
>         (vrsqrtsq_f32): Likewise.
>         (vrsqrtsq_f64): Likewise.

This ChangeLog format is incorrect.

It looks like you're missing vrsqrts_f64, could you please add that?

Thanks,
James

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns
  2016-05-24  8:24         ` [AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns Jiong Wang
@ 2016-05-27 14:41           ` James Greenhalgh
  2016-05-27 14:52             ` Jiong Wang
  0 siblings, 1 reply; 28+ messages in thread
From: James Greenhalgh @ 2016-05-27 14:41 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Tue, May 24, 2016 at 09:23:58AM +0100, Jiong Wang wrote:
> These intrinsics were implemented before "fabd<mode>_3" introduces.
> Meanwhile
> the patterns "fabd<mode>_3" and "*fabd_scalar<mode>3" can be merged into a
> single "fabd<mode>3" using VALLF.
> 
> This patch migrate the implementation to builtins backed by this pattern.
> 
> gcc/
> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (fabd): New builtins
> for modes
>         VALLF.
>         * config/aarch64/aarch64-simd.md (fabd<mode>_3): Extend
> modes from VDQF
>         to VALLF.
>         "*fabd_scalar<mode>3): Delete.
>         * config/aarch64/arm_neon.h (vabds_f32): Remove inline assembly.
>         Use builtin.
>         (vabdd_f64): Likewise.
>         (vabd_f32): Likewise.
>         (vabdq_f32): Likewise.
>         (vabdq_f64): Likewise.
> 

This ChangeLog format is wrong.

It looks like you've missed vabd_f64, could you please add that?

> From 9bafb58055d4e379df7b626acd6aa80bdb0d4b22 Mon Sep 17 00:00:00 2001
> From: "Jiong.Wang" <jiong.wang@arm.com>
> Date: Mon, 23 May 2016 12:12:53 +0100
> Subject: [PATCH 5/6] 5
> 
> ---
>  gcc/config/aarch64/aarch64-builtins.def |  3 ++
>  gcc/config/aarch64/aarch64-simd.md      | 23 +++------
>  gcc/config/aarch64/arm_neon.h           | 87 ++++++++++++---------------------
>  3 files changed, 42 insertions(+), 71 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64-builtins.def b/gcc/config/aarch64/aarch64-builtins.def
> index 1955d17..40baebe 100644
> --- a/gcc/config/aarch64/aarch64-builtins.def
> +++ b/gcc/config/aarch64/aarch64-builtins.def
> @@ -465,3 +465,6 @@
>  
>    /* Implemented by aarch64_rsqrts<mode>.  */
>    BUILTIN_VALLF (BINOP, rsqrts, 0)
> +
> +  /* Implemented by fabd<mode>_3.  */

This comment is incorrect, it should say "Implemented by fabd<mode>3.",
without the underscore.

> +  BUILTIN_VALLF (BINOP, fabd, 3)

Thanks,
James

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes
  2016-05-24  8:24           ` [AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes Jiong Wang
@ 2016-05-27 14:45             ` James Greenhalgh
  2016-05-27 14:51               ` Jiong Wang
  0 siblings, 1 reply; 28+ messages in thread
From: James Greenhalgh @ 2016-05-27 14:45 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Tue, May 24, 2016 at 09:24:03AM +0100, Jiong Wang wrote:
> These intrinsics was implemented by inline assembly using "faddp"
> instruction.
> There was a pattern "aarch64_addpv4sf" which supportsV4SF mode only
> while we can
> extend this pattern to support VDQF mode, then we can reimplement these
> intrinsics through builtlins.
> 
> gcc/
> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (faddp): New builtins
> for modes in VDQF.
>         * config/aarch64/aarch64-simd.md (aarch64_faddp<mode>): New.
>         (arch64_addpv4sf): Delete.
>         (reduc_plus_scal_v4sf): Use "gen_aarch64_faddpv4sf" instead of
>         "gen_aarch64_addpv4sf".
>         * gcc/config/aarch64/iterators.md (UNSPEC_FADDP): New.
>         * config/aarch64/arm_neon.h (vpadd_f32): Remove inline
> assembly.  Use
>         builtin.
>         (vpaddq_f32): Likewise.
>         (vpaddq_f64): Likewise.

This ChangeLog format is incorrect.

You've missed vpaddd_f64 and vpadds_f32, could you add those?

Thanks,
James

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes
  2016-05-27 14:45             ` James Greenhalgh
@ 2016-05-27 14:51               ` Jiong Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jiong Wang @ 2016-05-27 14:51 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches



On 27/05/16 14:42, James Greenhalgh wrote:
> On Tue, May 24, 2016 at 09:24:03AM +0100, Jiong Wang wrote:
>> These intrinsics was implemented by inline assembly using "faddp"
>> instruction.
>> There was a pattern "aarch64_addpv4sf" which supportsV4SF mode only
>> while we can
>> extend this pattern to support VDQF mode, then we can reimplement these
>> intrinsics through builtlins.
>>
>> gcc/
>> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
>>
>>          * config/aarch64/aarch64-builtins.def (faddp): New builtins
>> for modes in VDQF.
>>          * config/aarch64/aarch64-simd.md (aarch64_faddp<mode>): New.
>>          (arch64_addpv4sf): Delete.
>>          (reduc_plus_scal_v4sf): Use "gen_aarch64_faddpv4sf" instead of
>>          "gen_aarch64_addpv4sf".
>>          * gcc/config/aarch64/iterators.md (UNSPEC_FADDP): New.
>>          * config/aarch64/arm_neon.h (vpadd_f32): Remove inline
>> assembly.  Use
>>          builtin.
>>          (vpaddq_f32): Likewise.
>>          (vpaddq_f64): Likewise.
> This ChangeLog format is incorrect.
>
> You've missed vpaddd_f64 and vpadds_f32, could you add those?

vpaddd_f64 is already there without inline assembly.


This patch cleans up those intrinsics with symmetric vector input and
output.
vpadds_f32 looks to me to be doing a reduce job: the return value is scalar
instead of vector, thus it
can't fit well into the touched pattern. I can clean it up with a separate
patch. Is this OK?


>
> Thanks,
> James
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns
  2016-05-27 14:41           ` James Greenhalgh
@ 2016-05-27 14:52             ` Jiong Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jiong Wang @ 2016-05-27 14:52 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, nd



On 27/05/16 14:31, James Greenhalgh wrote:
> On Tue, May 24, 2016 at 09:23:58AM +0100, Jiong Wang wrote:
>> These intrinsics were implemented before "fabd<mode>_3" introduces.
>> Meanwhile
>> the patterns "fabd<mode>_3" and "*fabd_scalar<mode>3" can be merged into a
>> single "fabd<mode>3" using VALLF.
>>
>> This patch migrate the implementation to builtins backed by this pattern.
>>
>> gcc/
>> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
>>
>>          * config/aarch64/aarch64-builtins.def (fabd): New builtins
>> for modes
>>          VALLF.
>>          * config/aarch64/aarch64-simd.md (fabd<mode>_3): Extend
>> modes from VDQF
>>          to VALLF.
>>          "*fabd_scalar<mode>3): Delete.
>>          * config/aarch64/arm_neon.h (vabds_f32): Remove inline assembly.
>>          Use builtin.
>>          (vabdd_f64): Likewise.
>>          (vabd_f32): Likewise.
>>          (vabdq_f32): Likewise.
>>          (vabdq_f64): Likewise.
>>
> This ChangeLog format is wrong.
>
> It looks like you've missed vabd_f64, could you please add that?

vabd_f64 was not there before this patch, so I haven't touched it.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 4/6] Reimplement frsqrts intrinsics
  2016-05-27 14:12         ` James Greenhalgh
@ 2016-05-27 15:12           ` Jiong Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jiong Wang @ 2016-05-27 15:12 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches, nd



On 27/05/16 14:25, James Greenhalgh wrote:
> On Tue, May 24, 2016 at 09:23:53AM +0100, Jiong Wang wrote:
>> Similar as [3/6], these intrinsics were implemented before the instruction
>> pattern "aarch64_rsqrts<mode>" added, that these intrinsics were implemented
>> through inline assembly.
>>
>> This mirgrate the implementation to builtin.
>>
>> gcc/
>> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
>>
>>          * config/aarch64/aarch64-builtins.def (rsqrts): New builtins
>> for modes
>>          VALLF.
>>          * config/aarch64/aarch64-simd.md (aarch64_rsqrts_<mode>3):
>> Rename to
>> "aarch64_rsqrts<mode>".
>>          * config/aarch64/aarch64.c (get_rsqrts_type): Update gen* name.
>>          * config/aarch64/arm_neon.h (vrsqrtss_f32): Remove inline
>> assembly.  Use
>> builtin.
>>          (vrsqrtsd_f64): Likewise.
>>          (vrsqrts_f32): Likewise.
>>          (vrsqrtsq_f32): Likewise.
>>          (vrsqrtsq_f64): Likewise.
> This ChangeLog format is incorrect.
>
> It looks like you're missing vrsqrts_f64, could you please add that?

I couldn't find vrsqrts_f64 before this rewrite patch.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 3/6] Reimplement frsqrte intrinsics
  2016-05-27 14:09       ` James Greenhalgh
@ 2016-05-27 19:59         ` Jiong Wang
  0 siblings, 0 replies; 28+ messages in thread
From: Jiong Wang @ 2016-05-27 19:59 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches



On 27/05/16 14:24, James Greenhalgh wrote:
> On Tue, May 24, 2016 at 09:23:48AM +0100, Jiong Wang wrote:
>> These intrinsics were implemented before the instruction pattern
>> "aarch64_rsqrte<mode>" added, that these intrinsics were implemented through
>> inline assembly.
>>
>> This mirgrate the implementation to builtin.
>>
>> gcc/
>> 2016-05-23  Jiong Wang <jiong.wang@arm.com>
>>
>>          * config/aarch64/aarch64-builtins.def (rsqrte): New builtins
>> for modes
>>          VALLF.
>>          * config/aarch64/aarch64-simd.md (aarch64_rsqrte_<mode>2):
>> Rename to
>> "aarch64_rsqrte<mode>".
>>          * config/aarch64/aarch64.c (get_rsqrte_type): Update gen* name.
>>          * config/aarch64/arm_neon.h (vrsqrts_f32): Remove inline
>> assembly.  Use
>> builtin.
>>          (vrsqrted_f64): Likewise.
>>          (vrsqrte_f32): Likewise.
>>          (vrsqrteq_f32): Likewise.
>>          (vrsqrteq_f64): Likewise.
> This ChangeLog is not in the correct form.
>
> It looks like you are missing vrsqrte_f64, could you please add that?

vrsqrte_f64 wasn't cleaned up in this patch because its input type is
float64x1, which
caused trouble when fitting it into the aarch64 builtin infrastructure
cleanly.

I might have missed something; I will double check this.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [AArch64, 1/6] Reimplement scalar fixed-point intrinsics
  2016-05-27 13:50   ` James Greenhalgh
@ 2016-05-27 20:01     ` Jiong Wang
       [not found]       ` <6af07de4-8179-c0bf-410c-317ef52876dd@foss.arm.com>
  2016-06-06 13:39       ` [v2][AArch64, 1/6] Reimplement scalar fixed-point intrinsics Jiong Wang
  0 siblings, 2 replies; 28+ messages in thread
From: Jiong Wang @ 2016-05-27 20:01 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches



On 27/05/16 14:03, James Greenhalgh wrote:
> On Tue, May 24, 2016 at 09:23:36AM +0100, Jiong Wang wrote:
>>          * config/aarch64/aarch64-simd-builtins.def: Rename to
>>          aarch64-builtins.def.
> Why? We already have some number of intrinsics in here that are not
> strictly SIMD, but I don't see the value in the rename?

Mostly because this builtin infrastructure is handy, so I want to
implement some vfp builtins in this .def file instead of implementing those
raw structures inside aarch64-builtins.c.

And there may be more and more such builtins in the future, so I renamed
this file.


Is this OK?

>> +(define_int_iterator FCVT_FIXED2F_SCALAR [UNSPEC_SCVTF_SCALAR UNSPEC_UCVTF_SCALAR])
> Again, do we need the "SCALAR" versions at all?

That's because for scalar fixed-point conversion, we have two types of
instructions to support this.

   * scalar instruction from vfp
   * scalar variant instruction from simd

One is guarded by TARGET_FLOAT, the other is guarded by TARGET_SIMD, and
their instruction format is different, so I want to keep them in
aarch64.md and aarch64-simd.md separately.

The other reason is these two use different patterns:

   * vfp scalar support conversion between different size, for example,
     SF->DI, DF->SI, so it's using two mode iterators, GPI and GPF, and
     is utilizing the product of the two to cover all supported
     conversions, sfsi, sfdi, dfsi, dfdi, sisf, sidf, disf, didf.

   * simd scalar only supports conversion between the same size, so a single
     mode iterator is used to cover sfsi, sisf, dfdi, didf.

For intrinsics implementation, I used builtins backed by the vfp scalar
instructions instead of simd scalar, which requires the input to sit inside
a vector register.

I remember the simd scalar pattern was here because it's needed anyway
by patch [2/6], which extends its modes naturally to vector modes. I was
thinking it's better to keep the simd scalar variant with this scalar
intrinsics enable patch.

Is this OK?

Thanks.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [v2][AArch64, 2/6] Reimplement vector fixed-point intrinsics
       [not found]       ` <6af07de4-8179-c0bf-410c-317ef52876dd@foss.arm.com>
@ 2016-06-06 13:39         ` Jiong Wang
  2016-06-08  9:51           ` James Greenhalgh
       [not found]         ` <7cb1e234-46f9-76b4-aefd-1eacabfb4ca7@foss.arm.com>
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-06-06 13:39 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1208 bytes --]

Based on top of [1/6], this patch reimplements vector intrinsics for
conversion between floating-point and fixed-point.

gcc/
2016-06-06  Jiong Wang<jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.def (scvtf): Register vector modes.
         (ucvtf): Likewise.
         (fcvtzs): Likewise.
         (fcvtzu): Likewise.
         * config/aarch64/aarch64-simd.md
         (<FCVT_F2FIXED:fcvt_fixed_insn><VDQF:mode>3): New.
         (<FCVT_FIXED2F:fcvt_fixed_insn><VDQ_SDI:mode>3): Likewise.
         * config/aarch64/arm_neon.h (vcvt_n_f32_s32): Remove inline assembly.
         Use builtin.
         (vcvt_n_f32_u32): Likewise.
         (vcvt_n_s32_f32): Likewise.
         (vcvt_n_u32_f32): Likewise.
         (vcvtq_n_f32_s32): Likewise.
         (vcvtq_n_f32_u32): Likewise.
         (vcvtq_n_f64_s64): Likewise.
         (vcvtq_n_f64_u64): Likewise.
         (vcvtq_n_s32_f32): Likewise.
         (vcvtq_n_s64_f64): Likewise.
         (vcvtq_n_u32_f32): Likewise.
         (vcvtq_n_u64_f64): Likewise.
         * config/aarch64/iterators.md (VDQ_SDI): New mode iterator.
         (VSDQ_SDI): Likewise.
         (fcvt_target): Support V4DI, V4SI and V2SI.
         (FCVT_TARGET): Likewise.


[-- Attachment #2: 2.patch --]
[-- Type: text/x-patch, Size: 15658 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 0b2f0631c740558c62cffe5715eaffa5ad0557a9..a7ea3c4b8ea7d695b12e6b0291e6ff815826a641 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -447,7 +447,7 @@
   BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
 
   /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3.  */
-  BUILTIN_GPI (BINOP, scvtf, 3)
-  BUILTIN_GPI (BINOP_SUS, ucvtf, 3)
-  BUILTIN_GPF (BINOP, fcvtzs, 3)
-  BUILTIN_GPF (BINOP_USS, fcvtzu, 3)
+  BUILTIN_VSDQ_SDI (BINOP, scvtf, 3)
+  BUILTIN_VSDQ_SDI (BINOP_SUS, ucvtf, 3)
+  BUILTIN_VALLF (BINOP, fcvtzs, 3)
+  BUILTIN_VALLF (BINOP_USS, fcvtzu, 3)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6ea35bf487eaa47dd78742e3eae7507b6875ba1a..d2a6cc27de9c571e84cf59713e5fcb9c450f83a3 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1778,6 +1778,28 @@
   [(set_attr "type" "neon_fp_cvt_widen_s")]
 )
 
+;; Convert between fixed-point and floating-point (vector modes)
+
+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><VDQF:mode>3"
+  [(set (match_operand:<VDQF:FCVT_TARGET> 0 "register_operand" "=w")
+	(unspec:<VDQF:FCVT_TARGET> [(match_operand:VDQF 1 "register_operand" "w")
+				    (match_operand:SI 2 "immediate_operand" "i")]
+	 FCVT_F2FIXED))]
+  "TARGET_SIMD"
+  "<FCVT_F2FIXED:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
+  [(set_attr "type" "neon_fp_to_int_<VDQF:Vetype><q>")]
+)
+
+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><VDQ_SDI:mode>3"
+  [(set (match_operand:<VDQ_SDI:FCVT_TARGET> 0 "register_operand" "=w")
+	(unspec:<VDQ_SDI:FCVT_TARGET> [(match_operand:VDQ_SDI 1 "register_operand" "w")
+				       (match_operand:SI 2 "immediate_operand" "i")]
+	 FCVT_FIXED2F))]
+  "TARGET_SIMD"
+  "<FCVT_FIXED2F:fcvt_fixed_insn>\t%<v>0<Vmtype>, %<v>1<Vmtype>, #%2"
+  [(set_attr "type" "neon_int_to_fp_<VDQ_SDI:Vetype><q>")]
+)
+
 ;; ??? Note that the vectorizer usage of the vec_unpacks_[lo/hi] patterns
 ;; is inconsistent with vector ordering elsewhere in the compiler, in that
 ;; the meaning of HI and LO changes depending on the target endianness.
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 8a0fba6513e572ede9f2e4aaf8d29baf6baf683d..04bce9ab80c151877619ee75e7cb50f5951099f7 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6025,150 +6025,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-#define vcvt_n_f32_s32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x2_t a_ = (a);                                              \
-       float32x2_t result;                                              \
-       __asm__ ("scvtf %0.2s, %1.2s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvt_n_f32_u32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x2_t a_ = (a);                                             \
-       float32x2_t result;                                              \
-       __asm__ ("ucvtf %0.2s, %1.2s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvt_n_s32_f32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t a_ = (a);                                            \
-       int32x2_t result;                                                \
-       __asm__ ("fcvtzs %0.2s, %1.2s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvt_n_u32_f32(a, b)                                            \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x2_t a_ = (a);                                            \
-       uint32x2_t result;                                               \
-       __asm__ ("fcvtzu %0.2s, %1.2s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f32_s32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int32x4_t a_ = (a);                                              \
-       float32x4_t result;                                              \
-       __asm__ ("scvtf %0.4s, %1.4s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f32_u32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32x4_t a_ = (a);                                             \
-       float32x4_t result;                                              \
-       __asm__ ("ucvtf %0.4s, %1.4s, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f64_s64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int64x2_t a_ = (a);                                              \
-       float64x2_t result;                                              \
-       __asm__ ("scvtf %0.2d, %1.2d, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_f64_u64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64x2_t a_ = (a);                                             \
-       float64x2_t result;                                              \
-       __asm__ ("ucvtf %0.2d, %1.2d, #%2"                               \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_s32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t a_ = (a);                                            \
-       int32x4_t result;                                                \
-       __asm__ ("fcvtzs %0.4s, %1.4s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_s64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t a_ = (a);                                            \
-       int64x2_t result;                                                \
-       __asm__ ("fcvtzs %0.2d, %1.2d, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_u32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32x4_t a_ = (a);                                            \
-       uint32x4_t result;                                               \
-       __asm__ ("fcvtzu %0.4s, %1.4s, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtq_n_u64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64x2_t a_ = (a);                                            \
-       uint64x2_t result;                                               \
-       __asm__ ("fcvtzu %0.2d, %1.2d, #%2"                              \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcvtx_f32_f64 (float64x2_t a)
 {
@@ -12760,6 +12616,42 @@ vcvts_n_f32_u32 (uint32_t __a, const int __b)
   return __builtin_aarch64_ucvtfsi_sus (__a, __b);
 }
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_s32 (int32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfv2si (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfv2si_sus (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfv4si (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfv4si_sus (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_f64_s64 (int64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfv2di (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_f64_u64 (uint64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfv2di_sus (__a, __b);
+}
+
 /* vcvt (float -> <u>fixed-point).  */
 
 __extension__ static __inline int64_t __attribute__ ((__always_inline__))
@@ -12786,6 +12678,42 @@ vcvts_n_u32_f32 (float32_t __a, const int __b)
   return __builtin_aarch64_fcvtzusf_uss (__a, __b);
 }
 
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_n_s32_f32 (float32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsv2sf (__a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_n_u32_f32 (float32x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuv2sf_uss (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsv4sf (__a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuv4sf_uss (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_s64_f64 (float64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsv2df (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcvtq_n_u64_f64 (float64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzuv2df_uss (__a, __b);
+}
+
 /* vcvt  (<u>int -> float)  */
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 2d59bed99b9d269c656e5c451246a16a7e13b8b8..e8fbb1281dec2e8f37f58ef2ced792dd62e3b5aa 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -154,6 +154,12 @@
 ;; Vector modes for S type.
 (define_mode_iterator VDQ_SI [V2SI V4SI])
 
+;; Vector modes for S and D
+(define_mode_iterator VDQ_SDI [V2SI V4SI V2DI])
+
+;; Scalar and Vector modes for S and D
+(define_mode_iterator VSDQ_SDI [V2SI V4SI V2DI SI DI])
+
 ;; Vector modes for Q and H types.
 (define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI])
 
@@ -649,8 +655,10 @@
   [(QI "b") (HI "h") (SI "") (DI "")])
 
 (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
+			       (V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf")
 			       (SF "si") (DF "di") (SI "sf") (DI "df")])
 (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
+			       (V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF")
 			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")])
 
 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [v2][AArch64, 1/6] Reimplement scalar fixed-point intrinsics
  2016-05-27 20:01     ` Jiong Wang
       [not found]       ` <6af07de4-8179-c0bf-410c-317ef52876dd@foss.arm.com>
@ 2016-06-06 13:39       ` Jiong Wang
  2016-06-08  9:47         ` James Greenhalgh
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-06-06 13:39 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 3788 bytes --]

On 27/05/16 17:52, Jiong Wang wrote:
>
>
> On 27/05/16 14:03, James Greenhalgh wrote:
>> On Tue, May 24, 2016 at 09:23:36AM +0100, Jiong Wang wrote:
>>>          * config/aarch64/aarch64-simd-builtins.def: Rename to
>>>          aarch64-builtins.def.
>> Why? We already have some number of intrinsics in here that are not
>> strictly SIMD, but I don't see the value in the rename?
>
> Mostly because this builtin infrastructure is handy that I want to
> implement some vfp builtins in this .def file instead of implement those
> raw structure inside aarch64-builtins.c.
>
> And there maybe more and more such builtins in the future, so I renamed
> this file.
>
>
> Is this OK?
>
>>> +(define_int_iterator FCVT_FIXED2F_SCALAR [UNSPEC_SCVTF_SCALAR 
>>> UNSPEC_UCVTF_SCALAR])
>> Again, do we need the "SCALAR" versions at all?
>
> That's because for scalar fixed-point conversion, we have two types of
> instructions to support this.
>
>   * scalar instruction from vfp
>   * scalar variant instruction from simd
>
> One is guarded by TARGET_FLOAT, the other is guarded by TARGET_SIMD, and
> their instruction format is different, so I want to keep them in
> aarch64.md and aarch64-simd.md seperately.
>
> The other reason is these two use different patterns:
>
>   * vfp scalar support conversion between different size, for example,
>     SF->DI, DF->SI, so it's using two mode iterators, GPI and GPF, and
>     is utilizing the product of the two to cover all supported
>     conversions, sfsi, sfdi, dfsi, dfdi, sisf, sidf, disf, didf.
>
>   * simd scalar only support conversion between same size that single
>     mode iterator is used to cover sfsi, sisf, dfdi, didf.
>
> For intrinsics implementation, I used builtins backed by vfp scalar
> instead of simd scalar which requires the input sitting inside vector 
> register.
>
> I remember the simd scalar pattern was here because it's anyway needed
> by patch [2/6] which extends it's modes naturally to vector modes. I was
> thinking it's better to keep simd scalar variant with this scalar
> intrinsics enable patch.
>
> Is this OK?
>
> Thanks.

I updated this patch set with the following modifications:

   * drop the renaming of aarch64-builtins.def
   * implemented vrsqrts_f64, vrsqrte_f64, vabd_f64, vpadds_f32 while I am at it.


OK for trunk?

gcc/
2016-06-06  Jiong Wang<jiong.wang@arm.com>

         * config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New
         (TYPES_BINOP_SUS): Likewise.
         (aarch64_simd_builtin_data): Update include file name.
         (aarch64_builtins): Likewise.
         * config/aarch64/aarch64-simd-builtins.def (scvtf): New entries
         for conversion between scalar float-point and fixed-point.
         (ucvtf): Likewise.
         (fcvtzs): Likewise.
         (fcvtzu): Likewise.
         * config/aarch64/aarch64.md
         (<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3: New
         pattern for conversion between scalar float to fixed-pointer.
         (<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>: Likewise.
         (UNSPEC_FCVTZS): New UNSPEC enumeration.
         (UNSPEC_FCVTZU): Likewise.
         (UNSPEC_SCVTF): Likewise.
         (UNSPEC_UCVTF): Likewise.
         * config/aarch64/arm_neon.h (vcvtd_n_f64_s64): Remove inline assembly.  Use
         builtin.
         (vcvtd_n_f64_u64): Likewise.
         (vcvtd_n_s64_f64): Likewise.
         (vcvtd_n_u64_f64): Likewise.
         (vcvtd_n_f32_s32): Likewise.
         (vcvts_n_f32_u32): Likewise.
         (vcvtd_n_s32_f32): Likewise.
         (vcvts_n_u32_f32): Likewise.
         * config/aarch64/iterators.md (fcvt_target): Support integer to float mapping.
         (FCVT_TARGET): Likewise.
         (FCVT_FIXED2F): New iterator.
         (FCVT_F2FIXED): Likewise.
         (fcvt_fixed_insn): New define_int_attr.


[-- Attachment #2: 1.patch --]
[-- Type: text/x-patch, Size: 14395 bytes --]

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 5573903fe0a1f3d1ffc58c36992bd46cd0cb4dad..262ea1c519f4f01a1a0726296994e40a48f26680 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -139,6 +139,14 @@ aarch64_types_binop_ssu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_unsigned };
 #define TYPES_BINOP_SSU (aarch64_types_binop_ssu_qualifiers)
 static enum aarch64_type_qualifiers
+aarch64_types_binop_uss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_unsigned, qualifier_none, qualifier_none };
+#define TYPES_BINOP_USS (aarch64_types_binop_uss_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_binop_sus_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_none, qualifier_unsigned, qualifier_none };
+#define TYPES_BINOP_SUS (aarch64_types_binop_sus_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_poly, qualifier_poly, qualifier_poly };
 #define TYPES_BINOPP (aarch64_types_binopp_qualifiers)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index dd045792b21f84b9587be08a07db0e0081e0c484..0b2f0631c740558c62cffe5715eaffa5ad0557a9 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -445,3 +445,9 @@
   /* Implemented by aarch64_sqrdml<SQRDMLH_AS:rdma_as>h_laneq<mode>.  */
   BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlah_laneq, 0)
   BUILTIN_VSDQ_HSI (QUADOP_LANE, sqrdmlsh_laneq, 0)
+
+  /* Implemented by <FCVT_F2FIXED/FIXED2F:fcvt_fixed_insn><*><*>3.  */
+  BUILTIN_GPI (BINOP, scvtf, 3)
+  BUILTIN_GPI (BINOP_SUS, ucvtf, 3)
+  BUILTIN_GPF (BINOP, fcvtzs, 3)
+  BUILTIN_GPF (BINOP_USS, fcvtzu, 3)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f04f7daed276ad53619623405c384ffe300fc8c1..8e6a082e91fcad18cc891c83209b061eef6449e0 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -75,6 +75,8 @@
     UNSPEC_CRC32H
     UNSPEC_CRC32W
     UNSPEC_CRC32X
+    UNSPEC_FCVTZS
+    UNSPEC_FCVTZU
     UNSPEC_URECPE
     UNSPEC_FRECPE
     UNSPEC_FRECPS
@@ -105,6 +107,7 @@
     UNSPEC_NOP
     UNSPEC_PRLG_STK
     UNSPEC_RBIT
+    UNSPEC_SCVTF
     UNSPEC_SISD_NEG
     UNSPEC_SISD_SSHL
     UNSPEC_SISD_USHL
@@ -122,6 +125,7 @@
     UNSPEC_TLSLE24
     UNSPEC_TLSLE32
     UNSPEC_TLSLE48
+    UNSPEC_UCVTF
     UNSPEC_USHL_2S
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
@@ -4620,6 +4624,36 @@
   [(set_attr "type" "f_cvti2f")]
 )
 
+;; Convert between fixed-point and floating-point (scalar modes)
+
+(define_insn "<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3"
+  [(set (match_operand:<GPF:FCVT_TARGET> 0 "register_operand" "=r, w")
+	(unspec:<GPF:FCVT_TARGET> [(match_operand:GPF 1 "register_operand" "w, w")
+				   (match_operand:SI 2 "immediate_operand" "i, i")]
+	 FCVT_F2FIXED))]
+  ""
+  "@
+   <FCVT_F2FIXED:fcvt_fixed_insn>\t%<w1>0, %<s>1, #%2
+   <FCVT_F2FIXED:fcvt_fixed_insn>\t%<s>0, %<s>1, #%2"
+  [(set_attr "type" "f_cvtf2i, neon_fp_to_int_<GPF:Vetype>")
+   (set_attr "fp" "yes, *")
+   (set_attr "simd" "*, yes")]
+)
+
+(define_insn "<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3"
+  [(set (match_operand:<GPI:FCVT_TARGET> 0 "register_operand" "=w, w")
+	(unspec:<GPI:FCVT_TARGET> [(match_operand:GPI 1 "register_operand" "r, w")
+				   (match_operand:SI 2 "immediate_operand" "i, i")]
+	 FCVT_FIXED2F))]
+  ""
+  "@
+   <FCVT_FIXED2F:fcvt_fixed_insn>\t%<s>0, %<w1>1, #%2
+   <FCVT_FIXED2F:fcvt_fixed_insn>\t%<s>0, %<s>1, #%2"
+  [(set_attr "type" "f_cvti2f, neon_int_to_fp_<GPI:Vetype>")
+   (set_attr "fp" "yes, *")
+   (set_attr "simd" "*, yes")]
+)
+
 ;; -------------------------------------------------------------------
 ;; Floating-point arithmetic
 ;; -------------------------------------------------------------------
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d20caf0919356eb7a87e7c7a9cd336d8408db35b..8a0fba6513e572ede9f2e4aaf8d29baf6baf683d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6073,54 +6073,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-#define vcvtd_n_f64_s64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int64_t a_ = (a);                                                \
-       float64_t result;                                                \
-       __asm__ ("scvtf %d0,%d1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtd_n_f64_u64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint64_t a_ = (a);                                               \
-       float64_t result;                                                \
-       __asm__ ("ucvtf %d0,%d1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtd_n_s64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64_t a_ = (a);                                              \
-       int64_t result;                                                  \
-       __asm__ ("fcvtzs %d0,%d1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvtd_n_u64_f64(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float64_t a_ = (a);                                              \
-       uint64_t result;                                                 \
-       __asm__ ("fcvtzu %d0,%d1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 #define vcvtq_n_f32_s32(a, b)                                           \
   __extension__                                                         \
     ({                                                                  \
@@ -6217,54 +6169,6 @@ vaddlvq_u32 (uint32x4_t a)
        result;                                                          \
      })
 
-#define vcvts_n_f32_s32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       int32_t a_ = (a);                                                \
-       float32_t result;                                                \
-       __asm__ ("scvtf %s0,%s1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_f32_u32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       uint32_t a_ = (a);                                               \
-       float32_t result;                                                \
-       __asm__ ("ucvtf %s0,%s1,%2"                                      \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_s32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32_t a_ = (a);                                              \
-       int32_t result;                                                  \
-       __asm__ ("fcvtzs %s0,%s1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
-#define vcvts_n_u32_f32(a, b)                                           \
-  __extension__                                                         \
-    ({                                                                  \
-       float32_t a_ = (a);                                              \
-       uint32_t result;                                                 \
-       __asm__ ("fcvtzu %s0,%s1,%2"                                     \
-                : "=w"(result)                                          \
-                : "w"(a_), "i"(b)                                       \
-                : /* No clobbers */);                                   \
-       result;                                                          \
-     })
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vcvtx_f32_f64 (float64x2_t a)
 {
@@ -12830,6 +12734,58 @@ vcvt_high_f64_f32 (float32x4_t __a)
   return __builtin_aarch64_vec_unpacks_hi_v4sf (__a);
 }
 
+/* vcvt (<u>fixed-point -> float).  */
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_n_f64_s64 (int64_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfdi (__a, __b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vcvtd_n_f64_u64 (uint64_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfdi_sus (__a, __b);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_n_f32_s32 (int32_t __a, const int __b)
+{
+  return __builtin_aarch64_scvtfsi (__a, __b);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vcvts_n_f32_u32 (uint32_t __a, const int __b)
+{
+  return __builtin_aarch64_ucvtfsi_sus (__a, __b);
+}
+
+/* vcvt (float -> <u>fixed-point).  */
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vcvtd_n_s64_f64 (float64_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzsdf (__a, __b);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vcvtd_n_u64_f64 (float64_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzudf_uss (__a, __b);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vcvts_n_s32_f32 (float32_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzssf (__a, __b);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vcvts_n_u32_f32 (float32_t __a, const int __b)
+{
+  return __builtin_aarch64_fcvtzusf_uss (__a, __b);
+}
+
 /* vcvt  (<u>int -> float)  */
 
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 43b22d81cda30398564af2f2fcaefceb215ec04c..2d59bed99b9d269c656e5c451246a16a7e13b8b8 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -648,8 +648,11 @@
 (define_mode_attr atomic_sfx
   [(QI "b") (HI "h") (SI "") (DI "")])
 
-(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si") (SF "si") (DF "di")])
-(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI") (SF "SI") (DF "DI")])
+(define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
+			       (SF "si") (DF "di") (SI "sf") (DI "df")])
+(define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
+			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")])
+
 
 ;; for the inequal width integer to fp conversions
 (define_mode_attr fcvt_iesize [(SF "di") (DF "si")])
@@ -1002,6 +1005,9 @@
 (define_int_iterator FCVT [UNSPEC_FRINTZ UNSPEC_FRINTP UNSPEC_FRINTM
 			    UNSPEC_FRINTA UNSPEC_FRINTN])
 
+(define_int_iterator FCVT_F2FIXED [UNSPEC_FCVTZS UNSPEC_FCVTZU])
+(define_int_iterator FCVT_FIXED2F [UNSPEC_SCVTF UNSPEC_UCVTF])
+
 (define_int_iterator FRECP [UNSPEC_FRECPE UNSPEC_FRECPX])
 
 (define_int_iterator CRC [UNSPEC_CRC32B UNSPEC_CRC32H UNSPEC_CRC32W
@@ -1138,6 +1144,11 @@
 			       (UNSPEC_FRINTP "ceil") (UNSPEC_FRINTM "floor")
 			       (UNSPEC_FRINTN "frintn")])
 
+(define_int_attr fcvt_fixed_insn [(UNSPEC_SCVTF "scvtf")
+				  (UNSPEC_UCVTF "ucvtf")
+				  (UNSPEC_FCVTZS "fcvtzs")
+				  (UNSPEC_FCVTZU "fcvtzu")])
+
 (define_int_attr perm_insn [(UNSPEC_ZIP1 "zip") (UNSPEC_ZIP2 "zip")
 			    (UNSPEC_TRN1 "trn") (UNSPEC_TRN2 "trn")
 			    (UNSPEC_UZP1 "uzp") (UNSPEC_UZP2 "uzp")])



^ permalink raw reply	[flat|nested] 28+ messages in thread

* [v2][AArch64, 4/6] Reimplement frsqrts intrinsics
       [not found]           ` <49a7c4d8-3fdc-8806-a4df-affa742cc5d7@foss.arm.com>
@ 2016-06-06 13:40             ` Jiong Wang
  2016-06-08  9:57               ` James Greenhalgh
       [not found]             ` <32b5ca55-e60a-42b0-3532-84319e5c0daf@foss.arm.com>
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-06-06 13:40 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 827 bytes --]

Similar to [3/6], these intrinsics were implemented before the instruction
pattern "aarch64_rsqrts<mode>" was added, so they were implemented
through inline assembly.

This patch migrates the implementation to builtins.

gcc/
2016-06-06  Jiong Wang<jiong.wang@arm.com>

         * config/aarch64/aarch64-simd-builtins.def (rsqrts): New builtins
         for modes VALLF.
         * config/aarch64/aarch64-simd.md (aarch64_rsqrts_<mode>3): Rename to
         "aarch64_rsqrts<mode>".
         * config/aarch64/aarch64.c (get_rsqrts_type): Update gen* name.
         * config/aarch64/arm_neon.h (vrsqrtss_f32): Remove inline assembly.  Use
         builtin.
         (vrsqrtsd_f64): Likewise.
         (vrsqrts_f32): Likewise.
         (vrsqrts_f64): Likewise.
         (vrsqrtsq_f32): Likewise.
         (vrsqrtsq_f64): Likewise.


[-- Attachment #2: 4.patch --]
[-- Type: text/x-patch, Size: 5405 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 32bcd06ec6e483c53b01caf1e30305e0b2b3fb21..1955d171d727e8995795d343ea766f130be0985e 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -462,3 +462,6 @@
 
   /* Implemented by aarch64_rsqrte<mode>.  */
   BUILTIN_VALLF (UNOP, rsqrte, 0)
+
+  /* Implemented by aarch64_rsqrts<mode>.  */
+  BUILTIN_VALLF (BINOP, rsqrts, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 568dd20ad3436e4aa4c3e7cf6b6f766b7fc127db..78a87b1fb52b5b5e21ef5cd7dbe090c863369775 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -390,7 +390,7 @@
   "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
   [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
 
-(define_insn "aarch64_rsqrts_<mode>3"
+(define_insn "aarch64_rsqrts<mode>"
   [(set (match_operand:VALLF 0 "register_operand" "=w")
 	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
 	       (match_operand:VALLF 2 "register_operand" "w")]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index acfb39dc025d74fe531d439bb87c52d18955ee7c..b60e5c52df6310a87635c523d723eee9768d7aef 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7368,11 +7368,11 @@ get_rsqrts_type (machine_mode mode)
 {
   switch (mode)
   {
-    case DFmode:   return gen_aarch64_rsqrts_df3;
-    case SFmode:   return gen_aarch64_rsqrts_sf3;
-    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
-    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
-    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    case DFmode:   return gen_aarch64_rsqrtsdf;
+    case SFmode:   return gen_aarch64_rsqrtssf;
+    case V2DFmode: return gen_aarch64_rsqrtsv2df;
+    case V2SFmode: return gen_aarch64_rsqrtsv2sf;
+    case V4SFmode: return gen_aarch64_rsqrtsv4sf;
     default: gcc_unreachable ();
   }
 }
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 49d572ff8e5007ad07672568ed4dccbea4e0e139..2177703180ca50acedd64d613e4e665264371fb2 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9185,61 +9185,6 @@ vrsqrteq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrts_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("frsqrts %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrtsd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("frsqrts %d0,%d1,%d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrtsq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("frsqrts %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrtsq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("frsqrts %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtss_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("frsqrts %s0,%s1,%s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 #define vshrn_high_n_s16(a, b, c)                                       \
   __extension__                                                         \
     ({                                                                  \
@@ -21476,6 +21421,45 @@ vrsqrteq_f64 (float64x2_t __a)
   return __builtin_aarch64_rsqrtev2df (__a);
 }
 
+/* vrsqrts.  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vrsqrtss_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_rsqrtssf (__a, __b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vrsqrtsd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_rsqrtsdf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_rsqrtsv2sf (__a, __b);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vrsqrts_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return (float64x1_t) {vrsqrtsd_f64 (vget_lane_f64 (__a, 0),
+				      vget_lane_f64 (__b, 0))};
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_rsqrtsv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vrsqrtsq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_rsqrtsv2df (__a, __b);
+}
+
 /* vrsra */
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [v2][AArch64, 3/6] Reimplement frsqrte intrinsics
       [not found]         ` <7cb1e234-46f9-76b4-aefd-1eacabfb4ca7@foss.arm.com>
       [not found]           ` <49a7c4d8-3fdc-8806-a4df-affa742cc5d7@foss.arm.com>
@ 2016-06-06 13:40           ` Jiong Wang
  2016-06-08  9:53             ` James Greenhalgh
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-06-06 13:40 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 808 bytes --]

These intrinsics were implemented before the instruction pattern
"aarch64_rsqrte<mode>" was added, so they were implemented through
inline assembly.

This patch migrates the implementation to builtins.

gcc/
2016-06-06  Jiong Wang<jiong.wang@arm.com>

         * config/aarch64/aarch64-simd-builtins.def (rsqrte): New builtins
         for modes VALLF.
         * config/aarch64/aarch64-simd.md (aarch64_rsqrte_<mode>2): Rename to
         "aarch64_rsqrte<mode>".
         * config/aarch64/aarch64.c (get_rsqrte_type): Update gen* name.
         * config/aarch64/arm_neon.h (vrsqrtes_f32): Remove inline assembly.
         Use builtin.
         (vrsqrted_f64): Likewise.
         (vrsqrte_f32): Likewise.
         (vrsqrte_f64): Likewise.
         (vrsqrteq_f32): Likewise.
         (vrsqrteq_f64): Likewise.


[-- Attachment #2: 3.patch --]
[-- Type: text/x-patch, Size: 5704 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 25a5270766401bd2f31ccacdafee83c183bdf775..f60f84c42fefd32bace6f4aa690f97ca54f3e4b6 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -451,3 +451,6 @@
   BUILTIN_VALLI (BINOP_SUS, ucvtf, 3)
   BUILTIN_VALLF (BINOP, fcvtzs, 3)
   BUILTIN_VALLF (BINOP_USS, fcvtzu, 3)
+
+  /* Implemented by aarch64_rsqrte<mode>.  */
+  BUILTIN_VALLF (UNOP, rsqrte, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ca90b666a7e3888057b7d9e8562a2544a006cf0f..941214680262ef1015cbb23f518b4999f962bf9b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -382,7 +382,7 @@
   [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
 )
 
-(define_insn "aarch64_rsqrte_<mode>2"
+(define_insn "aarch64_rsqrte<mode>"
   [(set (match_operand:VALLF 0 "register_operand" "=w")
 	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
 		     UNSPEC_RSQRTE))]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index ad07fe196a814ace78d43f66e70280d20a4476b5..acfb39dc025d74fe531d439bb87c52d18955ee7c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7349,11 +7349,11 @@ get_rsqrte_type (machine_mode mode)
 {
   switch (mode)
   {
-    case DFmode:   return gen_aarch64_rsqrte_df2;
-    case SFmode:   return gen_aarch64_rsqrte_sf2;
-    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
-    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
-    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    case DFmode:   return gen_aarch64_rsqrtedf;
+    case SFmode:   return gen_aarch64_rsqrtesf;
+    case V2DFmode: return gen_aarch64_rsqrtev2df;
+    case V2SFmode: return gen_aarch64_rsqrtev2sf;
+    case V4SFmode: return gen_aarch64_rsqrtev4sf;
     default: gcc_unreachable ();
   }
 }
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 04bce9ab80c151877619ee75e7cb50f5951099f7..e4f7a66abcc59f306de289d22e9d09cfe32c0c87 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9163,28 +9163,6 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b)
        result;                                                          \
      })
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrte_f32 (float32x2_t a)
-{
-  float32x2_t result;
-  __asm__ ("frsqrte %0.2s,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
-vrsqrte_f64 (float64x1_t a)
-{
-  float64x1_t result;
-  __asm__ ("frsqrte %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vrsqrte_u32 (uint32x2_t a)
 {
@@ -9196,39 +9174,6 @@ vrsqrte_u32 (uint32x2_t a)
   return result;
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vrsqrted_f64 (float64_t a)
-{
-  float64_t result;
-  __asm__ ("frsqrte %d0,%d1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_f32 (float32x4_t a)
-{
-  float32x4_t result;
-  __asm__ ("frsqrte %0.4s,%1.4s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vrsqrteq_f64 (float64x2_t a)
-{
-  float64x2_t result;
-  __asm__ ("frsqrte %0.2d,%1.2d"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vrsqrteq_u32 (uint32x4_t a)
 {
@@ -9240,17 +9185,6 @@ vrsqrteq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vrsqrtes_f32 (float32_t a)
-{
-  float32_t result;
-  __asm__ ("frsqrte %s0,%s1"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vrsqrts_f32 (float32x2_t a, float32x2_t b)
 {
@@ -21504,6 +21438,44 @@ vrshrd_n_u64 (uint64_t __a, const int __b)
   return __builtin_aarch64_urshr_ndi_uus (__a, __b);
 }
 
+/* vrsqrte.  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vrsqrtes_f32 (float32_t __a)
+{
+  return __builtin_aarch64_rsqrtesf (__a);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vrsqrted_f64 (float64_t __a)
+{
+  return __builtin_aarch64_rsqrtedf (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrte_f32 (float32x2_t __a)
+{
+  return __builtin_aarch64_rsqrtev2sf (__a);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vrsqrte_f64 (float64x1_t __a)
+{
+  return (float64x1_t) {vrsqrted_f64 (vget_lane_f64 (__a, 0))};
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrteq_f32 (float32x4_t __a)
+{
+  return __builtin_aarch64_rsqrtev4sf (__a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vrsqrteq_f64 (float64x2_t __a)
+{
+  return __builtin_aarch64_rsqrtev2df (__a);
+}
+
 /* vrsra */
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [v2][AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes
       [not found]               ` <1017fc5b-389d-ab41-24bd-491fff8e1a81@foss.arm.com>
@ 2016-06-06 13:41                 ` Jiong Wang
  2016-06-08  9:59                   ` James Greenhalgh
  0 siblings, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-06-06 13:41 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 808 bytes --]

These intrinsics were implemented by inline assembly using the "faddp"
instruction.  There was a pattern "aarch64_addpv4sf" which supports the
V4SF mode only, but we can extend this pattern to support the VDQF modes;
then we can reimplement these intrinsics through builtins.

gcc/
2016-06-06  Jiong Wang<jiong.wang@arm.com>

         * config/aarch64/aarch64-simd-builtins.def (faddp): New builtins for
         modes in VDQF.
         * config/aarch64/aarch64-simd.md (aarch64_faddp<mode>): New.
         (aarch64_addpv4sf): Delete.
         (reduc_plus_scal_v4sf): Use "gen_aarch64_faddpv4sf" instead of
         "gen_aarch64_addpv4sf".
         * config/aarch64/arm_neon.h (vpadd_f32): Remove inline assembly.  Use
         builtin.
         (vpadds_f32): Likewise.
         (vpaddq_f32): Likewise.
         (vpaddq_f64): Likewise.


[-- Attachment #2: 6.patch --]
[-- Type: text/x-patch, Size: 5391 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index deab3450ab74fcd6dfcf8267fa9cedfc1423ca4e..1348e7c198763b24d092f774a0ff25e4d0fd1787 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -468,3 +468,6 @@
 
   /* Implemented by fabd<mode>3.  */
   BUILTIN_VALLF (BINOP, fabd, 3)
+
+  /* Implemented by aarch64_faddp<mode>.  */
+  BUILTIN_VDQF (BINOP, faddp, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ad8b9c1d0c155d022be2e7e7c426120b551f3f2b..f8d3e766a53736a4b87ba016caccd085eb793bda 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1992,6 +1992,16 @@
   }
 )
 
+(define_insn "aarch64_faddp<mode>"
+ [(set (match_operand:VDQF 0 "register_operand" "=w")
+       (unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")
+		     (match_operand:VDQF 2 "register_operand" "w")]
+		     UNSPEC_FADDV))]
+ "TARGET_SIMD"
+ "faddp\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
+)
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
@@ -2019,15 +2029,6 @@
   [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
 )
 
-(define_insn "aarch64_addpv4sf"
- [(set (match_operand:V4SF 0 "register_operand" "=w")
-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "w")]
-		    UNSPEC_FADDV))]
- "TARGET_SIMD"
- "faddp\\t%0.4s, %1.4s, %1.4s"
-  [(set_attr "type" "neon_fp_reduc_add_s_q")]
-)
-
 (define_expand "reduc_plus_scal_v4sf"
  [(set (match_operand:SF 0 "register_operand")
        (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
@@ -2036,8 +2037,8 @@
 {
   rtx elt = GEN_INT (ENDIAN_LANE_N (V4SFmode, 0));
   rtx scratch = gen_reg_rtx (V4SFmode);
-  emit_insn (gen_aarch64_addpv4sf (scratch, operands[1]));
-  emit_insn (gen_aarch64_addpv4sf (scratch, scratch));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
   emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
   DONE;
 })
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9e966e47789646ed968a081c1fc4cb76b45537af..13a4ab80cf7b0470d8ec8b07e0ed1988f8f4e66d 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -8225,17 +8225,6 @@ vpadalq_u32 (uint64x2_t a, uint32x4_t b)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpadd_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("faddp %0.2s,%1.2s,%2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vpaddl_s8 (int8x8_t a)
 {
@@ -8368,28 +8357,6 @@ vpaddlq_u32 (uint32x4_t a)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vpaddq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("faddp %0.4s,%1.4s,%2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vpaddq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("faddp %0.2d,%1.2d,%2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vpaddq_s8 (int8x16_t a, int8x16_t b)
 {
@@ -8478,17 +8445,6 @@ vpaddq_u64 (uint64x2_t a, uint64x2_t b)
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vpadds_f32 (float32x2_t a)
-{
-  float32_t result;
-  __asm__ ("faddp %s0,%1.2s"
-           : "=w"(result)
-           : "w"(a)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vqdmulh_n_s16 (int16x4_t a, int16_t b)
 {
@@ -18625,6 +18581,24 @@ vnegq_s64 (int64x2_t __a)
 
 /* vpadd  */
 
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpadd_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_faddpv2sf (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vpaddq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_faddpv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vpaddq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_faddpv2df (__a, __b);
+}
+
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vpadd_s8 (int8x8_t __a, int8x8_t __b)
 {
@@ -18664,6 +18638,12 @@ vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
 						  (int32x2_t) __b);
 }
 
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vpadds_f32 (float32x2_t __a)
+{
+  return __builtin_aarch64_reduc_plus_scal_v2sf (__a);
+}
+
 __extension__ static __inline float64_t __attribute__ ((__always_inline__))
 vpaddd_f64 (float64x2_t __a)
 {


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [v2][AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns
       [not found]             ` <32b5ca55-e60a-42b0-3532-84319e5c0daf@foss.arm.com>
       [not found]               ` <1017fc5b-389d-ab41-24bd-491fff8e1a81@foss.arm.com>
@ 2016-06-06 13:41               ` Jiong Wang
  2016-06-08  9:58                 ` James Greenhalgh
  1 sibling, 1 reply; 28+ messages in thread
From: Jiong Wang @ 2016-06-06 13:41 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 831 bytes --]

These intrinsics were implemented before "fabd<mode>_3" was introduced.
Meanwhile the patterns "fabd<mode>_3" and "*fabd_scalar<mode>3" can be
merged into a single "fabd<mode>3" using VALLF.

This patch migrates the implementation to builtins backed by this pattern.

gcc/
2016-06-01  Jiong Wang <jiong.wang@arm.com>

         * config/aarch64/aarch64-simd-builtins.def (fabd): New builtins for
         modes VALLF.
         * config/aarch64/aarch64-simd.md (fabd<mode>_3): Extend modes from
         VDQF to VALLF.  Rename to "fabd<mode>3".
         (*fabd_scalar<mode>3): Delete.
         * config/aarch64/arm_neon.h (vabds_f32): Remove inline assembly.
         Use builtin.
         (vabdd_f64): Likewise.
         (vabd_f32): Likewise.
         (vabd_f64): Likewise.
         (vabdq_f32): Likewise.
         (vabdq_f64): Likewise.

[-- Attachment #2: 5.patch --]
[-- Type: text/x-patch, Size: 5671 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 1955d171d727e8995795d343ea766f130be0985e..deab3450ab74fcd6dfcf8267fa9cedfc1423ca4e 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -465,3 +465,6 @@
 
   /* Implemented by aarch64_rsqrts<mode>.  */
   BUILTIN_VALLF (BINOP, rsqrts, 0)
+
+  /* Implemented by fabd<mode>3.  */
+  BUILTIN_VALLF (BINOP, fabd, 3)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 78a87b1fb52b5b5e21ef5cd7dbe090c863369775..ad8b9c1d0c155d022be2e7e7c426120b551f3f2b 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -474,23 +474,14 @@
   [(set_attr "type" "neon_arith_acc<q>")]
 )
 
-(define_insn "fabd<mode>_3"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(abs:VDQF (minus:VDQF
-		   (match_operand:VDQF 1 "register_operand" "w")
-		   (match_operand:VDQF 2 "register_operand" "w"))))]
-  "TARGET_SIMD"
-  "fabd\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
-)
-
-(define_insn "*fabd_scalar<mode>3"
-  [(set (match_operand:GPF 0 "register_operand" "=w")
-        (abs:GPF (minus:GPF
-                 (match_operand:GPF 1 "register_operand" "w")
-                 (match_operand:GPF 2 "register_operand" "w"))))]
+(define_insn "fabd<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+	(abs:VALLF
+	  (minus:VALLF
+	    (match_operand:VALLF 1 "register_operand" "w")
+	    (match_operand:VALLF 2 "register_operand" "w"))))]
   "TARGET_SIMD"
-  "fabd\t%<s>0, %<s>1, %<s>2"
+  "fabd\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
   [(set_attr "type" "neon_fp_abd_<Vetype><q>")]
 )
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2177703180ca50acedd64d613e4e665264371fb2..9e966e47789646ed968a081c1fc4cb76b45537af 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -5440,17 +5440,6 @@ vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c)
   return result;
 }
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabd_f32 (float32x2_t a, float32x2_t b)
-{
-  float32x2_t result;
-  __asm__ ("fabd %0.2s, %1.2s, %2.2s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vabd_s8 (int8x8_t a, int8x8_t b)
 {
@@ -5517,17 +5506,6 @@ vabd_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline float64_t __attribute__ ((__always_inline__))
-vabdd_f64 (float64_t a, float64_t b)
-{
-  float64_t result;
-  __asm__ ("fabd %d0, %d1, %d2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vabdl_high_s8 (int8x16_t a, int8x16_t b)
 {
@@ -5660,28 +5638,6 @@ vabdl_u32 (uint32x2_t a, uint32x2_t b)
   return result;
 }
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabdq_f32 (float32x4_t a, float32x4_t b)
-{
-  float32x4_t result;
-  __asm__ ("fabd %0.4s, %1.4s, %2.4s"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vabdq_f64 (float64x2_t a, float64x2_t b)
-{
-  float64x2_t result;
-  __asm__ ("fabd %0.2d, %1.2d, %2.2d"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vabdq_s8 (int8x16_t a, int8x16_t b)
 {
@@ -5748,17 +5704,6 @@ vabdq_u32 (uint32x4_t a, uint32x4_t b)
   return result;
 }
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vabds_f32 (float32_t a, float32_t b)
-{
-  float32_t result;
-  __asm__ ("fabd %s0, %s1, %s2"
-           : "=w"(result)
-           : "w"(a), "w"(b)
-           : /* No clobbers */);
-  return result;
-}
-
 __extension__ static __inline int16_t __attribute__ ((__always_inline__))
 vaddlv_s8 (int8x8_t a)
 {
@@ -10235,6 +10180,45 @@ vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx)
 
 /* Start of optimal implementations in approved order.  */
 
+/* vabd.  */
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vabds_f32 (float32_t __a, float32_t __b)
+{
+  return __builtin_aarch64_fabdsf (__a, __b);
+}
+
+__extension__ static __inline float64_t __attribute__ ((__always_inline__))
+vabdd_f64 (float64_t __a, float64_t __b)
+{
+  return __builtin_aarch64_fabddf (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vabd_f32 (float32x2_t __a, float32x2_t __b)
+{
+  return __builtin_aarch64_fabdv2sf (__a, __b);
+}
+
+__extension__ static __inline float64x1_t __attribute__ ((__always_inline__))
+vabd_f64 (float64x1_t __a, float64x1_t __b)
+{
+  return (float64x1_t) {vabdd_f64 (vget_lane_f64 (__a, 0),
+				   vget_lane_f64 (__b, 0))};
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vabdq_f32 (float32x4_t __a, float32x4_t __b)
+{
+  return __builtin_aarch64_fabdv4sf (__a, __b);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vabdq_f64 (float64x2_t __a, float64x2_t __b)
+{
+  return __builtin_aarch64_fabdv2df (__a, __b);
+}
+
 /* vabs  */
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [v2][AArch64, 1/6] Reimplement scalar fixed-point intrinsics
  2016-06-06 13:39       ` [v2][AArch64, 1/6] Reimplement scalar fixed-point intrinsics Jiong Wang
@ 2016-06-08  9:47         ` James Greenhalgh
  0 siblings, 0 replies; 28+ messages in thread
From: James Greenhalgh @ 2016-06-08  9:47 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Mon, Jun 06, 2016 at 02:38:58PM +0100, Jiong Wang wrote:
> On 27/05/16 17:52, Jiong Wang wrote:
> >
> >
> >On 27/05/16 14:03, James Greenhalgh wrote:
> >>On Tue, May 24, 2016 at 09:23:36AM +0100, Jiong Wang wrote:
> >>>         * config/aarch64/aarch64-simd-builtins.def: Rename to
> >>>         aarch64-builtins.def.
> >>Why? We already have some number of intrinsics in here that are not
> >>strictly SIMD, but I don't see the value in the rename?
> >
> >Mostly because this builtin infrastructure is handy that I want to
> >implement some vfp builtins in this .def file instead of implement those
> >raw structure inside aarch64-builtins.c.
> >
> >And there maybe more and more such builtins in the future, so I renamed
> >this file.
> >
> >
> >Is this OK?
> >
> >>>+(define_int_iterator FCVT_FIXED2F_SCALAR [UNSPEC_SCVTF_SCALAR
> >>>UNSPEC_UCVTF_SCALAR])
> >>Again, do we need the "SCALAR" versions at all?
> >
> >That's because for scalar fixed-point conversion, we have two types of
> >instructions to support this.
> >
> >  * scalar instruction from vfp
> >  * scalar variant instruction from simd
> >
> >One is guarded by TARGET_FLOAT, the other is guarded by TARGET_SIMD, and
> >their instruction format is different, so I want to keep them in
> >aarch64.md and aarch64-simd.md seperately.
> >
> >The other reason is these two use different patterns:
> >
> >  * vfp scalar support conversion between different size, for example,
> >    SF->DI, DF->SI, so it's using two mode iterators, GPI and GPF, and
> >    is utilizing the product of the two to cover all supported
> >    conversions, sfsi, sfdi, dfsi, dfdi, sisf, sidf, disf, didf.
> >
> >  * simd scalar only support conversion between same size that single
> >    mode iterator is used to cover sfsi, sisf, dfdi, didf.
> >
> >For intrinsics implementation, I used builtins backed by vfp scalar
> >instead of simd scalar which requires the input sitting inside
> >vector register.
> >
> >I remember the simd scalar pattern was here because it's anyway needed
> >by patch [2/6] which extends it's modes naturally to vector modes. I was
> >thinking it's better to keep simd scalar variant with this scalar
> >intrinsics enable patch.
> >
> >Is this OK?

This is OK. Just watch the length of some of your ChangeLog lines when you
commit.

Thanks,
James

> gcc/
> 2016-06-06  Jiong Wang<jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New
>         (TYPES_BINOP_SUS): Likewise.
>         (aarch64_simd_builtin_data): Update include file name.
>         (aarch64_builtins): Likewise.
>         * config/aarch64/aarch64-simd-builtins.def (scvtf): New entries
>         for conversion between scalar float-point and fixed-point.
>         (ucvtf): Likewise.
>         (fcvtzs): Likewise.
>         (fcvtzu): Likewise.
>         * config/aarch64/aarch64.md
>         (<FCVT_F2FIXED:fcvt_fixed_insn><GPF:mode>3): New
>         pattern for conversion between scalar float and fixed-point.
>         (<FCVT_FIXED2F:fcvt_fixed_insn><GPI:mode>3): Likewise.
>         (UNSPEC_FCVTZS): New UNSPEC enumeration.
>         (UNSPEC_FCVTZU): Likewise.
>         (UNSPEC_SCVTF): Likewise.
>         (UNSPEC_UCVTF): Likewise.
>         * config/aarch64/arm_neon.h (vcvtd_n_f64_s64): Remove inline assembly.  Use
>         builtin.
>         (vcvtd_n_f64_u64): Likewise.
>         (vcvtd_n_s64_f64): Likewise.
>         (vcvtd_n_u64_f64): Likewise.
>         (vcvtd_n_f32_s32): Likewise.
>         (vcvts_n_f32_u32): Likewise.
>         (vcvtd_n_s32_f32): Likewise.
>         (vcvts_n_u32_f32): Likewise.
>         * config/aarch64/iterators.md (fcvt_target): Support integer to float mapping.
>         (FCVT_TARGET): Likewise.
>         (FCVT_FIXED2F): New iterator.
>         (FCVT_F2FIXED): Likewise.
>         (fcvt_fixed_insn): New define_int_attr.
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [v2][AArch64, 2/6] Reimplement vector fixed-point intrinsics
  2016-06-06 13:39         ` [v2][AArch64, 2/6] Reimplement vector " Jiong Wang
@ 2016-06-08  9:51           ` James Greenhalgh
  0 siblings, 0 replies; 28+ messages in thread
From: James Greenhalgh @ 2016-06-08  9:51 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Mon, Jun 06, 2016 at 02:39:38PM +0100, Jiong Wang wrote:
> Based on top of [1/6], this patch reimplement vector intrinsics for
> conversion between floating-point and fixed-point.

OK.

Thanks,
James

> 
> gcc/
> 2016-06-06  Jiong Wang<jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (scvtf): Register vector modes.
>         (ucvtf): Likewise.
>         (fcvtzs): Likewise.
>         (fcvtzu): Likewise.
>         * config/aarch64/aarch64-simd.md
>         (<FCVT_F2FIXED:fcvt_fixed_insn><VDQF:mode>3): New.
>         (<FCVT_FIXED2F:fcvt_fixed_insn><VDQ_SDI:mode>3): Likewise.
>         * config/aarch64/arm_neon.h (vcvt_n_f32_s32): Remove inline assembly.
>         Use builtin.
>         (vcvt_n_f32_u32): Likewise.
>         (vcvt_n_s32_f32): Likewise.
>         (vcvt_n_u32_f32): Likewise.
>         (vcvtq_n_f32_s32): Likewise.
>         (vcvtq_n_f32_u32): Likewise.
>         (vcvtq_n_f64_s64): Likewise.
>         (vcvtq_n_f64_u64): Likewise.
>         (vcvtq_n_s32_f32): Likewise.
>         (vcvtq_n_s64_f64): Likewise.
>         (vcvtq_n_u32_f32): Likewise.
>         (vcvtq_n_u64_f64): Likewise.
>         * config/aarch64/iterators.md (VDQ_SDI): New mode iterator.
>         (VSDQ_SDI): Likewise.
>         (fcvt_target): Support V4DI, V4SI and V2SI.
>         (FCVT_TARGET): Likewise.
> 


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [v2][AArch64, 3/6] Reimplement frsqrte intrinsics
  2016-06-06 13:40           ` [v2][AArch64, 3/6] Reimplement frsqrte intrinsics Jiong Wang
@ 2016-06-08  9:53             ` James Greenhalgh
  0 siblings, 0 replies; 28+ messages in thread
From: James Greenhalgh @ 2016-06-08  9:53 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Mon, Jun 06, 2016 at 02:40:22PM +0100, Jiong Wang wrote:
> These intrinsics were implemented before the instruction pattern
> "aarch64_rsqrte<mode>" was added, so they were implemented through
> inline assembly.
> 
> This migrates the implementation to builtins.

OK. Thanks for the extra work in this patch set to add the missing
intrinsics. I'm glad to tick another couple off the TODO list!

Thanks,
James

> 
> gcc/
> 2016-06-06  Jiong Wang<jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (rsqrte): New builtins for modes
>         VALLF.
>         * config/aarch64/aarch64-simd.md (aarch64_rsqrte_<mode>2): Rename to
>         "aarch64_rsqrte<mode>".
>         * config/aarch64/aarch64.c (get_rsqrte_type): Update gen* name.
>         * config/aarch64/arm_neon.h (vrsqrts_f32): Remove inline assembly.  Use
>         builtin.
>         (vrsqrted_f64): Likewise.
>         (vrsqrte_f32): Likewise.
>         (vrsqrte_f64): Likewise.
>         (vrsqrteq_f32): Likewise.
>         (vrsqrteq_f64): Likewise.
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [v2][AArch64, 4/6] Reimplement frsqrts intrinsics
  2016-06-06 13:40             ` [v2][AArch64, 4/6] Reimplement frsqrts intrinsics Jiong Wang
@ 2016-06-08  9:57               ` James Greenhalgh
  0 siblings, 0 replies; 28+ messages in thread
From: James Greenhalgh @ 2016-06-08  9:57 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Mon, Jun 06, 2016 at 02:40:33PM +0100, Jiong Wang wrote:
> Similar to [3/6], these intrinsics were implemented before the instruction
> pattern "aarch64_rsqrts<mode>" was added, so they were implemented
> through inline assembly.
> 
> This migrates the implementation to builtins.

OK.

Thanks,
James


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [v2][AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns
  2016-06-06 13:41               ` [v2][AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns Jiong Wang
@ 2016-06-08  9:58                 ` James Greenhalgh
  0 siblings, 0 replies; 28+ messages in thread
From: James Greenhalgh @ 2016-06-08  9:58 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Mon, Jun 06, 2016 at 02:40:45PM +0100, Jiong Wang wrote:
> These intrinsics were implemented before "fabd<mode>_3" was introduced.
> Meanwhile
> the patterns "fabd<mode>_3" and "*fabd_scalar<mode>3" can be merged into a
> single "fabd<mode>3" using VALLF.
> 
> This patch migrates the implementation to builtins backed by this pattern.

OK, but watch your ChangeLog format and line length.

Thanks,
James

> 
> gcc/
> 2016-06-01  Jiong Wang <jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (fabd): New builtins
> for modes
>         VALLF.
>         * config/aarch64/aarch64-simd.md (fabd<mode>_3): Extend
> modes from VDQF
>         to VALLF.  Rename to "fabd<mode>3".
>         (*fabd_scalar<mode>3): Delete.
>         * config/aarch64/arm_neon.h (vabds_f32): Remove inline assembly.
>         Use builtin.
>         (vabdd_f64): Likewise.
>         (vabd_f32): Likewise.
>         (vabd_f64): Likewise.
>         (vabdq_f32): Likewise.
>         (vabdq_f64): Likewise.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [v2][AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes
  2016-06-06 13:41                 ` [v2][AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes Jiong Wang
@ 2016-06-08  9:59                   ` James Greenhalgh
  0 siblings, 0 replies; 28+ messages in thread
From: James Greenhalgh @ 2016-06-08  9:59 UTC (permalink / raw)
  To: Jiong Wang; +Cc: GCC Patches, nd

On Mon, Jun 06, 2016 at 02:40:55PM +0100, Jiong Wang wrote:
> These intrinsics were implemented by inline assembly using the "faddp" instruction.
> There was a pattern "aarch64_addpv4sf" which supports V4SF mode only, while we can
> extend this pattern to support VDQF mode, then we can reimplement these
> intrinsics through builtins.

OK. But watch your ChangeLog format and line length.

Thanks again for this second spin of this patch set. I'm much happier
knowing that we don't have to revisit some of these intrinsics.

Thanks,
James

> 
> gcc/
> 2016-06-06  Jiong Wang<jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-builtins.def (faddp): New builtins for modes in VDQF.
>         * config/aarch64/aarch64-simd.md (aarch64_faddp<mode>): New.
>         (aarch64_addpv4sf): Delete.
>         (reduc_plus_scal_v4sf): Use "gen_aarch64_faddpv4sf" instead of
>         "gen_aarch64_addpv4sf".
>         * config/aarch64/arm_neon.h (vpadd_f32): Remove inline assembly.  Use
>         builtin.
>         (vpadds_f32): Likewise.
>         (vpaddq_f32): Likewise.
>         (vpaddq_f64): Likewise.
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2016-06-08  9:59 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <57430251.6060902@foss.arm.com>
     [not found] ` <57430271.3070504@foss.arm.com>
2016-05-24  8:24   ` [AArch64, 2/6] Reimplement vector fixed-point intrinsics Jiong Wang
     [not found]   ` <5743029C.60208@foss.arm.com>
     [not found]     ` <574302DA.6090803@foss.arm.com>
2016-05-24  8:24       ` [AArch64, 4/6] Reimplement frsqrts intrinsics Jiong Wang
2016-05-27 14:12         ` James Greenhalgh
2016-05-27 15:12           ` Jiong Wang
     [not found]       ` <574302FC.5050701@foss.arm.com>
     [not found]         ` <5743031A.8060307@foss.arm.com>
2016-05-24  8:24           ` [AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes Jiong Wang
2016-05-27 14:45             ` James Greenhalgh
2016-05-27 14:51               ` Jiong Wang
2016-05-24  8:24         ` [AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns Jiong Wang
2016-05-27 14:41           ` James Greenhalgh
2016-05-27 14:52             ` Jiong Wang
2016-05-24  8:24     ` [AArch64, 3/6] Reimplement frsqrte intrinsics Jiong Wang
2016-05-27 14:09       ` James Greenhalgh
2016-05-27 19:59         ` Jiong Wang
2016-05-24  8:24 ` [AArch64, 1/6] Reimplement scalar fixed-point intrinsics Jiong Wang
2016-05-27 13:50   ` James Greenhalgh
2016-05-27 20:01     ` Jiong Wang
     [not found]       ` <6af07de4-8179-c0bf-410c-317ef52876dd@foss.arm.com>
2016-06-06 13:39         ` [v2][AArch64, 2/6] Reimplement vector " Jiong Wang
2016-06-08  9:51           ` James Greenhalgh
     [not found]         ` <7cb1e234-46f9-76b4-aefd-1eacabfb4ca7@foss.arm.com>
     [not found]           ` <49a7c4d8-3fdc-8806-a4df-affa742cc5d7@foss.arm.com>
2016-06-06 13:40             ` [v2][AArch64, 4/6] Reimplement frsqrts intrinsics Jiong Wang
2016-06-08  9:57               ` James Greenhalgh
     [not found]             ` <32b5ca55-e60a-42b0-3532-84319e5c0daf@foss.arm.com>
     [not found]               ` <1017fc5b-389d-ab41-24bd-491fff8e1a81@foss.arm.com>
2016-06-06 13:41                 ` [v2][AArch64, 6/6] Reimplement vpadd intrinsics & extend rtl patterns to all modes Jiong Wang
2016-06-08  9:59                   ` James Greenhalgh
2016-06-06 13:41               ` [v2][AArch64, 5/6] Reimplement fabd intrinsics & merge rtl patterns Jiong Wang
2016-06-08  9:58                 ` James Greenhalgh
2016-06-06 13:40           ` [v2][AArch64, 3/6] Reimplement frsqrte intrinsics Jiong Wang
2016-06-08  9:53             ` James Greenhalgh
2016-06-06 13:39       ` [v2][AArch64, 1/6] Reimplement scalar fixed-point intrinsics Jiong Wang
2016-06-08  9:47         ` James Greenhalgh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).